From 06eaf7232e9a920468c0f8d74dcf2fe8b555501c Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sat, 13 Apr 2024 14:24:36 +0200 Subject: Adding upstream version 1:10.11.6. Signed-off-by: Daniel Baumann --- storage/innobase/.clang-format-old | 11 + storage/innobase/CMakeLists.txt | 511 + storage/innobase/COPYING.Google | 30 + storage/innobase/COPYING.Percona | 30 + storage/innobase/btr/btr0btr.cc | 5433 +++++ storage/innobase/btr/btr0bulk.cc | 1233 ++ storage/innobase/btr/btr0cur.cc | 7017 ++++++ storage/innobase/btr/btr0defragment.cc | 820 + storage/innobase/btr/btr0pcur.cc | 667 + storage/innobase/btr/btr0sea.cc | 2328 ++ storage/innobase/buf/buf0block_hint.cc | 59 + storage/innobase/buf/buf0buddy.cc | 769 + storage/innobase/buf/buf0buf.cc | 4180 ++++ storage/innobase/buf/buf0checksum.cc | 98 + storage/innobase/buf/buf0dblwr.cc | 779 + storage/innobase/buf/buf0dump.cc | 765 + storage/innobase/buf/buf0flu.cc | 2765 +++ storage/innobase/buf/buf0lru.cc | 1452 ++ storage/innobase/buf/buf0rea.cc | 710 + storage/innobase/data/data0data.cc | 820 + storage/innobase/data/data0type.cc | 212 + storage/innobase/dict/dict0boot.cc | 440 + storage/innobase/dict/dict0crea.cc | 1906 ++ storage/innobase/dict/dict0defrag_bg.cc | 434 + storage/innobase/dict/dict0dict.cc | 4859 +++++ storage/innobase/dict/dict0load.cc | 3213 +++ storage/innobase/dict/dict0mem.cc | 1379 ++ storage/innobase/dict/dict0stats.cc | 4724 +++++ storage/innobase/dict/dict0stats_bg.cc | 424 + storage/innobase/dict/drop.cc | 297 + storage/innobase/eval/eval0eval.cc | 643 + storage/innobase/eval/eval0proc.cc | 286 + storage/innobase/fil/fil0crypt.cc | 2425 +++ storage/innobase/fil/fil0fil.cc | 3282 +++ storage/innobase/fil/fil0pagecompress.cc | 584 + storage/innobase/fsp/fsp0file.cc | 936 + storage/innobase/fsp/fsp0fsp.cc | 3070 +++ storage/innobase/fsp/fsp0space.cc | 224 + storage/innobase/fsp/fsp0sysspace.cc | 1019 + storage/innobase/fts/Makefile.query | 18 + storage/innobase/fts/fts0ast.cc | 816 + storage/innobase/fts/fts0blex.cc | 2177 ++ storage/innobase/fts/fts0blex.l | 74 + storage/innobase/fts/fts0config.cc | 428 + storage/innobase/fts/fts0fts.cc | 6182 ++++++ storage/innobase/fts/fts0opt.cc | 3054 +++ storage/innobase/fts/fts0pars.cc | 2007 ++ storage/innobase/fts/fts0pars.y | 293 + storage/innobase/fts/fts0plugin.cc | 283 + storage/innobase/fts/fts0que.cc | 4612 ++++ storage/innobase/fts/fts0sql.cc | 208 + storage/innobase/fts/fts0tlex.cc | 2169 ++ storage/innobase/fts/fts0tlex.l | 69 + storage/innobase/fts/make_parser.sh | 49 + storage/innobase/fut/fut0lst.cc | 416 + storage/innobase/gis/gis0geo.cc | 650 + storage/innobase/gis/gis0rtree.cc | 1934 ++ storage/innobase/gis/gis0sea.cc | 2403 +++ storage/innobase/ha/ha0storage.cc | 178 + storage/innobase/handler/ha_innodb.cc | 21217 +++++++++++++++++++ storage/innobase/handler/ha_innodb.h | 937 + storage/innobase/handler/handler0alter.cc | 11843 +++++++++++ storage/innobase/handler/i_s.cc | 6506 ++++++ storage/innobase/handler/i_s.h | 91 + storage/innobase/ibuf/ibuf0ibuf.cc | 4617 ++++ storage/innobase/include/btr0btr.h | 543 + storage/innobase/include/btr0btr.inl | 111 + storage/innobase/include/btr0bulk.h | 371 + storage/innobase/include/btr0cur.h | 855 + storage/innobase/include/btr0cur.inl | 170 + storage/innobase/include/btr0defragment.h | 65 + storage/innobase/include/btr0pcur.h | 459 + storage/innobase/include/btr0pcur.inl | 372 + storage/innobase/include/btr0sea.h | 403 + storage/innobase/include/btr0sea.inl | 117 + storage/innobase/include/btr0types.h | 154 + 
storage/innobase/include/buf0block_hint.h | 76 + storage/innobase/include/buf0buddy.h | 91 + storage/innobase/include/buf0buf.h | 2190 ++ storage/innobase/include/buf0buf.inl | 132 + storage/innobase/include/buf0checksum.h | 57 + storage/innobase/include/buf0dblwr.h | 164 + storage/innobase/include/buf0dump.h | 44 + storage/innobase/include/buf0flu.h | 125 + storage/innobase/include/buf0lru.h | 193 + storage/innobase/include/buf0rea.h | 120 + storage/innobase/include/buf0types.h | 235 + storage/innobase/include/data0data.h | 704 + storage/innobase/include/data0data.inl | 633 + storage/innobase/include/data0type.h | 591 + storage/innobase/include/data0type.inl | 487 + storage/innobase/include/data0types.h | 36 + storage/innobase/include/db0err.h | 170 + storage/innobase/include/dict0boot.h | 297 + storage/innobase/include/dict0crea.h | 277 + storage/innobase/include/dict0crea.inl | 136 + storage/innobase/include/dict0defrag_bg.h | 101 + storage/innobase/include/dict0dict.h | 1744 ++ storage/innobase/include/dict0dict.inl | 1217 ++ storage/innobase/include/dict0load.h | 220 + storage/innobase/include/dict0mem.h | 2649 +++ storage/innobase/include/dict0mem.inl | 68 + storage/innobase/include/dict0pagecompress.h | 61 + storage/innobase/include/dict0pagecompress.inl | 81 + storage/innobase/include/dict0stats.h | 238 + storage/innobase/include/dict0stats.inl | 219 + storage/innobase/include/dict0stats_bg.h | 59 + storage/innobase/include/dict0types.h | 176 + storage/innobase/include/dyn0buf.h | 442 + storage/innobase/include/dyn0types.h | 39 + storage/innobase/include/eval0eval.h | 109 + storage/innobase/include/eval0eval.inl | 254 + storage/innobase/include/eval0proc.h | 94 + storage/innobase/include/eval0proc.inl | 88 + storage/innobase/include/fil0crypt.h | 396 + storage/innobase/include/fil0crypt.inl | 81 + storage/innobase/include/fil0fil.h | 1823 ++ storage/innobase/include/fil0pagecompress.h | 57 + storage/innobase/include/fsp0file.h | 509 + storage/innobase/include/fsp0fsp.h | 762 + storage/innobase/include/fsp0space.h | 209 + storage/innobase/include/fsp0sysspace.h | 278 + storage/innobase/include/fsp0types.h | 404 + storage/innobase/include/fts0ast.h | 340 + storage/innobase/include/fts0blex.h | 702 + storage/innobase/include/fts0fts.h | 947 + storage/innobase/include/fts0opt.h | 39 + storage/innobase/include/fts0pars.h | 72 + storage/innobase/include/fts0plugin.h | 50 + storage/innobase/include/fts0priv.h | 485 + storage/innobase/include/fts0priv.inl | 121 + storage/innobase/include/fts0tlex.h | 702 + storage/innobase/include/fts0tokenize.h | 189 + storage/innobase/include/fts0types.h | 354 + storage/innobase/include/fts0types.inl | 231 + storage/innobase/include/fts0vlc.h | 124 + storage/innobase/include/fut0lst.h | 156 + storage/innobase/include/gis0geo.h | 122 + storage/innobase/include/gis0rtree.h | 513 + storage/innobase/include/gis0rtree.inl | 245 + storage/innobase/include/gis0type.h | 146 + storage/innobase/include/ha0ha.h | 60 + storage/innobase/include/ha0ha.inl | 154 + storage/innobase/include/ha0storage.h | 137 + storage/innobase/include/ha0storage.inl | 142 + storage/innobase/include/ha_prototypes.h | 476 + storage/innobase/include/handler0alter.h | 108 + storage/innobase/include/hash0hash.h | 190 + storage/innobase/include/ibuf0ibuf.h | 436 + storage/innobase/include/ibuf0ibuf.inl | 282 + storage/innobase/include/lock0iter.h | 66 + storage/innobase/include/lock0lock.h | 1271 ++ storage/innobase/include/lock0lock.inl | 78 + storage/innobase/include/lock0prdt.h | 192 + 
storage/innobase/include/lock0priv.h | 582 + storage/innobase/include/lock0priv.inl | 255 + storage/innobase/include/lock0types.h | 251 + storage/innobase/include/log0crypt.h | 115 + storage/innobase/include/log0log.h | 529 + storage/innobase/include/log0recv.h | 491 + storage/innobase/include/log0types.h | 38 + storage/innobase/include/mach0data.h | 375 + storage/innobase/include/mach0data.inl | 837 + storage/innobase/include/mariadb_stats.h | 119 + storage/innobase/include/mem0mem.h | 345 + storage/innobase/include/mem0mem.inl | 468 + storage/innobase/include/mtr0log.h | 637 + storage/innobase/include/mtr0mtr.h | 780 + storage/innobase/include/mtr0types.h | 347 + storage/innobase/include/os0file.h | 1188 ++ storage/innobase/include/os0file.inl | 412 + storage/innobase/include/page0cur.h | 303 + storage/innobase/include/page0cur.inl | 203 + storage/innobase/include/page0page.h | 1101 + storage/innobase/include/page0page.inl | 550 + storage/innobase/include/page0types.h | 188 + storage/innobase/include/page0zip.h | 383 + storage/innobase/include/page0zip.inl | 317 + storage/innobase/include/pars0grm.h | 151 + storage/innobase/include/pars0opt.h | 68 + storage/innobase/include/pars0pars.h | 695 + storage/innobase/include/pars0sym.h | 243 + storage/innobase/include/pars0types.h | 50 + storage/innobase/include/que0que.h | 314 + storage/innobase/include/que0que.inl | 245 + storage/innobase/include/que0types.h | 97 + storage/innobase/include/read0types.h | 275 + storage/innobase/include/rem0cmp.h | 286 + storage/innobase/include/rem0rec.h | 1276 ++ storage/innobase/include/rem0rec.inl | 1134 + storage/innobase/include/rem0types.h | 78 + storage/innobase/include/row0ext.h | 101 + storage/innobase/include/row0ext.inl | 87 + storage/innobase/include/row0ftsort.h | 268 + storage/innobase/include/row0import.h | 67 + storage/innobase/include/row0ins.h | 224 + storage/innobase/include/row0log.h | 239 + storage/innobase/include/row0merge.h | 496 + storage/innobase/include/row0mysql.h | 841 + storage/innobase/include/row0purge.h | 149 + storage/innobase/include/row0quiesce.h | 67 + storage/innobase/include/row0row.h | 431 + storage/innobase/include/row0row.inl | 221 + storage/innobase/include/row0sel.h | 457 + storage/innobase/include/row0types.h | 54 + storage/innobase/include/row0uins.h | 50 + storage/innobase/include/row0umod.h | 46 + storage/innobase/include/row0undo.h | 114 + storage/innobase/include/row0upd.h | 559 + storage/innobase/include/row0upd.inl | 153 + storage/innobase/include/row0vers.h | 143 + storage/innobase/include/rw_lock.h | 138 + storage/innobase/include/small_vector.h | 100 + storage/innobase/include/srv0mon.h | 846 + storage/innobase/include/srv0mon.inl | 113 + storage/innobase/include/srv0srv.h | 715 + storage/innobase/include/srv0start.h | 124 + storage/innobase/include/srw_lock.h | 554 + storage/innobase/include/sux_lock.h | 472 + .../innobase/include/transactional_lock_guard.h | 174 + storage/innobase/include/trx0i_s.h | 277 + storage/innobase/include/trx0purge.h | 427 + storage/innobase/include/trx0rec.h | 299 + storage/innobase/include/trx0roll.h | 168 + storage/innobase/include/trx0rseg.h | 301 + storage/innobase/include/trx0sys.h | 1274 ++ storage/innobase/include/trx0trx.h | 1268 ++ storage/innobase/include/trx0trx.inl | 86 + storage/innobase/include/trx0types.h | 131 + storage/innobase/include/trx0undo.h | 514 + storage/innobase/include/trx0undo.inl | 129 + storage/innobase/include/trx0xa.h | 61 + storage/innobase/include/univ.i | 503 + 
storage/innobase/include/ut0byte.h | 107 + storage/innobase/include/ut0byte.inl | 90 + storage/innobase/include/ut0counter.h | 123 + storage/innobase/include/ut0dbg.h | 179 + storage/innobase/include/ut0list.h | 146 + storage/innobase/include/ut0list.inl | 80 + storage/innobase/include/ut0lst.h | 563 + storage/innobase/include/ut0mem.h | 76 + storage/innobase/include/ut0mem.inl | 246 + storage/innobase/include/ut0new.h | 1099 + storage/innobase/include/ut0pool.h | 365 + storage/innobase/include/ut0rbt.h | 254 + storage/innobase/include/ut0rnd.h | 128 + storage/innobase/include/ut0rnd.inl | 128 + storage/innobase/include/ut0sort.h | 104 + storage/innobase/include/ut0stage.h | 499 + storage/innobase/include/ut0ut.h | 444 + storage/innobase/include/ut0ut.inl | 143 + storage/innobase/include/ut0vec.h | 285 + storage/innobase/include/ut0vec.inl | 348 + storage/innobase/include/ut0wqueue.h | 86 + storage/innobase/lock/lock0iter.cc | 88 + storage/innobase/lock/lock0lock.cc | 6812 ++++++ storage/innobase/lock/lock0prdt.cc | 928 + storage/innobase/log/log0crypt.cc | 641 + storage/innobase/log/log0log.cc | 1358 ++ storage/innobase/log/log0recv.cc | 4870 +++++ storage/innobase/log/log0sync.cc | 404 + storage/innobase/log/log0sync.h | 99 + storage/innobase/mem/mem0mem.cc | 436 + storage/innobase/mtr/mtr0mtr.cc | 1667 ++ .../mysql-test/storage_engine/alter_tablespace.opt | 2 + .../storage_engine/autoinc_secondary.rdiff | 30 + .../mysql-test/storage_engine/cache_index.rdiff | 71 + .../storage_engine/checksum_table_live.rdiff | 13 + .../mysql-test/storage_engine/col_opt_not_null.opt | 1 + .../mysql-test/storage_engine/col_opt_null.opt | 1 + .../mysql-test/storage_engine/define_engine.inc | 45 + .../mysql-test/storage_engine/disabled.def | 9 + .../storage_engine/fulltext_search.rdiff | 49 + .../storage_engine/index_enable_disable.rdiff | 33 + .../storage_engine/index_type_hash.rdiff | 60 + .../mysql-test/storage_engine/insert_delayed.rdiff | 26 + .../storage_engine/lock_concurrent.rdiff | 25 + .../mysql-test/storage_engine/optimize_table.rdiff | 37 + .../storage_engine/parts/checksum_table.rdiff | 13 + .../storage_engine/parts/create_table.rdiff | 20 + .../mysql-test/storage_engine/parts/disabled.def | 1 + .../storage_engine/parts/optimize_table.rdiff | 58 + .../storage_engine/parts/repair_table.rdiff | 158 + .../mysql-test/storage_engine/parts/suite.opt | 2 + .../mysql-test/storage_engine/repair_table.rdiff | 139 + .../innobase/mysql-test/storage_engine/suite.opt | 1 + .../storage_engine/tbl_opt_index_dir.rdiff | 23 + .../storage_engine/tbl_opt_insert_method.rdiff | 11 + .../storage_engine/tbl_opt_row_format.rdiff | 44 + .../mysql-test/storage_engine/tbl_opt_union.rdiff | 16 + .../trx/cons_snapshot_serializable.rdiff | 18 + .../storage_engine/trx/level_read_committed.rdiff | 11 + .../trx/level_read_uncommitted.rdiff | 11 + .../mysql-test/storage_engine/trx/suite.opt | 3 + .../mysql-test/storage_engine/type_blob.opt | 1 + .../storage_engine/type_char_indexes.rdiff | 11 + .../storage_engine/type_float_indexes.rdiff | 11 + .../mysql-test/storage_engine/type_text.opt | 1 + storage/innobase/os/os0file.cc | 4270 ++++ storage/innobase/page/page0cur.cc | 3097 +++ storage/innobase/page/page0page.cc | 2523 +++ storage/innobase/page/page0zip.cc | 4666 ++++ storage/innobase/pars/lexyy.cc | 2841 +++ storage/innobase/pars/make_bison.sh | 32 + storage/innobase/pars/make_flex.sh | 50 + storage/innobase/pars/pars0grm.cc | 2504 +++ storage/innobase/pars/pars0grm.y | 609 + storage/innobase/pars/pars0lex.l | 614 + 
storage/innobase/pars/pars0opt.cc | 1263 ++ storage/innobase/pars/pars0pars.cc | 2381 +++ storage/innobase/pars/pars0sym.cc | 413 + storage/innobase/que/que0que.cc | 708 + storage/innobase/read/read0read.cc | 265 + storage/innobase/rem/rem0cmp.cc | 901 + storage/innobase/rem/rem0rec.cc | 2820 +++ storage/innobase/row/row0ext.cc | 132 + storage/innobase/row/row0ftsort.cc | 1791 ++ storage/innobase/row/row0import.cc | 4585 ++++ storage/innobase/row/row0ins.cc | 3843 ++++ storage/innobase/row/row0log.cc | 4134 ++++ storage/innobase/row/row0merge.cc | 5406 +++++ storage/innobase/row/row0mysql.cc | 2916 +++ storage/innobase/row/row0purge.cc | 1304 ++ storage/innobase/row/row0quiesce.cc | 715 + storage/innobase/row/row0row.cc | 1720 ++ storage/innobase/row/row0sel.cc | 6947 ++++++ storage/innobase/row/row0uins.cc | 652 + storage/innobase/row/row0umod.cc | 1288 ++ storage/innobase/row/row0undo.cc | 453 + storage/innobase/row/row0upd.cc | 3002 +++ storage/innobase/row/row0vers.cc | 1419 ++ storage/innobase/srv/srv0mon.cc | 1799 ++ storage/innobase/srv/srv0srv.cc | 1659 ++ storage/innobase/srv/srv0start.cc | 2101 ++ storage/innobase/sync/srw_lock.cc | 550 + storage/innobase/trx/trx0i_s.cc | 1471 ++ storage/innobase/trx/trx0purge.cc | 1480 ++ storage/innobase/trx/trx0rec.cc | 2448 +++ storage/innobase/trx/trx0roll.cc | 933 + storage/innobase/trx/trx0rseg.cc | 727 + storage/innobase/trx/trx0sys.cc | 370 + storage/innobase/trx/trx0trx.cc | 2292 ++ storage/innobase/trx/trx0undo.cc | 1478 ++ storage/innobase/unittest/CMakeLists.txt | 34 + storage/innobase/unittest/innodb_fts-t.cc | 52 + storage/innobase/unittest/innodb_sync-t.cc | 185 + storage/innobase/ut/ut0dbg.cc | 61 + storage/innobase/ut/ut0list.cc | 151 + storage/innobase/ut/ut0mem.cc | 55 + storage/innobase/ut/ut0new.cc | 112 + storage/innobase/ut/ut0rbt.cc | 1142 + storage/innobase/ut/ut0rnd.cc | 93 + storage/innobase/ut/ut0ut.cc | 599 + storage/innobase/ut/ut0vec.cc | 73 + storage/innobase/ut/ut0wqueue.cc | 118 + 355 files changed, 309116 insertions(+) create mode 100644 storage/innobase/.clang-format-old create mode 100644 storage/innobase/CMakeLists.txt create mode 100644 storage/innobase/COPYING.Google create mode 100644 storage/innobase/COPYING.Percona create mode 100644 storage/innobase/btr/btr0btr.cc create mode 100644 storage/innobase/btr/btr0bulk.cc create mode 100644 storage/innobase/btr/btr0cur.cc create mode 100644 storage/innobase/btr/btr0defragment.cc create mode 100644 storage/innobase/btr/btr0pcur.cc create mode 100644 storage/innobase/btr/btr0sea.cc create mode 100644 storage/innobase/buf/buf0block_hint.cc create mode 100644 storage/innobase/buf/buf0buddy.cc create mode 100644 storage/innobase/buf/buf0buf.cc create mode 100644 storage/innobase/buf/buf0checksum.cc create mode 100644 storage/innobase/buf/buf0dblwr.cc create mode 100644 storage/innobase/buf/buf0dump.cc create mode 100644 storage/innobase/buf/buf0flu.cc create mode 100644 storage/innobase/buf/buf0lru.cc create mode 100644 storage/innobase/buf/buf0rea.cc create mode 100644 storage/innobase/data/data0data.cc create mode 100644 storage/innobase/data/data0type.cc create mode 100644 storage/innobase/dict/dict0boot.cc create mode 100644 storage/innobase/dict/dict0crea.cc create mode 100644 storage/innobase/dict/dict0defrag_bg.cc create mode 100644 storage/innobase/dict/dict0dict.cc create mode 100644 storage/innobase/dict/dict0load.cc create mode 100644 storage/innobase/dict/dict0mem.cc create mode 100644 storage/innobase/dict/dict0stats.cc create mode 100644 
storage/innobase/dict/dict0stats_bg.cc create mode 100644 storage/innobase/dict/drop.cc create mode 100644 storage/innobase/eval/eval0eval.cc create mode 100644 storage/innobase/eval/eval0proc.cc create mode 100644 storage/innobase/fil/fil0crypt.cc create mode 100644 storage/innobase/fil/fil0fil.cc create mode 100644 storage/innobase/fil/fil0pagecompress.cc create mode 100644 storage/innobase/fsp/fsp0file.cc create mode 100644 storage/innobase/fsp/fsp0fsp.cc create mode 100644 storage/innobase/fsp/fsp0space.cc create mode 100644 storage/innobase/fsp/fsp0sysspace.cc create mode 100644 storage/innobase/fts/Makefile.query create mode 100644 storage/innobase/fts/fts0ast.cc create mode 100644 storage/innobase/fts/fts0blex.cc create mode 100644 storage/innobase/fts/fts0blex.l create mode 100644 storage/innobase/fts/fts0config.cc create mode 100644 storage/innobase/fts/fts0fts.cc create mode 100644 storage/innobase/fts/fts0opt.cc create mode 100644 storage/innobase/fts/fts0pars.cc create mode 100644 storage/innobase/fts/fts0pars.y create mode 100644 storage/innobase/fts/fts0plugin.cc create mode 100644 storage/innobase/fts/fts0que.cc create mode 100644 storage/innobase/fts/fts0sql.cc create mode 100644 storage/innobase/fts/fts0tlex.cc create mode 100644 storage/innobase/fts/fts0tlex.l create mode 100755 storage/innobase/fts/make_parser.sh create mode 100644 storage/innobase/fut/fut0lst.cc create mode 100644 storage/innobase/gis/gis0geo.cc create mode 100644 storage/innobase/gis/gis0rtree.cc create mode 100644 storage/innobase/gis/gis0sea.cc create mode 100644 storage/innobase/ha/ha0storage.cc create mode 100644 storage/innobase/handler/ha_innodb.cc create mode 100644 storage/innobase/handler/ha_innodb.h create mode 100644 storage/innobase/handler/handler0alter.cc create mode 100644 storage/innobase/handler/i_s.cc create mode 100644 storage/innobase/handler/i_s.h create mode 100644 storage/innobase/ibuf/ibuf0ibuf.cc create mode 100644 storage/innobase/include/btr0btr.h create mode 100644 storage/innobase/include/btr0btr.inl create mode 100644 storage/innobase/include/btr0bulk.h create mode 100644 storage/innobase/include/btr0cur.h create mode 100644 storage/innobase/include/btr0cur.inl create mode 100644 storage/innobase/include/btr0defragment.h create mode 100644 storage/innobase/include/btr0pcur.h create mode 100644 storage/innobase/include/btr0pcur.inl create mode 100644 storage/innobase/include/btr0sea.h create mode 100644 storage/innobase/include/btr0sea.inl create mode 100644 storage/innobase/include/btr0types.h create mode 100644 storage/innobase/include/buf0block_hint.h create mode 100644 storage/innobase/include/buf0buddy.h create mode 100644 storage/innobase/include/buf0buf.h create mode 100644 storage/innobase/include/buf0buf.inl create mode 100644 storage/innobase/include/buf0checksum.h create mode 100644 storage/innobase/include/buf0dblwr.h create mode 100644 storage/innobase/include/buf0dump.h create mode 100644 storage/innobase/include/buf0flu.h create mode 100644 storage/innobase/include/buf0lru.h create mode 100644 storage/innobase/include/buf0rea.h create mode 100644 storage/innobase/include/buf0types.h create mode 100644 storage/innobase/include/data0data.h create mode 100644 storage/innobase/include/data0data.inl create mode 100644 storage/innobase/include/data0type.h create mode 100644 storage/innobase/include/data0type.inl create mode 100644 storage/innobase/include/data0types.h create mode 100644 storage/innobase/include/db0err.h create mode 100644 
storage/innobase/include/dict0boot.h create mode 100644 storage/innobase/include/dict0crea.h create mode 100644 storage/innobase/include/dict0crea.inl create mode 100644 storage/innobase/include/dict0defrag_bg.h create mode 100644 storage/innobase/include/dict0dict.h create mode 100644 storage/innobase/include/dict0dict.inl create mode 100644 storage/innobase/include/dict0load.h create mode 100644 storage/innobase/include/dict0mem.h create mode 100644 storage/innobase/include/dict0mem.inl create mode 100644 storage/innobase/include/dict0pagecompress.h create mode 100644 storage/innobase/include/dict0pagecompress.inl create mode 100644 storage/innobase/include/dict0stats.h create mode 100644 storage/innobase/include/dict0stats.inl create mode 100644 storage/innobase/include/dict0stats_bg.h create mode 100644 storage/innobase/include/dict0types.h create mode 100644 storage/innobase/include/dyn0buf.h create mode 100644 storage/innobase/include/dyn0types.h create mode 100644 storage/innobase/include/eval0eval.h create mode 100644 storage/innobase/include/eval0eval.inl create mode 100644 storage/innobase/include/eval0proc.h create mode 100644 storage/innobase/include/eval0proc.inl create mode 100644 storage/innobase/include/fil0crypt.h create mode 100644 storage/innobase/include/fil0crypt.inl create mode 100644 storage/innobase/include/fil0fil.h create mode 100644 storage/innobase/include/fil0pagecompress.h create mode 100644 storage/innobase/include/fsp0file.h create mode 100644 storage/innobase/include/fsp0fsp.h create mode 100644 storage/innobase/include/fsp0space.h create mode 100644 storage/innobase/include/fsp0sysspace.h create mode 100644 storage/innobase/include/fsp0types.h create mode 100644 storage/innobase/include/fts0ast.h create mode 100644 storage/innobase/include/fts0blex.h create mode 100644 storage/innobase/include/fts0fts.h create mode 100644 storage/innobase/include/fts0opt.h create mode 100644 storage/innobase/include/fts0pars.h create mode 100644 storage/innobase/include/fts0plugin.h create mode 100644 storage/innobase/include/fts0priv.h create mode 100644 storage/innobase/include/fts0priv.inl create mode 100644 storage/innobase/include/fts0tlex.h create mode 100644 storage/innobase/include/fts0tokenize.h create mode 100644 storage/innobase/include/fts0types.h create mode 100644 storage/innobase/include/fts0types.inl create mode 100644 storage/innobase/include/fts0vlc.h create mode 100644 storage/innobase/include/fut0lst.h create mode 100644 storage/innobase/include/gis0geo.h create mode 100644 storage/innobase/include/gis0rtree.h create mode 100644 storage/innobase/include/gis0rtree.inl create mode 100644 storage/innobase/include/gis0type.h create mode 100644 storage/innobase/include/ha0ha.h create mode 100644 storage/innobase/include/ha0ha.inl create mode 100644 storage/innobase/include/ha0storage.h create mode 100644 storage/innobase/include/ha0storage.inl create mode 100644 storage/innobase/include/ha_prototypes.h create mode 100644 storage/innobase/include/handler0alter.h create mode 100644 storage/innobase/include/hash0hash.h create mode 100644 storage/innobase/include/ibuf0ibuf.h create mode 100644 storage/innobase/include/ibuf0ibuf.inl create mode 100644 storage/innobase/include/lock0iter.h create mode 100644 storage/innobase/include/lock0lock.h create mode 100644 storage/innobase/include/lock0lock.inl create mode 100644 storage/innobase/include/lock0prdt.h create mode 100644 storage/innobase/include/lock0priv.h create mode 100644 
storage/innobase/include/lock0priv.inl create mode 100644 storage/innobase/include/lock0types.h create mode 100644 storage/innobase/include/log0crypt.h create mode 100644 storage/innobase/include/log0log.h create mode 100644 storage/innobase/include/log0recv.h create mode 100644 storage/innobase/include/log0types.h create mode 100644 storage/innobase/include/mach0data.h create mode 100644 storage/innobase/include/mach0data.inl create mode 100644 storage/innobase/include/mariadb_stats.h create mode 100644 storage/innobase/include/mem0mem.h create mode 100644 storage/innobase/include/mem0mem.inl create mode 100644 storage/innobase/include/mtr0log.h create mode 100644 storage/innobase/include/mtr0mtr.h create mode 100644 storage/innobase/include/mtr0types.h create mode 100644 storage/innobase/include/os0file.h create mode 100644 storage/innobase/include/os0file.inl create mode 100644 storage/innobase/include/page0cur.h create mode 100644 storage/innobase/include/page0cur.inl create mode 100644 storage/innobase/include/page0page.h create mode 100644 storage/innobase/include/page0page.inl create mode 100644 storage/innobase/include/page0types.h create mode 100644 storage/innobase/include/page0zip.h create mode 100644 storage/innobase/include/page0zip.inl create mode 100644 storage/innobase/include/pars0grm.h create mode 100644 storage/innobase/include/pars0opt.h create mode 100644 storage/innobase/include/pars0pars.h create mode 100644 storage/innobase/include/pars0sym.h create mode 100644 storage/innobase/include/pars0types.h create mode 100644 storage/innobase/include/que0que.h create mode 100644 storage/innobase/include/que0que.inl create mode 100644 storage/innobase/include/que0types.h create mode 100644 storage/innobase/include/read0types.h create mode 100644 storage/innobase/include/rem0cmp.h create mode 100644 storage/innobase/include/rem0rec.h create mode 100644 storage/innobase/include/rem0rec.inl create mode 100644 storage/innobase/include/rem0types.h create mode 100644 storage/innobase/include/row0ext.h create mode 100644 storage/innobase/include/row0ext.inl create mode 100644 storage/innobase/include/row0ftsort.h create mode 100644 storage/innobase/include/row0import.h create mode 100644 storage/innobase/include/row0ins.h create mode 100644 storage/innobase/include/row0log.h create mode 100644 storage/innobase/include/row0merge.h create mode 100644 storage/innobase/include/row0mysql.h create mode 100644 storage/innobase/include/row0purge.h create mode 100644 storage/innobase/include/row0quiesce.h create mode 100644 storage/innobase/include/row0row.h create mode 100644 storage/innobase/include/row0row.inl create mode 100644 storage/innobase/include/row0sel.h create mode 100644 storage/innobase/include/row0types.h create mode 100644 storage/innobase/include/row0uins.h create mode 100644 storage/innobase/include/row0umod.h create mode 100644 storage/innobase/include/row0undo.h create mode 100644 storage/innobase/include/row0upd.h create mode 100644 storage/innobase/include/row0upd.inl create mode 100644 storage/innobase/include/row0vers.h create mode 100644 storage/innobase/include/rw_lock.h create mode 100644 storage/innobase/include/small_vector.h create mode 100644 storage/innobase/include/srv0mon.h create mode 100644 storage/innobase/include/srv0mon.inl create mode 100644 storage/innobase/include/srv0srv.h create mode 100644 storage/innobase/include/srv0start.h create mode 100644 storage/innobase/include/srw_lock.h create mode 100644 storage/innobase/include/sux_lock.h create mode 
100644 storage/innobase/include/transactional_lock_guard.h create mode 100644 storage/innobase/include/trx0i_s.h create mode 100644 storage/innobase/include/trx0purge.h create mode 100644 storage/innobase/include/trx0rec.h create mode 100644 storage/innobase/include/trx0roll.h create mode 100644 storage/innobase/include/trx0rseg.h create mode 100644 storage/innobase/include/trx0sys.h create mode 100644 storage/innobase/include/trx0trx.h create mode 100644 storage/innobase/include/trx0trx.inl create mode 100644 storage/innobase/include/trx0types.h create mode 100644 storage/innobase/include/trx0undo.h create mode 100644 storage/innobase/include/trx0undo.inl create mode 100644 storage/innobase/include/trx0xa.h create mode 100644 storage/innobase/include/univ.i create mode 100644 storage/innobase/include/ut0byte.h create mode 100644 storage/innobase/include/ut0byte.inl create mode 100644 storage/innobase/include/ut0counter.h create mode 100644 storage/innobase/include/ut0dbg.h create mode 100644 storage/innobase/include/ut0list.h create mode 100644 storage/innobase/include/ut0list.inl create mode 100644 storage/innobase/include/ut0lst.h create mode 100644 storage/innobase/include/ut0mem.h create mode 100644 storage/innobase/include/ut0mem.inl create mode 100644 storage/innobase/include/ut0new.h create mode 100644 storage/innobase/include/ut0pool.h create mode 100644 storage/innobase/include/ut0rbt.h create mode 100644 storage/innobase/include/ut0rnd.h create mode 100644 storage/innobase/include/ut0rnd.inl create mode 100644 storage/innobase/include/ut0sort.h create mode 100644 storage/innobase/include/ut0stage.h create mode 100644 storage/innobase/include/ut0ut.h create mode 100644 storage/innobase/include/ut0ut.inl create mode 100644 storage/innobase/include/ut0vec.h create mode 100644 storage/innobase/include/ut0vec.inl create mode 100644 storage/innobase/include/ut0wqueue.h create mode 100644 storage/innobase/lock/lock0iter.cc create mode 100644 storage/innobase/lock/lock0lock.cc create mode 100644 storage/innobase/lock/lock0prdt.cc create mode 100644 storage/innobase/log/log0crypt.cc create mode 100644 storage/innobase/log/log0log.cc create mode 100644 storage/innobase/log/log0recv.cc create mode 100644 storage/innobase/log/log0sync.cc create mode 100644 storage/innobase/log/log0sync.h create mode 100644 storage/innobase/mem/mem0mem.cc create mode 100644 storage/innobase/mtr/mtr0mtr.cc create mode 100644 storage/innobase/mysql-test/storage_engine/alter_tablespace.opt create mode 100644 storage/innobase/mysql-test/storage_engine/autoinc_secondary.rdiff create mode 100644 storage/innobase/mysql-test/storage_engine/cache_index.rdiff create mode 100644 storage/innobase/mysql-test/storage_engine/checksum_table_live.rdiff create mode 100644 storage/innobase/mysql-test/storage_engine/col_opt_not_null.opt create mode 100644 storage/innobase/mysql-test/storage_engine/col_opt_null.opt create mode 100644 storage/innobase/mysql-test/storage_engine/define_engine.inc create mode 100644 storage/innobase/mysql-test/storage_engine/disabled.def create mode 100644 storage/innobase/mysql-test/storage_engine/fulltext_search.rdiff create mode 100644 storage/innobase/mysql-test/storage_engine/index_enable_disable.rdiff create mode 100644 storage/innobase/mysql-test/storage_engine/index_type_hash.rdiff create mode 100644 storage/innobase/mysql-test/storage_engine/insert_delayed.rdiff create mode 100644 storage/innobase/mysql-test/storage_engine/lock_concurrent.rdiff create mode 100644 
storage/innobase/mysql-test/storage_engine/optimize_table.rdiff create mode 100644 storage/innobase/mysql-test/storage_engine/parts/checksum_table.rdiff create mode 100644 storage/innobase/mysql-test/storage_engine/parts/create_table.rdiff create mode 100644 storage/innobase/mysql-test/storage_engine/parts/disabled.def create mode 100644 storage/innobase/mysql-test/storage_engine/parts/optimize_table.rdiff create mode 100644 storage/innobase/mysql-test/storage_engine/parts/repair_table.rdiff create mode 100644 storage/innobase/mysql-test/storage_engine/parts/suite.opt create mode 100644 storage/innobase/mysql-test/storage_engine/repair_table.rdiff create mode 100644 storage/innobase/mysql-test/storage_engine/suite.opt create mode 100644 storage/innobase/mysql-test/storage_engine/tbl_opt_index_dir.rdiff create mode 100644 storage/innobase/mysql-test/storage_engine/tbl_opt_insert_method.rdiff create mode 100644 storage/innobase/mysql-test/storage_engine/tbl_opt_row_format.rdiff create mode 100644 storage/innobase/mysql-test/storage_engine/tbl_opt_union.rdiff create mode 100644 storage/innobase/mysql-test/storage_engine/trx/cons_snapshot_serializable.rdiff create mode 100644 storage/innobase/mysql-test/storage_engine/trx/level_read_committed.rdiff create mode 100644 storage/innobase/mysql-test/storage_engine/trx/level_read_uncommitted.rdiff create mode 100644 storage/innobase/mysql-test/storage_engine/trx/suite.opt create mode 100644 storage/innobase/mysql-test/storage_engine/type_blob.opt create mode 100644 storage/innobase/mysql-test/storage_engine/type_char_indexes.rdiff create mode 100644 storage/innobase/mysql-test/storage_engine/type_float_indexes.rdiff create mode 100644 storage/innobase/mysql-test/storage_engine/type_text.opt create mode 100644 storage/innobase/os/os0file.cc create mode 100644 storage/innobase/page/page0cur.cc create mode 100644 storage/innobase/page/page0page.cc create mode 100644 storage/innobase/page/page0zip.cc create mode 100644 storage/innobase/pars/lexyy.cc create mode 100755 storage/innobase/pars/make_bison.sh create mode 100755 storage/innobase/pars/make_flex.sh create mode 100644 storage/innobase/pars/pars0grm.cc create mode 100644 storage/innobase/pars/pars0grm.y create mode 100644 storage/innobase/pars/pars0lex.l create mode 100644 storage/innobase/pars/pars0opt.cc create mode 100644 storage/innobase/pars/pars0pars.cc create mode 100644 storage/innobase/pars/pars0sym.cc create mode 100644 storage/innobase/que/que0que.cc create mode 100644 storage/innobase/read/read0read.cc create mode 100644 storage/innobase/rem/rem0cmp.cc create mode 100644 storage/innobase/rem/rem0rec.cc create mode 100644 storage/innobase/row/row0ext.cc create mode 100644 storage/innobase/row/row0ftsort.cc create mode 100644 storage/innobase/row/row0import.cc create mode 100644 storage/innobase/row/row0ins.cc create mode 100644 storage/innobase/row/row0log.cc create mode 100644 storage/innobase/row/row0merge.cc create mode 100644 storage/innobase/row/row0mysql.cc create mode 100644 storage/innobase/row/row0purge.cc create mode 100644 storage/innobase/row/row0quiesce.cc create mode 100644 storage/innobase/row/row0row.cc create mode 100644 storage/innobase/row/row0sel.cc create mode 100644 storage/innobase/row/row0uins.cc create mode 100644 storage/innobase/row/row0umod.cc create mode 100644 storage/innobase/row/row0undo.cc create mode 100644 storage/innobase/row/row0upd.cc create mode 100644 storage/innobase/row/row0vers.cc create mode 100644 storage/innobase/srv/srv0mon.cc create mode 
100644 storage/innobase/srv/srv0srv.cc
create mode 100644 storage/innobase/srv/srv0start.cc
create mode 100644 storage/innobase/sync/srw_lock.cc
create mode 100644 storage/innobase/trx/trx0i_s.cc
create mode 100644 storage/innobase/trx/trx0purge.cc
create mode 100644 storage/innobase/trx/trx0rec.cc
create mode 100644 storage/innobase/trx/trx0roll.cc
create mode 100644 storage/innobase/trx/trx0rseg.cc
create mode 100644 storage/innobase/trx/trx0sys.cc
create mode 100644 storage/innobase/trx/trx0trx.cc
create mode 100644 storage/innobase/trx/trx0undo.cc
create mode 100644 storage/innobase/unittest/CMakeLists.txt
create mode 100644 storage/innobase/unittest/innodb_fts-t.cc
create mode 100644 storage/innobase/unittest/innodb_sync-t.cc
create mode 100644 storage/innobase/ut/ut0dbg.cc
create mode 100644 storage/innobase/ut/ut0list.cc
create mode 100644 storage/innobase/ut/ut0mem.cc
create mode 100644 storage/innobase/ut/ut0new.cc
create mode 100644 storage/innobase/ut/ut0rbt.cc
create mode 100644 storage/innobase/ut/ut0rnd.cc
create mode 100644 storage/innobase/ut/ut0ut.cc
create mode 100644 storage/innobase/ut/ut0vec.cc
create mode 100644 storage/innobase/ut/ut0wqueue.cc
diff --git a/storage/innobase/.clang-format-old b/storage/innobase/.clang-format-old
new file mode 100644
index 00000000..54f7b47b
--- /dev/null
+++ b/storage/innobase/.clang-format-old
@@ -0,0 +1,11 @@
+UseTab: Always
+TabWidth: 8
+IndentWidth: 8
+ContinuationIndentWidth: 8
+BreakBeforeBinaryOperators: All
+PointerAlignment: Left
+BreakBeforeBraces: Custom
+ColumnLimit: 79
+BraceWrapping:
+  AfterFunction: true
+AccessModifierOffset: -8
diff --git a/storage/innobase/CMakeLists.txt b/storage/innobase/CMakeLists.txt
new file mode 100644
index 00000000..32c0a437
--- /dev/null
+++ b/storage/innobase/CMakeLists.txt
@@ -0,0 +1,511 @@
+
+# Copyright (c) 2006, 2017, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2014, 2022, MariaDB Corporation.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA
+
+# This is the CMakeLists for InnoDB
+
+INCLUDE(CheckFunctionExists)
+INCLUDE(CheckCSourceCompiles)
+INCLUDE(CheckCSourceRuns)
+INCLUDE(numa)
+INCLUDE(TestBigEndian)
+
+MYSQL_CHECK_NUMA()
+
+INCLUDE(${MYSQL_CMAKE_SCRIPT_DIR}/compile_flags.cmake)
+
+IF(CMAKE_CROSSCOMPILING)
+  # Use CHECK_C_SOURCE_COMPILES instead of CHECK_C_SOURCE_RUNS when
+  # cross-compiling. Not as precise, but usually good enough.
+  # This only makes sense for the atomic tests in this file; this trick
+  # doesn't work in the general case.
+ MACRO(CHECK_C_SOURCE SOURCE VAR) + CHECK_C_SOURCE_COMPILES("${SOURCE}" "${VAR}") + ENDMACRO() +ELSE() + MACRO(CHECK_C_SOURCE SOURCE VAR) + CHECK_C_SOURCE_RUNS("${SOURCE}" "${VAR}") + ENDMACRO() +ENDIF() + +# OS tests +IF(UNIX) + IF(CMAKE_SYSTEM_NAME STREQUAL "Linux") + ADD_DEFINITIONS("-D_GNU_SOURCE=1") + IF(HAVE_LIBNUMA) + LINK_LIBRARIES(numa) + ENDIF() + ENDIF() +ENDIF() + +# Enable InnoDB's UNIV_DEBUG in debug builds +SET(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DUNIV_DEBUG") + +OPTION(WITH_INNODB_AHI "Include innodb_adaptive_hash_index" ON) +OPTION(WITH_INNODB_ROOT_GUESS "Cache index root block descriptors" ON) +IF(WITH_INNODB_AHI) + ADD_DEFINITIONS(-DBTR_CUR_HASH_ADAPT -DBTR_CUR_ADAPT) + IF(NOT WITH_INNODB_ROOT_GUESS) + MESSAGE(WARNING "WITH_INNODB_AHI implies WITH_INNODB_ROOT_GUESS") + SET(WITH_INNODB_ROOT_GUESS ON) + ENDIF() +ELSEIF(WITH_INNODB_ROOT_GUESS) + ADD_DEFINITIONS(-DBTR_CUR_ADAPT) +ENDIF() +ADD_FEATURE_INFO(INNODB_AHI WITH_INNODB_AHI "InnoDB Adaptive Hash Index") +ADD_FEATURE_INFO(INNODB_ROOT_GUESS WITH_INNODB_ROOT_GUESS + "Cache index root block descriptors in InnoDB") + +OPTION(WITH_INNODB_EXTRA_DEBUG "Enable extra InnoDB debug checks" OFF) +IF(WITH_INNODB_EXTRA_DEBUG) + ADD_DEFINITIONS(-DUNIV_ZIP_DEBUG) +ENDIF() +ADD_FEATURE_INFO(INNODB_EXTRA_DEBUG WITH_INNODB_EXTRA_DEBUG "Extra InnoDB debug checks") + +IF(HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE) + ADD_DEFINITIONS(-DHAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE=1) +ENDIF() + +IF (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR + CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wconversion -Wno-sign-conversion") + SET_SOURCE_FILES_PROPERTIES(fts/fts0pars.cc + PROPERTIES COMPILE_FLAGS -Wno-conversion) +ENDIF() + +IF(NOT MSVC) + # Work around MDEV-18417, MDEV-18656, MDEV-18417 + IF(WITH_ASAN AND CMAKE_COMPILER_IS_GNUCC AND + CMAKE_C_COMPILER_VERSION VERSION_LESS "6.0.0") + SET_SOURCE_FILES_PROPERTIES(trx/trx0rec.cc PROPERTIES COMPILE_FLAGS -O1) + ENDIF() +ENDIF(NOT MSVC) + +CHECK_FUNCTION_EXISTS(vasprintf HAVE_VASPRINTF) + +# Include directories under innobase +INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/storage/innobase/include + ${CMAKE_SOURCE_DIR}/storage/innobase/handler) + +# Sun Studio bug with -xO2 +IF(CMAKE_CXX_COMPILER_ID MATCHES "SunPro" + AND CMAKE_CXX_FLAGS_RELEASE MATCHES "O2" + AND NOT CMAKE_BUILD_TYPE STREQUAL "Debug") + # Sun Studio 12 crashes with -xO2 flag, but not with higher optimization + # -xO3 + SET_SOURCE_FILES_PROPERTIES(${CMAKE_CURRENT_SOURCE_DIR}/rem/rem0rec.cc + PROPERTIES COMPILE_FLAGS -xO3) +ENDIF() + + +IF(MSVC) + # Avoid "unreferenced label" warning in generated file + GET_FILENAME_COMPONENT(_SRC_DIR ${CMAKE_CURRENT_LIST_FILE} PATH) + SET_SOURCE_FILES_PROPERTIES(${_SRC_DIR}/pars/pars0grm.c + PROPERTIES COMPILE_FLAGS "/wd4102") + SET_SOURCE_FILES_PROPERTIES(${_SRC_DIR}/pars/lexyy.c + PROPERTIES COMPILE_FLAGS "/wd4003") +ENDIF() + +# Include directories under innobase +INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/storage/innobase/include + ${CMAKE_SOURCE_DIR}/storage/innobase/handler + ${CMAKE_SOURCE_DIR}/libbinlogevents/include) +INCLUDE_DIRECTORIES(${PROJECT_SOURCE_DIR}/tpool) + +SET(INNOBASE_SOURCES + btr/btr0btr.cc + btr/btr0bulk.cc + btr/btr0cur.cc + btr/btr0pcur.cc + btr/btr0sea.cc + btr/btr0defragment.cc + buf/buf0block_hint.cc + buf/buf0buddy.cc + buf/buf0buf.cc + buf/buf0dblwr.cc + buf/buf0checksum.cc + buf/buf0dump.cc + buf/buf0flu.cc + buf/buf0lru.cc + buf/buf0rea.cc + data/data0data.cc + data/data0type.cc + dict/dict0boot.cc + dict/dict0crea.cc + dict/dict0dict.cc + 
dict/dict0load.cc + dict/dict0mem.cc + dict/dict0stats.cc + dict/dict0stats_bg.cc + dict/dict0defrag_bg.cc + dict/drop.cc + eval/eval0eval.cc + eval/eval0proc.cc + fil/fil0fil.cc + fil/fil0pagecompress.cc + fil/fil0crypt.cc + fsp/fsp0fsp.cc + fsp/fsp0file.cc + fsp/fsp0space.cc + fsp/fsp0sysspace.cc + fut/fut0lst.cc + ha/ha0storage.cc + fts/fts0fts.cc + fts/fts0ast.cc + fts/fts0blex.cc + fts/fts0config.cc + fts/fts0opt.cc + fts/fts0pars.cc + fts/fts0que.cc + fts/fts0sql.cc + fts/fts0tlex.cc + gis/gis0geo.cc + gis/gis0rtree.cc + gis/gis0sea.cc + fts/fts0plugin.cc + handler/ha_innodb.cc + handler/handler0alter.cc + handler/i_s.cc + ibuf/ibuf0ibuf.cc + include/btr0btr.h + include/btr0btr.inl + include/btr0bulk.h + include/btr0cur.h + include/btr0cur.inl + include/btr0defragment.h + include/btr0pcur.h + include/btr0pcur.inl + include/btr0sea.h + include/btr0sea.inl + include/btr0types.h + include/buf0buddy.h + include/buf0buf.h + include/buf0buf.inl + include/buf0checksum.h + include/buf0dblwr.h + include/buf0dump.h + include/buf0flu.h + include/buf0lru.h + include/buf0rea.h + include/buf0types.h + include/data0data.h + include/data0data.inl + include/data0type.h + include/data0type.inl + include/data0types.h + include/db0err.h + include/dict0boot.h + include/dict0crea.h + include/dict0crea.inl + include/dict0defrag_bg.h + include/dict0dict.h + include/dict0dict.inl + include/dict0load.h + include/dict0mem.h + include/dict0mem.inl + include/dict0pagecompress.h + include/dict0pagecompress.inl + include/dict0stats.h + include/dict0stats.inl + include/dict0stats_bg.h + include/dict0types.h + include/dyn0buf.h + include/dyn0types.h + include/eval0eval.h + include/eval0eval.inl + include/eval0proc.h + include/eval0proc.inl + include/fil0crypt.h + include/fil0crypt.inl + include/fil0fil.h + include/fil0pagecompress.h + include/fsp0file.h + include/fsp0fsp.h + include/fsp0space.h + include/fsp0sysspace.h + include/fsp0types.h + include/fts0ast.h + include/fts0blex.h + include/fts0fts.h + include/fts0opt.h + include/fts0pars.h + include/fts0plugin.h + include/fts0priv.h + include/fts0priv.inl + include/fts0tlex.h + include/fts0tokenize.h + include/fts0types.h + include/fts0types.inl + include/fts0vlc.h + include/fut0lst.h + include/gis0geo.h + include/gis0rtree.h + include/gis0rtree.inl + include/gis0type.h + include/ha_prototypes.h + include/ha0ha.h + include/ha0ha.inl + include/ha0storage.h + include/ha0storage.inl + include/handler0alter.h + include/hash0hash.h + include/ibuf0ibuf.h + include/ibuf0ibuf.inl + include/lock0iter.h + include/lock0lock.h + include/lock0lock.inl + include/lock0prdt.h + include/lock0priv.h + include/lock0priv.inl + include/lock0types.h + include/log0crypt.h + include/log0log.h + include/log0recv.h + include/log0types.h + include/mach0data.h + include/mach0data.inl + include/mem0mem.h + include/mem0mem.inl + include/mtr0log.h + include/mtr0mtr.h + include/mtr0types.h + include/os0file.h + include/os0file.inl + include/page0cur.h + include/page0cur.inl + include/page0page.h + include/page0page.inl + include/page0types.h + include/page0zip.h + include/page0zip.inl + include/pars0grm.h + include/pars0opt.h + include/pars0pars.h + include/pars0sym.h + include/pars0types.h + include/que0que.h + include/que0que.inl + include/que0types.h + include/read0types.h + include/rem0cmp.h + include/rem0rec.h + include/rem0rec.inl + include/rem0types.h + include/row0ext.h + include/row0ext.inl + include/row0ftsort.h + include/row0import.h + include/row0ins.h + include/row0log.h + 
include/row0merge.h + include/row0mysql.h + include/row0purge.h + include/row0quiesce.h + include/row0row.h + include/row0row.inl + include/row0sel.h + include/row0types.h + include/row0uins.h + include/row0umod.h + include/row0undo.h + include/row0upd.h + include/row0upd.inl + include/row0vers.h + include/rw_lock.h + include/small_vector.h + include/srv0mon.h + include/srv0mon.inl + include/srv0srv.h + include/srv0start.h + include/srw_lock.h + include/sux_lock.h + include/transactional_lock_guard.h + include/trx0i_s.h + include/trx0purge.h + include/trx0rec.h + include/trx0roll.h + include/trx0rseg.h + include/trx0sys.h + include/trx0trx.h + include/trx0trx.inl + include/trx0types.h + include/trx0undo.h + include/trx0undo.inl + include/trx0xa.h + include/univ.i + include/ut0byte.h + include/ut0byte.inl + include/ut0counter.h + include/ut0dbg.h + include/ut0list.h + include/ut0list.inl + include/ut0lst.h + include/ut0mem.h + include/ut0mem.inl + include/ut0new.h + include/ut0pool.h + include/ut0rbt.h + include/ut0rnd.h + include/ut0rnd.inl + include/ut0sort.h + include/ut0stage.h + include/ut0ut.h + include/ut0ut.inl + include/ut0vec.h + include/ut0vec.inl + include/ut0wqueue.h + lock/lock0iter.cc + lock/lock0prdt.cc + lock/lock0lock.cc + log/log0log.cc + log/log0recv.cc + log/log0crypt.cc + log/log0sync.cc + mem/mem0mem.cc + mtr/mtr0mtr.cc + os/os0file.cc + page/page0cur.cc + page/page0page.cc + page/page0zip.cc + pars/lexyy.cc + pars/pars0grm.cc + pars/pars0opt.cc + pars/pars0pars.cc + pars/pars0sym.cc + que/que0que.cc + read/read0read.cc + rem/rem0cmp.cc + rem/rem0rec.cc + row/row0ext.cc + row/row0ftsort.cc + row/row0import.cc + row/row0ins.cc + row/row0merge.cc + row/row0mysql.cc + row/row0log.cc + row/row0purge.cc + row/row0row.cc + row/row0sel.cc + row/row0uins.cc + row/row0umod.cc + row/row0undo.cc + row/row0upd.cc + row/row0quiesce.cc + row/row0vers.cc + srv/srv0mon.cc + srv/srv0srv.cc + srv/srv0start.cc + sync/srw_lock.cc + trx/trx0i_s.cc + trx/trx0purge.cc + trx/trx0rec.cc + trx/trx0roll.cc + trx/trx0rseg.cc + trx/trx0sys.cc + trx/trx0trx.cc + trx/trx0undo.cc + ut/ut0dbg.cc + ut/ut0list.cc + ut/ut0mem.cc + ut/ut0new.cc + ut/ut0rbt.cc + ut/ut0rnd.cc + ut/ut0ut.cc + ut/ut0vec.cc + ut/ut0wqueue.cc) + +OPTION(WITH_PMEM "Support redo log in persistent memory" OFF) +FIND_PACKAGE(PMEM) +IF(PMEM_FOUND) + INCLUDE_DIRECTORIES(${PMEM_INCLUDES}) + ADD_COMPILE_FLAGS(log/log0log.cc log/log0recv.cc + buf/buf0flu.cc mtr/mtr0mtr.cc trx/trx0trx.cc srv/srv0start.cc + COMPILE_FLAGS "-DHAVE_PMEM") + SET(PMEM_LIBRARY ${PMEM_LIBRARIES}) +ELSE() + IF(WITH_PMEM) + MESSAGE(FATAL_ERROR "WITH_PMEM=ON cannot be satisfied") + ENDIF() +ENDIF() + +MYSQL_ADD_PLUGIN(innobase ${INNOBASE_SOURCES} STORAGE_ENGINE + MODULE_OUTPUT_NAME ha_innodb + DEFAULT RECOMPILE_FOR_EMBEDDED + LINK_LIBRARIES + ${ZLIB_LIBRARY} + ${PMEM_LIBRARY} + ${NUMA_LIBRARY} + ${LIBSYSTEMD} + ${LINKER_SCRIPT}) + +IF(NOT TARGET innobase) + RETURN() +ENDIF() + +ADD_DEFINITIONS(${SSL_DEFINES} ${TPOOL_DEFINES}) + +# A GCC bug causes crash when compiling these files on ARM64 with -O1+ +# Compile them with -O0 as a workaround. 
+IF(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64"
+   AND CMAKE_C_COMPILER_VERSION VERSION_LESS "5.2.0")
+  ADD_COMPILE_FLAGS(
+    btr/btr0btr.cc
+    btr/btr0cur.cc
+    buf/buf0buf.cc
+    fts/fts0fts.cc
+    gis/gis0sea.cc
+    handler/handler0alter.cc
+    mtr/mtr0mtr.cc
+    row/row0merge.cc
+    row/row0mysql.cc
+    srv/srv0srv.cc
+    COMPILE_FLAGS "-O0"
+  )
+ENDIF()
+
+# Older gcc versions insist on the -mhtm flag for including the
+# htmxlintrin.h header. This is also true for newer gcc versions,
+# like 11.2.0 in Debian Sid.
+# On s390x, because the high-level intrinsics are defined as
+# not-inline in the header file, the header can only be included
+# by one source file, which must have -mhtm enabled.
+IF(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64|powerpc64|s390x"
+   OR CMAKE_SYSTEM_NAME MATCHES "AIX")
+  ADD_COMPILE_FLAGS(
+    sync/srw_lock.cc
+    COMPILE_FLAGS "-mhtm"
+  )
+ENDIF()
+IF(MSVC)
+  IF(CMAKE_SIZEOF_VOID_P EQUAL 8)
+    ADD_COMPILE_FLAGS(
+      pars/lexyy.cc
+      COMPILE_FLAGS "/wd4267")
+  ENDIF()
+  # Silence "switch statement contains 'default' but no 'case' label"
+  # on a generated file.
+  TARGET_COMPILE_OPTIONS(innobase PRIVATE "/wd4065")
+ENDIF()
+
+IF(NOT (PLUGIN_INNOBASE STREQUAL DYNAMIC))
+  TARGET_LINK_LIBRARIES(innobase tpool mysys)
+  ADD_SUBDIRECTORY(${CMAKE_SOURCE_DIR}/extra/mariabackup ${CMAKE_BINARY_DIR}/extra/mariabackup)
+ENDIF()
+
+IF(WITH_UNIT_TESTS)
+  ADD_SUBDIRECTORY(unittest)
+ENDIF()
diff --git a/storage/innobase/COPYING.Google b/storage/innobase/COPYING.Google
new file mode 100644
index 00000000..5ade2b0e
--- /dev/null
+++ b/storage/innobase/COPYING.Google
@@ -0,0 +1,30 @@
+Portions of this software contain modifications contributed by Google, Inc.
+These contributions are used with the following license:
+
+Copyright (c) 2008, Google Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+ * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following
+   disclaimer in the documentation and/or other materials
+   provided with the distribution.
+ * Neither the name of the Google Inc. nor the names of its
+   contributors may be used to endorse or promote products
+   derived from this software without specific prior written
+   permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/storage/innobase/COPYING.Percona b/storage/innobase/COPYING.Percona
new file mode 100644
index 00000000..8c786811
--- /dev/null
+++ b/storage/innobase/COPYING.Percona
@@ -0,0 +1,30 @@
+Portions of this software contain modifications contributed by Percona, Inc.
+These contributions are used with the following license: + +Copyright (c) 2008, 2009, Percona Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials + provided with the distribution. + * Neither the name of the Percona Inc. nor the names of its + contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/storage/innobase/btr/btr0btr.cc b/storage/innobase/btr/btr0btr.cc new file mode 100644 index 00000000..08be1991 --- /dev/null +++ b/storage/innobase/btr/btr0btr.cc @@ -0,0 +1,5433 @@ +/***************************************************************************** + +Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. +Copyright (c) 2014, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file btr/btr0btr.cc +The B-tree + +Created 6/2/1994 Heikki Tuuri +*******************************************************/ + +#include "btr0btr.h" + +#include "page0page.h" +#include "page0zip.h" +#include "gis0rtree.h" + +#include "btr0cur.h" +#include "btr0sea.h" +#include "btr0pcur.h" +#include "btr0defragment.h" +#include "rem0cmp.h" +#include "lock0lock.h" +#include "ibuf0ibuf.h" +#include "trx0trx.h" +#include "srv0mon.h" +#include "gis0geo.h" +#include "dict0boot.h" +#include "row0sel.h" /* row_search_max_autoinc() */ +#include "log.h" + +/**************************************************************//** +Checks if the page in the cursor can be merged with given page. +If necessary, re-organize the merge_page. 
+@return true if possible to merge. */
+static
+bool
+btr_can_merge_with_page(
+/*====================*/
+  btr_cur_t*    cursor,       /*!< in: cursor on the page to merge */
+  uint32_t      page_no,      /*!< in: a sibling page */
+  buf_block_t** merge_block,  /*!< out: the merge block */
+  mtr_t*        mtr);         /*!< in: mini-transaction */
+
+/*
+Latching strategy of the InnoDB B-tree
+--------------------------------------
+
+Acquisition of node pointer page latches is protected by the index->lock latch.
+
+Before MariaDB 10.2.2, all node pointer pages were protected by index->lock
+either in S (shared) or X (exclusive) mode and block->lock was not acquired on
+node pointer pages.
+
+After MariaDB 10.2.2, a block->lock S-latch or X-latch is used to protect
+node pointer pages, and acquisition of node pointer page latches is protected
+by index->lock.
+
+(0) Definition: B-tree level.
+
+(0.1) The leaf pages of the B-tree are at level 0.
+
+(0.2) The parent of a page at level L has level L+1. (The level of the
+root page is equal to the tree height.)
+
+(0.3) The B-tree lock (index->lock) is the parent of the root page and
+has a level = tree height + 1.
+
+Index->lock has 3 possible locking modes:
+
+(1) S-latch:
+
+(1.1) All latches for pages must be obtained in descending order of tree level.
+
+(1.2) Before obtaining the first node pointer page latch at a given B-tree
+level, the parent latch (at level + 1) must be held.
+
+(1.3) If a node pointer page is already latched at a given level,
+we can only obtain the latch on its right sibling page at the same level.
+
+(1.4) Release of the node pointer page latches must be done in
+child-to-parent order. (This prevents deadlocks when index->lock
+is obtained in SX mode.)
+
+(1.4.1) A level L node pointer page latch can be released only when
+no latches are held at child levels, i.e. at levels < L.
+
+(1.4.2) All node pointer page latches must be released so
+that no latches are obtained in between.
+
+(1.5) [implied by (1.1), (1.2)] The root page latch must be the first
+node pointer latch obtained.
+
+(2) SX-latch:
+
+In this case, rules (1.2) and (1.3) from the S-latch case are relaxed and
+merged into (2.2), and rule (1.4) is removed. Thus, latch acquisition
+can be skipped at some tree levels and latches can be obtained in
+a less restricted order.
+
+(2.1) [identical to (1.1)]: All latches for pages must be obtained in
+descending order of tree level.
+
+(2.2) When a node pointer latch at level L is obtained,
+the left sibling page latch at the same level or some ancestor
+page latch (at a level > L) must be held.
+
+(2.3) [implied by (2.1), (2.2)] The first node pointer page latch obtained
+can be on any node pointer page.
+
+(3) X-latch:
+
+Node pointer latches can be obtained in any order.
+
+NOTE: The new rules introduced in MariaDB 10.2.2 do not affect the latching
+rules of leaf pages:
+
+The index->lock S-latch is needed during reads for the node pointer
+traversal. When the leaf level is reached, index->lock can be released
+(and, with the MariaDB 10.2.2 changes, all node pointer latches).
+Left-to-right index traversal at the leaf page level can be done safely
+by obtaining the right sibling leaf page latch and then releasing the
+old page latch.
+
+Single leaf page modifications (BTR_MODIFY_LEAF) are protected by an
+index->lock S-latch.
+
+B-tree operations involving page splits or merges (BTR_MODIFY_TREE) and page
+allocations are protected by an index->lock X-latch.
+
+Node pointers
+-------------
+Leaf pages of a B-tree contain the index records stored in the
+tree.
On levels n > 0 we store 'node pointers' to pages on level
+n - 1. For each page there is exactly one node pointer stored:
+thus our tree is an ordinary B-tree, not a B-link tree.
+
+A node pointer contains a prefix P of an index record. The prefix
+is long enough so that it determines an index record uniquely.
+The file page number of the child page is added as the last
+field. To the child page we can store node pointers or index records
+which are >= P in the alphabetical order, but < P1 if there is
+a next node pointer on the level, and P1 is its prefix.
+
+If a node pointer with a prefix P points to a non-leaf child,
+then the leftmost record in the child must have the same
+prefix P. If it points to a leaf node, the child is not required
+to contain any record with a prefix equal to P. The leaf case
+is decided this way to allow arbitrary deletions in a leaf node
+without touching upper levels of the tree.
+
+We have predefined a special minimum record which we
+define as the smallest record in any alphabetical order.
+A minimum record is denoted by setting a bit in the record
+header. A minimum record acts as the prefix of a node pointer
+which points to a leftmost node on any level of the tree.
+
+File page allocation
+--------------------
+In the root node of a B-tree there are two file segment headers.
+The leaf pages of a tree are allocated from one file segment, to
+make them consecutive on disk if possible. From the other file segment
+we allocate pages for the non-leaf levels of the tree.
+*/
+
+/** Check a file segment header within a B-tree root page.
+@param offset file segment header offset
+@param block B-tree root page
+@param space tablespace
+@return whether the segment header is valid */
+static bool btr_root_fseg_validate(ulint offset,
+                                   const buf_block_t &block,
+                                   const fil_space_t &space)
+{
+  ut_ad(block.page.id().space() == space.id);
+  const uint16_t hdr= mach_read_from_2(offset + FSEG_HDR_OFFSET +
+                                       block.page.frame);
+  if (FIL_PAGE_DATA <= hdr && hdr <= srv_page_size - FIL_PAGE_DATA_END &&
+      mach_read_from_4(block.page.frame + offset + FSEG_HDR_SPACE) == space.id)
+    return true;
+  sql_print_error("InnoDB: Index root page " UINT32PF " in %s is corrupted "
+                  "at " ULINTPF,
+                  block.page.id().page_no(),
+                  UT_LIST_GET_FIRST(space.chain)->name);
+  return false;
+}
+
+/** Report a decryption failure. */
+ATTRIBUTE_COLD void btr_decryption_failed(const dict_index_t &index)
+{
+  ib_push_warning(static_cast<void*>(nullptr), DB_DECRYPTION_FAILED,
+                  "Table %s is encrypted but encryption service or"
+                  " used key_id is not available. "
+                  " Can't continue reading table.",
+                  index.table->name.m_name);
+  index.table->file_unreadable= true;
+}
+
+/** Get an index page and declare its latching order level.
+@param[in] index index tree +@param[in] page page number +@param[in] mode latch mode +@param[in] merge whether change buffer merge should be attempted +@param[in,out] mtr mini-transaction +@param[out] err error code +@return block */ +buf_block_t *btr_block_get(const dict_index_t &index, + uint32_t page, rw_lock_type_t mode, bool merge, + mtr_t *mtr, dberr_t *err) +{ + ut_ad(mode != RW_NO_LATCH); + dberr_t local_err; + if (!err) + err= &local_err; + buf_block_t *block= + buf_page_get_gen(page_id_t{index.table->space->id, page}, + index.table->space->zip_size(), mode, nullptr, BUF_GET, + mtr, err, merge && !index.is_clust()); + ut_ad(!block == (*err != DB_SUCCESS)); + + if (UNIV_LIKELY(block != nullptr)) + { + if (!!page_is_comp(block->page.frame) != index.table->not_redundant() || + btr_page_get_index_id(block->page.frame) != index.id || + !fil_page_index_page_check(block->page.frame) || + index.is_spatial() != + (fil_page_get_type(block->page.frame) == FIL_PAGE_RTREE)) + { + *err= DB_PAGE_CORRUPTED; + block= nullptr; + } + } + else if (*err == DB_DECRYPTION_FAILED) + btr_decryption_failed(index); + + return block; +} + +/**************************************************************//** +Gets the root node of a tree and x- or s-latches it. +@return root page, x- or s-latched */ +buf_block_t* +btr_root_block_get( +/*===============*/ + dict_index_t* index, /*!< in: index tree */ + rw_lock_type_t mode, /*!< in: either RW_S_LATCH + or RW_X_LATCH */ + mtr_t* mtr, /*!< in: mtr */ + dberr_t* err) /*!< out: error code */ +{ + if (!index->table || !index->table->space) + { + *err= DB_TABLESPACE_NOT_FOUND; + return nullptr; + } + + buf_block_t *block; +#ifndef BTR_CUR_ADAPT + static constexpr buf_block_t *guess= nullptr; +#else + buf_block_t *&guess= btr_search_get_info(index)->root_guess; + guess= +#endif + block= + buf_page_get_gen(page_id_t{index->table->space->id, index->page}, + index->table->space->zip_size(), mode, guess, BUF_GET, + mtr, err, false); + ut_ad(!block == (*err != DB_SUCCESS)); + + if (UNIV_LIKELY(block != nullptr)) + { + if (UNIV_UNLIKELY(mode == RW_NO_LATCH)); + else if (!!page_is_comp(block->page.frame) != + index->table->not_redundant() || + btr_page_get_index_id(block->page.frame) != index->id || + !fil_page_index_page_check(block->page.frame) || + index->is_spatial() != + (fil_page_get_type(block->page.frame) == FIL_PAGE_RTREE)) + { + *err= DB_PAGE_CORRUPTED; + block= nullptr; + } + else if (index->is_ibuf()); + else if (!btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF, + *block, *index->table->space) || + !btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP, + *block, *index->table->space)) + { + *err= DB_CORRUPTION; + block= nullptr; + } + } + else if (*err == DB_DECRYPTION_FAILED) + btr_decryption_failed(*index); + + return block; +} + +/**************************************************************//** +Gets the root node of a tree and sx-latches it for segment access. +@return root page, sx-latched */ +static +page_t* +btr_root_get( +/*=========*/ + dict_index_t* index, /*!< in: index tree */ + mtr_t* mtr, /*!< in: mtr */ + dberr_t* err) /*!< out: error code */ +{ + /* Intended to be used for accessing file segment lists. + Concurrent read of other data is allowed. 
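+
+  A typical caller pattern, in outline (sketch; error handling elided,
+  assuming an active mini-transaction):
+
+    dberr_t err;
+    if (page_t *root= btr_root_get(index, mtr, &err))
+      fseg_header_t *seg= PAGE_HEADER + PAGE_BTR_SEG_TOP + root;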
*/ + if (buf_block_t *root= btr_root_block_get(index, RW_SX_LATCH, mtr, err)) + return root->page.frame; + return nullptr; +} + +/**************************************************************//** +Checks a file segment header within a B-tree root page and updates +the segment header space id. +@return TRUE if valid */ +static +bool +btr_root_fseg_adjust_on_import( +/*===========================*/ + fseg_header_t* seg_header, /*!< in/out: segment header */ + page_zip_des_t* page_zip, /*!< in/out: compressed page, + or NULL */ + ulint space) /*!< in: tablespace identifier */ +{ + ulint offset = mach_read_from_2(seg_header + FSEG_HDR_OFFSET); + + if (offset < FIL_PAGE_DATA + || offset > srv_page_size - FIL_PAGE_DATA_END) { + return false; + } + + seg_header += FSEG_HDR_SPACE; + + mach_write_to_4(seg_header, space); + if (UNIV_LIKELY_NULL(page_zip)) { + memcpy(page_zip->data + page_offset(seg_header), seg_header, + 4); + } + + return true; +} + +/**************************************************************//** +Checks and adjusts the root node of a tree during IMPORT TABLESPACE. +@return error code, or DB_SUCCESS */ +dberr_t +btr_root_adjust_on_import( +/*======================*/ + const dict_index_t* index) /*!< in: index tree */ +{ + dberr_t err; + mtr_t mtr; + page_t* page; + page_zip_des_t* page_zip; + dict_table_t* table = index->table; + + DBUG_EXECUTE_IF("ib_import_trigger_corruption_3", + return(DB_CORRUPTION);); + + mtr_start(&mtr); + + mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO); + + buf_block_t* block = buf_page_get_gen( + page_id_t(table->space->id, index->page), + table->space->zip_size(), RW_X_LATCH, NULL, BUF_GET, + &mtr, &err); + if (!block) { + ut_ad(err != DB_SUCCESS); + goto func_exit; + } + + page = buf_block_get_frame(block); + page_zip = buf_block_get_page_zip(block); + + if (!fil_page_index_page_check(page) || page_has_siblings(page)) { + err = DB_CORRUPTION; + + } else if (dict_index_is_clust(index)) { + bool page_is_compact_format; + + page_is_compact_format = page_is_comp(page) > 0; + + /* Check if the page format and table format agree. */ + if (page_is_compact_format != dict_table_is_comp(table)) { + err = DB_CORRUPTION; + } else { + /* Check that the table flags and the tablespace + flags match. */ + uint32_t tf = dict_tf_to_fsp_flags(table->flags); + uint32_t sf = table->space->flags; + sf &= ~FSP_FLAGS_MEM_MASK; + tf &= ~FSP_FLAGS_MEM_MASK; + if (fil_space_t::is_flags_equal(tf, sf) + || fil_space_t::is_flags_equal(sf, tf)) { + mysql_mutex_lock(&fil_system.mutex); + table->space->flags = (table->space->flags + & ~FSP_FLAGS_MEM_MASK) + | (tf & FSP_FLAGS_MEM_MASK); + mysql_mutex_unlock(&fil_system.mutex); + err = DB_SUCCESS; + } else { + err = DB_CORRUPTION; + } + } + } else { + err = DB_SUCCESS; + } + + /* Check and adjust the file segment headers, if all OK so far. */ + if (err == DB_SUCCESS + && (!btr_root_fseg_adjust_on_import( + FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF + + page, page_zip, table->space_id) + || !btr_root_fseg_adjust_on_import( + FIL_PAGE_DATA + PAGE_BTR_SEG_TOP + + page, page_zip, table->space_id))) { + + err = DB_CORRUPTION; + } + +func_exit: + mtr_commit(&mtr); + + return(err); +} + +/**************************************************************//** +Creates a new index page (not the root, and also not +used in page reorganization). @see btr_page_empty(). 
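+
+A typical allocate-then-create sequence during a page split, in outline
+(simplified sketch of what btr_root_raise_and_insert() does below):
+
+  dberr_t err;
+  buf_block_t* new_block= btr_page_alloc(index, 0, FSP_NO_DIR, level,
+                                         mtr, mtr, &err);
+  if (new_block)
+    btr_page_create(new_block, buf_block_get_page_zip(new_block), index,
+                    level, mtr);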
*/
+void
+btr_page_create(
+/*============*/
+	buf_block_t*	block,	/*!< in/out: page to be created */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
+	dict_index_t*	index,	/*!< in: index */
+	ulint		level,	/*!< in: the B-tree level of the page */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+  ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+  byte *index_id= my_assume_aligned<2>(PAGE_HEADER + PAGE_INDEX_ID +
+                                       block->page.frame);
+
+  if (UNIV_LIKELY_NULL(page_zip))
+  {
+    mach_write_to_8(index_id, index->id);
+    page_create_zip(block, index, level, 0, mtr);
+  }
+  else
+  {
+    page_create(block, mtr, dict_table_is_comp(index->table));
+    if (index->is_spatial())
+    {
+      static_assert(((FIL_PAGE_INDEX & 0xff00) | byte(FIL_PAGE_RTREE)) ==
+                    FIL_PAGE_RTREE, "compatibility");
+      mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->page.frame,
+                    byte(FIL_PAGE_RTREE));
+      if (mach_read_from_8(block->page.frame + FIL_RTREE_SPLIT_SEQ_NUM))
+        mtr->memset(block, FIL_RTREE_SPLIT_SEQ_NUM, 8, 0);
+    }
+    /* Set the level of the new index page */
+    mtr->write<2,mtr_t::MAYBE_NOP>(*block,
+                                   my_assume_aligned<2>(PAGE_HEADER +
+                                                        PAGE_LEVEL +
+                                                        block->page.frame),
+                                   level);
+    mtr->write<8,mtr_t::MAYBE_NOP>(*block, index_id, index->id);
+  }
+}
+
+buf_block_t *
+mtr_t::get_already_latched(const page_id_t id, mtr_memo_type_t type) const
+{
+  ut_ad(is_active());
+  ut_ad(type == MTR_MEMO_PAGE_X_FIX || type == MTR_MEMO_PAGE_SX_FIX ||
+        type == MTR_MEMO_PAGE_S_FIX);
+  for (ulint i= 0; i < m_memo.size(); i++)
+  {
+    const mtr_memo_slot_t &slot= m_memo[i];
+    const auto slot_type= mtr_memo_type_t(slot.type & ~MTR_MEMO_MODIFY);
+    if (slot_type == MTR_MEMO_PAGE_X_FIX || slot_type == type)
+    {
+      buf_block_t *block= static_cast<buf_block_t*>(slot.object);
+      if (block->page.id() == id)
+        return block;
+    }
+  }
+  return nullptr;
+}
+
+/** Fetch an index root page that was already latched in the
+mini-transaction. */
+static buf_block_t *btr_get_latched_root(const dict_index_t &index, mtr_t *mtr)
+{
+  return mtr->get_already_latched(page_id_t{index.table->space_id, index.page},
+                                  MTR_MEMO_PAGE_SX_FIX);
+}
+
+/** Fetch an index page that should have been already latched in the
+mini-transaction. */
+static buf_block_t *
+btr_block_reget(mtr_t *mtr, const dict_index_t &index,
+                const page_id_t id, dberr_t *err)
+{
+  if (buf_block_t *block= mtr->get_already_latched(id, MTR_MEMO_PAGE_X_FIX))
+  {
+    *err= DB_SUCCESS;
+    return block;
+  }
+
+  ut_ad(mtr->memo_contains_flagged(&index.lock, MTR_MEMO_X_LOCK));
+  return btr_block_get(index, id.page_no(), RW_X_LATCH, true, mtr, err);
+}
+
+/**************************************************************//**
+Allocates a new file page to be used in an ibuf tree. Takes the page from
+the free list of the tree, which must contain pages!
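+In outline, the first free page number is read from the list base node
+in the latched root page (a sketch of what the function body does):
+
+  uint32_t first= mach_read_from_4(PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST +
+                                   FLST_FIRST + FIL_ADDR_PAGE +
+                                   root->page.frame);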
+@return new allocated block, x-latched */ +static +buf_block_t* +btr_page_alloc_for_ibuf( +/*====================*/ + dict_index_t* index, /*!< in: index tree */ + mtr_t* mtr, /*!< in: mtr */ + dberr_t* err) /*!< out: error code */ +{ + buf_block_t *root= btr_get_latched_root(*index, mtr); + if (UNIV_UNLIKELY(!root)) + return root; + buf_block_t *new_block= + buf_page_get_gen(page_id_t(IBUF_SPACE_ID, + mach_read_from_4(PAGE_HEADER + + PAGE_BTR_IBUF_FREE_LIST + + FLST_FIRST + FIL_ADDR_PAGE + + root->page.frame)), + 0, RW_X_LATCH, nullptr, BUF_GET, mtr, err); + if (new_block) + *err= flst_remove(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, new_block, + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, mtr); + ut_d(if (*err == DB_SUCCESS) + flst_validate(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr)); + return new_block; +} + +/**************************************************************//** +Allocates a new file page to be used in an index tree. NOTE: we assume +that the caller has made the reservation for free extents! +@retval NULL if no page could be allocated */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +buf_block_t* +btr_page_alloc_low( +/*===============*/ + dict_index_t* index, /*!< in: index */ + uint32_t hint_page_no, /*!< in: hint of a good page */ + byte file_direction, /*!< in: direction where a possible + page split is made */ + ulint level, /*!< in: level where the page is placed + in the tree */ + mtr_t* mtr, /*!< in/out: mini-transaction + for the allocation */ + mtr_t* init_mtr, /*!< in/out: mtr or another + mini-transaction in which the + page should be initialized. */ + dberr_t* err) /*!< out: error code */ +{ + const auto savepoint= mtr->get_savepoint(); + buf_block_t *root= btr_root_block_get(index, RW_NO_LATCH, mtr, err); + if (UNIV_UNLIKELY(!root)) + return root; + + const bool have_latch= mtr->have_u_or_x_latch(*root); +#ifdef BTR_CUR_HASH_ADAPT + ut_ad(!have_latch || !root->index || !root->index->freed()); +#endif + mtr->rollback_to_savepoint(savepoint); + + if (!have_latch && + UNIV_UNLIKELY(!(root= btr_root_block_get(index, RW_SX_LATCH, mtr, err)))) + return root; + + fseg_header_t *seg_header= root->page.frame + + (level ? PAGE_HEADER + PAGE_BTR_SEG_TOP : PAGE_HEADER + PAGE_BTR_SEG_LEAF); + return fseg_alloc_free_page_general(seg_header, hint_page_no, file_direction, + true, mtr, init_mtr, err); +} + +/**************************************************************//** +Allocates a new file page to be used in an index tree. NOTE: we assume +that the caller has made the reservation for free extents! +@retval NULL if no page could be allocated */ +buf_block_t* +btr_page_alloc( +/*===========*/ + dict_index_t* index, /*!< in: index */ + uint32_t hint_page_no, /*!< in: hint of a good page */ + byte file_direction, /*!< in: direction where a possible + page split is made */ + ulint level, /*!< in: level where the page is placed + in the tree */ + mtr_t* mtr, /*!< in/out: mini-transaction + for the allocation */ + mtr_t* init_mtr, /*!< in/out: mini-transaction + for x-latching and initializing + the page */ + dberr_t* err) /*!< out: error code */ +{ + ut_ad(level < BTR_MAX_NODE_LEVEL); + return index->is_ibuf() + ? btr_page_alloc_for_ibuf(index, mtr, err) + : btr_page_alloc_low(index, hint_page_no, file_direction, level, + mtr, init_mtr, err); +} + +/**************************************************************//** +Frees a page used in an ibuf tree. Puts the page to the free list of the +ibuf tree. 
*/ +static +dberr_t +btr_page_free_for_ibuf( +/*===================*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: block to be freed, x-latched */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + buf_block_t *root= btr_get_latched_root(*index, mtr); + dberr_t err= + flst_add_first(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, + block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, mtr); + ut_d(if (err == DB_SUCCESS) + flst_validate(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr)); + return err; +} + +/** Free an index page. +@param[in,out] index index tree +@param[in,out] block block to be freed +@param[in,out] mtr mini-transaction +@param[in] blob whether this is freeing a BLOB page +@param[in] latched whether index->table->space->x_lock() was called +@return error code */ +dberr_t btr_page_free(dict_index_t* index, buf_block_t* block, mtr_t* mtr, + bool blob, bool space_latched) +{ + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); +#if defined BTR_CUR_HASH_ADAPT && defined UNIV_DEBUG + if (btr_search_check_marked_free_index(block)) + { + ut_ad(!blob); + ut_ad(page_is_leaf(block->page.frame)); + } +#endif + const uint32_t page{block->page.id().page_no()}; + ut_ad(index->table->space_id == block->page.id().space()); + /* The root page is freed by btr_free_root(). */ + ut_ad(page != index->page); + ut_ad(mtr->is_named_space(index->table->space)); + + /* The page gets invalid for optimistic searches: increment the frame + modify clock */ + buf_block_modify_clock_inc(block); + + /* TODO: Discard any operations for block from mtr->m_log. + The page will be freed, so previous changes to it by this + mini-transaction should not matter. */ + + if (index->is_ibuf()) + return btr_page_free_for_ibuf(index, block, mtr); + + fil_space_t *space= index->table->space; + dberr_t err; + + const auto savepoint= mtr->get_savepoint(); + if (buf_block_t *root= btr_root_block_get(index, RW_NO_LATCH, mtr, &err)) + { + const bool have_latch= mtr->have_u_or_x_latch(*root); +#ifdef BTR_CUR_HASH_ADAPT + ut_ad(!have_latch || !root->index || !root->index->freed()); +#endif + mtr->rollback_to_savepoint(savepoint); + if (have_latch || + (root= btr_root_block_get(index, RW_SX_LATCH, mtr, &err))) + err= fseg_free_page(&root->page.frame[blob || + page_is_leaf(block->page.frame) + ? PAGE_HEADER + PAGE_BTR_SEG_LEAF + : PAGE_HEADER + PAGE_BTR_SEG_TOP], + space, page, mtr, space_latched); + } + if (err == DB_SUCCESS) + buf_page_free(space, page, mtr); + + /* The page was marked free in the allocation bitmap, but it + should remain exclusively latched until mtr_t::commit() or until it + is explicitly freed from the mini-transaction. */ + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + return err; +} + +/** Set the child page number in a node pointer record. +@param[in,out] block non-leaf index page +@param[in,out] rec node pointer record in the page +@param[in] offsets rec_get_offsets(rec) +@param[in] page_no child page number +@param[in,out] mtr mini-transaction +Sets the child node file address in a node pointer. 
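+The child page number occupies the last REC_NODE_PTR_SIZE (4) bytes of
+the record payload; the assertions in the function body check this layout.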
*/ +inline void btr_node_ptr_set_child_page_no(buf_block_t *block, + rec_t *rec, const rec_offs *offsets, + ulint page_no, mtr_t *mtr) +{ + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(!page_rec_is_leaf(rec)); + ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec)); + + const ulint offs= rec_offs_data_size(offsets); + ut_ad(rec_offs_nth_size(offsets, rec_offs_n_fields(offsets) - 1) == + REC_NODE_PTR_SIZE); + + if (UNIV_LIKELY_NULL(block->page.zip.data)) + page_zip_write_node_ptr(block, rec, offs, page_no, mtr); + else + mtr->write<4>(*block, rec + offs - REC_NODE_PTR_SIZE, page_no); +} + +MY_ATTRIBUTE((nonnull(1,2,3,4),warn_unused_result)) +/************************************************************//** +Returns the child page of a node pointer and sx-latches it. +@return child page, sx-latched */ +static +buf_block_t* +btr_node_ptr_get_child( +/*===================*/ + const rec_t* node_ptr,/*!< in: node pointer */ + dict_index_t* index, /*!< in: index */ + const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ + mtr_t* mtr, /*!< in: mtr */ + dberr_t* err = nullptr) /*!< out: error code */ +{ + ut_ad(rec_offs_validate(node_ptr, index, offsets)); + ut_ad(index->table->space_id + == page_get_space_id(page_align(node_ptr))); + + return btr_block_get( + *index, btr_node_ptr_get_child_page_no(node_ptr, offsets), + RW_SX_LATCH, btr_page_get_level(page_align(node_ptr)) == 1, + mtr, err); +} + +MY_ATTRIBUTE((nonnull(2,3,4), warn_unused_result)) +/************************************************************//** +Returns the upper level node pointer to a page. It is assumed that mtr holds +an sx-latch on the tree. +@return rec_get_offsets() of the node pointer record */ +static +rec_offs* +btr_page_get_father_node_ptr_for_validate( + rec_offs* offsets,/*!< in: work area for the return value */ + mem_heap_t* heap, /*!< in: memory heap to use */ + btr_cur_t* cursor, /*!< in: cursor pointing to user record, + out: cursor on node pointer record, + its page x-latched */ + mtr_t* mtr) /*!< in: mtr */ +{ + const uint32_t page_no = btr_cur_get_block(cursor)->page.id().page_no(); + dict_index_t* index = btr_cur_get_index(cursor); + ut_ad(!dict_index_is_spatial(index)); + ut_ad(mtr->memo_contains(index->lock, MTR_MEMO_X_LOCK)); + ut_ad(dict_index_get_page(index) != page_no); + + const auto level = btr_page_get_level(btr_cur_get_page(cursor)); + + const rec_t* user_rec = btr_cur_get_rec(cursor); + ut_a(page_rec_is_user_rec(user_rec)); + + if (btr_cur_search_to_nth_level(level + 1, + dict_index_build_node_ptr(index, + user_rec, 0, + heap, level), + RW_S_LATCH, + cursor, mtr) != DB_SUCCESS) { + return nullptr; + } + + const rec_t* node_ptr = btr_cur_get_rec(cursor); + + offsets = rec_get_offsets(node_ptr, index, offsets, 0, + ULINT_UNDEFINED, &heap); + + if (btr_node_ptr_get_child_page_no(node_ptr, offsets) != page_no) { + offsets = nullptr; + } + + return(offsets); +} + +MY_ATTRIBUTE((nonnull(2,3,4), warn_unused_result)) +/** Return the node pointer to a page. 
+@param offsets work area for the return value +@param heap memory heap +@param cursor in: child page; out: node pointer to it +@param mtr mini-transaction +@return rec_get_offsets() of the node pointer record +@retval nullptr if the parent page had not been latched in mtr */ +static rec_offs *btr_page_get_parent(rec_offs *offsets, mem_heap_t *heap, + btr_cur_t *cursor, mtr_t *mtr) +{ + const uint32_t page_no= cursor->block()->page.id().page_no(); + const dict_index_t *index= cursor->index(); + ut_ad(!index->is_spatial()); + ut_ad(index->page != page_no); + + uint32_t p= index->page; + auto level= btr_page_get_level(cursor->block()->page.frame); + const dtuple_t *tuple= + dict_index_build_node_ptr(index, btr_cur_get_rec(cursor), 0, heap, level); + level++; + + ulint i; + for (i= 0; i < mtr->get_savepoint(); i++) + if (buf_block_t *block= mtr->block_at_savepoint(i)) + if (block->page.id().page_no() == p) + { + ut_ad(block->page.lock.have_u_or_x() || + (!block->page.lock.have_s() && index->lock.have_x())); + ulint up_match= 0, low_match= 0; + cursor->page_cur.block= block; + if (page_cur_search_with_match(tuple, PAGE_CUR_LE, &up_match, + &low_match, &cursor->page_cur, + nullptr)) + return nullptr; + offsets= rec_get_offsets(cursor->page_cur.rec, index, offsets, 0, + ULINT_UNDEFINED, &heap); + p= btr_node_ptr_get_child_page_no(cursor->page_cur.rec, offsets); + if (p != page_no) + { + if (btr_page_get_level(block->page.frame) == level) + return nullptr; + i= 0; // MDEV-29835 FIXME: require all pages to be latched in order! + continue; + } + ut_ad(block->page.lock.have_u_or_x()); + if (block->page.lock.have_u_not_x()) + { + /* btr_cur_t::search_leaf(BTR_MODIFY_TREE) only U-latches the + root page initially. */ + ut_ad(block->page.id().page_no() == index->page); + block->page.lock.u_x_upgrade(); + mtr->page_lock_upgrade(*block); + } + return offsets; + } + + return nullptr; +} + +/************************************************************//** +Returns the upper level node pointer to a page. It is assumed that mtr holds +an x-latch on the tree. +@return rec_get_offsets() of the node pointer record */ +static +rec_offs* +btr_page_get_father_block( +/*======================*/ + rec_offs* offsets,/*!< in: work area for the return value */ + mem_heap_t* heap, /*!< in: memory heap to use */ + mtr_t* mtr, /*!< in: mtr */ + btr_cur_t* cursor) /*!< out: cursor on node pointer record, + its page x-latched */ +{ + rec_t *rec= + page_rec_get_next(page_get_infimum_rec(cursor->block()->page.frame)); + if (UNIV_UNLIKELY(!rec)) + return nullptr; + cursor->page_cur.rec= rec; + return btr_page_get_parent(offsets, heap, cursor, mtr); +} + +/** Seek to the parent page of a B-tree page. +@param[in,out] mtr mini-transaction +@param[in,out] cursor cursor pointing to the x-latched parent page +@return whether the cursor was successfully positioned */ +bool btr_page_get_father(mtr_t* mtr, btr_cur_t* cursor) +{ + rec_t *rec= + page_rec_get_next(page_get_infimum_rec(cursor->block()->page.frame)); + if (UNIV_UNLIKELY(!rec)) + return false; + cursor->page_cur.rec= rec; + mem_heap_t *heap= mem_heap_create(100); + const bool got= btr_page_get_parent(nullptr, heap, cursor, mtr); + mem_heap_free(heap); + return got; +} + +#ifdef UNIV_DEBUG +/** PAGE_INDEX_ID value for freed index B-trees */ +constexpr index_id_t BTR_FREED_INDEX_ID = 0; +#endif + +/** Free a B-tree root page. btr_free_but_not_root() must already +have been called. 
+@param block index root page +@param space tablespace +@param mtr mini-transaction */ +static void btr_free_root(buf_block_t *block, const fil_space_t &space, + mtr_t *mtr) +{ + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX | + MTR_MEMO_PAGE_SX_FIX)); + ut_ad(mtr->is_named_space(&space)); + + btr_search_drop_page_hash_index(block, false); + + if (btr_root_fseg_validate(PAGE_HEADER + PAGE_BTR_SEG_TOP, *block, space)) + { + /* Free the entire segment in small steps. */ + ut_d(mtr->freeing_tree()); + while (!fseg_free_step(PAGE_HEADER + PAGE_BTR_SEG_TOP + + block->page.frame, mtr)); + } +} + +MY_ATTRIBUTE((warn_unused_result)) +/** Prepare to free a B-tree. +@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] index_id PAGE_INDEX_ID contents +@param[in,out] mtr mini-transaction +@return root block, to invoke btr_free_but_not_root() and btr_free_root() +@retval NULL if the page is no longer a matching B-tree page */ +static +buf_block_t *btr_free_root_check(const page_id_t page_id, ulint zip_size, + index_id_t index_id, mtr_t *mtr) +{ + ut_ad(page_id.space() != SRV_TMP_SPACE_ID); + ut_ad(index_id != BTR_FREED_INDEX_ID); + + buf_block_t *block= buf_page_get_gen(page_id, zip_size, RW_X_LATCH, + nullptr, BUF_GET_POSSIBLY_FREED, mtr); + + if (!block); + else if (fil_page_index_page_check(block->page.frame) && + index_id == btr_page_get_index_id(block->page.frame)) + /* This should be a root page. It should not be possible to + reassign the same index_id for some other index in the + tablespace. */ + ut_ad(!page_has_siblings(block->page.frame)); + else + block= nullptr; + + return block; +} + +/** Initialize the root page of the b-tree +@param[in,out] block root block +@param[in] index_id index id +@param[in] index index of root page +@param[in,out] mtr mini-transaction */ +static void btr_root_page_init(buf_block_t *block, index_id_t index_id, + dict_index_t *index, mtr_t *mtr) +{ + constexpr uint16_t field= PAGE_HEADER + PAGE_INDEX_ID; + byte *page_index_id= my_assume_aligned<2>(field + block->page.frame); + + /* Create a new index page on the allocated segment page */ + if (UNIV_LIKELY_NULL(block->page.zip.data)) + { + mach_write_to_8(page_index_id, index_id); + ut_ad(!page_has_siblings(block->page.zip.data)); + page_create_zip(block, index, 0, 0, mtr); + } + else + { + page_create(block, mtr, index && index->table->not_redundant()); + if (index && index->is_spatial()) + { + static_assert(((FIL_PAGE_INDEX & 0xff00) | byte(FIL_PAGE_RTREE)) == + FIL_PAGE_RTREE, "compatibility"); + mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->page.frame, + byte(FIL_PAGE_RTREE)); + if (mach_read_from_8(block->page.frame + FIL_RTREE_SPLIT_SEQ_NUM)) + mtr->memset(block, FIL_RTREE_SPLIT_SEQ_NUM, 8, 0); + } + /* Set the level of the new index page */ + mtr->write<2,mtr_t::MAYBE_NOP>( + *block, PAGE_HEADER + PAGE_LEVEL + block->page.frame, 0U); + mtr->write<8,mtr_t::MAYBE_NOP>(*block, page_index_id, index_id); + } +} + +/** Create the root node for a new index tree. 
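+
+A minimal creation sketch (illustrative; assumes a started
+mini-transaction on a named tablespace):
+
+  dberr_t err;
+  uint32_t root_no= btr_create(DICT_CLUSTERED, space, index_id, index,
+                               &mtr, &err);
+  if (root_no == FIL_NULL) { /* out of space, or an I/O error */ }
+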
+@param[in] type type of the index +@param[in] index_id index id +@param[in,out] space tablespace where created +@param[in] index index, or NULL to create a system table +@param[in,out] mtr mini-transaction +@param[out] err error code +@return page number of the created root +@retval FIL_NULL if did not succeed */ +uint32_t +btr_create( + ulint type, + fil_space_t* space, + index_id_t index_id, + dict_index_t* index, + mtr_t* mtr, + dberr_t* err) +{ + buf_block_t* block; + + ut_ad(mtr->is_named_space(space)); + ut_ad(index_id != BTR_FREED_INDEX_ID); + ut_ad(index || space == fil_system.sys_space); + + /* Create the two new segments (one, in the case of an ibuf tree) for + the index tree; the segment headers are put on the allocated root page + (for an ibuf tree, not in the root, but on a separate ibuf header + page) */ + + if (UNIV_UNLIKELY(type & DICT_IBUF)) { + /* Allocate first the ibuf header page */ + buf_block_t* ibuf_hdr_block = fseg_create( + space, IBUF_HEADER + IBUF_TREE_SEG_HEADER, mtr, err); + + if (ibuf_hdr_block == NULL) { + return(FIL_NULL); + } + + ut_ad(ibuf_hdr_block->page.id().page_no() + == IBUF_HEADER_PAGE_NO); + /* Allocate then the next page to the segment: it will be the + tree root page */ + + block = fseg_alloc_free_page_general( + buf_block_get_frame(ibuf_hdr_block) + + IBUF_HEADER + IBUF_TREE_SEG_HEADER, + IBUF_TREE_ROOT_PAGE_NO, + FSP_UP, false, mtr, mtr, err); + + if (block == NULL) { + return(FIL_NULL); + } + + ut_ad(block->page.id() == page_id_t(0,IBUF_TREE_ROOT_PAGE_NO)); + + flst_init(block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr); + } else { + block = fseg_create(space, PAGE_HEADER + PAGE_BTR_SEG_TOP, + mtr, err); + + if (block == NULL) { + return(FIL_NULL); + } + + if (!fseg_create(space, PAGE_HEADER + PAGE_BTR_SEG_LEAF, mtr, + err, false, block)) { + /* Not enough space for new segment, free root + segment before return. */ + btr_free_root(block, *space, mtr); + return(FIL_NULL); + } + } + + ut_ad(!page_has_siblings(block->page.frame)); + + btr_root_page_init(block, index_id, index, mtr); + + /* We reset the free bits for the page in a separate + mini-transaction to allow creation of several trees in the + same mtr, otherwise the latch on a bitmap page would prevent + it because of the latching order. + + Note: Insert Buffering is disabled for temporary tables given that + most temporary tables are smaller in size and short-lived. */ + if (!(type & DICT_CLUSTERED) + && (!index || !index->table->is_temporary())) { + ibuf_reset_free_bits(block); + } + + /* In the following assertion we test that two records of maximum + allowed size fit on the root page: this fact is needed to ensure + correctness of split algorithms */ + + ut_ad(page_get_max_insert_size(block->page.frame, 2) + > 2 * BTR_PAGE_MAX_REC_SIZE); + + return(block->page.id().page_no()); +} + +/** Free a B-tree except the root page. The root page MUST be freed after +this by calling btr_free_root. 
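+The required call order, as btr_free_if_exists() below performs it:
+
+  btr_free_but_not_root(root, mtr->get_log_mode());
+  mtr->set_named_space(space);
+  btr_free_root(root, *space, mtr);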
+@param[in,out] block root page +@param[in] log_mode mtr logging mode */ +static +void +btr_free_but_not_root( + buf_block_t* block, + mtr_log_t log_mode +#ifdef BTR_CUR_HASH_ADAPT + ,bool ahi=false +#endif + ) +{ + mtr_t mtr; + + ut_ad(fil_page_index_page_check(block->page.frame)); + ut_ad(!page_has_siblings(block->page.frame)); +leaf_loop: + mtr_start(&mtr); + ut_d(mtr.freeing_tree()); + mtr_set_log_mode(&mtr, log_mode); + fil_space_t *space = mtr.set_named_space_id(block->page.id().space()); + + if (!btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF, + *block, *space) + || !btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP, + *block, *space)) { + mtr_commit(&mtr); + return; + } + + /* NOTE: page hash indexes are dropped when a page is freed inside + fsp0fsp. */ + + bool finished = fseg_free_step(PAGE_HEADER + PAGE_BTR_SEG_LEAF + + block->page.frame, &mtr +#ifdef BTR_CUR_HASH_ADAPT + , ahi +#endif /* BTR_CUR_HASH_ADAPT */ + ); + mtr_commit(&mtr); + + if (!finished) { + + goto leaf_loop; + } +top_loop: + mtr_start(&mtr); + mtr_set_log_mode(&mtr, log_mode); + space = mtr.set_named_space_id(block->page.id().space()); + + finished = !btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP, + *block, *space) + || fseg_free_step_not_header(PAGE_HEADER + PAGE_BTR_SEG_TOP + + block->page.frame, &mtr +#ifdef BTR_CUR_HASH_ADAPT + ,ahi +#endif /* BTR_CUR_HASH_ADAPT */ + ); + mtr_commit(&mtr); + + if (!finished) { + goto top_loop; + } +} + +/** Clear the index tree and reinitialize the root page, in the +rollback of TRX_UNDO_EMPTY. The BTR_SEG_LEAF is freed and reinitialized. +@param thr query thread +@return error code */ +TRANSACTIONAL_TARGET +dberr_t dict_index_t::clear(que_thr_t *thr) +{ + mtr_t mtr; + mtr.start(); + if (table->is_temporary()) + mtr.set_log_mode(MTR_LOG_NO_REDO); + else + set_modified(mtr); + mtr_sx_lock_index(this, &mtr); + + dberr_t err; + if (buf_block_t *root_block= + buf_page_get_gen(page_id_t(table->space->id, page), + table->space->zip_size(), + RW_X_LATCH, nullptr, BUF_GET, &mtr, &err)) + { + btr_free_but_not_root(root_block, mtr.get_log_mode() +#ifdef BTR_CUR_HASH_ADAPT + ,n_ahi_pages() != 0 +#endif + ); + +#ifdef BTR_CUR_HASH_ADAPT + if (root_block->index) + btr_search_drop_page_hash_index(root_block, false); + ut_ad(n_ahi_pages() == 0); +#endif + mtr.memset(root_block, PAGE_HEADER + PAGE_BTR_SEG_LEAF, + FSEG_HEADER_SIZE, 0); + if (fseg_create(table->space, PAGE_HEADER + PAGE_BTR_SEG_LEAF, &mtr, + &err, false, root_block)) + btr_root_page_init(root_block, id, this, &mtr); + } + + mtr.commit(); + return err; +} + +/** Free a persistent index tree if it exists. 
+@param[in,out]	space	tablespace
+@param[in]	page	root page number
+@param[in]	index_id	PAGE_INDEX_ID contents
+@param[in,out]	mtr	mini-transaction */
+void btr_free_if_exists(fil_space_t *space, uint32_t page,
+                        index_id_t index_id, mtr_t *mtr)
+{
+  if (buf_block_t *root= btr_free_root_check(page_id_t(space->id, page),
+                                             space->zip_size(),
+                                             index_id, mtr))
+  {
+    btr_free_but_not_root(root, mtr->get_log_mode());
+    mtr->set_named_space(space);
+    btr_free_root(root, *space, mtr);
+  }
+}
+
+/** Drop a temporary table
+@param table   temporary table */
+void btr_drop_temporary_table(const dict_table_t &table)
+{
+  ut_ad(table.is_temporary());
+  ut_ad(table.space == fil_system.temp_space);
+  mtr_t mtr;
+  mtr.start();
+  for (const dict_index_t *index= table.indexes.start; index;
+       index= dict_table_get_next_index(index))
+  {
+    if (buf_block_t *block= buf_page_get_low({SRV_TMP_SPACE_ID, index->page}, 0,
+                                             RW_X_LATCH, nullptr, BUF_GET, &mtr,
+                                             nullptr, false))
+    {
+      btr_free_but_not_root(block, MTR_LOG_NO_REDO);
+      mtr.set_log_mode(MTR_LOG_NO_REDO);
+      btr_free_root(block, *fil_system.temp_space, &mtr);
+      mtr.commit();
+      mtr.start();
+    }
+  }
+  mtr.commit();
+}
+
+/** Read the last used AUTO_INCREMENT value from PAGE_ROOT_AUTO_INC.
+@param[in,out]	index	clustered index
+@return the last used AUTO_INCREMENT value
+@retval 0 on error or if no AUTO_INCREMENT value was used yet */
+ib_uint64_t
+btr_read_autoinc(dict_index_t* index)
+{
+	ut_ad(index->is_primary());
+	ut_ad(index->table->persistent_autoinc);
+	ut_ad(!index->table->is_temporary());
+	mtr_t		mtr;
+	mtr.start();
+	ib_uint64_t	autoinc;
+	if (buf_block_t* block = buf_page_get(
+		    page_id_t(index->table->space_id, index->page),
+		    index->table->space->zip_size(),
+		    RW_S_LATCH, &mtr)) {
+		autoinc = page_get_autoinc(block->page.frame);
+	} else {
+		autoinc = 0;
+	}
+	mtr.commit();
+	return autoinc;
+}
+
+/** Read the last used AUTO_INCREMENT value from PAGE_ROOT_AUTO_INC,
+or fall back to MAX(auto_increment_column).
+@param[in]	table	table containing an AUTO_INCREMENT column
+@param[in]	col_no	index of the AUTO_INCREMENT column
+@return the AUTO_INCREMENT value
+@retval 0 on error or if no AUTO_INCREMENT value was used yet */
+ib_uint64_t
+btr_read_autoinc_with_fallback(const dict_table_t* table, unsigned col_no)
+{
+	ut_ad(table->persistent_autoinc);
+	ut_ad(!table->is_temporary());
+
+	dict_index_t*	index = dict_table_get_first_index(table);
+
+	if (index == NULL) {
+		return 0;
+	}
+
+	mtr_t	mtr;
+	mtr.start();
+	buf_block_t*	block = buf_page_get(
+		page_id_t(index->table->space_id, index->page),
+		index->table->space->zip_size(),
+		RW_S_LATCH, &mtr);
+
+	ib_uint64_t	autoinc = block
+		? page_get_autoinc(block->page.frame) : 0;
+	const bool	retry = block && autoinc == 0
+		&& !page_is_empty(block->page.frame);
+	mtr.commit();
+
+	if (retry) {
+		/* This should be an old data file where
+		PAGE_ROOT_AUTO_INC was initialized to 0.
+		Fall back to reading MAX(autoinc_col).
+		There should be an index on it. */
+		const dict_col_t*	autoinc_col
+			= dict_table_get_nth_col(table, col_no);
+		while (index && index->fields[0].col != autoinc_col) {
+			index = dict_table_get_next_index(index);
+		}
+
+		if (index) {
+			autoinc = row_search_max_autoinc(index);
+		}
+	}
+
+	return autoinc;
+}
+
+/** Write the next available AUTO_INCREMENT value to PAGE_ROOT_AUTO_INC.
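+
+A minimal write/read round trip with btr_read_autoinc() above
+(illustrative; assumes a clustered index with persistent_autoinc):
+
+  btr_write_autoinc(index, 42, false);
+  ib_uint64_t v= btr_read_autoinc(index); /* yields 42 unless a larger
+                                          value was already stored */
+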
+@param[in,out]	index	clustered index
+@param[in]	autoinc	the AUTO_INCREMENT value
+@param[in]	reset	whether to reset the AUTO_INCREMENT
+			to a possibly smaller value than currently
+			exists in the page */
+void
+btr_write_autoinc(dict_index_t* index, ib_uint64_t autoinc, bool reset)
+{
+  ut_ad(index->is_primary());
+  ut_ad(index->table->persistent_autoinc);
+  ut_ad(!index->table->is_temporary());
+
+  mtr_t mtr;
+  mtr.start();
+  fil_space_t *space= index->table->space;
+  if (buf_block_t *root= buf_page_get(page_id_t(space->id, index->page),
+                                      space->zip_size(), RW_SX_LATCH, &mtr))
+  {
+    mtr.set_named_space(space);
+    page_set_autoinc(root, autoinc, &mtr, reset);
+  }
+
+  mtr.commit();
+}
+
+/** Reorganize an index page.
+@param cursor	index page cursor
+@param mtr	mini-transaction */
+static dberr_t btr_page_reorganize_low(page_cur_t *cursor, mtr_t *mtr)
+{
+  buf_block_t *const block= cursor->block;
+
+  ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+  ut_ad(!is_buf_block_get_page_zip(block));
+  ut_ad(fil_page_index_page_check(block->page.frame));
+  ut_ad(cursor->index->is_dummy ||
+        block->page.id().space() == cursor->index->table->space->id);
+  ut_ad(cursor->index->is_dummy ||
+        block->page.id().page_no() != cursor->index->page ||
+        !page_has_siblings(block->page.frame));
+
+  /* Save the cursor position. */
+  const ulint pos= page_rec_get_n_recs_before(cursor->rec);
+
+  if (UNIV_UNLIKELY(pos == ULINT_UNDEFINED))
+    return DB_CORRUPTION;
+
+  btr_search_drop_page_hash_index(block, false);
+
+  buf_block_t *old= buf_block_alloc();
+  /* Copy the old page to temporary space */
+  memcpy_aligned<UNIV_PAGE_SIZE_MIN>(old->page.frame, block->page.frame,
+                                     srv_page_size);
+
+  const mtr_log_t log_mode= mtr->set_log_mode(MTR_LOG_NO_REDO);
+
+  page_create(block, mtr, cursor->index->table->not_redundant());
+  if (cursor->index->is_spatial())
+    block->page.frame[FIL_PAGE_TYPE + 1]= byte(FIL_PAGE_RTREE);
+
+  static_assert(((FIL_PAGE_INDEX & 0xff00) | byte(FIL_PAGE_RTREE)) ==
+                FIL_PAGE_RTREE, "compatibility");
+
+  /* Copy the records from the temporary space to the recreated page;
+  do not copy the lock bits yet */
+
+  dberr_t err=
+    page_copy_rec_list_end_no_locks(block, old,
+                                    page_get_infimum_rec(old->page.frame),
+                                    cursor->index, mtr);
+  mtr->set_log_mode(log_mode);
+
+  if (UNIV_UNLIKELY(err != DB_SUCCESS))
+    return err;
+
+  /* Copy the PAGE_MAX_TRX_ID or PAGE_ROOT_AUTO_INC. */
+  ut_ad(!page_get_max_trx_id(block->page.frame));
+  memcpy_aligned<8>(PAGE_MAX_TRX_ID + PAGE_HEADER + block->page.frame,
+                    PAGE_MAX_TRX_ID + PAGE_HEADER + old->page.frame, 8);
+#ifdef UNIV_DEBUG
+  if (page_get_max_trx_id(block->page.frame))
+    /* PAGE_MAX_TRX_ID must be zero on non-leaf pages other than
+    clustered index root pages. */
+    ut_ad(dict_index_is_sec_or_ibuf(cursor->index)
+          ? page_is_leaf(block->page.frame)
+          : block->page.id().page_no() == cursor->index->page);
+  else
+    /* PAGE_MAX_TRX_ID is unused in clustered index pages (other than
+    the root where it is repurposed as PAGE_ROOT_AUTO_INC), non-leaf
+    pages, and in temporary tables. It was always zero-initialized in
+    page_create(). PAGE_MAX_TRX_ID must be nonzero on
+    dict_index_is_sec_or_ibuf() leaf pages.
*/ + ut_ad(cursor->index->table->is_temporary() || + !page_is_leaf(block->page.frame) || + !dict_index_is_sec_or_ibuf(cursor->index)); +#endif + + const uint16_t data_size1= page_get_data_size(old->page.frame); + const uint16_t data_size2= page_get_data_size(block->page.frame); + const ulint max1= + page_get_max_insert_size_after_reorganize(old->page.frame, 1); + const ulint max2= + page_get_max_insert_size_after_reorganize(block->page.frame, 1); + + if (UNIV_UNLIKELY(data_size1 != data_size2 || max1 != max2)) + { + sql_print_error("InnoDB: Page old data size %u new data size %u" + ", page old max ins size %zu new max ins size %zu", + data_size1, data_size2, max1, max2); + return DB_CORRUPTION; + } + + /* Restore the cursor position. */ + if (!pos) + ut_ad(cursor->rec == page_get_infimum_rec(block->page.frame)); + else if (!(cursor->rec= page_rec_get_nth(block->page.frame, pos))) + return DB_CORRUPTION; + + if (block->page.id().page_no() != cursor->index->page || + fil_page_get_type(old->page.frame) != FIL_PAGE_TYPE_INSTANT) + ut_ad(!memcmp(old->page.frame, block->page.frame, PAGE_HEADER)); + else if (!cursor->index->is_instant()) + { + ut_ad(!memcmp(old->page.frame, block->page.frame, FIL_PAGE_TYPE)); + ut_ad(!memcmp(old->page.frame + FIL_PAGE_TYPE + 2, + block->page.frame + FIL_PAGE_TYPE + 2, + PAGE_HEADER - FIL_PAGE_TYPE - 2)); + mtr->write<2,mtr_t::FORCED>(*block, FIL_PAGE_TYPE + block->page.frame, + FIL_PAGE_INDEX); + } + else + { + /* Preserve the PAGE_INSTANT information. */ + memcpy_aligned<2>(FIL_PAGE_TYPE + block->page.frame, + FIL_PAGE_TYPE + old->page.frame, 2); + memcpy_aligned<2>(PAGE_HEADER + PAGE_INSTANT + block->page.frame, + PAGE_HEADER + PAGE_INSTANT + old->page.frame, 2); + if (!cursor->index->table->instant); + else if (page_is_comp(block->page.frame)) + { + memcpy(PAGE_NEW_INFIMUM + block->page.frame, + PAGE_NEW_INFIMUM + old->page.frame, 8); + memcpy(PAGE_NEW_SUPREMUM + block->page.frame, + PAGE_NEW_SUPREMUM + old->page.frame, 8); + } + else + { + memcpy(PAGE_OLD_INFIMUM + block->page.frame, + PAGE_OLD_INFIMUM + old->page.frame, 8); + memcpy(PAGE_OLD_SUPREMUM + block->page.frame, + PAGE_OLD_SUPREMUM + old->page.frame, 8); + } + + ut_ad(!memcmp(old->page.frame, block->page.frame, PAGE_HEADER)); + } + + ut_ad(!memcmp(old->page.frame + PAGE_MAX_TRX_ID + PAGE_HEADER, + block->page.frame + PAGE_MAX_TRX_ID + PAGE_HEADER, + PAGE_DATA - (PAGE_MAX_TRX_ID + PAGE_HEADER))); + + if (!cursor->index->has_locking()); + else if (cursor->index->page == FIL_NULL) + ut_ad(cursor->index->is_dummy); + else + lock_move_reorganize_page(block, old); + + /* Write log for the changes, if needed. */ + if (log_mode == MTR_LOG_ALL) + { + /* Check and log the changes in the page header. */ + ulint a, e; + for (a= PAGE_HEADER, e= PAGE_MAX_TRX_ID + PAGE_HEADER; a < e; a++) + { + if (old->page.frame[a] == block->page.frame[a]) + continue; + while (--e, old->page.frame[e] == block->page.frame[e]); + e++; + ut_ad(a < e); + /* Write log for the changed page header fields. */ + mtr->memcpy(*block, a, e - a); + break; + } + + const uint16_t top= page_header_get_offs(block->page.frame, PAGE_HEAP_TOP); + + if (page_is_comp(block->page.frame)) + { + /* info_bits=0, n_owned=1, heap_no=0, status */ + ut_ad(!memcmp(PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES + + block->page.frame, + PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES + + old->page.frame, 3)); + /* If the 'next' pointer of the infimum record has changed, log it. 
*/ + a= PAGE_NEW_INFIMUM - 2; + e= a + 2; + if (block->page.frame[a] == old->page.frame[a]) + a++; + if (--e, block->page.frame[e] != old->page.frame[e]) + e++; + if (ulint len= e - a) + mtr->memcpy(*block, a, len); + /* The infimum record itself must not change. */ + ut_ad(!memcmp(PAGE_NEW_INFIMUM + block->page.frame, + PAGE_NEW_INFIMUM + old->page.frame, 8)); + /* Log any change of the n_owned of the supremum record. */ + a= PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES; + if (block->page.frame[a] != old->page.frame[a]) + mtr->memcpy(*block, a, 1); + /* The rest of the supremum record must not change. */ + ut_ad(!memcmp(&block->page.frame[a + 1], &old->page.frame[a + 1], + PAGE_NEW_SUPREMUM_END - PAGE_NEW_SUPREMUM + + REC_N_NEW_EXTRA_BYTES - 1)); + + /* Log the differences in the payload. */ + for (a= PAGE_NEW_SUPREMUM_END, e= top; a < e; a++) + { + if (old->page.frame[a] == block->page.frame[a]) + continue; + while (--e, old->page.frame[e] == block->page.frame[e]); + e++; + ut_ad(a < e); + /* TODO: write MEMMOVE records to minimize this further! */ + mtr->memcpy(*block, a, e - a); + break; + } + } + else + { + /* info_bits=0, n_owned=1, heap_no=0, number of fields, 1-byte format */ + ut_ad(!memcmp(PAGE_OLD_INFIMUM - REC_N_OLD_EXTRA_BYTES + + block->page.frame, + PAGE_OLD_INFIMUM - REC_N_OLD_EXTRA_BYTES + + old->page.frame, 4)); + /* If the 'next' pointer of the infimum record has changed, log it. */ + a= PAGE_OLD_INFIMUM - 2; + e= a + 2; + if (block->page.frame[a] == old->page.frame[a]) + a++; + if (--e, block->page.frame[e] != old->page.frame[e]) + e++; + if (ulint len= e - a) + mtr->memcpy(*block, a, len); + /* The infimum record itself must not change. */ + ut_ad(!memcmp(PAGE_OLD_INFIMUM + block->page.frame, + PAGE_OLD_INFIMUM + old->page.frame, 8)); + /* Log any change of the n_owned of the supremum record. */ + a= PAGE_OLD_SUPREMUM - REC_N_OLD_EXTRA_BYTES; + if (block->page.frame[a] != old->page.frame[a]) + mtr->memcpy(*block, a, 1); + ut_ad(!memcmp(&block->page.frame[a + 1], &old->page.frame[a + 1], + PAGE_OLD_SUPREMUM_END - PAGE_OLD_SUPREMUM + + REC_N_OLD_EXTRA_BYTES - 1)); + + /* Log the differences in the payload. */ + for (a= PAGE_OLD_SUPREMUM_END, e= top; a < e; a++) + { + if (old->page.frame[a] == block->page.frame[a]) + continue; + while (--e, old->page.frame[e] == block->page.frame[e]); + e++; + ut_ad(a < e); + /* TODO: write MEMMOVE records to minimize this further! */ + mtr->memcpy(*block, a, e - a); + break; + } + } + + e= srv_page_size - PAGE_DIR; + a= e - PAGE_DIR_SLOT_SIZE * page_dir_get_n_slots(block->page.frame); + + /* Zero out the payload area. */ + mtr->memset(*block, top, a - top, 0); + + /* Log changes to the page directory. */ + for (; a < e; a++) + { + if (old->page.frame[a] == block->page.frame[a]) + continue; + while (--e, old->page.frame[e] == block->page.frame[e]); + e++; + ut_ad(a < e); + /* Write log for the changed page directory slots. */ + mtr->memcpy(*block, a, e - a); + break; + } + } + + buf_block_free(old); + + MONITOR_INC(MONITOR_INDEX_REORG_ATTEMPTS); + MONITOR_INC(MONITOR_INDEX_REORG_SUCCESSFUL); + return DB_SUCCESS; +} + +/*************************************************************//** +Reorganizes an index page. + +IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. This has to +be done either within the same mini-transaction, or by invoking +ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages, +IBUF_BITMAP_FREE is unaffected by reorganization. 
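+
+A minimal caller pattern honoring this (illustrative; same
+mini-transaction, as required above):
+
+  if (btr_page_reorganize_block(page_zip_level, block, index, mtr)
+      == DB_SUCCESS)
+    ibuf_reset_free_bits(block);	/* compressed secondary index leaf */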
+ +@return error code +@retval DB_FAIL if reorganizing a ROW_FORMAT=COMPRESSED page failed */ +dberr_t +btr_page_reorganize_block( + ulint z_level,/*!< in: compression level to be used + if dealing with compressed page */ + buf_block_t* block, /*!< in/out: B-tree page */ + dict_index_t* index, /*!< in: the index tree of the page */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + if (buf_block_get_page_zip(block)) + return page_zip_reorganize(block, index, z_level, mtr, true); + page_cur_t cur; + page_cur_set_before_first(block, &cur); + cur.index= index; + return btr_page_reorganize_low(&cur, mtr); +} + +/*************************************************************//** +Reorganizes an index page. + +IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. This has to +be done either within the same mini-transaction, or by invoking +ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages, +IBUF_BITMAP_FREE is unaffected by reorganization. + +@param cursor page cursor +@param mtr mini-transaction +@return error code +@retval DB_FAIL if reorganizing a ROW_FORMAT=COMPRESSED page failed */ +dberr_t btr_page_reorganize(page_cur_t *cursor, mtr_t *mtr) +{ + if (!buf_block_get_page_zip(cursor->block)) + return btr_page_reorganize_low(cursor, mtr); + + ulint pos= page_rec_get_n_recs_before(cursor->rec); + if (UNIV_UNLIKELY(pos == ULINT_UNDEFINED)) + return DB_CORRUPTION; + + dberr_t err= page_zip_reorganize(cursor->block, cursor->index, + page_zip_level, mtr, true); + if (err == DB_FAIL); + else if (!pos) + ut_ad(cursor->rec == page_get_infimum_rec(cursor->block->page.frame)); + else if (!(cursor->rec= page_rec_get_nth(cursor->block->page.frame, pos))) + err= DB_CORRUPTION; + + return err; +} + +/** Empty an index page (possibly the root page). @see btr_page_create(). +@param[in,out] block page to be emptied +@param[in,out] page_zip compressed page frame, or NULL +@param[in] index index of the page +@param[in] level B-tree level of the page (0=leaf) +@param[in,out] mtr mini-transaction */ +void +btr_page_empty( + buf_block_t* block, + page_zip_des_t* page_zip, + dict_index_t* index, + ulint level, + mtr_t* mtr) +{ + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(page_zip == buf_block_get_page_zip(block)); + ut_ad(!index->is_dummy); + ut_ad(index->table->space->id == block->page.id().space()); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip + || page_zip_validate(page_zip, block->page.frame, index)); +#endif /* UNIV_ZIP_DEBUG */ + + btr_search_drop_page_hash_index(block, false); + + /* Recreate the page: note that global data on page (possible + segment headers, next page-field, etc.) is preserved intact */ + + /* Preserve PAGE_ROOT_AUTO_INC when creating a clustered index + root page. */ + const ib_uint64_t autoinc + = dict_index_is_clust(index) + && index->page == block->page.id().page_no() + ? 
page_get_autoinc(block->page.frame)
+		: 0;
+
+	if (page_zip) {
+		page_create_zip(block, index, level, autoinc, mtr);
+	} else {
+		page_create(block, mtr, index->table->not_redundant());
+		if (index->is_spatial()) {
+			static_assert(((FIL_PAGE_INDEX & 0xff00)
+				       | byte(FIL_PAGE_RTREE))
+				      == FIL_PAGE_RTREE, "compatibility");
+			mtr->write<1>(*block, FIL_PAGE_TYPE + 1
+				      + block->page.frame,
+				      byte(FIL_PAGE_RTREE));
+			if (mach_read_from_8(block->page.frame
+					     + FIL_RTREE_SPLIT_SEQ_NUM)) {
+				mtr->memset(block, FIL_RTREE_SPLIT_SEQ_NUM,
+					    8, 0);
+			}
+		}
+		mtr->write<2,mtr_t::MAYBE_NOP>(*block, PAGE_HEADER + PAGE_LEVEL
+					       + block->page.frame, level);
+		if (autoinc) {
+			mtr->write<8>(*block, PAGE_HEADER + PAGE_MAX_TRX_ID
+				      + block->page.frame, autoinc);
+		}
+	}
+}
+
+/** Write instant ALTER TABLE metadata to a root page.
+@param[in,out]	root	clustered index root page
+@param[in]	index	clustered index with instant ALTER TABLE
+@param[in,out]	mtr	mini-transaction */
+void btr_set_instant(buf_block_t* root, const dict_index_t& index, mtr_t* mtr)
+{
+	ut_ad(index.n_core_fields > 0);
+	ut_ad(index.n_core_fields < REC_MAX_N_FIELDS);
+	ut_ad(index.is_instant());
+	ut_ad(fil_page_get_type(root->page.frame) == FIL_PAGE_TYPE_INSTANT
+	      || fil_page_get_type(root->page.frame) == FIL_PAGE_INDEX);
+	ut_ad(!page_has_siblings(root->page.frame));
+	ut_ad(root->page.id().page_no() == index.page);
+
+	rec_t* infimum = page_get_infimum_rec(root->page.frame);
+	rec_t* supremum = page_get_supremum_rec(root->page.frame);
+	byte* page_type = root->page.frame + FIL_PAGE_TYPE;
+	uint16_t i = page_header_get_field(root->page.frame, PAGE_INSTANT);
+
+	switch (mach_read_from_2(page_type)) {
+	case FIL_PAGE_TYPE_INSTANT:
+		ut_ad(page_get_instant(root->page.frame)
+		      == index.n_core_fields);
+		if (memcmp(infimum, "infimum", 8)
+		    || memcmp(supremum, "supremum", 8)) {
+			ut_ad(index.table->instant);
+			ut_ad(!memcmp(infimum, field_ref_zero, 8));
+			ut_ad(!memcmp(supremum, field_ref_zero, 7));
+			/* The n_core_null_bytes only matters for
+			ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC tables. */
+			ut_ad(supremum[7] == index.n_core_null_bytes
+			      || !index.table->not_redundant());
+			return;
+		}
+		break;
+	default:
+		ut_ad("wrong page type" == 0);
+		/* fall through */
+	case FIL_PAGE_INDEX:
+		ut_ad(!page_is_comp(root->page.frame)
+		      || !page_get_instant(root->page.frame));
+		ut_ad(!memcmp(infimum, "infimum", 8));
+		ut_ad(!memcmp(supremum, "supremum", 8));
+		mtr->write<2>(*root, page_type, FIL_PAGE_TYPE_INSTANT);
+		ut_ad(i <= PAGE_NO_DIRECTION);
+		i |= static_cast<uint16_t>(index.n_core_fields << 3);
+		mtr->write<2>(*root, PAGE_HEADER + PAGE_INSTANT
+			      + root->page.frame, i);
+		break;
+	}
+
+	if (index.table->instant) {
+		mtr->memset(root, infimum - root->page.frame, 8, 0);
+		mtr->memset(root, supremum - root->page.frame, 7, 0);
+		mtr->write<1,mtr_t::MAYBE_NOP>(*root, &supremum[7],
+					       index.n_core_null_bytes);
+	}
+}
+
+/** Reset the table to the canonical format on ROLLBACK of instant ALTER TABLE.
+@param[in] index clustered index with instant ALTER TABLE +@param[in] all whether to reset FIL_PAGE_TYPE as well +@param[in,out] mtr mini-transaction */ +ATTRIBUTE_COLD +void btr_reset_instant(const dict_index_t &index, bool all, mtr_t *mtr) +{ + ut_ad(!index.table->is_temporary()); + ut_ad(index.is_primary()); + buf_block_t *root= btr_get_latched_root(index, mtr); + byte *page_type= root->page.frame + FIL_PAGE_TYPE; + if (all) + { + ut_ad(mach_read_from_2(page_type) == FIL_PAGE_TYPE_INSTANT || + mach_read_from_2(page_type) == FIL_PAGE_INDEX); + mtr->write<2,mtr_t::MAYBE_NOP>(*root, page_type, FIL_PAGE_INDEX); + byte *instant= PAGE_INSTANT + PAGE_HEADER + root->page.frame; + mtr->write<2,mtr_t::MAYBE_NOP>(*root, instant, + page_ptr_get_direction(instant + 1)); + } + else + ut_ad(mach_read_from_2(page_type) == FIL_PAGE_TYPE_INSTANT); + static const byte supremuminfimum[8 + 8] = "supremuminfimum"; + uint16_t infimum, supremum; + if (page_is_comp(root->page.frame)) + { + infimum= PAGE_NEW_INFIMUM; + supremum= PAGE_NEW_SUPREMUM; + } + else + { + infimum= PAGE_OLD_INFIMUM; + supremum= PAGE_OLD_SUPREMUM; + } + ut_ad(!memcmp(&root->page.frame[infimum], supremuminfimum + 8, 8) == + !memcmp(&root->page.frame[supremum], supremuminfimum, 8)); + mtr->memcpy(*root, &root->page.frame[infimum], + supremuminfimum + 8, 8); + mtr->memcpy(*root, &root->page.frame[supremum], + supremuminfimum, 8); +} + +/*************************************************************//** +Makes tree one level higher by splitting the root, and inserts +the tuple. It is assumed that mtr contains an x-latch on the tree. +NOTE that the operation of this function must always succeed, +we cannot reverse it: therefore enough free disk space must be +guaranteed to be available before this function is called. 
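+
+The shape of the transformation, in outline (old root R, new page N):
+
+  before:  R[a b c ...]         after:  R[ptr(N)]
+                                             |
+                                        N[a b c ...]   (N is then split)
+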
+@return inserted record */ +rec_t* +btr_root_raise_and_insert( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor at which to insert: must be + on the root page; when the function returns, + the cursor is positioned on the predecessor + of the inserted record */ + rec_offs** offsets,/*!< out: offsets on inserted record */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ + const dtuple_t* tuple, /*!< in: tuple to insert */ + ulint n_ext, /*!< in: number of externally stored columns */ + mtr_t* mtr, /*!< in: mtr */ + dberr_t* err) /*!< out: error code */ +{ + dict_index_t* index; + rec_t* rec; + dtuple_t* node_ptr; + ulint level; + rec_t* node_ptr_rec; + page_cur_t* page_cursor; + page_zip_des_t* root_page_zip; + page_zip_des_t* new_page_zip; + buf_block_t* root; + buf_block_t* new_block; + + root = btr_cur_get_block(cursor); + root_page_zip = buf_block_get_page_zip(root); + ut_ad(!page_is_empty(root->page.frame)); + index = btr_cur_get_index(cursor); + ut_ad(index->n_core_null_bytes <= UT_BITS_IN_BYTES(index->n_nullable)); +#ifdef UNIV_ZIP_DEBUG + ut_a(!root_page_zip + || page_zip_validate(root_page_zip, root->page.frame, index)); +#endif /* UNIV_ZIP_DEBUG */ + const page_id_t root_id{root->page.id()}; + + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + ut_ad(mtr->memo_contains_flagged(root, MTR_MEMO_PAGE_X_FIX)); + + if (index->page != root_id.page_no()) { + ut_ad("corrupted root page number" == 0); + return nullptr; + } + + if (index->is_ibuf()) { + } else if (!btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF, + *root, *index->table->space) + || !btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP, + *root, *index->table->space)) { + return nullptr; + } + + /* Allocate a new page to the tree. Root splitting is done by first + moving the root records to the new page, emptying the root, putting + a node pointer to the new page, and then splitting the new page. */ + + level = btr_page_get_level(root->page.frame); + + new_block = btr_page_alloc(index, 0, FSP_NO_DIR, level, mtr, mtr, err); + + if (!new_block) { + return nullptr; + } + + new_page_zip = buf_block_get_page_zip(new_block); + ut_a(!new_page_zip == !root_page_zip); + ut_a(!new_page_zip + || page_zip_get_size(new_page_zip) + == page_zip_get_size(root_page_zip)); + + btr_page_create(new_block, new_page_zip, index, level, mtr); + if (page_has_siblings(new_block->page.frame)) { + compile_time_assert(FIL_PAGE_NEXT == FIL_PAGE_PREV + 4); + compile_time_assert(FIL_NULL == 0xffffffff); + static_assert(FIL_PAGE_PREV % 8 == 0, "alignment"); + memset_aligned<8>(new_block->page.frame + FIL_PAGE_PREV, + 0xff, 8); + mtr->memset(new_block, FIL_PAGE_PREV, 8, 0xff); + if (UNIV_LIKELY_NULL(new_page_zip)) { + memset_aligned<8>(new_page_zip->data + FIL_PAGE_PREV, + 0xff, 8); + } + } + + /* Copy the records from root to the new page one by one. */ + if (0 +#ifdef UNIV_ZIP_COPY + || new_page_zip +#endif /* UNIV_ZIP_COPY */ + || !page_copy_rec_list_end(new_block, root, + page_get_infimum_rec(root->page.frame), + index, mtr, err)) { + switch (*err) { + case DB_SUCCESS: + break; + case DB_FAIL: + *err = DB_SUCCESS; + break; + default: + return nullptr; + } + + ut_a(new_page_zip); + + /* Copy the page byte for byte. */ + page_zip_copy_recs(new_block, root_page_zip, + root->page.frame, index, mtr); + + /* Update the lock table and possible hash index. 
*/ + if (index->has_locking()) { + lock_move_rec_list_end( + new_block, root, + page_get_infimum_rec(root->page.frame)); + } + + /* Move any existing predicate locks */ + if (dict_index_is_spatial(index)) { + lock_prdt_rec_move(new_block, root_id); + } else { + btr_search_move_or_delete_hash_entries( + new_block, root); + } + } + + constexpr uint16_t max_trx_id = PAGE_HEADER + PAGE_MAX_TRX_ID; + if (dict_index_is_sec_or_ibuf(index)) { + /* In secondary indexes and the change buffer, + PAGE_MAX_TRX_ID can be reset on the root page, because + the field only matters on leaf pages, and the root no + longer is a leaf page. (Older versions of InnoDB did + set PAGE_MAX_TRX_ID on all secondary index pages.) */ + byte* p = my_assume_aligned<8>( + PAGE_HEADER + PAGE_MAX_TRX_ID + root->page.frame); + if (mach_read_from_8(p)) { + mtr->memset(root, max_trx_id, 8, 0); + if (UNIV_LIKELY_NULL(root->page.zip.data)) { + memset_aligned<8>(max_trx_id + + root->page.zip.data, 0, 8); + } + } + } else { + /* PAGE_ROOT_AUTO_INC is only present in the clustered index + root page; on other clustered index pages, we want to reserve + the field PAGE_MAX_TRX_ID for future use. */ + byte* p = my_assume_aligned<8>( + PAGE_HEADER + PAGE_MAX_TRX_ID + new_block->page.frame); + if (mach_read_from_8(p)) { + mtr->memset(new_block, max_trx_id, 8, 0); + if (UNIV_LIKELY_NULL(new_block->page.zip.data)) { + memset_aligned<8>(max_trx_id + + new_block->page.zip.data, + 0, 8); + } + } + } + + /* If this is a pessimistic insert which is actually done to + perform a pessimistic update then we have stored the lock + information of the record to be inserted on the infimum of the + root page: we cannot discard the lock structs on the root page */ + + if (index->has_locking()) { + lock_update_root_raise(*new_block, root_id); + } + + /* Create a memory heap where the node pointer is stored */ + if (!*heap) { + *heap = mem_heap_create(1000); + } + + const uint32_t new_page_no = new_block->page.id().page_no(); + rec = page_rec_get_next(page_get_infimum_rec(new_block->page.frame)); + ut_ad(rec); /* We just created the page. */ + + /* Build the node pointer (= node key and page address) for the + child */ + if (dict_index_is_spatial(index)) { + rtr_mbr_t new_mbr; + + rtr_page_cal_mbr(index, new_block, &new_mbr, *heap); + node_ptr = rtr_index_build_node_ptr( + index, &new_mbr, rec, new_page_no, *heap); + } else { + node_ptr = dict_index_build_node_ptr( + index, rec, new_page_no, *heap, level); + } + /* The node pointer must be marked as the predefined minimum record, + as there is no lower alphabetical limit to records in the leftmost + node of a level: */ + dtuple_set_info_bits(node_ptr, + dtuple_get_info_bits(node_ptr) + | REC_INFO_MIN_REC_FLAG); + + /* Rebuild the root page to get free space */ + btr_page_empty(root, root_page_zip, index, level + 1, mtr); + /* btr_page_empty() is supposed to zero-initialize the field. */ + ut_ad(!page_get_instant(root->page.frame)); + + if (index->is_instant()) { + ut_ad(!root_page_zip); + btr_set_instant(root, *index, mtr); + } + + ut_ad(!page_has_siblings(root->page.frame)); + + page_cursor = btr_cur_get_page_cur(cursor); + + /* Insert node pointer to the root */ + + page_cur_set_before_first(root, page_cursor); + + node_ptr_rec = page_cur_tuple_insert(page_cursor, node_ptr, + offsets, heap, 0, mtr); + + /* The root page should only contain the node pointer + to new_block at this point. Thus, the data should fit. 
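+	(The root was just emptied by btr_page_empty(), and a node pointer
+	record consists only of a key prefix and a child page number, so a
+	single insert into the empty root is expected to succeed.)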
	*/
+	ut_a(node_ptr_rec);
+
+	/* We play it safe and reset the free bits for the new page */
+
+	if (!dict_index_is_clust(index)
+	    && !index->table->is_temporary()) {
+		ibuf_reset_free_bits(new_block);
+	}
+
+	page_cursor->block = new_block;
+	page_cursor->index = index;
+
+	ut_ad(dtuple_check_typed(tuple));
+	/* Reposition the cursor to the child node */
+	ulint low_match = 0, up_match = 0;
+
+	if (page_cur_search_with_match(tuple, PAGE_CUR_LE,
+				       &up_match, &low_match,
+				       page_cursor, nullptr)) {
+		*err = DB_CORRUPTION;
+		return nullptr;
+	}
+
+	/* Split the child and insert tuple */
+	return btr_page_split_and_insert(flags, cursor, offsets, heap,
+					 tuple, n_ext, mtr, err);
+}
+
+/** Decide if the page should be split at the convergence point of inserts
+converging to the left.
+@param[in]	cursor	insert position
+@return the first record to be moved to the right half page
+@retval	NULL if no split is recommended */
+rec_t* btr_page_get_split_rec_to_left(const btr_cur_t* cursor)
+{
+	rec_t* split_rec = btr_cur_get_rec(cursor);
+	const page_t* page = page_align(split_rec);
+
+	if (page_header_get_ptr(page, PAGE_LAST_INSERT)
+	    != page_rec_get_next(split_rec)) {
+		return NULL;
+	}
+
+	/* The metadata record must be present in the leftmost leaf page
+	of the clustered index, if and only if index->is_instant().
+	However, during innobase_instant_try(), index->is_instant()
+	would already hold when row_ins_clust_index_entry_low()
+	is being invoked to insert the metadata record.
+	So, we can only assert that when the metadata record exists,
+	index->is_instant() must hold. */
+	ut_ad(!page_is_leaf(page) || page_has_prev(page)
+	      || cursor->index()->is_instant()
+	      || !(rec_get_info_bits(page_rec_get_next_const(
+					     page_get_infimum_rec(page)),
+				     cursor->index()->table->not_redundant())
+		   & REC_INFO_MIN_REC_FLAG));
+
+	const rec_t* infimum = page_get_infimum_rec(page);
+
+	/* If the convergence is in the middle of a page, also include
+	the record immediately before the new insert in the upper
+	page. Otherwise, we could repeatedly move lots of records that
+	are smaller than the convergence point from page to page. */
+
+	if (split_rec == infimum
+	    || split_rec == page_rec_get_next_const(infimum)) {
+		split_rec = page_rec_get_next(split_rec);
+	}
+
+	return split_rec;
+}
+
+/** Decide if the page should be split at the convergence point of inserts
+converging to the right.
+@param[in]	cursor		insert position
+@param[out]	split_rec	if split recommended, the first record
+				on the right half page, or
+				NULL if the to-be-inserted record
+				should be first
+@return whether split is recommended */
+bool
+btr_page_get_split_rec_to_right(const btr_cur_t* cursor, rec_t** split_rec)
+{
+	rec_t* insert_point = btr_cur_get_rec(cursor);
+	const page_t* page = page_align(insert_point);
+
+	/* We use eager heuristics: if the new insert would be right after
+	the previous insert on the same page, we assume that there is a
+	pattern of sequential inserts here. */
+
+	if (page_header_get_ptr(page, PAGE_LAST_INSERT) != insert_point) {
+		return false;
+	}
+
+	insert_point = page_rec_get_next(insert_point);
+
+	if (!insert_point || page_rec_is_supremum(insert_point)) {
+		insert_point = NULL;
+	} else {
+		insert_point = page_rec_get_next(insert_point);
+		if (page_rec_is_supremum(insert_point)) {
+			insert_point = NULL;
+		}
+
+		/* If there are >= 2 user records up from the insert
+		point, split all but 1 off. We want to keep one because
+		then sequential inserts can use the adaptive hash
+		index, as they can do the necessary checks of the right
+		search position just by looking at the records on this
+		page. */
+	}
+
+	*split_rec = insert_point;
+	return true;
+}
+
+/*************************************************************//**
+Calculates a split record such that the tuple will certainly fit on
+its half-page when the split is performed. We assume in this function
+only that the cursor page has at least one user record.
+@return split record, or NULL if tuple will be the first record on
+the lower or upper half-page (determined by btr_page_tuple_smaller()) */
+static
+rec_t*
+btr_page_get_split_rec(
+/*===================*/
+	btr_cur_t*	cursor,	/*!< in: cursor at which insert should be made */
+	const dtuple_t*	tuple,	/*!< in: tuple to insert */
+	ulint		n_ext)	/*!< in: number of externally stored columns */
+{
+	page_t*		page;
+	page_zip_des_t*	page_zip;
+	ulint		insert_size;
+	ulint		free_space;
+	ulint		total_data;
+	ulint		total_n_recs;
+	ulint		total_space;
+	ulint		incl_data;
+	rec_t*		ins_rec;
+	rec_t*		rec;
+	rec_t*		next_rec;
+	ulint		n;
+	mem_heap_t*	heap;
+	rec_offs*	offsets;
+
+	page = btr_cur_get_page(cursor);
+
+	insert_size = rec_get_converted_size(cursor->index(), tuple, n_ext);
+	free_space = page_get_free_space_of_empty(page_is_comp(page));
+
+	page_zip = btr_cur_get_page_zip(cursor);
+	if (page_zip) {
+		/* Estimate the free space of an empty compressed page. */
+		ulint	free_space_zip = page_zip_empty_size(
+			cursor->index()->n_fields,
+			page_zip_get_size(page_zip));
+
+		if (free_space > (ulint) free_space_zip) {
+			free_space = (ulint) free_space_zip;
+		}
+	}
+
+	/* free_space is now the free space of a created new page */
+
+	total_data = page_get_data_size(page) + insert_size;
+	total_n_recs = ulint(page_get_n_recs(page)) + 1;
+	ut_ad(total_n_recs >= 2);
+	total_space = total_data + page_dir_calc_reserved_space(total_n_recs);
+
+	n = 0;
+	incl_data = 0;
+	ins_rec = btr_cur_get_rec(cursor);
+	rec = page_get_infimum_rec(page);
+
+	heap = NULL;
+	offsets = NULL;
+
+	/* We include records in the left half until the space that
+	they reserve exceeds half of total_space. At that point, if
+	the included records fit on the left page and something was
+	also left over for the right page, the following record will
+	be the first on the right half page; otherwise the last
+	included record will be the first on the right half page. */
+
+	do {
+		/* Decide the next record to include */
+		if (rec == ins_rec) {
+			rec = NULL;	/* NULL denotes that tuple is
+					now included */
+		} else if (rec == NULL) {
+			rec = page_rec_get_next(ins_rec);
+		} else {
+			rec = page_rec_get_next(rec);
+		}
+
+		if (rec == NULL) {
+			/* Include tuple */
+			incl_data += insert_size;
+		} else {
+			offsets = rec_get_offsets(rec, cursor->index(),
+						  offsets, page_is_leaf(page)
+						  ? 
cursor->index() + ->n_core_fields + : 0, + ULINT_UNDEFINED, &heap); + incl_data += rec_offs_size(offsets); + } + + n++; + } while (incl_data + page_dir_calc_reserved_space(n) + < total_space / 2); + + if (incl_data + page_dir_calc_reserved_space(n) <= free_space) { + /* The next record will be the first on + the right half page if it is not the + supremum record of page */ + + if (rec == ins_rec) { + rec = NULL; + + goto func_exit; + } else if (rec == NULL) { + next_rec = page_rec_get_next(ins_rec); + } else { + next_rec = page_rec_get_next(rec); + } + ut_ad(next_rec); + if (!page_rec_is_supremum(next_rec)) { + rec = next_rec; + } + } + +func_exit: + if (heap) { + mem_heap_free(heap); + } + return(rec); +} + +#ifdef UNIV_DEBUG +/*************************************************************//** +Returns TRUE if the insert fits on the appropriate half-page with the +chosen split_rec. +@return true if fits */ +static MY_ATTRIBUTE((nonnull(1,3,4,6), warn_unused_result)) +bool +btr_page_insert_fits( +/*=================*/ + btr_cur_t* cursor, /*!< in: cursor at which insert + should be made */ + const rec_t* split_rec,/*!< in: suggestion for first record + on upper half-page, or NULL if + tuple to be inserted should be first */ + rec_offs** offsets,/*!< in: rec_get_offsets( + split_rec, cursor->index()); out: garbage */ + const dtuple_t* tuple, /*!< in: tuple to insert */ + ulint n_ext, /*!< in: number of externally stored columns */ + mem_heap_t** heap) /*!< in: temporary memory heap */ +{ + page_t* page; + ulint insert_size; + ulint free_space; + ulint total_data; + ulint total_n_recs; + const rec_t* rec; + const rec_t* end_rec; + + page = btr_cur_get_page(cursor); + + ut_ad(!split_rec + || !page_is_comp(page) == !rec_offs_comp(*offsets)); + ut_ad(!split_rec + || rec_offs_validate(split_rec, cursor->index(), *offsets)); + + insert_size = rec_get_converted_size(cursor->index(), tuple, n_ext); + free_space = page_get_free_space_of_empty(page_is_comp(page)); + + /* free_space is now the free space of a created new page */ + + total_data = page_get_data_size(page) + insert_size; + total_n_recs = ulint(page_get_n_recs(page)) + 1; + + /* We determine which records (from rec to end_rec, not including + end_rec) will end up on the other half page from tuple when it is + inserted. */ + + if (!(end_rec = split_rec)) { + end_rec = page_rec_get_next(btr_cur_get_rec(cursor)); + } else if (cmp_dtuple_rec(tuple, split_rec, cursor->index(), + *offsets) < 0) { + rec = split_rec; + end_rec = page_get_supremum_rec(page); + goto got_rec; + } + + if (!(rec = page_rec_get_next(page_get_infimum_rec(page)))) { + return false; + } + +got_rec: + if (total_data + page_dir_calc_reserved_space(total_n_recs) + <= free_space) { + + /* Ok, there will be enough available space on the + half page where the tuple is inserted */ + + return(true); + } + + while (rec != end_rec) { + /* In this loop we calculate the amount of reserved + space after rec is removed from page. */ + + *offsets = rec_get_offsets(rec, cursor->index(), *offsets, + page_is_leaf(page) + ? 
cursor->index()->n_core_fields + : 0, + ULINT_UNDEFINED, heap); + + total_data -= rec_offs_size(*offsets); + total_n_recs--; + + if (total_data + page_dir_calc_reserved_space(total_n_recs) + <= free_space) { + + /* Ok, there will be enough available space on the + half page where the tuple is inserted */ + + return(true); + } + + if (!(rec = page_rec_get_next_const(rec))) { + break; + } + } + + return(false); +} +#endif + +/*******************************************************//** +Inserts a data tuple to a tree on a non-leaf level. It is assumed +that mtr holds an x-latch on the tree. */ +dberr_t +btr_insert_on_non_leaf_level( + ulint flags, /*!< in: undo logging and locking flags */ + dict_index_t* index, /*!< in: index */ + ulint level, /*!< in: level, must be > 0 */ + dtuple_t* tuple, /*!< in: the record to be inserted */ + mtr_t* mtr) /*!< in: mtr */ +{ + big_rec_t* dummy_big_rec; + btr_cur_t cursor; + rec_t* rec; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + rtr_info_t rtr_info; + + ut_ad(level > 0); + + flags |= BTR_NO_LOCKING_FLAG | BTR_KEEP_SYS_FLAG + | BTR_NO_UNDO_LOG_FLAG; + cursor.page_cur.index = index; + + dberr_t err; + + if (index->is_spatial()) { + /* For spatial index, initialize structures to track + its parents etc. */ + rtr_init_rtr_info(&rtr_info, false, &cursor, index, false); + + rtr_info_update_btr(&cursor, &rtr_info); + err = rtr_search_to_nth_level(level, tuple, + PAGE_CUR_RTREE_INSERT, + BTR_CONT_MODIFY_TREE, + &cursor, mtr); + } else { + err = btr_cur_search_to_nth_level(level, tuple, RW_X_LATCH, + &cursor, mtr); + } + + ut_ad(cursor.flag == BTR_CUR_BINARY); + ut_ad(btr_cur_get_block(&cursor) + != mtr->at_savepoint(mtr->get_savepoint() - 1) + || index->is_spatial() + || mtr->memo_contains(index->lock, MTR_MEMO_X_LOCK)); + + if (UNIV_LIKELY(err == DB_SUCCESS)) { + err = btr_cur_optimistic_insert(flags, + &cursor, &offsets, &heap, + tuple, &rec, + &dummy_big_rec, 0, NULL, mtr); + } + + if (err == DB_FAIL) { + err = btr_cur_pessimistic_insert(flags, + &cursor, &offsets, &heap, + tuple, &rec, + &dummy_big_rec, 0, NULL, mtr); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + if (index->is_spatial()) { + ut_ad(cursor.rtr_info); + + rtr_clean_rtr_info(&rtr_info, true); + } + + return err; +} + +static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment"); +static_assert(FIL_PAGE_PREV % 4 == 0, "alignment"); +static_assert(FIL_PAGE_NEXT % 4 == 0, "alignment"); + +MY_ATTRIBUTE((nonnull,warn_unused_result)) +/**************************************************************//** +Attaches the halves of an index page on the appropriate level in an +index tree. 
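+In rough outline: a node pointer to the upper half page is inserted into
+the parent page (which may in turn split), and the FIL_PAGE_PREV and
+FIL_PAGE_NEXT links of the two halves and of their neighbours are updated
+so that the doubly linked list of pages on this level stays intact.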
*/ +static +dberr_t +btr_attach_half_pages( +/*==================*/ + ulint flags, /*!< in: undo logging and + locking flags */ + dict_index_t* index, /*!< in: the index tree */ + buf_block_t* block, /*!< in/out: page to be split */ + const rec_t* split_rec, /*!< in: first record on upper + half page */ + buf_block_t* new_block, /*!< in/out: the new half page */ + ulint direction, /*!< in: FSP_UP or FSP_DOWN */ + mtr_t* mtr) /*!< in: mtr */ +{ + dtuple_t* node_ptr_upper; + mem_heap_t* heap; + buf_block_t* prev_block = nullptr; + buf_block_t* next_block = nullptr; + buf_block_t* lower_block; + buf_block_t* upper_block; + + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr->memo_contains_flagged(new_block, MTR_MEMO_PAGE_X_FIX)); + + /* Create a memory heap where the data tuple is stored */ + heap = mem_heap_create(1024); + + /* Based on split direction, decide upper and lower pages */ + if (direction == FSP_DOWN) { + + btr_cur_t cursor; + rec_offs* offsets; + + lower_block = new_block; + upper_block = block; + + cursor.page_cur.block = block; + cursor.page_cur.index = index; + + /* Look up the index for the node pointer to page */ + offsets = btr_page_get_father_block(nullptr, heap, mtr, + &cursor); + + /* Replace the address of the old child node (= page) with the + address of the new lower half */ + + btr_node_ptr_set_child_page_no( + btr_cur_get_block(&cursor), + btr_cur_get_rec(&cursor), + offsets, lower_block->page.id().page_no(), mtr); + mem_heap_empty(heap); + } else { + lower_block = block; + upper_block = new_block; + } + + /* Get the level of the split pages */ + const ulint level = btr_page_get_level(block->page.frame); + ut_ad(level == btr_page_get_level(new_block->page.frame)); + page_id_t id{block->page.id()}; + + /* Get the previous and next pages of page */ + const uint32_t prev_page_no = btr_page_get_prev(block->page.frame); + const uint32_t next_page_no = btr_page_get_next(block->page.frame); + + /* for consistency, both blocks should be locked, before change */ + if (prev_page_no != FIL_NULL && direction == FSP_DOWN) { + id.set_page_no(prev_page_no); + prev_block = mtr->get_already_latched(id, MTR_MEMO_PAGE_X_FIX); +#if 1 /* MDEV-29835 FIXME: acquire page latches upfront */ + if (!prev_block) { + ut_ad(mtr->memo_contains(index->lock, + MTR_MEMO_X_LOCK)); + prev_block = btr_block_get(*index, prev_page_no, + RW_X_LATCH, !level, mtr); + } +#endif + } + if (next_page_no != FIL_NULL && direction != FSP_DOWN) { + id.set_page_no(next_page_no); + next_block = mtr->get_already_latched(id, MTR_MEMO_PAGE_X_FIX); +#if 1 /* MDEV-29835 FIXME: acquire page latches upfront */ + if (!next_block) { + ut_ad(mtr->memo_contains(index->lock, + MTR_MEMO_X_LOCK)); + next_block = btr_block_get(*index, next_page_no, + RW_X_LATCH, !level, mtr); + } +#endif + } + + /* Build the node pointer (= node key and page address) for the upper + half */ + + node_ptr_upper = dict_index_build_node_ptr( + index, split_rec, upper_block->page.id().page_no(), + heap, level); + + /* Insert it next to the pointer to the lower half. Note that this + may generate recursion leading to a split on the higher level. 
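+	The recursion is bounded by the height of the tree; if the split
+	reaches the root, btr_root_raise_and_insert() will add a level
+	above the root instead of recursing further.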
*/ + + dberr_t err = btr_insert_on_non_leaf_level( + flags, index, level + 1, node_ptr_upper, mtr); + + /* Free the memory heap */ + mem_heap_free(heap); + + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + return err; + } + + /* Update page links of the level */ + + if (prev_block) { + if (UNIV_UNLIKELY(memcmp_aligned<4>(prev_block->page.frame + + FIL_PAGE_NEXT, + block->page.frame + + FIL_PAGE_OFFSET, + 4))) { + return DB_CORRUPTION; + } + btr_page_set_next(prev_block, lower_block->page.id().page_no(), + mtr); + } + + if (next_block) { + if (UNIV_UNLIKELY(memcmp_aligned<4>(next_block->page.frame + + FIL_PAGE_PREV, + block->page.frame + + FIL_PAGE_OFFSET, + 4))) { + return DB_CORRUPTION; + } + btr_page_set_prev(next_block, upper_block->page.id().page_no(), + mtr); + } + + if (direction == FSP_DOWN) { + ut_ad(lower_block == new_block); + ut_ad(btr_page_get_next(upper_block->page.frame) + == next_page_no); + btr_page_set_prev(lower_block, prev_page_no, mtr); + } else { + ut_ad(upper_block == new_block); + ut_ad(btr_page_get_prev(lower_block->page.frame) + == prev_page_no); + btr_page_set_next(upper_block, next_page_no, mtr); + } + + btr_page_set_prev(upper_block, lower_block->page.id().page_no(), mtr); + btr_page_set_next(lower_block, upper_block->page.id().page_no(), mtr); + + return DB_SUCCESS; +} + +/*************************************************************//** +Determine if a tuple is smaller than any record on the page. +@return TRUE if smaller */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +bool +btr_page_tuple_smaller( +/*===================*/ + btr_cur_t* cursor, /*!< in: b-tree cursor */ + const dtuple_t* tuple, /*!< in: tuple to consider */ + rec_offs** offsets,/*!< in/out: temporary storage */ + ulint n_uniq, /*!< in: number of unique fields + in the index page records */ + mem_heap_t** heap) /*!< in/out: heap for offsets */ +{ + buf_block_t* block; + const rec_t* first_rec; + page_cur_t pcur; + + /* Read the first user record in the page. */ + block = btr_cur_get_block(cursor); + page_cur_set_before_first(block, &pcur); + if (UNIV_UNLIKELY(!(first_rec = page_cur_move_to_next(&pcur)))) { + ut_ad("corrupted page" == 0); + return false; + } + + *offsets = rec_get_offsets(first_rec, cursor->index(), *offsets, + page_is_leaf(block->page.frame) + ? cursor->index()->n_core_fields : 0, + n_uniq, heap); + + return cmp_dtuple_rec(tuple, first_rec, cursor->index(), *offsets) < 0; +} + +/** Insert the tuple into the right sibling page, if the cursor is at the end +of a page. +@param[in] flags undo logging and locking flags +@param[in,out] cursor cursor at which to insert; when the function succeeds, + the cursor is positioned before the insert point. 
+@param[out] offsets offsets on inserted record +@param[in,out] heap memory heap for allocating offsets +@param[in] tuple tuple to insert +@param[in] n_ext number of externally stored columns +@param[in,out] mtr mini-transaction +@return inserted record (first record on the right sibling page); + the cursor will be positioned on the page infimum +@retval NULL if the operation was not performed */ +static +rec_t* +btr_insert_into_right_sibling( + ulint flags, + btr_cur_t* cursor, + rec_offs** offsets, + mem_heap_t* heap, + const dtuple_t* tuple, + ulint n_ext, + mtr_t* mtr) +{ + buf_block_t* block = btr_cur_get_block(cursor); + page_t* page = buf_block_get_frame(block); + const uint32_t next_page_no = btr_page_get_next(page); + + ut_ad(mtr->memo_contains_flagged(&cursor->index()->lock, + MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(heap); + ut_ad(dtuple_check_typed(tuple)); + + if (next_page_no == FIL_NULL || !page_rec_is_supremum( + page_rec_get_next(btr_cur_get_rec(cursor)))) { + + return nullptr; + } + + page_cur_t next_page_cursor; + buf_block_t* next_block; + page_t* next_page; + btr_cur_t next_father_cursor; + rec_t* rec = nullptr; + ulint max_size; + + next_block = btr_block_get(*cursor->index(), next_page_no, RW_X_LATCH, + page_is_leaf(page), mtr); + if (UNIV_UNLIKELY(!next_block)) { + return nullptr; + } + next_page = buf_block_get_frame(next_block); + const bool is_leaf = page_is_leaf(next_page); + + next_page_cursor.index = cursor->index(); + next_page_cursor.block = next_block; + next_father_cursor.page_cur = next_page_cursor; + + if (!btr_page_get_father(mtr, &next_father_cursor)) { + return nullptr; + } + + ulint up_match = 0, low_match = 0; + + if (page_cur_search_with_match(tuple, + PAGE_CUR_LE, &up_match, &low_match, + &next_page_cursor, nullptr)) { + return nullptr; + } + + max_size = page_get_max_insert_size_after_reorganize(next_page, 1); + + /* Extends gap lock for the next page */ + if (is_leaf && cursor->index()->has_locking()) { + lock_update_node_pointer(block, next_block); + } + + rec = page_cur_tuple_insert(&next_page_cursor, tuple, offsets, &heap, + n_ext, mtr); + + if (!rec) { + if (is_leaf + && next_block->page.zip.ssize + && !dict_index_is_clust(cursor->index()) + && !cursor->index()->table->is_temporary()) { + /* Reset the IBUF_BITMAP_FREE bits, because + page_cur_tuple_insert() will have attempted page + reorganize before failing. 
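+			The bits are only a hint, but they must never
+			promise more free space than the page really
+			has; resetting them is the conservative choice.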
*/ + ibuf_reset_free_bits(next_block); + } + return nullptr; + } + + ibool compressed; + dberr_t err; + ulint level = btr_page_get_level(next_page); + + /* adjust cursor position */ + *btr_cur_get_page_cur(cursor) = next_page_cursor; + + ut_ad(btr_cur_get_rec(cursor) == page_get_infimum_rec(next_page)); + ut_ad(page_rec_get_next(page_get_infimum_rec(next_page)) == rec); + + /* We have to change the parent node pointer */ + + compressed = btr_cur_pessimistic_delete( + &err, TRUE, &next_father_cursor, + BTR_CREATE_FLAG, false, mtr); + + if (err != DB_SUCCESS) { + return nullptr; + } + + if (!compressed) { + btr_cur_compress_if_useful(&next_father_cursor, false, mtr); + } + + dtuple_t* node_ptr = dict_index_build_node_ptr( + cursor->index(), rec, next_block->page.id().page_no(), + heap, level); + + if (btr_insert_on_non_leaf_level(flags, cursor->index(), level + 1, + node_ptr, mtr) != DB_SUCCESS) { + return nullptr; + } + + ut_ad(rec_offs_validate(rec, cursor->index(), *offsets)); + + if (is_leaf + && !dict_index_is_clust(cursor->index()) + && !cursor->index()->table->is_temporary()) { + /* Update the free bits of the B-tree page in the + insert buffer bitmap. */ + + if (next_block->page.zip.ssize) { + ibuf_update_free_bits_zip(next_block, mtr); + } else { + ibuf_update_free_bits_if_full( + next_block, max_size, + rec_offs_size(*offsets) + PAGE_DIR_SLOT_SIZE); + } + } + + return(rec); +} + +/*************************************************************//** +Moves record list end to another page. Moved records include +split_rec. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return error code */ +static +dberr_t +page_move_rec_list_end( +/*===================*/ + buf_block_t* new_block, /*!< in/out: index page where to move */ + buf_block_t* block, /*!< in: index page from where to move */ + rec_t* split_rec, /*!< in: first record to move */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t* new_page = buf_block_get_frame(new_block); + ulint old_data_size; + ulint new_data_size; + ulint old_n_recs; + ulint new_n_recs; + + ut_ad(!dict_index_is_spatial(index)); + + old_data_size = page_get_data_size(new_page); + old_n_recs = page_get_n_recs(new_page); +#ifdef UNIV_ZIP_DEBUG + { + page_zip_des_t* new_page_zip + = buf_block_get_page_zip(new_block); + page_zip_des_t* page_zip + = buf_block_get_page_zip(block); + ut_a(!new_page_zip == !page_zip); + ut_a(!new_page_zip + || page_zip_validate(new_page_zip, new_page, index)); + ut_a(!page_zip + || page_zip_validate(page_zip, page_align(split_rec), + index)); + } +#endif /* UNIV_ZIP_DEBUG */ + + dberr_t err; + if (!page_copy_rec_list_end(new_block, block, + split_rec, index, mtr, &err)) { + return err; + } + + new_data_size = page_get_data_size(new_page); + new_n_recs = page_get_n_recs(new_page); + + ut_ad(new_data_size >= old_data_size); + + return page_delete_rec_list_end(split_rec, block, index, + new_n_recs - old_n_recs, + new_data_size - old_data_size, mtr); +} + +/*************************************************************//** +Moves record list start to another page. Moved records do not include +split_rec. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. 
+This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return error code */ +static +dberr_t +page_move_rec_list_start( +/*=====================*/ + buf_block_t* new_block, /*!< in/out: index page where to move */ + buf_block_t* block, /*!< in/out: page containing split_rec */ + rec_t* split_rec, /*!< in: first record not to move */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ +{ + dberr_t err; + if (page_copy_rec_list_start(new_block, block, split_rec, index, mtr, &err)) + page_delete_rec_list_start(split_rec, block, index, mtr); + return err; +} + +/*************************************************************//** +Splits an index page to halves and inserts the tuple. It is assumed +that mtr holds an x-latch to the index tree. NOTE: the tree x-latch is +released within this function! NOTE that the operation of this +function must always succeed, we cannot reverse it: therefore enough +free disk space (2 pages) must be guaranteed to be available before +this function is called. +@return inserted record or NULL if run out of space */ +rec_t* +btr_page_split_and_insert( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor at which to insert; when the + function returns, the cursor is positioned + on the predecessor of the inserted record */ + rec_offs** offsets,/*!< out: offsets on inserted record */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ + const dtuple_t* tuple, /*!< in: tuple to insert */ + ulint n_ext, /*!< in: number of externally stored columns */ + mtr_t* mtr, /*!< in: mtr */ + dberr_t* err) /*!< out: error code */ +{ + buf_block_t* block; + page_t* page; + page_zip_des_t* page_zip; + buf_block_t* new_block; + page_t* new_page; + page_zip_des_t* new_page_zip; + rec_t* split_rec; + buf_block_t* left_block; + buf_block_t* right_block; + page_cur_t* page_cursor; + rec_t* first_rec; + byte* buf = 0; /* remove warning */ + rec_t* move_limit; + ulint n_iterations = 0; + ulint n_uniq; + + ut_ad(*err == DB_SUCCESS); + ut_ad(dtuple_check_typed(tuple)); + + buf_pool.pages_split++; + + if (cursor->index()->is_spatial()) { + /* Split rtree page and update parent */ + return rtr_page_split_and_insert(flags, cursor, offsets, heap, + tuple, n_ext, mtr, err); + } + + if (!*heap) { + *heap = mem_heap_create(1024); + } + n_uniq = dict_index_get_n_unique_in_tree(cursor->index()); +func_start: + mem_heap_empty(*heap); + *offsets = NULL; + + ut_ad(mtr->memo_contains_flagged(&cursor->index()->lock, + MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + ut_ad(!dict_index_is_online_ddl(cursor->index()) + || (flags & BTR_CREATE_FLAG) + || dict_index_is_clust(cursor->index())); + ut_ad(cursor->index()->lock.have_u_or_x()); + + block = btr_cur_get_block(cursor); + page = buf_block_get_frame(block); + page_zip = buf_block_get_page_zip(block); + + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(!page_is_empty(page)); + + /* try to insert to the next page if possible before split */ + if (rec_t* rec = btr_insert_into_right_sibling( + flags, cursor, offsets, *heap, tuple, n_ext, mtr)) { + return(rec); + } + + /* 1. 
Decide the split record; split_rec == NULL means that the + tuple to be inserted should be the first record on the upper + half-page */ + bool insert_left = false; + uint32_t hint_page_no = block->page.id().page_no() + 1; + byte direction = FSP_UP; + + if (n_iterations > 0) { + split_rec = btr_page_get_split_rec(cursor, tuple, n_ext); + + if (split_rec == NULL) { + insert_left = btr_page_tuple_smaller( + cursor, tuple, offsets, n_uniq, heap); + } + } else if (btr_page_get_split_rec_to_right(cursor, &split_rec)) { + } else if ((split_rec = btr_page_get_split_rec_to_left(cursor))) { + direction = FSP_DOWN; + hint_page_no -= 2; + } else { + /* If there is only one record in the index page, we + can't split the node in the middle by default. We need + to determine whether the new record will be inserted + to the left or right. */ + + if (page_get_n_recs(page) > 1) { + split_rec = page_get_middle_rec(page); + } else if (btr_page_tuple_smaller(cursor, tuple, + offsets, n_uniq, heap)) { + split_rec = page_rec_get_next( + page_get_infimum_rec(page)); + } else { + split_rec = NULL; + goto got_split_rec; + } + + if (UNIV_UNLIKELY(!split_rec)) { + *err = DB_CORRUPTION; + return nullptr; + } + } + +got_split_rec: + /* 2. Allocate a new page to the index */ + const uint16_t page_level = btr_page_get_level(page); + new_block = btr_page_alloc(cursor->index(), hint_page_no, direction, + page_level, mtr, mtr, err); + + if (!new_block) { + return nullptr; + } + + new_page = buf_block_get_frame(new_block); + new_page_zip = buf_block_get_page_zip(new_block); + + if (page_level && UNIV_LIKELY_NULL(new_page_zip)) { + /* ROW_FORMAT=COMPRESSED non-leaf pages are not expected + to contain FIL_NULL in FIL_PAGE_PREV at this stage. */ + memset_aligned<4>(new_page + FIL_PAGE_PREV, 0, 4); + } + btr_page_create(new_block, new_page_zip, cursor->index(), + page_level, mtr); + /* Only record the leaf level page splits. */ + if (!page_level) { + cursor->index()->stat_defrag_n_page_split ++; + cursor->index()->stat_defrag_modified_counter ++; + btr_defragment_save_defrag_stats_if_needed(cursor->index()); + } + + /* 3. Calculate the first record on the upper half-page, and the + first record (move_limit) on original page which ends up on the + upper half */ + + if (split_rec) { + first_rec = move_limit = split_rec; + + *offsets = rec_get_offsets(split_rec, cursor->index(), + *offsets, page_is_leaf(page) + ? cursor->index()->n_core_fields + : 0, + n_uniq, heap); + + insert_left = cmp_dtuple_rec(tuple, split_rec, cursor->index(), + *offsets) < 0; + + if (!insert_left && new_page_zip && n_iterations > 0) { + /* If a compressed page has already been split, + avoid further splits by inserting the record + to an empty page. */ + split_rec = NULL; + goto insert_empty; + } + } else if (insert_left) { + if (UNIV_UNLIKELY(!n_iterations)) { +corrupted: + *err = DB_CORRUPTION; + return nullptr; + } + first_rec = page_rec_get_next(page_get_infimum_rec(page)); +insert_move_limit: + move_limit = page_rec_get_next(btr_cur_get_rec(cursor)); + if (UNIV_UNLIKELY(!first_rec || !move_limit)) { + goto corrupted; + } + } else { +insert_empty: + ut_ad(!split_rec); + ut_ad(!insert_left); + buf = UT_NEW_ARRAY_NOKEY( + byte, + rec_get_converted_size(cursor->index(), tuple, n_ext)); + + first_rec = rec_convert_dtuple_to_rec(buf, cursor->index(), + tuple, n_ext); + goto insert_move_limit; + } + + /* 4. Do first the modifications in the tree structure */ + + /* FIXME: write FIL_PAGE_PREV,FIL_PAGE_NEXT in new_block earlier! 
*/ + *err = btr_attach_half_pages(flags, cursor->index(), block, + first_rec, new_block, direction, mtr); + + if (UNIV_UNLIKELY(*err != DB_SUCCESS)) { + return nullptr; + } + +#ifdef UNIV_DEBUG + /* If the split is made on the leaf level and the insert will fit + on the appropriate half-page, we may release the tree x-latch. + We can then move the records after releasing the tree latch, + thus reducing the tree latch contention. */ + const bool insert_will_fit = !new_page_zip + && btr_page_insert_fits(cursor, split_rec, offsets, tuple, + n_ext, heap); +#endif + if (!split_rec && !insert_left) { + UT_DELETE_ARRAY(buf); + buf = NULL; + } + +#if 0 // FIXME: this used to be a no-op, and may cause trouble if enabled + if (insert_will_fit + && page_is_leaf(page) + && !dict_index_is_online_ddl(cursor->index())) { + mtr->release(cursor->index()->lock); + /* NOTE: We cannot release root block latch here, because it + has segment header and already modified in most of cases.*/ + } +#endif + + /* 5. Move then the records to the new page */ + if (direction == FSP_DOWN) { + /* fputs("Split left\n", stderr); */ + + if (0 +#ifdef UNIV_ZIP_COPY + || page_zip +#endif /* UNIV_ZIP_COPY */ + || (*err = page_move_rec_list_start(new_block, block, + move_limit, + cursor->index(), + mtr))) { + if (*err != DB_FAIL) { + return nullptr; + } + + /* For some reason, compressing new_block failed, + even though it should contain fewer records than + the original page. Copy the page byte for byte + and then delete the records from both pages + as appropriate. Deleting will always succeed. */ + ut_a(new_page_zip); + + page_zip_copy_recs(new_block, page_zip, page, + cursor->index(), mtr); + *err = page_delete_rec_list_end(move_limit + - page + new_page, + new_block, + cursor->index(), + ULINT_UNDEFINED, + ULINT_UNDEFINED, mtr); + if (*err != DB_SUCCESS) { + return nullptr; + } + + /* Update the lock table and possible hash index. */ + if (cursor->index()->has_locking()) { + lock_move_rec_list_start( + new_block, block, move_limit, + new_page + PAGE_NEW_INFIMUM); + } + + btr_search_move_or_delete_hash_entries( + new_block, block); + + /* Delete the records from the source page. */ + + page_delete_rec_list_start(move_limit, block, + cursor->index(), mtr); + } + + left_block = new_block; + right_block = block; + + if (cursor->index()->has_locking()) { + lock_update_split_left(right_block, left_block); + } + } else { + /* fputs("Split right\n", stderr); */ + + if (0 +#ifdef UNIV_ZIP_COPY + || page_zip +#endif /* UNIV_ZIP_COPY */ + || (*err = page_move_rec_list_end(new_block, block, + move_limit, + cursor->index(), mtr))) { + if (*err != DB_FAIL) { + return nullptr; + } + + /* For some reason, compressing new_page failed, + even though it should contain fewer records than + the original page. Copy the page byte for byte + and then delete the records from both pages + as appropriate. Deleting will always succeed. */ + ut_a(new_page_zip); + + page_zip_copy_recs(new_block, page_zip, page, + cursor->index(), mtr); + page_delete_rec_list_start(move_limit - page + + new_page, new_block, + cursor->index(), mtr); + + /* Update the lock table and possible hash index. */ + if (cursor->index()->has_locking()) { + lock_move_rec_list_end(new_block, block, + move_limit); + } + + btr_search_move_or_delete_hash_entries( + new_block, block); + + /* Delete the records from the source page. 
*/ + + *err = page_delete_rec_list_end(move_limit, block, + cursor->index(), + ULINT_UNDEFINED, + ULINT_UNDEFINED, mtr); + if (*err != DB_SUCCESS) { + return nullptr; + } + } + + left_block = block; + right_block = new_block; + + if (cursor->index()->has_locking()) { + lock_update_split_right(right_block, left_block); + } + } + +#ifdef UNIV_ZIP_DEBUG + if (page_zip) { + ut_a(page_zip_validate(page_zip, page, cursor->index())); + ut_a(page_zip_validate(new_page_zip, new_page, + cursor->index())); + } +#endif /* UNIV_ZIP_DEBUG */ + + /* At this point, split_rec, move_limit and first_rec may point + to garbage on the old page. */ + + /* 6. The split and the tree modification is now completed. Decide the + page where the tuple should be inserted */ + rec_t* rec; + buf_block_t* const insert_block = insert_left + ? left_block : right_block; + + /* 7. Reposition the cursor for insert and try insertion */ + page_cursor = btr_cur_get_page_cur(cursor); + page_cursor->block = insert_block; + + ulint up_match = 0, low_match = 0; + + if (page_cur_search_with_match(tuple, + PAGE_CUR_LE, &up_match, &low_match, + page_cursor, nullptr)) { + *err = DB_CORRUPTION; + return nullptr; + } + + rec = page_cur_tuple_insert(page_cursor, tuple, + offsets, heap, n_ext, mtr); + +#ifdef UNIV_ZIP_DEBUG + { + page_t* insert_page + = buf_block_get_frame(insert_block); + + page_zip_des_t* insert_page_zip + = buf_block_get_page_zip(insert_block); + + ut_a(!insert_page_zip + || page_zip_validate(insert_page_zip, insert_page, + cursor->index())); + } +#endif /* UNIV_ZIP_DEBUG */ + + if (rec != NULL) { + + goto func_exit; + } + + /* 8. If insert did not fit, try page reorganization. + For compressed pages, page_cur_tuple_insert() will have + attempted this already. */ + + if (page_cur_get_page_zip(page_cursor)) { + goto insert_failed; + } + + *err = btr_page_reorganize(page_cursor, mtr); + + if (*err != DB_SUCCESS) { + return nullptr; + } + + rec = page_cur_tuple_insert(page_cursor, tuple, + offsets, heap, n_ext, mtr); + + if (rec == NULL) { + /* The insert did not fit on the page: loop back to the + start of the function for a new split */ +insert_failed: + /* We play safe and reset the free bits for new_page */ + if (!dict_index_is_clust(page_cursor->index) + && !page_cursor->index->table->is_temporary()) { + ibuf_reset_free_bits(new_block); + ibuf_reset_free_bits(block); + } + + n_iterations++; + ut_ad(n_iterations < 2 + || buf_block_get_page_zip(insert_block)); + ut_ad(!insert_will_fit); + + goto func_start; + } + +func_exit: + /* Insert fit on the page: update the free bits for the + left and right pages in the same mtr */ + + if (!dict_index_is_clust(page_cursor->index) + && !page_cursor->index->table->is_temporary() + && page_is_leaf(page)) { + + ibuf_update_free_bits_for_two_pages_low( + left_block, right_block, mtr); + } + + ut_ad(page_validate(buf_block_get_frame(left_block), + page_cursor->index)); + ut_ad(page_validate(buf_block_get_frame(right_block), + page_cursor->index)); + + ut_ad(!rec || rec_offs_validate(rec, page_cursor->index, *offsets)); + return(rec); +} + +/** Remove a page from the level list of pages. 
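+The pages of each level of the tree are chained into a doubly linked
+list through the FIL_PAGE_PREV and FIL_PAGE_NEXT header fields; this
+function unlinks one page by pointing its neighbours at each other.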
+@param[in]	block	page to remove
+@param[in]	index	index tree
+@param[in,out]	mtr	mini-transaction
+@return error code */
+dberr_t btr_level_list_remove(const buf_block_t& block,
+                              const dict_index_t& index, mtr_t* mtr)
+{
+  ut_ad(mtr->memo_contains_flagged(&block, MTR_MEMO_PAGE_X_FIX));
+  ut_ad(block.zip_size() == index.table->space->zip_size());
+  ut_ad(index.table->space->id == block.page.id().space());
+  /* Get the previous and next page numbers of page */
+  const uint32_t prev_page_no= btr_page_get_prev(block.page.frame);
+  const uint32_t next_page_no= btr_page_get_next(block.page.frame);
+  page_id_t id{block.page.id()};
+  buf_block_t *prev= nullptr, *next;
+  dberr_t err;
+
+  /* Update page links of the level */
+  if (prev_page_no != FIL_NULL)
+  {
+    id.set_page_no(prev_page_no);
+    prev= mtr->get_already_latched(id, MTR_MEMO_PAGE_X_FIX);
+#if 1 /* MDEV-29835 FIXME: acquire page latches upfront */
+    if (!prev)
+    {
+      ut_ad(mtr->memo_contains(index.lock, MTR_MEMO_X_LOCK));
+      prev= btr_block_get(index, id.page_no(), RW_X_LATCH,
+                          page_is_leaf(block.page.frame), mtr, &err);
+      if (UNIV_UNLIKELY(!prev))
+        return err;
+    }
+#endif
+  }
+
+  if (next_page_no != FIL_NULL)
+  {
+    id.set_page_no(next_page_no);
+    next= mtr->get_already_latched(id, MTR_MEMO_PAGE_X_FIX);
+#if 1 /* MDEV-29835 FIXME: acquire page latches upfront */
+    if (!next)
+    {
+      ut_ad(mtr->memo_contains(index.lock, MTR_MEMO_X_LOCK));
+      next= btr_block_get(index, id.page_no(), RW_X_LATCH,
+                          page_is_leaf(block.page.frame), mtr, &err);
+      if (UNIV_UNLIKELY(!next))
+        return err;
+    }
+#endif
+    btr_page_set_prev(next, prev_page_no, mtr);
+  }
+
+  if (prev)
+    btr_page_set_next(prev, next_page_no, mtr);
+
+  return DB_SUCCESS;
+}
+
+/*************************************************************//**
+If the page is the only one on its level, this function moves its records
+to the father page, thus reducing the tree height.
+@return father block */
+buf_block_t*
+btr_lift_page_up(
+	dict_index_t*	index,	/*!< in: index tree */
+	buf_block_t*	block,	/*!< in: page which is the only one on its
+				level; must not be empty: use
+				btr_discard_only_page_on_level if the last
+				record from the page should be removed */
+	mtr_t*		mtr,	/*!< in/out: mini-transaction */
+	dberr_t*	err)	/*!< out: error code */
+{
+	buf_block_t*	father_block;
+	ulint		page_level;
+	page_zip_des_t*	father_page_zip;
+	page_t*		page		= buf_block_get_frame(block);
+	ulint		root_page_no;
+	buf_block_t*	blocks[BTR_MAX_LEVELS];
+	ulint		n_blocks;	/*!< last used index in blocks[] */
+	ulint		i;
+	bool		lift_father_up;
+	buf_block_t*	block_orig	= block;
+
+	ut_ad(!page_has_siblings(page));
+	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(!page_is_empty(page));
+
+	page_level = btr_page_get_level(page);
+	root_page_no = dict_index_get_page(index);
+
+	{
+		btr_cur_t	cursor;
+		rec_offs*	offsets	= NULL;
+		mem_heap_t*	heap	= mem_heap_create(
+			sizeof(*offsets)
+			* (REC_OFFS_HEADER_SIZE + 1 + 1
+			   + unsigned(index->n_fields)));
+		buf_block_t*	b;
+		cursor.page_cur.index = index;
+		cursor.page_cur.block = block;
+
+		if (index->is_spatial()) {
+			offsets = rtr_page_get_father_block(
+				nullptr, heap, mtr, nullptr, &cursor);
+		} else {
+			offsets = btr_page_get_father_block(offsets, heap,
+							    mtr, &cursor);
+		}
+		father_block = btr_cur_get_block(&cursor);
+		father_page_zip = buf_block_get_page_zip(father_block);
+
+		n_blocks = 0;
+
+		/* Store all ancestor pages so we can reset their
+		levels later on. We have to do all the searches on
+		the tree now, because later on, after we have replaced
+		the first level, the tree would be in an inconsistent
+		state and could not be searched. */
+		for (b = father_block;
+		     b->page.id().page_no() != root_page_no; ) {
+			ut_a(n_blocks < BTR_MAX_LEVELS);
+
+			if (index->is_spatial()) {
+				offsets = rtr_page_get_father_block(
+					nullptr, heap, mtr, nullptr, &cursor);
+			} else {
+				offsets = btr_page_get_father_block(offsets,
+								    heap,
+								    mtr,
+								    &cursor);
+			}
+
+			blocks[n_blocks++] = b = btr_cur_get_block(&cursor);
+		}
+
+		lift_father_up = (n_blocks && page_level == 0);
+		if (lift_father_up) {
+			/* The father page is then also the only page on
+			its level (and it is not the root). In this case,
+			lift up the father page first: leaf records may
+			only be lifted directly into the root, because the
+			file segment for freeing a page is chosen based on
+			page_level (== 0 or != 0), and if the level of a
+			page changed from != 0 to == 0, the later freeing
+			of the page would not find the page allocation to
+			be freed. */
+
+			block = father_block;
+			page = buf_block_get_frame(block);
+			page_level = btr_page_get_level(page);
+
+			ut_ad(!page_has_siblings(page));
+			ut_ad(mtr->memo_contains_flagged(block,
+							 MTR_MEMO_PAGE_X_FIX));
+
+			father_block = blocks[0];
+			father_page_zip = buf_block_get_page_zip(father_block);
+		}
+
+		mem_heap_free(heap);
+	}
+
+	btr_search_drop_page_hash_index(block, false);
+
+	/* Make the father empty */
+	btr_page_empty(father_block, father_page_zip, index, page_level, mtr);
+	/* btr_page_empty() is supposed to zero-initialize the field. */
+	ut_ad(!page_get_instant(father_block->page.frame));
+
+	if (index->is_instant()
+	    && father_block->page.id().page_no() == root_page_no) {
+		ut_ad(!father_page_zip);
+
+		if (page_is_leaf(page)) {
+			const rec_t* rec = page_rec_get_next(
+				page_get_infimum_rec(page));
+			ut_ad(rec_is_metadata(rec, *index));
+			if (rec_is_add_metadata(rec, *index)
+			    && page_get_n_recs(page) == 1) {
+				index->clear_instant_add();
+				goto copied;
+			}
+		}
+
+		btr_set_instant(father_block, *index, mtr);
+	}
+
+	/* Copy the records to the father page one by one. */
+	if (0
+#ifdef UNIV_ZIP_COPY
+	    || father_page_zip
+#endif /* UNIV_ZIP_COPY */
+	    || !page_copy_rec_list_end(father_block, block,
+				       page_get_infimum_rec(page),
+				       index, mtr, err)) {
+		switch (*err) {
+		case DB_SUCCESS:
+			break;
+		case DB_FAIL:
+			*err = DB_SUCCESS;
+			break;
+		default:
+			return nullptr;
+		}
+
+		const page_zip_des_t*	page_zip
+			= buf_block_get_page_zip(block);
+		ut_a(father_page_zip);
+		ut_a(page_zip);
+
+		/* Copy the page byte for byte. */
+		page_zip_copy_recs(father_block,
+				   page_zip, page, index, mtr);
+
+		/* Update the lock table and possible hash index. */
+
+		if (index->has_locking()) {
+			lock_move_rec_list_end(father_block, block,
+					       page_get_infimum_rec(page));
+		}
+
+		/* Also update the predicate locks */
+		if (dict_index_is_spatial(index)) {
+			lock_prdt_rec_move(father_block, block->page.id());
+		} else {
+			btr_search_move_or_delete_hash_entries(
+				father_block, block);
+		}
+	}
+
+copied:
+	if (index->has_locking()) {
+		const page_id_t	id{block->page.id()};
+		/* Free predicate page locks on the block */
+		if (index->is_spatial()) {
+			lock_sys.prdt_page_free_from_discard(id);
+		} else {
+			lock_update_copy_and_discard(*father_block, id);
+		}
+	}
+
+	page_level++;
+
+	/* Go upward to root page, decrementing levels by one. */
+	for (i = lift_father_up ? 1 : 0; i < n_blocks; i++, page_level++) {
+		ut_ad(btr_page_get_level(blocks[i]->page.frame)
+		      == page_level + 1);
+		btr_page_set_level(blocks[i], page_level, mtr);
+	}
+
+	if (dict_index_is_spatial(index)) {
+		rtr_check_discard_page(index, NULL, block);
+	}
+
+	/* Free the file page */
+	btr_page_free(index, block, mtr);
+
+	/* We play it safe and reset the free bits for the father */
+	if (!dict_index_is_clust(index)
+	    && !index->table->is_temporary()) {
+		ibuf_reset_free_bits(father_block);
+	}
+	ut_ad(page_validate(father_block->page.frame, index));
+	ut_ad(btr_check_node_ptr(index, father_block, mtr));
+
+	return(lift_father_up ? block_orig : father_block);
+}
+
+/*************************************************************//**
+Tries to merge the page first to the left immediate brother if such a
+brother exists, and the node pointers to the current page and to the brother
+reside on the same page. If the left brother does not satisfy these
+conditions, looks at the right brother. If the page is the only one on that
+level, lifts the records of the page to the father page, thus reducing the
+tree height. It is assumed that mtr holds an x-latch on the tree and on the
+page. If cursor is on the leaf level, mtr must also hold x-latches on the
+brothers, if they exist.
+@return error code */
+dberr_t
+btr_compress(
+/*=========*/
+	btr_cur_t*	cursor,	/*!< in/out: cursor on the page to merge
+				or lift; the page must not be empty:
+				when deleting records, use btr_discard_page()
+				if the page would become empty */
+	bool		adjust,	/*!< in: whether the cursor position should be
+				adjusted even when compression occurs */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+{
+	dict_index_t*	index;
+	buf_block_t*	merge_block = nullptr;
+	page_t*		merge_page = nullptr;
+	page_zip_des_t*	merge_page_zip;
+	ibool		is_left;
+	buf_block_t*	block;
+	page_t*		page;
+	btr_cur_t	father_cursor;
+	mem_heap_t*	heap;
+	rec_offs*	offsets;
+	ulint		nth_rec = 0; /* remove bogus warning */
+	bool		mbr_changed = false;
+#ifdef UNIV_DEBUG
+	bool		leftmost_child;
+#endif
+	DBUG_ENTER("btr_compress");
+
+	block = btr_cur_get_block(cursor);
+	page = btr_cur_get_page(cursor);
+	index = btr_cur_get_index(cursor);
+
+	ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+					 | MTR_MEMO_SX_LOCK));
+	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+
+	MONITOR_INC(MONITOR_INDEX_MERGE_ATTEMPTS);
+
+	const uint32_t left_page_no = btr_page_get_prev(page);
+	const uint32_t right_page_no = btr_page_get_next(page);
+	dberr_t err = DB_SUCCESS;
+
+	ut_ad(page_is_leaf(page) || left_page_no != FIL_NULL
+	      || (REC_INFO_MIN_REC_FLAG & rec_get_info_bits(
+			  page_rec_get_next(page_get_infimum_rec(page)),
+			  page_is_comp(page))));
+
+	heap = mem_heap_create(100);
+	father_cursor.page_cur.index = index;
+	father_cursor.page_cur.block = block;
+
+	if (index->is_spatial()) {
+		offsets = rtr_page_get_father_block(
+			NULL, heap, mtr, cursor, &father_cursor);
+		ut_ad(cursor->page_cur.block->page.id() == block->page.id());
+		rec_t*	my_rec = father_cursor.page_cur.rec;
+
+		ulint page_no = btr_node_ptr_get_child_page_no(my_rec, offsets);
+
+		if (page_no != block->page.id().page_no()) {
+			ib::info() << "father positioned on page "
+				<< page_no << " instead of "
+				<< block->page.id().page_no();
+			offsets = btr_page_get_father_block(
+				NULL, heap, mtr, &father_cursor);
+		}
+	} else {
+		offsets = btr_page_get_father_block(
+			NULL, heap, mtr, &father_cursor);
+	}
+
+	if (adjust) {
+		nth_rec = page_rec_get_n_recs_before(btr_cur_get_rec(cursor));
+
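+		/* nth_rec is the position of the cursor record on the
+		page, counting from the page infimum; it is used below
+		to re-position the cursor on the page that remains
+		after the merge. */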
if (UNIV_UNLIKELY(!nth_rec || nth_rec == ULINT_UNDEFINED)) { + corrupted: + err = DB_CORRUPTION; + err_exit: + /* We play it safe and reset the free bits. */ + if (merge_block && merge_block->zip_size() + && page_is_leaf(merge_block->page.frame) + && !index->is_clust()) { + ibuf_reset_free_bits(merge_block); + } + goto func_exit; + } + } + + if (left_page_no == FIL_NULL && right_page_no == FIL_NULL) { + /* The page is the only one on the level, lift the records + to the father */ + + merge_block = btr_lift_page_up(index, block, mtr, &err); +success: + if (adjust) { + ut_ad(nth_rec > 0); + if (rec_t* nth + = page_rec_get_nth(merge_block->page.frame, + nth_rec)) { + btr_cur_position(index, nth, + merge_block, cursor); + } else { + goto corrupted; + } + } + + MONITOR_INC(MONITOR_INDEX_MERGE_SUCCESSFUL); +func_exit: + mem_heap_free(heap); + DBUG_RETURN(err); + } + + ut_d(leftmost_child = + left_page_no != FIL_NULL + && (page_rec_get_next( + page_get_infimum_rec( + btr_cur_get_page(&father_cursor))) + == btr_cur_get_rec(&father_cursor))); + + /* Decide the page to which we try to merge and which will inherit + the locks */ + + is_left = btr_can_merge_with_page(cursor, left_page_no, + &merge_block, mtr); + + DBUG_EXECUTE_IF("ib_always_merge_right", is_left = FALSE;); +retry: + if (!is_left + && !btr_can_merge_with_page(cursor, right_page_no, &merge_block, + mtr)) { + if (!merge_block) { + merge_page = NULL; + } +cannot_merge: + err = DB_FAIL; + goto err_exit; + } + + merge_page = buf_block_get_frame(merge_block); + + if (UNIV_UNLIKELY(memcmp_aligned<4>(merge_page + (is_left + ? FIL_PAGE_NEXT + : FIL_PAGE_PREV), + block->page.frame + + FIL_PAGE_OFFSET, 4))) { + goto corrupted; + } + + ut_ad(page_validate(merge_page, index)); + + merge_page_zip = buf_block_get_page_zip(merge_block); +#ifdef UNIV_ZIP_DEBUG + if (merge_page_zip) { + const page_zip_des_t* page_zip + = buf_block_get_page_zip(block); + ut_a(page_zip); + ut_a(page_zip_validate(merge_page_zip, merge_page, index)); + ut_a(page_zip_validate(page_zip, page, index)); + } +#endif /* UNIV_ZIP_DEBUG */ + + btr_cur_t cursor2; + cursor2.page_cur.index = index; + cursor2.page_cur.block = merge_block; + + /* Move records to the merge page */ + if (is_left) { + rtr_mbr_t new_mbr; + rec_offs* offsets2 = NULL; + + /* For rtree, we need to update father's mbr. */ + if (index->is_spatial()) { + /* We only support merge pages with the same parent + page */ + if (!rtr_check_same_block( + index, &cursor2, + btr_cur_get_block(&father_cursor), heap)) { + is_left = false; + goto retry; + } + + /* Set rtr_info for cursor2, since it is + necessary in recursive page merge. */ + cursor2.rtr_info = cursor->rtr_info; + cursor2.tree_height = cursor->tree_height; + + offsets2 = rec_get_offsets( + btr_cur_get_rec(&cursor2), index, NULL, + page_is_leaf(btr_cur_get_page(&cursor2)) + ? 
index->n_fields : 0,
+				ULINT_UNDEFINED, &heap);
+
+			/* Check if parent entry needs to be updated */
+			mbr_changed = rtr_merge_mbr_changed(
+				&cursor2, &father_cursor,
+				offsets2, offsets, &new_mbr);
+		}
+
+		rec_t*	orig_pred = page_copy_rec_list_start(
+			merge_block, block, page_get_supremum_rec(page),
+			index, mtr, &err);
+
+		if (!orig_pred) {
+			goto err_exit;
+		}
+
+		btr_search_drop_page_hash_index(block, false);
+
+		/* Remove the page from the level list */
+		err = btr_level_list_remove(*block, *index, mtr);
+
+		if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+			goto err_exit;
+		}
+
+		const page_id_t	id{block->page.id()};
+
+		if (index->is_spatial()) {
+			rec_t*	my_rec = father_cursor.page_cur.rec;
+
+			ulint page_no = btr_node_ptr_get_child_page_no(
+				my_rec, offsets);
+
+			if (page_no != block->page.id().page_no()) {
+				ib::fatal() << "father positioned on "
+					<< page_no << " instead of "
+					<< block->page.id().page_no();
+			}
+
+			if (mbr_changed) {
+				rtr_update_mbr_field(
+					&cursor2, offsets2, &father_cursor,
+					merge_page, &new_mbr, NULL, mtr);
+			} else {
+				rtr_node_ptr_delete(&father_cursor, mtr);
+			}
+
+			/* No GAP locks need to be worried about */
+			lock_sys.prdt_page_free_from_discard(id);
+		} else {
+			err = btr_cur_node_ptr_delete(&father_cursor, mtr);
+			if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+				goto err_exit;
+			}
+			if (index->has_locking()) {
+				lock_update_merge_left(
+					*merge_block, orig_pred, id);
+			}
+		}
+
+		if (adjust) {
+			ulint n = page_rec_get_n_recs_before(orig_pred);
+			if (UNIV_UNLIKELY(!n || n == ULINT_UNDEFINED)) {
+				goto corrupted;
+			}
+			nth_rec += n;
+		}
+	} else {
+		rec_t*		orig_succ;
+		ibool		compressed;
+		dberr_t		err;
+		byte		fil_page_prev[4];
+
+		if (index->is_spatial()) {
+			/* For spatial indexes, we disallow merging blocks
+			with different parents, since the merge would need
+			to update the entry (for MBR and primary key) in
+			the parent of the block being merged */
+			if (!rtr_check_same_block(
+				index, &cursor2,
+				btr_cur_get_block(&father_cursor), heap)) {
+				goto cannot_merge;
+			}
+
+			/* Set rtr_info for cursor2, since it is
+			necessary in recursive page merge. */
+			cursor2.rtr_info = cursor->rtr_info;
+			cursor2.tree_height = cursor->tree_height;
+		} else if (!btr_page_get_father(mtr, &cursor2)) {
+			goto cannot_merge;
+		}
+
+		if (merge_page_zip && left_page_no == FIL_NULL) {
+
+			/* The function page_zip_compress(), which will be
+			invoked by page_copy_rec_list_end() below,
+			requires that FIL_PAGE_PREV be FIL_NULL.
+			Clear the field, but prepare to restore it. */
+			static_assert(FIL_PAGE_PREV % 8 == 0, "alignment");
+			memcpy(fil_page_prev, merge_page + FIL_PAGE_PREV, 4);
+			compile_time_assert(FIL_NULL == 0xffffffffU);
+			memset_aligned<4>(merge_page + FIL_PAGE_PREV, 0xff, 4);
+		}
+
+		orig_succ = page_copy_rec_list_end(merge_block, block,
+						   page_get_infimum_rec(page),
+						   cursor->index(), mtr, &err);
+
+		if (!orig_succ) {
+			ut_a(merge_page_zip);
+			if (left_page_no == FIL_NULL) {
+				/* FIL_PAGE_PREV was restored from
+				merge_page_zip. */
+				ut_ad(!memcmp(fil_page_prev,
+					      merge_page + FIL_PAGE_PREV, 4));
+			}
+			goto err_exit;
+		}
+
+		btr_search_drop_page_hash_index(block, false);
+
+		if (merge_page_zip && left_page_no == FIL_NULL) {
+
+			/* Restore FIL_PAGE_PREV in order to avoid an assertion
+			failure in btr_level_list_remove(), which will set
+			the field again to FIL_NULL. Even though this makes
+			merge_page and merge_page_zip inconsistent for a
+			split second, it is harmless, because the pages
+			are X-latched. 
*/ + memcpy(merge_page + FIL_PAGE_PREV, fil_page_prev, 4); + } + + /* Remove the page from the level list */ + err = btr_level_list_remove(*block, *index, mtr); + + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + goto err_exit; + } + + ut_ad(btr_node_ptr_get_child_page_no( + btr_cur_get_rec(&father_cursor), offsets) + == block->page.id().page_no()); + + /* Replace the address of the old child node (= page) with the + address of the merge page to the right */ + btr_node_ptr_set_child_page_no( + btr_cur_get_block(&father_cursor), + btr_cur_get_rec(&father_cursor), + offsets, right_page_no, mtr); + +#ifdef UNIV_DEBUG + if (!page_is_leaf(page) && left_page_no == FIL_NULL) { + ut_ad(REC_INFO_MIN_REC_FLAG & rec_get_info_bits( + page_rec_get_next(page_get_infimum_rec( + buf_block_get_frame(merge_block))), + page_is_comp(page))); + } +#endif /* UNIV_DEBUG */ + + /* For rtree, we need to update father's mbr. */ + if (index->is_spatial()) { + rec_offs* offsets2; + ulint rec_info; + + offsets2 = rec_get_offsets( + btr_cur_get_rec(&cursor2), index, NULL, + page_is_leaf(btr_cur_get_page(&cursor2)) + ? index->n_fields : 0, + ULINT_UNDEFINED, &heap); + + ut_ad(btr_node_ptr_get_child_page_no( + btr_cur_get_rec(&cursor2), offsets2) + == right_page_no); + + rec_info = rec_get_info_bits( + btr_cur_get_rec(&father_cursor), + rec_offs_comp(offsets)); + if (rec_info & REC_INFO_MIN_REC_FLAG) { + /* When the father node ptr is minimal rec, + we will keep it and delete the node ptr of + merge page. */ + rtr_merge_and_update_mbr(&father_cursor, + &cursor2, + offsets, offsets2, + merge_page, mtr); + } else { + /* Otherwise, we will keep the node ptr of + merge page and delete the father node ptr. + This is for keeping the rec order in upper + level. */ + rtr_merge_and_update_mbr(&cursor2, + &father_cursor, + offsets2, offsets, + merge_page, mtr); + } + const page_id_t id{block->page.id()}; + lock_sys.prdt_page_free_from_discard(id); + } else { + + compressed = btr_cur_pessimistic_delete(&err, TRUE, + &cursor2, + BTR_CREATE_FLAG, + false, mtr); + ut_a(err == DB_SUCCESS); + + if (!compressed) { + btr_cur_compress_if_useful(&cursor2, false, + mtr); + } + + if (index->has_locking()) { + lock_update_merge_right( + merge_block, orig_succ, block); + } + } + } + + if (!dict_index_is_clust(index) + && !index->table->is_temporary() + && page_is_leaf(merge_page)) { + /* Update the free bits of the B-tree page in the + insert buffer bitmap. This has to be done in a + separate mini-transaction that is committed before the + main mini-transaction. We cannot update the insert + buffer bitmap in this mini-transaction, because + btr_compress() can be invoked recursively without + committing the mini-transaction in between. Since + insert buffer bitmap pages have a lower rank than + B-tree pages, we must not access other pages in the + same mini-transaction after accessing an insert buffer + bitmap page. */ + + /* The free bits in the insert buffer bitmap must + never exceed the free space on a page. It is safe to + decrement or reset the bits in the bitmap in a + mini-transaction that is committed before the + mini-transaction that affects the free space. */ + + /* It is unsafe to increment the bits in a separately + committed mini-transaction, because in crash recovery, + the free bits could momentarily be set too high. 
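+	For example, if a mini-transaction that incremented the bits
+	was made durable before a crash, but the mini-transaction that
+	would have consumed the corresponding free space was not, then
+	recovery would observe bits that promise more space than the
+	page actually has.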
*/ + + if (merge_block->zip_size()) { + /* Because the free bits may be incremented + and we cannot update the insert buffer bitmap + in the same mini-transaction, the only safe + thing we can do here is the pessimistic + approach: reset the free bits. */ + ibuf_reset_free_bits(merge_block); + } else { + /* On uncompressed pages, the free bits will + never increase here. Thus, it is safe to + write the bits accurately in a separate + mini-transaction. */ + ibuf_update_free_bits_if_full(merge_block, + srv_page_size, + ULINT_UNDEFINED); + } + } + + ut_ad(page_validate(merge_page, index)); +#ifdef UNIV_ZIP_DEBUG + ut_a(!merge_page_zip || page_zip_validate(merge_page_zip, merge_page, + index)); +#endif /* UNIV_ZIP_DEBUG */ + + if (dict_index_is_spatial(index)) { + rtr_check_discard_page(index, NULL, block); + } + + /* Free the file page */ + err = btr_page_free(index, block, mtr); + if (err == DB_SUCCESS) { + ut_ad(leftmost_child + || btr_check_node_ptr(index, merge_block, mtr)); + goto success; + } else { + goto err_exit; + } +} + +/*************************************************************//** +Discards a page that is the only page on its level. This will empty +the whole B-tree, leaving just an empty root page. This function +should almost never be reached, because btr_compress(), which is invoked in +delete operations, calls btr_lift_page_up() to flatten the B-tree. */ +ATTRIBUTE_COLD +static +void +btr_discard_only_page_on_level( +/*===========================*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: page which is the only on its level */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint page_level = 0; + + ut_ad(!index->is_dummy); + + /* Save the PAGE_MAX_TRX_ID from the leaf page. */ + const trx_id_t max_trx_id = page_get_max_trx_id(block->page.frame); + const rec_t* r = page_rec_get_next( + page_get_infimum_rec(block->page.frame)); + /* In the caller we checked that a valid key exists in the page, + because we were able to look up a parent page. */ + ut_ad(r); + ut_ad(rec_is_metadata(r, *index) == index->is_instant()); + + while (block->page.id().page_no() != dict_index_get_page(index)) { + btr_cur_t cursor; + buf_block_t* father; + const page_t* page = buf_block_get_frame(block); + + ut_a(page_get_n_recs(page) == 1); + ut_a(page_level == btr_page_get_level(page)); + ut_a(!page_has_siblings(page)); + ut_ad(fil_page_index_page_check(page)); + ut_ad(block->page.id().space() == index->table->space->id); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + btr_search_drop_page_hash_index(block, false); + cursor.page_cur.index = index; + cursor.page_cur.block = block; + + if (index->is_spatial()) { + /* Check any concurrent search having this page */ + rtr_check_discard_page(index, NULL, block); + if (!rtr_page_get_father(mtr, nullptr, &cursor)) { + return; + } + } else { + if (!btr_page_get_father(mtr, &cursor)) { + return; + } + } + father = btr_cur_get_block(&cursor); + + if (index->has_locking()) { + lock_update_discard( + father, PAGE_HEAP_NO_SUPREMUM, block); + } + + /* Free the file page */ + if (btr_page_free(index, block, mtr) != DB_SUCCESS) { + return; + } + + block = father; + page_level++; + } + + /* block is the root page, which must be empty, except + for the node pointer to the (now discarded) block(s). 
*/ + ut_ad(!page_has_siblings(block->page.frame)); + + mem_heap_t* heap = nullptr; + const rec_t* rec = nullptr; + rec_offs* offsets = nullptr; + if (index->table->instant || index->must_avoid_clear_instant_add()) { + if (!rec_is_metadata(r, *index)) { + } else if (!index->table->instant + || rec_is_alter_metadata(r, *index)) { + heap = mem_heap_create(srv_page_size); + offsets = rec_get_offsets(r, index, nullptr, + index->n_core_fields, + ULINT_UNDEFINED, &heap); + rec = rec_copy(mem_heap_alloc(heap, + rec_offs_size(offsets)), + r, offsets); + rec_offs_make_valid(rec, index, true, offsets); + } + } + + btr_page_empty(block, buf_block_get_page_zip(block), index, 0, mtr); + ut_ad(page_is_leaf(buf_block_get_frame(block))); + /* btr_page_empty() is supposed to zero-initialize the field. */ + ut_ad(!page_get_instant(block->page.frame)); + + if (index->is_primary()) { + if (rec) { + page_cur_t cur; + page_cur_set_before_first(block, &cur); + cur.index = index; + DBUG_ASSERT(index->table->instant); + DBUG_ASSERT(rec_is_alter_metadata(rec, *index)); + btr_set_instant(block, *index, mtr); + rec = page_cur_insert_rec_low(&cur, rec, offsets, mtr); + ut_ad(rec); + mem_heap_free(heap); + } else if (index->is_instant()) { + index->clear_instant_add(); + } + } else if (!index->table->is_temporary()) { + /* We play it safe and reset the free bits for the root */ + ibuf_reset_free_bits(block); + + ut_a(max_trx_id); + page_set_max_trx_id(block, + buf_block_get_page_zip(block), + max_trx_id, mtr); + } +} + +/*************************************************************//** +Discards a page from a B-tree. This is used to remove the last record from +a B-tree page: the whole page must be removed at the same time. This cannot +be used for the root page, which is allowed to be empty. */ +dberr_t +btr_discard_page( +/*=============*/ + btr_cur_t* cursor, /*!< in: cursor on the page to discard: not on + the root page */ + mtr_t* mtr) /*!< in: mtr */ +{ + dict_index_t* index; + buf_block_t* merge_block; + buf_block_t* block; + btr_cur_t parent_cursor; + + block = btr_cur_get_block(cursor); + index = btr_cur_get_index(cursor); + parent_cursor.page_cur = cursor->page_cur; + + ut_ad(dict_index_get_page(index) != block->page.id().page_no()); + + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + + MONITOR_INC(MONITOR_INDEX_DISCARD); + + if (index->is_spatial() + ? !rtr_page_get_father(mtr, cursor, &parent_cursor) + : !btr_page_get_father(mtr, &parent_cursor)) { + return DB_CORRUPTION; + } + + /* Decide the page which will inherit the locks */ + + const uint32_t left_page_no = btr_page_get_prev(block->page.frame); + const uint32_t right_page_no = btr_page_get_next(block->page.frame); + page_id_t merge_page_id{block->page.id()}; + + ut_d(bool parent_is_different = false); + dberr_t err; + if (left_page_no != FIL_NULL) { + merge_page_id.set_page_no(left_page_no); + merge_block = btr_block_reget(mtr, *index, merge_page_id, + &err); + if (UNIV_UNLIKELY(!merge_block)) { + return err; + } +#if 1 /* MDEV-29835 FIXME: Acquire the page latch upfront. 
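+	Until that is done, the #else branch below documents the
+	intended corruption check; this branch only asserts in debug
+	builds.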
*/ + ut_ad(!memcmp_aligned<4>(merge_block->page.frame + + FIL_PAGE_NEXT, + block->page.frame + FIL_PAGE_OFFSET, + 4)); +#else + if (UNIV_UNLIKELY(memcmp_aligned<4>(merge_block->page.frame + + FIL_PAGE_NEXT, + block->page.frame + + FIL_PAGE_OFFSET, 4))) { + return DB_CORRUPTION; + } +#endif + ut_d(parent_is_different = + (page_rec_get_next( + page_get_infimum_rec( + btr_cur_get_page( + &parent_cursor))) + == btr_cur_get_rec(&parent_cursor))); + } else if (right_page_no != FIL_NULL) { + merge_page_id.set_page_no(right_page_no); + merge_block = btr_block_reget(mtr, *index, merge_page_id, + &err); + if (UNIV_UNLIKELY(!merge_block)) { + return err; + } +#if 1 /* MDEV-29835 FIXME: Acquire the page latch upfront. */ + ut_ad(!memcmp_aligned<4>(merge_block->page.frame + + FIL_PAGE_PREV, + block->page.frame + FIL_PAGE_OFFSET, + 4)); +#else + if (UNIV_UNLIKELY(memcmp_aligned<4>(merge_block->page.frame + + FIL_PAGE_PREV, + block->page.frame + + FIL_PAGE_OFFSET, 4))) { + return DB_CORRUPTION; + } +#endif + ut_d(parent_is_different = page_rec_is_supremum( + page_rec_get_next(btr_cur_get_rec(&parent_cursor)))); + if (page_is_leaf(merge_block->page.frame)) { + } else if (rec_t* node_ptr = + page_rec_get_next(page_get_infimum_rec( + merge_block->page.frame))) { + ut_ad(page_rec_is_user_rec(node_ptr)); + /* We have to mark the leftmost node pointer as the + predefined minimum record. */ + btr_set_min_rec_mark(node_ptr, *merge_block, + mtr); + } else { + return DB_CORRUPTION; + } + } else { + btr_discard_only_page_on_level(index, block, mtr); + return DB_SUCCESS; + } + + if (UNIV_UNLIKELY(memcmp_aligned<2>(&merge_block->page.frame + [PAGE_HEADER + PAGE_LEVEL], + &block->page.frame + [PAGE_HEADER + PAGE_LEVEL], 2))) { + return DB_CORRUPTION; + } + + btr_search_drop_page_hash_index(block, false); + + if (dict_index_is_spatial(index)) { + rtr_node_ptr_delete(&parent_cursor, mtr); + } else if (dberr_t err = + btr_cur_node_ptr_delete(&parent_cursor, mtr)) { + return err; + } + + /* Remove the page from the level list */ + if (dberr_t err = btr_level_list_remove(*block, *index, mtr)) { + return err; + } + +#ifdef UNIV_ZIP_DEBUG + if (page_zip_des_t* merge_page_zip + = buf_block_get_page_zip(merge_block)) + ut_a(page_zip_validate(merge_page_zip, + merge_block->page.frame, index)); +#endif /* UNIV_ZIP_DEBUG */ + + if (index->has_locking()) { + if (left_page_no != FIL_NULL) { + lock_update_discard(merge_block, PAGE_HEAP_NO_SUPREMUM, + block); + } else { + lock_update_discard(merge_block, + lock_get_min_heap_no(merge_block), + block); + } + + if (index->is_spatial()) { + rtr_check_discard_page(index, cursor, block); + } + } + + /* Free the file page */ + err = btr_page_free(index, block, mtr); + + if (err == DB_SUCCESS) { + /* btr_check_node_ptr() needs parent block latched. + If the merge_block's parent block is not same, + we cannot use btr_check_node_ptr() */ + ut_ad(parent_is_different + || btr_check_node_ptr(index, merge_block, mtr)); + + if (btr_cur_get_block(&parent_cursor)->page.id().page_no() + == index->page + && !page_has_siblings(btr_cur_get_page(&parent_cursor)) + && page_get_n_recs(btr_cur_get_page(&parent_cursor)) + == 1) { + btr_lift_page_up(index, merge_block, mtr, &err); + } + } + + return err; +} + +#ifdef UNIV_BTR_PRINT +/*************************************************************//** +Prints size info of a B-tree. 
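+This diagnostic function is only compiled when UNIV_BTR_PRINT is defined.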
*/ +void +btr_print_size( +/*===========*/ + dict_index_t* index) /*!< in: index tree */ +{ + page_t* root; + fseg_header_t* seg; + mtr_t mtr; + + if (dict_index_is_ibuf(index)) { + fputs("Sorry, cannot print info of an ibuf tree:" + " use ibuf functions\n", stderr); + + return; + } + + mtr_start(&mtr); + + root = btr_root_get(index, &mtr); + + seg = root + PAGE_HEADER + PAGE_BTR_SEG_TOP; + + fputs("INFO OF THE NON-LEAF PAGE SEGMENT\n", stderr); + fseg_print(seg, &mtr); + + if (!dict_index_is_ibuf(index)) { + + seg = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF; + + fputs("INFO OF THE LEAF PAGE SEGMENT\n", stderr); + fseg_print(seg, &mtr); + } + + mtr_commit(&mtr); +} + +/************************************************************//** +Prints recursively index tree pages. */ +static +void +btr_print_recursive( +/*================*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: index page */ + ulint width, /*!< in: print this many entries from start + and end */ + mem_heap_t** heap, /*!< in/out: heap for rec_get_offsets() */ + rec_offs** offsets,/*!< in/out: buffer for rec_get_offsets() */ + mtr_t* mtr) /*!< in: mtr */ +{ + const page_t* page = buf_block_get_frame(block); + page_cur_t cursor; + ulint n_recs; + ulint i = 0; + mtr_t mtr2; + + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_SX_FIX)); + + ib::info() << "NODE ON LEVEL " << btr_page_get_level(page) + << " page " << block->page.id; + + page_print(block, index, width, width); + + n_recs = page_get_n_recs(page); + + page_cur_set_before_first(block, &cursor); + page_cur_move_to_next(&cursor); + + while (!page_cur_is_after_last(&cursor)) { + + if (page_is_leaf(page)) { + + /* If this is the leaf level, do nothing */ + + } else if ((i <= width) || (i >= n_recs - width)) { + + const rec_t* node_ptr; + + mtr_start(&mtr2); + + node_ptr = page_cur_get_rec(&cursor); + + *offsets = rec_get_offsets( + node_ptr, index, *offsets, 0, + ULINT_UNDEFINED, heap); + if (buf_block_t *child = + btr_node_ptr_get_child(node_ptr, index, *offsets, + &mtr2)) { + btr_print_recursive(index, child, width, heap, + offsets, &mtr2); + } + mtr_commit(&mtr2); + } + + page_cur_move_to_next(&cursor); + i++; + } +} + +/**************************************************************//** +Prints directories and other info of all nodes in the tree. */ +void +btr_print_index( +/*============*/ + dict_index_t* index, /*!< in: index */ + ulint width) /*!< in: print this many entries from start + and end */ +{ + mtr_t mtr; + buf_block_t* root; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + + fputs("--------------------------\n" + "INDEX TREE PRINT\n", stderr); + + mtr_start(&mtr); + + root = btr_root_block_get(index, RW_SX_LATCH, &mtr); + + btr_print_recursive(index, root, width, &heap, &offsets, &mtr); + if (heap) { + mem_heap_free(heap); + } + + mtr_commit(&mtr); + + ut_ad(btr_validate_index(index, 0)); +} +#endif /* UNIV_BTR_PRINT */ + +#ifdef UNIV_DEBUG +/************************************************************//** +Checks that the node pointer to a page is appropriate. 
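+The caller must hold an X-latch on the page. The check trivially passes
+for the root page; for leaf pages, only the lookup of the father node
+pointer is exercised.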
+@return TRUE */ +ibool +btr_check_node_ptr( +/*===============*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: index page */ + mtr_t* mtr) /*!< in: mtr */ +{ + mem_heap_t* heap; + dtuple_t* tuple; + rec_offs* offsets; + btr_cur_t cursor; + page_t* page = buf_block_get_frame(block); + + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + + if (dict_index_get_page(index) == block->page.id().page_no()) { + + return(TRUE); + } + + cursor.page_cur.index = index; + cursor.page_cur.block = block; + + heap = mem_heap_create(256); + + if (dict_index_is_spatial(index)) { + offsets = rtr_page_get_father_block(NULL, heap, mtr, + NULL, &cursor); + } else { + offsets = btr_page_get_father_block(NULL, heap, mtr, &cursor); + } + + ut_ad(offsets); + + if (page_is_leaf(page)) { + + goto func_exit; + } + + tuple = dict_index_build_node_ptr( + index, page_rec_get_next(page_get_infimum_rec(page)), 0, heap, + btr_page_get_level(page)); + + /* For spatial index, the MBR in the parent rec could be different + with that of first rec of child, their relationship should be + "WITHIN" relationship */ + if (dict_index_is_spatial(index)) { + ut_a(!cmp_dtuple_rec_with_gis( + tuple, btr_cur_get_rec(&cursor), + PAGE_CUR_WITHIN)); + } else { + ut_a(!cmp_dtuple_rec(tuple, btr_cur_get_rec(&cursor), index, + offsets)); + } +func_exit: + mem_heap_free(heap); + + return(TRUE); +} +#endif /* UNIV_DEBUG */ + +/************************************************************//** +Display identification information for a record. */ +static +void +btr_index_rec_validate_report( +/*==========================*/ + const page_t* page, /*!< in: index page */ + const rec_t* rec, /*!< in: index record */ + const dict_index_t* index) /*!< in: index */ +{ + ib::info() << "Record in index " << index->name + << " of table " << index->table->name + << ", page " << page_id_t(page_get_space_id(page), + page_get_page_no(page)) + << ", at offset " << page_offset(rec); +} + +/************************************************************//** +Checks the size and number of fields in a record based on the definition of +the index. 
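+This is invoked for each user record of a page by
+btr_index_page_validate() when an index tree level is checked.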
+@return TRUE if ok */ +ibool +btr_index_rec_validate( +/*===================*/ + const rec_t* rec, /*!< in: index record */ + const dict_index_t* index, /*!< in: index */ + ibool dump_on_error) /*!< in: TRUE if the function + should print hex dump of record + and page on error */ +{ + ulint len; + const page_t* page; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + + page = page_align(rec); + + ut_ad(index->n_core_fields); + + if (index->is_ibuf()) { + /* The insert buffer index tree can contain records from any + other index: we cannot check the number of fields or + their length */ + + return(TRUE); + } + +#ifdef VIRTUAL_INDEX_DEBUG + if (dict_index_has_virtual(index)) { + fprintf(stderr, "index name is %s\n", index->name()); + } +#endif + if ((ibool)!!page_is_comp(page) != dict_table_is_comp(index->table)) { + btr_index_rec_validate_report(page, rec, index); + + ib::error() << "Compact flag=" << !!page_is_comp(page) + << ", should be " << dict_table_is_comp(index->table); + + return(FALSE); + } + + const bool is_alter_metadata = page_is_leaf(page) + && !page_has_prev(page) + && index->is_primary() && index->table->instant + && rec == page_rec_get_next_const(page_get_infimum_rec(page)); + + if (is_alter_metadata + && !rec_is_alter_metadata(rec, page_is_comp(page))) { + btr_index_rec_validate_report(page, rec, index); + + ib::error() << "First record is not ALTER TABLE metadata"; + return FALSE; + } + + if (!page_is_comp(page)) { + const ulint n_rec_fields = rec_get_n_fields_old(rec); + if (n_rec_fields == DICT_FLD__SYS_INDEXES__MERGE_THRESHOLD + && index->id == DICT_INDEXES_ID) { + /* A record for older SYS_INDEXES table + (missing merge_threshold column) is acceptable. */ + } else if (is_alter_metadata) { + if (n_rec_fields != ulint(index->n_fields) + 1) { + goto n_field_mismatch; + } + } else if (n_rec_fields < index->n_core_fields + || n_rec_fields > index->n_fields) { +n_field_mismatch: + btr_index_rec_validate_report(page, rec, index); + + ib::error() << "Has " << rec_get_n_fields_old(rec) + << " fields, should have " + << index->n_core_fields << ".." + << index->n_fields; + + if (dump_on_error) { + fputs("InnoDB: corrupt record ", stderr); + rec_print_old(stderr, rec); + putc('\n', stderr); + } + return(FALSE); + } + } + + offsets = rec_get_offsets(rec, index, offsets, page_is_leaf(page) + ? index->n_core_fields : 0, + ULINT_UNDEFINED, &heap); + const dict_field_t* field = index->fields; + ut_ad(rec_offs_n_fields(offsets) + == ulint(index->n_fields) + is_alter_metadata); + + for (unsigned i = 0; i < rec_offs_n_fields(offsets); i++) { + rec_get_nth_field_offs(offsets, i, &len); + + ulint fixed_size; + + if (is_alter_metadata && i == index->first_user_field()) { + fixed_size = FIELD_REF_SIZE; + if (len != FIELD_REF_SIZE + || !rec_offs_nth_extern(offsets, i)) { + goto len_mismatch; + } + + continue; + } else { + fixed_size = dict_col_get_fixed_size( + field->col, page_is_comp(page)); + if (rec_offs_nth_extern(offsets, i)) { + const byte* data = rec_get_nth_field( + rec, offsets, i, &len); + len -= BTR_EXTERN_FIELD_REF_SIZE; + ulint extern_len = mach_read_from_4( + data + len + BTR_EXTERN_LEN + 4); + if (fixed_size == extern_len + len) { + goto next_field; + } + } + } + + /* Note that if fixed_size != 0, it equals the + length of a fixed-size column in the clustered index. + We should adjust it here. + A prefix index of the column is of fixed, but different + length. 
When fixed_size == 0, prefix_len is the maximum + length of the prefix index column. */ + + if (len_is_stored(len) + && (field->prefix_len + ? len > field->prefix_len + : (fixed_size && len != fixed_size))) { +len_mismatch: + btr_index_rec_validate_report(page, rec, index); + ib::error error; + + error << "Field " << i << " len is " << len + << ", should be " << fixed_size; + + if (dump_on_error) { + error << "; "; + rec_print(error.m_oss, rec, + rec_get_info_bits( + rec, rec_offs_comp(offsets)), + offsets); + } + if (heap) { + mem_heap_free(heap); + } + return(FALSE); + } +next_field: + field++; + } + +#ifdef VIRTUAL_INDEX_DEBUG + if (dict_index_has_virtual(index)) { + rec_print_new(stderr, rec, offsets); + } +#endif + + if (heap) { + mem_heap_free(heap); + } + return(TRUE); +} + +/************************************************************//** +Checks the size and number of fields in records based on the definition of +the index. +@return true if ok */ +static +bool +btr_index_page_validate( +/*====================*/ + buf_block_t* block, /*!< in: index page */ + dict_index_t* index) /*!< in: index */ +{ + page_cur_t cur; +#ifndef DBUG_OFF + ulint nth = 1; +#endif /* !DBUG_OFF */ + + page_cur_set_before_first(block, &cur); + + /* Directory slot 0 should only contain the infimum record. */ + DBUG_EXECUTE_IF("check_table_rec_next", + ut_a(page_rec_get_nth_const( + page_cur_get_page(&cur), 0) + == cur.rec); + ut_a(page_dir_slot_get_n_owned( + page_dir_get_nth_slot( + page_cur_get_page(&cur), 0)) + == 1);); + + while (page_cur_move_to_next(&cur)) { + if (page_cur_is_after_last(&cur)) { + return true; + } + + if (!btr_index_rec_validate(cur.rec, index, TRUE)) { + break; + } + + /* Verify that page_rec_get_nth_const() is correctly + retrieving each record. */ + DBUG_EXECUTE_IF("check_table_rec_next", + ut_a(cur.rec == page_rec_get_nth_const( + page_cur_get_page(&cur), + page_rec_get_n_recs_before( + cur.rec))); + ut_a(nth++ == page_rec_get_n_recs_before( + cur.rec));); + } + + return false; +} + +/************************************************************//** +Report an error on one page of an index tree. */ +static +void +btr_validate_report1( +/*=================*/ + dict_index_t* index, /*!< in: index */ + ulint level, /*!< in: B-tree level */ + const buf_block_t* block) /*!< in: index page */ +{ + ib::error error; + error << "In page " << block->page.id().page_no() + << " of index " << index->name + << " of table " << index->table->name; + + if (level > 0) { + error << ", index tree level " << level; + } +} + +/************************************************************//** +Report an error on two pages of an index tree. */ +static +void +btr_validate_report2( +/*=================*/ + const dict_index_t* index, /*!< in: index */ + ulint level, /*!< in: B-tree level */ + const buf_block_t* block1, /*!< in: first index page */ + const buf_block_t* block2) /*!< in: second index page */ +{ + ib::error error; + error << "In pages " << block1->page.id() + << " and " << block2->page.id() << " of index " << index->name + << " of table " << index->table->name; + + if (level) + error << ", index tree level " << level; +} + +/** Validate an index tree level. 
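+@return DB_SUCCESS if ok, error code if not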
*/ +static +dberr_t +btr_validate_level( +/*===============*/ + dict_index_t* index, /*!< in: index tree */ + const trx_t* trx, /*!< in: transaction or NULL */ + ulint level) /*!< in: level number */ +{ + buf_block_t* block; + page_t* page; + buf_block_t* right_block = 0; /* remove warning */ + page_t* right_page = 0; /* remove warning */ + page_t* father_page; + btr_cur_t node_cur; + btr_cur_t right_node_cur; + rec_t* rec; + page_cur_t cursor; + dtuple_t* node_ptr_tuple; + mtr_t mtr; + mem_heap_t* heap = mem_heap_create(256); + rec_offs* offsets = NULL; + rec_offs* offsets2= NULL; +#ifdef UNIV_ZIP_DEBUG + page_zip_des_t* page_zip; +#endif /* UNIV_ZIP_DEBUG */ + + mtr.start(); + + mtr_x_lock_index(index, &mtr); + + dberr_t err; + block = btr_root_block_get(index, RW_SX_LATCH, &mtr, &err); + if (!block) { + mtr.commit(); + return err; + } + page = buf_block_get_frame(block); + + fil_space_t* space = index->table->space; + + while (level != btr_page_get_level(page)) { + const rec_t* node_ptr; + switch (dberr_t e = + fseg_page_is_allocated(space, + block->page.id().page_no())) { + case DB_SUCCESS_LOCKED_REC: + break; + case DB_SUCCESS: + btr_validate_report1(index, level, block); + ib::warn() << "Page is free"; + e = DB_CORRUPTION; + /* fall through */ + default: + err = e; + } + ut_ad(index->table->space_id == block->page.id().space()); + ut_ad(block->page.id().space() == page_get_space_id(page)); +#ifdef UNIV_ZIP_DEBUG + page_zip = buf_block_get_page_zip(block); + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + if (page_is_leaf(page)) { +corrupted: + err = DB_CORRUPTION; + goto invalid_page; + } + + page_cur_set_before_first(block, &cursor); + if (!(node_ptr = page_cur_move_to_next(&cursor))) { + goto corrupted; + } + + offsets = rec_get_offsets(node_ptr, index, offsets, 0, + ULINT_UNDEFINED, &heap); + + block = btr_node_ptr_get_child(node_ptr, index, offsets, &mtr, + &err); + if (!block) { + break; + } + page = buf_block_get_frame(block); + + /* For R-Tree, since record order might not be the same as + linked index page in the lower level, we need to travers + backwards to get the first page rec in this level. + This is only used for index validation. Spatial index + does not use such scan for any of its DML or query + operations */ + if (dict_index_is_spatial(index)) { + uint32_t left_page_no = btr_page_get_prev(page); + + while (left_page_no != FIL_NULL) { + /* To obey latch order of tree blocks, + we should release the right_block once to + obtain lock of the uncle block. */ + mtr.release_last_page(); + + block = btr_block_get(*index, left_page_no, + RW_SX_LATCH, false, + &mtr, &err); + if (!block) { + goto invalid_page; + } + page = buf_block_get_frame(block); + left_page_no = btr_page_get_prev(page); + } + } + } + + /* Now we are on the desired level. Loop through the pages on that + level. 
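+	Each iteration validates one page, commits the mini-transaction
+	to release its latches, and then follows FIL_PAGE_NEXT to the
+	right sibling.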
*/ + +loop: + if (!block) { +invalid_page: + mtr.commit(); +func_exit: + mem_heap_free(heap); + return err; + } + + mem_heap_empty(heap); + offsets = offsets2 = NULL; + + mtr_x_lock_index(index, &mtr); + + page = block->page.frame; + +#ifdef UNIV_ZIP_DEBUG + page_zip = buf_block_get_page_zip(block); + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + if (DB_SUCCESS_LOCKED_REC + != fseg_page_is_allocated(space, block->page.id().page_no())) { + btr_validate_report1(index, level, block); + + ib::warn() << "Page is marked as free"; + err = DB_CORRUPTION; + } else if (btr_page_get_index_id(page) != index->id) { + ib::error() << "Page index id " << btr_page_get_index_id(page) + << " != data dictionary index id " << index->id; + err = DB_CORRUPTION; + } else if (!page_validate(page, index)) { + btr_validate_report1(index, level, block); + err = DB_CORRUPTION; + } else if (btr_page_get_level(page) != level) { + btr_validate_report1(index, level, block); + ib::error() << "Page level is not " << level; + err = DB_CORRUPTION; + } else if (level == 0 && !btr_index_page_validate(block, index)) { + /* We are on level 0. Check that the records have the right + number of fields, and field lengths are right. */ + err = DB_CORRUPTION; + } else if (!page_is_empty(page)) { + } else if (level) { + btr_validate_report1(index, level, block); + ib::error() << "Non-leaf page is empty"; + } else if (block->page.id().page_no() != index->page) { + btr_validate_report1(index, level, block); + ib::error() << "Empty leaf page is not index root"; + } + + uint32_t right_page_no = btr_page_get_next(page); + uint32_t left_page_no = btr_page_get_prev(page); + + if (right_page_no != FIL_NULL) { + const rec_t* right_rec; + + right_block = btr_block_get(*index, right_page_no, RW_SX_LATCH, + !level, &mtr, &err); + if (!right_block) { + btr_validate_report1(index, level, block); + fputs("InnoDB: broken FIL_PAGE_NEXT link\n", stderr); + goto invalid_page; + } + right_page = buf_block_get_frame(right_block); + + if (btr_page_get_prev(right_page) != page_get_page_no(page)) { + btr_validate_report2(index, level, block, right_block); + fputs("InnoDB: broken FIL_PAGE_NEXT" + " or FIL_PAGE_PREV links\n", stderr); + err = DB_CORRUPTION; + } + + if (!(rec = page_rec_get_prev(page_get_supremum_rec(page)))) { +broken_links: + btr_validate_report1(index, level, block); + fputs("InnoDB: broken record links\n", stderr); + goto invalid_page; + } + if (!(right_rec = + page_rec_get_next(page_get_infimum_rec(right_page)))) { + goto broken_links; + } + + offsets = rec_get_offsets(rec, index, offsets, + page_is_leaf(page) + ? index->n_core_fields : 0, + ULINT_UNDEFINED, &heap); + offsets2 = rec_get_offsets(right_rec, index, offsets2, + page_is_leaf(right_page) + ? index->n_core_fields : 0, + ULINT_UNDEFINED, &heap); + + /* For spatial index, we cannot guarantee the key ordering + across pages, so skip the record compare verification for + now. 
Will enhanced in special R-Tree index validation scheme */ + if (index->is_btree() + && cmp_rec_rec(rec, right_rec, + offsets, offsets2, index) >= 0) { + + btr_validate_report2(index, level, block, right_block); + + fputs("InnoDB: records in wrong order" + " on adjacent pages\n", stderr); + + rec = page_rec_get_prev(page_get_supremum_rec(page)); + if (rec) { + fputs("InnoDB: record ", stderr); + rec_print(stderr, rec, index); + putc('\n', stderr); + } + fputs("InnoDB: record ", stderr); + rec = page_rec_get_next( + page_get_infimum_rec(right_page)); + if (rec) { + rec_print(stderr, rec, index); + } + putc('\n', stderr); + err = DB_CORRUPTION; + } + } + + if (!level || left_page_no != FIL_NULL) { + } else if (const rec_t* first = + page_rec_get_next_const(page_get_infimum_rec(page))) { + if (!(REC_INFO_MIN_REC_FLAG + & rec_get_info_bits(first, page_is_comp(page)))) { + btr_validate_report1(index, level, block); + ib::error() << "Missing REC_INFO_MIN_REC_FLAG"; + err = DB_CORRUPTION; + } + } else { + err = DB_CORRUPTION; + goto node_ptr_fails; + } + + /* Similarly skip the father node check for spatial index for now, + for a couple of reasons: + 1) As mentioned, there is no ordering relationship between records + in parent level and linked pages in the child level. + 2) Search parent from root is very costly for R-tree. + We will add special validation mechanism for R-tree later (WL #7520) */ + if (index->is_btree() && block->page.id().page_no() != index->page) { + /* Check father node pointers */ + rec_t* node_ptr + = page_rec_get_next(page_get_infimum_rec(page)); + if (!node_ptr) { + err = DB_CORRUPTION; + goto node_ptr_fails; + } + + btr_cur_position(index, node_ptr, block, &node_cur); + offsets = btr_page_get_father_node_ptr_for_validate( + offsets, heap, &node_cur, &mtr); + + father_page = btr_cur_get_page(&node_cur); + node_ptr = btr_cur_get_rec(&node_cur); + + rec = page_rec_get_prev(page_get_supremum_rec(page)); + if (rec) { + btr_cur_position(index, rec, block, &node_cur); + + offsets = btr_page_get_father_node_ptr_for_validate( + offsets, heap, &node_cur, &mtr); + } else { + offsets = nullptr; + } + + if (!offsets || node_ptr != btr_cur_get_rec(&node_cur) + || btr_node_ptr_get_child_page_no(node_ptr, offsets) + != block->page.id().page_no()) { + + btr_validate_report1(index, level, block); + + fputs("InnoDB: node pointer to the page is wrong\n", + stderr); + + fputs("InnoDB: node ptr ", stderr); + rec_print(stderr, node_ptr, index); + + if (offsets) { + rec = btr_cur_get_rec(&node_cur); + fprintf(stderr, "\n" + "InnoDB: node ptr child page n:o %u\n", + btr_node_ptr_get_child_page_no( + rec, offsets)); + fputs("InnoDB: record on page ", stderr); + rec_print_new(stderr, rec, offsets); + putc('\n', stderr); + } + + err = DB_CORRUPTION; + goto node_ptr_fails; + } + + if (page_is_leaf(page)) { + } else if (const rec_t* first_rec = + page_rec_get_next(page_get_infimum_rec(page))) { + node_ptr_tuple = dict_index_build_node_ptr( + index, first_rec, + 0, heap, btr_page_get_level(page)); + + if (cmp_dtuple_rec(node_ptr_tuple, node_ptr, index, + offsets)) { + btr_validate_report1(index, level, block); + + ib::error() << "Node ptrs differ on levels > 0"; + + fputs("InnoDB: node ptr ",stderr); + rec_print_new(stderr, node_ptr, offsets); + fputs("InnoDB: first rec ", stderr); + rec_print(stderr, first_rec, index); + putc('\n', stderr); + err = DB_CORRUPTION; + goto node_ptr_fails; + } + } else { + err = DB_CORRUPTION; + goto node_ptr_fails; + } + + if (left_page_no == FIL_NULL) { + if 
(page_has_prev(father_page) + || node_ptr != page_rec_get_next( + page_get_infimum_rec(father_page))) { + err = DB_CORRUPTION; + goto node_ptr_fails; + } + } + + if (right_page_no == FIL_NULL) { + if (page_has_next(father_page) + || node_ptr != page_rec_get_prev( + page_get_supremum_rec(father_page))) { + err = DB_CORRUPTION; + goto node_ptr_fails; + } + } else if (const rec_t* right_node_ptr + = page_rec_get_next(node_ptr)) { + btr_cur_position( + index, + page_get_infimum_rec(right_block->page.frame), + right_block, &right_node_cur); + if (!page_cur_move_to_next(&right_node_cur.page_cur)) { + goto node_pointer_corrupted; + } + + offsets = btr_page_get_father_node_ptr_for_validate( + offsets, heap, &right_node_cur, &mtr); + + if (right_node_ptr + != page_get_supremum_rec(father_page)) { + + if (btr_cur_get_rec(&right_node_cur) + != right_node_ptr) { +node_pointer_corrupted: + err = DB_CORRUPTION; + fputs("InnoDB: node pointer to" + " the right page is wrong\n", + stderr); + + btr_validate_report1(index, level, + block); + } + } else { + page_t* right_father_page + = btr_cur_get_page(&right_node_cur); + + if (btr_cur_get_rec(&right_node_cur) + != page_rec_get_next( + page_get_infimum_rec( + right_father_page))) { + err = DB_CORRUPTION; + fputs("InnoDB: node pointer 2 to" + " the right page is wrong\n", + stderr); + + btr_validate_report1(index, level, + block); + } + + if (page_get_page_no(right_father_page) + != btr_page_get_next(father_page)) { + + err = DB_CORRUPTION; + fputs("InnoDB: node pointer 3 to" + " the right page is wrong\n", + stderr); + + btr_validate_report1(index, level, + block); + } + } + } else { + err = DB_CORRUPTION; + } + } + +node_ptr_fails: + /* Commit the mini-transaction to release the latch on 'page'. + Re-acquire the latch on right_page, which will become 'page' + on the next loop. The page has already been checked. */ + mtr.commit(); + + if (trx_is_interrupted(trx)) { + /* On interrupt, return the current status. */ + } else if (right_page_no != FIL_NULL) { + + mtr.start(); + + block = btr_block_get(*index, right_page_no, RW_SX_LATCH, + !level, &mtr, &err); + goto loop; + } + + goto func_exit; +} + +/**************************************************************//** +Checks the consistency of an index tree. +@return DB_SUCCESS if ok, error code if not */ +dberr_t +btr_validate_index( +/*===============*/ + dict_index_t* index, /*!< in: index */ + const trx_t* trx) /*!< in: transaction or NULL */ +{ + mtr_t mtr; + mtr.start(); + + mtr_x_lock_index(index, &mtr); + + dberr_t err; + if (page_t *root= btr_root_get(index, &mtr, &err)) + for (auto level= btr_page_get_level(root);; level--) + { + if (dberr_t err_level= btr_validate_level(index, trx, level)) + err= err_level; + if (!level) + break; + } + + mtr.commit(); + return err; +} + +/**************************************************************//** +Checks if the page in the cursor can be merged with given page. +If necessary, re-organize the merge_page. +@return true if possible to merge. 
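+Merging requires that all records of the cursor page fit in the sibling
+page after reorganization and, for ROW_FORMAT=COMPRESSED leaf pages, that
+the combined data size stays below the compression padding limit.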
*/ +static +bool +btr_can_merge_with_page( +/*====================*/ + btr_cur_t* cursor, /*!< in: cursor on the page to merge */ + uint32_t page_no, /*!< in: a sibling page */ + buf_block_t** merge_block, /*!< out: the merge block */ + mtr_t* mtr) /*!< in: mini-transaction */ +{ + dict_index_t* index; + page_t* page; + ulint n_recs; + ulint data_size; + ulint max_ins_size_reorg; + ulint max_ins_size; + buf_block_t* mblock; + page_t* mpage; + DBUG_ENTER("btr_can_merge_with_page"); + + if (page_no == FIL_NULL) { +error: + *merge_block = NULL; + DBUG_RETURN(false); + } + + index = btr_cur_get_index(cursor); + page = btr_cur_get_page(cursor); + + mblock = btr_block_get(*index, page_no, RW_X_LATCH, page_is_leaf(page), + mtr); + if (!mblock) { + goto error; + } + mpage = buf_block_get_frame(mblock); + + n_recs = page_get_n_recs(page); + data_size = page_get_data_size(page); + + max_ins_size_reorg = page_get_max_insert_size_after_reorganize( + mpage, n_recs); + + if (data_size > max_ins_size_reorg) { + goto error; + } + + /* If compression padding tells us that merging will result in + too packed up page i.e.: which is likely to cause compression + failure then don't merge the pages. */ + if (mblock->page.zip.data && page_is_leaf(mpage) + && (page_get_data_size(mpage) + data_size + >= dict_index_zip_pad_optimal_page_size(index))) { + + goto error; + } + + max_ins_size = page_get_max_insert_size(mpage, n_recs); + + if (data_size > max_ins_size) { + /* We have to reorganize mpage */ + if (btr_page_reorganize_block(page_zip_level, mblock, index, + mtr) != DB_SUCCESS) { + goto error; + } + + max_ins_size = page_get_max_insert_size(mpage, n_recs); + + ut_ad(page_validate(mpage, index)); + ut_ad(max_ins_size == max_ins_size_reorg); + + if (data_size > max_ins_size) { + + /* Add fault tolerance, though this should + never happen */ + + goto error; + } + } + + *merge_block = mblock; + DBUG_RETURN(true); +} diff --git a/storage/innobase/btr/btr0bulk.cc b/storage/innobase/btr/btr0bulk.cc new file mode 100644 index 00000000..013cd131 --- /dev/null +++ b/storage/innobase/btr/btr0bulk.cc @@ -0,0 +1,1233 @@ +/***************************************************************************** + +Copyright (c) 2014, 2019, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file btr/btr0bulk.cc +The B-tree bulk load + +Created 03/11/2014 Shaohua Wang +*******************************************************/ + +#include "btr0bulk.h" +#include "btr0btr.h" +#include "btr0cur.h" +#include "btr0pcur.h" +#include "ibuf0ibuf.h" +#include "page0page.h" +#include "trx0trx.h" + +/** Innodb B-tree index fill factor for bulk load. 
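+When set to 100 for a clustered index, PageBulk::init() keeps the
+5.6-compatible reserve of dict_index_get_space_reserve(); otherwise the
+space reserved in each page is
+srv_page_size * (100 - innobase_fill_factor) / 100.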
*/ +uint innobase_fill_factor; + +/** Initialize members, allocate page if needed and start mtr. +Note: we commit all mtrs on failure. +@return error code. */ +dberr_t +PageBulk::init() +{ + buf_block_t* new_block; + page_t* new_page; + + ut_ad(m_heap == NULL); + m_heap = mem_heap_create(1000); + + m_mtr.start(); + m_index->set_modified(m_mtr); + + if (m_page_no == FIL_NULL) { + mtr_t alloc_mtr; + + /* We commit redo log for allocation by a separate mtr, + because we don't guarantee pages are committed following + the allocation order, and we will always generate redo log + for page allocation, even when creating a new tablespace. */ + alloc_mtr.start(); + m_index->set_modified(alloc_mtr); + + uint32_t n_reserved; + dberr_t err = fsp_reserve_free_extents( + &n_reserved, m_index->table->space, 1, FSP_NORMAL, + &alloc_mtr); + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { +oom: + alloc_mtr.commit(); + m_mtr.commit(); + return err; + } + + /* Allocate a new page. */ + new_block = btr_page_alloc(m_index, 0, FSP_UP, m_level, + &alloc_mtr, &m_mtr, &err); + if (!new_block) { + goto oom; + } + + m_index->table->space->release_free_extents(n_reserved); + + alloc_mtr.commit(); + + new_page = buf_block_get_frame(new_block); + m_page_no = new_block->page.id().page_no(); + + byte* index_id = my_assume_aligned<2> + (PAGE_HEADER + PAGE_INDEX_ID + new_page); + compile_time_assert(FIL_PAGE_NEXT == FIL_PAGE_PREV + 4); + compile_time_assert(FIL_NULL == 0xffffffff); + memset_aligned<8>(new_page + FIL_PAGE_PREV, 0xff, 8); + + if (UNIV_LIKELY_NULL(new_block->page.zip.data)) { + mach_write_to_8(index_id, m_index->id); + page_create_zip(new_block, m_index, m_level, 0, + &m_mtr); + } else { + ut_ad(!m_index->is_spatial()); + page_create(new_block, &m_mtr, + m_index->table->not_redundant()); + m_mtr.memset(*new_block, FIL_PAGE_PREV, 8, 0xff); + m_mtr.write<2,mtr_t::MAYBE_NOP>(*new_block, PAGE_HEADER + + PAGE_LEVEL + + new_page, m_level); + m_mtr.write<8>(*new_block, index_id, m_index->id); + } + } else { + new_block = btr_block_get(*m_index, m_page_no, RW_X_LATCH, + false, &m_mtr); + if (!new_block) { + m_mtr.commit(); + return(DB_CORRUPTION); + } + + new_page = buf_block_get_frame(new_block); + + ut_ad(page_dir_get_n_heap(new_page) == PAGE_HEAP_NO_USER_LOW); + + btr_page_set_level(new_block, m_level, &m_mtr); + } + + m_page_zip = buf_block_get_page_zip(new_block); + + if (!m_level && dict_index_is_sec_or_ibuf(m_index)) { + page_update_max_trx_id(new_block, m_page_zip, m_trx_id, + &m_mtr); + } + + m_block = new_block; + m_page = new_page; + m_cur_rec = page_get_infimum_rec(new_page); + ut_ad(m_is_comp == !!page_is_comp(new_page)); + m_free_space = page_get_free_space_of_empty(m_is_comp); + + if (innobase_fill_factor == 100 && dict_index_is_clust(m_index)) { + /* Keep default behavior compatible with 5.6 */ + m_reserved_space = dict_index_get_space_reserve(); + } else { + m_reserved_space = + srv_page_size * (100 - innobase_fill_factor) / 100; + } + + m_padding_space = + srv_page_size - dict_index_zip_pad_optimal_page_size(m_index); + m_heap_top = page_header_get_ptr(new_page, PAGE_HEAP_TOP); + m_rec_no = page_header_get_field(new_page, PAGE_N_RECS); + /* Temporarily reset PAGE_DIRECTION_B from PAGE_NO_DIRECTION to 0, + without writing redo log, to ensure that needs_finish() will hold + on an empty page. */ + ut_ad(m_page[PAGE_HEADER + PAGE_DIRECTION_B] == PAGE_NO_DIRECTION); + m_page[PAGE_HEADER + PAGE_DIRECTION_B] = 0; + ut_d(m_total_data = 0); + + return(DB_SUCCESS); +} + +/** Insert a record in the page. 
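+The record is appended at the heap top of the page and linked in after
+m_cur_rec; how the write is redo-logged depends on the page format.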
+@tparam fmt	the page format
+@param[in,out]	rec	record
+@param[in]	offsets	record offsets */
+template<PageBulk::format fmt>
+inline void PageBulk::insertPage(rec_t *rec, rec_offs *offsets)
+{
+  ut_ad((m_page_zip != nullptr) == (fmt == COMPRESSED));
+  ut_ad((fmt != REDUNDANT) == m_is_comp);
+  ut_ad(page_align(m_heap_top) == m_page);
+  ut_ad(m_heap);
+
+  const ulint rec_size= rec_offs_size(offsets);
+  const ulint extra_size= rec_offs_extra_size(offsets);
+  ut_ad(page_align(m_heap_top + rec_size) == m_page);
+  ut_d(const bool is_leaf= page_rec_is_leaf(m_cur_rec));
+
+#ifdef UNIV_DEBUG
+  /* Check whether records are in order. */
+  if (page_offset(m_cur_rec) !=
+      (fmt == REDUNDANT ? PAGE_OLD_INFIMUM : PAGE_NEW_INFIMUM))
+  {
+    const rec_t *old_rec = m_cur_rec;
+    rec_offs *old_offsets= rec_get_offsets(old_rec, m_index, nullptr, is_leaf
+                                           ? m_index->n_core_fields : 0,
+                                           ULINT_UNDEFINED, &m_heap);
+    ut_ad(cmp_rec_rec(rec, old_rec, offsets, old_offsets, m_index) > 0);
+  }
+
+  m_total_data+= rec_size;
+#endif /* UNIV_DEBUG */
+
+  rec_t* const insert_rec= m_heap_top + extra_size;
+
+  /* Insert the record in the linked list. */
+  if (fmt != REDUNDANT)
+  {
+    const rec_t *next_rec= m_page +
+      page_offset(m_cur_rec + mach_read_from_2(m_cur_rec - REC_NEXT));
+    if (fmt != COMPRESSED)
+      m_mtr.write<2>(*m_block, m_cur_rec - REC_NEXT,
+                     static_cast<uint16_t>(insert_rec - m_cur_rec));
+    else
+    {
+      mach_write_to_2(m_cur_rec - REC_NEXT,
+                      static_cast<uint16_t>(insert_rec - m_cur_rec));
+      memcpy(m_heap_top, rec - extra_size, rec_size);
+    }
+
+    rec_t * const this_rec= fmt != COMPRESSED
+      ? const_cast<rec_t*>(rec) : insert_rec;
+    rec_set_bit_field_1(this_rec, 0, REC_NEW_N_OWNED, REC_N_OWNED_MASK,
+                        REC_N_OWNED_SHIFT);
+    rec_set_bit_field_2(this_rec, PAGE_HEAP_NO_USER_LOW + m_rec_no,
+                        REC_NEW_HEAP_NO, REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
+    mach_write_to_2(this_rec - REC_NEXT,
+                    static_cast<uint16_t>(next_rec - insert_rec));
+  }
+  else
+  {
+    memcpy(const_cast<rec_t*>(rec) - REC_NEXT, m_cur_rec - REC_NEXT, 2);
+    m_mtr.write<2>(*m_block, m_cur_rec - REC_NEXT, page_offset(insert_rec));
+    rec_set_bit_field_1(const_cast<rec_t*>(rec), 0,
+                        REC_OLD_N_OWNED, REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+    rec_set_bit_field_2(const_cast<rec_t*>(rec),
+                        PAGE_HEAP_NO_USER_LOW + m_rec_no,
+                        REC_OLD_HEAP_NO, REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
+  }
+
+  if (fmt == COMPRESSED)
+    /* We already wrote the record. Log is written in PageBulk::compress(). */;
+  else if (page_offset(m_cur_rec) ==
+           (fmt == REDUNDANT ? PAGE_OLD_INFIMUM : PAGE_NEW_INFIMUM))
+    m_mtr.memcpy(*m_block, m_heap_top, rec - extra_size, rec_size);
+  else
+  {
+    /* Try to copy common prefix from the preceding record. */
+    const byte *r= rec - extra_size;
+    const byte * const insert_rec_end= m_heap_top + rec_size;
+    byte *b= m_heap_top;
+
+    /* Skip any unchanged prefix of the record. */
+    for (; *b == *r; b++, r++);
+
+    ut_ad(b < insert_rec_end);
+
+    const byte *c= m_cur_rec - (rec - r);
+    const byte * const c_end= std::min(m_cur_rec + rec_offs_data_size(offsets),
+                                       m_heap_top);
+
+    /* Try to copy any bytes of the preceding record. */
+    if (UNIV_LIKELY(c >= m_page && c < c_end))
+    {
+      const byte *cm= c;
+      byte *bm= b;
+      const byte *rm= r;
+      for (; cm < c_end && *rm == *cm; cm++, bm++, rm++);
+      ut_ad(bm <= insert_rec_end);
+      size_t len= static_cast<size_t>(rm - r);
+      ut_ad(!memcmp(r, c, len));
+      if (len > 2)
+      {
+        memcpy(b, c, len);
+        m_mtr.memmove(*m_block, page_offset(b), page_offset(c), len);
+        c= cm;
+        b= bm;
+        r= rm;
+      }
+    }
+
+    if (c < m_cur_rec)
+    {
+      if (!rec_offs_data_size(offsets))
+      {
+no_data:
+        m_mtr.memcpy(*m_block, b, r, m_cur_rec - c);
+        goto rec_done;
+      }
+      /* Some header bytes differ. Compare the data separately. */
+      const byte *cd= m_cur_rec;
+      byte *bd= insert_rec;
+      const byte *rd= rec;
+      /* Skip any unchanged prefix of the record. */
+      for (;; cd++, bd++, rd++)
+        if (bd == insert_rec_end)
+          goto no_data;
+        else if (*bd != *rd)
+          break;
+
+      /* Try to copy any data bytes of the preceding record. */
+      if (c_end - cd > 2)
+      {
+        const byte *cdm= cd;
+        const byte *rdm= rd;
+        for (; cdm < c_end && *rdm == *cdm; cdm++, rdm++)
+          ut_ad(rdm - rd + bd <= insert_rec_end);
+        size_t len= static_cast<size_t>(rdm - rd);
+        ut_ad(!memcmp(rd, cd, len));
+        if (len > 2)
+        {
+          m_mtr.memcpy(*m_block, b, r, m_cur_rec - c);
+          memcpy(bd, cd, len);
+          m_mtr.memmove(*m_block, page_offset(bd), page_offset(cd), len);
+          c= cdm;
+          b= rdm - rd + bd;
+          r= rdm;
+        }
+      }
+    }
+
+    if (size_t len= static_cast<size_t>(insert_rec_end - b))
+      m_mtr.memcpy(*m_block, b, r, len);
+  }
+
+rec_done:
+  ut_ad(fmt == COMPRESSED || !memcmp(m_heap_top, rec - extra_size, rec_size));
+  rec_offs_make_valid(insert_rec, m_index, is_leaf, offsets);
+
+  /* Update the member variables. */
+  ulint slot_size= page_dir_calc_reserved_space(m_rec_no + 1) -
+    page_dir_calc_reserved_space(m_rec_no);
+
+  ut_ad(m_free_space >= rec_size + slot_size);
+  ut_ad(m_heap_top + rec_size < m_page + srv_page_size);
+
+  m_free_space-= rec_size + slot_size;
+  m_heap_top+= rec_size;
+  m_rec_no++;
+  m_cur_rec= insert_rec;
+}
+
+/** Insert a record in the page.
+@param[in]	rec	record
+@param[in]	offsets	record offsets */
+inline void PageBulk::insert(const rec_t *rec, rec_offs *offsets)
+{
+  byte rec_hdr[REC_N_OLD_EXTRA_BYTES];
+  static_assert(REC_N_OLD_EXTRA_BYTES > REC_N_NEW_EXTRA_BYTES, "file format");
+
+  if (UNIV_LIKELY_NULL(m_page_zip))
+    insertPage<COMPRESSED>(const_cast<rec_t*>(rec), offsets);
+  else if (m_is_comp)
+  {
+    memcpy(rec_hdr, rec - REC_N_NEW_EXTRA_BYTES, REC_N_NEW_EXTRA_BYTES);
+    insertPage<DYNAMIC>(const_cast<rec_t*>(rec), offsets);
+    memcpy(const_cast<rec_t*>(rec) - REC_N_NEW_EXTRA_BYTES, rec_hdr,
+           REC_N_NEW_EXTRA_BYTES);
+  }
+  else
+  {
+    memcpy(rec_hdr, rec - REC_N_OLD_EXTRA_BYTES, REC_N_OLD_EXTRA_BYTES);
+    insertPage<REDUNDANT>(const_cast<rec_t*>(rec), offsets);
+    memcpy(const_cast<rec_t*>(rec) - REC_N_OLD_EXTRA_BYTES, rec_hdr,
+           REC_N_OLD_EXTRA_BYTES);
+  }
+}
+
+/** Set the number of owned records in the uncompressed page of
+a ROW_FORMAT=COMPRESSED record without redo-logging. */
+static void rec_set_n_owned_zip(rec_t *rec, ulint n_owned)
+{
+  rec_set_bit_field_1(rec, n_owned, REC_NEW_N_OWNED,
+                      REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+}
+
+/** Mark end of insertion to the page. Scan all records to set page dirs,
+and set page header members.
+@tparam fmt	page format */
+template<PageBulk::format fmt>
+inline void PageBulk::finishPage()
+{
+  ut_ad((m_page_zip != nullptr) == (fmt == COMPRESSED));
+  ut_ad((fmt != REDUNDANT) == m_is_comp);
+
+  ulint count= 0;
+  byte *slot= my_assume_aligned<2>(m_page + srv_page_size -
+                                   (PAGE_DIR + PAGE_DIR_SLOT_SIZE));
+  const page_dir_slot_t *const slot0 = slot;
+  compile_time_assert(PAGE_DIR_SLOT_SIZE == 2);
+  if (fmt != REDUNDANT)
+  {
+    uint16_t offset= mach_read_from_2(PAGE_NEW_INFIMUM - REC_NEXT + m_page);
+    ut_ad(offset >= PAGE_NEW_SUPREMUM - PAGE_NEW_INFIMUM);
+    offset= static_cast<uint16_t>(offset + PAGE_NEW_INFIMUM);
+    /* Set owner & dir. */
+    while (offset != PAGE_NEW_SUPREMUM)
+    {
+      ut_ad(offset >= PAGE_NEW_SUPREMUM);
+      ut_ad(offset < page_offset(slot));
+      count++;
+
+      if (count == (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2)
+      {
+        slot-= PAGE_DIR_SLOT_SIZE;
+        mach_write_to_2(slot, offset);
+
+        if (fmt != COMPRESSED)
+          page_rec_set_n_owned(m_block, m_page + offset, count, true,
+                               &m_mtr);
+        else
+          rec_set_n_owned_zip(m_page + offset, count);
+
+        count= 0;
+      }
+
+      uint16_t next= static_cast<uint16_t>
+        ((mach_read_from_2(m_page + offset - REC_NEXT) + offset) &
+         (srv_page_size - 1));
+      ut_ad(next);
+      offset= next;
+    }
+
+    if (slot0 != slot && (count + 1 + (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2 <=
+                          PAGE_DIR_SLOT_MAX_N_OWNED))
+    {
+      /* Merge the last two slots, like page_cur_insert_rec_low() does. */
+      count+= (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2;
+
+      rec_t *rec= const_cast<rec_t*>(page_dir_slot_get_rec(slot));
+      if (fmt != COMPRESSED)
+        page_rec_set_n_owned(m_block, rec, 0, true, &m_mtr);
+      else
+        rec_set_n_owned_zip(rec, 0);
+    }
+    else
+      slot-= PAGE_DIR_SLOT_SIZE;
+
+    mach_write_to_2(slot, PAGE_NEW_SUPREMUM);
+    if (fmt != COMPRESSED)
+      page_rec_set_n_owned(m_block, m_page + PAGE_NEW_SUPREMUM,
+                           count + 1, true, &m_mtr);
+    else
+      rec_set_n_owned_zip(m_page + PAGE_NEW_SUPREMUM, count + 1);
+  }
+  else
+  {
+    rec_t *insert_rec= m_page +
+      mach_read_from_2(PAGE_OLD_INFIMUM - REC_NEXT + m_page);
+
+    /* Set owner & dir. */
+    while (insert_rec != m_page + PAGE_OLD_SUPREMUM)
+    {
+      count++;
+
+      if (count == (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2)
+      {
+        slot-= PAGE_DIR_SLOT_SIZE;
+        mach_write_to_2(slot, page_offset(insert_rec));
+        page_rec_set_n_owned(m_block, insert_rec, count, false, &m_mtr);
+        count= 0;
+      }
+
+      insert_rec= m_page + mach_read_from_2(insert_rec - REC_NEXT);
+    }
+
+    if (slot0 != slot && (count + 1 + (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2 <=
+                          PAGE_DIR_SLOT_MAX_N_OWNED))
+    {
+      /* Merge the last two slots, like page_cur_insert_rec_low() does. */
+      count+= (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2;
+
+      rec_t *rec= const_cast<rec_t*>(page_dir_slot_get_rec(slot));
+      page_rec_set_n_owned(m_block, rec, 0, false, &m_mtr);
+    }
+    else
+      slot-= PAGE_DIR_SLOT_SIZE;
+
+    mach_write_to_2(slot, PAGE_OLD_SUPREMUM);
+    page_rec_set_n_owned(m_block, m_page + PAGE_OLD_SUPREMUM, count + 1,
+                         false, &m_mtr);
+  }
+
+  if (!m_rec_no);
+  else if (fmt != COMPRESSED)
+  {
+    static_assert(PAGE_N_DIR_SLOTS == 0, "compatibility");
+    alignas(8) byte page_header[PAGE_N_HEAP + 2];
+    mach_write_to_2(page_header + PAGE_N_DIR_SLOTS,
+                    1 + (slot0 - slot) / PAGE_DIR_SLOT_SIZE);
+    mach_write_to_2(page_header + PAGE_HEAP_TOP, m_heap_top - m_page);
+    mach_write_to_2(page_header + PAGE_N_HEAP,
+                    (PAGE_HEAP_NO_USER_LOW + m_rec_no) |
+                    uint16_t{fmt != REDUNDANT} << 15);
+    m_mtr.memcpy(*m_block, PAGE_HEADER + m_page, page_header,
+                 sizeof page_header);
+    m_mtr.write<2>(*m_block, PAGE_HEADER + PAGE_N_RECS + m_page, m_rec_no);
+    m_mtr.memcpy(*m_block, page_offset(slot), slot0 - slot);
+  }
+  else
+  {
+    /* For ROW_FORMAT=COMPRESSED, redo log may be written in
+    PageBulk::compress(). */
+    mach_write_to_2(PAGE_HEADER + PAGE_N_DIR_SLOTS + m_page,
+                    1 + (slot0 - slot) / PAGE_DIR_SLOT_SIZE);
+    mach_write_to_2(PAGE_HEADER + PAGE_HEAP_TOP + m_page,
+                    static_cast<ulint>(m_heap_top - m_page));
+    mach_write_to_2(PAGE_HEADER + PAGE_N_HEAP + m_page,
+                    (PAGE_HEAP_NO_USER_LOW + m_rec_no) | 1U << 15);
+    mach_write_to_2(PAGE_HEADER + PAGE_N_RECS + m_page, m_rec_no);
+  }
+}
+
+inline bool PageBulk::needs_finish() const
+{
+  ut_ad(page_align(m_cur_rec) == m_block->page.frame);
+  ut_ad(m_page == m_block->page.frame);
+  if (!m_page[PAGE_HEADER + PAGE_DIRECTION_B])
+    return true;
+  ulint heap_no, n_heap= page_header_get_field(m_page, PAGE_N_HEAP);
+  ut_ad((n_heap & 0x7fff) >= PAGE_HEAP_NO_USER_LOW);
+  if (n_heap & 0x8000)
+  {
+    n_heap&= 0x7fff;
+    heap_no= rec_get_heap_no_new(m_cur_rec);
+    if (heap_no == PAGE_HEAP_NO_INFIMUM &&
+        page_header_get_field(m_page, PAGE_HEAP_TOP) == PAGE_NEW_SUPREMUM_END)
+      return false;
+  }
+  else
+  {
+    heap_no= rec_get_heap_no_old(m_cur_rec);
+    if (heap_no == PAGE_HEAP_NO_INFIMUM &&
+        page_header_get_field(m_page, PAGE_HEAP_TOP) == PAGE_OLD_SUPREMUM_END)
+      return false;
+  }
+  return heap_no != n_heap - 1;
+}
+
+/** Mark end of insertion to the page. Scan all records to set page dirs,
+and set page header members. */
+inline void PageBulk::finish()
+{
+  ut_ad(!m_index->is_spatial());
+
+  if (!needs_finish());
+  else if (UNIV_LIKELY_NULL(m_page_zip))
+    finishPage<COMPRESSED>();
+  else if (m_is_comp)
+    finishPage<DYNAMIC>();
+  else
+    finishPage<REDUNDANT>();
+
+  /* In MariaDB 10.2, 10.3, 10.4, we would initialize
+  PAGE_DIRECTION_B, PAGE_N_DIRECTION, PAGE_LAST_INSERT
+  in the same way as we would during normal INSERT operations.
+  Starting with MariaDB Server 10.5, bulk insert will not
+  touch those fields. */
+  ut_ad(!m_page[PAGE_HEADER + PAGE_INSTANT]);
+  /* Restore the temporary change of PageBulk::init() that was necessary to
+  ensure that PageBulk::needs_finish() holds on an empty page.
*/
+  m_page[PAGE_HEADER + PAGE_DIRECTION_B]= PAGE_NO_DIRECTION;
+
+  ut_ad(!page_header_get_field(m_page, PAGE_FREE));
+  ut_ad(!page_header_get_field(m_page, PAGE_GARBAGE));
+  ut_ad(!page_header_get_field(m_page, PAGE_LAST_INSERT));
+  ut_ad(!page_header_get_field(m_page, PAGE_N_DIRECTION));
+  ut_ad(m_total_data + page_dir_calc_reserved_space(m_rec_no) <=
+        page_get_free_space_of_empty(m_is_comp));
+  ut_ad(!needs_finish());
+  ut_ad(page_validate(m_page, m_index));
+}
+
+/** Commit inserts done to the page
+@param[in]	success	Flag whether all inserts succeeded. */
+void PageBulk::commit(bool success)
+{
+  finish();
+  if (success && !m_index->is_clust() && page_is_leaf(m_page))
+    ibuf_set_bitmap_for_bulk_load(m_block, &m_mtr,
+                                  innobase_fill_factor == 100);
+  m_mtr.commit();
+}
+
+/** Compress a page of a compressed table
+@return true if compression succeeded or was not needed
+@return false if compression failed */
+bool
+PageBulk::compress()
+{
+	ut_ad(m_page_zip != NULL);
+
+	return page_zip_compress(m_block, m_index, page_zip_level, &m_mtr);
+}
+
+/** Get node pointer
+@return node pointer */
+dtuple_t*
+PageBulk::getNodePtr()
+{
+	rec_t*		first_rec;
+	dtuple_t*	node_ptr;
+
+	/* Create node pointer */
+	first_rec = page_rec_get_next(page_get_infimum_rec(m_page));
+	ut_a(page_rec_is_user_rec(first_rec));
+	node_ptr = dict_index_build_node_ptr(m_index, first_rec, m_page_no,
+					     m_heap, m_level);
+
+	return(node_ptr);
+}
+
+/** Get the split rec in the left page. We split a page in half when
+compression fails, and the split rec will be copied to the right page.
+@return split rec */
+rec_t*
+PageBulk::getSplitRec()
+{
+	rec_t*		rec;
+	rec_offs*	offsets;
+	ulint		total_used_size;
+	ulint		total_recs_size;
+	ulint		n_recs;
+
+	ut_ad(m_page_zip != NULL);
+	ut_ad(m_rec_no >= 2);
+	ut_ad(!m_index->is_instant());
+
+	ut_ad(page_get_free_space_of_empty(m_is_comp) > m_free_space);
+	total_used_size = page_get_free_space_of_empty(m_is_comp)
+		- m_free_space;
+
+	total_recs_size = 0;
+	n_recs = 0;
+	offsets = NULL;
+	rec = page_get_infimum_rec(m_page);
+	const ulint n_core = page_is_leaf(m_page) ? m_index->n_core_fields : 0;
+
+	do {
+		rec = page_rec_get_next(rec);
+		ut_ad(page_rec_is_user_rec(rec));
+
+		offsets = rec_get_offsets(rec, m_index, offsets, n_core,
+					  ULINT_UNDEFINED, &m_heap);
+		total_recs_size += rec_offs_size(offsets);
+		n_recs++;
+	} while (total_recs_size + page_dir_calc_reserved_space(n_recs)
+		 < total_used_size / 2);
+
+	/* Keep at least one record on left page */
+	if (page_rec_is_first(rec, m_page)) {
+		rec = page_rec_get_next(rec);
+		ut_ad(page_rec_is_user_rec(rec));
+	}
+
+	return(rec);
+}
+
+/** Copy all records after the split rec, including itself.
+@param[in]	split_rec	split rec */
+void
+PageBulk::copyIn(
+	rec_t*	split_rec)
+{
+	rec_t*		rec = split_rec;
+	rec_offs*	offsets = NULL;
+
+	ut_ad(m_rec_no == 0);
+	ut_ad(page_rec_is_user_rec(rec));
+
+	const ulint n_core = page_rec_is_leaf(rec)
+		? m_index->n_core_fields : 0;
+
+	do {
+		offsets = rec_get_offsets(rec, m_index, offsets, n_core,
+					  ULINT_UNDEFINED, &m_heap);
+
+		insert(rec, offsets);
+
+		rec = page_rec_get_next(rec);
+	} while (!page_rec_is_supremum(rec));
+
+	ut_ad(m_rec_no > 0);
+}
+
+/** Remove all records after the split rec, including itself.
+@param[in]	split_rec	split rec */
+void
+PageBulk::copyOut(
+	rec_t*	split_rec)
+{
+	/* Suppose before copyOut, we have 5 records on the page:
+	infimum->r1->r2->r3->r4->r5->supremum, and r3 is the split rec.
+
+	After copyOut, we have 2 records on the page:
+	infimum->r1->r2->supremum. Slot adjustment is not done.
*/ + + rec_t *rec = page_get_infimum_rec(m_page); + ulint n; + + for (n = 0;; n++) { + rec_t *next = page_rec_get_next(rec); + if (next == split_rec) { + break; + } + rec = next; + } + + ut_ad(n > 0); + + const rec_t *last_rec = split_rec; + for (;;) { + const rec_t *next = page_rec_get_next_const(last_rec); + if (page_rec_is_supremum(next)) { + break; + } + last_rec = next; + } + + /* Set last record's next in page */ + const ulint n_core = page_rec_is_leaf(split_rec) + ? m_index->n_core_fields : 0; + + rec_offs* offsets = rec_get_offsets(rec, m_index, nullptr, n_core, + ULINT_UNDEFINED, &m_heap); + mach_write_to_2(rec - REC_NEXT, m_is_comp + ? static_cast<uint16_t> + (PAGE_NEW_SUPREMUM - page_offset(rec)) + : PAGE_OLD_SUPREMUM); + + /* Set related members */ + m_cur_rec = rec; + m_heap_top = rec_get_end(rec, offsets); + + offsets = rec_get_offsets(last_rec, m_index, offsets, n_core, + ULINT_UNDEFINED, &m_heap); + + m_free_space += ulint(rec_get_end(last_rec, offsets) - m_heap_top) + + page_dir_calc_reserved_space(m_rec_no) + - page_dir_calc_reserved_space(n); + ut_ad(lint(m_free_space) > 0); + m_rec_no = n; + +#ifdef UNIV_DEBUG + m_total_data -= ulint(rec_get_end(last_rec, offsets) - m_heap_top); +#endif /* UNIV_DEBUG */ +} + +/** Set next page +@param[in] next_page_no next page no */ +inline void PageBulk::setNext(ulint next_page_no) +{ + if (UNIV_LIKELY_NULL(m_page_zip)) + /* For ROW_FORMAT=COMPRESSED, redo log may be written + in PageBulk::compress(). */ + mach_write_to_4(m_page + FIL_PAGE_NEXT, next_page_no); + else + m_mtr.write<4>(*m_block, m_page + FIL_PAGE_NEXT, next_page_no); +} + +/** Set previous page +@param[in] prev_page_no previous page no */ +inline void PageBulk::setPrev(ulint prev_page_no) +{ + if (UNIV_LIKELY_NULL(m_page_zip)) + /* For ROW_FORMAT=COMPRESSED, redo log may be written + in PageBulk::compress(). */ + mach_write_to_4(m_page + FIL_PAGE_PREV, prev_page_no); + else + m_mtr.write<4>(*m_block, m_page + FIL_PAGE_PREV, prev_page_no); +} + +/** Check if the required space is available in the page for the rec +to be inserted. We check fill factor & padding here. +@param[in] rec_size required space +@return true if space is available */ +bool +PageBulk::isSpaceAvailable( + ulint rec_size) +{ + if (m_rec_no >= 8190) { + ut_ad(srv_page_size == 65536); + return false; + } + + ulint slot_size; + ulint required_space; + + slot_size = page_dir_calc_reserved_space(m_rec_no + 1) + - page_dir_calc_reserved_space(m_rec_no); + + required_space = rec_size + slot_size; + + if (required_space > m_free_space) { + ut_ad(m_rec_no > 0); + return false; + } + + /* Fillfactor & Padding apply to both leaf and non-leaf pages. + Note: we keep at least 2 records in a page to avoid the B-tree + level growing too high. */ + if (m_rec_no >= 2 + && ((m_page_zip == NULL && m_free_space - required_space + < m_reserved_space) + || (m_page_zip != NULL && m_free_space - required_space + < m_padding_space))) { + return(false); + } + + return(true); +} + +/** Check whether the record needs to be stored externally. +@return false if the entire record can be stored locally on the page */ +bool +PageBulk::needExt( + const dtuple_t* tuple, + ulint rec_size) +{ + return page_zip_rec_needs_ext(rec_size, m_is_comp, + dtuple_get_n_fields(tuple), + m_block->zip_size()); +} + +/** Store an external record. +Since the record is not logged yet, we do not log the update to the record; +the BLOB data is logged first, and then the record is logged in bulk mode.
+@param[in] big_rec external record +@param[in] offsets record offsets +@return error code */ +dberr_t +PageBulk::storeExt( + const big_rec_t* big_rec, + rec_offs* offsets) +{ + finish(); + + /* Note: not all fields are initialized in btr_pcur. */ + btr_pcur_t btr_pcur; + btr_pcur.pos_state = BTR_PCUR_IS_POSITIONED; + btr_pcur.latch_mode = BTR_MODIFY_LEAF; + btr_pcur.btr_cur.page_cur.index = m_index; + btr_pcur.btr_cur.page_cur.rec = m_cur_rec; + btr_pcur.btr_cur.page_cur.offsets = offsets; + btr_pcur.btr_cur.page_cur.block = m_block; + + dberr_t err = btr_store_big_rec_extern_fields( + &btr_pcur, offsets, big_rec, &m_mtr, BTR_STORE_INSERT_BULK); + + return(err); +} + +/** Release the block by committing the mtr. +Note: log_free_check requires holding no lock/latch in the current thread. */ +void +PageBulk::release() +{ + finish(); + + /* We fix the block because we will re-pin it soon. */ + m_block->page.fix(); + + /* No other threads can modify this block. */ + m_modify_clock = buf_block_get_modify_clock(m_block); + + m_mtr.commit(); +} + +/** Start mtr and latch the block */ +void PageBulk::latch() +{ + m_mtr.start(); + m_index->set_modified(m_mtr); +#ifdef BTR_CUR_HASH_ADAPT + ut_ad(!m_block->index); +#endif + m_block->page.lock.x_lock(); + ut_ad(m_block->page.buf_fix_count()); + m_mtr.memo_push(m_block, MTR_MEMO_PAGE_X_FIX); + + ut_ad(m_cur_rec > m_page); + ut_ad(m_cur_rec < m_heap_top); +} + +/** Split a page +@param[in] page_bulk page to split +@param[in] next_page_bulk next page +@return error code */ +dberr_t +BtrBulk::pageSplit( + PageBulk* page_bulk, + PageBulk* next_page_bulk) +{ + ut_ad(page_bulk->getPageZip() != NULL); + + if (page_bulk->getRecNo() <= 1) { + return(DB_TOO_BIG_RECORD); + } + + /* Initialize a new page */ + PageBulk new_page_bulk(m_index, m_trx->id, FIL_NULL, + page_bulk->getLevel()); + dberr_t err = new_page_bulk.init(); + if (err != DB_SUCCESS) { + return(err); + } + + /* Copy the upper half to the new page. */ + rec_t* split_rec = page_bulk->getSplitRec(); + new_page_bulk.copyIn(split_rec); + page_bulk->copyOut(split_rec); + + /* Commit the pages after the split. */ + err = pageCommit(page_bulk, &new_page_bulk, true); + if (err != DB_SUCCESS) { + pageAbort(&new_page_bulk); + return(err); + } + + err = pageCommit(&new_page_bulk, next_page_bulk, true); + if (err != DB_SUCCESS) { + pageAbort(&new_page_bulk); + return(err); + } + + return(err); +} + +/** Commit (finish) a page. We set the next/prev page no, compress a page +of a compressed table and split the page if compression fails, insert a +node pointer to the father page if needed, and commit the mini-transaction. +@param[in] page_bulk page to commit +@param[in] next_page_bulk next page +@param[in] insert_father false when page_bulk is a root page and + true when it's a non-root page +@return error code */ +dberr_t +BtrBulk::pageCommit( + PageBulk* page_bulk, + PageBulk* next_page_bulk, + bool insert_father) +{ + page_bulk->finish(); + + /* Set page links */ + if (next_page_bulk != NULL) { + ut_ad(page_bulk->getLevel() == next_page_bulk->getLevel()); + + page_bulk->setNext(next_page_bulk->getPageNo()); + next_page_bulk->setPrev(page_bulk->getPageNo()); + } else { + ut_ad(!page_has_next(page_bulk->getPage())); + /* If a page is released and latched again, we need to + mark it modified in the mini-transaction. */ + page_bulk->set_modified(); + } + + ut_ad(!m_index->lock.have_any()); + + /* Compress the page if it belongs to a compressed table.
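+ If page_zip_compress() fails, pageSplit() below moves the upper half + of the records to a new right sibling (see PageBulk::getSplitRec()) + and commits both halves through pageCommit() again; each half should + then compress.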
*/ + if (page_bulk->getPageZip() != NULL && !page_bulk->compress()) { + return(pageSplit(page_bulk, next_page_bulk)); + } + + /* Insert node pointer to father page. */ + if (insert_father) { + dtuple_t* node_ptr = page_bulk->getNodePtr(); + dberr_t err = insert(node_ptr, page_bulk->getLevel()+1); + + if (err != DB_SUCCESS) { + return(err); + } + } + + /* Commit mtr. */ + page_bulk->commit(true); + + return(DB_SUCCESS); +} + +/** Log free check */ +inline void BtrBulk::logFreeCheck() +{ + if (log_sys.check_flush_or_checkpoint()) { + release(); + + log_check_margins(); + + latch(); + } +} + +/** Release all latches */ +void +BtrBulk::release() +{ + ut_ad(m_root_level + 1 == m_page_bulks.size()); + + for (ulint level = 0; level <= m_root_level; level++) { + PageBulk* page_bulk = m_page_bulks.at(level); + + page_bulk->release(); + } +} + +/** Re-latch all latches */ +void +BtrBulk::latch() +{ + ut_ad(m_root_level + 1 == m_page_bulks.size()); + + for (ulint level = 0; level <= m_root_level; level++) { + PageBulk* page_bulk = m_page_bulks.at(level); + page_bulk->latch(); + } +} + +/** Insert a tuple to page in a level +@param[in] tuple tuple to insert +@param[in] level B-tree level +@return error code */ +dberr_t +BtrBulk::insert( + dtuple_t* tuple, + ulint level) +{ + bool is_left_most = false; + dberr_t err = DB_SUCCESS; + + /* Check if we need to create a PageBulk for the level. */ + if (level + 1 > m_page_bulks.size()) { + PageBulk* new_page_bulk + = UT_NEW_NOKEY(PageBulk(m_index, m_trx->id, FIL_NULL, + level)); + err = new_page_bulk->init(); + if (err != DB_SUCCESS) { + UT_DELETE(new_page_bulk); + return(err); + } + + m_page_bulks.push_back(new_page_bulk); + ut_ad(level + 1 == m_page_bulks.size()); + m_root_level = level; + + is_left_most = true; + } + + ut_ad(m_page_bulks.size() > level); + + PageBulk* page_bulk = m_page_bulks.at(level); + + if (is_left_most && level > 0 && page_bulk->getRecNo() == 0) { + /* The node pointer must be marked as the predefined minimum + record, as there is no lower alphabetical limit to records in + the leftmost node of a level: */ + dtuple_set_info_bits(tuple, dtuple_get_info_bits(tuple) + | REC_INFO_MIN_REC_FLAG); + } + + ulint n_ext = 0; + ulint rec_size = rec_get_converted_size(m_index, tuple, n_ext); + big_rec_t* big_rec = NULL; + rec_t* rec = NULL; + rec_offs* offsets = NULL; + + if (page_bulk->needExt(tuple, rec_size)) { + /* The record is so big that we have to store some fields + externally on separate database pages */ + big_rec = dtuple_convert_big_rec(m_index, 0, tuple, &n_ext); + + if (big_rec == NULL) { + return(DB_TOO_BIG_RECORD); + } + + rec_size = rec_get_converted_size(m_index, tuple, n_ext); + } + + if (page_bulk->getPageZip() != NULL + && page_zip_is_too_big(m_index, tuple)) { + err = DB_TOO_BIG_RECORD; + goto func_exit; + } + + if (!page_bulk->isSpaceAvailable(rec_size)) { + /* Create a sibling page_bulk. */ + PageBulk* sibling_page_bulk; + sibling_page_bulk = UT_NEW_NOKEY(PageBulk(m_index, m_trx->id, + FIL_NULL, level)); + err = sibling_page_bulk->init(); + if (err != DB_SUCCESS) { + UT_DELETE(sibling_page_bulk); + goto func_exit; + } + + /* Commit page bulk. */ + err = pageCommit(page_bulk, sibling_page_bulk, true); + if (err != DB_SUCCESS) { + pageAbort(sibling_page_bulk); + UT_DELETE(sibling_page_bulk); + goto func_exit; + } + + /* Set new page bulk to page_bulks. 
*/ + ut_ad(sibling_page_bulk->getLevel() <= m_root_level); + m_page_bulks.at(level) = sibling_page_bulk; + + UT_DELETE(page_bulk); + page_bulk = sibling_page_bulk; + + /* Important: invoke logFreeCheck() to check whether we need a + checkpoint. */ + if (page_is_leaf(sibling_page_bulk->getPage())) { + if (trx_is_interrupted(m_trx)) { + err = DB_INTERRUPTED; + goto func_exit; + } + + srv_inc_activity_count(); + logFreeCheck(); + } + } + + /* Convert the tuple to a rec. */ + rec = rec_convert_dtuple_to_rec(static_cast<byte*>(mem_heap_alloc( + page_bulk->m_heap, rec_size)), m_index, tuple, n_ext); + offsets = rec_get_offsets(rec, m_index, offsets, level + ? 0 : m_index->n_core_fields, + ULINT_UNDEFINED, &page_bulk->m_heap); + + page_bulk->insert(rec, offsets); + + if (big_rec != NULL) { + ut_ad(dict_index_is_clust(m_index)); + ut_ad(page_bulk->getLevel() == 0); + ut_ad(page_bulk == m_page_bulks.at(0)); + + /* Release all pages above the leaf level */ + for (ulint level = 1; level <= m_root_level; level++) { + m_page_bulks.at(level)->release(); + } + + err = page_bulk->storeExt(big_rec, offsets); + + /* Latch */ + for (ulint level = 1; level <= m_root_level; level++) { + PageBulk* page_bulk = m_page_bulks.at(level); + page_bulk->latch(); + } + } + +func_exit: + if (big_rec != NULL) { + dtuple_convert_back_big_rec(m_index, tuple, big_rec); + } + + return(err); +} + +/** B-tree bulk load finish. We commit the last page in each level, +and copy the last page of the top level to the root page of the index +if no error occurs. +@param[in] err whether the bulk load was successful until now +@return error code */ +dberr_t +BtrBulk::finish(dberr_t err) +{ + uint32_t last_page_no = FIL_NULL; + + ut_ad(!m_index->table->is_temporary()); + + if (m_page_bulks.size() == 0) { + /* The table is empty. The root page of the index tree + is already in a consistent state. No need to flush. */ + return(err); + } + + ut_ad(m_root_level + 1 == m_page_bulks.size()); + + /* Finish all page bulks */ + for (ulint level = 0; level <= m_root_level; level++) { + PageBulk* page_bulk = m_page_bulks.at(level); + + last_page_no = page_bulk->getPageNo(); + + if (err == DB_SUCCESS) { + err = pageCommit(page_bulk, NULL, + level != m_root_level); + } + + if (err != DB_SUCCESS) { + pageAbort(page_bulk); + } + + UT_DELETE(page_bulk); + } + + if (err == DB_SUCCESS) { + rec_t* first_rec; + mtr_t mtr; + buf_block_t* last_block; + PageBulk root_page_bulk(m_index, m_trx->id, + m_index->page, m_root_level); + + mtr.start(); + m_index->set_modified(mtr); + mtr_x_lock_index(m_index, &mtr); + + ut_ad(last_page_no != FIL_NULL); + last_block = btr_block_get(*m_index, last_page_no, RW_X_LATCH, + false, &mtr); + if (!last_block) { + err = DB_CORRUPTION; +err_exit: + mtr.commit(); + return err; + } + + first_rec = page_rec_get_next( + page_get_infimum_rec(last_block->page.frame)); + /* Because this index tree is being created by this thread, + we assume that it cannot be corrupted. */ + ut_ad(first_rec); + ut_ad(page_rec_is_user_rec(first_rec)); + + /* Copy the last page to the root page. */ + err = root_page_bulk.init(); + if (err != DB_SUCCESS) { + goto err_exit; + } + root_page_bulk.copyIn(first_rec); + root_page_bulk.finish(); + + /* Remove the last page.
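+ Its records now live in the root (copied via copyIn() above); the + B-tree root page number must remain index->page, which is why the + top level is copied into the root instead of being linked in.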
*/ + err = btr_page_free(m_index, last_block, &mtr); + mtr.commit(); + + if (dberr_t e = pageCommit(&root_page_bulk, NULL, false)) { + err = e; + } + ut_ad(err == DB_SUCCESS); + } + + ut_ad(err != DB_SUCCESS + || btr_validate_index(m_index, NULL) == DB_SUCCESS); + return(err); +} diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc new file mode 100644 index 00000000..e736f338 --- /dev/null +++ b/storage/innobase/btr/btr0cur.cc @@ -0,0 +1,7017 @@ +/***************************************************************************** + +Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. +Copyright (c) 2015, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file btr/btr0cur.cc +The index tree cursor + +All changes that row operations make to a B-tree or the records +there must go through this module! Undo log records are written here +of every modify or insert of a clustered index record. + + NOTE!!! +To make sure we do not run out of disk space during a pessimistic +insert or update, we have to reserve 2 x the height of the index tree +many pages in the tablespace before we start the operation, because +if leaf splitting has been started, it is difficult to undo, except +by crashing the database and doing a roll-forward. + +Created 10/16/1994 Heikki Tuuri +*******************************************************/ + +#include "btr0cur.h" +#include "row0upd.h" +#include "mtr0log.h" +#include "page0page.h" +#include "page0zip.h" +#include "rem0rec.h" +#include "rem0cmp.h" +#include "buf0lru.h" +#include "buf0rea.h" +#include "btr0btr.h" +#include "btr0sea.h" +#include "row0log.h" +#include "row0purge.h" +#include "row0upd.h" +#include "trx0rec.h" +#include "trx0roll.h" +#include "que0que.h" +#include "row0row.h" +#include "srv0srv.h" +#include "ibuf0ibuf.h" +#include "lock0lock.h" +#include "zlib.h" +#include "srv0start.h" +#include "mysql_com.h" +#include "dict0stats.h" +#include "row0ins.h" +#ifdef WITH_WSREP +#include "mysql/service_wsrep.h" +#endif /* WITH_WSREP */ +#include "log.h" + +/** Buffered B-tree operation types, introduced as part of delete buffering. */ +enum btr_op_t { + BTR_NO_OP = 0, /*!< Not buffered */ + BTR_INSERT_OP, /*!< Insert, do not ignore UNIQUE */ + BTR_INSERT_IGNORE_UNIQUE_OP, /*!< Insert, ignoring UNIQUE */ + BTR_DELETE_OP, /*!< Purge a delete-marked record */ + BTR_DELMARK_OP /*!< Mark a record for deletion */ +}; + +/** Modification types for the B-tree operation. + Note that the order must be DELETE, BOTH, INSERT !! 
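+ The ordering allows range checks such as + + lock_intention <= BTR_INTENTION_BOTH /* DELETE or BOTH */ + lock_intention >= BTR_INTENTION_BOTH /* BOTH or INSERT */ + + on which btr_cur_will_modify_tree() below relies.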
+ */ +enum btr_intention_t { + BTR_INTENTION_DELETE, + BTR_INTENTION_BOTH, + BTR_INTENTION_INSERT +}; + +/** For the index->lock scalability improvement, the only clear +performance regression observed was caused by a hugely grown history +list. That is because the exclusive use of index->lock also worked as +reserving free blocks and read I/O bandwidth with priority. To keep the +history list from growing much larger than it could under the previous +implementation, we prioritize pessimistic tree operations issued by +purge, as before, when the list seems to be growing huge. + + Experimentally, the history list length starts to clearly affect +performance throughput from about 100000. */ +#define BTR_CUR_FINE_HISTORY_LENGTH 100000 + +#ifdef BTR_CUR_HASH_ADAPT +/** Number of searches down the B-tree in btr_cur_t::search_leaf(). */ +ib_counter_t<ulint> btr_cur_n_non_sea; +/** Old value of btr_cur_n_non_sea. Copied by +srv_refresh_innodb_monitor_stats(). Referenced by +srv_printf_innodb_monitor(). */ +ulint btr_cur_n_non_sea_old; +/** Number of successful adaptive hash index lookups in +btr_cur_t::search_leaf(). */ +ib_counter_t<ulint> btr_cur_n_sea; +/** Old value of btr_cur_n_sea. Copied by +srv_refresh_innodb_monitor_stats(). Referenced by +srv_printf_innodb_monitor(). */ +ulint btr_cur_n_sea_old; +#endif /* BTR_CUR_HASH_ADAPT */ + +#ifdef UNIV_DEBUG +/* Flag to limit optimistic insert records */ +uint btr_cur_limit_optimistic_insert_debug; +#endif /* UNIV_DEBUG */ + +/** In the optimistic insert, if the insert does not fit, but this much space +can be released by page reorganize, then it is reorganized */ +#define BTR_CUR_PAGE_REORGANIZE_LIMIT (srv_page_size / 32) + +/** The structure of a BLOB part header */ +/* @{ */ +/*--------------------------------------*/ +#define BTR_BLOB_HDR_PART_LEN 0 /*!< BLOB part len on this + page */ +#define BTR_BLOB_HDR_NEXT_PAGE_NO 4 /*!< next BLOB part page no, + FIL_NULL if none */ +/*--------------------------------------*/ +#define BTR_BLOB_HDR_SIZE 8 /*!< Size of a BLOB + part header, in bytes */ + +/* @} */ + +/*******************************************************************//** +Marks all extern fields in a record as owned by the record. This function +should be called if the delete mark of a record is removed: a record that +is not delete-marked always owns all its extern fields. */ +static +void +btr_cur_unmark_extern_fields( +/*=========================*/ + buf_block_t* block, /*!< in/out: index page */ + rec_t* rec, /*!< in/out: record in a clustered index */ + dict_index_t* index, /*!< in: index of the page */ + const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ + mtr_t* mtr); /*!< in: mtr, or NULL if not logged */ +/***********************************************************//** +Frees the externally stored fields for a record, if the field is mentioned +in the update vector. */ +static +void +btr_rec_free_updated_extern_fields( +/*===============================*/ + dict_index_t* index, /*!< in: index of rec; the index tree MUST be + X-latched */ + rec_t* rec, /*!< in: record */ + buf_block_t* block, /*!< in: index page of rec */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + const upd_t* update, /*!< in: update vector */ + bool rollback,/*!< in: performing rollback? */ + mtr_t* mtr); /*!< in: mini-transaction handle which contains + an X-latch to record page and to the tree */ +/***********************************************************//** +Frees the externally stored fields for a record.
*/ +static +void +btr_rec_free_externally_stored_fields( +/*==================================*/ + dict_index_t* index, /*!< in: index of the data, the index + tree MUST be X-latched */ + rec_t* rec, /*!< in: record */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + buf_block_t* block, /*!< in: index page of rec */ + bool rollback,/*!< in: performing rollback? */ + mtr_t* mtr); /*!< in: mini-transaction handle which contains + an X-latch to record page and to the index + tree */ + +/*==================== B-TREE SEARCH =========================*/ + +/** Load the instant ALTER TABLE metadata from the clustered index +when loading a table definition. +@param[in,out] index clustered index definition +@param[in,out] mtr mini-transaction +@return error code +@retval DB_SUCCESS if no error occurred +@retval DB_CORRUPTION if any corruption was noticed */ +static dberr_t btr_cur_instant_init_low(dict_index_t* index, mtr_t* mtr) +{ + ut_ad(index->is_primary()); + ut_ad(index->n_core_null_bytes == dict_index_t::NO_CORE_NULL_BYTES); + ut_ad(index->table->supports_instant()); + ut_ad(index->table->is_readable()); + + dberr_t err; + const fil_space_t* space = index->table->space; + if (!space) { +corrupted: + err = DB_CORRUPTION; +unreadable: + ib::error() << "Table " << index->table->name + << " has an unreadable root page"; + index->table->corrupted = true; + index->table->file_unreadable = true; + return err; + } + + buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, mtr, &err); + if (!root) { + goto unreadable; + } + + if (btr_cur_instant_root_init(index, root->page.frame)) { + goto corrupted; + } + + ut_ad(index->n_core_null_bytes != dict_index_t::NO_CORE_NULL_BYTES); + + if (fil_page_get_type(root->page.frame) == FIL_PAGE_INDEX) { + ut_ad(!index->is_instant()); + return DB_SUCCESS; + } + + btr_cur_t cur; + /* Relax the assertion in rec_init_offsets(). */ + ut_ad(!index->in_instant_init); + ut_d(index->in_instant_init = true); + err = cur.open_leaf(true, index, BTR_SEARCH_LEAF, mtr); + ut_d(index->in_instant_init = false); + if (err != DB_SUCCESS) { + index->table->file_unreadable = true; + index->table->corrupted = true; + return err; + } + + ut_ad(page_cur_is_before_first(&cur.page_cur)); + ut_ad(page_is_leaf(cur.page_cur.block->page.frame)); + + const rec_t* rec = page_cur_move_to_next(&cur.page_cur); + const ulint comp = dict_table_is_comp(index->table); + const ulint info_bits = rec ? rec_get_info_bits(rec, comp) : 0; + + if (page_rec_is_supremum(rec) + || !(info_bits & REC_INFO_MIN_REC_FLAG)) { + if (rec && !index->is_instant()) { + /* The FIL_PAGE_TYPE_INSTANT and PAGE_INSTANT may be + assigned even if instant ADD COLUMN was not + committed. Changes to these page header fields are not + undo-logged, but changes to the hidden metadata record + are. If the server is killed and restarted, the page + header fields could remain set even though no metadata + record is present. */ + return DB_SUCCESS; + } + + ib::error() << "Table " << index->table->name + << " is missing instant ALTER metadata"; + index->table->corrupted = true; + return DB_CORRUPTION; + } + + if ((info_bits & ~REC_INFO_DELETED_FLAG) != REC_INFO_MIN_REC_FLAG + || (comp && rec_get_status(rec) != REC_STATUS_INSTANT)) { +incompatible: + ib::error() << "Table " << index->table->name + << " contains unrecognizable instant ALTER metadata"; + index->table->corrupted = true; + return DB_CORRUPTION; + } + + /* Read the metadata. 
We can get here on server restart + or when the table was evicted from the data dictionary cache + and is now being accessed again. + + Here, READ COMMITTED and REPEATABLE READ should be equivalent. + Committing the ADD COLUMN operation would acquire + MDL_EXCLUSIVE and LOCK_X|LOCK_TABLE, which would prevent any + concurrent operations on the table, including table eviction + from the cache. */ + + if (info_bits & REC_INFO_DELETED_FLAG) { + /* This metadata record includes a BLOB that identifies + any dropped or reordered columns. */ + ulint trx_id_offset = index->trx_id_offset; + /* If !index->trx_id_offset, the PRIMARY KEY contains + variable-length columns. For the metadata record, + variable-length columns should be written with zero + length. However, before MDEV-21088 was fixed, for + variable-length encoded PRIMARY KEY column of type + CHAR, we wrote more than zero bytes. That is why we + must determine the actual length of each PRIMARY KEY + column. The DB_TRX_ID will start right after any + PRIMARY KEY columns. */ + ut_ad(index->n_uniq); + + /* We cannot invoke rec_get_offsets() before + index->table->deserialise_columns(). Therefore, + we must duplicate some logic here. */ + if (trx_id_offset) { + } else if (index->table->not_redundant()) { + /* The PRIMARY KEY contains variable-length columns. + For the metadata record, variable-length columns are + always written with zero length. The DB_TRX_ID will + start right after any fixed-length columns. */ + + /* OK, before MDEV-21088 was fixed, for + variable-length encoded PRIMARY KEY column of + type CHAR, we wrote more than zero bytes. In + order to allow affected tables to be accessed, + it would be nice to determine the actual + length of each PRIMARY KEY column. However, to + be able to do that, we should determine the + size of the null-bit bitmap in the metadata + record. And we cannot know that before reading + the metadata BLOB, whose starting point we are + trying to find here. (Although the PRIMARY KEY + columns cannot be NULL, we would have to know + where the lengths of variable-length PRIMARY KEY + columns start.) + + So, unfortunately we cannot help users who + were affected by MDEV-21088 on a ROW_FORMAT=COMPACT + or ROW_FORMAT=DYNAMIC table. 
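+ + As a worked example (hypothetical schema): for a table with + PRIMARY KEY(a INT, b INT) in ROW_FORMAT=DYNAMIC, both key fields + have fixed_len = 4, so the loop below computes trx_id_offset = 8, + and DB_TRX_ID starts at rec + 8.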
*/ + + for (uint i = index->n_uniq; i--; ) { + trx_id_offset += index->fields[i].fixed_len; + } + } else if (rec_get_1byte_offs_flag(rec)) { + trx_id_offset = rec_1_get_field_end_info( + rec, index->n_uniq - 1); + ut_ad(!(trx_id_offset & REC_1BYTE_SQL_NULL_MASK)); + trx_id_offset &= ~REC_1BYTE_SQL_NULL_MASK; + } else { + trx_id_offset = rec_2_get_field_end_info( + rec, index->n_uniq - 1); + ut_ad(!(trx_id_offset & REC_2BYTE_SQL_NULL_MASK)); + trx_id_offset &= ~REC_2BYTE_SQL_NULL_MASK; + } + + const byte* ptr = rec + trx_id_offset + + (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + if (mach_read_from_4(ptr + BTR_EXTERN_LEN)) { + goto incompatible; + } + + uint len = mach_read_from_4(ptr + BTR_EXTERN_LEN + 4); + if (!len + || mach_read_from_4(ptr + BTR_EXTERN_OFFSET) + != FIL_PAGE_DATA + || mach_read_from_4(ptr + BTR_EXTERN_SPACE_ID) + != space->id) { + goto incompatible; + } + + buf_block_t* block = buf_page_get( + page_id_t(space->id, + mach_read_from_4(ptr + BTR_EXTERN_PAGE_NO)), + 0, RW_S_LATCH, mtr); + if (!block) { + goto incompatible; + } + + if (fil_page_get_type(block->page.frame) != FIL_PAGE_TYPE_BLOB + || mach_read_from_4(&block->page.frame + [FIL_PAGE_DATA + + BTR_BLOB_HDR_NEXT_PAGE_NO]) + != FIL_NULL + || mach_read_from_4(&block->page.frame + [FIL_PAGE_DATA + + BTR_BLOB_HDR_PART_LEN]) + != len) { + goto incompatible; + } + + /* The unused part of the BLOB page should be zero-filled. */ + for (const byte* b = block->page.frame + + (FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE) + len, + * const end = block->page.frame + srv_page_size + - BTR_EXTERN_LEN; + b < end; ) { + if (*b++) { + goto incompatible; + } + } + + if (index->table->deserialise_columns( + &block->page.frame + [FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE], len)) { + goto incompatible; + } + + /* Proceed to initialize the default values of + any instantly added columns. */ + } + + mem_heap_t* heap = NULL; + rec_offs* offsets = rec_get_offsets(rec, index, NULL, + index->n_core_fields, + ULINT_UNDEFINED, &heap); + if (rec_offs_any_default(offsets)) { +inconsistent: + mem_heap_free(heap); + goto incompatible; + } + + /* In fact, because we only ever append fields to the metadata + record, it is also OK to perform READ UNCOMMITTED and + then ignore any extra fields, provided that + trx_sys.is_registered(DB_TRX_ID). 
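+ The check below implements this: a record carrying more fields than + index->n_fields (plus the metadata flag) is tolerated only while the + DB_TRX_ID of the metadata record is still registered in trx_sys.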
*/ + if (rec_offs_n_fields(offsets) + > ulint(index->n_fields) + !!index->table->instant + && !trx_sys.is_registered(current_trx(), + row_get_rec_trx_id(rec, index, + offsets))) { + goto inconsistent; + } + + for (unsigned i = index->n_core_fields; i < index->n_fields; i++) { + dict_col_t* col = index->fields[i].col; + const unsigned o = i + !!index->table->instant; + ulint len; + const byte* data = rec_get_nth_field(rec, offsets, o, &len); + ut_ad(!col->is_added()); + ut_ad(!col->def_val.data); + col->def_val.len = len; + switch (len) { + case UNIV_SQL_NULL: + continue; + case 0: + col->def_val.data = field_ref_zero; + continue; + } + ut_ad(len != UNIV_SQL_DEFAULT); + if (!rec_offs_nth_extern(offsets, o)) { + col->def_val.data = mem_heap_dup( + index->table->heap, data, len); + } else if (len < BTR_EXTERN_FIELD_REF_SIZE + || !memcmp(data + len - BTR_EXTERN_FIELD_REF_SIZE, + field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE)) { + col->def_val.len = UNIV_SQL_DEFAULT; + goto inconsistent; + } else { + col->def_val.data = btr_copy_externally_stored_field( + &col->def_val.len, data, + cur.page_cur.block->zip_size(), + len, index->table->heap); + } + } + + mem_heap_free(heap); + return DB_SUCCESS; +} + +/** Load the instant ALTER TABLE metadata from the clustered index +when loading a table definition. +@param[in,out] table table definition from the data dictionary +@return error code +@retval DB_SUCCESS if no error occurred */ +dberr_t +btr_cur_instant_init(dict_table_t* table) +{ + mtr_t mtr; + dict_index_t* index = dict_table_get_first_index(table); + mtr.start(); + dberr_t err = index + ? btr_cur_instant_init_low(index, &mtr) + : DB_CORRUPTION; + mtr.commit(); + return(err); +} + +/** Initialize the n_core_null_bytes on first access to a clustered +index root page. +@param[in] index clustered index that is on its first access +@param[in] page clustered index root page +@return whether the page is corrupted */ +bool btr_cur_instant_root_init(dict_index_t* index, const page_t* page) +{ + ut_ad(!index->is_dummy); + ut_ad(index->is_primary()); + ut_ad(!index->is_instant()); + ut_ad(index->table->supports_instant()); + + if (page_has_siblings(page)) { + return true; + } + + /* This is normally executed as part of btr_cur_instant_init() + when dict_load_table_one() is loading a table definition. + Other threads should not access or modify the n_core_null_bytes, + n_core_fields before dict_load_table_one() returns. + + This can also be executed during IMPORT TABLESPACE, where the + table definition is exclusively locked. */ + + switch (fil_page_get_type(page)) { + default: + return true; + case FIL_PAGE_INDEX: + /* The field PAGE_INSTANT is guaranteed 0 on clustered + index root pages of ROW_FORMAT=COMPACT or + ROW_FORMAT=DYNAMIC when instant ADD COLUMN is not used. */ + if (page_is_comp(page) && page_get_instant(page)) { + return true; + } + index->n_core_null_bytes = static_cast<uint8_t>( + UT_BITS_IN_BYTES(unsigned(index->n_nullable))); + return false; + case FIL_PAGE_TYPE_INSTANT: + break; + } + + const uint16_t n = page_get_instant(page); + + if (n < index->n_uniq + DATA_ROLL_PTR) { + /* The PRIMARY KEY (or hidden DB_ROW_ID) and + DB_TRX_ID,DB_ROLL_PTR columns must always be present + as 'core' fields.
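+ For example (hypothetical single-column PRIMARY KEY): n_uniq = 1 + and DATA_ROLL_PTR == 2, so the check on n above rejects any + n < 3, because at least the key column plus DB_TRX_ID and + DB_ROLL_PTR must be core fields.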
*/ + return true; + } + + if (n > REC_MAX_N_FIELDS) { + return true; + } + + index->n_core_fields = n & dict_index_t::MAX_N_FIELDS; + + const rec_t* infimum = page_get_infimum_rec(page); + const rec_t* supremum = page_get_supremum_rec(page); + + if (!memcmp(infimum, "infimum", 8) + && !memcmp(supremum, "supremum", 8)) { + if (n > index->n_fields) { + /* All fields, including those for instantly + added columns, must be present in the + data dictionary. */ + return true; + } + + ut_ad(!index->is_dummy); + ut_d(index->is_dummy = true); + index->n_core_null_bytes = static_cast<uint8_t>( + UT_BITS_IN_BYTES(index->get_n_nullable(n))); + ut_d(index->is_dummy = false); + return false; + } + + if (memcmp(infimum, field_ref_zero, 8) + || memcmp(supremum, field_ref_zero, 7)) { + /* The infimum and supremum records must either contain + the original strings, or they must be filled with zero + bytes, except for the bytes that we have repurposed. */ + return true; + } + + index->n_core_null_bytes = supremum[7]; + return index->n_core_null_bytes > 128; +} + +/** +Gets the intention in btr_intention_t from latch_mode, and clears the +intention in latch_mode. +@param latch_mode in/out: pointer to latch_mode +@return intention for latching the tree */ +static +btr_intention_t btr_cur_get_and_clear_intention(btr_latch_mode *latch_mode) +{ + btr_intention_t intention; + + switch (*latch_mode & (BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE)) { + case BTR_LATCH_FOR_INSERT: + intention = BTR_INTENTION_INSERT; + break; + case BTR_LATCH_FOR_DELETE: + intention = BTR_INTENTION_DELETE; + break; + default: + /* both or unknown */ + intention = BTR_INTENTION_BOTH; + } + *latch_mode = btr_latch_mode( + *latch_mode & ~(BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE)); + + return(intention); +} + +/** @return whether the distance between two records is at most the +specified value */ +static bool +page_rec_distance_is_at_most(const rec_t *left, const rec_t *right, ulint val) +{ + do + { + if (left == right) + return true; + left= page_rec_get_next_const(left); + } + while (left && val--); + return false; +} + +/** Detects whether modifying the given record might require modifying +the tree structure. +@param[in] index index +@param[in] page page +@param[in] lock_intention lock intention for the tree operation +@param[in] rec record (current node_ptr) +@param[in] rec_size size of the record or max size of node_ptr +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] mtr mtr +@return true if tree modification is needed */ +static +bool +btr_cur_will_modify_tree( + dict_index_t* index, + const page_t* page, + btr_intention_t lock_intention, + const rec_t* rec, + ulint rec_size, + ulint zip_size, + mtr_t* mtr) +{ + ut_ad(!page_is_leaf(page)); + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + + /* A pessimistic delete of the first record causes a delete & insert + of the node_ptr at the upper level, and a subsequent page shrink is + possible, causing another node_ptr delete at the upper level. So we + should pay attention not only to the first and last records but also + to the 2nd record: if the "delete & insert" lands on a different + page, the 2nd record becomes the first record, and a following + compress might delete that record, causing an upper-level node_ptr + modification.
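+ + For example, deletes propagating upward from beneath a level-3 page + may in the worst case remove up to 2 ^ (3 - 1) = 4 node pointer + records from that page; see max_nodes_deleted below.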
*/ + + const ulint n_recs = page_get_n_recs(page); + + if (lock_intention <= BTR_INTENTION_BOTH) { + compile_time_assert(BTR_INTENTION_DELETE < BTR_INTENTION_BOTH); + compile_time_assert(BTR_INTENTION_BOTH < BTR_INTENTION_INSERT); + + if (!page_has_siblings(page)) { + return true; + } + + ulint margin = rec_size; + + if (lock_intention == BTR_INTENTION_BOTH) { + ulint level = btr_page_get_level(page); + + /* This value is the worst-case expectation for the + number of node_ptr records deleted from this page. It + is used to estimate whether the cursor position could + become the leftmost record in this page. */ + ulint max_nodes_deleted = 0; + + /* Tree-modifying operations coming from below this + level can logically delete at most (2 ^ (level - 1)) + records here, even in the extremely rare worst case. */ + if (level > 7) { + /* TODO: adjust this practical limit. */ + max_nodes_deleted = 64; + } else if (level > 0) { + max_nodes_deleted = (ulint)1 << (level - 1); + } + /* Check what a delete may cause (BTR_INTENTION_BOTH + or BTR_INTENTION_DELETE). */ + if (n_recs <= max_nodes_deleted * 2 + || page_rec_is_first(rec, page)) { + /* The cursor record can be the leftmost record + in this page. */ + return true; + } + + if (page_has_prev(page) + && page_rec_distance_is_at_most( + page_get_infimum_rec(page), rec, + max_nodes_deleted)) { + return true; + } + + if (page_has_next(page) + && page_rec_distance_is_at_most( + rec, page_get_supremum_rec(page), + max_nodes_deleted)) { + return true; + } + + /* A delete of the leftmost record in a page causes a + delete & insert at its parent page. After that, the + delete might cause btr_compress() and a record delete + at its parent page. Thus we should consider the + maximum number of deletes. */ + margin *= max_nodes_deleted; + } + + /* Safe because we already have the SX latch of the index + tree */ + if (page_get_data_size(page) + < margin + BTR_CUR_PAGE_COMPRESS_LIMIT(index)) { + return(true); + } + } + + if (lock_intention >= BTR_INTENTION_BOTH) { + /* Check what an insert may cause (BTR_INTENTION_BOTH + or BTR_INTENTION_INSERT). */ + + /* Once we invoke btr_cur_limit_optimistic_insert_debug, + we should check it here in advance, since the max allowable + number of records in a page is limited. */ + LIMIT_OPTIMISTIC_INSERT_DEBUG(n_recs, return true); + + /* We need 2 records' worth of space in case a single + split-and-insert cannot fit. + page_get_max_insert_size_after_reorganize() includes space + for the page directory already. */ + ulint max_size + = page_get_max_insert_size_after_reorganize(page, 2); + + if (max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT + rec_size + || max_size < rec_size * 2) { + return(true); + } + + /* TODO: optimize this condition for ROW_FORMAT=COMPRESSED. + This is based on the worst case, and we could invoke + page_zip_available() on the block->page.zip. */ + /* We need 2 records' space also for the worst compression + rate. */ + if (zip_size + && page_zip_empty_size(index->n_fields, zip_size) + <= rec_size * 2 + page_get_data_size(page) + + page_dir_calc_reserved_space(n_recs + 2)) { + return(true); + } + } + + return(false); +} + +/** Detects whether modifying the record might require a modification +opposite to the intention.
+@param bpage buffer pool page +@param is_clust whether this is a clustered index +@param lock_intention lock intention for the tree operation +@param node_ptr_max_size the maximum size of a node pointer +@param compress_limit BTR_CUR_PAGE_COMPRESS_LIMIT(index) +@param rec record (current node_ptr) +@return true if tree modification is needed */ +static bool btr_cur_need_opposite_intention(const buf_page_t &bpage, + bool is_clust, + btr_intention_t lock_intention, + ulint node_ptr_max_size, + ulint compress_limit, + const rec_t *rec) +{ + if (UNIV_LIKELY_NULL(bpage.zip.data) && + !page_zip_available(&bpage.zip, is_clust, node_ptr_max_size, 1)) + return true; + const page_t *const page= bpage.frame; + if (lock_intention != BTR_INTENTION_INSERT) + { + /* We compensate also for btr_cur_compress_recommendation() */ + if (!page_has_siblings(page) || + page_rec_is_first(rec, page) || page_rec_is_last(rec, page) || + page_get_data_size(page) < node_ptr_max_size + compress_limit) + return true; + if (lock_intention == BTR_INTENTION_DELETE) + return false; + } + else if (page_has_next(page) && page_rec_is_last(rec, page)) + return true; + LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page), return true); + const ulint max_size= page_get_max_insert_size_after_reorganize(page, 2); + return max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT + node_ptr_max_size || + max_size < node_ptr_max_size * 2; +} + +/** +@param[in] index b-tree +@return maximum size of a node pointer record in bytes */ +static ulint btr_node_ptr_max_size(const dict_index_t* index) +{ + if (dict_index_is_ibuf(index)) { + /* cannot estimate accurately */ + /* This is universal index for change buffer. + The max size of the entry is about max key length * 2. + (index key + primary key to be inserted to the index) + (The max key length is UNIV_PAGE_SIZE / 16 * 3 at + ha_innobase::max_supported_key_length(), + considering MAX_KEY_LENGTH = 3072 at MySQL imposes + the 3500 historical InnoDB value for 16K page size case.) + For the universal index, node_ptr contains most of the entry. + And 512 is enough to contain ibuf columns and meta-data */ + return srv_page_size / 8 * 3 + 512; + } + + /* Each record has page_no, length of page_no and header. */ + ulint comp = dict_table_is_comp(index->table); + ulint rec_max_size = comp + ? REC_NODE_PTR_SIZE + 1 + REC_N_NEW_EXTRA_BYTES + + UT_BITS_IN_BYTES(index->n_nullable) + : REC_NODE_PTR_SIZE + 2 + REC_N_OLD_EXTRA_BYTES + + 2 * index->n_fields; + + /* Compute the maximum possible record size. */ + for (ulint i = 0; i < dict_index_get_n_unique_in_tree(index); i++) { + const dict_field_t* field + = dict_index_get_nth_field(index, i); + const dict_col_t* col + = dict_field_get_col(field); + ulint field_max_size; + ulint field_ext_max_size; + + /* Determine the maximum length of the index field. */ + + field_max_size = dict_col_get_fixed_size(col, comp); + if (field_max_size) { + /* dict_index_add_col() should guarantee this */ + ut_ad(!field->prefix_len + || field->fixed_len == field->prefix_len); + /* Fixed lengths are not encoded + in ROW_FORMAT=COMPACT. 
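+ + As a worked example (hypothetical index): for a node pointer on a + NOT NULL 4-byte INT key column in ROW_FORMAT=COMPACT, the base size + computed above is REC_NODE_PTR_SIZE (4) + 1 + REC_N_NEW_EXTRA_BYTES + (5) + 0 null-flag bytes = 10 bytes, and the fixed length added below + brings the maximum node pointer record size to 14 bytes.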
*/ + rec_max_size += field_max_size; + continue; + } + + field_max_size = dict_col_get_max_size(col); + if (UNIV_UNLIKELY(!field_max_size)) { + switch (col->mtype) { + case DATA_VARCHAR: + if (!comp + && (!strcmp(index->table->name.m_name, + "SYS_FOREIGN") + || !strcmp(index->table->name.m_name, + "SYS_FOREIGN_COLS"))) { + break; + } + /* fall through */ + case DATA_FIXBINARY: + case DATA_BINARY: + case DATA_VARMYSQL: + case DATA_CHAR: + case DATA_MYSQL: + /* BINARY(0), VARBINARY(0), + CHAR(0) and VARCHAR(0) are possible + data type definitions in MariaDB. + The InnoDB internal SQL parser maps + CHAR to DATA_VARCHAR, so DATA_CHAR (or + DATA_MYSQL) is only coming from the + MariaDB SQL layer. */ + if (comp) { + /* Add a length byte, because + fixed-length empty field are + encoded as variable-length. + For ROW_FORMAT=REDUNDANT, + these bytes were added to + rec_max_size before this loop. */ + rec_max_size++; + } + continue; + } + + /* SYS_FOREIGN.ID is defined as CHAR in the + InnoDB internal SQL parser, which translates + into the incorrect VARCHAR(0). InnoDB does + not enforce maximum lengths of columns, so + that is why any data can be inserted in the + first place. + + Likewise, SYS_FOREIGN.FOR_NAME, + SYS_FOREIGN.REF_NAME, SYS_FOREIGN_COLS.ID, are + defined as CHAR, and also they are part of a key. */ + + ut_ad(!strcmp(index->table->name.m_name, + "SYS_FOREIGN") + || !strcmp(index->table->name.m_name, + "SYS_FOREIGN_COLS")); + ut_ad(!comp); + ut_ad(col->mtype == DATA_VARCHAR); + + rec_max_size += (srv_page_size == UNIV_PAGE_SIZE_MAX) + ? REDUNDANT_REC_MAX_DATA_SIZE + : page_get_free_space_of_empty(FALSE) / 2; + } else if (field_max_size == NAME_LEN && i == 1 + && (!strcmp(index->table->name.m_name, + TABLE_STATS_NAME) + || !strcmp(index->table->name.m_name, + INDEX_STATS_NAME))) { + /* Interpret "table_name" as VARCHAR(199) even + if it was incorrectly defined as VARCHAR(64). + While the caller of ha_innobase enforces the + maximum length on any data written, the InnoDB + internal SQL parser will happily write as much + data as is provided. The purpose of this hack + is to avoid InnoDB hangs after persistent + statistics on partitioned tables are + deleted. */ + field_max_size = 199 * SYSTEM_CHARSET_MBMAXLEN; + } + field_ext_max_size = field_max_size < 256 ? 1 : 2; + + if (field->prefix_len + && field->prefix_len < field_max_size) { + field_max_size = field->prefix_len; + } + + if (comp) { + /* Add the extra size for ROW_FORMAT=COMPACT. + For ROW_FORMAT=REDUNDANT, these bytes were + added to rec_max_size before this loop. */ + rec_max_size += field_ext_max_size; + } + + rec_max_size += field_max_size; + } + + return rec_max_size; +} + +/** @return a B-tree search mode suitable for non-leaf pages +@param mode leaf page search mode */ +static inline page_cur_mode_t btr_cur_nonleaf_mode(page_cur_mode_t mode) +{ + if (mode > PAGE_CUR_GE) + { + ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE); + return mode; + } + if (mode == PAGE_CUR_GE) + return PAGE_CUR_L; + ut_ad(mode == PAGE_CUR_G); + return PAGE_CUR_LE; +} + +static MY_ATTRIBUTE((nonnull)) +/** Acquire a latch on the previous page without violating the latching order. 
+@param block index page +@param page_id page identifier with valid space identifier +@param zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param rw_latch the latch on block (RW_S_LATCH or RW_X_LATCH) +@param mtr mini-transaction +@param err error code +@retval 0 if an error occurred +@retval 1 if the page could be latched in the wrong order +@retval -1 if the latch on block was temporarily released */ +int btr_latch_prev(buf_block_t *block, page_id_t page_id, ulint zip_size, + rw_lock_type_t rw_latch, mtr_t *mtr, dberr_t *err) +{ + ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_X_LATCH); + ut_ad(page_id.space() == block->page.id().space()); + + const auto prev_savepoint= mtr->get_savepoint(); + ut_ad(block == mtr->at_savepoint(prev_savepoint - 1)); + + page_id.set_page_no(btr_page_get_prev(block->page.frame)); + buf_block_t *prev= buf_page_get_gen(page_id, zip_size, RW_NO_LATCH, nullptr, + BUF_GET, mtr, err, false); + if (UNIV_UNLIKELY(!prev)) + return 0; + + int ret= 1; + if (UNIV_UNLIKELY(rw_latch == RW_S_LATCH)) + { + if (UNIV_LIKELY(prev->page.lock.s_lock_try())) + { + mtr->lock_register(prev_savepoint, MTR_MEMO_PAGE_S_FIX); + goto prev_latched; + } + block->page.lock.s_unlock(); + } + else + { + if (UNIV_LIKELY(prev->page.lock.x_lock_try())) + { + mtr->lock_register(prev_savepoint, MTR_MEMO_PAGE_X_FIX); + goto prev_latched; + } + block->page.lock.x_unlock(); + } + + ret= -1; + mtr->lock_register(prev_savepoint - 1, MTR_MEMO_BUF_FIX); + mtr->rollback_to_savepoint(prev_savepoint); + prev= buf_page_get_gen(page_id, zip_size, rw_latch, prev, + BUF_GET, mtr, err, false); + if (UNIV_UNLIKELY(!prev)) + return 0; + mtr->upgrade_buffer_fix(prev_savepoint - 1, rw_latch); + + prev_latched: + if (memcmp_aligned<2>(FIL_PAGE_TYPE + prev->page.frame, + FIL_PAGE_TYPE + block->page.frame, 2) || + memcmp_aligned<2>(PAGE_HEADER + PAGE_INDEX_ID + prev->page.frame, + PAGE_HEADER + PAGE_INDEX_ID + block->page.frame, 8) || + page_is_comp(prev->page.frame) != page_is_comp(block->page.frame)) + { + ut_ad("corrupted" == 0); // FIXME: remove this + *err= DB_CORRUPTION; + ret= 0; + } + + return ret; +} + +dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode, + btr_latch_mode latch_mode, mtr_t *mtr) +{ + ut_ad(index()->is_btree() || index()->is_ibuf()); + ut_ad(!index()->is_ibuf() || ibuf_inside(mtr)); + + buf_block_t *guess; + btr_op_t btr_op; + btr_intention_t lock_intention; + bool detected_same_key_root= false; + + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs offsets2_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets2 = offsets2_; + rec_offs_init(offsets_); + rec_offs_init(offsets2_); + + ut_ad(dict_index_check_search_tuple(index(), tuple)); + ut_ad(dtuple_check_typed(tuple)); + ut_ad(index()->page != FIL_NULL); + + MEM_UNDEFINED(&up_match, sizeof up_match); + MEM_UNDEFINED(&up_bytes, sizeof up_bytes); + MEM_UNDEFINED(&low_match, sizeof low_match); + MEM_UNDEFINED(&low_bytes, sizeof low_bytes); + ut_d(up_match= ULINT_UNDEFINED); + ut_d(low_match= ULINT_UNDEFINED); + + ut_ad(!(latch_mode & BTR_ALREADY_S_LATCHED) || + mtr->memo_contains_flagged(&index()->lock, + MTR_MEMO_S_LOCK | MTR_MEMO_SX_LOCK | + MTR_MEMO_X_LOCK)); + + /* These flags are mutually exclusive, they are lumped together + with the latch mode for historical reasons. It's possible for + none of the flags to be set. 
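+ For example, a change-buffered insert into a secondary index may pass + latch_mode = BTR_MODIFY_LEAF | BTR_INSERT, which the switch below maps + to btr_op = BTR_INSERT_OP (or to BTR_INSERT_IGNORE_UNIQUE_OP when + BTR_IGNORE_SEC_UNIQUE is also set), while a plain BTR_SEARCH_LEAF + leaves btr_op = BTR_NO_OP.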
*/ + switch (UNIV_EXPECT(latch_mode & BTR_DELETE, 0)) { + default: + btr_op= BTR_NO_OP; + break; + case BTR_INSERT: + btr_op= (latch_mode & BTR_IGNORE_SEC_UNIQUE) + ? BTR_INSERT_IGNORE_UNIQUE_OP + : BTR_INSERT_OP; + break; + case BTR_DELETE: + btr_op= BTR_DELETE_OP; + ut_a(purge_node); + break; + case BTR_DELETE_MARK: + btr_op= BTR_DELMARK_OP; + break; + } + + /* Operations on the insert buffer tree cannot be buffered. */ + ut_ad(btr_op == BTR_NO_OP || !index()->is_ibuf()); + /* Operations on the clustered index cannot be buffered. */ + ut_ad(btr_op == BTR_NO_OP || !index()->is_clust()); + /* Operations on the temporary table(indexes) cannot be buffered. */ + ut_ad(btr_op == BTR_NO_OP || !index()->table->is_temporary()); + + const bool latch_by_caller= latch_mode & BTR_ALREADY_S_LATCHED; + lock_intention= btr_cur_get_and_clear_intention(&latch_mode); + latch_mode= BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode); + + ut_ad(!latch_by_caller + || latch_mode == BTR_SEARCH_LEAF + || latch_mode == BTR_MODIFY_LEAF + || latch_mode == BTR_MODIFY_TREE + || latch_mode == BTR_MODIFY_ROOT_AND_LEAF); + + flag= BTR_CUR_BINARY; +#ifndef BTR_CUR_ADAPT + guess= nullptr; +#else + btr_search_t *info= btr_search_get_info(index()); + guess= info->root_guess; + +# ifdef BTR_CUR_HASH_ADAPT +# ifdef UNIV_SEARCH_PERF_STAT + info->n_searches++; +# endif + bool ahi_enabled= btr_search_enabled && !index()->is_ibuf(); + /* We do a dirty read of btr_search_enabled below, + and btr_search_guess_on_hash() will have to check it again. */ + if (!ahi_enabled); + else if (btr_search_guess_on_hash(index(), info, tuple, mode, + latch_mode, this, mtr)) + { + /* Search using the hash index succeeded */ + ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_GE); + ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE); + ut_ad(low_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE); + ++btr_cur_n_sea; + + return DB_SUCCESS; + } + else + ++btr_cur_n_non_sea; +# endif +#endif + + /* If the hash search did not succeed, do binary search down the + tree */ + + /* Store the position of the tree latch we push to mtr so that we + know how to release it when we have latched leaf node(s) */ + + const ulint savepoint= mtr->get_savepoint(); + + ulint node_ptr_max_size= 0, compress_limit= 0; + rw_lock_type_t rw_latch= RW_S_LATCH; + + switch (latch_mode) { + case BTR_MODIFY_TREE: + rw_latch= RW_X_LATCH; + node_ptr_max_size= btr_node_ptr_max_size(index()); + if (latch_by_caller) + { + ut_ad(mtr->memo_contains_flagged(&index()->lock, MTR_MEMO_X_LOCK)); + break; + } + if (lock_intention == BTR_INTENTION_DELETE) + { + compress_limit= BTR_CUR_PAGE_COMPRESS_LIMIT(index()); + if (os_aio_pending_reads_approx() && + trx_sys.history_size_approx() > BTR_CUR_FINE_HISTORY_LENGTH) + { + /* Most delete-intended operations are due to the purge of history. + Prioritize them when the history list is growing huge. */ + mtr_x_lock_index(index(), mtr); + break; + } + } + mtr_sx_lock_index(index(), mtr); + break; +#ifdef UNIV_DEBUG + case BTR_CONT_MODIFY_TREE: + ut_ad("invalid mode" == 0); + break; +#endif + case BTR_MODIFY_ROOT_AND_LEAF: + rw_latch= RW_SX_LATCH; + /* fall through */ + default: + if (!latch_by_caller) + mtr_s_lock_index(index(), mtr); + } + + const ulint zip_size= index()->table->space->zip_size(); + + /* Start with the root page. 
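+ Each iteration of search_loop below latches one page: starting from + the root (index()->page), the cursor is positioned within the node + and the descent follows btr_node_ptr_get_child_page_no() until height + reaches 0, the leaf level.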
*/ + page_id_t page_id(index()->table->space_id, index()->page); + + const page_cur_mode_t page_mode= btr_cur_nonleaf_mode(mode); + ulint height= ULINT_UNDEFINED; + up_match= 0; + up_bytes= 0; + low_match= 0; + low_bytes= 0; + ulint buf_mode= BUF_GET; + search_loop: + dberr_t err; + auto block_savepoint= mtr->get_savepoint(); + buf_block_t *block= + buf_page_get_gen(page_id, zip_size, rw_latch, guess, buf_mode, mtr, + &err, height == 0 && !index()->is_clust()); + if (!block) + { + switch (err) { + case DB_DECRYPTION_FAILED: + btr_decryption_failed(*index()); + /* fall through */ + default: + func_exit: + if (UNIV_LIKELY_NULL(heap)) + mem_heap_free(heap); + return err; + case DB_SUCCESS: + /* This must be a search to perform an insert, delete mark, or delete; + try using the change buffer */ + ut_ad(height == 0); + ut_ad(thr); + break; + } + + switch (btr_op) { + default: + MY_ASSERT_UNREACHABLE(); + break; + case BTR_INSERT_OP: + case BTR_INSERT_IGNORE_UNIQUE_OP: + ut_ad(buf_mode == BUF_GET_IF_IN_POOL); + + if (ibuf_insert(IBUF_OP_INSERT, tuple, index(), page_id, zip_size, thr)) + { + flag= BTR_CUR_INSERT_TO_IBUF; + goto func_exit; + } + break; + + case BTR_DELMARK_OP: + ut_ad(buf_mode == BUF_GET_IF_IN_POOL); + + if (ibuf_insert(IBUF_OP_DELETE_MARK, tuple, + index(), page_id, zip_size, thr)) + { + flag = BTR_CUR_DEL_MARK_IBUF; + goto func_exit; + } + + break; + + case BTR_DELETE_OP: + ut_ad(buf_mode == BUF_GET_IF_IN_POOL_OR_WATCH); + auto& chain = buf_pool.page_hash.cell_get(page_id.fold()); + + if (!row_purge_poss_sec(purge_node, index(), tuple)) + /* The record cannot be purged yet. */ + flag= BTR_CUR_DELETE_REF; + else if (ibuf_insert(IBUF_OP_DELETE, tuple, index(), + page_id, zip_size, thr)) + /* The purge was buffered. */ + flag= BTR_CUR_DELETE_IBUF; + else + { + /* The purge could not be buffered. */ + buf_pool.watch_unset(page_id, chain); + break; + } + + buf_pool.watch_unset(page_id, chain); + goto func_exit; + } + + /* Change buffering did not succeed, we must read the page. */ + buf_mode= BUF_GET; + goto search_loop; + } + + if (!!page_is_comp(block->page.frame) != index()->table->not_redundant() || + btr_page_get_index_id(block->page.frame) != index()->id || + fil_page_get_type(block->page.frame) == FIL_PAGE_RTREE || + !fil_page_index_page_check(block->page.frame)) + { + corrupted: + ut_ad("corrupted" == 0); // FIXME: remove this + err= DB_CORRUPTION; + goto func_exit; + } + + page_cur.block= block; + ut_ad(block == mtr->at_savepoint(block_savepoint)); + ut_ad(rw_latch != RW_NO_LATCH); +#ifdef UNIV_ZIP_DEBUG + if (const page_zip_des_t *page_zip= buf_block_get_page_zip(block)) + ut_a(page_zip_validate(page_zip, block->page.frame, index())); +#endif /* UNIV_ZIP_DEBUG */ + + const uint32_t page_level= btr_page_get_level(block->page.frame); + + if (height == ULINT_UNDEFINED) + { + /* We are in the B-tree index root page. */ +#ifdef BTR_CUR_ADAPT + info->root_guess= block; +#endif + height= page_level; + tree_height= height + 1; + + if (!height) + { + /* The root page is also a leaf page. + We may have to reacquire the page latch in a different mode. 
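+ For example (assuming that the low bits of btr_latch_mode encode the + rw_lock_type_t, as the masking below suggests), a BTR_MODIFY_LEAF + search that latched a single-page tree with RW_S_LATCH must roll back + to the savepoint and restart with rw_latch = RW_X_LATCH; see relatch_x + below.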
*/ + switch (rw_latch) { + case RW_S_LATCH: + if ((latch_mode & ~12) != RW_S_LATCH) + { + ut_ad(rw_lock_type_t(latch_mode & ~12) == RW_X_LATCH); + goto relatch_x; + } + if (latch_mode != BTR_MODIFY_PREV) + { + if (!latch_by_caller) + /* Release the tree s-latch */ + mtr->rollback_to_savepoint(savepoint, savepoint + 1); + goto reached_latched_leaf; + } + /* fall through */ + case RW_SX_LATCH: + ut_ad(rw_latch == RW_S_LATCH || + latch_mode == BTR_MODIFY_ROOT_AND_LEAF); + relatch_x: + mtr->rollback_to_savepoint(block_savepoint); + height= ULINT_UNDEFINED; + rw_latch= RW_X_LATCH; + goto search_loop; + case RW_X_LATCH: + if (latch_mode == BTR_MODIFY_TREE) + goto reached_index_root_and_leaf; + goto reached_root_and_leaf; + case RW_NO_LATCH: + ut_ad(0); + } + goto reached_leaf; + } + } + else if (UNIV_UNLIKELY(height != page_level)) + goto corrupted; + else + switch (latch_mode) { + case BTR_MODIFY_TREE: + break; + case BTR_MODIFY_ROOT_AND_LEAF: + ut_ad((mtr->at_savepoint(block_savepoint - 1)->page.id().page_no() == + index()->page) == (tree_height <= height + 2)); + if (tree_height <= height + 2) + /* Retain the root page latch. */ + break; + /* fall through */ + default: + ut_ad(block_savepoint > savepoint); + mtr->rollback_to_savepoint(block_savepoint - 1, block_savepoint); + block_savepoint--; + } + + if (!height) + { + reached_leaf: + /* We reached the leaf level. */ + ut_ad(block == mtr->at_savepoint(block_savepoint)); + + if (latch_mode == BTR_MODIFY_ROOT_AND_LEAF) + { + reached_root_and_leaf: + if (!latch_by_caller) + mtr->rollback_to_savepoint(savepoint, savepoint + 1); + reached_index_root_and_leaf: + ut_ad(rw_latch == RW_X_LATCH); +#ifdef BTR_CUR_HASH_ADAPT + btr_search_drop_page_hash_index(block, true); +#endif + if (page_cur_search_with_match(tuple, mode, &up_match, &low_match, + &page_cur, nullptr)) + goto corrupted; + ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_GE); + ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE); + ut_ad(low_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE); + goto func_exit; + } + + switch (latch_mode) { + case BTR_SEARCH_PREV: + case BTR_MODIFY_PREV: + static_assert(BTR_MODIFY_PREV & BTR_MODIFY_LEAF, ""); + static_assert(BTR_SEARCH_PREV & BTR_SEARCH_LEAF, ""); + ut_ad(!latch_by_caller); + ut_ad(rw_latch == + rw_lock_type_t(latch_mode & (RW_X_LATCH | RW_S_LATCH))); + + /* latch also siblings from left to right */ + if (page_has_prev(block->page.frame) && + !btr_latch_prev(block, page_id, zip_size, rw_latch, mtr, &err)) + goto func_exit; + if (page_has_next(block->page.frame) && + !btr_block_get(*index(), btr_page_get_next(block->page.frame), + rw_latch, false, mtr, &err)) + goto func_exit; + goto release_tree; + case BTR_SEARCH_LEAF: + case BTR_MODIFY_LEAF: + if (!latch_by_caller) + { +release_tree: + /* Release the tree s-latch */ + block_savepoint--; + mtr->rollback_to_savepoint(savepoint, savepoint + 1); + } + /* release upper blocks */ + if (savepoint < block_savepoint) + mtr->rollback_to_savepoint(savepoint, block_savepoint); + break; + default: + ut_ad(latch_mode == BTR_MODIFY_TREE); + ut_ad(rw_latch == RW_X_LATCH); + /* x-latch also siblings from left to right */ + if (page_has_prev(block->page.frame) && + !btr_latch_prev(block, page_id, zip_size, rw_latch, mtr, &err)) + goto func_exit; + if (page_has_next(block->page.frame) && + !btr_block_get(*index(), btr_page_get_next(block->page.frame), + RW_X_LATCH, false, mtr, &err)) + goto func_exit; + if (btr_cur_need_opposite_intention(block->page, index()->is_clust(), + lock_intention, + 
node_ptr_max_size, compress_limit,
+ page_cur.rec))
+ goto need_opposite_intention;
+ }
+
+ reached_latched_leaf:
+#ifdef BTR_CUR_HASH_ADAPT
+ if (ahi_enabled && !(tuple->info_bits & REC_INFO_MIN_REC_FLAG))
+ {
+ if (page_cur_search_with_match_bytes(tuple, mode,
+ &up_match, &up_bytes,
+ &low_match, &low_bytes, &page_cur))
+ goto corrupted;
+ }
+ else
+#endif /* BTR_CUR_HASH_ADAPT */
+ if (page_cur_search_with_match(tuple, mode, &up_match, &low_match,
+ &page_cur, nullptr))
+ goto corrupted;
+
+ ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_GE);
+ ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE);
+ ut_ad(low_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE);
+
+#ifdef BTR_CUR_HASH_ADAPT
+ /* We do a dirty read of btr_search_enabled here. We will
+ properly check btr_search_enabled again in
+ btr_search_build_page_hash_index() before building a page hash
+ index, while holding search latch. */
+ if (!btr_search_enabled);
+ else if (tuple->info_bits & REC_INFO_MIN_REC_FLAG)
+ /* This may be a search tuple for btr_pcur_t::restore_position(). */
+ ut_ad(tuple->is_metadata() ||
+ (tuple->is_metadata(tuple->info_bits ^ REC_STATUS_INSTANT)));
+ else if (index()->table->is_temporary());
+ else if (!rec_is_metadata(page_cur.rec, *index()))
+ btr_search_info_update(index(), this);
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ goto func_exit;
+ }
+
+ guess= nullptr;
+ if (page_cur_search_with_match(tuple, page_mode, &up_match, &low_match,
+ &page_cur, nullptr))
+ goto corrupted;
+ offsets= rec_get_offsets(page_cur.rec, index(), offsets, 0, ULINT_UNDEFINED,
+ &heap);
+
+ ut_ad(block == mtr->at_savepoint(block_savepoint));
+
+ switch (latch_mode) {
+ default:
+ break;
+ case BTR_MODIFY_TREE:
+ if (btr_cur_need_opposite_intention(block->page, index()->is_clust(),
+ lock_intention,
+ node_ptr_max_size, compress_limit,
+ page_cur.rec))
+ /* If the rec is the first or last in the page for pessimistic
+ delete intention, it might cause node_ptr insert for the upper
+ level. We should change the intention and retry. */
+ need_opposite_intention:
+ return pessimistic_search_leaf(tuple, mode, mtr);
+
+ if (detected_same_key_root || lock_intention != BTR_INTENTION_BOTH ||
+ index()->is_unique() ||
+ (up_match <= rec_offs_n_fields(offsets) &&
+ low_match <= rec_offs_n_fields(offsets)))
+ break;
+
+ /* If the cursor is on the first or the last record of the page,
+ or on a record sharing its key value with one of them, another
+ page might be chosen by a BTR_CONT_MODIFY_TREE search. To avoid a
+ deadlock, the parent page must not be released here, so that a
+ concurrent search with the same key value will block.
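+ (Rough example: in a non-unique secondary index an entire page may
+ consist of records with one and the same key value; a concurrent
+ descent for that key could then legitimately land on a neighbouring
+ page, and holding the parent latch makes it wait instead of
+ latching pages in a conflicting order.)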
*/ + const rec_t *first= + page_rec_get_next_const(page_get_infimum_rec(block->page.frame)); + ulint matched_fields; + + if (UNIV_UNLIKELY(!first)) + goto corrupted; + if (page_cur.rec == first || + page_rec_is_last(page_cur.rec, block->page.frame)) + { + same_key_root: + detected_same_key_root= true; + break; + } + + matched_fields= 0; + offsets2= rec_get_offsets(first, index(), offsets2, 0, ULINT_UNDEFINED, + &heap); + cmp_rec_rec(page_cur.rec, first, offsets, offsets2, index(), false, + &matched_fields); + if (matched_fields >= rec_offs_n_fields(offsets) - 1) + goto same_key_root; + if (const rec_t* last= + page_rec_get_prev_const(page_get_supremum_rec(block->page.frame))) + { + matched_fields= 0; + offsets2= rec_get_offsets(last, index(), offsets2, 0, ULINT_UNDEFINED, + &heap); + cmp_rec_rec(page_cur.rec, last, offsets, offsets2, index(), false, + &matched_fields); + if (matched_fields >= rec_offs_n_fields(offsets) - 1) + goto same_key_root; + } + else + goto corrupted; + + /* Release the non-root parent page unless it may need to be modified. */ + if (tree_height > height + 1 && + !btr_cur_will_modify_tree(index(), block->page.frame, lock_intention, + page_cur.rec, node_ptr_max_size, + zip_size, mtr)) + { + mtr->rollback_to_savepoint(block_savepoint - 1, block_savepoint); + block_savepoint--; + } + } + + /* Go to the child node */ + page_id.set_page_no(btr_node_ptr_get_child_page_no(page_cur.rec, offsets)); + + if (!--height) + { + /* We are about to access the leaf level. */ + + switch (latch_mode) { + case BTR_MODIFY_ROOT_AND_LEAF: + rw_latch= RW_X_LATCH; + break; + case BTR_MODIFY_PREV: /* ibuf_insert() or btr_pcur_move_to_prev() */ + case BTR_SEARCH_PREV: /* btr_pcur_move_to_prev() */ + ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_X_LATCH); + + if (page_has_prev(block->page.frame) && + page_rec_is_first(page_cur.rec, block->page.frame)) + { + ut_ad(block_savepoint + 1 == mtr->get_savepoint()); + + /* Latch the previous page if the node pointer is the leftmost + of the current page. */ + int ret= btr_latch_prev(block, page_id, zip_size, rw_latch, mtr, &err); + if (!ret) + goto func_exit; + ut_ad(block_savepoint + 2 == mtr->get_savepoint()); + if (ret < 0) + { + /* While our latch on the level-2 page prevents splits or + merges of this level-1 block, other threads may have + modified it due to splitting or merging some level-0 (leaf) + pages underneath it. Thus, we must search again. */ + if (page_cur_search_with_match(tuple, page_mode, + &up_match, &low_match, + &page_cur, nullptr)) + goto corrupted; + offsets= rec_get_offsets(page_cur.rec, index(), offsets, 0, + ULINT_UNDEFINED, &heap); + page_id.set_page_no(btr_node_ptr_get_child_page_no(page_cur.rec, + offsets)); + } + } + rw_latch= rw_lock_type_t(latch_mode & (RW_X_LATCH | RW_S_LATCH)); + break; + case BTR_MODIFY_LEAF: + case BTR_SEARCH_LEAF: + rw_latch= rw_lock_type_t(latch_mode); + if (btr_op != BTR_NO_OP && !index()->is_ibuf() && + ibuf_should_try(index(), btr_op != BTR_INSERT_OP)) + /* Try to buffer the operation if the leaf page + is not in the buffer pool. */ + buf_mode= btr_op == BTR_DELETE_OP + ? 
BUF_GET_IF_IN_POOL_OR_WATCH
+ : BUF_GET_IF_IN_POOL;
+ break;
+ case BTR_MODIFY_TREE:
+ ut_ad(rw_latch == RW_X_LATCH);
+
+ if (lock_intention == BTR_INTENTION_INSERT &&
+ page_has_next(block->page.frame) &&
+ page_rec_is_last(page_cur.rec, block->page.frame))
+ {
+ /* btr_insert_into_right_sibling() might delete a node pointer
+ at the upper level */
+ mtr->rollback_to_savepoint(block_savepoint);
+ goto need_opposite_intention;
+ }
+ break;
+ default:
+ ut_ad(rw_latch == RW_X_LATCH);
+ }
+ }
+
+ goto search_loop;
+}
+
+ATTRIBUTE_COLD void mtr_t::index_lock_upgrade()
+{
+ auto &slot= m_memo[get_savepoint() - 1];
+ if (slot.type == MTR_MEMO_X_LOCK)
+ return;
+ ut_ad(slot.type == MTR_MEMO_SX_LOCK);
+ index_lock *lock= static_cast<index_lock*>(slot.object);
+ lock->u_x_upgrade(SRW_LOCK_CALL);
+ slot.type= MTR_MEMO_X_LOCK;
+}
+
+ATTRIBUTE_COLD
+dberr_t btr_cur_t::pessimistic_search_leaf(const dtuple_t *tuple,
+ page_cur_mode_t mode, mtr_t *mtr)
+{
+ ut_ad(index()->is_btree() || index()->is_ibuf());
+ ut_ad(!index()->is_ibuf() || ibuf_inside(mtr));
+
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets= offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(flag == BTR_CUR_BINARY);
+ ut_ad(dict_index_check_search_tuple(index(), tuple));
+ ut_ad(dtuple_check_typed(tuple));
+ buf_block_t *block= mtr->at_savepoint(1);
+ ut_ad(block->page.id().page_no() == index()->page);
+ block->page.fix();
+ mtr->rollback_to_savepoint(1);
+ mtr->index_lock_upgrade();
+
+ const page_cur_mode_t page_mode{btr_cur_nonleaf_mode(mode)};
+
+ mtr->page_lock(block, RW_X_LATCH);
+
+ up_match= 0;
+ up_bytes= 0;
+ low_match= 0;
+ low_bytes= 0;
+ ulint height= btr_page_get_level(block->page.frame);
+ tree_height= height + 1;
+ mem_heap_t *heap= nullptr;
+
+ search_loop:
+ dberr_t err;
+ page_cur.block= block;
+
+ if (UNIV_UNLIKELY(!height))
+ {
+ if (page_cur_search_with_match(tuple, mode, &up_match, &low_match,
+ &page_cur, nullptr))
+ corrupted:
+ err= DB_CORRUPTION;
+ else
+ {
+ ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_GE);
+ ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE);
+ ut_ad(low_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE);
+
+#ifdef BTR_CUR_HASH_ADAPT
+ /* We do a dirty read of btr_search_enabled here. We will
+ properly check btr_search_enabled again in
+ btr_search_build_page_hash_index() before building a page hash
+ index, while holding search latch. */
+ if (!btr_search_enabled);
+ else if (tuple->info_bits & REC_INFO_MIN_REC_FLAG)
+ /* This may be a search tuple for btr_pcur_t::restore_position().
*/ + ut_ad(tuple->is_metadata() || + (tuple->is_metadata(tuple->info_bits ^ REC_STATUS_INSTANT))); + else if (index()->table->is_temporary()); + else if (!rec_is_metadata(page_cur.rec, *index())) + btr_search_info_update(index(), this); +#endif /* BTR_CUR_HASH_ADAPT */ + err= DB_SUCCESS; + } + + func_exit: + if (UNIV_LIKELY_NULL(heap)) + mem_heap_free(heap); + return err; + } + + if (page_cur_search_with_match(tuple, page_mode, &up_match, &low_match, + &page_cur, nullptr)) + goto corrupted; + + page_id_t page_id{block->page.id()}; + + offsets= rec_get_offsets(page_cur.rec, index(), offsets, 0, ULINT_UNDEFINED, + &heap); + /* Go to the child node */ + page_id.set_page_no(btr_node_ptr_get_child_page_no(page_cur.rec, offsets)); + + block= + buf_page_get_gen(page_id, block->zip_size(), RW_X_LATCH, nullptr, BUF_GET, + mtr, &err, !--height && !index()->is_clust()); + + if (!block) + { + if (err == DB_DECRYPTION_FAILED) + btr_decryption_failed(*index()); + goto func_exit; + } + + if (!!page_is_comp(block->page.frame) != index()->table->not_redundant() || + btr_page_get_index_id(block->page.frame) != index()->id || + fil_page_get_type(block->page.frame) == FIL_PAGE_RTREE || + !fil_page_index_page_check(block->page.frame)) + goto corrupted; + + if (height != btr_page_get_level(block->page.frame)) + goto corrupted; + +#ifdef UNIV_ZIP_DEBUG + const page_zip_des_t *page_zip= buf_block_get_page_zip(block); + ut_a(!page_zip || page_zip_validate(page_zip, block->page.frame, index())); +#endif /* UNIV_ZIP_DEBUG */ + + if (page_has_prev(block->page.frame) && + !btr_latch_prev(block, page_id, block->zip_size(), + RW_X_LATCH, mtr, &err)) + goto func_exit; + if (page_has_next(block->page.frame) && + !btr_block_get(*index(), btr_page_get_next(block->page.frame), + RW_X_LATCH, false, mtr, &err)) + goto func_exit; + goto search_loop; +} + +/********************************************************************//** +Searches an index tree and positions a tree cursor on a given non-leaf level. +NOTE: n_fields_cmp in tuple must be set so that it cannot be compared +to node pointer page number fields on the upper levels of the tree! +cursor->up_match and cursor->low_match both will have sensible values. +Cursor is left at the place where an insert of the +search tuple should be performed in the B-tree. InnoDB does an insert +immediately after the cursor. Thus, the cursor may end up on a user record, +or on a page infimum record. +@param level the tree level of search +@param tuple data tuple; NOTE: n_fields_cmp in tuple must be set so that + it cannot get compared to the node ptr page number field! +@param latch RW_S_LATCH or RW_X_LATCH +@param cursor tree cursor; the cursor page is s- or x-latched, but see also + above! +@param mtr mini-transaction +@return DB_SUCCESS on success or error code otherwise */ +TRANSACTIONAL_TARGET +dberr_t btr_cur_search_to_nth_level(ulint level, + const dtuple_t *tuple, + rw_lock_type_t rw_latch, + btr_cur_t *cursor, mtr_t *mtr) +{ + dict_index_t *const index= cursor->index(); + + ut_ad(index->is_btree() || index->is_ibuf()); + mem_heap_t *heap= nullptr; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs *offsets= offsets_; + rec_offs_init(offsets_); + ut_ad(level); + ut_ad(dict_index_check_search_tuple(index, tuple)); + ut_ad(index->is_ibuf() ? 
ibuf_inside(mtr) : index->is_btree()); + ut_ad(dtuple_check_typed(tuple)); + ut_ad(index->page != FIL_NULL); + + MEM_UNDEFINED(&cursor->up_bytes, sizeof cursor->up_bytes); + MEM_UNDEFINED(&cursor->low_bytes, sizeof cursor->low_bytes); + cursor->up_match= 0; + cursor->low_match= 0; + cursor->flag= BTR_CUR_BINARY; + +#ifndef BTR_CUR_ADAPT + buf_block_t *block= nullptr; +#else + btr_search_t *info= btr_search_get_info(index); + buf_block_t *block= info->root_guess; +#endif /* BTR_CUR_ADAPT */ + + ut_ad(mtr->memo_contains_flagged(&index->lock, + MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)); + + const ulint zip_size= index->table->space->zip_size(); + + /* Start with the root page. */ + page_id_t page_id(index->table->space_id, index->page); + ulint height= ULINT_UNDEFINED; + +search_loop: + dberr_t err= DB_SUCCESS; + if (buf_block_t *b= + mtr->get_already_latched(page_id, mtr_memo_type_t(rw_latch))) + block= b; + else if (!(block= buf_page_get_gen(page_id, zip_size, rw_latch, + block, BUF_GET, mtr, &err))) + { + if (err == DB_DECRYPTION_FAILED) + btr_decryption_failed(*index); + goto func_exit; + } + +#ifdef UNIV_ZIP_DEBUG + if (const page_zip_des_t *page_zip= buf_block_get_page_zip(block)) + ut_a(page_zip_validate(page_zip, block->page.frame, index)); +#endif /* UNIV_ZIP_DEBUG */ + + if (!!page_is_comp(block->page.frame) != index->table->not_redundant() || + btr_page_get_index_id(block->page.frame) != index->id || + fil_page_get_type(block->page.frame) == FIL_PAGE_RTREE || + !fil_page_index_page_check(block->page.frame)) + { + corrupted: + err= DB_CORRUPTION; + func_exit: + if (UNIV_LIKELY_NULL(heap)) + mem_heap_free(heap); + return err; + } + + const uint32_t page_level= btr_page_get_level(block->page.frame); + + if (height == ULINT_UNDEFINED) + { + /* We are in the root node */ + height= page_level; + if (!height) + goto corrupted; + cursor->tree_height= height + 1; + } + else if (height != ulint{page_level}) + goto corrupted; + + cursor->page_cur.block= block; + + /* Search for complete index fields. */ + if (page_cur_search_with_match(tuple, PAGE_CUR_LE, &cursor->up_match, + &cursor->low_match, &cursor->page_cur, + nullptr)) + goto corrupted; + + /* If this is the desired level, leave the loop */ + if (level == height) + goto func_exit; + + ut_ad(height > level); + height--; + + offsets = rec_get_offsets(cursor->page_cur.rec, index, offsets, 0, + ULINT_UNDEFINED, &heap); + /* Go to the child node */ + page_id.set_page_no(btr_node_ptr_get_child_page_no(cursor->page_cur.rec, + offsets)); + block= nullptr; + goto search_loop; +} + +dberr_t btr_cur_t::open_leaf(bool first, dict_index_t *index, + btr_latch_mode latch_mode, mtr_t *mtr) +{ + ulint n_blocks= 0; + mem_heap_t *heap= nullptr; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs *offsets= offsets_; + dberr_t err; + + rec_offs_init(offsets_); + + const bool latch_by_caller= latch_mode & BTR_ALREADY_S_LATCHED; + latch_mode= btr_latch_mode(latch_mode & ~BTR_ALREADY_S_LATCHED); + + btr_intention_t lock_intention= btr_cur_get_and_clear_intention(&latch_mode); + + /* Store the position of the tree latch we push to mtr so that we + know how to release it when we have latched the leaf node */ + + auto savepoint= mtr->get_savepoint(); + + rw_lock_type_t upper_rw_latch= RW_X_LATCH; + ulint node_ptr_max_size= 0, compress_limit= 0; + + if (latch_mode == BTR_MODIFY_TREE) + { + node_ptr_max_size= btr_node_ptr_max_size(index); + /* Most of delete-intended operations are purging. 
Free blocks + and read IO bandwidth should be prioritized for them, when the + history list is growing huge. */ + savepoint++; + if (lock_intention == BTR_INTENTION_DELETE) + { + compress_limit= BTR_CUR_PAGE_COMPRESS_LIMIT(index); + + if (os_aio_pending_reads_approx() && + trx_sys.history_size_approx() > BTR_CUR_FINE_HISTORY_LENGTH) + { + mtr_x_lock_index(index, mtr); + goto index_locked; + } + } + mtr_sx_lock_index(index, mtr); + } + else + { + static_assert(int{BTR_CONT_MODIFY_TREE} == (12 | BTR_MODIFY_LEAF), ""); + ut_ad(!(latch_mode & 8)); + /* This function doesn't need to lock left page of the leaf page */ + static_assert(int{BTR_SEARCH_PREV} == (4 | BTR_SEARCH_LEAF), ""); + static_assert(int{BTR_MODIFY_PREV} == (4 | BTR_MODIFY_LEAF), ""); + latch_mode= btr_latch_mode(latch_mode & ~4); + ut_ad(!latch_by_caller || + mtr->memo_contains_flagged(&index->lock, + MTR_MEMO_SX_LOCK | MTR_MEMO_S_LOCK)); + upper_rw_latch= RW_S_LATCH; + if (!latch_by_caller) + { + savepoint++; + mtr_s_lock_index(index, mtr); + } + } + +index_locked: + ut_ad(savepoint == mtr->get_savepoint()); + + const rw_lock_type_t root_leaf_rw_latch= + rw_lock_type_t(latch_mode & (RW_S_LATCH | RW_X_LATCH)); + + page_cur.index = index; + + uint32_t page= index->page; + const auto zip_size= index->table->space->zip_size(); + + for (ulint height= ULINT_UNDEFINED;;) + { + ut_ad(n_blocks < BTR_MAX_LEVELS); + ut_ad(savepoint + n_blocks == mtr->get_savepoint()); + + buf_block_t* block= + btr_block_get(*index, page, + height ? upper_rw_latch : root_leaf_rw_latch, + !height, mtr, &err); + ut_ad(!block == (err != DB_SUCCESS)); + + if (!block) + { + if (err == DB_DECRYPTION_FAILED) + btr_decryption_failed(*index); + break; + } + + if (first) + page_cur_set_before_first(block, &page_cur); + else + page_cur_set_after_last(block, &page_cur); + + const uint32_t l= btr_page_get_level(block->page.frame); + + if (height == ULINT_UNDEFINED) + { + /* We are in the root node */ + height= l; + if (height); + else if (upper_rw_latch != root_leaf_rw_latch) + { + /* We should retry to get the page, because the root page + is latched with different level as a leaf page. */ + ut_ad(n_blocks == 0); + ut_ad(root_leaf_rw_latch != RW_NO_LATCH); + upper_rw_latch= root_leaf_rw_latch; + mtr->rollback_to_savepoint(savepoint); + height= ULINT_UNDEFINED; + continue; + } + else + { + reached_leaf: + const auto leaf_savepoint= mtr->get_savepoint(); + ut_ad(leaf_savepoint); + ut_ad(block == mtr->at_savepoint(leaf_savepoint - 1)); + + if (latch_mode == BTR_MODIFY_TREE) + { + /* x-latch also siblings from left to right */ + if (page_has_prev(block->page.frame) && + !btr_latch_prev(block, block->page.id(), zip_size, RW_X_LATCH, + mtr, &err)) + break; + if (page_has_next(block->page.frame) && + !btr_block_get(*index, btr_page_get_next(block->page.frame), + RW_X_LATCH, false, mtr, &err)) + break; + + if (!index->lock.have_x() && + btr_cur_need_opposite_intention(block->page, index->is_clust(), + lock_intention, + node_ptr_max_size, + compress_limit, page_cur.rec)) + goto need_opposite_intention; + } + else + { + if (latch_mode != BTR_CONT_MODIFY_TREE) + { + ut_ad(latch_mode == BTR_MODIFY_LEAF || + latch_mode == BTR_SEARCH_LEAF); + /* Release index->lock if needed, and the non-leaf pages. */ + mtr->rollback_to_savepoint(savepoint - !latch_by_caller, + leaf_savepoint - 1); + } + } + break; + } + } + else if (UNIV_UNLIKELY(height != l)) + { + corrupted: + err= DB_CORRUPTION; + break; + } + + if (!height) + goto reached_leaf; + + height--; + + if (first + ? 
!page_cur_move_to_next(&page_cur) + : !page_cur_move_to_prev(&page_cur)) + goto corrupted; + + offsets= rec_get_offsets(page_cur.rec, index, offsets, 0, ULINT_UNDEFINED, + &heap); + + ut_ad(latch_mode != BTR_MODIFY_TREE || upper_rw_latch == RW_X_LATCH); + + if (latch_mode != BTR_MODIFY_TREE); + else if (btr_cur_need_opposite_intention(block->page, index->is_clust(), + lock_intention, + node_ptr_max_size, compress_limit, + page_cur.rec)) + { + need_opposite_intention: + /* If the rec is the first or last in the page for pessimistic + delete intention, it might cause node_ptr insert for the upper + level. We should change the intention and retry. */ + + mtr->rollback_to_savepoint(savepoint); + mtr->index_lock_upgrade(); + /* X-latch all pages from now on */ + latch_mode= BTR_CONT_MODIFY_TREE; + page= index->page; + height= ULINT_UNDEFINED; + n_blocks= 0; + continue; + } + else + { + if (!btr_cur_will_modify_tree(index, block->page.frame, + lock_intention, page_cur.rec, + node_ptr_max_size, zip_size, mtr)) + { + ut_ad(n_blocks); + /* release buffer-fixes on pages that will not be modified + (except the root) */ + if (n_blocks > 1) + { + mtr->rollback_to_savepoint(savepoint + 1, savepoint + n_blocks - 1); + n_blocks= 1; + } + } + } + + /* Go to the child node */ + page= btr_node_ptr_get_child_page_no(page_cur.rec, offsets); + n_blocks++; + } + + if (UNIV_LIKELY_NULL(heap)) + mem_heap_free(heap); + + return err; +} + +/*==================== B-TREE INSERT =========================*/ + +/*************************************************************//** +Inserts a record if there is enough space, or if enough space can +be freed by reorganizing. Differs from btr_cur_optimistic_insert because +no heuristics is applied to whether it pays to use CPU time for +reorganizing the page or not. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return pointer to inserted record if succeed, else NULL */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +rec_t* +btr_cur_insert_if_possible( +/*=======================*/ + btr_cur_t* cursor, /*!< in: cursor on page after which to insert; + cursor stays valid */ + const dtuple_t* tuple, /*!< in: tuple to insert; the size info need not + have been stored to tuple */ + rec_offs** offsets,/*!< out: offsets on *rec */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ + ulint n_ext, /*!< in: number of externally stored columns */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + page_cur_t* page_cursor; + rec_t* rec; + + ut_ad(dtuple_check_typed(tuple)); + + ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor), + MTR_MEMO_PAGE_X_FIX)); + page_cursor = btr_cur_get_page_cur(cursor); + + /* Now, try the insert */ + rec = page_cur_tuple_insert(page_cursor, tuple, offsets, heap, n_ext, + mtr); + + /* If the record did not fit, reorganize. + For compressed pages, page_cur_tuple_insert() + attempted this already. */ + if (!rec && !page_cur_get_page_zip(page_cursor) + && btr_page_reorganize(page_cursor, mtr) == DB_SUCCESS) { + rec = page_cur_tuple_insert(page_cursor, tuple, offsets, heap, + n_ext, mtr); + } + + ut_ad(!rec || rec_offs_validate(rec, page_cursor->index, *offsets)); + return(rec); +} + +/*************************************************************//** +For an insert, checks the locks and does the undo logging if desired. 
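+When flags contains both BTR_NO_UNDO_LOG_FLAG and BTR_KEEP_SYS_FLAG,
+the first check in the function body returns DB_SUCCESS immediately,
+since neither lock checking nor undo logging is wanted in that case.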
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_FAIL, or error number */
+UNIV_INLINE MY_ATTRIBUTE((warn_unused_result, nonnull(2,3,5,6)))
+dberr_t
+btr_cur_ins_lock_and_undo(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags: if
+ not zero, the parameters index and thr
+ should be specified */
+ btr_cur_t* cursor, /*!< in: cursor on page after which to insert */
+ dtuple_t* entry, /*!< in/out: entry to insert */
+ que_thr_t* thr, /*!< in: query thread or NULL */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ bool* inherit)/*!< out: true if the inserted new record maybe
+ should inherit LOCK_GAP type locks from the
+ successor record */
+{
+ if (!(~flags & (BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG))) {
+ return DB_SUCCESS;
+ }
+
+ /* Check if we have to wait for a lock: enqueue an explicit lock
+ request if yes */
+
+ rec_t* rec = btr_cur_get_rec(cursor);
+ dict_index_t* index = cursor->index();
+
+ ut_ad(!dict_index_is_online_ddl(index)
+ || dict_index_is_clust(index)
+ || (flags & BTR_CREATE_FLAG));
+ ut_ad((flags & BTR_NO_UNDO_LOG_FLAG)
+ || !index->table->skip_alter_undo);
+
+ ut_ad(mtr->is_named_space(index->table->space));
+
+ /* Check if there is predicate or GAP lock preventing the insertion */
+ if (!(flags & BTR_NO_LOCKING_FLAG)) {
+ const unsigned type = index->type;
+ if (UNIV_UNLIKELY(type & DICT_SPATIAL)) {
+ lock_prdt_t prdt;
+ rtr_mbr_t mbr;
+
+ rtr_get_mbr_from_tuple(entry, &mbr);
+
+ /* Use on stack MBR variable to test if a lock is
+ needed. If so, the predicate (MBR) will be allocated
+ from lock heap in lock_prdt_insert_check_and_lock() */
+ lock_init_prdt_from_mbr(&prdt, &mbr, 0, nullptr);
+
+ if (dberr_t err = lock_prdt_insert_check_and_lock(
+ rec, btr_cur_get_block(cursor),
+ index, thr, mtr, &prdt)) {
+ return err;
+ }
+ *inherit = false;
+ } else {
+ ut_ad(!dict_index_is_online_ddl(index)
+ || index->is_primary()
+ || (flags & BTR_CREATE_FLAG));
+#ifdef WITH_WSREP
+ trx_t* trx= thr_get_trx(thr);
+ /* If a transaction scanning a unique secondary
+ key is a wsrep high priority (brute force)
+ thread, the scan may involve GAP-locking in
+ the index. Because such locking also happens
+ when applying replication events in high
+ priority applier threads, lock conflicts
+ between two wsrep high priority threads are
+ possible. To avoid this GAP-locking, mark
+ here that the transaction is using a unique
+ key scan.
*/
+ if ((type & (DICT_CLUSTERED | DICT_UNIQUE)) == DICT_UNIQUE
+ && trx->is_wsrep()
+ && wsrep_thd_is_BF(trx->mysql_thd, false)) {
+ trx->wsrep = 3;
+ }
+#endif /* WITH_WSREP */
+ if (dberr_t err = lock_rec_insert_check_and_lock(
+ rec, btr_cur_get_block(cursor),
+ index, thr, mtr, inherit)) {
+ return err;
+ }
+ }
+ }
+
+ if (!index->is_primary() || !page_is_leaf(page_align(rec))) {
+ return DB_SUCCESS;
+ }
+
+ constexpr roll_ptr_t dummy_roll_ptr = roll_ptr_t{1}
+ << ROLL_PTR_INSERT_FLAG_POS;
+ roll_ptr_t roll_ptr = dummy_roll_ptr;
+
+ if (!(flags & BTR_NO_UNDO_LOG_FLAG)) {
+ if (dberr_t err = trx_undo_report_row_operation(
+ thr, index, entry, NULL, 0, NULL, NULL,
+ &roll_ptr)) {
+ return err;
+ }
+
+ if (roll_ptr != dummy_roll_ptr) {
+ dfield_t* r = dtuple_get_nth_field(entry,
+ index->db_trx_id());
+ trx_write_trx_id(static_cast<byte*>(r->data),
+ thr_get_trx(thr)->id);
+ }
+ }
+
+ if (!(flags & BTR_KEEP_SYS_FLAG)) {
+ dfield_t* r = dtuple_get_nth_field(
+ entry, index->db_roll_ptr());
+ ut_ad(r->len == DATA_ROLL_PTR_LEN);
+ trx_write_roll_ptr(static_cast<byte*>(r->data), roll_ptr);
+ }
+
+ return DB_SUCCESS;
+}
+
+/**
+Prefetch siblings of the leaf for the pessimistic operation.
+@param block leaf page
+@param index index of the page */
+static void btr_cur_prefetch_siblings(const buf_block_t *block,
+ const dict_index_t *index)
+{
+ ut_ad(page_is_leaf(block->page.frame));
+
+ if (index->is_ibuf())
+ return;
+
+ const page_t *page= block->page.frame;
+ uint32_t prev= mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_PREV));
+ uint32_t next= mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_NEXT));
+
+ fil_space_t *space= index->table->space;
+
+ if (prev == FIL_NULL);
+ else if (space->acquire())
+ buf_read_page_background(space, page_id_t(space->id, prev),
+ block->zip_size());
+ if (next == FIL_NULL);
+ else if (space->acquire())
+ buf_read_page_background(space, page_id_t(space->id, next),
+ block->zip_size());
+}
+
+/*************************************************************//**
+Tries to perform an insert to a page in an index tree, next to cursor.
+It is assumed that mtr holds an x-latch on the page. The operation does
+not succeed if there is too little space on the page. If there is just
+one record on the page, the insert will always succeed; this is to
+prevent trying to split a page with just one record.
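+Roughly, the insert is attempted only when the converted record size
+fits in page_get_max_insert_size_after_reorganize(page, 1); on an
+uncompressed clustered index leaf, dict_index_get_space_reserve()
+bytes are additionally kept free for future updates, so the operation
+may fall back to a page split slightly before the page is full.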
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_FAIL, or error number */ +dberr_t +btr_cur_optimistic_insert( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags: if not + zero, the parameters index and thr should be + specified */ + btr_cur_t* cursor, /*!< in: cursor on page after which to insert; + cursor stays valid */ + rec_offs** offsets,/*!< out: offsets on *rec */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap */ + dtuple_t* entry, /*!< in/out: entry to insert */ + rec_t** rec, /*!< out: pointer to inserted record if + succeed */ + big_rec_t** big_rec,/*!< out: big rec vector whose fields have to + be stored externally by the caller */ + ulint n_ext, /*!< in: number of externally stored columns */ + que_thr_t* thr, /*!< in/out: query thread; can be NULL if + !(~flags + & (BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG)) */ + mtr_t* mtr) /*!< in/out: mini-transaction; + if this function returns DB_SUCCESS on + a leaf page of a secondary index in a + compressed tablespace, the caller must + mtr_commit(mtr) before latching + any further pages */ +{ + big_rec_t* big_rec_vec = NULL; + dict_index_t* index; + page_cur_t* page_cursor; + buf_block_t* block; + page_t* page; + rec_t* dummy; + bool leaf; + bool reorg __attribute__((unused)); + bool inherit = true; + ulint rec_size; + dberr_t err; + + ut_ad(thr || !(~flags & (BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG))); + *big_rec = NULL; + + block = btr_cur_get_block(cursor); + page = buf_block_get_frame(block); + index = cursor->index(); + + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(!dict_index_is_online_ddl(index) + || dict_index_is_clust(index) + || (flags & BTR_CREATE_FLAG)); + ut_ad(dtuple_check_typed(entry)); + +#ifdef HAVE_valgrind + if (block->page.zip.data) { + MEM_CHECK_DEFINED(page, srv_page_size); + MEM_CHECK_DEFINED(block->page.zip.data, block->zip_size()); + } +#endif /* HAVE_valgrind */ + + leaf = page_is_leaf(page); + + if (UNIV_UNLIKELY(entry->is_alter_metadata())) { + ut_ad(leaf); + goto convert_big_rec; + } + + /* Calculate the record size when entry is converted to a record */ + rec_size = rec_get_converted_size(index, entry, n_ext); + + if (page_zip_rec_needs_ext(rec_size, page_is_comp(page), + dtuple_get_n_fields(entry), + block->zip_size())) { +convert_big_rec: + /* The record is so big that we have to store some fields + externally on separate database pages */ + big_rec_vec = dtuple_convert_big_rec(index, 0, entry, &n_ext); + + if (UNIV_UNLIKELY(big_rec_vec == NULL)) { + + return(DB_TOO_BIG_RECORD); + } + + rec_size = rec_get_converted_size(index, entry, n_ext); + } + + if (block->page.zip.data && page_zip_is_too_big(index, entry)) { + if (big_rec_vec != NULL) { + dtuple_convert_back_big_rec(index, entry, big_rec_vec); + } + + return(DB_TOO_BIG_RECORD); + } + + LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page), goto fail); + + if (block->page.zip.data && leaf + && (page_get_data_size(page) + rec_size + >= dict_index_zip_pad_optimal_page_size(index))) { + /* If compression padding tells us that insertion will + result in too packed up page i.e.: which is likely to + cause compression failure then don't do an optimistic + insertion. */ +fail: + err = DB_FAIL; + + /* prefetch siblings of the leaf for the pessimistic + operation, if the page is leaf. 
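+ (The pessimistic path may have to split or merge the page and will
+ then latch the siblings as well; btr_cur_prefetch_siblings() posts
+ background reads of FIL_PAGE_PREV and FIL_PAGE_NEXT so that those
+ pages are likely in the buffer pool by then.)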
*/
+ if (leaf) {
+ btr_cur_prefetch_siblings(block, index);
+ }
+fail_err:
+
+ if (big_rec_vec) {
+ dtuple_convert_back_big_rec(index, entry, big_rec_vec);
+ }
+
+ return(err);
+ }
+
+ ulint max_size = page_get_max_insert_size_after_reorganize(page, 1);
+ if (max_size < rec_size) {
+ goto fail;
+ }
+
+ const ulint n_recs = page_get_n_recs(page);
+ if (UNIV_UNLIKELY(n_recs >= 8189)) {
+ ut_ad(srv_page_size == 65536);
+ goto fail;
+ }
+
+ if (page_has_garbage(page)) {
+ if (max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT
+ && n_recs > 1
+ && page_get_max_insert_size(page, 1) < rec_size) {
+
+ goto fail;
+ }
+ }
+
+ /* If there have been many consecutive inserts to the
+ clustered index leaf page of an uncompressed table, check if
+ we have to split the page to reserve enough free space for
+ future updates of records. */
+
+ if (leaf && !block->page.zip.data && dict_index_is_clust(index)
+ && page_get_n_recs(page) >= 2
+ && dict_index_get_space_reserve() + rec_size > max_size
+ && (btr_page_get_split_rec_to_right(cursor, &dummy)
+ || btr_page_get_split_rec_to_left(cursor))) {
+ goto fail;
+ }
+
+ page_cursor = btr_cur_get_page_cur(cursor);
+
+ DBUG_LOG("ib_cur",
+ "insert " << index->name << " (" << index->id << ") by "
+ << ib::hex(thr ? thr->graph->trx->id : 0)
+ << ' ' << rec_printer(entry).str());
+ DBUG_EXECUTE_IF("do_page_reorganize",
+ ut_a(!n_recs || btr_page_reorganize(page_cursor, mtr)
+ == DB_SUCCESS););
+
+ /* Now, try the insert */
+ {
+ const rec_t* page_cursor_rec = page_cur_get_rec(page_cursor);
+
+ /* Check locks and write to the undo log,
+ if specified */
+ err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
+ thr, mtr, &inherit);
+ if (err != DB_SUCCESS) {
+ goto fail_err;
+ }
+
+#ifdef UNIV_DEBUG
+ if (!(flags & BTR_CREATE_FLAG)
+ && leaf && index->is_primary()) {
+ const dfield_t* trx_id = dtuple_get_nth_field(
+ entry, dict_col_get_clust_pos(
+ dict_table_get_sys_col(index->table,
+ DATA_TRX_ID),
+ index));
+
+ ut_ad(trx_id->len == DATA_TRX_ID_LEN);
+ ut_ad(trx_id[1].len == DATA_ROLL_PTR_LEN);
+ ut_ad(*static_cast<const byte*>
+ (trx_id[1].data) & 0x80);
+ if (flags & BTR_NO_UNDO_LOG_FLAG) {
+ ut_ad(!memcmp(trx_id->data, reset_trx_id,
+ DATA_TRX_ID_LEN));
+ } else {
+ ut_ad(thr->graph->trx->id);
+ ut_ad(thr->graph->trx->bulk_insert
+ || thr->graph->trx->id
+ == trx_read_trx_id(
+ static_cast<const byte*>(
+ trx_id->data))
+ || index->table->is_temporary());
+ }
+ }
+#endif
+
+ *rec = page_cur_tuple_insert(page_cursor, entry, offsets, heap,
+ n_ext, mtr);
+
+ reorg = page_cursor_rec != page_cur_get_rec(page_cursor);
+ }
+
+ if (*rec) {
+ } else if (block->page.zip.data) {
+ ut_ad(!index->table->is_temporary());
+ /* Reset the IBUF_BITMAP_FREE bits, because
+ page_cur_tuple_insert() will have attempted page
+ reorganize before failing.
*/ + if (leaf + && !dict_index_is_clust(index)) { + ibuf_reset_free_bits(block); + } + + goto fail; + } else { + ut_ad(!reorg); + reorg = true; + + /* If the record did not fit, reorganize */ + err = btr_page_reorganize(page_cursor, mtr); + if (err != DB_SUCCESS + || page_get_max_insert_size(page, 1) != max_size + || !(*rec = page_cur_tuple_insert(page_cursor, entry, + offsets, heap, n_ext, + mtr))) { + err = DB_CORRUPTION; + goto fail_err; + } + } + +#ifdef BTR_CUR_HASH_ADAPT + if (!leaf) { + } else if (entry->info_bits & REC_INFO_MIN_REC_FLAG) { + ut_ad(entry->is_metadata()); + ut_ad(index->is_instant()); + ut_ad(flags == BTR_NO_LOCKING_FLAG); + } else if (index->table->is_temporary()) { + } else { + srw_spin_lock* ahi_latch = btr_search_sys.get_latch(*index); + if (!reorg && cursor->flag == BTR_CUR_HASH) { + btr_search_update_hash_node_on_insert( + cursor, ahi_latch); + } else { + btr_search_update_hash_on_insert(cursor, ahi_latch); + } + } +#endif /* BTR_CUR_HASH_ADAPT */ + + if (!(flags & BTR_NO_LOCKING_FLAG) && inherit) { + + lock_update_insert(block, *rec); + } + + if (leaf + && !dict_index_is_clust(index) + && !index->table->is_temporary()) { + /* Update the free bits of the B-tree page in the + insert buffer bitmap. */ + + /* The free bits in the insert buffer bitmap must + never exceed the free space on a page. It is safe to + decrement or reset the bits in the bitmap in a + mini-transaction that is committed before the + mini-transaction that affects the free space. */ + + /* It is unsafe to increment the bits in a separately + committed mini-transaction, because in crash recovery, + the free bits could momentarily be set too high. */ + + if (block->page.zip.data) { + /* Update the bits in the same mini-transaction. */ + ibuf_update_free_bits_zip(block, mtr); + } else { + /* Decrement the bits in a separate + mini-transaction. */ + ibuf_update_free_bits_if_full( + block, max_size, + rec_size + PAGE_DIR_SLOT_SIZE); + } + } + + *big_rec = big_rec_vec; + + return(DB_SUCCESS); +} + +/*************************************************************//** +Performs an insert on a page of an index tree. It is assumed that mtr +holds an x-latch on the tree and on the cursor page. If the insert is +made on the leaf level, to avoid deadlocks, mtr must also own x-latches +to brothers of page, if those brothers exist. 
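+Before attempting the split, the function reserves free extents for
+the file segments of the index tree, about tree_height / 16 + 3 of
+them via fsp_reserve_free_extents(), and releases the reservation
+again on exit; see the code below.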
+@return DB_SUCCESS or error number */ +dberr_t +btr_cur_pessimistic_insert( +/*=======================*/ + ulint flags, /*!< in: undo logging and locking flags: if not + zero, the parameter thr should be + specified; if no undo logging is specified, + then the caller must have reserved enough + free extents in the file space so that the + insertion will certainly succeed */ + btr_cur_t* cursor, /*!< in: cursor after which to insert; + cursor stays valid */ + rec_offs** offsets,/*!< out: offsets on *rec */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap + that can be emptied */ + dtuple_t* entry, /*!< in/out: entry to insert */ + rec_t** rec, /*!< out: pointer to inserted record if + succeed */ + big_rec_t** big_rec,/*!< out: big rec vector whose fields have to + be stored externally by the caller */ + ulint n_ext, /*!< in: number of externally stored columns */ + que_thr_t* thr, /*!< in/out: query thread; can be NULL if + !(~flags + & (BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG)) */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + dict_index_t* index = cursor->index(); + big_rec_t* big_rec_vec = NULL; + bool inherit = false; + uint32_t n_reserved = 0; + + ut_ad(dtuple_check_typed(entry)); + ut_ad(thr || !(~flags & (BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG))); + + *big_rec = NULL; + + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor), + MTR_MEMO_PAGE_X_FIX)); + ut_ad(!dict_index_is_online_ddl(index) + || dict_index_is_clust(index) + || (flags & BTR_CREATE_FLAG)); + + cursor->flag = BTR_CUR_BINARY; + + /* Check locks and write to undo log, if specified */ + + dberr_t err = btr_cur_ins_lock_and_undo(flags, cursor, entry, + thr, mtr, &inherit); + + if (err != DB_SUCCESS) { + return(err); + } + + /* First reserve enough free space for the file segments of + the index tree, so that the insert will not fail because of + lack of space */ + + if (!index->is_ibuf() + && (err = fsp_reserve_free_extents(&n_reserved, index->table->space, + uint32_t(cursor->tree_height / 16 + + 3), + FSP_NORMAL, mtr)) + != DB_SUCCESS) { + return err; + } + + if (page_zip_rec_needs_ext(rec_get_converted_size(index, entry, n_ext), + index->table->not_redundant(), + dtuple_get_n_fields(entry), + btr_cur_get_block(cursor)->zip_size()) + || UNIV_UNLIKELY(entry->is_alter_metadata() + && !dfield_is_ext( + dtuple_get_nth_field( + entry, + index->first_user_field())))) { + /* The record is so big that we have to store some fields + externally on separate database pages */ + + if (UNIV_LIKELY_NULL(big_rec_vec)) { + /* This should never happen, but we handle + the situation in a robust manner. */ + ut_ad(0); + dtuple_convert_back_big_rec(index, entry, big_rec_vec); + } + + big_rec_vec = dtuple_convert_big_rec(index, 0, entry, &n_ext); + + if (big_rec_vec == NULL) { + + index->table->space->release_free_extents(n_reserved); + return(DB_TOO_BIG_RECORD); + } + } + + *rec = index->page == btr_cur_get_block(cursor)->page.id().page_no() + ? 
btr_root_raise_and_insert(flags, cursor, offsets, heap, + entry, n_ext, mtr, &err) + : btr_page_split_and_insert(flags, cursor, offsets, heap, + entry, n_ext, mtr, &err); + + if (!*rec) { + goto func_exit; + } + + ut_ad(page_rec_get_next(btr_cur_get_rec(cursor)) == *rec + || dict_index_is_spatial(index)); + + if (!(flags & BTR_NO_LOCKING_FLAG)) { + ut_ad(!index->table->is_temporary()); + if (dict_index_is_spatial(index)) { + /* Do nothing */ + } else { + /* The cursor might be moved to the other page + and the max trx id field should be updated after + the cursor was fixed. */ + if (!dict_index_is_clust(index)) { + page_update_max_trx_id( + btr_cur_get_block(cursor), + btr_cur_get_page_zip(cursor), + thr_get_trx(thr)->id, mtr); + } + + if (!page_rec_is_infimum(btr_cur_get_rec(cursor)) + || !page_has_prev(btr_cur_get_page(cursor))) { + /* split and inserted need to call + lock_update_insert() always. */ + inherit = true; + } + } + } + + if (!page_is_leaf(btr_cur_get_page(cursor))) { + ut_ad(!big_rec_vec); + } else { +#ifdef BTR_CUR_HASH_ADAPT + if (entry->info_bits & REC_INFO_MIN_REC_FLAG) { + ut_ad(entry->is_metadata()); + ut_ad(index->is_instant()); + ut_ad(flags & BTR_NO_LOCKING_FLAG); + ut_ad(!(flags & BTR_CREATE_FLAG)); + } else if (index->table->is_temporary()) { + } else { + btr_search_update_hash_on_insert( + cursor, btr_search_sys.get_latch(*index)); + } +#endif /* BTR_CUR_HASH_ADAPT */ + if (inherit && !(flags & BTR_NO_LOCKING_FLAG)) { + + lock_update_insert(btr_cur_get_block(cursor), *rec); + } + } + + err = DB_SUCCESS; +func_exit: + index->table->space->release_free_extents(n_reserved); + *big_rec = big_rec_vec; + + return err; +} + +/*==================== B-TREE UPDATE =========================*/ + +/*************************************************************//** +For an update, checks the locks and does the undo logging. +@return DB_SUCCESS, DB_LOCK_WAIT, or error number */ +UNIV_INLINE MY_ATTRIBUTE((warn_unused_result)) +dberr_t +btr_cur_upd_lock_and_undo( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor on record to update */ + const rec_offs* offsets,/*!< in: rec_get_offsets() on cursor */ + const upd_t* update, /*!< in: update vector */ + ulint cmpl_info,/*!< in: compiler info on secondary index + updates */ + que_thr_t* thr, /*!< in: query thread + (can be NULL if BTR_NO_LOCKING_FLAG) */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + roll_ptr_t* roll_ptr)/*!< out: roll pointer */ +{ + dict_index_t* index; + const rec_t* rec; + dberr_t err; + + ut_ad((thr != NULL) || (flags & BTR_NO_LOCKING_FLAG)); + + rec = btr_cur_get_rec(cursor); + index = cursor->index(); + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(mtr->is_named_space(index->table->space)); + + if (!dict_index_is_clust(index)) { + ut_ad(dict_index_is_online_ddl(index) + == !!(flags & BTR_CREATE_FLAG)); + + /* We do undo logging only when we update a clustered index + record */ + return(lock_sec_rec_modify_check_and_lock( + flags, btr_cur_get_block(cursor), rec, + index, thr, mtr)); + } + + /* Check if we have to wait for a lock: enqueue an explicit lock + request if yes */ + + if (!(flags & BTR_NO_LOCKING_FLAG)) { + err = lock_clust_rec_modify_check_and_lock( + btr_cur_get_block(cursor), rec, index, + offsets, thr); + if (err != DB_SUCCESS) { + return(err); + } + } + + /* Append the info about the update in the undo log */ + + return((flags & BTR_NO_UNDO_LOG_FLAG) + ? 
DB_SUCCESS
+ : trx_undo_report_row_operation(
+ thr, index, NULL, update,
+ cmpl_info, rec, offsets, roll_ptr));
+}
+
+/** Write DB_TRX_ID,DB_ROLL_PTR to a clustered index entry.
+@param[in,out] entry clustered index entry
+@param[in] index clustered index
+@param[in] trx_id DB_TRX_ID
+@param[in] roll_ptr DB_ROLL_PTR */
+static void btr_cur_write_sys(
+ dtuple_t* entry,
+ const dict_index_t* index,
+ trx_id_t trx_id,
+ roll_ptr_t roll_ptr)
+{
+ dfield_t* t = dtuple_get_nth_field(entry, index->db_trx_id());
+ ut_ad(t->len == DATA_TRX_ID_LEN);
+ trx_write_trx_id(static_cast<byte*>(t->data), trx_id);
+ dfield_t* r = dtuple_get_nth_field(entry, index->db_roll_ptr());
+ ut_ad(r->len == DATA_ROLL_PTR_LEN);
+ trx_write_roll_ptr(static_cast<byte*>(r->data), roll_ptr);
+}
+
+MY_ATTRIBUTE((warn_unused_result))
+/** Update DB_TRX_ID, DB_ROLL_PTR in a clustered index record.
+@param[in,out] block clustered index leaf page
+@param[in,out] rec clustered index record
+@param[in] index clustered index
+@param[in] offsets rec_get_offsets(rec, index)
+@param[in] trx transaction
+@param[in] roll_ptr DB_ROLL_PTR value
+@param[in,out] mtr mini-transaction
+@return error code */
+static dberr_t btr_cur_upd_rec_sys(buf_block_t *block, rec_t *rec,
+ dict_index_t *index, const rec_offs *offsets,
+ const trx_t *trx, roll_ptr_t roll_ptr,
+ mtr_t *mtr)
+{
+ ut_ad(index->is_primary());
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (UNIV_LIKELY_NULL(block->page.zip.data))
+ {
+ page_zip_write_trx_id_and_roll_ptr(block, rec, offsets, index->db_trx_id(),
+ trx->id, roll_ptr, mtr);
+ return DB_SUCCESS;
+ }
+
+ ulint offset= index->trx_id_offset;
+
+ if (!offset)
+ offset= row_get_trx_id_offset(index, offsets);
+
+ compile_time_assert(DATA_TRX_ID + 1 == DATA_ROLL_PTR);
+
+ /* During IMPORT the trx id in the record can be in the future, if
+ the .ibd file is being imported from another instance. During IMPORT
+ roll_ptr will be 0. */
+ ut_ad(roll_ptr == 0 ||
+ lock_check_trx_id_sanity(trx_read_trx_id(rec + offset),
+ rec, index, offsets));
+
+ byte sys[DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN];
+
+ trx_write_trx_id(sys, trx->id);
+ trx_write_roll_ptr(sys + DATA_TRX_ID_LEN, roll_ptr);
+
+ ulint d= 0;
+ const byte *src= nullptr;
+ byte *dest= rec + offset;
+ ulint len= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+
+ if (UNIV_LIKELY(index->trx_id_offset))
+ {
+ const rec_t *prev= page_rec_get_prev_const(rec);
+ if (UNIV_UNLIKELY(!prev || prev == rec))
+ return DB_CORRUPTION;
+ else if (page_rec_is_infimum(prev));
+ else
+ for (src= prev + offset; d < DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; d++)
+ if (src[d] != sys[d])
+ break;
+ if (d > 6 && memcmp(dest, sys, d))
+ {
+ /* We save space by replacing a single record
+
+ WRITE,page_offset(dest),byte[13]
+
+ with two records:
+
+ MEMMOVE,page_offset(dest),d(1 byte),offset(1..3 bytes),
+ WRITE|0x80,0,byte[13-d]
+
+ The single WRITE record would be x+13 bytes long, with x>2.
+ The MEMMOVE record would be up to x+1+3 = x+4 bytes, and the
+ second WRITE would be 1+1+13-d = 15-d bytes.
+
+ The total size is: x+13 versus x+4+15-d = x+19-d bytes.
+ To save space, we must have d>6, that is, the complete DB_TRX_ID and
+ the first byte(s) of DB_ROLL_PTR must match the previous record. */
+ memcpy(dest, src, d);
+ mtr->memmove(*block, page_offset(dest), page_offset(src), d);
+ dest+= d;
+ len-= d;
+ /* DB_TRX_ID,DB_ROLL_PTR must be unique in each record when
+ DB_TRX_ID refers to an active transaction.
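+ (A worked example of the trade-off above: with d=7, the MEMMOVE
+ record takes at most x+4 bytes and the remaining WRITE 15-7=8
+ bytes, or x+12 in total, beating the x+13 bytes of the single
+ WRITE; with d<=6 nothing would be saved, hence the d>6 condition.)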
*/ + ut_ad(len); + } + else + d= 0; + } + + if (UNIV_LIKELY(len)) /* extra safety, to avoid corrupting the log */ + mtr->memcpy(*block, dest, sys + d, len); + + return DB_SUCCESS; +} + +/*************************************************************//** +See if there is enough place in the page modification log to log +an update-in-place. + +@retval false if out of space; IBUF_BITMAP_FREE will be reset +outside mtr if the page was recompressed +@retval true if enough place; + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE if this is +a secondary index leaf page. This has to be done either within the +same mini-transaction, or by invoking ibuf_reset_free_bits() before +mtr_commit(mtr). */ +bool +btr_cur_update_alloc_zip_func( +/*==========================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + page_cur_t* cursor, /*!< in/out: B-tree page cursor */ +#ifdef UNIV_DEBUG + rec_offs* offsets,/*!< in/out: offsets of the cursor record */ +#endif /* UNIV_DEBUG */ + ulint length, /*!< in: size needed */ + bool create, /*!< in: true=delete-and-insert, + false=update-in-place */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + dict_index_t* index = cursor->index; + + /* Have a local copy of the variables as these can change + dynamically. */ + const page_t* page = page_cur_get_page(cursor); + + ut_ad(page_zip == page_cur_get_page_zip(cursor)); + ut_ad(!dict_index_is_ibuf(index)); + ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets)); + + if (page_zip_available(page_zip, dict_index_is_clust(index), + length, create)) { + return(true); + } + + if (!page_zip->m_nonempty && !page_has_garbage(page)) { + /* The page has been freshly compressed, so + reorganizing it will not help. */ + return(false); + } + + if (create && page_is_leaf(page) + && (length + page_get_data_size(page) + >= dict_index_zip_pad_optimal_page_size(index))) { + return(false); + } + + if (btr_page_reorganize(cursor, mtr) == DB_SUCCESS) { + rec_offs_make_valid(page_cur_get_rec(cursor), index, + page_is_leaf(page), offsets); + + /* After recompressing a page, we must make sure that the free + bits in the insert buffer bitmap will not exceed the free + space on the page. Because this function will not attempt + recompression unless page_zip_available() fails above, it is + safe to reset the free bits if page_zip_available() fails + again, below. The free bits can safely be reset in a separate + mini-transaction. If page_zip_available() succeeds below, we + can be sure that the btr_page_reorganize() above did not reduce + the free space available on the page. */ + + if (page_zip_available(page_zip, dict_index_is_clust(index), + length, create)) { + return true; + } + } + + if (!dict_index_is_clust(index) + && !index->table->is_temporary() + && page_is_leaf(page)) { + ibuf_reset_free_bits(page_cur_get_block(cursor)); + } + + return(false); +} + +/** Apply an update vector to a record. No field size changes are allowed. + +This is usually invoked on a clustered index. The only use case for a +secondary index is row_ins_sec_index_entry_by_modify() or its +counterpart in ibuf_insert_to_index_page(). 
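+For example, flipping the delete-mark flag or overwriting the
+fixed-length DB_TRX_ID,DB_ROLL_PTR columns qualifies, because every
+field keeps its stored length; updates that change a field length are
+performed elsewhere as a delete followed by an insert.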
+@param[in,out] rec index record +@param[in] index the index of the record +@param[in] offsets rec_get_offsets(rec, index) +@param[in] update update vector +@param[in,out] block index page +@param[in,out] mtr mini-transaction */ +void btr_cur_upd_rec_in_place(rec_t *rec, const dict_index_t *index, + const rec_offs *offsets, const upd_t *update, + buf_block_t *block, mtr_t *mtr) +{ + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(!index->table->skip_alter_undo); + ut_ad(!block->page.zip.data || index->table->not_redundant()); + +#ifdef UNIV_DEBUG + if (rec_offs_comp(offsets)) { + switch (rec_get_status(rec)) { + case REC_STATUS_ORDINARY: + break; + case REC_STATUS_INSTANT: + ut_ad(index->is_instant()); + break; + case REC_STATUS_NODE_PTR: + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + ut_ad("wrong record status in update" == 0); + } + } +#endif /* UNIV_DEBUG */ + + static_assert(REC_INFO_BITS_SHIFT == 0, "compatibility"); + if (UNIV_LIKELY_NULL(block->page.zip.data)) { + ut_ad(rec_offs_comp(offsets)); + byte* info_bits = &rec[-REC_NEW_INFO_BITS]; + const bool flip_del_mark = (*info_bits ^ update->info_bits) + & REC_INFO_DELETED_FLAG; + *info_bits &= byte(~REC_INFO_BITS_MASK); + *info_bits |= update->info_bits; + + if (flip_del_mark) { + page_zip_rec_set_deleted(block, rec, update->info_bits + & REC_INFO_DELETED_FLAG, mtr); + } + } else { + byte* info_bits = &rec[rec_offs_comp(offsets) + ? -REC_NEW_INFO_BITS + : -REC_OLD_INFO_BITS]; + + mtr->write<1,mtr_t::MAYBE_NOP>(*block, info_bits, + (*info_bits + & ~REC_INFO_BITS_MASK) + | update->info_bits); + } + + for (ulint i = 0; i < update->n_fields; i++) { + const upd_field_t* uf = upd_get_nth_field(update, i); + if (upd_fld_is_virtual_col(uf) && !index->has_virtual()) { + continue; + } + const ulint n = uf->field_no; + + ut_ad(!dfield_is_ext(&uf->new_val) + == !rec_offs_nth_extern(offsets, n)); + ut_ad(!rec_offs_nth_default(offsets, n)); + + if (UNIV_UNLIKELY(dfield_is_null(&uf->new_val))) { + if (rec_offs_nth_sql_null(offsets, n)) { + ut_ad(index->table->is_instant()); + ut_ad(n >= index->n_core_fields); + continue; + } + + ut_ad(!index->table->not_redundant()); + switch (ulint size = rec_get_nth_field_size(rec, n)) { + case 0: + break; + case 1: + mtr->write<1,mtr_t::MAYBE_NOP>( + *block, + rec_get_field_start_offs(rec, n) + rec, + 0U); + break; + default: + mtr->memset( + block, + page_offset(rec_get_field_start_offs( + rec, n) + rec), + size, 0); + } + ulint l = rec_get_1byte_offs_flag(rec) + ? (n + 1) : (n + 1) * 2; + byte* b = rec - REC_N_OLD_EXTRA_BYTES - l; + compile_time_assert(REC_1BYTE_SQL_NULL_MASK << 8 + == REC_2BYTE_SQL_NULL_MASK); + mtr->write<1>(*block, b, + byte(*b | REC_1BYTE_SQL_NULL_MASK)); + continue; + } + + ulint len; + byte* data = rec_get_nth_field(rec, offsets, n, &len); + if (UNIV_LIKELY_NULL(block->page.zip.data)) { + ut_ad(len == uf->new_val.len); + memcpy(data, uf->new_val.data, len); + continue; + } + + if (UNIV_UNLIKELY(len != uf->new_val.len)) { + ut_ad(len == UNIV_SQL_NULL); + ut_ad(!rec_offs_comp(offsets)); + len = uf->new_val.len; + ut_ad(len == rec_get_nth_field_size(rec, n)); + ulint l = rec_get_1byte_offs_flag(rec) + ? 
(n + 1) : (n + 1) * 2; + byte* b = rec - REC_N_OLD_EXTRA_BYTES - l; + compile_time_assert(REC_1BYTE_SQL_NULL_MASK << 8 + == REC_2BYTE_SQL_NULL_MASK); + mtr->write<1>(*block, b, + byte(*b & ~REC_1BYTE_SQL_NULL_MASK)); + } + + if (len) { + mtr->memcpy(*block, data, + uf->new_val.data, len); + } + } + + if (UNIV_LIKELY(!block->page.zip.data)) { + return; + } + + switch (update->n_fields) { + case 0: + /* We only changed the delete-mark flag. */ + return; + case 1: + if (!index->is_clust() + || update->fields[0].field_no != index->db_roll_ptr()) { + break; + } + goto update_sys; + case 2: + if (!index->is_clust() + || update->fields[0].field_no != index->db_trx_id() + || update->fields[1].field_no != index->db_roll_ptr()) { + break; + } + update_sys: + ulint len; + const byte* sys = rec_get_nth_field(rec, offsets, + index->db_trx_id(), &len); + ut_ad(len == DATA_TRX_ID_LEN); + page_zip_write_trx_id_and_roll_ptr( + block, rec, offsets, index->db_trx_id(), + trx_read_trx_id(sys), + trx_read_roll_ptr(sys + DATA_TRX_ID_LEN), mtr); + return; + } + + page_zip_write_rec(block, rec, index, offsets, 0, mtr); +} + +/** Check if a ROW_FORMAT=COMPRESSED page can be updated in place +@param cur cursor pointing to ROW_FORMAT=COMPRESSED page +@param offsets rec_get_offsets(btr_cur_get_rec(cur)) +@param update index fields being updated +@param mtr mini-transaction +@return the record in the ROW_FORMAT=COMPRESSED page +@retval nullptr if the page cannot be updated in place */ +ATTRIBUTE_COLD static +rec_t *btr_cur_update_in_place_zip_check(btr_cur_t *cur, rec_offs *offsets, + const upd_t& update, mtr_t *mtr) +{ + dict_index_t *index= cur->index(); + ut_ad(!index->table->is_temporary()); + + switch (update.n_fields) { + case 0: + /* We are only changing the delete-mark flag. */ + break; + case 1: + if (!index->is_clust() || + update.fields[0].field_no != index->db_roll_ptr()) + goto check_for_overflow; + /* We are only changing the delete-mark flag and DB_ROLL_PTR. */ + break; + case 2: + if (!index->is_clust() || + update.fields[0].field_no != index->db_trx_id() || + update.fields[1].field_no != index->db_roll_ptr()) + goto check_for_overflow; + /* We are only changing DB_TRX_ID, DB_ROLL_PTR, and the delete-mark. + They can be updated in place in the uncompressed part of the + ROW_FORMAT=COMPRESSED page. */ + break; + check_for_overflow: + default: + if (!btr_cur_update_alloc_zip(btr_cur_get_page_zip(cur), + btr_cur_get_page_cur(cur), + offsets, rec_offs_size(offsets), + false, mtr)) + return nullptr; + } + + return btr_cur_get_rec(cur); +} + +/*************************************************************//** +Updates a record when the update causes no size changes in its fields. +We assume here that the ordering fields of the record do not change. 
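+A typical qualifying update overwrites fixed-length columns, e.g. an
+INT column together with DB_TRX_ID and DB_ROLL_PTR, leaving the
+record size unchanged.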
+@return locking or undo log related error code, or +@retval DB_SUCCESS on success +@retval DB_ZIP_OVERFLOW if there is not enough space left +on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */ +dberr_t +btr_cur_update_in_place( +/*====================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor on the record to update; + cursor stays valid and positioned on the + same record */ + rec_offs* offsets,/*!< in/out: offsets on cursor->page_cur.rec */ + const upd_t* update, /*!< in: update vector */ + ulint cmpl_info,/*!< in: compiler info on secondary index + updates */ + que_thr_t* thr, /*!< in: query thread */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in/out: mini-transaction; if this + is a secondary index, the caller must + mtr_commit(mtr) before latching any + further pages */ +{ + dict_index_t* index; + dberr_t err; + rec_t* rec; + roll_ptr_t roll_ptr = 0; + ulint was_delete_marked; + + ut_ad(page_is_leaf(cursor->page_cur.block->page.frame)); + rec = btr_cur_get_rec(cursor); + index = cursor->index(); + ut_ad(!index->is_ibuf()); + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table)); + ut_ad(trx_id > 0 || (flags & BTR_KEEP_SYS_FLAG) + || index->table->is_temporary()); + /* The insert buffer tree should never be updated in place. */ + ut_ad(!dict_index_is_ibuf(index)); + ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG) + || dict_index_is_clust(index)); + ut_ad(thr_get_trx(thr)->id == trx_id + || (flags & ulint(~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP))) + == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG + | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG)); + ut_ad(fil_page_index_page_check(btr_cur_get_page(cursor))); + ut_ad(btr_page_get_index_id(btr_cur_get_page(cursor)) == index->id); + ut_ad(!(update->info_bits & REC_INFO_MIN_REC_FLAG)); + + DBUG_LOG("ib_cur", + "update-in-place " << index->name << " (" << index->id + << ") by " << ib::hex(trx_id) << ": " + << rec_printer(rec, offsets).str()); + + buf_block_t* block = btr_cur_get_block(cursor); + page_zip_des_t* page_zip = buf_block_get_page_zip(block); + + /* Check that enough space is available on the compressed page. */ + if (UNIV_LIKELY_NULL(page_zip) + && !(rec = btr_cur_update_in_place_zip_check( + cursor, offsets, *update, mtr))) { + return DB_ZIP_OVERFLOW; + } + + /* Do lock checking and undo logging */ + err = btr_cur_upd_lock_and_undo(flags, cursor, offsets, + update, cmpl_info, + thr, mtr, &roll_ptr); + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + /* We may need to update the IBUF_BITMAP_FREE + bits after a reorganize that was done in + btr_cur_update_alloc_zip(). */ + goto func_exit; + } + + if (!(flags & BTR_KEEP_SYS_FLAG)) { + err = btr_cur_upd_rec_sys(block, rec, index, offsets, + thr_get_trx(thr), roll_ptr, mtr); + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + goto func_exit; + } + } + + was_delete_marked = rec_get_deleted_flag( + rec, page_is_comp(buf_block_get_frame(block))); + /* In delete-marked records, DB_TRX_ID must always refer to an + existing undo log record. */ + ut_ad(!was_delete_marked + || !dict_index_is_clust(index) + || row_get_rec_trx_id(rec, index, offsets)); + +#ifdef BTR_CUR_HASH_ADAPT + { + srw_spin_lock* ahi_latch = block->index + ? btr_search_sys.get_latch(*index) : NULL; + if (ahi_latch) { + /* TO DO: Can we skip this if none of the fields + index->search_info->curr_n_fields + are being updated? 
*/
+
+			/* The function row_upd_changes_ord_field_binary
+			does not work on a secondary index. */
+
+			if (!dict_index_is_clust(index)
+			    || row_upd_changes_ord_field_binary(
+				    index, update, thr, NULL, NULL)) {
+				ut_ad(!(update->info_bits
+					& REC_INFO_MIN_REC_FLAG));
+				/* Remove possible hash index pointer
+				to this record */
+				btr_search_update_hash_on_delete(cursor);
+			}
+
+			ahi_latch->wr_lock(SRW_LOCK_CALL);
+		}
+
+		assert_block_ahi_valid(block);
+#endif /* BTR_CUR_HASH_ADAPT */
+
+		btr_cur_upd_rec_in_place(rec, index, offsets, update, block,
+					 mtr);
+
+#ifdef BTR_CUR_HASH_ADAPT
+		if (ahi_latch) {
+			ahi_latch->wr_unlock();
+		}
+	}
+#endif /* BTR_CUR_HASH_ADAPT */
+
+	if (was_delete_marked
+	    && !rec_get_deleted_flag(
+		    rec, page_is_comp(buf_block_get_frame(block)))) {
+		/* The new updated record owns its possible externally
+		stored fields */
+
+		btr_cur_unmark_extern_fields(block, rec, index, offsets, mtr);
+	}
+
+	ut_ad(err == DB_SUCCESS);
+
+func_exit:
+	if (page_zip
+	    && !(flags & BTR_KEEP_IBUF_BITMAP)
+	    && !dict_index_is_clust(index)
+	    && page_is_leaf(buf_block_get_frame(block))) {
+		/* Update the free bits in the insert buffer. */
+		ut_ad(!index->table->is_temporary());
+		ibuf_update_free_bits_zip(block, mtr);
+	}
+
+	return(err);
+}
+
+/** Trim a metadata record during the rollback of instant ALTER TABLE.
+@param[in]	entry	metadata tuple
+@param[in]	index	primary key
+@param[in]	update	update vector for the rollback */
+ATTRIBUTE_COLD
+static void btr_cur_trim_alter_metadata(dtuple_t* entry,
+					const dict_index_t* index,
+					const upd_t* update)
+{
+	ut_ad(index->is_instant());
+	ut_ad(update->is_alter_metadata());
+	ut_ad(entry->is_alter_metadata());
+
+	ut_ad(update->fields[0].field_no == index->first_user_field());
+	ut_ad(update->fields[0].new_val.ext);
+	ut_ad(update->fields[0].new_val.len == FIELD_REF_SIZE);
+	ut_ad(entry->n_fields - 1 == index->n_fields);
+
+	const byte* ptr = static_cast<const byte*>(
+		update->fields[0].new_val.data);
+	ut_ad(!mach_read_from_4(ptr + BTR_EXTERN_LEN));
+	ut_ad(mach_read_from_4(ptr + BTR_EXTERN_LEN + 4) > 4);
+	ut_ad(mach_read_from_4(ptr + BTR_EXTERN_OFFSET) == FIL_PAGE_DATA);
+	ut_ad(mach_read_from_4(ptr + BTR_EXTERN_SPACE_ID)
+	      == index->table->space->id);
+
+	ulint n_fields = update->fields[1].field_no;
+	ut_ad(n_fields <= index->n_fields);
+	if (n_fields != index->n_uniq) {
+		ut_ad(n_fields
+		      >= index->n_core_fields);
+		entry->n_fields = n_fields;
+		return;
+	}
+
+	/* This is based on dict_table_t::deserialise_columns()
+	and btr_cur_instant_init_low(). */
+	mtr_t mtr;
+	mtr.start();
+	buf_block_t* block = buf_page_get(
+		page_id_t(index->table->space->id,
+			  mach_read_from_4(ptr + BTR_EXTERN_PAGE_NO)),
+		0, RW_S_LATCH, &mtr);
+	if (!block) {
+		ut_ad("corruption" == 0);
+		mtr.commit();
+		return;
+	}
+	ut_ad(fil_page_get_type(block->page.frame) == FIL_PAGE_TYPE_BLOB);
+	ut_ad(mach_read_from_4(&block->page.frame
+			       [FIL_PAGE_DATA + BTR_BLOB_HDR_NEXT_PAGE_NO])
+	      == FIL_NULL);
+	ut_ad(mach_read_from_4(&block->page.frame
+			       [FIL_PAGE_DATA + BTR_BLOB_HDR_PART_LEN])
+	      == mach_read_from_4(ptr + BTR_EXTERN_LEN + 4));
+	n_fields = mach_read_from_4(
+		&block->page.frame[FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE])
+		+ index->first_user_field();
+	/* Rollback should not increase the number of fields. */
+	ut_ad(n_fields <= index->n_fields);
+	ut_ad(n_fields + 1 <= entry->n_fields);
+	/* dict_index_t::clear_instant_alter() cannot be invoked while
+	rollback of an instant ALTER TABLE transaction is in progress
+	for an is_alter_metadata() record.
*/ + ut_ad(n_fields >= index->n_core_fields); + + mtr.commit(); + entry->n_fields = n_fields + 1; +} + +/** Trim an update tuple due to instant ADD COLUMN, if needed. +For normal records, the trailing instantly added fields that match +the initial default values are omitted. + +For the special metadata record on a table on which instant +ADD COLUMN has already been executed, both ADD COLUMN and the +rollback of ADD COLUMN need to be handled specially. + +@param[in,out] entry index entry +@param[in] index index +@param[in] update update vector +@param[in] thr execution thread */ +static inline +void +btr_cur_trim( + dtuple_t* entry, + const dict_index_t* index, + const upd_t* update, + const que_thr_t* thr) +{ + if (!index->is_instant()) { + } else if (UNIV_UNLIKELY(update->is_metadata())) { + /* We are either updating a metadata record + (instant ALTER TABLE on a table where instant ALTER was + already executed) or rolling back such an operation. */ + ut_ad(!upd_get_nth_field(update, 0)->orig_len); + ut_ad(entry->is_metadata()); + + if (thr->graph->trx->in_rollback) { + /* This rollback can occur either as part of + ha_innobase::commit_inplace_alter_table() rolling + back after a failed innobase_add_instant_try(), + or as part of crash recovery. Either way, the + table will be in the data dictionary cache, with + the instantly added columns going to be removed + later in the rollback. */ + ut_ad(index->table->cached); + /* The DB_TRX_ID,DB_ROLL_PTR are always last, + and there should be some change to roll back. + The first field in the update vector is the + first instantly added column logged by + innobase_add_instant_try(). */ + ut_ad(update->n_fields > 2); + if (update->is_alter_metadata()) { + btr_cur_trim_alter_metadata( + entry, index, update); + return; + } + ut_ad(!entry->is_alter_metadata()); + + ulint n_fields = upd_get_nth_field(update, 0) + ->field_no; + ut_ad(n_fields + 1 >= entry->n_fields); + entry->n_fields = n_fields; + } + } else { + entry->trim(*index); + } +} + +/*************************************************************//** +Tries to update a record on a page in an index tree. It is assumed that mtr +holds an x-latch on the page. The operation does not succeed if there is too +little space on the page or if the update would result in too empty a page, +so that tree compression is recommended. We assume here that the ordering +fields of the record do not change. 
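+
+A sketch of the typical caller pattern (illustrative only; the real
+caller row_upd_clust_rec() also re-positions the cursor and restarts
+the mini-transaction before retrying pessimistically):
+
+	err = btr_cur_optimistic_update(flags, cursor, &offsets, &heap,
+					update, cmpl_info, thr, trx_id, mtr);
+	switch (err) {
+	case DB_OVERFLOW:
+	case DB_UNDERFLOW:
+	case DB_ZIP_OVERFLOW:
+		err = retry_with_btr_cur_pessimistic_update();
+	default:
+		break;
+	}
+
+where retry_with_btr_cur_pessimistic_update() stands for the fallback
+path; the name is hypothetical.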
+@return error code, including +@retval DB_SUCCESS on success +@retval DB_OVERFLOW if the updated record does not fit +@retval DB_UNDERFLOW if the page would become too empty +@retval DB_ZIP_OVERFLOW if there is not enough space left +on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */ +dberr_t +btr_cur_optimistic_update( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor on the record to update; + cursor stays valid and positioned on the + same record */ + rec_offs** offsets,/*!< out: offsets on cursor->page_cur.rec */ + mem_heap_t** heap, /*!< in/out: pointer to NULL or memory heap */ + const upd_t* update, /*!< in: update vector; this must also + contain trx id and roll ptr fields */ + ulint cmpl_info,/*!< in: compiler info on secondary index + updates */ + que_thr_t* thr, /*!< in: query thread */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in/out: mini-transaction; if this + is a secondary index, the caller must + mtr_commit(mtr) before latching any + further pages */ +{ + dict_index_t* index; + page_cur_t* page_cursor; + dberr_t err; + buf_block_t* block; + page_t* page; + page_zip_des_t* page_zip; + rec_t* rec; + ulint max_size; + ulint new_rec_size; + ulint old_rec_size; + ulint max_ins_size = 0; + dtuple_t* new_entry; + roll_ptr_t roll_ptr; + ulint i; + + block = btr_cur_get_block(cursor); + page = buf_block_get_frame(block); + rec = btr_cur_get_rec(cursor); + index = cursor->index(); + ut_ad(index->has_locking()); + ut_ad(trx_id > 0 || (flags & BTR_KEEP_SYS_FLAG) + || index->table->is_temporary()); + ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table)); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + /* This is intended only for leaf page updates */ + ut_ad(page_is_leaf(page)); + /* The insert buffer tree should never be updated in place. */ + ut_ad(!dict_index_is_ibuf(index)); + ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG) + || dict_index_is_clust(index)); + ut_ad(thr_get_trx(thr)->id == trx_id + || (flags & ulint(~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP))) + == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG + | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG)); + ut_ad(fil_page_index_page_check(page)); + ut_ad(btr_page_get_index_id(page) == index->id); + + *offsets = rec_get_offsets(rec, index, *offsets, index->n_core_fields, + ULINT_UNDEFINED, heap); +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + ut_a(!rec_offs_any_null_extern(rec, *offsets) + || thr_get_trx(thr) == trx_roll_crash_recv_trx); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + + if (UNIV_LIKELY(!update->is_metadata()) + && !row_upd_changes_field_size_or_external(index, *offsets, + update)) { + + /* The simplest and the most common case: the update does not + change the size of any field and none of the updated fields is + externally stored in rec or update, and there is enough space + on the compressed page to log the update. */ + + return(btr_cur_update_in_place( + flags, cursor, *offsets, update, + cmpl_info, thr, trx_id, mtr)); + } + + if (rec_offs_any_extern(*offsets)) { +any_extern: + ut_ad(!index->is_ibuf()); + /* Externally stored fields are treated in pessimistic + update */ + + /* prefetch siblings of the leaf for the pessimistic + operation. 
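+	The caller is expected to retry with
+	btr_cur_pessimistic_update(), which may have to merge or split
+	pages; issuing read requests for the siblings now hides part of
+	that I/O latency.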
*/ + btr_cur_prefetch_siblings(block, index); + + return(DB_OVERFLOW); + } + + if (rec_is_metadata(rec, *index) && index->table->instant) { + goto any_extern; + } + + for (i = 0; i < upd_get_n_fields(update); i++) { + if (dfield_is_ext(&upd_get_nth_field(update, i)->new_val)) { + + goto any_extern; + } + } + + DBUG_LOG("ib_cur", + "update " << index->name << " (" << index->id << ") by " + << ib::hex(trx_id) << ": " + << rec_printer(rec, *offsets).str()); + + page_cursor = btr_cur_get_page_cur(cursor); + + if (!*heap) { + *heap = mem_heap_create( + rec_offs_size(*offsets) + + DTUPLE_EST_ALLOC(rec_offs_n_fields(*offsets))); + } + + new_entry = row_rec_to_index_entry(rec, index, *offsets, *heap); + ut_ad(!dtuple_get_n_ext(new_entry)); + + /* The page containing the clustered index record + corresponding to new_entry is latched in mtr. + Thus the following call is safe. */ + row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update, + *heap); + btr_cur_trim(new_entry, index, update, thr); + old_rec_size = rec_offs_size(*offsets); + new_rec_size = rec_get_converted_size(index, new_entry, 0); + + page_zip = buf_block_get_page_zip(block); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + if (page_zip) { + ut_ad(!index->table->is_temporary()); + + if (page_zip_rec_needs_ext(new_rec_size, page_is_comp(page), + dict_index_get_n_fields(index), + block->zip_size())) { + goto any_extern; + } + + if (!btr_cur_update_alloc_zip( + page_zip, page_cursor, *offsets, + new_rec_size, true, mtr)) { + return(DB_ZIP_OVERFLOW); + } + + rec = page_cur_get_rec(page_cursor); + } + + /* We limit max record size to 16k even for 64k page size. */ + if (new_rec_size >= COMPRESSED_REC_MAX_DATA_SIZE || + (!dict_table_is_comp(index->table) + && new_rec_size >= REDUNDANT_REC_MAX_DATA_SIZE)) { + err = DB_OVERFLOW; + goto func_exit; + } + + if (UNIV_UNLIKELY(new_rec_size + >= (page_get_free_space_of_empty(page_is_comp(page)) + / 2))) { + /* We may need to update the IBUF_BITMAP_FREE + bits after a reorganize that was done in + btr_cur_update_alloc_zip(). */ + err = DB_OVERFLOW; + goto func_exit; + } + + if (UNIV_UNLIKELY(page_get_data_size(page) + - old_rec_size + new_rec_size + < BTR_CUR_PAGE_COMPRESS_LIMIT(index))) { + /* We may need to update the IBUF_BITMAP_FREE + bits after a reorganize that was done in + btr_cur_update_alloc_zip(). */ + + /* The page would become too empty */ + err = DB_UNDERFLOW; + goto func_exit; + } + + /* We do not attempt to reorganize if the page is compressed. + This is because the page may fail to compress after reorganization. */ + max_size = page_zip + ? page_get_max_insert_size(page, 1) + : (old_rec_size + + page_get_max_insert_size_after_reorganize(page, 1)); + + if (!page_zip) { + max_ins_size = page_get_max_insert_size_after_reorganize( + page, 1); + } + + if (!(((max_size >= BTR_CUR_PAGE_REORGANIZE_LIMIT) + && (max_size >= new_rec_size)) + || (page_get_n_recs(page) <= 1))) { + + /* We may need to update the IBUF_BITMAP_FREE + bits after a reorganize that was done in + btr_cur_update_alloc_zip(). 
*/ + + /* There was not enough space, or it did not pay to + reorganize: for simplicity, we decide what to do assuming a + reorganization is needed, though it might not be necessary */ + + err = DB_OVERFLOW; + goto func_exit; + } + + /* Do lock checking and undo logging */ + err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets, + update, cmpl_info, + thr, mtr, &roll_ptr); + if (err != DB_SUCCESS) { + /* We may need to update the IBUF_BITMAP_FREE + bits after a reorganize that was done in + btr_cur_update_alloc_zip(). */ + goto func_exit; + } + + /* Ok, we may do the replacement. Store on the page infimum the + explicit locks on rec, before deleting rec (see the comment in + btr_cur_pessimistic_update). */ + if (index->has_locking()) { + lock_rec_store_on_page_infimum(block, rec); + } + + if (UNIV_UNLIKELY(update->is_metadata())) { + ut_ad(new_entry->is_metadata()); + ut_ad(index->is_instant()); + /* This can be innobase_add_instant_try() performing a + subsequent instant ADD COLUMN, or its rollback by + row_undo_mod_clust_low(). */ + ut_ad(flags & BTR_NO_LOCKING_FLAG); + } else { + btr_search_update_hash_on_delete(cursor); + } + + page_cur_delete_rec(page_cursor, *offsets, mtr); + + if (!page_cur_move_to_prev(page_cursor)) { + return DB_CORRUPTION; + } + + if (!(flags & BTR_KEEP_SYS_FLAG)) { + btr_cur_write_sys(new_entry, index, trx_id, roll_ptr); + } + + rec = btr_cur_insert_if_possible(cursor, new_entry, offsets, heap, + 0/*n_ext*/, mtr); + if (UNIV_UNLIKELY(!rec)) { + goto corrupted; + } + + if (UNIV_UNLIKELY(update->is_metadata())) { + /* We must empty the PAGE_FREE list, because if this + was a rollback, the shortened metadata record + would have too many fields, and we would be unable to + know the size of the freed record. */ + err = btr_page_reorganize(page_cursor, mtr); + if (err != DB_SUCCESS) { + goto func_exit; + } + } else { + /* Restore the old explicit lock state on the record */ + lock_rec_restore_from_page_infimum(*block, rec, + block->page.id()); + } + + ut_ad(err == DB_SUCCESS); + if (!page_cur_move_to_next(page_cursor)) { +corrupted: + err = DB_CORRUPTION; + } + +func_exit: + if (!(flags & BTR_KEEP_IBUF_BITMAP) + && !dict_index_is_clust(index)) { + /* Update the free bits in the insert buffer. */ + if (page_zip) { + ut_ad(!index->table->is_temporary()); + ibuf_update_free_bits_zip(block, mtr); + } else if (!index->table->is_temporary()) { + ibuf_update_free_bits_low(block, max_ins_size, mtr); + } + } + + if (err != DB_SUCCESS) { + /* prefetch siblings of the leaf for the pessimistic + operation. */ + btr_cur_prefetch_siblings(block, index); + } + + return(err); +} + +/*************************************************************//** +If, in a split, a new supremum record was created as the predecessor of the +updated record, the supremum record must inherit exactly the locks on the +updated record. In the split it may have inherited locks from the successor +of the updated record, which is not correct. This function restores the +right locks for the new supremum. 
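+
+An illustrative scenario (a sketch): the updated record r was moved by
+a page split and is now the first user record of page Q, whose left
+sibling is page P:
+
+	P:(infimum, ..., supremum)	Q:(infimum, r, ..., supremum)
+
+P's supremum is now the predecessor of r. During the split it may have
+inherited locks from r's old successor; below, the locks on the
+supremum are reset and re-inherited from r itself.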
*/ +static +dberr_t +btr_cur_pess_upd_restore_supremum( +/*==============================*/ + buf_block_t* block, /*!< in: buffer block of rec */ + const rec_t* rec, /*!< in: updated record */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t* page; + + page = buf_block_get_frame(block); + + if (page_rec_get_next(page_get_infimum_rec(page)) != rec) { + /* Updated record is not the first user record on its page */ + return DB_SUCCESS; + } + + const uint32_t prev_page_no = btr_page_get_prev(page); + + const page_id_t block_id{block->page.id()}; + const page_id_t prev_id(block_id.space(), prev_page_no); + dberr_t err; + buf_block_t* prev_block + = buf_page_get_gen(prev_id, 0, RW_NO_LATCH, nullptr, + BUF_PEEK_IF_IN_POOL, mtr, &err); + /* Since we already held an x-latch on prev_block, it must + be available and not be corrupted unless the buffer pool got + corrupted somehow. */ + if (UNIV_UNLIKELY(!prev_block)) { + return err; + } + ut_ad(!memcmp_aligned<4>(prev_block->page.frame + FIL_PAGE_NEXT, + block->page.frame + FIL_PAGE_OFFSET, 4)); + + /* We must already have an x-latch on prev_block! */ + ut_ad(mtr->memo_contains_flagged(prev_block, MTR_MEMO_PAGE_X_FIX)); + + lock_rec_reset_and_inherit_gap_locks(*prev_block, block_id, + PAGE_HEAP_NO_SUPREMUM, + page_rec_get_heap_no(rec)); + return DB_SUCCESS; +} + +/*************************************************************//** +Performs an update of a record on a page of a tree. It is assumed +that mtr holds an x-latch on the tree and on the cursor page. If the +update is made on the leaf level, to avoid deadlocks, mtr must also +own x-latches to brothers of page, if those brothers exist. We assume +here that the ordering fields of the record do not change. +@return DB_SUCCESS or error code */ +dberr_t +btr_cur_pessimistic_update( +/*=======================*/ + ulint flags, /*!< in: undo logging, locking, and rollback + flags */ + btr_cur_t* cursor, /*!< in/out: cursor on the record to update; + cursor may become invalid if *big_rec == NULL + || !(flags & BTR_KEEP_POS_FLAG) */ + rec_offs** offsets,/*!< out: offsets on cursor->page_cur.rec */ + mem_heap_t** offsets_heap, + /*!< in/out: pointer to memory heap + that can be emptied */ + mem_heap_t* entry_heap, + /*!< in/out: memory heap for allocating + big_rec and the index tuple */ + big_rec_t** big_rec,/*!< out: big rec vector whose fields have to + be stored externally by the caller */ + upd_t* update, /*!< in/out: update vector; this is allowed to + also contain trx id and roll ptr fields. + Non-updated columns that are moved offpage will + be appended to this. 
*/ + ulint cmpl_info,/*!< in: compiler info on secondary index + updates */ + que_thr_t* thr, /*!< in: query thread */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in/out: mini-transaction; must be + committed before latching any further pages */ +{ + big_rec_t* big_rec_vec = NULL; + big_rec_t* dummy_big_rec; + dict_index_t* index; + buf_block_t* block; + page_zip_des_t* page_zip; + rec_t* rec; + page_cur_t* page_cursor; + dberr_t err; + dberr_t optim_err; + roll_ptr_t roll_ptr; + bool was_first; + uint32_t n_reserved = 0; + + *offsets = NULL; + *big_rec = NULL; + + block = btr_cur_get_block(cursor); + page_zip = buf_block_get_page_zip(block); + index = cursor->index(); + ut_ad(index->has_locking()); + + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK | + MTR_MEMO_SX_LOCK)); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip + || page_zip_validate(page_zip, block->page.frame, index)); +#endif /* UNIV_ZIP_DEBUG */ + ut_ad(!page_zip || !index->table->is_temporary()); + /* The insert buffer tree should never be updated in place. */ + ut_ad(!dict_index_is_ibuf(index)); + ut_ad(trx_id > 0 || (flags & BTR_KEEP_SYS_FLAG) + || index->table->is_temporary()); + ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG) + || dict_index_is_clust(index)); + ut_ad(thr_get_trx(thr)->id == trx_id + || (flags & ulint(~BTR_KEEP_POS_FLAG)) + == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG + | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG)); + + err = optim_err = btr_cur_optimistic_update( + flags | BTR_KEEP_IBUF_BITMAP, + cursor, offsets, offsets_heap, update, + cmpl_info, thr, trx_id, mtr); + + switch (err) { + case DB_ZIP_OVERFLOW: + case DB_UNDERFLOW: + case DB_OVERFLOW: + break; + default: + err_exit: + /* We suppressed this with BTR_KEEP_IBUF_BITMAP. + For DB_ZIP_OVERFLOW, the IBUF_BITMAP_FREE bits were + already reset by btr_cur_update_alloc_zip() if the + page was recompressed. */ + if (page_zip + && optim_err != DB_ZIP_OVERFLOW + && !dict_index_is_clust(index) + && page_is_leaf(block->page.frame)) { + ut_ad(!index->table->is_temporary()); + ibuf_update_free_bits_zip(block, mtr); + } + + if (big_rec_vec != NULL) { + dtuple_big_rec_free(big_rec_vec); + } + + return(err); + } + + rec = btr_cur_get_rec(cursor); + ut_ad(rec_offs_validate(rec, index, *offsets)); + + dtuple_t* new_entry; + + const bool is_metadata = rec_is_metadata(rec, *index); + + if (UNIV_UNLIKELY(is_metadata)) { + ut_ad(update->is_metadata()); + ut_ad(flags & BTR_NO_LOCKING_FLAG); + ut_ad(index->is_instant()); + new_entry = row_metadata_to_tuple( + rec, index, *offsets, entry_heap, + update->info_bits, !thr_get_trx(thr)->in_rollback); + ut_ad(new_entry->n_fields + == ulint(index->n_fields) + + update->is_alter_metadata()); + } else { + new_entry = row_rec_to_index_entry(rec, index, *offsets, + entry_heap); + } + + /* The page containing the clustered index record + corresponding to new_entry is latched in mtr. If the + clustered index record is delete-marked, then its externally + stored fields cannot have been purged yet, because then the + purge would also have removed the clustered index record + itself. Thus the following call is safe. 
*/ + row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update, + entry_heap); + btr_cur_trim(new_entry, index, update, thr); + + /* We have to set appropriate extern storage bits in the new + record to be inserted: we have to remember which fields were such */ + + ut_ad(!page_is_comp(block->page.frame) || !rec_get_node_ptr_flag(rec)); + ut_ad(rec_offs_validate(rec, index, *offsets)); + + if ((flags & BTR_NO_UNDO_LOG_FLAG) + && rec_offs_any_extern(*offsets)) { + /* We are in a transaction rollback undoing a row + update: we must free possible externally stored fields + which got new values in the update, if they are not + inherited values. They can be inherited if we have + updated the primary key to another value, and then + update it back again. */ + + ut_ad(big_rec_vec == NULL); + ut_ad(dict_index_is_clust(index)); + ut_ad(thr_get_trx(thr)->in_rollback); + + DEBUG_SYNC_C("blob_rollback_middle"); + + btr_rec_free_updated_extern_fields( + index, rec, block, *offsets, update, true, mtr); + } + + ulint n_ext = index->is_primary() ? dtuple_get_n_ext(new_entry) : 0; + + if (page_zip_rec_needs_ext( + rec_get_converted_size(index, new_entry, n_ext), + page_is_comp(block->page.frame), + dict_index_get_n_fields(index), + block->zip_size()) + || (UNIV_UNLIKELY(update->is_alter_metadata()) + && !dfield_is_ext(dtuple_get_nth_field( + new_entry, + index->first_user_field())))) { + big_rec_vec = dtuple_convert_big_rec(index, update, new_entry, &n_ext); + if (UNIV_UNLIKELY(big_rec_vec == NULL)) { + + /* We cannot goto return_after_reservations, + because we may need to update the + IBUF_BITMAP_FREE bits, which was suppressed by + BTR_KEEP_IBUF_BITMAP. */ +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip + || page_zip_validate(page_zip, block->page.frame, + index)); +#endif /* UNIV_ZIP_DEBUG */ + index->table->space->release_free_extents(n_reserved); + err = DB_TOO_BIG_RECORD; + goto err_exit; + } + + ut_ad(page_is_leaf(block->page.frame)); + ut_ad(dict_index_is_clust(index)); + if (UNIV_UNLIKELY(!(flags & BTR_KEEP_POS_FLAG))) { + ut_ad(page_zip != NULL); + dtuple_convert_back_big_rec(index, new_entry, + big_rec_vec); + big_rec_vec = NULL; + n_ext = dtuple_get_n_ext(new_entry); + } + } + + /* Do lock checking and undo logging */ + err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets, + update, cmpl_info, + thr, mtr, &roll_ptr); + if (err != DB_SUCCESS) { + goto err_exit; + } + + if (optim_err == DB_OVERFLOW) { + /* First reserve enough free space for the file segments + of the index tree, so that the update will not fail because + of lack of space */ + + err = fsp_reserve_free_extents( + &n_reserved, index->table->space, + uint32_t(cursor->tree_height / 16 + 3), + flags & BTR_NO_UNDO_LOG_FLAG + ? FSP_CLEANING : FSP_NORMAL, + mtr); + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + err = DB_OUT_OF_FILE_SPACE; + goto err_exit; + } + } + + if (!(flags & BTR_KEEP_SYS_FLAG)) { + btr_cur_write_sys(new_entry, index, trx_id, roll_ptr); + } + + const ulint max_ins_size = page_zip + ? 0 + : page_get_max_insert_size_after_reorganize(block->page.frame, + 1); + + if (UNIV_UNLIKELY(is_metadata)) { + ut_ad(new_entry->is_metadata()); + ut_ad(index->is_instant()); + /* This can be innobase_add_instant_try() performing a + subsequent instant ALTER TABLE, or its rollback by + row_undo_mod_clust_low(). */ + ut_ad(flags & BTR_NO_LOCKING_FLAG); + } else { + btr_search_update_hash_on_delete(cursor); + + /* Store state of explicit locks on rec on the page + infimum record, before deleting rec. 
The page infimum + acts as a dummy carrier of the locks, taking care also + of lock releases, before we can move the locks back on + the actual record. There is a special case: if we are + inserting on the root page and the insert causes a + call of btr_root_raise_and_insert. Therefore we cannot + in the lock system delete the lock structs set on the + root page even if the root page carries just node + pointers. */ + lock_rec_store_on_page_infimum(block, rec); + } + +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip + || page_zip_validate(page_zip, block->page.frame, index)); +#endif /* UNIV_ZIP_DEBUG */ + page_cursor = btr_cur_get_page_cur(cursor); + + page_cur_delete_rec(page_cursor, *offsets, mtr); + + if (!page_cur_move_to_prev(page_cursor)) { + err = DB_CORRUPTION; + goto return_after_reservations; + } + + rec = btr_cur_insert_if_possible(cursor, new_entry, + offsets, offsets_heap, n_ext, mtr); + + if (rec) { + page_cursor->rec = rec; + + if (UNIV_UNLIKELY(is_metadata)) { + /* We must empty the PAGE_FREE list, because if this + was a rollback, the shortened metadata record + would have too many fields, and we would be unable to + know the size of the freed record. */ + err = btr_page_reorganize(page_cursor, mtr); + if (err != DB_SUCCESS) { + goto return_after_reservations; + } + rec = page_cursor->rec; + rec_offs_make_valid(rec, index, true, *offsets); + if (page_cursor->block->page.id().page_no() + == index->page) { + btr_set_instant(page_cursor->block, *index, + mtr); + } + } else { + lock_rec_restore_from_page_infimum( + *btr_cur_get_block(cursor), rec, + block->page.id()); + } + + if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets)) + || rec_is_alter_metadata(rec, *index)) { + /* The new inserted record owns its possible externally + stored fields */ + btr_cur_unmark_extern_fields(btr_cur_get_block(cursor), + rec, index, *offsets, mtr); + } else { + /* In delete-marked records, DB_TRX_ID must + always refer to an existing undo log record. */ + ut_ad(row_get_rec_trx_id(rec, index, *offsets)); + } + + bool adjust = big_rec_vec && (flags & BTR_KEEP_POS_FLAG); + ut_ad(!adjust || page_is_leaf(block->page.frame)); + + if (btr_cur_compress_if_useful(cursor, adjust, mtr)) { + if (adjust) { + rec_offs_make_valid(page_cursor->rec, index, + true, *offsets); + } + } else if (!dict_index_is_clust(index) + && page_is_leaf(block->page.frame)) { + /* Update the free bits in the insert buffer. + This is the same block which was skipped by + BTR_KEEP_IBUF_BITMAP. */ + if (page_zip) { + ut_ad(!index->table->is_temporary()); + ibuf_update_free_bits_zip(block, mtr); + } else if (!index->table->is_temporary()) { + ibuf_update_free_bits_low(block, max_ins_size, + mtr); + } + } + +#if 0 // FIXME: this used to be a no-op, and will cause trouble if enabled + if (!big_rec_vec + && page_is_leaf(block->page.frame) + && !dict_index_is_online_ddl(index)) { + mtr->release(index->lock); + /* NOTE: We cannot release root block latch here, because it + has segment header and already modified in most of cases.*/ + } +#endif + + err = DB_SUCCESS; + goto return_after_reservations; + } else { + /* If the page is compressed and it initially + compresses very well, and there is a subsequent insert + of a badly-compressing record, it is possible for + btr_cur_optimistic_update() to return DB_UNDERFLOW and + btr_cur_insert_if_possible() to return FALSE. */ + ut_a(page_zip || optim_err != DB_UNDERFLOW); + + /* Out of space: reset the free bits. + This is the same block which was skipped by + BTR_KEEP_IBUF_BITMAP. 
*/ + if (!dict_index_is_clust(index) + && !index->table->is_temporary() + && page_is_leaf(block->page.frame)) { + ibuf_reset_free_bits(block); + } + } + + if (big_rec_vec != NULL) { + ut_ad(page_is_leaf(block->page.frame)); + ut_ad(dict_index_is_clust(index)); + ut_ad(flags & BTR_KEEP_POS_FLAG); + + /* btr_page_split_and_insert() in + btr_cur_pessimistic_insert() invokes + mtr->release(index->lock). + We must keep the index->lock when we created a + big_rec, so that row_upd_clust_rec() can store the + big_rec in the same mini-transaction. */ + + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + mtr_sx_lock_index(index, mtr); + } + + /* Was the record to be updated positioned as the first user + record on its page? */ + was_first = page_cur_is_before_first(page_cursor); + + /* Lock checks and undo logging were already performed by + btr_cur_upd_lock_and_undo(). We do not try + btr_cur_optimistic_insert() because + btr_cur_insert_if_possible() already failed above. */ + + err = btr_cur_pessimistic_insert(BTR_NO_UNDO_LOG_FLAG + | BTR_NO_LOCKING_FLAG + | BTR_KEEP_SYS_FLAG, + cursor, offsets, offsets_heap, + new_entry, &rec, + &dummy_big_rec, n_ext, NULL, mtr); + ut_a(err == DB_SUCCESS); + ut_a(rec); + ut_a(dummy_big_rec == NULL); + ut_ad(rec_offs_validate(rec, cursor->index(), *offsets)); + page_cursor->rec = rec; + + /* Multiple transactions cannot simultaneously operate on the + same temp-table in parallel. + max_trx_id is ignored for temp tables because it not required + for MVCC. */ + if (dict_index_is_sec_or_ibuf(index) + && !index->table->is_temporary()) { + /* Update PAGE_MAX_TRX_ID in the index page header. + It was not updated by btr_cur_pessimistic_insert() + because of BTR_NO_LOCKING_FLAG. */ + page_update_max_trx_id(btr_cur_get_block(cursor), + btr_cur_get_page_zip(cursor), + trx_id, mtr); + } + + if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) { + /* The new inserted record owns its possible externally + stored fields */ +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip + || page_zip_validate(page_zip, block->page.frame, index)); +#endif /* UNIV_ZIP_DEBUG */ + btr_cur_unmark_extern_fields(btr_cur_get_block(cursor), rec, + index, *offsets, mtr); + } else { + /* In delete-marked records, DB_TRX_ID must + always refer to an existing undo log record. */ + ut_ad(row_get_rec_trx_id(rec, index, *offsets)); + } + + if (UNIV_UNLIKELY(is_metadata)) { + /* We must empty the PAGE_FREE list, because if this + was a rollback, the shortened metadata record + would have too many fields, and we would be unable to + know the size of the freed record. */ + err = btr_page_reorganize(page_cursor, mtr); + if (err != DB_SUCCESS) { + goto return_after_reservations; + } + rec = page_cursor->rec; + } else { + lock_rec_restore_from_page_infimum( + *btr_cur_get_block(cursor), rec, block->page.id()); + } + + /* If necessary, restore also the correct lock state for a new, + preceding supremum record created in a page split. While the old + record was nonexistent, the supremum might have inherited its locks + from a wrong record. 
*/
+
+	if (!was_first) {
+		err = btr_cur_pess_upd_restore_supremum(
+			btr_cur_get_block(cursor), rec, mtr);
+	}
+
+return_after_reservations:
+#ifdef UNIV_ZIP_DEBUG
+	ut_a(err ||
+	     !page_zip || page_zip_validate(btr_cur_get_page_zip(cursor),
+					    btr_cur_get_page(cursor), index));
+#endif /* UNIV_ZIP_DEBUG */
+
+	index->table->space->release_free_extents(n_reserved);
+	*big_rec = big_rec_vec;
+	return(err);
+}
+
+/*==================== B-TREE DELETE MARK AND UNMARK ===============*/
+
+/** Modify the delete-mark flag of a record.
+@tparam	flag	the value of the delete-mark flag
+@param[in,out]	block	buffer block
+@param[in,out]	rec	record on a physical index page
+@param[in,out]	mtr	mini-transaction */
+template<bool flag>
+void btr_rec_set_deleted(buf_block_t *block, rec_t *rec, mtr_t *mtr)
+{
+  if (page_rec_is_comp(rec))
+  {
+    byte *b= &rec[-REC_NEW_INFO_BITS];
+    const byte v= flag
+      ? (*b | REC_INFO_DELETED_FLAG)
+      : (*b & byte(~REC_INFO_DELETED_FLAG));
+    if (*b == v);
+    else if (UNIV_LIKELY_NULL(block->page.zip.data))
+    {
+      *b= v;
+      page_zip_rec_set_deleted(block, rec, flag, mtr);
+    }
+    else
+      mtr->write<1>(*block, b, v);
+  }
+  else
+  {
+    ut_ad(!block->page.zip.data);
+    byte *b= &rec[-REC_OLD_INFO_BITS];
+    const byte v = flag
+      ? (*b | REC_INFO_DELETED_FLAG)
+      : (*b & byte(~REC_INFO_DELETED_FLAG));
+    mtr->write<1,mtr_t::MAYBE_NOP>(*block, b, v);
+  }
+}
+
+template void btr_rec_set_deleted<false>(buf_block_t *, rec_t *, mtr_t *);
+template void btr_rec_set_deleted<true>(buf_block_t *, rec_t *, mtr_t *);
+
+/***********************************************************//**
+Marks a clustered index record deleted. Writes an undo log record to
+undo log on this delete marking. Writes in the trx id field the id
+of the deleting transaction, and in the roll ptr field pointer to the
+undo log record created.
+@return DB_SUCCESS, DB_LOCK_WAIT, or error number */
+dberr_t
+btr_cur_del_mark_set_clust_rec(
+/*===========================*/
+	buf_block_t*	block,	/*!< in/out: buffer block of the record */
+	rec_t*		rec,	/*!< in/out: record */
+	dict_index_t*	index,	/*!< in: clustered index of the record */
+	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec) */
+	que_thr_t*	thr,	/*!< in: query thread */
+	const dtuple_t*	entry,	/*!< in: dtuple for the deleting record, also
+				contains the virtual cols if there are any */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+{
+	roll_ptr_t	roll_ptr;
+	dberr_t		err;
+	trx_t*		trx;
+
+	ut_ad(dict_index_is_clust(index));
+	ut_ad(rec_offs_validate(rec, index, offsets));
+	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
+	ut_ad(buf_block_get_frame(block) == page_align(rec));
+	ut_ad(page_rec_is_leaf(rec));
+	ut_ad(mtr->is_named_space(index->table->space));
+
+	if (rec_get_deleted_flag(rec, rec_offs_comp(offsets))) {
+		/* We may already have delete-marked this record
+		when executing an ON DELETE CASCADE operation. */
+		ut_ad(row_get_rec_trx_id(rec, index, offsets)
+		      == thr_get_trx(thr)->id);
+		return(DB_SUCCESS);
+	}
+
+	err = trx_undo_report_row_operation(thr, index,
+					    entry, NULL, 0, rec, offsets,
+					    &roll_ptr);
+	if (err != DB_SUCCESS) {
+
+		return(err);
+	}
+
+	/* The search latch is not needed here, because
+	the adaptive hash index does not depend on the delete-mark
+	and the delete-mark is being updated in place.
*/
+
+	btr_rec_set_deleted<true>(block, rec, mtr);
+
+	trx = thr_get_trx(thr);
+
+	DBUG_LOG("ib_cur",
+		 "delete-mark clust " << index->table->name
+		 << " (" << index->id << ") by "
+		 << ib::hex(trx->id) << ": "
+		 << rec_printer(rec, offsets).str());
+
+	return btr_cur_upd_rec_sys(block, rec, index, offsets, trx, roll_ptr,
+				   mtr);
+}
+
+/*==================== B-TREE RECORD REMOVE =========================*/
+
+/*************************************************************//**
+Tries to compress a page of the tree if it seems useful. It is assumed
+that mtr holds an x-latch on the tree and on the cursor page. To avoid
+deadlocks, mtr must also own x-latches to brothers of page, if those
+brothers exist. NOTE: it is assumed that the caller has reserved enough
+free extents so that the compression will always succeed if done!
+@return whether compression occurred */
+bool
+btr_cur_compress_if_useful(
+/*=======================*/
+	btr_cur_t*	cursor,	/*!< in/out: cursor on the page to compress;
+				cursor does not stay valid if !adjust and
+				compression occurs */
+	bool		adjust,	/*!< in: whether the cursor position should be
+				adjusted even when compression occurs */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+{
+	ut_ad(mtr->memo_contains_flagged(&cursor->index()->lock,
+					 MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK));
+	ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor),
+					 MTR_MEMO_PAGE_X_FIX));
+
+	if (cursor->index()->is_spatial()) {
+		const trx_t*	trx = cursor->rtr_info->thr
+			? thr_get_trx(cursor->rtr_info->thr)
+			: NULL;
+		const buf_block_t*	block = btr_cur_get_block(cursor);
+
+		/* Check whether page lock prevents the compression */
+		if (!lock_test_prdt_page_lock(trx, block->page.id())) {
+			return(false);
+		}
+	}
+
+	return btr_cur_compress_recommendation(cursor, mtr)
+		&& btr_compress(cursor, adjust, mtr) == DB_SUCCESS;
+}
+
+/*******************************************************//**
+Removes the record on which the tree cursor is positioned on a leaf page.
+It is assumed that the mtr has an x-latch on the page where the cursor is
+positioned, but no latch on the whole tree.
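+
+An illustrative caller sketch (real callers, such as the purge
+subsystem, differ in details):
+
+	if (btr_cur_optimistic_delete(cursor, 0, mtr) == DB_FAIL) {
+		mtr_commit(mtr);
+		fall_back_to_btr_cur_pessimistic_delete();
+	}
+
+where the fallback helper name is hypothetical: the pessimistic path
+must re-latch the tree and re-position the cursor first.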
+@return error code +@retval DB_FAIL if the page would become too empty */ +dberr_t +btr_cur_optimistic_delete( +/*======================*/ + btr_cur_t* cursor, /*!< in: cursor on leaf page, on the record to + delete; cursor stays valid: if deletion + succeeds, on function exit it points to the + successor of the deleted record */ + ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */ + mtr_t* mtr) /*!< in: mtr; if this function returns + TRUE on a leaf page of a secondary + index, the mtr must be committed + before latching any further pages */ +{ + buf_block_t* block; + rec_t* rec; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(flags == 0 || flags == BTR_CREATE_FLAG); + ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor), + MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr->is_named_space(cursor->index()->table->space)); + ut_ad(!cursor->index()->is_dummy); + + /* This is intended only for leaf page deletions */ + + block = btr_cur_get_block(cursor); + + ut_ad(block->page.id().space() == cursor->index()->table->space->id); + ut_ad(page_is_leaf(buf_block_get_frame(block))); + ut_ad(!dict_index_is_online_ddl(cursor->index()) + || cursor->index()->is_clust() + || (flags & BTR_CREATE_FLAG)); + + rec = btr_cur_get_rec(cursor); + + offsets = rec_get_offsets(rec, cursor->index(), offsets, + cursor->index()->n_core_fields, + ULINT_UNDEFINED, &heap); + + dberr_t err = DB_SUCCESS; + if (rec_offs_any_extern(offsets) + || !btr_cur_can_delete_without_compress(cursor, + rec_offs_size(offsets), + mtr)) { + /* prefetch siblings of the leaf for the pessimistic + operation. */ + btr_cur_prefetch_siblings(block, cursor->index()); + err = DB_FAIL; + goto func_exit; + } + + if (UNIV_UNLIKELY(block->page.id().page_no() == cursor->index()->page + && page_get_n_recs(block->page.frame) == 1 + + (cursor->index()->is_instant() + && !rec_is_metadata(rec, *cursor->index())) + && !cursor->index() + ->must_avoid_clear_instant_add())) { + /* The whole index (and table) becomes logically empty. + Empty the whole page. That is, if we are deleting the + only user record, also delete the metadata record + if one exists for instant ADD COLUMN (not generic ALTER TABLE). + If we are deleting the metadata record and the + table becomes empty, clean up the whole page. */ + dict_index_t* index = cursor->index(); + const rec_t* first_rec = page_rec_get_next_const( + page_get_infimum_rec(block->page.frame)); + if (UNIV_UNLIKELY(!first_rec)) { + err = DB_CORRUPTION; + goto func_exit; + } + ut_ad(!index->is_instant() + || rec_is_metadata(first_rec, *index)); + const bool is_metadata = rec_is_metadata(rec, *index); + /* We can remove the metadata when rolling back an + instant ALTER TABLE operation, or when deleting the + last user record on the page such that only metadata for + instant ADD COLUMN (not generic ALTER TABLE) remains. */ + const bool empty_table = is_metadata + || !index->is_instant() + || (first_rec != rec + && rec_is_add_metadata(first_rec, *index)); + if (UNIV_LIKELY(empty_table)) { + if (UNIV_LIKELY(!is_metadata && !flags)) { + lock_update_delete(block, rec); + } + btr_page_empty(block, buf_block_get_page_zip(block), + index, 0, mtr); + if (index->is_instant()) { + /* MDEV-17383: free metadata BLOBs! 
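+				The page was emptied above, so no
+				record refers to the metadata any
+				more, and the in-memory index can
+				revert to the non-instant format.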
*/ + index->clear_instant_alter(); + } + + page_cur_set_after_last(block, + btr_cur_get_page_cur(cursor)); + goto func_exit; + } + } + + { + page_t* page = buf_block_get_frame(block); + page_zip_des_t* page_zip= buf_block_get_page_zip(block); + + if (UNIV_UNLIKELY(rec_get_info_bits(rec, page_rec_is_comp(rec)) + & REC_INFO_MIN_REC_FLAG)) { + /* This should be rolling back instant ADD COLUMN. + If this is a recovered transaction, then + index->is_instant() will hold until the + insert into SYS_COLUMNS is rolled back. */ + ut_ad(cursor->index()->table->supports_instant()); + ut_ad(cursor->index()->is_primary()); + ut_ad(!page_zip); + page_cur_delete_rec(btr_cur_get_page_cur(cursor), + offsets, mtr); + /* We must empty the PAGE_FREE list, because + after rollback, this deleted metadata record + would have too many fields, and we would be + unable to know the size of the freed record. */ + err = btr_page_reorganize(btr_cur_get_page_cur(cursor), + mtr); + goto func_exit; + } else { + if (!flags) { + lock_update_delete(block, rec); + } + + btr_search_update_hash_on_delete(cursor); + } + + if (page_zip) { +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, + cursor->index())); +#endif /* UNIV_ZIP_DEBUG */ + page_cur_delete_rec(btr_cur_get_page_cur(cursor), + offsets, mtr); +#ifdef UNIV_ZIP_DEBUG + ut_a(page_zip_validate(page_zip, page, + cursor->index())); +#endif /* UNIV_ZIP_DEBUG */ + + /* On compressed pages, the IBUF_BITMAP_FREE + space is not affected by deleting (purging) + records, because it is defined as the minimum + of space available *without* reorganize, and + space available in the modification log. */ + } else { + const ulint max_ins + = page_get_max_insert_size_after_reorganize( + page, 1); + + page_cur_delete_rec(btr_cur_get_page_cur(cursor), + offsets, mtr); + + /* The change buffer does not handle inserts + into non-leaf pages, into clustered indexes, + or into the change buffer. */ + if (!cursor->index()->is_clust() + && !cursor->index()->table->is_temporary() + && !dict_index_is_ibuf(cursor->index())) { + ibuf_update_free_bits_low(block, max_ins, mtr); + } + } + } + +func_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + return err; +} + +/*************************************************************//** +Removes the record on which the tree cursor is positioned. Tries +to compress the page if its fillfactor drops below a threshold +or if it is the only page on the level. It is assumed that mtr holds +an x-latch on the tree and on the cursor page. To avoid deadlocks, +mtr must also own x-latches to brothers of page, if those brothers +exist. +@return TRUE if compression occurred and FALSE if not or something +wrong. */ +ibool +btr_cur_pessimistic_delete( +/*=======================*/ + dberr_t* err, /*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE; + the latter may occur because we may have + to update node pointers on upper levels, + and in the case of variable length keys + these may actually grow in size */ + ibool has_reserved_extents, /*!< in: TRUE if the + caller has already reserved enough free + extents so that he knows that the operation + will succeed */ + btr_cur_t* cursor, /*!< in: cursor on the record to delete; + if compression does not occur, the cursor + stays valid: it points to successor of + deleted record on function exit */ + ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */ + bool rollback,/*!< in: performing rollback? 
*/ + mtr_t* mtr) /*!< in: mtr */ +{ + buf_block_t* block; + page_t* page; + page_zip_des_t* page_zip; + dict_index_t* index; + rec_t* rec; + uint32_t n_reserved = 0; + ibool ret = FALSE; + mem_heap_t* heap; + rec_offs* offsets; +#ifdef UNIV_DEBUG + bool parent_latched = false; +#endif /* UNIV_DEBUG */ + + block = btr_cur_get_block(cursor); + page = buf_block_get_frame(block); + index = btr_cur_get_index(cursor); + + ut_ad(flags == 0 || flags == BTR_CREATE_FLAG); + ut_ad(!dict_index_is_online_ddl(index) + || dict_index_is_clust(index) + || (flags & BTR_CREATE_FLAG)); + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(mtr->is_named_space(index->table->space)); + ut_ad(!index->is_dummy); + ut_ad(block->page.id().space() == index->table->space->id); + + if (!has_reserved_extents) { + /* First reserve enough free space for the file segments + of the index tree, so that the node pointer updates will + not fail because of lack of space */ + + uint32_t n_extents = uint32_t(cursor->tree_height / 32 + 1); + + *err = fsp_reserve_free_extents(&n_reserved, + index->table->space, + n_extents, + FSP_CLEANING, mtr); + if (UNIV_UNLIKELY(*err != DB_SUCCESS)) { + return(FALSE); + } + } + + heap = mem_heap_create(1024); + rec = btr_cur_get_rec(cursor); + page_zip = buf_block_get_page_zip(block); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + offsets = rec_get_offsets(rec, index, NULL, page_is_leaf(page) + ? index->n_core_fields : 0, + ULINT_UNDEFINED, &heap); + + if (rec_offs_any_extern(offsets)) { + btr_rec_free_externally_stored_fields(index, + rec, offsets, block, + rollback, mtr); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + } + + rec_t* next_rec = NULL; + bool min_mark_next_rec = false; + + if (page_is_leaf(page)) { + const bool is_metadata = rec_is_metadata( + rec, page_rec_is_comp(rec)); + if (UNIV_UNLIKELY(is_metadata)) { + /* This should be rolling back instant ALTER TABLE. + If this is a recovered transaction, then + index->is_instant() will hold until the + insert into SYS_COLUMNS is rolled back. */ + ut_ad(rollback); + ut_ad(index->table->supports_instant()); + ut_ad(index->is_primary()); + } else if (flags == 0) { + lock_update_delete(block, rec); + } + + if (block->page.id().page_no() != index->page) { + if (page_get_n_recs(page) < 2) { + goto discard_page; + } + } else if (page_get_n_recs(page) == 1 + + (index->is_instant() && !is_metadata) + && !index->must_avoid_clear_instant_add()) { + /* The whole index (and table) becomes logically empty. + Empty the whole page. That is, if we are deleting the + only user record, also delete the metadata record + if one exists for instant ADD COLUMN + (not generic ALTER TABLE). + If we are deleting the metadata record + (in the rollback of instant ALTER TABLE) and the + table becomes empty, clean up the whole page. */ + + const rec_t* first_rec = page_rec_get_next_const( + page_get_infimum_rec(page)); + if (UNIV_UNLIKELY(!first_rec)) { + *err = DB_CORRUPTION; + goto err_exit; + } + ut_ad(!index->is_instant() + || rec_is_metadata(first_rec, *index)); + if (is_metadata || !index->is_instant() + || (first_rec != rec + && rec_is_add_metadata(first_rec, *index))) { + btr_page_empty(block, page_zip, index, 0, mtr); + if (index->is_instant()) { + /* MDEV-17383: free metadata BLOBs! 
*/ + index->clear_instant_alter(); + } + + page_cur_set_after_last( + block, + btr_cur_get_page_cur(cursor)); + ret = TRUE; + goto return_after_reservations; + } + } + + if (UNIV_LIKELY(!is_metadata)) { + btr_search_update_hash_on_delete(cursor); + } else { + page_cur_delete_rec(btr_cur_get_page_cur(cursor), + offsets, mtr); + /* We must empty the PAGE_FREE list, because + after rollback, this deleted metadata record + would carry too many fields, and we would be + unable to know the size of the freed record. */ + *err = btr_page_reorganize(btr_cur_get_page_cur(cursor), + mtr); + ut_ad(!ret); + goto err_exit; + } + } else if (UNIV_UNLIKELY(page_rec_is_first(rec, page))) { + if (page_rec_is_last(rec, page)) { +discard_page: + ut_ad(page_get_n_recs(page) == 1); + /* If there is only one record, drop + the whole page. */ + + btr_discard_page(cursor, mtr); + + ret = TRUE; + goto return_after_reservations; + } + + if (UNIV_UNLIKELY(!(next_rec = page_rec_get_next(rec)))) { + ut_ad(!ret); + *err = DB_CORRUPTION; + goto err_exit; + } + + btr_cur_t cursor; + cursor.page_cur.index = index; + cursor.page_cur.block = block; + + if (!page_has_prev(page)) { + /* If we delete the leftmost node pointer on a + non-leaf level, we must mark the new leftmost node + pointer as the predefined minimum record */ + + min_mark_next_rec = true; + } else if (index->is_spatial()) { + /* For rtree, if delete the leftmost node pointer, + we need to update parent page. */ + rtr_mbr_t father_mbr; + rec_t* father_rec; + rec_offs* offsets; + ulint len; + + rtr_page_get_father_block(NULL, heap, mtr, NULL, + &cursor); + father_rec = btr_cur_get_rec(&cursor); + offsets = rec_get_offsets(father_rec, index, NULL, + 0, ULINT_UNDEFINED, &heap); + + rtr_read_mbr(rec_get_nth_field( + father_rec, offsets, 0, &len), &father_mbr); + + rtr_update_mbr_field(&cursor, offsets, NULL, + page, &father_mbr, next_rec, mtr); + ut_d(parent_latched = true); + } else { + /* Otherwise, if we delete the leftmost node pointer + on a page, we have to change the parent node pointer + so that it is equal to the new leftmost node pointer + on the page */ + ret = btr_page_get_father(mtr, &cursor); + if (!ret) { + *err = DB_CORRUPTION; + goto err_exit; + } + *err = btr_cur_node_ptr_delete(&cursor, mtr); + if (*err != DB_SUCCESS) { +got_err: + ret = FALSE; + goto err_exit; + } + + const ulint level = btr_page_get_level(page); + // FIXME: reuse the node_ptr from above + dtuple_t* node_ptr = dict_index_build_node_ptr( + index, next_rec, block->page.id().page_no(), + heap, level); + + *err = btr_insert_on_non_leaf_level( + flags, index, level + 1, node_ptr, mtr); + if (*err != DB_SUCCESS) { + ret = FALSE; + goto got_err; + } + + ut_d(parent_latched = true); + } + } + + /* SPATIAL INDEX never use U locks; we can allow page merges + while holding X lock on the spatial index tree. + Do not allow merges of non-leaf B-tree pages unless it is + safe to do so. 
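+	Here, "safe" means that btr_cur_will_modify_tree() confirms
+	that a tree-structure change was foreseen when the latches were
+	acquired; if not, the merge is skipped with a warning (see the
+	MDEV-14637 reference below).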
*/ + { + const bool allow_merge = page_is_leaf(page) + || dict_index_is_spatial(index) + || btr_cur_will_modify_tree( + index, page, BTR_INTENTION_DELETE, rec, + btr_node_ptr_max_size(index), + block->zip_size(), mtr); + page_cur_delete_rec(btr_cur_get_page_cur(cursor), + offsets, mtr); + + if (min_mark_next_rec) { + btr_set_min_rec_mark(next_rec, *block, mtr); + } + +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + ut_ad(!parent_latched + || btr_check_node_ptr(index, block, mtr)); + + if (!ret && btr_cur_compress_recommendation(cursor, mtr)) { + if (UNIV_LIKELY(allow_merge)) { + ret = btr_cur_compress_if_useful( + cursor, FALSE, mtr); + } else { + ib::warn() << "Not merging page " + << block->page.id() + << " in index " << index->name + << " of " << index->table->name; + ut_ad("MDEV-14637" == 0); + } + } + } + +return_after_reservations: + *err = DB_SUCCESS; +err_exit: + mem_heap_free(heap); + +#if 0 // FIXME: this used to be a no-op, and will cause trouble if enabled + if (page_is_leaf(page) + && !dict_index_is_online_ddl(index)) { + mtr->release(index->lock); + /* NOTE: We cannot release root block latch here, because it + has segment header and already modified in most of cases.*/ + } +#endif + + index->table->space->release_free_extents(n_reserved); + return(ret); +} + +/** Delete the node pointer in a parent page. +@param[in,out] parent cursor pointing to parent record +@param[in,out] mtr mini-transaction */ +dberr_t btr_cur_node_ptr_delete(btr_cur_t* parent, mtr_t* mtr) +{ + ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(parent), + MTR_MEMO_PAGE_X_FIX)); + dberr_t err; + ibool compressed = btr_cur_pessimistic_delete(&err, TRUE, parent, + BTR_CREATE_FLAG, false, + mtr); + if (err == DB_SUCCESS && !compressed) { + btr_cur_compress_if_useful(parent, FALSE, mtr); + } + + return err; +} + +/** Represents the cursor for the number of rows estimation. The +content is used for level-by-level diving and estimation the number of rows +on each level. */ +class btr_est_cur_t +{ + /* Assume a page like: + records: (inf, a, b, c, d, sup) + index of the record: 0, 1, 2, 3, 4, 5 + */ + + /** Index of the record where the page cursor stopped on this level + (index in alphabetical order). In the above example, if the search stopped on + record 'c', then nth_rec will be 3. */ + ulint m_nth_rec; + + /** Number of the records on the page, not counting inf and sup. + In the above example n_recs will be 4. 
*/ + ulint m_n_recs; + + /** Search tuple */ + const dtuple_t &m_tuple; + /** Cursor search mode */ + page_cur_mode_t m_mode; + /** Page cursor which is used for search */ + page_cur_t m_page_cur; + /** Page id of the page to get on level down, can differ from + m_block->page.id at the moment when the child's page id is already found, but + the child's block has not fetched yet */ + page_id_t m_page_id; + /** Current block */ + buf_block_t *m_block; + /** Page search mode, can differ from m_mode for non-leaf pages, see c-tor + comments for details */ + page_cur_mode_t m_page_mode; + + /** Matched fields and bytes which are used for on-page search, see + btr_cur_t::(up|low)_(match|bytes) comments for details */ + ulint m_up_match= 0; + ulint m_up_bytes= 0; + ulint m_low_match= 0; + ulint m_low_bytes= 0; + +public: + btr_est_cur_t(dict_index_t *index, const dtuple_t &tuple, + page_cur_mode_t mode) + : m_tuple(tuple), m_mode(mode), + m_page_id(index->table->space_id, index->page), m_block(nullptr) + { + + ut_ad(dict_index_check_search_tuple(index, &tuple)); + ut_ad(dtuple_check_typed(&tuple)); + + m_page_cur.index = index; + /* We use these modified search modes on non-leaf levels of the B-tree. + These let us end up in the right B-tree leaf. In that leaf we use the + original search mode. */ + switch (mode) { + case PAGE_CUR_GE: + m_page_mode= PAGE_CUR_L; + break; + case PAGE_CUR_G: + m_page_mode= PAGE_CUR_LE; + break; + default: +#ifdef PAGE_CUR_LE_OR_EXTENDS + ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE || + mode == PAGE_CUR_LE_OR_EXTENDS); +#else /* PAGE_CUR_LE_OR_EXTENDS */ + ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE); +#endif /* PAGE_CUR_LE_OR_EXTENDS */ + m_page_mode= mode; + break; + } + } + + /** Retrieve block with m_page_id, release the previously gotten block + if necessary. If this is a left border block cursor and both left and right + border blocks have the same parent, don't unlatch the parent, as it must be + latched to get the right block, and will be unlatched after the right block + is fetched. + @param level distance from the leaf page level; ULINT_UNDEFINED when + fetching the root page + @param mtr mtr + @param right_parent right border block parent, nullptr if the function + is called for the right block itself + @return true on success or false otherwise. */ + bool fetch_child(ulint level, mtr_t &mtr, const buf_block_t *right_parent) + { + buf_block_t *parent_block= m_block; + + m_block= btr_block_get(*index(), m_page_id.page_no(), RW_S_LATCH, !level, + &mtr, nullptr); + if (!m_block) + return false; + + if (parent_block && parent_block != right_parent) + { + ut_ad(mtr.get_savepoint() >= 2); + mtr.rollback_to_savepoint(1, 2); + } + + return level == ULINT_UNDEFINED || + btr_page_get_level(m_block->page.frame) == level; + } + + /** Sets page mode for leaves */ + void set_page_mode_for_leaves() { m_page_mode= m_mode; } + + /** Does search on the current page. If there is no border in m_tuple, then + just move the cursor to the most left or right record. + @param level current level on tree. + @param root_height root height + @param left true if this is left border, false otherwise. + @return true on success, false otherwise. 
*/ + bool search_on_page(ulint level, ulint root_height, bool left) + { + if (level != btr_page_get_level(m_block->page.frame)) + return false; + + m_n_recs= page_get_n_recs(m_block->page.frame); + + if (dtuple_get_n_fields(&m_tuple) > 0) + { + m_up_bytes= m_low_bytes= 0; + m_page_cur.block= m_block; + if (page_cur_search_with_match(&m_tuple, m_page_mode, + &m_up_match, &m_low_match, &m_page_cur, + nullptr)) + return false; + m_nth_rec= page_rec_get_n_recs_before(page_cur_get_rec(&m_page_cur)); + } + else if (left) + { + page_cur_set_before_first(m_block, &m_page_cur); + if (level) + { + if (!page_cur_move_to_next(&m_page_cur)) + return false; + m_nth_rec= 1; + } + else + m_nth_rec= 0; + } + else + { + m_nth_rec= m_n_recs; + if (!level) + { + page_cur_set_after_last(m_block, &m_page_cur); + ++m_nth_rec; + } + else + { + m_page_cur.block= m_block; + m_page_cur.rec= page_rec_get_nth(m_block->page.frame, m_nth_rec); + } + } + + return true; + } + + /** Read page id of the current record child. + @param offsets offsets array. + @param heap heap for offsets array */ + void read_child_page_id(rec_offs **offsets, mem_heap_t **heap) + { + const rec_t *node_ptr= page_cur_get_rec(&m_page_cur); + + /* FIXME: get the child page number directly without computing offsets */ + *offsets= rec_get_offsets(node_ptr, index(), *offsets, 0, ULINT_UNDEFINED, + heap); + + /* Go to the child node */ + m_page_id.set_page_no(btr_node_ptr_get_child_page_no(node_ptr, *offsets)); + } + + /** @return true if left border should be counted */ + bool should_count_the_left_border() const + { + if (dtuple_get_n_fields(&m_tuple) > 0) + { + ut_ad(!page_rec_is_infimum(page_cur_get_rec(&m_page_cur))); + return !page_rec_is_supremum(page_cur_get_rec(&m_page_cur)); + } + ut_ad(page_rec_is_infimum(page_cur_get_rec(&m_page_cur))); + return false; + } + + /** @return true if right border should be counted */ + bool should_count_the_right_border() const + { + if (dtuple_get_n_fields(&m_tuple) > 0) + { + const rec_t *rec= page_cur_get_rec(&m_page_cur); + ut_ad(!(m_mode == PAGE_CUR_L && page_rec_is_supremum(rec))); + + return (m_mode == PAGE_CUR_LE /* if the range is '<=' */ + /* and the record was found */ + && m_low_match >= dtuple_get_n_fields(&m_tuple)) || + (m_mode == PAGE_CUR_L /* or if the range is '<' */ + /* and there are any records to match the criteria, i.e. if the + minimum record on the tree is 5 and x < 7 is specified then the + cursor will be positioned at 5 and we should count the border, + but if x < 2 is specified, then the cursor will be positioned at + 'inf' and we should not count the border */ + && !page_rec_is_infimum(rec)); + /* Notice that for "WHERE col <= 'foo'" the server passes to + ha_innobase::records_in_range(): min_key=NULL (left-unbounded) which is + expected max_key='foo' flag=HA_READ_AFTER_KEY (PAGE_CUR_G), which is + unexpected - one would expect flag=HA_READ_KEY_OR_PREV (PAGE_CUR_LE). In + this case the cursor will be positioned on the first record to the right + of the requested one (can also be positioned on the 'sup') and we should + not count the right border. */ + } + ut_ad(page_rec_is_supremum(page_cur_get_rec(&m_page_cur))); + + /* The range specified is without a right border, just 'x > 123' + or 'x >= 123' and search_on_page() positioned the cursor on the + supremum record on the rightmost page, which must not be counted. 
+    */
+    return false;
+  }
+
+  /** @return index */
+  const dict_index_t *index() const { return m_page_cur.index; }
+
+  /** @return current block */
+  const buf_block_t *block() const { return m_block; }
+
+  /** @return current page id */
+  page_id_t page_id() const { return m_page_id; }
+
+  /** Copies the block pointer from another btr_est_cur_t when both left and
+  right border cursors point to the same block.
+  @param o reference to the other btr_est_cur_t object. */
+  void set_block(const btr_est_cur_t &o) { m_block= o.m_block; }
+
+  /** @return current record number. */
+  ulint nth_rec() const { return m_nth_rec; }
+
+  /** @return number of records in the current page. */
+  ulint n_recs() const { return m_n_recs; }
+};
+
+/** Estimate the number of rows between the left record of the path and the
+right one (non-inclusive) for a certain level of a B-tree. This function
+starts from the page next to the left page and reads a few pages to the
+right, counting their records. If we reach the right page quickly then we
+know exactly how many records there are between the left and right records
+and we set is_n_rows_exact to true. After a page is latched, the previous
+page is unlatched. If we cannot reach the right page quickly then we
+calculate the average number of records in the pages scanned so far and
+assume that all pages that we did not scan up to the right page contain the
+same number of records; then we multiply that average by the number of pages
+between the right and left records (which is n_rows_on_prev_level). In this
+case we set is_n_rows_exact to false.
+@param level current level.
+@param left_cur the cursor of the left page.
+@param right_page_no right page number.
+@param n_rows_on_prev_level number of rows on the previous level.
+@param[out] is_n_rows_exact true if the exact number of rows is returned.
+@param[in,out] mtr mtr.
+@return number of rows, not including the borders (exact or estimated). */
+static ha_rows btr_estimate_n_rows_in_range_on_level(
+    ulint level, btr_est_cur_t &left_cur, uint32_t right_page_no,
+    ha_rows n_rows_on_prev_level, bool &is_n_rows_exact, mtr_t &mtr)
+{
+  ha_rows n_rows= 0;
+  uint n_pages_read= 0;
+  /* Do not read more than this number of pages, so that this code, which is
+  just an estimation, does not hurt performance. If we read this many pages
+  before reaching right_page_no, then we estimate the average from the pages
+  scanned so far. */
+  static constexpr uint n_pages_read_limit= 9;
+  buf_block_t *block= nullptr;
+  const dict_index_t *index= left_cur.index();
+
+  /* Assume by default that we will scan all pages between the left and
+  right (non-inclusive) pages */
+  is_n_rows_exact= true;
+
+  /* Add records from the left page which are to the right of the record
+  which serves as a left border of the range, if any (we don't include the
+  record itself in this count). */
+  if (left_cur.nth_rec() <= left_cur.n_recs())
+  {
+    n_rows+= left_cur.n_recs() - left_cur.nth_rec();
+  }
+
+  /* Count the records in the pages between the left and right
+  (non-inclusive) pages */
+
+  const fil_space_t *space= index->table->space;
+  page_id_t page_id(space->id,
+                    btr_page_get_next(buf_block_get_frame(left_cur.block())));
+
+  if (page_id.page_no() == FIL_NULL)
+    goto inexact;
+
+  do
+  {
+    page_t *page;
+    buf_block_t *prev_block= block;
+
+    /* Fetch the page.
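+    The previous page is released only after the next one has been latched,
+    so the FIL_PAGE_NEXT pointer that was just followed stays protected.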
+    */
+    block= btr_block_get(*index, page_id.page_no(), RW_S_LATCH, !level, &mtr,
+                         nullptr);
+
+    if (prev_block)
+    {
+      ulint savepoint = mtr.get_savepoint();
+      /* Index s-lock, p1, p2 latches; there can also be p1's and p2's
+      parent latches if they have not diverged */
+      ut_ad(savepoint >= 3);
+      mtr.rollback_to_savepoint(savepoint - 2, savepoint - 1);
+    }
+
+    if (!block || btr_page_get_level(buf_block_get_frame(block)) != level)
+      goto inexact;
+
+    page= buf_block_get_frame(block);
+
+    /* It is possible but highly unlikely that the page was originally
+    written by an old version of InnoDB that did not initialize
+    FIL_PAGE_TYPE on other than B-tree pages. For example, this could be an
+    almost-empty BLOB page that happens to contain the magic values in the
+    fields that we checked above. */
+
+    n_pages_read++;
+
+    n_rows+= page_get_n_recs(page);
+
+    page_id.set_page_no(btr_page_get_next(page));
+
+    if (n_pages_read == n_pages_read_limit)
+    {
+      /* We read too many pages or we reached the end of the level
+      without passing through right_page_no. */
+      goto inexact;
+    }
+
+  } while (page_id.page_no() != right_page_no);
+
+  if (block)
+  {
+    ut_ad(block == mtr.at_savepoint(mtr.get_savepoint() - 1));
+    mtr.rollback_to_savepoint(mtr.get_savepoint() - 1);
+  }
+
+  return (n_rows);
+
+inexact:
+
+  if (block)
+  {
+    ut_ad(block == mtr.at_savepoint(mtr.get_savepoint() - 1));
+    mtr.rollback_to_savepoint(mtr.get_savepoint() - 1);
+  }
+
+  is_n_rows_exact= false;
+
+  /* We were interrupted before reaching the right page */
+
+  if (n_pages_read > 0)
+  {
+    /* The number of pages on this level is n_rows_on_prev_level; multiply
+    it by the average number of recs per page so far */
+    n_rows= n_rows_on_prev_level * n_rows / n_pages_read;
+  }
+  else
+  {
+    n_rows= 10;
+  }
+
+  return (n_rows);
+}
+
+/** Estimates the number of rows in a given index range. Searches in the
+left page, then, if there are pages between the left and right ones, reads a
+few pages to the right; if the right page is reached, counts the exact
+number of rows without fetching the right page (the right page will be
+fetched in the caller of this function and the number of its rows will be
+added). If the right page is not reached, counts the estimated number of
+rows (see btr_estimate_n_rows_in_range_on_level() for details), and fetches
+the right page. If leaves are reached, unlatches non-leaf pages except the
+right leaf parent. After the right leaf page is fetched, commits the mtr.
+@param[in] index index
+@param[in] range_start range start
+@param[in] range_end range end
+@return estimated number of rows */
+ha_rows btr_estimate_n_rows_in_range(dict_index_t *index,
+                                     btr_pos_t *range_start,
+                                     btr_pos_t *range_end)
+{
+  DBUG_ENTER("btr_estimate_n_rows_in_range");
+
+  if (UNIV_UNLIKELY(index->page == FIL_NULL || index->is_corrupted()))
+    DBUG_RETURN(0);
+
+  ut_ad(index->is_btree());
+
+  btr_est_cur_t p1(index, *range_start->tuple, range_start->mode);
+  btr_est_cur_t p2(index, *range_end->tuple, range_end->mode);
+  mtr_t mtr;
+
+  ulint height;
+  ulint root_height= 0; /* remove warning */
+
+  mem_heap_t *heap= NULL;
+  rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+  rec_offs *offsets= offsets_;
+  rec_offs_init(offsets_);
+
+  mtr.start();
+
+  ut_ad(mtr.get_savepoint() == 0);
+  mtr_s_lock_index(index, &mtr);
+
+  ha_rows table_n_rows= dict_table_get_n_rows(index->table);
+
+  height= ULINT_UNDEFINED;
+
+  /* This becomes true when the two paths do not pass through the same pages
+  anymore. */
+  bool diverged= false;
+  /* This is the height, i.e. the number of levels from the root, where the
+  paths are no longer the same or adjacent. */
+  ulint divergence_height= ULINT_UNDEFINED;
+  bool should_count_the_left_border= true;
+  bool should_count_the_right_border= true;
+  bool is_n_rows_exact= true;
+  ha_rows n_rows= 0;
+
+  /* Loop and search until we arrive at the desired level. */
+search_loop:
+  if (!p1.fetch_child(height, mtr, p2.block()))
+    goto error;
+
+  if (height == ULINT_UNDEFINED)
+  {
+    /* We are in the root node */
+    height= btr_page_get_level(buf_block_get_frame(p1.block()));
+    root_height= height;
+  }
+
+  if (!height)
+  {
+    p1.set_page_mode_for_leaves();
+    p2.set_page_mode_for_leaves();
+  }
+
+  if (p1.page_id() == p2.page_id())
+    p2.set_block(p1);
+  else
+  {
+    ut_ad(diverged);
+    if (divergence_height != ULINT_UNDEFINED) {
+      /* We need to call p1.search_on_page() here as
+      btr_estimate_n_rows_in_range_on_level() uses p1.m_n_recs and
+      p1.m_nth_rec. */
+      if (!p1.search_on_page(height, root_height, true))
+        goto error;
+      n_rows= btr_estimate_n_rows_in_range_on_level(
+          height, p1, p2.page_id().page_no(), n_rows, is_n_rows_exact, mtr);
+    }
+    if (!p2.fetch_child(height, mtr, nullptr))
+      goto error;
+  }
+
+  if (height == 0)
+    /* There is no need to release non-leaf pages here as they must already
+    be unlatched in btr_est_cur_t::fetch_child(). Try to search on pages
+    after releasing the index latch, to decrease contention. */
+    mtr.rollback_to_savepoint(0, 1);
+
+  /* There is no need to search on the left page if
+  divergence_height != ULINT_UNDEFINED, as it was already searched before
+  the btr_estimate_n_rows_in_range_on_level() call */
+  if (divergence_height == ULINT_UNDEFINED &&
+      !p1.search_on_page(height, root_height, true))
+    goto error;
+
+  if (!p2.search_on_page(height, root_height, false))
+    goto error;
+
+  if (!diverged && (p1.nth_rec() != p2.nth_rec()))
+  {
+    ut_ad(p1.page_id() == p2.page_id());
+    diverged= true;
+    if (p1.nth_rec() < p2.nth_rec())
+    {
+      /* We do not count the borders (neither the left nor the right one),
+      thus "- 1". */
+      n_rows= p2.nth_rec() - p1.nth_rec() - 1;
+
+      if (n_rows > 0)
+      {
+        /* There is at least one row between the two borders pointed to by
+        p1 and p2, so on the level below the slots will point to
+        non-adjacent pages. */
+        divergence_height= root_height - height;
+      }
+    }
+    else
+    {
+      /* It is possible that p1->nth_rec > p2->nth_rec if, for example, we
+      have a single page tree which contains (inf, 5, 6, supr) and we select
+      where x > 20 and x < 30; in this case p1->nth_rec will point to the
+      supr record and p2->nth_rec will point to 6. */
+      n_rows= 0;
+      should_count_the_left_border= false;
+      should_count_the_right_border= false;
+    }
+  }
+  else if (diverged && divergence_height == ULINT_UNDEFINED)
+  {
+    if (p1.nth_rec() < p1.n_recs() || p2.nth_rec() > 1)
+    {
+      ut_ad(p1.page_id() != p2.page_id());
+      divergence_height= root_height - height;
+
+      n_rows= 0;
+
+      if (p1.nth_rec() < p1.n_recs())
+      {
+        n_rows+= p1.n_recs() - p1.nth_rec();
+      }
+
+      if (p2.nth_rec() > 1)
+      {
+        n_rows+= p2.nth_rec() - 1;
+      }
+    }
+  }
+  else if (divergence_height != ULINT_UNDEFINED)
+  {
+    /* All records before the right page were already counted. Add records
+    from p2->page_no which are to the left of the record which serves as a
+    right border of the range, if any (we don't include the record itself
+    in this count).
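+    For example, if p2 is positioned on the 4th record of its page, the
+    three records before it fall inside the range, and n_rows below grows by
+    p2.nth_rec() - 1 = 3.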
*/ + if (p2.nth_rec() > 1) + n_rows+= p2.nth_rec() - 1; + } + + if (height) + { + ut_ad(height > 0); + height--; + ut_ad(mtr.memo_contains(p1.index()->lock, MTR_MEMO_S_LOCK)); + ut_ad(mtr.memo_contains_flagged(p1.block(), MTR_MEMO_PAGE_S_FIX)); + p1.read_child_page_id(&offsets, &heap); + ut_ad(mtr.memo_contains(p2.index()->lock, MTR_MEMO_S_LOCK)); + ut_ad(mtr.memo_contains_flagged(p2.block(), MTR_MEMO_PAGE_S_FIX)); + p2.read_child_page_id(&offsets, &heap); + goto search_loop; + } + + should_count_the_left_border= + should_count_the_left_border && p1.should_count_the_left_border(); + should_count_the_right_border= + should_count_the_right_border && p2.should_count_the_right_border(); + + mtr.commit(); + if (UNIV_LIKELY_NULL(heap)) + mem_heap_free(heap); + + + range_start->page_id= p1.page_id(); + range_end->page_id= p2.page_id(); + + /* Here none of the borders were counted. For example, if on the leaf level + we descended to: + (inf, a, b, c, d, e, f, sup) + ^ ^ + path1 path2 + then n_rows will be 2 (c and d). */ + + if (is_n_rows_exact) + { + /* Only fiddle to adjust this off-by-one if the number is exact, otherwise + we do much grosser adjustments below. */ + + /* If both paths end up on the same record on the leaf level. */ + if (p1.page_id() == p2.page_id() && p1.nth_rec() == p2.nth_rec()) + { + + /* n_rows can be > 0 here if the paths were first different and then + converged to the same record on the leaf level. + For example: + SELECT ... LIKE 'wait/synch/rwlock%' + mode1=PAGE_CUR_GE, + tuple1="wait/synch/rwlock" + path1[0]={nth_rec=58, n_recs=58, + page_no=3, page_level=1} + path1[1]={nth_rec=56, n_recs=55, + page_no=119, page_level=0} + + mode2=PAGE_CUR_G + tuple2="wait/synch/rwlock" + path2[0]={nth_rec=57, n_recs=57, + page_no=3, page_level=1} + path2[1]={nth_rec=56, n_recs=55, + page_no=119, page_level=0} */ + + /* If the range is such that we should count both borders, then avoid + counting that record twice - once as a left border and once as a right + border. Some of the borders should not be counted, e.g. [3,3). */ + n_rows= should_count_the_left_border && should_count_the_right_border; + } + else + n_rows+= should_count_the_left_border + should_count_the_right_border; + } + + if (root_height > divergence_height && !is_n_rows_exact) + /* In trees whose height is > 1 our algorithm tends to underestimate: + multiply the estimate by 2: */ + n_rows*= 2; + + DBUG_EXECUTE_IF("bug14007649", DBUG_RETURN(n_rows);); + + /* Do not estimate the number of rows in the range to over 1 / 2 of the + estimated rows in the whole table */ + + if (n_rows > table_n_rows / 2 && !is_n_rows_exact) + { + + n_rows= table_n_rows / 2; + + /* If there are just 0 or 1 rows in the table, then we estimate all rows + are in the range */ + + if (n_rows == 0) + n_rows= table_n_rows; + } + + DBUG_RETURN(n_rows); + +error: + mtr.commit(); + if (UNIV_LIKELY_NULL(heap)) + mem_heap_free(heap); + + DBUG_RETURN(0); +} + +/*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/ + +/***********************************************************//** +Gets the offset of the pointer to the externally stored part of a field. 
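+The field reference is stored in the last BTR_EXTERN_FIELD_REF_SIZE (20)
+bytes of the locally stored column prefix: space id, page number, offset
+within the page, and an 8-byte length field that also carries the
+ownership/inheritance flag bits.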
+@return offset of the pointer to the externally stored part */ +static +ulint +btr_rec_get_field_ref_offs( +/*=======================*/ + const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n) /*!< in: index of the external field */ +{ + ulint field_ref_offs; + ulint local_len; + + ut_a(rec_offs_nth_extern(offsets, n)); + field_ref_offs = rec_get_nth_field_offs(offsets, n, &local_len); + ut_a(len_is_stored(local_len)); + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + + return(field_ref_offs + local_len - BTR_EXTERN_FIELD_REF_SIZE); +} + +/** Gets a pointer to the externally stored part of a field. +@param rec record +@param offsets rec_get_offsets(rec) +@param n index of the externally stored field +@return pointer to the externally stored part */ +#define btr_rec_get_field_ref(rec, offsets, n) \ + ((rec) + btr_rec_get_field_ref_offs(offsets, n)) + +/** Gets the externally stored size of a record, in units of a database page. +@param[in] rec record +@param[in] offsets array returned by rec_get_offsets() +@return externally stored part, in units of a database page */ +ulint +btr_rec_get_externally_stored_len( + const rec_t* rec, + const rec_offs* offsets) +{ + ulint n_fields; + ulint total_extern_len = 0; + ulint i; + + ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec)); + + if (!rec_offs_any_extern(offsets)) { + return(0); + } + + n_fields = rec_offs_n_fields(offsets); + + for (i = 0; i < n_fields; i++) { + if (rec_offs_nth_extern(offsets, i)) { + + ulint extern_len = mach_read_from_4( + btr_rec_get_field_ref(rec, offsets, i) + + BTR_EXTERN_LEN + 4); + + total_extern_len += ut_calc_align( + extern_len, ulint(srv_page_size)); + } + } + + return total_extern_len >> srv_page_size_shift; +} + +/*******************************************************************//** +Sets the ownership bit of an externally stored field in a record. */ +static +void +btr_cur_set_ownership_of_extern_field( +/*==================================*/ + buf_block_t* block, /*!< in/out: index page */ + rec_t* rec, /*!< in/out: clustered index record */ + dict_index_t* index, /*!< in: index of the page */ + const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint i, /*!< in: field number */ + bool val, /*!< in: value to set */ + mtr_t* mtr) /*!< in: mtr, or NULL if not logged */ +{ + byte* data; + ulint local_len; + ulint byte_val; + + data = rec_get_nth_field(rec, offsets, i, &local_len); + ut_ad(rec_offs_nth_extern(offsets, i)); + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + + local_len -= BTR_EXTERN_FIELD_REF_SIZE; + + byte_val = mach_read_from_1(data + local_len + BTR_EXTERN_LEN); + + if (val) { + byte_val &= ~BTR_EXTERN_OWNER_FLAG; + } else { +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + ut_a(!(byte_val & BTR_EXTERN_OWNER_FLAG)); +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + byte_val |= BTR_EXTERN_OWNER_FLAG; + } + + if (UNIV_LIKELY_NULL(block->page.zip.data)) { + mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val); + page_zip_write_blob_ptr(block, rec, index, offsets, i, mtr); + } else { + mtr->write<1,mtr_t::MAYBE_NOP>(*block, data + local_len + + BTR_EXTERN_LEN, byte_val); + } +} + +/*******************************************************************//** +Marks non-updated off-page fields as disowned by this record. The ownership +must be transferred to the updated record which is inserted elsewhere in the +index tree. In purge only the owner of externally stored field is allowed +to free the field. 
*/ +void +btr_cur_disown_inherited_fields( +/*============================*/ + buf_block_t* block, /*!< in/out: index page */ + rec_t* rec, /*!< in/out: record in a clustered index */ + dict_index_t* index, /*!< in: index of the page */ + const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ + const upd_t* update, /*!< in: update vector */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec)); + ut_ad(rec_offs_any_extern(offsets)); + + for (uint16_t i = 0; i < rec_offs_n_fields(offsets); i++) { + if (rec_offs_nth_extern(offsets, i) + && !upd_get_field_by_field_no(update, i, false)) { + btr_cur_set_ownership_of_extern_field( + block, rec, index, offsets, i, false, mtr); + } + } +} + +/*******************************************************************//** +Marks all extern fields in a record as owned by the record. This function +should be called if the delete mark of a record is removed: a not delete +marked record always owns all its extern fields. */ +static +void +btr_cur_unmark_extern_fields( +/*=========================*/ + buf_block_t* block, /*!< in/out: index page */ + rec_t* rec, /*!< in/out: record in a clustered index */ + dict_index_t* index, /*!< in: index of the page */ + const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ + mtr_t* mtr) /*!< in: mtr, or NULL if not logged */ +{ + ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec)); + if (!rec_offs_any_extern(offsets)) { + return; + } + + const ulint n = rec_offs_n_fields(offsets); + + for (ulint i = 0; i < n; i++) { + if (rec_offs_nth_extern(offsets, i)) { + btr_cur_set_ownership_of_extern_field( + block, rec, index, offsets, i, true, mtr); + } + } +} + +/*******************************************************************//** +Returns the length of a BLOB part stored on the header page. +@return part length */ +static +uint32_t +btr_blob_get_part_len( +/*==================*/ + const byte* blob_header) /*!< in: blob header */ +{ + return(mach_read_from_4(blob_header + BTR_BLOB_HDR_PART_LEN)); +} + +/*******************************************************************//** +Returns the page number where the next BLOB part is stored. +@return page number or FIL_NULL if no more pages */ +static +uint32_t +btr_blob_get_next_page_no( +/*======================*/ + const byte* blob_header) /*!< in: blob header */ +{ + return(mach_read_from_4(blob_header + BTR_BLOB_HDR_NEXT_PAGE_NO)); +} + +/** Deallocate a buffer block that was reserved for a BLOB part. +@param block buffer block +@param all flag whether to remove a ROW_FORMAT=COMPRESSED page +@param mtr mini-transaction to commit */ +static void btr_blob_free(buf_block_t *block, bool all, mtr_t *mtr) +{ + const page_id_t page_id(block->page.id()); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + mtr->commit(); + + buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold()); + mysql_mutex_lock(&buf_pool.mutex); + + if (buf_page_t *bpage= buf_pool.page_hash.get(page_id, chain)) + if (!buf_LRU_free_page(bpage, all) && all && bpage->zip.data) + /* Attempt to deallocate the redundant copy of the uncompressed page + if the whole ROW_FORMAT=COMPRESSED block cannot be deallocted. */ + buf_LRU_free_page(bpage, false); + + mysql_mutex_unlock(&buf_pool.mutex); +} + +/** Helper class used while writing blob pages, during insert or update. 
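+It commits and restarts the mini-transaction every few BLOB pages (see
+check()) so that log_free_check() can run between the pages, and afterwards
+restores the cursor position and the latches.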
+*/
+struct btr_blob_log_check_t {
+	/** Persistent cursor on a clustered index record with blobs. */
+	btr_pcur_t*	m_pcur;
+	/** Mini-transaction holding the latches for m_pcur */
+	mtr_t*		m_mtr;
+	/** rec_get_offsets(rec, index); offsets of clust_rec */
+	const rec_offs*	m_offsets;
+	/** The block containing clustered record */
+	buf_block_t**	m_block;
+	/** The clustered record pointer */
+	rec_t**		m_rec;
+	/** The blob operation code */
+	enum blob_op	m_op;
+
+	/** Constructor
+	@param[in]	pcur		persistent cursor on a clustered
+					index record with blobs.
+	@param[in]	mtr		mini-transaction holding latches for
+					pcur.
+	@param[in]	offsets		offsets of the clust_rec
+	@param[in,out]	block		record block containing pcur record
+	@param[in,out]	rec		the clustered record pointer
+	@param[in]	op		the blob operation code */
+	btr_blob_log_check_t(
+		btr_pcur_t*	pcur,
+		mtr_t*		mtr,
+		const rec_offs*	offsets,
+		buf_block_t**	block,
+		rec_t**		rec,
+		enum blob_op	op)
+		: m_pcur(pcur),
+		  m_mtr(mtr),
+		  m_offsets(offsets),
+		  m_block(block),
+		  m_rec(rec),
+		  m_op(op)
+	{
+		ut_ad(rec_offs_validate(*m_rec, m_pcur->index(), m_offsets));
+		ut_ad((*m_block)->page.frame == page_align(*m_rec));
+		ut_ad(*m_rec == btr_pcur_get_rec(m_pcur));
+	}
+
+	/** Check if there is enough space in the redo log file. Commit and
+	re-start the mini-transaction. */
+	void check()
+	{
+		dict_index_t*	index = m_pcur->index();
+		ulint		offs = 0;
+		uint32_t	page_no = FIL_NULL;
+
+		if (UNIV_UNLIKELY(m_op == BTR_STORE_INSERT_BULK)) {
+			offs = page_offset(*m_rec);
+			page_no = (*m_block)->page.id().page_no();
+			(*m_block)->page.fix();
+			ut_ad(page_no != FIL_NULL);
+		} else {
+			btr_pcur_store_position(m_pcur, m_mtr);
+		}
+		m_mtr->commit();
+
+		DEBUG_SYNC_C("blob_write_middle");
+
+		const mtr_log_t log_mode = m_mtr->get_log_mode();
+		m_mtr->start();
+		m_mtr->set_log_mode(log_mode);
+		index->set_modified(*m_mtr);
+
+		log_free_check();
+
+		DEBUG_SYNC_C("blob_write_middle_after_check");
+
+		if (UNIV_UNLIKELY(page_no != FIL_NULL)) {
+			dberr_t err;
+			if (UNIV_LIKELY(index->page != page_no)) {
+				ut_a(btr_root_block_get(index, RW_SX_LATCH,
+							m_mtr, &err));
+			}
+			m_pcur->btr_cur.page_cur.block = btr_block_get(
+				*index, page_no, RW_X_LATCH, false, m_mtr);
+			/* The page should not be evicted or corrupted while
+			we are holding a buffer-fix on it. */
+			m_pcur->btr_cur.page_cur.block->page.unfix();
+			m_pcur->btr_cur.page_cur.rec
+				= m_pcur->btr_cur.page_cur.block->page.frame
+				+ offs;
+		} else {
+			ut_ad(m_pcur->rel_pos == BTR_PCUR_ON);
+			mtr_sx_lock_index(index, m_mtr);
+			ut_a(m_pcur->restore_position(
+				BTR_MODIFY_ROOT_AND_LEAF_ALREADY_LATCHED,
+				m_mtr) == btr_pcur_t::SAME_ALL);
+		}
+
+		*m_block	= btr_pcur_get_block(m_pcur);
+		*m_rec		= btr_pcur_get_rec(m_pcur);
+
+		rec_offs_make_valid(*m_rec, index, true,
+				    const_cast<rec_offs*>(m_offsets));
+
+		ut_ad(m_mtr->memo_contains_page_flagged(
+			      *m_rec,
+			      MTR_MEMO_PAGE_X_FIX | MTR_MEMO_PAGE_SX_FIX));
+
+		ut_ad((m_op == BTR_STORE_INSERT_BULK)
+		      == !m_mtr->memo_contains_flagged(&index->lock,
+						       MTR_MEMO_SX_LOCK
+						       | MTR_MEMO_X_LOCK));
+	}
+};
+
+/*******************************************************************//**
+Stores the fields in big_rec_vec to the tablespace and puts pointers to
+them in rec. The extern flags in rec will have to be set beforehand.
+The fields are stored on pages allocated from the leaf node
+file segment of the index tree.
+
+TODO: If the allocation extends the tablespace, it will not be redo logged,
+in any mini-transaction.
Tablespace extension should be redo-logged, so that +recovery will not fail when the big_rec was written to the extended portion of +the file, in case the file was somehow truncated in the crash. + +@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +dberr_t +btr_store_big_rec_extern_fields( +/*============================*/ + btr_pcur_t* pcur, /*!< in: a persistent cursor */ + rec_offs* offsets, /*!< in/out: rec_get_offsets() on + pcur. the "external storage" flags + in offsets will correctly correspond + to rec when this function returns */ + const big_rec_t*big_rec_vec, /*!< in: vector containing fields + to be stored externally */ + mtr_t* btr_mtr, /*!< in/out: mtr containing the + latches to the clustered index. can be + committed and restarted. */ + enum blob_op op) /*! in: operation code */ +{ + byte* field_ref; + ulint extern_len; + ulint store_len; + ulint i; + mtr_t mtr; + mem_heap_t* heap = NULL; + page_zip_des_t* page_zip; + z_stream c_stream; + dberr_t error = DB_SUCCESS; + dict_index_t* index = pcur->index(); + buf_block_t* rec_block = btr_pcur_get_block(pcur); + rec_t* rec = btr_pcur_get_rec(pcur); + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(rec_offs_any_extern(offsets)); + ut_ad(op == BTR_STORE_INSERT_BULK + || btr_mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + ut_ad(btr_mtr->memo_contains_flagged(rec_block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(buf_block_get_frame(rec_block) == page_align(rec)); + ut_a(dict_index_is_clust(index)); + + if (!fil_page_index_page_check(page_align(rec))) { + if (op != BTR_STORE_INSERT_BULK) { + return DB_PAGE_CORRUPTED; + } + } + + btr_blob_log_check_t redo_log(pcur, btr_mtr, offsets, &rec_block, + &rec, op); + page_zip = buf_block_get_page_zip(rec_block); + + if (page_zip) { + int err; + + /* Zlib deflate needs 128 kilobytes for the default + window size, plus 512 << memLevel, plus a few + kilobytes for small objects. We use reduced memLevel + to limit the memory consumption, and preallocate the + heap, hoping to avoid memory fragmentation. */ + heap = mem_heap_create(250000); + page_zip_set_alloc(&c_stream, heap); + + err = deflateInit2(&c_stream, int(page_zip_level), + Z_DEFLATED, 15, 7, Z_DEFAULT_STRATEGY); + ut_a(err == Z_OK); + } + +#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG + /* All pointers to externally stored columns in the record + must either be zero or they must be pointers to inherited + columns, owned by this record or an earlier record version. */ + for (i = 0; i < big_rec_vec->n_fields; i++) { + field_ref = btr_rec_get_field_ref( + rec, offsets, big_rec_vec->fields[i].field_no); + + ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG)); + /* Either this must be an update in place, + or the BLOB must be inherited, or the BLOB pointer + must be zero (will be written in this function). 
+		*/
+		ut_a(op == BTR_STORE_UPDATE
+		     || (field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_INHERITED_FLAG)
+		     || !memcmp(field_ref, field_ref_zero,
+				BTR_EXTERN_FIELD_REF_SIZE));
+	}
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+
+	/* Space available in compressed page to carry blob data */
+	const ulint	payload_size_zip = rec_block->physical_size()
+		- FIL_PAGE_DATA;
+
+	/* Space available in uncompressed page to carry blob data */
+	const ulint	payload_size = payload_size_zip
+		- (BTR_BLOB_HDR_SIZE + FIL_PAGE_DATA_END);
+
+	/* We have to create a file segment to the tablespace
+	for each field and put the pointer to the field in rec */
+
+	for (i = 0; i < big_rec_vec->n_fields; i++) {
+		const ulint field_no = big_rec_vec->fields[i].field_no;
+
+		field_ref = btr_rec_get_field_ref(rec, offsets, field_no);
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+		/* A zero BLOB pointer should have been initially inserted. */
+		ut_a(!memcmp(field_ref, field_ref_zero,
+			     BTR_EXTERN_FIELD_REF_SIZE));
+#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
+		extern_len = big_rec_vec->fields[i].len;
+		MEM_CHECK_DEFINED(big_rec_vec->fields[i].data, extern_len);
+		ut_a(extern_len > 0);
+
+		uint32_t prev_page_no = FIL_NULL;
+
+		if (page_zip) {
+			int	err = deflateReset(&c_stream);
+			ut_a(err == Z_OK);
+
+			c_stream.next_in = (Bytef*)
+				big_rec_vec->fields[i].data;
+			c_stream.avail_in = static_cast<uInt>(extern_len);
+		}
+
+		for (ulint blob_npages = 0;; ++blob_npages) {
+			buf_block_t*	block;
+			const ulint	commit_freq = 4;
+			uint32_t	r_extents;
+
+			ut_ad(page_align(field_ref) == page_align(rec));
+
+			if (!(blob_npages % commit_freq)) {
+
+				redo_log.check();
+
+				field_ref = btr_rec_get_field_ref(
+					rec, offsets, field_no);
+
+				page_zip = buf_block_get_page_zip(rec_block);
+			}
+
+			ut_ad(btr_mtr->get_already_latched(
+				page_id_t{index->table->space_id, index->page},
+				MTR_MEMO_PAGE_SX_FIX));
+
+			mtr.start();
+			index->set_modified(mtr);
+			mtr.set_log_mode_sub(*btr_mtr);
+
+			rec_block->page.fix();
+			rec_block->page.lock.x_lock();
+
+			mtr.memo_push(rec_block, MTR_MEMO_PAGE_X_FIX);
+#ifdef BTR_CUR_HASH_ADAPT
+			ut_ad(!btr_search_check_marked_free_index(rec_block));
+#endif
+
+			uint32_t hint_prev = prev_page_no;
+			if (hint_prev == FIL_NULL) {
+				hint_prev = rec_block->page.id().page_no();
+			}
+
+			error = fsp_reserve_free_extents(
+				&r_extents, index->table->space, 1,
+				FSP_BLOB, &mtr, 1);
+			if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+alloc_fail:
+				mtr.commit();
+				goto func_exit;
+			}
+
+			block = btr_page_alloc(index, hint_prev + 1,
+					       FSP_NO_DIR, 0, &mtr, &mtr,
+					       &error);
+
+			index->table->space->release_free_extents(r_extents);
+			if (!block) {
+				goto alloc_fail;
+			}
+
+			const uint32_t space_id = block->page.id().space();
+			const uint32_t page_no = block->page.id().page_no();
+
+			if (prev_page_no == FIL_NULL) {
+			} else if (buf_block_t* prev_block =
+				   buf_page_get_gen(page_id_t(space_id,
+							      prev_page_no),
+						    rec_block->zip_size(),
+						    RW_X_LATCH, nullptr,
+						    BUF_GET, &mtr, &error)) {
+				if (page_zip) {
+					mtr.write<4>(*prev_block,
+						     prev_block->page.frame
+						     + FIL_PAGE_NEXT,
+						     page_no);
+					memcpy_aligned<4>(
+						buf_block_get_page_zip(
+							prev_block)
+						->data + FIL_PAGE_NEXT,
+						prev_block->page.frame
+						+ FIL_PAGE_NEXT, 4);
+				} else {
+					mtr.write<4>(*prev_block,
+						     BTR_BLOB_HDR_NEXT_PAGE_NO
+						     + FIL_PAGE_DATA
+						     + prev_block->page.frame,
+						     page_no);
+				}
+			} else {
+				goto alloc_fail;
+			}
+
+			ut_ad(!page_has_siblings(block->page.frame));
+			ut_ad(!fil_page_get_type(block->page.frame));
+
+			if (page_zip) {
+				int		err;
+				page_zip_des_t*	blob_page_zip;
+
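+				/* The first page of a compressed BLOB is
+				tagged FIL_PAGE_TYPE_ZBLOB and any
+				continuation pages FIL_PAGE_TYPE_ZBLOB2;
+				the deflate stream is stored from
+				FIL_PAGE_DATA onwards. */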
+				mtr.write<1>(*block,
+					     FIL_PAGE_TYPE + 1
+					     + block->page.frame,
+					     prev_page_no == FIL_NULL
+					     ? FIL_PAGE_TYPE_ZBLOB
+					     : FIL_PAGE_TYPE_ZBLOB2);
+				block->page.zip.data[FIL_PAGE_TYPE + 1]
+					= block->page.frame[FIL_PAGE_TYPE + 1];
+
+				c_stream.next_out = block->page.frame
+					+ FIL_PAGE_DATA;
+				c_stream.avail_out = static_cast<uInt>(
+					payload_size_zip);
+
+				err = deflate(&c_stream, Z_FINISH);
+				ut_a(err == Z_OK || err == Z_STREAM_END);
+				ut_a(err == Z_STREAM_END
+				     || c_stream.avail_out == 0);
+
+				mtr.memcpy(*block,
+					   FIL_PAGE_DATA,
+					   page_zip_get_size(page_zip)
+					   - FIL_PAGE_DATA
+					   - c_stream.avail_out);
+				/* Copy the page to compressed storage,
+				because it will be flushed to disk
+				from there. */
+				blob_page_zip = buf_block_get_page_zip(block);
+				ut_ad(blob_page_zip);
+				ut_ad(page_zip_get_size(blob_page_zip)
+				      == page_zip_get_size(page_zip));
+				memcpy(blob_page_zip->data, block->page.frame,
+				       page_zip_get_size(page_zip));
+
+				if (err == Z_OK && prev_page_no != FIL_NULL) {
+
+					goto next_zip_page;
+				}
+
+				if (err == Z_STREAM_END) {
+					mach_write_to_4(field_ref
+							+ BTR_EXTERN_LEN, 0);
+					mach_write_to_4(field_ref
+							+ BTR_EXTERN_LEN + 4,
+							c_stream.total_in);
+				} else {
+					memset(field_ref + BTR_EXTERN_LEN,
+					       0, 8);
+				}
+
+				if (prev_page_no == FIL_NULL) {
+					ut_ad(blob_npages == 0);
+					mach_write_to_4(field_ref
+							+ BTR_EXTERN_SPACE_ID,
+							space_id);
+
+					mach_write_to_4(field_ref
+							+ BTR_EXTERN_PAGE_NO,
+							page_no);
+
+					mach_write_to_4(field_ref
+							+ BTR_EXTERN_OFFSET,
+							FIL_PAGE_NEXT);
+				}
+
+				/* We compress a page when finishing
+				a bulk insert. */
+				if (UNIV_LIKELY(op != BTR_STORE_INSERT_BULK)) {
+					page_zip_write_blob_ptr(
+						rec_block, rec, index, offsets,
+						field_no, &mtr);
+				}
+
+next_zip_page:
+				prev_page_no = page_no;
+
+				/* Commit mtr and release the
+				uncompressed page frame to save memory. */
+				btr_blob_free(block, FALSE, &mtr);
+
+				if (err == Z_STREAM_END) {
+					break;
+				}
+			} else {
+				mtr.write<1>(*block, FIL_PAGE_TYPE + 1
+					     + block->page.frame,
+					     FIL_PAGE_TYPE_BLOB);
+
+				if (extern_len > payload_size) {
+					store_len = payload_size;
+				} else {
+					store_len = extern_len;
+				}
+
+				mtr.memcpy(
+					*block,
+					FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE
+					+ block->page.frame,
+					static_cast<const byte*>
+					(big_rec_vec->fields[i].data)
+					+ big_rec_vec->fields[i].len
+					- extern_len, store_len);
+				mtr.write<4>(*block, BTR_BLOB_HDR_PART_LEN
+					     + FIL_PAGE_DATA
+					     + block->page.frame,
+					     store_len);
+				compile_time_assert(FIL_NULL == 0xffffffff);
+				mtr.memset(block, BTR_BLOB_HDR_NEXT_PAGE_NO
+					   + FIL_PAGE_DATA, 4, 0xff);
+
+				extern_len -= store_len;
+
+				ut_ad(!mach_read_from_4(BTR_EXTERN_LEN
+							+ field_ref));
+				mtr.write<4>(*rec_block,
+					     BTR_EXTERN_LEN + 4 + field_ref,
+					     big_rec_vec->fields[i].len
+					     - extern_len);
+
+				if (prev_page_no == FIL_NULL) {
+					ut_ad(blob_npages == 0);
+					mtr.write<4,mtr_t::MAYBE_NOP>(
+						*rec_block,
+						field_ref + BTR_EXTERN_SPACE_ID,
+						space_id);
+
+					mtr.write<4>(*rec_block, field_ref
+						     + BTR_EXTERN_PAGE_NO,
+						     page_no);
+
+					mtr.write<4>(*rec_block, field_ref
+						     + BTR_EXTERN_OFFSET,
+						     FIL_PAGE_DATA);
+				}
+
+				prev_page_no = page_no;
+
+				mtr.commit();
+
+				if (extern_len == 0) {
+					break;
+				}
+			}
+		}
+
+		DBUG_EXECUTE_IF("btr_store_big_rec_extern",
+				error = DB_OUT_OF_FILE_SPACE;
+				goto func_exit;);
+
+		rec_offs_make_nth_extern(offsets, field_no);
+	}
+
+func_exit:
+	if (page_zip) {
+		deflateEnd(&c_stream);
+	}
+
+	if (heap != NULL) {
+		mem_heap_free(heap);
+	}
+
+#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
+	/* All pointers to externally stored columns in the record
+	must be valid.
*/ + for (i = 0; i < rec_offs_n_fields(offsets); i++) { + if (!rec_offs_nth_extern(offsets, i)) { + continue; + } + + field_ref = btr_rec_get_field_ref(rec, offsets, i); + + /* The pointer must not be zero if the operation + succeeded. */ + ut_a(0 != memcmp(field_ref, field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE) + || error != DB_SUCCESS); + /* The column must not be disowned by this record. */ + ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG)); + } +#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */ + return(error); +} + +/** Check the FIL_PAGE_TYPE on an uncompressed BLOB page. +@param block uncompressed BLOB page +@param op operation +@return whether the type is invalid */ +static bool btr_check_blob_fil_page_type(const buf_block_t& block, + const char *op) +{ + uint16_t type= fil_page_get_type(block.page.frame); + + if (UNIV_LIKELY(type == FIL_PAGE_TYPE_BLOB)); + else if (fil_space_t *space= fil_space_t::get(block.page.id().space())) + { + /* Old versions of InnoDB did not initialize FIL_PAGE_TYPE on BLOB + pages. Do not print anything about the type mismatch when reading + a BLOB page that may be from old versions. */ + bool fail= space->full_crc32() || DICT_TF_HAS_ATOMIC_BLOBS(space->flags); + if (fail) + sql_print_error("InnoDB: FIL_PAGE_TYPE=%u on BLOB %s file %s page %u", + type, op, space->chain.start->name, + block.page.id().page_no()); + space->release(); + return fail; + } + return false; +} + +/*******************************************************************//** +Frees the space in an externally stored field to the file space +management if the field in data is owned by the externally stored field, +in a rollback we may have the additional condition that the field must +not be inherited. */ +void +btr_free_externally_stored_field( +/*=============================*/ + dict_index_t* index, /*!< in: index of the data, the index + tree MUST be X-latched; if the tree + height is 1, then also the root page + must be X-latched! (this is relevant + in the case this function is called + from purge where 'data' is located on + an undo log page, not an index + page) */ + byte* field_ref, /*!< in/out: field reference */ + const rec_t* rec, /*!< in: record containing field_ref, for + page_zip_write_blob_ptr(), or NULL */ + const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index), + or NULL */ + buf_block_t* block, /*!< in/out: page of field_ref */ + ulint i, /*!< in: field number of field_ref; + ignored if rec == NULL */ + bool rollback, /*!< in: performing rollback? */ + mtr_t* local_mtr) /*!< in: mtr + containing the latch to data an an + X-latch to the index tree */ +{ + const uint32_t space_id = mach_read_from_4( + field_ref + BTR_EXTERN_SPACE_ID); + + ut_ad(index->is_primary()); + ut_ad(block->page.lock.have_x()); + ut_ad(local_mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + ut_ad(local_mtr->memo_contains_page_flagged(field_ref, + MTR_MEMO_PAGE_X_FIX)); + ut_ad(!rec || rec_offs_validate(rec, index, offsets)); + ut_ad(!rec || field_ref == btr_rec_get_field_ref(rec, offsets, i)); + ut_ad(index->table->space_id == index->table->space->id); + ut_ad(local_mtr->is_named_space(index->table->space)); + + if (UNIV_UNLIKELY(!memcmp(field_ref, field_ref_zero, + BTR_EXTERN_FIELD_REF_SIZE))) { + /* In the rollback, we may encounter a clustered index + record with some unwritten off-page columns. There is + nothing to free then. 
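+Such an all-zero BLOB pointer can only be observed during the rollback of
+an interrupted operation; this is asserted below.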
*/ + ut_a(rollback); + return; + } + + ut_ad(!(mach_read_from_4(field_ref + BTR_EXTERN_LEN) + & ~((BTR_EXTERN_OWNER_FLAG + | BTR_EXTERN_INHERITED_FLAG) << 24))); + ut_ad(space_id == index->table->space_id); + + const ulint ext_zip_size = index->table->space->zip_size(); + /* !rec holds in a call from purge when field_ref is in an undo page */ + ut_ad(rec || !block->page.zip.data); + + for (;;) { + mtr_t mtr; + + mtr.start(); + mtr.set_spaces(*local_mtr); + mtr.set_log_mode_sub(*local_mtr); + + ut_ad(!index->table->is_temporary() + || local_mtr->get_log_mode() == MTR_LOG_NO_REDO); + + const uint32_t page_no = mach_read_from_4( + field_ref + BTR_EXTERN_PAGE_NO); + buf_block_t* ext_block; + + if (/* There is no external storage data */ + page_no == FIL_NULL + /* This field does not own the externally stored field */ + || (mach_read_from_1(field_ref + BTR_EXTERN_LEN) + & BTR_EXTERN_OWNER_FLAG) + /* Rollback and inherited field */ + || (rollback + && (mach_read_from_1(field_ref + BTR_EXTERN_LEN) + & BTR_EXTERN_INHERITED_FLAG))) { +skip_free: + /* Do not free */ + mtr.commit(); + + return; + } + + ext_block = buf_page_get(page_id_t(space_id, page_no), + ext_zip_size, RW_X_LATCH, &mtr); + + if (!ext_block) { + goto skip_free; + } + + /* The buffer pool block containing the BLOB pointer is + exclusively latched by local_mtr. To satisfy some design + constraints, we must recursively latch it in mtr as well. */ + block->fix(); + block->page.lock.x_lock(); + + mtr.memo_push(block, MTR_MEMO_PAGE_X_FIX); +#ifdef BTR_CUR_HASH_ADAPT + ut_ad(!btr_search_check_marked_free_index(block)); +#endif + + const page_t* page = buf_block_get_frame(ext_block); + + if (ext_zip_size) { + /* Note that page_zip will be NULL + in row_purge_upd_exist_or_extern(). */ + switch (fil_page_get_type(page)) { + case FIL_PAGE_TYPE_ZBLOB: + case FIL_PAGE_TYPE_ZBLOB2: + break; + default: + MY_ASSERT_UNREACHABLE(); + } + const uint32_t next_page_no = mach_read_from_4( + page + FIL_PAGE_NEXT); + + btr_page_free(index, ext_block, &mtr, true, + local_mtr->memo_contains( + *index->table->space)); + + if (UNIV_LIKELY_NULL(block->page.zip.data)) { + mach_write_to_4(field_ref + BTR_EXTERN_PAGE_NO, + next_page_no); + memset(field_ref + BTR_EXTERN_LEN + 4, 0, 4); + page_zip_write_blob_ptr(block, rec, index, + offsets, i, &mtr); + } else { + mtr.write<4>(*block, + BTR_EXTERN_PAGE_NO + field_ref, + next_page_no); + mtr.write<4,mtr_t::MAYBE_NOP>(*block, + BTR_EXTERN_LEN + + 4 + field_ref, + 0U); + } + } else { + ut_ad(!block->page.zip.data); + btr_check_blob_fil_page_type(*ext_block, "purge"); + + const uint32_t next_page_no = mach_read_from_4( + page + FIL_PAGE_DATA + + BTR_BLOB_HDR_NEXT_PAGE_NO); + btr_page_free(index, ext_block, &mtr, true, + local_mtr->memo_contains( + *index->table->space)); + + mtr.write<4>(*block, BTR_EXTERN_PAGE_NO + field_ref, + next_page_no); + /* Zero out the BLOB length. If the server + crashes during the execution of this function, + trx_rollback_all_recovered() could + dereference the half-deleted BLOB, fetching a + wrong prefix for the BLOB. */ + mtr.write<4,mtr_t::MAYBE_NOP>(*block, + BTR_EXTERN_LEN + 4 + + field_ref, 0U); + } + + /* Commit mtr and release the BLOB block to save memory. */ + btr_blob_free(ext_block, TRUE, &mtr); + } +} + +/***********************************************************//** +Frees the externally stored fields for a record. 
*/ +static +void +btr_rec_free_externally_stored_fields( +/*==================================*/ + dict_index_t* index, /*!< in: index of the data, the index + tree MUST be X-latched */ + rec_t* rec, /*!< in/out: record */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + buf_block_t* block, /*!< in: index page of rec */ + bool rollback,/*!< in: performing rollback? */ + mtr_t* mtr) /*!< in: mini-transaction handle which contains + an X-latch to record page and to the index + tree */ +{ + ulint n_fields; + ulint i; + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX)); + ut_ad(index->is_primary()); + ut_ad(page_rec_is_leaf(rec)); + /* Free possible externally stored fields in the record */ + + ut_ad(dict_table_is_comp(index->table) == !!rec_offs_comp(offsets)); + n_fields = rec_offs_n_fields(offsets); + + for (i = 0; i < n_fields; i++) { + if (rec_offs_nth_extern(offsets, i)) { + btr_free_externally_stored_field( + index, btr_rec_get_field_ref(rec, offsets, i), + rec, offsets, block, i, rollback, mtr); + } + } +} + +/***********************************************************//** +Frees the externally stored fields for a record, if the field is mentioned +in the update vector. */ +static +void +btr_rec_free_updated_extern_fields( +/*===============================*/ + dict_index_t* index, /*!< in: index of rec; the index tree MUST be + X-latched */ + rec_t* rec, /*!< in/out: record */ + buf_block_t* block, /*!< in: index page of rec */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + const upd_t* update, /*!< in: update vector */ + bool rollback,/*!< in: performing rollback? */ + mtr_t* mtr) /*!< in: mini-transaction handle which contains + an X-latch to record page and to the tree */ +{ + ulint n_fields; + ulint i; + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX)); + + /* Free possible externally stored fields in the record */ + + n_fields = upd_get_n_fields(update); + + for (i = 0; i < n_fields; i++) { + const upd_field_t* ufield = upd_get_nth_field(update, i); + + if (rec_offs_nth_extern(offsets, ufield->field_no)) { + ulint len; + byte* data = rec_get_nth_field( + rec, offsets, ufield->field_no, &len); + ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE); + + btr_free_externally_stored_field( + index, data + len - BTR_EXTERN_FIELD_REF_SIZE, + rec, offsets, block, + ufield->field_no, rollback, mtr); + } + } +} + +/*******************************************************************//** +Copies the prefix of an uncompressed BLOB. The clustered index record +that points to this BLOB must be protected by a lock or a page latch. 
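+Each BLOB page carries an 8-byte header at FIL_PAGE_DATA: the length of the
+part stored on that page (BTR_BLOB_HDR_PART_LEN) and the page number of the
+next part (BTR_BLOB_HDR_NEXT_PAGE_NO); the chain is terminated by FIL_NULL.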
+@return number of bytes written to buf */
+static
+ulint
+btr_copy_blob_prefix(
+/*=================*/
+	byte*		buf,	/*!< out: the externally stored part of
+				the field, or a prefix of it */
+	uint32_t	len,	/*!< in: length of buf, in bytes */
+	page_id_t	id,	/*!< in: page identifier of the first
+				BLOB page */
+	uint32_t	offset)	/*!< in: offset on the first BLOB page */
+{
+	ulint	copied_len	= 0;
+
+	for (;;) {
+		mtr_t		mtr;
+		buf_block_t*	block;
+		const page_t*	page;
+		const byte*	blob_header;
+		ulint		part_len;
+		ulint		copy_len;
+
+		mtr_start(&mtr);
+
+		block = buf_page_get(id, 0, RW_S_LATCH, &mtr);
+		if (!block || btr_check_blob_fil_page_type(*block, "read")) {
+			mtr.commit();
+			return copied_len;
+		}
+		page = buf_block_get_frame(block);
+
+		blob_header = page + offset;
+		part_len = btr_blob_get_part_len(blob_header);
+		copy_len = ut_min(part_len, len - copied_len);
+
+		memcpy(buf + copied_len,
+		       blob_header + BTR_BLOB_HDR_SIZE, copy_len);
+		copied_len += copy_len;
+
+		id.set_page_no(btr_blob_get_next_page_no(blob_header));
+
+		mtr_commit(&mtr);
+
+		if (id.page_no() == FIL_NULL || copy_len != part_len) {
+			MEM_CHECK_DEFINED(buf, copied_len);
+			return(copied_len);
+		}
+
+		/* On BLOB pages other than the first, the BLOB header
+		always is at the start of the page data: */
+
+		offset = FIL_PAGE_DATA;
+
+		ut_ad(copied_len <= len);
+	}
+}
+
+/** Copies the prefix of a compressed BLOB.
+The clustered index record that points to this BLOB must be protected
+by a lock or a page latch.
+@param[out]	buf		the externally stored part of the field,
+or a prefix of it
+@param[in]	len		length of buf, in bytes
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size
+@param[in]	id		page identifier of the BLOB pages
+@param[in]	offset		offset of the BLOB part on the first page
+@return number of bytes written to buf */
+static
+ulint
+btr_copy_zblob_prefix(
+	byte*		buf,
+	uint32_t	len,
+	ulint		zip_size,
+	page_id_t	id,
+	uint32_t	offset)
+{
+	ulint		page_type = FIL_PAGE_TYPE_ZBLOB;
+	mem_heap_t*	heap;
+	int		err;
+	z_stream	d_stream;
+
+	d_stream.next_out = buf;
+	d_stream.avail_out = static_cast<uInt>(len);
+	d_stream.next_in = Z_NULL;
+	d_stream.avail_in = 0;
+
+	/* Zlib inflate needs 32 kilobytes for the default
+	window size, plus a few kilobytes for small objects. */
+	heap = mem_heap_create(40000);
+	page_zip_set_alloc(&d_stream, heap);
+
+	ut_ad(zip_size);
+	ut_ad(ut_is_2pow(zip_size));
+	ut_ad(id.space());
+
+	err = inflateInit(&d_stream);
+	ut_a(err == Z_OK);
+
+	for (;;) {
+		buf_page_t*	bpage;
+		uint32_t	next_page_no;
+
+		/* There is no latch on bpage directly. Instead,
+		bpage is protected by the B-tree page latch that
+		is being held on the clustered index record, or,
+		in row_merge_copy_blobs(), by an exclusive table lock. */
+		bpage = buf_page_get_zip(id, zip_size);
+
+		if (UNIV_UNLIKELY(!bpage)) {
+			ib::error() << "Cannot load compressed BLOB " << id;
+			goto func_exit;
+		}
+
+		if (UNIV_UNLIKELY
+		    (fil_page_get_type(bpage->zip.data) != page_type)) {
+
+			ib::error() << "Unexpected type "
+				<< fil_page_get_type(bpage->zip.data)
+				<< " of compressed BLOB page " << id;
+
+			ut_ad(0);
+			goto end_of_blob;
+		}
+
+		next_page_no = mach_read_from_4(bpage->zip.data + offset);
+
+		if (UNIV_LIKELY(offset == FIL_PAGE_NEXT)) {
+			/* When the BLOB begins at the page header,
+			the compressed data payload does not
+			immediately follow the next page pointer.
*/ + offset = FIL_PAGE_DATA; + } else { + offset += 4; + } + + d_stream.next_in = bpage->zip.data + offset; + d_stream.avail_in = uInt(zip_size - offset); + + err = inflate(&d_stream, Z_NO_FLUSH); + switch (err) { + case Z_OK: + if (!d_stream.avail_out) { + goto end_of_blob; + } + break; + case Z_STREAM_END: + if (next_page_no == FIL_NULL) { + goto end_of_blob; + } + /* fall through */ + default: +inflate_error: + ib::error() << "inflate() of compressed BLOB page " + << id + << " returned " << err + << " (" << d_stream.msg << ")"; + + case Z_BUF_ERROR: + goto end_of_blob; + } + + if (next_page_no == FIL_NULL) { + if (!d_stream.avail_in) { + ib::error() + << "Unexpected end of compressed " + << "BLOB page " << id; + } else { + err = inflate(&d_stream, Z_FINISH); + switch (err) { + case Z_STREAM_END: + case Z_BUF_ERROR: + break; + default: + goto inflate_error; + } + } + +end_of_blob: + bpage->lock.s_unlock(); + bpage->unfix(); + goto func_exit; + } + + bpage->lock.s_unlock(); + bpage->unfix(); + + /* On other BLOB pages except the first + the BLOB header always is at the page header: */ + + id.set_page_no(next_page_no); + offset = FIL_PAGE_NEXT; + page_type = FIL_PAGE_TYPE_ZBLOB2; + } + +func_exit: + inflateEnd(&d_stream); + mem_heap_free(heap); + MEM_CHECK_DEFINED(buf, d_stream.total_out); + return(d_stream.total_out); +} + +/** Copies the prefix of an externally stored field of a record. +The clustered index record that points to this BLOB must be protected +by a lock or a page latch. +@param[out] buf the externally stored part of the +field, or a prefix of it +@param[in] len length of buf, in bytes +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] id page identifier of the first BLOB page +@param[in] offset offset on the first BLOB page +@return number of bytes written to buf */ +static +ulint +btr_copy_externally_stored_field_prefix_low( + byte* buf, + uint32_t len, + ulint zip_size, + page_id_t id, + uint32_t offset) +{ + if (len == 0) + return 0; + + return zip_size + ? btr_copy_zblob_prefix(buf, len, zip_size, id, offset) + : btr_copy_blob_prefix(buf, len, id, offset); +} + +/** Copies the prefix of an externally stored field of a record. +The clustered index record must be protected by a lock or a page latch. +@param[out] buf the field, or a prefix of it +@param[in] len length of buf, in bytes +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] data 'internally' stored part of the field +containing also the reference to the external part; must be protected by +a lock or a page latch +@param[in] local_len length of data, in bytes +@return the length of the copied field, or 0 if the column was being +or has been deleted */ +ulint +btr_copy_externally_stored_field_prefix( + byte* buf, + ulint len, + ulint zip_size, + const byte* data, + ulint local_len) +{ + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + + local_len -= BTR_EXTERN_FIELD_REF_SIZE; + + if (UNIV_UNLIKELY(local_len >= len)) { + memcpy(buf, data, len); + return(len); + } + + memcpy(buf, data, local_len); + data += local_len; + + ut_a(memcmp(data, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE)); + + if (!mach_read_from_4(data + BTR_EXTERN_LEN + 4)) { + /* The externally stored part of the column has been + (partially) deleted. Signal the half-deleted BLOB + to the caller. 
*/ + + return(0); + } + + uint32_t space_id = mach_read_from_4(data + BTR_EXTERN_SPACE_ID); + uint32_t page_no = mach_read_from_4(data + BTR_EXTERN_PAGE_NO); + uint32_t offset = mach_read_from_4(data + BTR_EXTERN_OFFSET); + len -= local_len; + + return(local_len + + btr_copy_externally_stored_field_prefix_low(buf + local_len, + uint32_t(len), + zip_size, + page_id_t( + space_id, + page_no), + offset)); +} + +/** Copies an externally stored field of a record to mem heap. +The clustered index record must be protected by a lock or a page latch. +@param[out] len length of the whole field +@param[in] data 'internally' stored part of the field +containing also the reference to the external part; must be protected by +a lock or a page latch +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] local_len length of data +@param[in,out] heap mem heap +@return the whole field copied to heap */ +byte* +btr_copy_externally_stored_field( + ulint* len, + const byte* data, + ulint zip_size, + ulint local_len, + mem_heap_t* heap) +{ + byte* buf; + + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + + local_len -= BTR_EXTERN_FIELD_REF_SIZE; + + uint32_t space_id = mach_read_from_4(data + local_len + + BTR_EXTERN_SPACE_ID); + uint32_t page_no = mach_read_from_4(data + local_len + + BTR_EXTERN_PAGE_NO); + uint32_t offset = mach_read_from_4(data + local_len + + BTR_EXTERN_OFFSET); + + /* Currently a BLOB cannot be bigger than 4 GB; we + leave the 4 upper bytes in the length field unused */ + + uint32_t extern_len = mach_read_from_4(data + local_len + + BTR_EXTERN_LEN + 4); + + buf = (byte*) mem_heap_alloc(heap, local_len + extern_len); + + memcpy(buf, data, local_len); + *len = local_len + + btr_copy_externally_stored_field_prefix_low(buf + local_len, + extern_len, + zip_size, + page_id_t( + space_id, + page_no), + offset); + + return(buf); +} + +/** Copies an externally stored field of a record to mem heap. +@param[in] rec record in a clustered index; must be +protected by a lock or a page latch +@param[in] offset array returned by rec_get_offsets() +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] no field number +@param[out] len length of the field +@param[in,out] heap mem heap +@return the field copied to heap, or NULL if the field is incomplete */ +byte* +btr_rec_copy_externally_stored_field( + const rec_t* rec, + const rec_offs* offsets, + ulint zip_size, + ulint no, + ulint* len, + mem_heap_t* heap) +{ + ulint local_len; + const byte* data; + + ut_a(rec_offs_nth_extern(offsets, no)); + + /* An externally stored field can contain some initial + data from the field, and in the last 20 bytes it has the + space id, page number, and offset where the rest of the + field data is stored, and the data length in addition to + the data stored locally. We may need to store some data + locally to get the local record length above the 128 byte + limit so that field offsets are stored in two bytes, and + the extern bit is available in those two bytes. */ + + data = rec_get_nth_field(rec, offsets, no, &local_len); + + ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); + + if (UNIV_UNLIKELY + (!memcmp(data + local_len - BTR_EXTERN_FIELD_REF_SIZE, + field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE))) { + /* The externally stored field was not written yet. + This record should only be seen by + trx_rollback_recovered() or any + TRX_ISO_READ_UNCOMMITTED transactions. 
+	*/
+		return(NULL);
+	}
+
+	return(btr_copy_externally_stored_field(len, data,
+						zip_size, local_len, heap));
+}
diff --git a/storage/innobase/btr/btr0defragment.cc b/storage/innobase/btr/btr0defragment.cc
new file mode 100644
index 00000000..642db0e9
--- /dev/null
+++ b/storage/innobase/btr/btr0defragment.cc
@@ -0,0 +1,820 @@
+/*****************************************************************************
+
+Copyright (C) 2012, 2014 Facebook, Inc. All Rights Reserved.
+Copyright (C) 2014, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+/**************************************************//**
+@file btr/btr0defragment.cc
+Index defragmentation.
+
+Created 05/29/2014 Rongrong Zhong
+Modified 16/07/2014 Sunguck Lee
+Modified 30/07/2014 Jan Lindström jan.lindstrom@mariadb.com
+*******************************************************/
+
+#include "btr0defragment.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "btr0sea.h"
+#include "btr0pcur.h"
+#include "dict0stats.h"
+#include "dict0stats_bg.h"
+#include "dict0defrag_bg.h"
+#include "ibuf0ibuf.h"
+#include "lock0lock.h"
+#include "srv0start.h"
+#include "mysqld.h"
+
+#include <list>
+
+/* When there's no work, either because defragment is disabled, or because no
+query is submitted, the thread checks the state every
+BTR_DEFRAGMENT_SLEEP_IN_USECS. */
+#define BTR_DEFRAGMENT_SLEEP_IN_USECS		1000000
+/* Reduce the target page size by this amount when a compression failure
+happens during defragmentation. 512 is chosen because it is a power of 2 and
+it is about 3% of the page size. When there are compression failures in
+defragmentation, our goal is to get a decent defrag ratio with as few
+compression failures as possible. From experimentation it seems that
+reducing the target size by 512 each time will make sure the page is
+compressible within a couple of iterations. */
+#define BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE	512
+
+/** Item in the work queue for btr_defragment_thread. */
+struct btr_defragment_item_t
+{
+  /** persistent cursor where btr_defragment_n_pages should start */
+  btr_pcur_t * const pcur;
+  /** completion signal */
+  pthread_cond_t *cond;
+  /** timestamp of the last time this index was processed by the defragment
+  thread */
+  ulonglong last_processed= 0;
+
+  btr_defragment_item_t(btr_pcur_t *pcur, pthread_cond_t *cond)
+    : pcur(pcur), cond(cond) {}
+};
+
+/* Work queue for defragmentation. */
+typedef std::list<btr_defragment_item_t*> btr_defragment_wq_t;
+static btr_defragment_wq_t btr_defragment_wq;
+
+/* Mutex protecting the defragmentation work queue. */
+static mysql_mutex_t btr_defragment_mutex;
+#ifdef UNIV_PFS_MUTEX
+mysql_pfs_key_t btr_defragment_mutex_key;
+#endif /* UNIV_PFS_MUTEX */
+
+/* Number of compression failures caused by defragmentation since server
+start. */
+Atomic_counter<ulint> btr_defragment_compression_failures;
+/* Number of btr_defragment_n_pages calls that altered a page but did not
+manage to release any page. */
+Atomic_counter<ulint> btr_defragment_failures;
+/* Total number of btr_defragment_n_pages calls that altered a page.
+The difference between btr_defragment_count and btr_defragment_failures
+shows the amount of effort wasted. */
+Atomic_counter<ulint> btr_defragment_count;
+
+bool btr_defragment_active;
+static void btr_defragment_chunk(void*);
+
+static tpool::timer* btr_defragment_timer;
+static tpool::task_group task_group(1);
+static tpool::task btr_defragment_task(btr_defragment_chunk, 0, &task_group);
+static void btr_defragment_start();
+
+static void submit_defragment_task(void*arg=0)
+{
+	srv_thread_pool->submit_task(&btr_defragment_task);
+}
+
+/******************************************************************//**
+Initialize defragmentation. */
+void
+btr_defragment_init()
+{
+	srv_defragment_interval = 1000000000ULL / srv_defragment_frequency;
+	mysql_mutex_init(btr_defragment_mutex_key, &btr_defragment_mutex,
+			 nullptr);
+	btr_defragment_timer = srv_thread_pool->create_timer(submit_defragment_task);
+	btr_defragment_active = true;
+}
+
+/******************************************************************//**
+Shutdown defragmentation. Release all resources. */
+void
+btr_defragment_shutdown()
+{
+	if (!btr_defragment_timer)
+		return;
+	delete btr_defragment_timer;
+	btr_defragment_timer = 0;
+	task_group.cancel_pending(&btr_defragment_task);
+	mysql_mutex_lock(&btr_defragment_mutex);
+	std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
+	while (iter != btr_defragment_wq.end()) {
+		btr_defragment_item_t* item = *iter;
+		iter = btr_defragment_wq.erase(iter);
+		if (item->cond) {
+			pthread_cond_signal(item->cond);
+		}
+	}
+	mysql_mutex_unlock(&btr_defragment_mutex);
+	mysql_mutex_destroy(&btr_defragment_mutex);
+	btr_defragment_active = false;
+}
+
+
+/******************************************************************//**
+Functions used by the query threads: btr_defragment_xxx_index
+Query threads find/add/remove index. */
+/******************************************************************//**
+Check whether the given index is in btr_defragment_wq. We use index->id
+to identify indices. */
+bool
+btr_defragment_find_index(
+	dict_index_t*	index)	/*!< Index to find. */
+{
+	mysql_mutex_lock(&btr_defragment_mutex);
+	for (std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
+	     iter != btr_defragment_wq.end();
+	     ++iter) {
+		btr_defragment_item_t* item = *iter;
+		btr_pcur_t* pcur = item->pcur;
+		btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur);
+		dict_index_t* idx = btr_cur_get_index(cursor);
+		if (index->id == idx->id) {
+			mysql_mutex_unlock(&btr_defragment_mutex);
+			return true;
+		}
+	}
+	mysql_mutex_unlock(&btr_defragment_mutex);
+	return false;
+}
+
+/** Defragment an index.
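+The caller enqueues a work item and then blocks on a condition variable,
+waking once a second to check thd_killed(), until the defragmentation task
+signals completion.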
+@param pcur persistent cursor
+@param thd current session, for checking thd_killed()
+@return whether the operation was interrupted */
+bool btr_defragment_add_index(btr_pcur_t *pcur, THD *thd)
+{
+  dict_stats_empty_defrag_summary(pcur->index());
+  pthread_cond_t cond;
+  pthread_cond_init(&cond, nullptr);
+  btr_defragment_item_t item(pcur, &cond);
+  mysql_mutex_lock(&btr_defragment_mutex);
+  btr_defragment_wq.push_back(&item);
+  if (btr_defragment_wq.size() == 1)
+    /* Kick off defragmentation work */
+    btr_defragment_start();
+  bool interrupted= false;
+  for (;;)
+  {
+    timespec abstime;
+    set_timespec(abstime, 1);
+    if (!my_cond_timedwait(&cond, &btr_defragment_mutex.m_mutex, &abstime))
+      break;
+    if (thd_killed(thd))
+    {
+      item.cond= nullptr;
+      interrupted= true;
+      break;
+    }
+  }
+
+  pthread_cond_destroy(&cond);
+  mysql_mutex_unlock(&btr_defragment_mutex);
+  return interrupted;
+}
+
+/******************************************************************//**
+When a table is dropped, this function is called to mark the table's entries
+as removed in btr_defragment_wq: the waiting session is signalled and the
+completion condition is cleared, so that the defragmentation thread will
+discard the items when it next sees them. */
+void
+btr_defragment_remove_table(
+	dict_table_t*	table)	/*!< Table to be removed. */
+{
+  mysql_mutex_lock(&btr_defragment_mutex);
+  for (auto item : btr_defragment_wq)
+  {
+    if (item->cond && table == item->pcur->index()->table)
+    {
+      pthread_cond_signal(item->cond);
+      item->cond= nullptr;
+    }
+  }
+  mysql_mutex_unlock(&btr_defragment_mutex);
+}
+
+/*********************************************************************//**
+Check whether we should save defragmentation statistics to persistent storage.
+Currently we save the stats to persistent storage once every
+srv_defragment_stats_accuracy updates. */
+void btr_defragment_save_defrag_stats_if_needed(dict_index_t *index)
+{
+	if (srv_defragment_stats_accuracy != 0 // 0 disables stats tracking
+	    && index->table->space_id != 0 // do not track system tables
+	    && !index->table->is_temporary()
+	    && index->stat_defrag_modified_counter
+	       >= srv_defragment_stats_accuracy) {
+		dict_stats_defrag_pool_add(index);
+		index->stat_defrag_modified_counter = 0;
+	}
+}
+
+/*********************************************************************//**
+Main defragmentation functionality used by the defragment thread. */
+/*************************************************************//**
+Calculate the number of records from the beginning of the block that can
+fit into size_limit
+@return number of records */
+static
+ulint
+btr_defragment_calc_n_recs_for_size(
+	buf_block_t* block,	/*!< in: B-tree page */
+	dict_index_t* index,	/*!< in: index of the page */
+	ulint size_limit,	/*!< in: size limit to fit records in */
+	ulint* n_recs_size)	/*!< out: actual size of the records that fit
+				in size_limit. */
+{
+	page_t* page = buf_block_get_frame(block);
+	ulint n_recs = 0;
+	rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+	rec_offs* offsets = offsets_;
+	rec_offs_init(offsets_);
+	mem_heap_t* heap = NULL;
+	ulint size = 0;
+	page_cur_t cur;
+
+	const ulint n_core = page_is_leaf(page) ?
index->n_core_fields : 0;
+	page_cur_set_before_first(block, &cur);
+	while (rec_t* cur_rec = page_cur_move_to_next(&cur)) {
+		if (page_rec_is_supremum(cur_rec)) {
+			break;
+		}
+		offsets = rec_get_offsets(cur_rec, index, offsets, n_core,
+					  ULINT_UNDEFINED, &heap);
+		ulint rec_size = rec_offs_size(offsets);
+		size += rec_size;
+		if (size > size_limit) {
+			size = size - rec_size;
+			break;
+		}
+		n_recs++;
+	}
+	*n_recs_size = size;
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+	return n_recs;
+}
+
+MY_ATTRIBUTE((nonnull(2,3,4), warn_unused_result))
+/************************************************************//**
+Returns the upper level node pointer to a page. It is assumed that mtr holds
+an sx-latch on the tree.
+@return rec_get_offsets() of the node pointer record */
+static
+rec_offs*
+btr_page_search_father_node_ptr(
+	rec_offs*	offsets,/*!< in: work area for the return value */
+	mem_heap_t*	heap,	/*!< in: memory heap to use */
+	btr_cur_t*	cursor,	/*!< in: cursor pointing to user record,
+				out: cursor on node pointer record,
+				its page x-latched */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	const uint32_t page_no = btr_cur_get_block(cursor)->page.id().page_no();
+	dict_index_t* index = btr_cur_get_index(cursor);
+	ut_ad(!index->is_spatial());
+
+	ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+					 | MTR_MEMO_SX_LOCK));
+	ut_ad(dict_index_get_page(index) != page_no);
+
+	const auto level = btr_page_get_level(btr_cur_get_page(cursor));
+
+	const rec_t* user_rec = btr_cur_get_rec(cursor);
+	ut_a(page_rec_is_user_rec(user_rec));
+
+	if (btr_cur_search_to_nth_level(level + 1,
+					dict_index_build_node_ptr(index,
+								  user_rec, 0,
+								  heap, level),
+					RW_X_LATCH,
+					cursor, mtr) != DB_SUCCESS) {
+		return nullptr;
+	}
+
+	const rec_t* node_ptr = btr_cur_get_rec(cursor);
+	ut_ad(!btr_cur_get_block(cursor)->page.lock.not_recursive()
+	      || mtr->memo_contains(index->lock, MTR_MEMO_X_LOCK));
+
+	offsets = rec_get_offsets(node_ptr, index, offsets, 0,
+				  ULINT_UNDEFINED, &heap);
+
+	if (btr_node_ptr_get_child_page_no(node_ptr, offsets) != page_no) {
+		offsets = nullptr;
+	}
+
+	return(offsets);
+}
+
+static bool btr_page_search_father(mtr_t *mtr, btr_cur_t *cursor)
+{
+  rec_t *rec=
+    page_rec_get_next(page_get_infimum_rec(cursor->block()->page.frame));
+  if (UNIV_UNLIKELY(!rec))
+    return false;
+  cursor->page_cur.rec= rec;
+  mem_heap_t *heap= mem_heap_create(100);
+  const bool got= btr_page_search_father_node_ptr(nullptr, heap, cursor, mtr);
+  mem_heap_free(heap);
+  return got;
+}
+
+/*************************************************************//**
+Merge as many records as possible from the from_block to the to_block.
+Delete the from_block if all records are successfully merged to to_block.
+@return the to_block to target for the next merge operation.
+@retval nullptr if corruption was noticed */
+static
+buf_block_t*
+btr_defragment_merge_pages(
+	dict_index_t*	index,		/*!< in: index tree */
+	buf_block_t*	from_block,	/*!< in: origin of merge */
+	buf_block_t*	to_block,	/*!< in: destination of merge */
+	ulint		zip_size,	/*!< in: ROW_FORMAT=COMPRESSED size */
+	ulint		reserved_space,	/*!< in: space reserved for future
+					insert to avoid immediate page split */
+	ulint*		max_data_size,	/*!< in/out: max data size to
+					fit in a single compressed page.
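+					This estimate is lowered whenever
+					page compression fails, so that
+					subsequent merges aim at a smaller
+					target; see the adjustment near the
+					end of this function.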
*/
+	mem_heap_t*	heap,		/*!< in/out: pointer to memory heap */
+	mtr_t*		mtr)		/*!< in/out: mini-transaction */
+{
+	page_t* from_page = buf_block_get_frame(from_block);
+	page_t* to_page = buf_block_get_frame(to_block);
+	ulint level = btr_page_get_level(from_page);
+	ulint n_recs = page_get_n_recs(from_page);
+	ulint new_data_size = page_get_data_size(to_page);
+	ulint max_ins_size =
+		page_get_max_insert_size(to_page, n_recs);
+	ulint max_ins_size_reorg =
+		page_get_max_insert_size_after_reorganize(
+			to_page, n_recs);
+	ulint max_ins_size_to_use = max_ins_size_reorg > reserved_space
+		? max_ins_size_reorg - reserved_space : 0;
+	ulint move_size = 0;
+	ulint n_recs_to_move = 0;
+	rec_t* rec = NULL;
+	ulint target_n_recs = 0;
+	rec_t* orig_pred;
+
+	// Estimate how many records can be moved from the from_page to
+	// the to_page.
+	if (zip_size) {
+		ulint page_diff = srv_page_size - *max_data_size;
+		max_ins_size_to_use = (max_ins_size_to_use > page_diff)
+			? max_ins_size_to_use - page_diff : 0;
+	}
+	n_recs_to_move = btr_defragment_calc_n_recs_for_size(
+		from_block, index, max_ins_size_to_use, &move_size);
+
+	// If max_ins_size >= move_size, we can move the records without
+	// reorganizing the page, otherwise we need to reorganize the page
+	// first to release more space.
+	if (move_size > max_ins_size) {
+		dberr_t err = btr_page_reorganize_block(page_zip_level,
+							to_block, index, mtr);
+		if (err != DB_SUCCESS) {
+			if (!dict_index_is_clust(index)
+			    && page_is_leaf(to_page)) {
+				ibuf_reset_free_bits(to_block);
+			}
+			// If the reorganization fails, the page is not
+			// compressible. There is no point in trying to
+			// merge into this page. Continue to the next page.
+			return err == DB_FAIL ? from_block : nullptr;
+		}
+		ut_ad(page_validate(to_page, index));
+		max_ins_size = page_get_max_insert_size(to_page, n_recs);
+		if (max_ins_size < move_size) {
+			return nullptr;
+		}
+	}
+
+	// Move records to pack the to_page as full as possible.
+	orig_pred = NULL;
+	target_n_recs = n_recs_to_move;
+	dberr_t err;
+	while (n_recs_to_move > 0) {
+		if (!(rec = page_rec_get_nth(from_page, n_recs_to_move + 1))) {
+			return nullptr;
+		}
+		orig_pred = page_copy_rec_list_start(
+			to_block, from_block, rec, index, mtr, &err);
+		if (orig_pred)
+			break;
+		if (err != DB_FAIL) {
+			return nullptr;
+		}
+
+		// If we reach here, compression failed after packing
+		// n_recs_to_move records into to_page. We try to reduce
+		// the targeted data size on the to_page by
+		// BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE and try again.
+		btr_defragment_compression_failures++;
+		max_ins_size_to_use =
+			move_size > BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE
+			? move_size - BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE
+			: 0;
+		if (max_ins_size_to_use == 0) {
+			n_recs_to_move = 0;
+			move_size = 0;
+			break;
+		}
+		n_recs_to_move = btr_defragment_calc_n_recs_for_size(
+			from_block, index, max_ins_size_to_use, &move_size);
+	}
+	// If fewer than target_n_recs records are moved, there were
+	// compression failures during page_copy_rec_list_start. Adjust
+	// the max_data_size estimate to reduce compression failures
+	// in the following runs.
+	if (target_n_recs > n_recs_to_move
+	    && *max_data_size > new_data_size + move_size) {
+		*max_data_size = new_data_size + move_size;
+	}
+	// Set ibuf free bits if necessary.
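+	// (Only leaf pages of secondary indexes carry change buffer bitmap
+	// bits. For ROW_FORMAT=COMPRESSED pages the bits are merely reset,
+	// because the remaining compressed free space cannot be derived
+	// from the uncompressed page frame alone.)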
+ if (!dict_index_is_clust(index) + && page_is_leaf(to_page)) { + if (zip_size) { + ibuf_reset_free_bits(to_block); + } else { + ibuf_update_free_bits_if_full( + to_block, + srv_page_size, + ULINT_UNDEFINED); + } + } + btr_cur_t parent; + parent.page_cur.index = index; + parent.page_cur.block = from_block; + + if (!btr_page_search_father(mtr, &parent)) { + to_block = nullptr; + } else if (n_recs_to_move == n_recs) { + /* The whole page is merged with the previous page, + free it. */ + lock_update_merge_left(*to_block, orig_pred, + from_block->page.id()); + btr_search_drop_page_hash_index(from_block, false); + if (btr_level_list_remove(*from_block, *index, mtr) + != DB_SUCCESS + || btr_cur_node_ptr_delete(&parent, mtr) != DB_SUCCESS + || btr_page_free(index, from_block, mtr) != DB_SUCCESS) { + return nullptr; + } + } else { + // There are still records left on the page, so + // increment n_defragmented. Node pointer will be changed + // so remove the old node pointer. + if (n_recs_to_move > 0) { + // Part of the page is merged to left, remove + // the merged records, update record locks and + // node pointer. + dtuple_t* node_ptr; + page_delete_rec_list_start(rec, from_block, + index, mtr); + lock_update_split_and_merge(to_block, + orig_pred, + from_block); + // FIXME: reuse the node_ptr! + if (btr_cur_node_ptr_delete(&parent, mtr) + != DB_SUCCESS) { + return nullptr; + } + rec = page_rec_get_next( + page_get_infimum_rec(from_page)); + if (!rec) { + return nullptr; + } + node_ptr = dict_index_build_node_ptr( + index, rec, page_get_page_no(from_page), + heap, level); + if (btr_insert_on_non_leaf_level(0, index, level+1, + node_ptr, mtr) + != DB_SUCCESS) { + return nullptr; + } + } + to_block = from_block; + } + return to_block; +} + +/*************************************************************//** +Tries to merge N consecutive pages, starting from the page pointed by the +cursor. Skip space 0. Only consider leaf pages. +This function first loads all N pages into memory, then for each of +the pages other than the first page, it tries to move as many records +as possible to the left sibling to keep the left sibling full. During +the process, if any page becomes empty, that page will be removed from +the level list. Record locks, hash, and node pointers are updated after +page reorganization. +@return pointer to the last block processed, or NULL if reaching end of index */ +static +buf_block_t* +btr_defragment_n_pages( + buf_block_t* block, /*!< in: starting block for defragmentation */ + dict_index_t* index, /*!< in: index tree */ + uint n_pages,/*!< in: number of pages to defragment */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + /* We will need to load the n+1 block because if the last page is freed + and we need to modify the prev_page_no of that block. */ + buf_block_t* blocks[BTR_DEFRAGMENT_MAX_N_PAGES + 1]; + page_t* first_page; + buf_block_t* current_block; + ulint total_data_size = 0; + ulint total_n_recs = 0; + ulint data_size_per_rec; + ulint optimal_page_size; + ulint reserved_space; + ulint max_data_size = 0; + uint n_defragmented = 0; + uint n_new_slots; + mem_heap_t* heap; + ibool end_of_index = FALSE; + + /* It doesn't make sense to call this function with n_pages = 1. */ + ut_ad(n_pages > 1); + + if (!page_is_leaf(block->page.frame)) { + return NULL; + } + + if (!index->table->space || !index->table->space_id) { + /* Ignore space 0. 
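+	   The system tablespace holds shared internal structures, so only
+	   per-table tablespaces are defragmented.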
*/
+		return NULL;
+	}
+
+	if (n_pages > BTR_DEFRAGMENT_MAX_N_PAGES) {
+		n_pages = BTR_DEFRAGMENT_MAX_N_PAGES;
+	}
+
+	first_page = buf_block_get_frame(block);
+	const ulint zip_size = index->table->space->zip_size();
+
+	/* 1. Load the pages and calculate the total data size. */
+	blocks[0] = block;
+	for (uint i = 1; i <= n_pages; i++) {
+		page_t* page = buf_block_get_frame(blocks[i-1]);
+		uint32_t page_no = btr_page_get_next(page);
+		total_data_size += page_get_data_size(page);
+		total_n_recs += page_get_n_recs(page);
+		if (page_no == FIL_NULL) {
+			n_pages = i;
+			end_of_index = TRUE;
+			break;
+		}
+
+		blocks[i] = btr_block_get(*index, page_no, RW_X_LATCH, true,
+					  mtr);
+		if (!blocks[i]) {
+			return nullptr;
+		}
+	}
+
+	if (n_pages == 1) {
+		if (!page_has_prev(first_page)) {
+			/* last page in the index */
+			if (dict_index_get_page(index)
+			    == page_get_page_no(first_page))
+				return NULL;
+			/* The given page is the last page.
+			Lift the records to the father page. */
+			dberr_t err;
+			btr_lift_page_up(index, block, mtr, &err);
+		}
+		return NULL;
+	}
+
+	/* 2. Calculate how many pages the data can fit in. If the pages
+	are not compressible, return early. */
+	ut_a(total_n_recs != 0);
+	data_size_per_rec = total_data_size / total_n_recs;
+	// For uncompressed pages, the optimal data size is the free space
+	// of an empty page.
+	optimal_page_size = page_get_free_space_of_empty(
+		page_is_comp(first_page));
+	// For compressed pages, we take compression failures into account.
+	if (zip_size) {
+		ulint size = 0;
+		uint i = 0;
+		// We estimate the optimal data size of the index using
+		// samples of the data size. These samples are taken when
+		// pages fail to compress due to insertion on the page.
+		// We use the average of all the samples we have as the
+		// estimate. Different pages of the same index vary in
+		// compressibility. The average gives a good enough estimate.
+		for (; i < STAT_DEFRAG_DATA_SIZE_N_SAMPLE; i++) {
+			if (index->stat_defrag_data_size_sample[i] == 0) {
+				break;
+			}
+			size += index->stat_defrag_data_size_sample[i];
+		}
+		if (i != 0) {
+			size /= i;
+			optimal_page_size = ut_min(optimal_page_size, size);
+		}
+		max_data_size = optimal_page_size;
+	}
+
+	reserved_space = ut_min(static_cast<ulint>(
+					static_cast<double>(optimal_page_size)
+					* (1 - srv_defragment_fill_factor)),
+				(data_size_per_rec
+				 * srv_defragment_fill_factor_n_recs));
+	optimal_page_size -= reserved_space;
+	n_new_slots = uint((total_data_size + optimal_page_size - 1)
+			   / optimal_page_size);
+	if (n_new_slots >= n_pages) {
+		/* Can't defragment. */
+		if (end_of_index)
+			return NULL;
+		return blocks[n_pages-1];
+	}
+
+	/* 3. Defragment pages. */
+	heap = mem_heap_create(256);
+	// The first defragmented page will be the first page.
+	current_block = blocks[0];
+	// Start from the second page.
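+	// Loop invariant: current_block is the left-most page that still
+	// has room; each iteration tries to empty blocks[i] into it. If a
+	// page cannot be emptied completely, it becomes the new
+	// current_block (see btr_defragment_merge_pages() above).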
+	for (uint i = 1; i < n_pages; i++) {
+		buf_block_t* new_block = btr_defragment_merge_pages(
+			index, blocks[i], current_block, zip_size,
+			reserved_space, &max_data_size, heap, mtr);
+		if (new_block != current_block) {
+			n_defragmented++;
+			current_block = new_block;
+			if (!new_block) {
+				break;
+			}
+		}
+	}
+	mem_heap_free(heap);
+	n_defragmented++;
+	btr_defragment_count++;
+	if (n_pages == n_defragmented) {
+		btr_defragment_failures++;
+	} else {
+		index->stat_defrag_n_pages_freed += (n_pages - n_defragmented);
+	}
+	if (end_of_index)
+		return NULL;
+	return current_block;
+}
+
+
+
+void btr_defragment_start() {
+	if (!srv_defragment)
+		return;
+	ut_ad(!btr_defragment_wq.empty());
+	submit_defragment_task();
+}
+
+
+/**
+Callback used by the defragment timer.
+
+The throttling "sleep" is implemented by rescheduling the
+threadpool timer, which, when fired, will resume the work
+where it was left off.
+
+The state (the current work item) is kept in the work queue.
+*/
+static void btr_defragment_chunk(void*)
+{
+	THD *thd = innobase_create_background_thd("InnoDB defragment");
+	set_current_thd(thd);
+
+	btr_defragment_item_t* item = nullptr;
+	mtr_t mtr;
+
+	mysql_mutex_lock(&btr_defragment_mutex);
+
+	while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
+		if (!item) {
+			if (btr_defragment_wq.empty()) {
+release_and_exit:
+				mysql_mutex_unlock(&btr_defragment_mutex);
+func_exit:
+				set_current_thd(nullptr);
+				destroy_background_thd(thd);
+				return;
+			}
+			item = *btr_defragment_wq.begin();
+			ut_ad(item);
+		}
+
+		if (!item->cond) {
+processed:
+			btr_defragment_wq.remove(item);
+			item = nullptr;
+			continue;
+		}
+
+		mysql_mutex_unlock(&btr_defragment_mutex);
+
+		ulonglong now = my_interval_timer();
+		ulonglong elapsed = now - item->last_processed;
+
+		if (elapsed < srv_defragment_interval) {
+			/* If we see an index again before the interval
+			determined by the configured frequency is reached,
+			we just sleep until the interval passes. Since
+			the defragmentation of all indices queues up on a
+			single thread, it is likely that the indices that
+			follow this one do not need to sleep again. */
+			int sleep_ms = (int)((srv_defragment_interval - elapsed) / 1000 / 1000);
+			if (sleep_ms) {
+				btr_defragment_timer->set_time(sleep_ms, 0);
+				goto func_exit;
+			}
+		}
+		log_free_check();
+		mtr_start(&mtr);
+		dict_index_t *index = item->pcur->index();
+		index->set_modified(mtr);
+		/* To follow the latching order defined in WL#6326,
+		acquire index->lock X-latch. This entitles us to
+		acquire page latches in any order for the index. */
+		mtr_x_lock_index(index, &mtr);
+		if (buf_block_t *last_block =
+		    item->pcur->restore_position(
+			BTR_PURGE_TREE_ALREADY_LATCHED, &mtr)
+		    == btr_pcur_t::CORRUPTED
+		    ? nullptr
+		    : btr_defragment_n_pages(btr_pcur_get_block(item->pcur),
+					     index, srv_defragment_n_pages,
+					     &mtr)) {
+			/* If we haven't reached the end of the index,
+			place the cursor on the last record of the last page,
+			store the cursor position, and put the item back in
+			the queue. */
+			page_t* last_page = buf_block_get_frame(last_block);
+			rec_t* rec = page_rec_get_prev(
+				page_get_supremum_rec(last_page));
+			if (rec && page_rec_is_user_rec(rec)) {
+				page_cur_position(rec, last_block,
+						  btr_pcur_get_page_cur(
+							item->pcur));
+			}
+			btr_pcur_store_position(item->pcur, &mtr);
+			mtr_commit(&mtr);
+			/* Update the last_processed time of this index. */
+			item->last_processed = now;
+			mysql_mutex_lock(&btr_defragment_mutex);
+		} else {
+			mtr_commit(&mtr);
+			/* Reaching the end of the index.
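+			Persist the defragmentation statistics and wake up
+			the waiting session before dropping the work item.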
*/
+			dict_stats_empty_defrag_stats(index);
+			if (dberr_t err= dict_stats_save_defrag_stats(index)) {
+				ib::error() << "Saving defragmentation stats for table "
+					    << index->table->name
+					    << " index " << index->name()
+					    << " failed with error " << err;
+			} else {
+				err = dict_stats_save_defrag_summary(index,
+								     thd);
+
+				if (err != DB_SUCCESS) {
+					ib::error() << "Saving defragmentation summary for table "
+						    << index->table->name
+						    << " index " << index->name()
+						    << " failed with error " << err;
+				}
+			}
+
+			mysql_mutex_lock(&btr_defragment_mutex);
+			if (item->cond) {
+				pthread_cond_signal(item->cond);
+			}
+			goto processed;
+		}
+	}
+
+	goto release_and_exit;
+}
diff --git a/storage/innobase/btr/btr0pcur.cc b/storage/innobase/btr/btr0pcur.cc
new file mode 100644
index 00000000..54dd15ac
--- /dev/null
+++ b/storage/innobase/btr/btr0pcur.cc
@@ -0,0 +1,667 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file btr/btr0pcur.cc
+The index tree persistent cursor
+
+Created 2/23/1996 Heikki Tuuri
+*******************************************************/
+
+#include "btr0pcur.h"
+#include "ut0byte.h"
+#include "rem0cmp.h"
+#include "trx0trx.h"
+
+/**************************************************************//**
+Resets a persistent cursor object, freeing ::old_rec_buf if it is
+allocated and resetting the other members to their initial values. */
+void
+btr_pcur_reset(
+/*===========*/
+	btr_pcur_t*	cursor)	/*!< in, out: persistent cursor */
+{
+	ut_free(cursor->old_rec_buf);
+	memset(&cursor->btr_cur.page_cur, 0, sizeof(page_cur_t));
+	cursor->old_rec_buf = NULL;
+	cursor->old_rec = NULL;
+	cursor->old_n_core_fields = 0;
+	cursor->old_n_fields = 0;
+
+	cursor->latch_mode = BTR_NO_LATCHES;
+	cursor->pos_state = BTR_PCUR_NOT_POSITIONED;
+}
+
+/**************************************************************//**
+The position of the cursor is stored by taking an initial segment of the
+record the cursor is positioned on, before, or after, and copying it to the
+cursor data structure, or just setting a flag if the cursor is before the
+first in an EMPTY tree, or after the last in an EMPTY tree. NOTE that the
+page where the cursor is positioned must not be empty if the index tree is
+not totally empty!
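+
+For example, if the cursor is on the page supremum, the preceding user
+record is copied and rel_pos is set to BTR_PCUR_AFTER; a later restore
+then searches with PAGE_CUR_G for the first record greater than the
+stored one.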
*/
+void
+btr_pcur_store_position(
+/*====================*/
+	btr_pcur_t*	cursor, /*!< in: persistent cursor */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	page_cur_t*	page_cursor;
+	buf_block_t*	block;
+	rec_t*		rec;
+	dict_index_t*	index;
+	ulint		offs;
+
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+	block = btr_pcur_get_block(cursor);
+	index = btr_cur_get_index(btr_pcur_get_btr_cur(cursor));
+
+	page_cursor = btr_pcur_get_page_cur(cursor);
+
+	rec = page_cur_get_rec(page_cursor);
+	offs = rec - block->page.frame;
+	ut_ad(block->page.id().page_no()
+	      == page_get_page_no(block->page.frame));
+	ut_ad(block->page.buf_fix_count());
+	/* For a spatial index, when we position on the parent
+	buffer if necessary, it might not hold latches, but the
+	tree must be locked to prevent changes to the page. */
+	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_S_FIX
+					 | MTR_MEMO_PAGE_X_FIX)
+	      || (index->is_spatial()
+		  && mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
+						| MTR_MEMO_SX_LOCK)));
+
+	if (page_is_empty(block->page.frame)) {
+		/* It must be an empty index tree; NOTE that in this case
+		we do not store the modify_clock, but always do a search
+		if we restore the cursor position */
+
+		ut_a(!page_has_siblings(block->page.frame));
+		ut_ad(page_is_leaf(block->page.frame));
+		ut_ad(block->page.id().page_no() == index->page);
+
+		if (page_rec_is_supremum_low(offs)) {
+			cursor->rel_pos = BTR_PCUR_AFTER_LAST_IN_TREE;
+		} else {
+before_first:
+			cursor->rel_pos = BTR_PCUR_BEFORE_FIRST_IN_TREE;
+		}
+
+		return;
+	}
+
+	if (page_rec_is_supremum_low(offs)) {
+		rec = page_rec_get_prev(rec);
+		if (UNIV_UNLIKELY(!rec || page_rec_is_infimum(rec))) {
+			ut_ad("corrupted index" == 0);
+			cursor->rel_pos = BTR_PCUR_AFTER_LAST_IN_TREE;
+			return;
+		}
+
+		ut_ad(!page_rec_is_infimum(rec));
+		if (UNIV_UNLIKELY(rec_is_metadata(rec, *index))) {
+#if 0 /* MDEV-22867 had to relax this */
+			/* If the table is emptied during an ALGORITHM=NOCOPY
+			DROP COLUMN ... that is not ALGORITHM=INSTANT,
+			then we must preserve any instant ADD metadata. */
+			ut_ad(index->table->instant
+			      || block->page.id().page_no() != index->page);
+#endif
+			ut_ad(index->is_instant()
+			      || block->page.id().page_no() != index->page);
+			ut_ad(page_get_n_recs(block->page.frame) == 1);
+			ut_ad(page_is_leaf(block->page.frame));
+			ut_ad(!page_has_prev(block->page.frame));
+			cursor->rel_pos = BTR_PCUR_AFTER_LAST_IN_TREE;
+			return;
+		}
+
+		cursor->rel_pos = BTR_PCUR_AFTER;
+	} else if (page_rec_is_infimum_low(offs)) {
+		rec = page_rec_get_next(rec);
+
+		if (UNIV_UNLIKELY(!rec)) {
+			ut_ad("corrupted page" == 0);
+			goto before_first;
+		}
+
+		if (rec_is_metadata(rec, *index)) {
+			ut_ad(!page_has_prev(block->page.frame));
+			rec = page_rec_get_next(rec);
+			ut_ad(rec);
+			if (!rec || page_rec_is_supremum(rec)) {
+				goto before_first;
+			}
+		}
+
+		cursor->rel_pos = BTR_PCUR_BEFORE;
+	} else {
+		cursor->rel_pos = BTR_PCUR_ON;
+	}
+
+	if (index->is_ibuf()) {
+		ut_ad(!index->table->not_redundant());
+		cursor->old_n_fields = uint16_t(rec_get_n_fields_old(rec));
+	} else {
+		cursor->old_n_fields = static_cast<uint16_t>(
+			dict_index_get_n_unique_in_tree(index));
+		if (index->is_spatial() && !page_rec_is_leaf(rec)) {
+			ut_ad(dict_index_get_n_unique_in_tree_nonleaf(index)
+			      == DICT_INDEX_SPATIAL_NODEPTR_SIZE);
+			/* For the R-tree, we have to compare
+			the child page numbers as well.
*/
+			cursor->old_n_fields
+				= DICT_INDEX_SPATIAL_NODEPTR_SIZE + 1;
+		}
+	}
+
+	cursor->old_n_core_fields = index->n_core_fields;
+	cursor->old_rec = rec_copy_prefix_to_buf(rec, index,
+						 cursor->old_n_fields,
+						 &cursor->old_rec_buf,
+						 &cursor->buf_size);
+	cursor->block_when_stored.store(block);
+
+	/* buf_block_get_modify_clock() checks that the block is
+	S- or X-latched. */
+	cursor->modify_clock = buf_block_get_modify_clock(block);
+}
+
+/**************************************************************//**
+Copies the stored position of a pcur to another pcur. */
+void
+btr_pcur_copy_stored_position(
+/*==========================*/
+	btr_pcur_t*	pcur_receive,	/*!< in: pcur which will receive the
+					position info */
+	btr_pcur_t*	pcur_donate)	/*!< in: pcur from which the info is
+					copied */
+{
+	ut_free(pcur_receive->old_rec_buf);
+	memcpy(pcur_receive, pcur_donate, sizeof(btr_pcur_t));
+
+	if (pcur_donate->old_rec_buf) {
+
+		pcur_receive->old_rec_buf = (byte*)
+			ut_malloc_nokey(pcur_donate->buf_size);
+
+		memcpy(pcur_receive->old_rec_buf, pcur_donate->old_rec_buf,
+		       pcur_donate->buf_size);
+		pcur_receive->old_rec = pcur_receive->old_rec_buf
+			+ (pcur_donate->old_rec - pcur_donate->old_rec_buf);
+	}
+
+	pcur_receive->old_n_core_fields = pcur_donate->old_n_core_fields;
+	pcur_receive->old_n_fields = pcur_donate->old_n_fields;
+}
+
+/** Optimistically latches the leaf page or pages requested.
+@param[in]	block		guessed buffer block
+@param[in,out]	pcur		cursor
+@param[in,out]	latch_mode	BTR_SEARCH_LEAF, ...
+@param[in,out]	mtr		mini-transaction
+@return true if success */
+TRANSACTIONAL_TARGET
+static bool btr_pcur_optimistic_latch_leaves(buf_block_t *block,
+                                             btr_pcur_t *pcur,
+                                             btr_latch_mode *latch_mode,
+                                             mtr_t *mtr)
+{
+  ut_ad(block->page.buf_fix_count());
+  ut_ad(block->page.in_file());
+  ut_ad(block->page.frame);
+
+  static_assert(BTR_SEARCH_PREV & BTR_SEARCH_LEAF, "");
+  static_assert(BTR_MODIFY_PREV & BTR_MODIFY_LEAF, "");
+  static_assert((BTR_SEARCH_PREV ^ BTR_MODIFY_PREV) ==
+                (RW_S_LATCH ^ RW_X_LATCH), "");
+
+  const rw_lock_type_t mode=
+    rw_lock_type_t(*latch_mode & (RW_X_LATCH | RW_S_LATCH));
+
+  switch (*latch_mode) {
+  default:
+    ut_ad(*latch_mode == BTR_SEARCH_LEAF || *latch_mode == BTR_MODIFY_LEAF);
+    return buf_page_optimistic_get(mode, block, pcur->modify_clock, mtr);
+  case BTR_SEARCH_PREV:
+  case BTR_MODIFY_PREV:
+    page_id_t id{0};
+    uint32_t left_page_no;
+    ulint zip_size;
+    buf_block_t *left_block= nullptr;
+    {
+      transactional_shared_lock_guard<block_lock> g{block->page.lock};
+      if (block->modify_clock != pcur->modify_clock)
+        return false;
+      id= block->page.id();
+      zip_size= block->zip_size();
+      left_page_no= btr_page_get_prev(block->page.frame);
+    }
+
+    if (left_page_no != FIL_NULL)
+    {
+      left_block=
+        buf_page_get_gen(page_id_t(id.space(), left_page_no), zip_size,
+                         mode, nullptr, BUF_GET_POSSIBLY_FREED, mtr);
+
+      if (left_block &&
+          btr_page_get_next(left_block->page.frame) != id.page_no())
+      {
+release_left_block:
+        mtr->release_last_page();
+        return false;
+      }
+    }
+
+    if (buf_page_optimistic_get(mode, block, pcur->modify_clock, mtr))
+    {
+      if (btr_page_get_prev(block->page.frame) == left_page_no)
+      {
+        /* The block was already buffer-fixed while entering the function
+        and buf_page_optimistic_get() buffer-fixes it again.
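+        Hence the assertion just below expects a buffer-fix count of
+        at least two.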
*/
+        ut_ad(2 <= block->page.buf_fix_count());
+        *latch_mode= btr_latch_mode(mode);
+        return true;
+      }
+
+      mtr->release_last_page();
+    }
+
+    ut_ad(block->page.buf_fix_count());
+    if (left_block)
+      goto release_left_block;
+    return false;
+  }
+}
+
+/** A structure that acts as a functor to do the latching of leaf pages.
+It returns true if the latching of the leaf pages succeeded and false
+otherwise. */
+struct optimistic_latch_leaves
+{
+  btr_pcur_t *const cursor;
+  btr_latch_mode *const latch_mode;
+  mtr_t *const mtr;
+
+  bool operator()(buf_block_t *hint) const
+  {
+    return hint &&
+      btr_pcur_optimistic_latch_leaves(hint, cursor, latch_mode, mtr);
+  }
+};
+
+/** Restores the stored position of a persistent cursor, buffer-fixing
+the page and obtaining the specified latches. If the cursor position
+was saved when the
+(1) cursor was positioned on a user record: this function restores the
+position to the last record LESS OR EQUAL to the stored record;
+(2) cursor was positioned on a page infimum record: restores the
+position to the last record LESS than the user record which was the
+successor of the page infimum;
+(3) cursor was positioned on the page supremum: restores to the first
+record GREATER than the user record which was the predecessor of the
+supremum.
+(4) cursor was positioned before the first or after the last in an
+empty tree: restores to before first or after the last in the tree.
+@param latch_mode BTR_SEARCH_LEAF, ...
+@param mtr mini-transaction
+@return btr_pcur_t::SAME_ALL cursor position is on a user rec and points
+to the record with the same field values as in the stored record,
+btr_pcur_t::SAME_UNIQ cursor position is on a user rec and points to the
+record with the same unique field values as in the stored record,
+btr_pcur_t::NOT_SAME cursor position is not on a user rec or points to
+the record with different unique field values from the stored record */
+btr_pcur_t::restore_status
+btr_pcur_t::restore_position(btr_latch_mode restore_latch_mode, mtr_t *mtr)
+{
+	dict_index_t*	index;
+	dtuple_t*	tuple;
+	page_cur_mode_t	mode;
+	page_cur_mode_t	old_mode;
+	mem_heap_t*	heap;
+
+	ut_ad(mtr->is_active());
+	ut_ad(pos_state == BTR_PCUR_WAS_POSITIONED
+	      || pos_state == BTR_PCUR_IS_POSITIONED);
+
+	index = btr_cur_get_index(&btr_cur);
+
+	if (UNIV_UNLIKELY
+	    (rel_pos == BTR_PCUR_AFTER_LAST_IN_TREE
+	     || rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE)) {
+		/* In these cases we do not try an optimistic restoration,
+		but always do a search */
+
+		if (btr_cur.open_leaf(rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE,
+				      index, restore_latch_mode, mtr)
+		    != DB_SUCCESS) {
+			return restore_status::CORRUPTED;
+		}
+
+		latch_mode =
+			BTR_LATCH_MODE_WITHOUT_INTENTION(restore_latch_mode);
+		pos_state = BTR_PCUR_IS_POSITIONED;
+		block_when_stored.clear();
+
+		return restore_status::NOT_SAME;
+	}
+
+	ut_a(old_rec);
+	ut_a(old_n_core_fields);
+	ut_a(old_n_core_fields <= index->n_core_fields);
+	ut_a(old_n_fields);
+
+	static_assert(BTR_SEARCH_PREV == (4 | BTR_SEARCH_LEAF), "");
+	static_assert(BTR_MODIFY_PREV == (4 | BTR_MODIFY_LEAF), "");
+
+	switch (restore_latch_mode | 4) {
+	case BTR_SEARCH_PREV:
+	case BTR_MODIFY_PREV:
+		/* Try optimistic restoration.
*/ + if (block_when_stored.run_with_hint( + optimistic_latch_leaves{this, &restore_latch_mode, + mtr})) { + pos_state = BTR_PCUR_IS_POSITIONED; + latch_mode = restore_latch_mode; + + if (rel_pos == BTR_PCUR_ON) { +#ifdef UNIV_DEBUG + const rec_t* rec; + rec_offs offsets1_[REC_OFFS_NORMAL_SIZE]; + rec_offs offsets2_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets1 = offsets1_; + rec_offs* offsets2 = offsets2_; + rec = btr_pcur_get_rec(this); + + rec_offs_init(offsets1_); + rec_offs_init(offsets2_); + + heap = mem_heap_create(256); + ut_ad(old_n_core_fields + == index->n_core_fields); + + offsets1 = rec_get_offsets( + old_rec, index, offsets1, + old_n_core_fields, + old_n_fields, &heap); + offsets2 = rec_get_offsets( + rec, index, offsets2, + index->n_core_fields, + old_n_fields, &heap); + + ut_ad(!cmp_rec_rec(old_rec, + rec, offsets1, offsets2, + index)); + mem_heap_free(heap); +#endif /* UNIV_DEBUG */ + return restore_status::SAME_ALL; + } + /* This is the same record as stored, + may need to be adjusted for BTR_PCUR_BEFORE/AFTER, + depending on search mode and direction. */ + if (btr_pcur_is_on_user_rec(this)) { + pos_state + = BTR_PCUR_IS_POSITIONED_OPTIMISTIC; + } + return restore_status::NOT_SAME; + } + } + + /* If optimistic restoration did not succeed, open the cursor anew */ + + heap = mem_heap_create(256); + + tuple = dtuple_create(heap, old_n_fields); + + dict_index_copy_types(tuple, index, old_n_fields); + + rec_copy_prefix_to_dtuple(tuple, old_rec, index, + old_n_core_fields, + old_n_fields, heap); + ut_ad(dtuple_check_typed(tuple)); + + /* Save the old search mode of the cursor */ + old_mode = search_mode; + + switch (rel_pos) { + case BTR_PCUR_ON: + mode = PAGE_CUR_LE; + break; + case BTR_PCUR_AFTER: + mode = PAGE_CUR_G; + break; + case BTR_PCUR_BEFORE: + mode = PAGE_CUR_L; + break; + default: + MY_ASSERT_UNREACHABLE(); + mode = PAGE_CUR_UNSUPP; + } + + if (btr_pcur_open_with_no_init(tuple, mode, restore_latch_mode, + this, mtr) != DB_SUCCESS) { + mem_heap_free(heap); + return restore_status::CORRUPTED; + } + + /* Restore the old search mode */ + search_mode = old_mode; + + ut_ad(rel_pos == BTR_PCUR_ON + || rel_pos == BTR_PCUR_BEFORE + || rel_pos == BTR_PCUR_AFTER); + rec_offs offsets[REC_OFFS_NORMAL_SIZE]; + rec_offs_init(offsets); + restore_status ret_val= restore_status::NOT_SAME; + if (rel_pos == BTR_PCUR_ON && btr_pcur_is_on_user_rec(this)) { + ulint n_matched_fields= 0; + if (!cmp_dtuple_rec_with_match( + tuple, btr_pcur_get_rec(this), index, + rec_get_offsets(btr_pcur_get_rec(this), index, offsets, + index->n_core_fields, ULINT_UNDEFINED, &heap), + &n_matched_fields)) { + + /* We have to store the NEW value for the modify clock, + since the cursor can now be on a different page! + But we can retain the value of old_rec */ + + block_when_stored.store(btr_pcur_get_block(this)); + modify_clock= buf_block_get_modify_clock( + block_when_stored.block()); + + mem_heap_free(heap); + + return restore_status::SAME_ALL; + } + if (n_matched_fields >= index->n_uniq) + ret_val= restore_status::SAME_UNIQ; + } + + mem_heap_free(heap); + + /* We have to store new position information, modify_clock etc., + to the cursor because it can now be on a different page, the record + under it may have been removed, etc. */ + + btr_pcur_store_position(this, mtr); + + return ret_val; +} + +/*********************************************************//** +Moves the persistent cursor to the first record on the next page. Releases the +latch on the current page, and bufferunfixes it. 
Note that there must not be +modifications on the current page, as then the x-latch can be released only in +mtr_commit. */ +dberr_t +btr_pcur_move_to_next_page( +/*=======================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor; must be on the + last record of the current page */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + ut_ad(btr_pcur_is_after_last_on_page(cursor)); + + cursor->old_rec = nullptr; + + const page_t* page = btr_pcur_get_page(cursor); + const uint32_t next_page_no = btr_page_get_next(page); + + switch (next_page_no) { + case 0: + case 1: + case FIL_NULL: + return DB_CORRUPTION; + } + + if (UNIV_UNLIKELY(next_page_no == btr_pcur_get_block(cursor) + ->page.id().page_no())) { + return DB_CORRUPTION; + } + + dberr_t err; + buf_block_t* next_block = btr_block_get( + *cursor->index(), next_page_no, + rw_lock_type_t(cursor->latch_mode & (RW_X_LATCH | RW_S_LATCH)), + page_is_leaf(page), mtr, &err); + + if (UNIV_UNLIKELY(!next_block)) { + return err; + } + + const page_t* next_page = buf_block_get_frame(next_block); + + if (UNIV_UNLIKELY(memcmp_aligned<4>(next_page + FIL_PAGE_PREV, + page + FIL_PAGE_OFFSET, 4))) { + return DB_CORRUPTION; + } + + page_cur_set_before_first(next_block, btr_pcur_get_page_cur(cursor)); + + ut_d(page_check_dir(next_page)); + + const auto s = mtr->get_savepoint(); + mtr->rollback_to_savepoint(s - 2, s - 1); + return DB_SUCCESS; +} + +MY_ATTRIBUTE((nonnull,warn_unused_result)) +/*********************************************************//** +Moves the persistent cursor backward if it is on the first record of the page. +Commits mtr. Note that to prevent a possible deadlock, the operation +first stores the position of the cursor, commits mtr, acquires the necessary +latches and restores the cursor position again before returning. The +alphabetical position of the cursor is guaranteed to be sensible on +return, but it may happen that the cursor is not positioned on the last +record of any page, because the structure of the tree may have changed +during the time when the cursor had no latches. */ +static +bool +btr_pcur_move_backward_from_page( +/*=============================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor, must be on the first + record of the current page */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(btr_pcur_is_before_first_on_page(cursor)); + ut_ad(!btr_pcur_is_before_first_in_tree(cursor)); + + const auto latch_mode = cursor->latch_mode; + ut_ad(latch_mode == BTR_SEARCH_LEAF || latch_mode == BTR_MODIFY_LEAF); + + btr_pcur_store_position(cursor, mtr); + + mtr_commit(mtr); + + mtr_start(mtr); + + static_assert(BTR_SEARCH_PREV == (4 | BTR_SEARCH_LEAF), ""); + static_assert(BTR_MODIFY_PREV == (4 | BTR_MODIFY_LEAF), ""); + + if (UNIV_UNLIKELY(cursor->restore_position( + btr_latch_mode(4 | latch_mode), mtr) + == btr_pcur_t::CORRUPTED)) { + return true; + } + + buf_block_t* block = btr_pcur_get_block(cursor); + + if (page_has_prev(block->page.frame)) { + buf_block_t* left_block + = mtr->at_savepoint(mtr->get_savepoint() - 1); + const page_t* const left = left_block->page.frame; + if (memcmp_aligned<4>(left + FIL_PAGE_NEXT, + block->page.frame + + FIL_PAGE_OFFSET, 4)) { + /* This should be the right sibling page, or + if there is none, the current block. */ + ut_ad(left_block == block + || !memcmp_aligned<4>(left + FIL_PAGE_PREV, + block->page.frame + + FIL_PAGE_OFFSET, 4)); + /* The previous one must be the left sibling. 
*/ + left_block + = mtr->at_savepoint(mtr->get_savepoint() - 2); + ut_ad(!memcmp_aligned<4>(left_block->page.frame + + FIL_PAGE_NEXT, + block->page.frame + + FIL_PAGE_OFFSET, 4)); + } + if (btr_pcur_is_before_first_on_page(cursor)) { + page_cur_set_after_last(left_block, + &cursor->btr_cur.page_cur); + /* Release the right sibling. */ + } else { + /* Release the left sibling. */ + block = left_block; + } + mtr->release(*block); + } + + cursor->latch_mode = latch_mode; + cursor->old_rec = nullptr; + return false; +} + +/*********************************************************//** +Moves the persistent cursor to the previous record in the tree. If no records +are left, the cursor stays 'before first in tree'. +@return TRUE if the cursor was not before first in tree */ +bool +btr_pcur_move_to_prev( +/*==================*/ + btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the + function may release the page latch */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + cursor->old_rec = nullptr; + + if (btr_pcur_is_before_first_on_page(cursor)) { + return (!btr_pcur_is_before_first_in_tree(cursor) + && !btr_pcur_move_backward_from_page(cursor, mtr)); + } + + return btr_pcur_move_to_prev_on_page(cursor) != nullptr; +} diff --git a/storage/innobase/btr/btr0sea.cc b/storage/innobase/btr/btr0sea.cc new file mode 100644 index 00000000..8435047c --- /dev/null +++ b/storage/innobase/btr/btr0sea.cc @@ -0,0 +1,2328 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file btr/btr0sea.cc +The index tree adaptive search + +Created 2/17/1996 Heikki Tuuri +*************************************************************************/ + +#include "btr0sea.h" +#ifdef BTR_CUR_HASH_ADAPT +#include "buf0buf.h" +#include "page0page.h" +#include "page0cur.h" +#include "btr0cur.h" +#include "btr0pcur.h" +#include "btr0btr.h" +#include "srv0mon.h" + +/** Is search system enabled. +Search system is protected by array of latches. */ +char btr_search_enabled; + +/** Number of adaptive hash index partition. 
*/ +ulong btr_ahi_parts; + +#ifdef UNIV_SEARCH_PERF_STAT +/** Number of successful adaptive hash index lookups */ +ulint btr_search_n_succ = 0; +/** Number of failed adaptive hash index lookups */ +ulint btr_search_n_hash_fail = 0; +#endif /* UNIV_SEARCH_PERF_STAT */ + +#ifdef UNIV_PFS_RWLOCK +mysql_pfs_key_t btr_search_latch_key; +#endif /* UNIV_PFS_RWLOCK */ + +/** The adaptive hash index */ +btr_search_sys_t btr_search_sys; + +/** If the number of records on the page divided by this parameter +would have been successfully accessed using a hash index, the index +is then built on the page, assuming the global limit has been reached */ +#define BTR_SEARCH_PAGE_BUILD_LIMIT 16U + +/** The global limit for consecutive potentially successful hash searches, +before hash index building is started */ +#define BTR_SEARCH_BUILD_LIMIT 100U + +/** Compute a hash value of a record in a page. +@param[in] rec index record +@param[in] offsets return value of rec_get_offsets() +@param[in] n_fields number of complete fields to fold +@param[in] n_bytes number of bytes to fold in the last field +@param[in] index_id index tree ID +@return the hash value */ +static inline +ulint +rec_fold( + const rec_t* rec, + const rec_offs* offsets, + ulint n_fields, + ulint n_bytes, + index_id_t tree_id) +{ + ulint i; + const byte* data; + ulint len; + ulint fold; + ulint n_fields_rec; + + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(rec_validate(rec, offsets)); + ut_ad(page_rec_is_leaf(rec)); + ut_ad(!page_rec_is_metadata(rec)); + ut_ad(n_fields > 0 || n_bytes > 0); + + n_fields_rec = rec_offs_n_fields(offsets); + ut_ad(n_fields <= n_fields_rec); + ut_ad(n_fields < n_fields_rec || n_bytes == 0); + + if (n_fields > n_fields_rec) { + n_fields = n_fields_rec; + } + + if (n_fields == n_fields_rec) { + n_bytes = 0; + } + + fold = ut_fold_ull(tree_id); + + for (i = 0; i < n_fields; i++) { + data = rec_get_nth_field(rec, offsets, i, &len); + + if (len != UNIV_SQL_NULL) { + fold = ut_fold_ulint_pair(fold, + ut_fold_binary(data, len)); + } + } + + if (n_bytes > 0) { + data = rec_get_nth_field(rec, offsets, i, &len); + + if (len != UNIV_SQL_NULL) { + if (len > n_bytes) { + len = n_bytes; + } + + fold = ut_fold_ulint_pair(fold, + ut_fold_binary(data, len)); + } + } + + return(fold); +} + +/** Determine the number of accessed key fields. +@param[in] n_fields number of complete fields +@param[in] n_bytes number of bytes in an incomplete last field +@return number of complete or incomplete fields */ +inline MY_ATTRIBUTE((warn_unused_result)) +ulint +btr_search_get_n_fields( + ulint n_fields, + ulint n_bytes) +{ + return(n_fields + (n_bytes > 0 ? 1 : 0)); +} + +/** Determine the number of accessed key fields. +@param[in] cursor b-tree cursor +@return number of complete or incomplete fields */ +inline MY_ATTRIBUTE((warn_unused_result)) +ulint +btr_search_get_n_fields( + const btr_cur_t* cursor) +{ + return(btr_search_get_n_fields(cursor->n_fields, cursor->n_bytes)); +} + +/** This function should be called before reserving any btr search mutex, if +the intended operation might add nodes to the search system hash table. +Because of the latching order, once we have reserved the btr search system +latch, we cannot allocate a free frame from the buffer pool. Checks that +there is a free buffer frame allocated for hash table heap in the btr search +system. If not, allocates a free frames for the heap. 
This check makes it +probable that, when have reserved the btr search system latch and we need to +allocate a new node to the hash table, it will succeed. However, the check +will not guarantee success. +@param[in] index index handler */ +static void btr_search_check_free_space_in_heap(const dict_index_t *index) +{ + /* Note that we peek the value of heap->free_block without reserving + the latch: this is ok, because we will not guarantee that there will + be enough free space in the hash table. */ + + buf_block_t *block= buf_block_alloc(); + auto part= btr_search_sys.get_part(*index); + + part->latch.wr_lock(SRW_LOCK_CALL); + + if (!btr_search_enabled || part->heap->free_block) + buf_block_free(block); + else + part->heap->free_block= block; + + part->latch.wr_unlock(); +} + +/** Set index->ref_count = 0 on all indexes of a table. +@param[in,out] table table handler */ +static void btr_search_disable_ref_count(dict_table_t *table) +{ + for (dict_index_t *index= dict_table_get_first_index(table); index; + index= dict_table_get_next_index(index)) + index->search_info->ref_count= 0; +} + +/** Lazily free detached metadata when removing the last reference. */ +ATTRIBUTE_COLD static void btr_search_lazy_free(dict_index_t *index) +{ + ut_ad(index->freed()); + dict_table_t *table= index->table; + table->autoinc_mutex.wr_lock(); + + /* Perform the skipped steps of dict_index_remove_from_cache_low(). */ + UT_LIST_REMOVE(table->freed_indexes, index); + index->lock.free(); + dict_mem_index_free(index); + + if (!UT_LIST_GET_LEN(table->freed_indexes) && + !UT_LIST_GET_LEN(table->indexes)) + { + ut_ad(!table->id); + table->autoinc_mutex.wr_unlock(); + table->autoinc_mutex.destroy(); + dict_mem_table_free(table); + return; + } + + table->autoinc_mutex.wr_unlock(); +} + +/** Disable the adaptive hash search system and empty the index. */ +void btr_search_disable() +{ + dict_table_t* table; + + dict_sys.freeze(SRW_LOCK_CALL); + + btr_search_x_lock_all(); + + if (!btr_search_enabled) { + dict_sys.unfreeze(); + btr_search_x_unlock_all(); + return; + } + + btr_search_enabled = false; + + /* Clear the index->search_info->ref_count of every index in + the data dictionary cache. */ + for (table = UT_LIST_GET_FIRST(dict_sys.table_LRU); table; + table = UT_LIST_GET_NEXT(table_LRU, table)) { + + btr_search_disable_ref_count(table); + } + + for (table = UT_LIST_GET_FIRST(dict_sys.table_non_LRU); table; + table = UT_LIST_GET_NEXT(table_LRU, table)) { + + btr_search_disable_ref_count(table); + } + + dict_sys.unfreeze(); + + /* Set all block->index = NULL. */ + buf_pool.clear_hash_index(); + + /* Clear the adaptive hash index. */ + btr_search_sys.clear(); + + btr_search_x_unlock_all(); +} + +/** Enable the adaptive hash search system. +@param resize whether buf_pool_t::resize() is the caller */ +void btr_search_enable(bool resize) +{ + if (!resize) { + mysql_mutex_lock(&buf_pool.mutex); + bool changed = srv_buf_pool_old_size != srv_buf_pool_size; + mysql_mutex_unlock(&buf_pool.mutex); + if (changed) { + return; + } + } + + btr_search_x_lock_all(); + ulint hash_size = buf_pool_get_curr_size() / sizeof(void *) / 64; + + if (btr_search_sys.parts[0].heap) { + ut_ad(btr_search_enabled); + btr_search_x_unlock_all(); + return; + } + + btr_search_sys.alloc(hash_size); + + btr_search_enabled = true; + btr_search_x_unlock_all(); +} + +/** Updates the search info of an index about hash successes. NOTE that info +is NOT protected by any semaphore, to save CPU time! Do not assume its fields +are consistent. 
+@param[in,out]	info	search info
+@param[in]	cursor	cursor which was just positioned */
+static void btr_search_info_update_hash(btr_search_t *info, btr_cur_t *cursor)
+{
+	dict_index_t*	index = cursor->index();
+	int		cmp;
+
+	if (dict_index_is_ibuf(index)) {
+		/* So many deletes are performed on an insert buffer tree
+		that we do not consider a hash index useful on it: */
+
+		return;
+	}
+
+	uint16_t n_unique = dict_index_get_n_unique_in_tree(index);
+
+	if (info->n_hash_potential == 0) {
+
+		goto set_new_recomm;
+	}
+
+	/* Test if the search would have succeeded using the recommended
+	hash prefix */
+
+	if (info->n_fields >= n_unique && cursor->up_match >= n_unique) {
+increment_potential:
+		info->n_hash_potential++;
+
+		return;
+	}
+
+	cmp = ut_pair_cmp(info->n_fields, info->n_bytes,
+			  cursor->low_match, cursor->low_bytes);
+
+	if (info->left_side ? cmp <= 0 : cmp > 0) {
+
+		goto set_new_recomm;
+	}
+
+	cmp = ut_pair_cmp(info->n_fields, info->n_bytes,
+			  cursor->up_match, cursor->up_bytes);
+
+	if (info->left_side ? cmp <= 0 : cmp > 0) {
+
+		goto increment_potential;
+	}
+
+set_new_recomm:
+	/* We have to set a new recommendation; skip the hash analysis
+	for a while to avoid unnecessary CPU time usage when there is no
+	chance for success */
+
+	info->hash_analysis = 0;
+
+	cmp = ut_pair_cmp(cursor->up_match, cursor->up_bytes,
+			  cursor->low_match, cursor->low_bytes);
+	info->left_side = cmp >= 0;
+	info->n_hash_potential = cmp != 0;
+
+	if (cmp == 0) {
+		/* For extra safety, we set some sensible values here */
+		info->n_fields = 1;
+		info->n_bytes = 0;
+	} else if (cmp > 0) {
+		info->n_hash_potential = 1;
+
+		if (cursor->up_match >= n_unique) {
+
+			info->n_fields = n_unique;
+			info->n_bytes = 0;
+
+		} else if (cursor->low_match < cursor->up_match) {
+
+			info->n_fields = static_cast<uint16_t>(
+				cursor->low_match + 1);
+			info->n_bytes = 0;
+		} else {
+			info->n_fields = static_cast<uint16_t>(
+				cursor->low_match);
+			info->n_bytes = static_cast<uint16_t>(
+				cursor->low_bytes + 1);
+		}
+	} else {
+		if (cursor->low_match >= n_unique) {
+
+			info->n_fields = n_unique;
+			info->n_bytes = 0;
+		} else if (cursor->low_match > cursor->up_match) {
+
+			info->n_fields = static_cast<uint16_t>(
+				cursor->up_match + 1);
+			info->n_bytes = 0;
+		} else {
+			info->n_fields = static_cast<uint16_t>(
+				cursor->up_match);
+			info->n_bytes = static_cast<uint16_t>(
+				cursor->up_bytes + 1);
+		}
+	}
+}
+
+/** Update the block search info on hash successes. NOTE that info and
+block->n_hash_helps, n_fields, n_bytes, left_side are NOT protected by any
+semaphore, to save CPU time! Do not assume the fields are consistent.
+@return TRUE if building a (new) hash index on the block is recommended +@param[in,out] info search info +@param[in,out] block buffer block */ +static +bool +btr_search_update_block_hash_info(btr_search_t* info, buf_block_t* block) +{ + ut_ad(block->page.lock.have_x() || block->page.lock.have_s()); + + info->last_hash_succ = FALSE; + ut_ad(block->page.frame); + ut_ad(info->magic_n == BTR_SEARCH_MAGIC_N); + + if ((block->n_hash_helps > 0) + && (info->n_hash_potential > 0) + && (block->n_fields == info->n_fields) + && (block->n_bytes == info->n_bytes) + && (block->left_side == info->left_side)) { + + if ((block->index) + && (block->curr_n_fields == info->n_fields) + && (block->curr_n_bytes == info->n_bytes) + && (block->curr_left_side == info->left_side)) { + + /* The search would presumably have succeeded using + the hash index */ + + info->last_hash_succ = TRUE; + } + + block->n_hash_helps++; + } else { + block->n_hash_helps = 1; + block->n_fields = info->n_fields; + block->n_bytes = info->n_bytes; + block->left_side = info->left_side; + } + + if ((block->n_hash_helps > page_get_n_recs(block->page.frame) + / BTR_SEARCH_PAGE_BUILD_LIMIT) + && (info->n_hash_potential >= BTR_SEARCH_BUILD_LIMIT)) { + + if ((!block->index) + || (block->n_hash_helps + > 2U * page_get_n_recs(block->page.frame)) + || (block->n_fields != block->curr_n_fields) + || (block->n_bytes != block->curr_n_bytes) + || (block->left_side != block->curr_left_side)) { + + /* Build a new hash index on the page */ + + return(true); + } + } + + return(false); +} + +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG +/** Maximum number of records in a page */ +constexpr ulint MAX_N_POINTERS = UNIV_PAGE_SIZE_MAX / REC_N_NEW_EXTRA_BYTES; +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + +__attribute__((nonnull)) +/** +Insert an entry into the hash table. If an entry with the same fold number +is found, its node is updated to point to the new data, and no new node +is inserted. 
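+Consequently a cell's chain contains at most one node per fold value that
+was inserted through this function; an update never grows the table.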
+@param table hash table
+@param heap memory heap
+@param fold folded value of the record
+@param block buffer block containing the record
+@param data the record
+@retval true on success
+@retval false if no more memory could be allocated */
+static bool ha_insert_for_fold(hash_table_t *table, mem_heap_t* heap,
+                               ulint fold,
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+                               buf_block_t *block, /*!< buffer block of data */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+                               const rec_t *data)
+{
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+  ut_a(block->page.frame == page_align(data));
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+  ut_ad(btr_search_enabled);
+
+  hash_cell_t *cell= &table->array[table->calc_hash(fold)];
+
+  for (ha_node_t *prev= static_cast<ha_node_t*>(cell->node); prev;
+       prev= prev->next)
+  {
+    if (prev->fold == fold)
+    {
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+      buf_block_t *prev_block= prev->block;
+      ut_a(prev_block->page.frame == page_align(prev->data));
+      ut_a(prev_block->n_pointers-- < MAX_N_POINTERS);
+      ut_a(block->n_pointers++ < MAX_N_POINTERS);
+
+      prev->block= block;
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+      prev->data= data;
+      return true;
+    }
+  }
+
+  /* We have to allocate a new chain node */
+  ha_node_t *node=
+    static_cast<ha_node_t*>(mem_heap_alloc(heap, sizeof *node));
+
+  if (!node)
+    return false;
+
+  ha_node_set_data(node, block, data);
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+  ut_a(block->n_pointers++ < MAX_N_POINTERS);
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+  node->fold= fold;
+  node->next= nullptr;
+
+  ha_node_t *prev= static_cast<ha_node_t*>(cell->node);
+  if (!prev)
+    cell->node= node;
+  else
+  {
+    while (prev->next)
+      prev= prev->next;
+    prev->next= node;
+  }
+  return true;
+}
+
+__attribute__((nonnull))
+/** Delete a record.
+@param table hash table
+@param heap memory heap
+@param del_node record to be deleted */
+static void ha_delete_hash_node(hash_table_t *table, mem_heap_t *heap,
+                                ha_node_t *del_node)
+{
+  ut_ad(btr_search_enabled);
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+  ut_a(del_node->block->page.frame == page_align(del_node->data));
+  ut_a(del_node->block->n_pointers-- < MAX_N_POINTERS);
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+  const ulint fold= del_node->fold;
+
+  HASH_DELETE(ha_node_t, next, table, fold, del_node);
+
+  ha_node_t *top=
+    static_cast<ha_node_t*>(mem_heap_get_top(heap, sizeof *top));
+
+  if (del_node != top)
+  {
+    /* Compact the heap of nodes by moving the top in the place of del_node. */
+    *del_node= *top;
+    hash_cell_t *cell= &table->array[table->calc_hash(top->fold)];
+
+    /* Look for the pointer to the top node, to update it */
+    if (cell->node == top)
+      /* The top node is the first in the chain */
+      cell->node= del_node;
+    else
+    {
+      /* We have to look for the predecessor */
+      ha_node_t *node= static_cast<ha_node_t*>(cell->node);
+
+      while (top != HASH_GET_NEXT(next, node))
+        node= static_cast<ha_node_t*>(HASH_GET_NEXT(next, node));
+
+      /* Now we have the predecessor node */
+      node->next= del_node;
+    }
+  }
+
+  /* Free the occupied space */
+  mem_heap_free_top(heap, sizeof *top);
+}
+
+__attribute__((nonnull))
+/** Delete all pointers to a page.
+__attribute__((nonnull))
+/** Delete all pointers to a page.
+@param table hash table
+@param heap memory heap
+@param fold fold value
+@param page page whose hash nodes are to be deleted */
+static void ha_remove_all_nodes_to_page(hash_table_t *table, mem_heap_t *heap,
+ ulint fold, const page_t *page)
+{
+ for (ha_node_t *node= ha_chain_get_first(table, fold); node; )
+ {
+ if (page_align(ha_node_get_data(node)) == page)
+ {
+ ha_delete_hash_node(table, heap, node);
+ /* The deletion may compact the heap of nodes and move other nodes! */
+ node= ha_chain_get_first(table, fold);
+ }
+ else
+ node= ha_chain_get_next(node);
+ }
+#ifdef UNIV_DEBUG
+ /* Check that all nodes really got deleted */
+ for (ha_node_t *node= ha_chain_get_first(table, fold); node;
+ node= ha_chain_get_next(node))
+ ut_ad(page_align(ha_node_get_data(node)) != page);
+#endif /* UNIV_DEBUG */
+}
+
+/** Delete a record if found.
+@param table hash table
+@param heap memory heap for the hash bucket chain
+@param fold folded value of the searched data
+@param data pointer to the record
+@return whether the record was found */
+static bool ha_search_and_delete_if_found(hash_table_t *table,
+ mem_heap_t *heap,
+ ulint fold, const rec_t *data)
+{
+ if (ha_node_t *node= ha_search_with_data(table, fold, data))
+ {
+ ha_delete_hash_node(table, heap, node);
+ return true;
+ }
+
+ return false;
+}
+
+__attribute__((nonnull))
+/** Looks for an element when we know the pointer to the data and
+updates the pointer to data if found.
+@param table hash table
+@param fold folded value of the searched data
+@param data pointer to the data
+@param new_data new pointer to the data
+@return whether the element was found */
+static bool ha_search_and_update_if_found(hash_table_t *table, ulint fold,
+ const rec_t *data,
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ /** block containing new_data */
+ buf_block_t *new_block,
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ const rec_t *new_data)
+{
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ ut_a(new_block->page.frame == page_align(new_data));
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+ if (!btr_search_enabled)
+ return false;
+
+ if (ha_node_t *node= ha_search_with_data(table, fold, data))
+ {
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ ut_a(node->block->n_pointers-- < MAX_N_POINTERS);
+ ut_a(new_block->n_pointers++ < MAX_N_POINTERS);
+ node->block= new_block;
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ node->data= new_data;
+
+ return true;
+ }
+
+ return false;
+}
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+#else
+# define ha_insert_for_fold(t,h,f,b,d) ha_insert_for_fold(t,h,f,d)
+# define ha_search_and_update_if_found(table,fold,data,new_block,new_data) \
+ ha_search_and_update_if_found(table,fold,data,new_data)
+#endif
+
+/** Updates a hash node reference when it has been unsuccessfully used in a
+search which could have succeeded with the used hash parameters. This can
+happen because when building a hash index for a page, we do not check
+what happens at page boundaries, and therefore there can be misleading
+hash nodes. Also, collisions in the fold value can lead to misleading
+references. This function lazily fixes these imperfections in the hash
+index.
+@param[in] info search info +@param[in] block buffer block where cursor positioned +@param[in] cursor cursor */ +static +void +btr_search_update_hash_ref( + const btr_search_t* info, + buf_block_t* block, + const btr_cur_t* cursor) +{ + ut_ad(cursor->flag == BTR_CUR_HASH_FAIL); + + ut_ad(block->page.lock.have_x() || block->page.lock.have_s()); + ut_ad(page_align(btr_cur_get_rec(cursor)) == block->page.frame); + ut_ad(page_is_leaf(block->page.frame)); + assert_block_ahi_valid(block); + + dict_index_t* index = block->index; + + if (!index || !info->n_hash_potential) { + return; + } + + if (index != cursor->index()) { + ut_ad(index->id == cursor->index()->id); + btr_search_drop_page_hash_index(block, false); + return; + } + + ut_ad(block->page.id().space() == index->table->space_id); + ut_ad(index == cursor->index()); + ut_ad(!dict_index_is_ibuf(index)); + auto part = btr_search_sys.get_part(*index); + part->latch.wr_lock(SRW_LOCK_CALL); + ut_ad(!block->index || block->index == index); + + if (block->index + && (block->curr_n_fields == info->n_fields) + && (block->curr_n_bytes == info->n_bytes) + && (block->curr_left_side == info->left_side) + && btr_search_enabled) { + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs_init(offsets_); + + const rec_t* rec = btr_cur_get_rec(cursor); + + if (!page_rec_is_user_rec(rec)) { + goto func_exit; + } + + ulint fold = rec_fold( + rec, + rec_get_offsets(rec, index, offsets_, + index->n_core_fields, + ULINT_UNDEFINED, &heap), + block->curr_n_fields, + block->curr_n_bytes, index->id); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + ha_insert_for_fold(&part->table, part->heap, fold, block, rec); + + MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED); + } + +func_exit: + part->latch.wr_unlock(); +} + +/** Checks if a guessed position for a tree cursor is right. Note that if +mode is PAGE_CUR_LE, which is used in inserts, and the function returns +TRUE, then cursor->up_match and cursor->low_match both have sensible values. +@param[in,out] cursor guess cursor position +@param[in] can_only_compare_to_cursor_rec + if we do not have a latch on the page of cursor, + but a latch corresponding search system, then + ONLY the columns of the record UNDER the cursor + are protected, not the next or previous record + in the chain: we cannot look at the next or + previous record to check our guess! 
+@param[in] tuple data tuple +@param[in] mode PAGE_CUR_L, PAGE_CUR_LE, PAGE_CUR_G, PAGE_CUR_GE +@return whether a match was found */ +static +bool +btr_search_check_guess( + btr_cur_t* cursor, + bool can_only_compare_to_cursor_rec, + const dtuple_t* tuple, + ulint mode) +{ + rec_t* rec; + ulint n_unique; + ulint match; + int cmp; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + bool success = false; + rec_offs_init(offsets_); + + n_unique = dict_index_get_n_unique_in_tree(cursor->index()); + + rec = btr_cur_get_rec(cursor); + + if (UNIV_UNLIKELY(!page_rec_is_user_rec(rec) + || !page_rec_is_leaf(rec))) { + ut_ad("corrupted index" == 0); + return false; + } else if (cursor->index()->table->not_redundant()) { + switch (rec_get_status(rec)) { + case REC_STATUS_INSTANT: + case REC_STATUS_ORDINARY: + break; + default: + ut_ad("corrupted index" == 0); + return false; + } + } + + match = 0; + + offsets = rec_get_offsets(rec, cursor->index(), offsets, + cursor->index()->n_core_fields, + n_unique, &heap); + cmp = cmp_dtuple_rec_with_match(tuple, rec, cursor->index(), offsets, + &match); + + if (mode == PAGE_CUR_GE) { + if (cmp > 0) { + goto exit_func; + } + + cursor->up_match = match; + + if (match >= n_unique) { + success = true; + goto exit_func; + } + } else if (mode == PAGE_CUR_LE) { + if (cmp < 0) { + goto exit_func; + } + + cursor->low_match = match; + + } else if (mode == PAGE_CUR_G) { + if (cmp >= 0) { + goto exit_func; + } + } else if (mode == PAGE_CUR_L) { + if (cmp <= 0) { + goto exit_func; + } + } + + if (can_only_compare_to_cursor_rec) { + /* Since we could not determine if our guess is right just by + looking at the record under the cursor, return FALSE */ + goto exit_func; + } + + match = 0; + + if ((mode == PAGE_CUR_G) || (mode == PAGE_CUR_GE)) { + const rec_t* prev_rec = page_rec_get_prev(rec); + + if (UNIV_UNLIKELY(!prev_rec)) { + ut_ad("corrupted index" == 0); + goto exit_func; + } + + if (page_rec_is_infimum(prev_rec)) { + success = !page_has_prev(page_align(prev_rec)); + goto exit_func; + } + + if (cursor->index()->table->not_redundant()) { + switch (rec_get_status(prev_rec)) { + case REC_STATUS_INSTANT: + case REC_STATUS_ORDINARY: + break; + default: + ut_ad("corrupted index" == 0); + goto exit_func; + } + } + + offsets = rec_get_offsets(prev_rec, cursor->index(), offsets, + cursor->index()->n_core_fields, + n_unique, &heap); + cmp = cmp_dtuple_rec_with_match(tuple, prev_rec, + cursor->index(), offsets, + &match); + if (mode == PAGE_CUR_GE) { + success = cmp > 0; + } else { + success = cmp >= 0; + } + } else { + ut_ad(!page_rec_is_supremum(rec)); + + const rec_t* next_rec = page_rec_get_next(rec); + + if (UNIV_UNLIKELY(!next_rec)) { + ut_ad("corrupted index" == 0); + goto exit_func; + } + + if (page_rec_is_supremum(next_rec)) { + if (!page_has_next(page_align(next_rec))) { + cursor->up_match = 0; + success = true; + } + + goto exit_func; + } + + if (cursor->index()->table->not_redundant()) { + switch (rec_get_status(next_rec)) { + case REC_STATUS_INSTANT: + case REC_STATUS_ORDINARY: + break; + default: + ut_ad("corrupted index" == 0); + goto exit_func; + } + } + + offsets = rec_get_offsets(next_rec, cursor->index(), offsets, + cursor->index()->n_core_fields, + n_unique, &heap); + cmp = cmp_dtuple_rec_with_match( + tuple, next_rec, cursor->index(), offsets, &match); + if (mode == PAGE_CUR_LE) { + success = cmp < 0; + cursor->up_match = match; + } else { + success = cmp <= 0; + } + } +exit_func: + if (UNIV_LIKELY_NULL(heap)) 
{
+ mem_heap_free(heap);
+ }
+ return(success);
+}
+
+/** Note a failed adaptive hash index lookup in the search info.
+@param[in,out] info search info
+@param[in,out] cursor cursor for which the lookup failed */
+static
+void
+btr_search_failure(btr_search_t* info, btr_cur_t* cursor)
+{
+ cursor->flag = BTR_CUR_HASH_FAIL;
+
+#ifdef UNIV_SEARCH_PERF_STAT
+ ++info->n_hash_fail;
+
+ if (info->n_hash_succ > 0) {
+ --info->n_hash_succ;
+ }
+#endif /* UNIV_SEARCH_PERF_STAT */
+
+ info->last_hash_succ = FALSE;
+}
+
+/** Clear the adaptive hash index on all pages in the buffer pool. */
+inline void buf_pool_t::clear_hash_index()
+{
+ ut_ad(!resizing);
+ ut_ad(!btr_search_enabled);
+
+ std::set<dict_index_t*> garbage;
+
+ for (chunk_t *chunk= chunks + n_chunks; chunk-- != chunks; )
+ {
+ for (buf_block_t *block= chunk->blocks, * const end= block + chunk->size;
+ block != end; block++)
+ {
+ dict_index_t *index= block->index;
+ assert_block_ahi_valid(block);
+
+ /* We can clear block->index and block->n_pointers when
+ holding all AHI latches exclusively; see the comments in buf0buf.h */
+
+ if (!index)
+ {
+# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ ut_a(!block->n_pointers);
+# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ continue;
+ }
+
+ ut_d(const auto s= block->page.state());
+ /* Another thread may have set the state to
+ REMOVE_HASH in buf_LRU_block_remove_hashed().
+
+ The state change in buf_pool_t::realloc() is not observable
+ here, because in that case we would have !block->index.
+
+ In the end, the entire adaptive hash index will be removed. */
+ ut_ad(s >= buf_page_t::UNFIXED || s == buf_page_t::REMOVE_HASH);
+# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ block->n_pointers= 0;
+# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ if (index->freed())
+ garbage.insert(index);
+ block->index= nullptr;
+ }
+ }
+
+ for (dict_index_t *index : garbage)
+ btr_search_lazy_free(index);
+}
+
+/** Get a buffer block from an adaptive hash index pointer.
+This function does not return if the block is not identified.
+@param ptr pointer to within a page frame
+@return pointer to block, never NULL */
+inline buf_block_t* buf_pool_t::block_from_ahi(const byte *ptr) const
+{
+ chunk_t::map *chunk_map = chunk_t::map_ref;
+ ut_ad(chunk_t::map_ref == chunk_t::map_reg);
+ ut_ad(!resizing);
+
+ chunk_t::map::const_iterator it= chunk_map->upper_bound(ptr);
+ ut_a(it != chunk_map->begin());
+
+ chunk_t *chunk= it == chunk_map->end()
+ ? chunk_map->rbegin()->second
+ : (--it)->second;
+
+ const size_t offs= size_t(ptr - chunk->blocks->page.frame) >>
+ srv_page_size_shift;
+ ut_a(offs < chunk->size);
+
+ buf_block_t *block= &chunk->blocks[offs];
+ /* buf_pool_t::chunk_t::init() invokes buf_block_init() so that
+ block[n].frame == block->page.frame + n * srv_page_size. Check it. */
+ ut_ad(block->page.frame == page_align(ptr));
+ /* Read the state of the block without holding hash_lock.
+ A state transition to REMOVE_HASH is possible during
+ this execution. */
+ ut_ad(block->page.state() >= buf_page_t::REMOVE_HASH);
+
+ return block;
+}
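+
+/* Illustration, not from the upstream source: the chunk lookup in
+block_from_ahi() above can be modelled with a std::map keyed by each
+chunk's base address. upper_bound() finds the first chunk starting after
+the pointer, so the entry before it is the only chunk that can contain the
+pointer. Chunk and the 4096-byte page size are hypothetical
+simplifications; assumes <map>, <cassert> and <cstddef>.
+
+  struct Chunk { const char *base; size_t n_pages; };
+
+  Chunk *chunk_of(const std::map<const char*, Chunk*> &m, const char *ptr)
+  {
+    auto it= m.upper_bound(ptr); // first chunk whose base is > ptr
+    assert(it != m.begin());     // otherwise ptr precedes every chunk
+    --it;                        // last chunk whose base is <= ptr
+    assert(size_t(ptr - it->second->base) / 4096 < it->second->n_pages);
+    return it->second;
+  }
+*/
+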
+/** Tries to guess the right search position based on the hash search info
+of the index. Note that if mode is PAGE_CUR_LE, which is used in inserts,
+and the function returns TRUE, then cursor->up_match and cursor->low_match
+both have sensible values.
+@param[in,out] index index
+@param[in,out] info index search info
+@param[in] tuple logical record
+@param[in] mode PAGE_CUR_L, ....
+@param[in] latch_mode BTR_SEARCH_LEAF, ...
+@param[out] cursor tree cursor
+@param[in] mtr mini-transaction
+@return whether the search succeeded */
+TRANSACTIONAL_TARGET
+bool
+btr_search_guess_on_hash(
+ dict_index_t* index,
+ btr_search_t* info,
+ const dtuple_t* tuple,
+ ulint mode,
+ ulint latch_mode,
+ btr_cur_t* cursor,
+ mtr_t* mtr)
+{
+ ulint fold;
+ index_id_t index_id;
+
+ ut_ad(mtr->is_active());
+ ut_ad(index->is_btree() || index->is_ibuf());
+
+ /* Note that, for efficiency, the struct info may not be protected by
+ any latch here! */
+
+ if (latch_mode > BTR_MODIFY_LEAF
+ || !info->last_hash_succ || !info->n_hash_potential
+ || (tuple->info_bits & REC_INFO_MIN_REC_FLAG)) {
+ return false;
+ }
+
+ ut_ad(index->is_btree());
+ ut_ad(!index->table->is_temporary());
+
+ ut_ad(latch_mode == BTR_SEARCH_LEAF || latch_mode == BTR_MODIFY_LEAF);
+ compile_time_assert(ulint{BTR_SEARCH_LEAF} == ulint{RW_S_LATCH});
+ compile_time_assert(ulint{BTR_MODIFY_LEAF} == ulint{RW_X_LATCH});
+
+ cursor->n_fields = info->n_fields;
+ cursor->n_bytes = info->n_bytes;
+
+ if (dtuple_get_n_fields(tuple) < btr_search_get_n_fields(cursor)) {
+ return false;
+ }
+
+ index_id = index->id;
+
+#ifdef UNIV_SEARCH_PERF_STAT
+ info->n_hash_succ++;
+#endif
+ fold = dtuple_fold(tuple, cursor->n_fields, cursor->n_bytes, index_id);
+
+ cursor->fold = fold;
+ cursor->flag = BTR_CUR_HASH;
+
+ auto part = btr_search_sys.get_part(*index);
+ const rec_t* rec;
+
+ part->latch.rd_lock(SRW_LOCK_CALL);
+
+ if (!btr_search_enabled) {
+ goto ahi_release_and_fail;
+ }
+
+ rec = static_cast<const rec_t*>(
+ ha_search_and_get_data(&part->table, fold));
+
+ if (!rec) {
+ahi_release_and_fail:
+ part->latch.rd_unlock();
+fail:
+ btr_search_failure(info, cursor);
+ return false;
+ }
+
+ buf_block_t* block = buf_pool.block_from_ahi(rec);
+
+ buf_pool_t::hash_chain& chain = buf_pool.page_hash.cell_get(
+ block->page.id().fold());
+ bool got_latch;
+ {
+ transactional_shared_lock_guard<page_hash_latch> g{
+ buf_pool.page_hash.lock_get(chain)};
+ got_latch = (latch_mode == BTR_SEARCH_LEAF)
+ ? block->page.lock.s_lock_try()
+ : block->page.lock.x_lock_try();
+ }
+
+ if (!got_latch) {
+ goto ahi_release_and_fail;
+ }
+
+ const auto state = block->page.state();
+ if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED)) {
+ ut_ad(state == buf_page_t::REMOVE_HASH);
+block_and_ahi_release_and_fail:
+ if (latch_mode == BTR_SEARCH_LEAF) {
+ block->page.lock.s_unlock();
+ } else {
+ block->page.lock.x_unlock();
+ }
+ goto ahi_release_and_fail;
+ }
+
+ ut_ad(state < buf_page_t::READ_FIX || state >= buf_page_t::WRITE_FIX);
+ ut_ad(state < buf_page_t::READ_FIX || latch_mode == BTR_SEARCH_LEAF);
+
+ if (index != block->index && index_id == block->index->id) {
+ ut_a(block->index->freed());
+ goto block_and_ahi_release_and_fail;
+ }
+
+ block->page.fix();
+ block->page.set_accessed();
+ buf_page_make_young_if_needed(&block->page);
+ static_assert(ulint{MTR_MEMO_PAGE_S_FIX} == ulint{BTR_SEARCH_LEAF},
+ "");
+ static_assert(ulint{MTR_MEMO_PAGE_X_FIX} == ulint{BTR_MODIFY_LEAF},
+ "");
+
+ part->latch.rd_unlock();
+
+ ++buf_pool.stat.n_page_gets;
+
+ mtr->memo_push(block, mtr_memo_type_t(latch_mode));
+
+ ut_ad(page_rec_is_user_rec(rec));
+
+ btr_cur_position(index, (rec_t*) rec, block, cursor);
+
+ /* Check the validity of the guess within the page */
+
+ /* If we only have the latch on search system, not on the
+ page, it only protects the columns of the record the cursor
+ is positioned on. We cannot look at the next or the previous
+ record to determine if our guess for the cursor position is
+ right.
*/ + if (index_id != btr_page_get_index_id(block->page.frame) + || !btr_search_check_guess(cursor, false, tuple, mode)) { + mtr->release_last_page(); + goto fail; + } + + if (info->n_hash_potential < BTR_SEARCH_BUILD_LIMIT + 5) { + + info->n_hash_potential++; + } + + info->last_hash_succ = TRUE; + +#ifdef UNIV_SEARCH_PERF_STAT + btr_search_n_succ++; +#endif + return true; +} + +/** Drop any adaptive hash index entries that point to an index page. +@param[in,out] block block containing index page, s- or x-latched, or an + index page for which we know that + block->buf_fix_count == 0 or it is an index page which + has already been removed from the buf_pool.page_hash + i.e.: it is in state BUF_BLOCK_REMOVE_HASH +@param[in] garbage_collect drop ahi only if the index is marked + as freed */ +void btr_search_drop_page_hash_index(buf_block_t* block, + bool garbage_collect) +{ + ulint n_fields; + ulint n_bytes; + const rec_t* rec; + mem_heap_t* heap; + rec_offs* offsets; + +retry: + if (!block->index) { + return; + } + + ut_d(const auto state = block->page.state()); + ut_ad(state == buf_page_t::REMOVE_HASH + || state >= buf_page_t::UNFIXED); + ut_ad(state == buf_page_t::REMOVE_HASH + || !(~buf_page_t::LRU_MASK & state) + || block->page.lock.have_any()); + ut_ad(state < buf_page_t::READ_FIX || state >= buf_page_t::WRITE_FIX); + ut_ad(page_is_leaf(block->page.frame)); + + /* We must not dereference block->index here, because it could be freed + if (!index->table->get_ref_count() && !dict_sys.frozen()). + Determine the ahi_slot based on the block contents. */ + + const index_id_t index_id + = btr_page_get_index_id(block->page.frame); + + auto part = btr_search_sys.get_part(index_id, + block->page.id().space()); + + part->latch.rd_lock(SRW_LOCK_CALL); + + dict_index_t* index = block->index; + bool is_freed = index && index->freed(); + + if (is_freed) { + part->latch.rd_unlock(); + part->latch.wr_lock(SRW_LOCK_CALL); + if (index != block->index) { + part->latch.wr_unlock(); + goto retry; + } + } else if (garbage_collect) { + part->latch.rd_unlock(); + return; + } + + assert_block_ahi_valid(block); + + if (!index || !btr_search_enabled) { + if (is_freed) { + part->latch.wr_unlock(); + } else { + part->latch.rd_unlock(); + } + return; + } + + ut_ad(!index->table->is_temporary()); + ut_ad(btr_search_enabled); + + ut_ad(block->page.id().space() == index->table->space_id); + ut_a(index_id == index->id); + ut_ad(!dict_index_is_ibuf(index)); + + n_fields = block->curr_n_fields; + n_bytes = block->curr_n_bytes; + + /* NOTE: The AHI fields of block must not be accessed after + releasing search latch, as the index page might only be s-latched! */ + + if (!is_freed) { + part->latch.rd_unlock(); + } + + ut_a(n_fields > 0 || n_bytes > 0); + + const page_t* const page = block->page.frame; + ulint n_recs = page_get_n_recs(page); + if (!n_recs) { + ut_ad("corrupted adaptive hash index" == 0); + return; + } + + /* Calculate and cache fold values into an array for fast deletion + from the hash index */ + + rec = page_get_infimum_rec(page); + rec = page_rec_get_next_low(rec, page_is_comp(page)); + + ulint* folds; + ulint n_cached = 0; + ulint prev_fold = 0; + + if (rec && rec_is_metadata(rec, *index)) { + rec = page_rec_get_next_low(rec, page_is_comp(page)); + if (!--n_recs) { + /* The page only contains the hidden metadata record + for instant ALTER TABLE that the adaptive hash index + never points to. 
*/ + folds = nullptr; + goto all_deleted; + } + } + + folds = (ulint*) ut_malloc_nokey(n_recs * sizeof(ulint)); + heap = nullptr; + offsets = nullptr; + + while (rec) { + if (n_cached >= n_recs) { + ut_ad(page_rec_is_supremum(rec)); + break; + } + ut_ad(page_rec_is_user_rec(rec)); + offsets = rec_get_offsets( + rec, index, offsets, index->n_core_fields, + btr_search_get_n_fields(n_fields, n_bytes), + &heap); + const ulint fold = rec_fold(rec, offsets, n_fields, n_bytes, + index_id); + + if (fold == prev_fold && prev_fold != 0) { + + goto next_rec; + } + + /* Remove all hash nodes pointing to this page from the + hash chain */ + folds[n_cached++] = fold; + +next_rec: + rec = page_rec_get_next_low(rec, page_rec_is_comp(rec)); + if (!rec || page_rec_is_supremum(rec)) { + break; + } + prev_fold = fold; + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + +all_deleted: + if (!is_freed) { + part->latch.wr_lock(SRW_LOCK_CALL); + + if (UNIV_UNLIKELY(!block->index)) { + /* Someone else has meanwhile dropped the + hash index */ + goto cleanup; + } + + ut_a(block->index == index); + } + + if (block->curr_n_fields != n_fields + || block->curr_n_bytes != n_bytes) { + + /* Someone else has meanwhile built a new hash index on the + page, with different parameters */ + + part->latch.wr_unlock(); + + ut_free(folds); + goto retry; + } + + for (ulint i = 0; i < n_cached; i++) { + ha_remove_all_nodes_to_page(&part->table, part->heap, + folds[i], page); + } + + switch (index->search_info->ref_count--) { + case 0: + ut_error; + case 1: + if (index->freed()) { + btr_search_lazy_free(index); + } + } + + block->index = nullptr; + + MONITOR_INC(MONITOR_ADAPTIVE_HASH_PAGE_REMOVED); + MONITOR_INC_VALUE(MONITOR_ADAPTIVE_HASH_ROW_REMOVED, n_cached); + +cleanup: + assert_block_ahi_valid(block); + part->latch.wr_unlock(); + + ut_free(folds); +} + +/** Drop possible adaptive hash index entries when a page is evicted +from the buffer pool or freed in a file, or the index is being dropped. +@param[in] page_id page id */ +void btr_search_drop_page_hash_when_freed(const page_id_t page_id) +{ + buf_block_t* block; + mtr_t mtr; + + mtr_start(&mtr); + + /* If the caller has a latch on the page, then the caller must + have a x-latch on the page and it must have already dropped + the hash index for the page. Because of the x-latch that we + are possibly holding, we cannot s-latch the page, but must + (recursively) x-latch it, even though we are only reading. */ + + block = buf_page_get_gen(page_id, 0, RW_X_LATCH, NULL, + BUF_PEEK_IF_IN_POOL, &mtr); + + if (block && block->index) { + /* In all our callers, the table handle should + be open, or we should be in the process of + dropping the table (preventing eviction). */ + DBUG_ASSERT(block->index->table->get_ref_count() + || dict_sys.locked()); + btr_search_drop_page_hash_index(block, false); + } + + mtr_commit(&mtr); +} + +/** Build a hash index on a page with the given parameters. If the page already +has a hash index with different parameters, the old hash index is removed. +If index is non-NULL, this function checks if n_fields and n_bytes are +sensible, and does not build a hash index if not. +@param[in,out] index index for which to build. +@param[in,out] block index page, s-/x- latched. 
+@param[in,out] ahi_latch the adaptive search latch
+@param[in] n_fields hash this many full fields
+@param[in] n_bytes hash this many bytes of the next field
+@param[in] left_side hash for searches from left side */
+static
+void
+btr_search_build_page_hash_index(
+ dict_index_t* index,
+ buf_block_t* block,
+ srw_spin_lock* ahi_latch,
+ uint16_t n_fields,
+ uint16_t n_bytes,
+ bool left_side)
+{
+ const rec_t* rec;
+ ulint fold;
+ ulint next_fold;
+ ulint n_cached;
+ ulint n_recs;
+ ulint* folds;
+ const rec_t** recs;
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+
+ ut_ad(!index->table->is_temporary());
+
+ if (!btr_search_enabled) {
+ return;
+ }
+
+ rec_offs_init(offsets_);
+ ut_ad(ahi_latch == &btr_search_sys.get_part(*index)->latch);
+ ut_ad(index);
+ ut_ad(block->page.id().space() == index->table->space_id);
+ ut_ad(!dict_index_is_ibuf(index));
+ ut_ad(page_is_leaf(block->page.frame));
+
+ ut_ad(block->page.lock.have_x() || block->page.lock.have_s());
+ ut_ad(block->page.id().page_no() >= 3);
+
+ ahi_latch->rd_lock(SRW_LOCK_CALL);
+
+ const bool enabled = btr_search_enabled;
+ const bool rebuild = enabled && block->index
+ && (block->curr_n_fields != n_fields
+ || block->curr_n_bytes != n_bytes
+ || block->curr_left_side != left_side);
+
+ ahi_latch->rd_unlock();
+
+ if (!enabled) {
+ return;
+ }
+
+ if (rebuild) {
+ btr_search_drop_page_hash_index(block, false);
+ }
+
+ /* Check that the values for hash index build are sensible */
+
+ if (n_fields == 0 && n_bytes == 0) {
+
+ return;
+ }
+
+ if (dict_index_get_n_unique_in_tree(index)
+ < btr_search_get_n_fields(n_fields, n_bytes)) {
+ return;
+ }
+
+ page_t* page = buf_block_get_frame(block);
+ n_recs = page_get_n_recs(page);
+
+ if (n_recs == 0) {
+
+ return;
+ }
+
+ rec = page_rec_get_next_const(page_get_infimum_rec(page));
+ if (!rec) return;
+
+ if (rec_is_metadata(rec, *index)) {
+ rec = page_rec_get_next_const(rec);
+ if (!rec || !--n_recs) return;
+ }
+
+ /* Calculate and cache fold values and corresponding records into
+ an array for fast insertion to the hash index */
+
+ folds = static_cast<ulint*>(ut_malloc_nokey(n_recs * sizeof *folds));
+ recs = static_cast<const rec_t**>(
+ ut_malloc_nokey(n_recs * sizeof *recs));
+
+ n_cached = 0;
+
+ ut_a(index->id == btr_page_get_index_id(page));
+
+ offsets = rec_get_offsets(
+ rec, index, offsets, index->n_core_fields,
+ btr_search_get_n_fields(n_fields, n_bytes),
+ &heap);
+ ut_ad(page_rec_is_supremum(rec)
+ || n_fields == rec_offs_n_fields(offsets) - (n_bytes > 0));
+
+ fold = rec_fold(rec, offsets, n_fields, n_bytes, index->id);
+
+ if (left_side) {
+
+ folds[n_cached] = fold;
+ recs[n_cached] = rec;
+ n_cached++;
+ }
+
+ while (const rec_t* next_rec = page_rec_get_next_const(rec)) {
+ if (page_rec_is_supremum(next_rec)) {
+
+ if (!left_side) {
+
+ folds[n_cached] = fold;
+ recs[n_cached] = rec;
+ n_cached++;
+ }
+
+ break;
+ }
+
+ offsets = rec_get_offsets(
+ next_rec, index, offsets, index->n_core_fields,
+ btr_search_get_n_fields(n_fields, n_bytes), &heap);
+ next_fold = rec_fold(next_rec, offsets, n_fields,
+ n_bytes, index->id);
+
+ if (fold != next_fold) {
+ /* Insert an entry into the hash index */
+
+ if (left_side) {
+
+ folds[n_cached] = next_fold;
+ recs[n_cached] = next_rec;
+ n_cached++;
+ } else {
+ folds[n_cached] = fold;
+ recs[n_cached] = rec;
+ n_cached++;
+ }
+ }
+
+ rec = next_rec;
+ fold = next_fold;
+ }
+
+ btr_search_check_free_space_in_heap(index);
+
+ ahi_latch->wr_lock(SRW_LOCK_CALL);
+
+ if (!btr_search_enabled) {
+
goto exit_func; + } + + /* This counter is decremented every time we drop page + hash index entries and is incremented here. Since we can + rebuild hash index for a page that is already hashed, we + have to take care not to increment the counter in that + case. */ + if (!block->index) { + assert_block_ahi_empty(block); + index->search_info->ref_count++; + } else if (block->curr_n_fields != n_fields + || block->curr_n_bytes != n_bytes + || block->curr_left_side != left_side) { + goto exit_func; + } + + block->n_hash_helps = 0; + + block->curr_n_fields = n_fields & dict_index_t::MAX_N_FIELDS; + block->curr_n_bytes = n_bytes & ((1U << 15) - 1); + block->curr_left_side = left_side; + block->index = index; + + { + auto part = btr_search_sys.get_part(*index); + for (ulint i = 0; i < n_cached; i++) { + ha_insert_for_fold(&part->table, part->heap, + folds[i], block, recs[i]); + } + } + + MONITOR_INC(MONITOR_ADAPTIVE_HASH_PAGE_ADDED); + MONITOR_INC_VALUE(MONITOR_ADAPTIVE_HASH_ROW_ADDED, n_cached); +exit_func: + assert_block_ahi_valid(block); + ahi_latch->wr_unlock(); + + ut_free(folds); + ut_free(recs); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } +} + +/** Updates the search info. +@param[in,out] info search info +@param[in,out] cursor cursor which was just positioned */ +void btr_search_info_update_slow(btr_search_t *info, btr_cur_t *cursor) +{ + srw_spin_lock* ahi_latch = &btr_search_sys.get_part(*cursor->index()) + ->latch; + buf_block_t* block = btr_cur_get_block(cursor); + + /* NOTE that the following two function calls do NOT protect + info or block->n_fields etc. with any semaphore, to save CPU time! + We cannot assume the fields are consistent when we return from + those functions! */ + + btr_search_info_update_hash(info, cursor); + + bool build_index = btr_search_update_block_hash_info(info, block); + + if (build_index || (cursor->flag == BTR_CUR_HASH_FAIL)) { + + btr_search_check_free_space_in_heap(cursor->index()); + } + + if (cursor->flag == BTR_CUR_HASH_FAIL) { + /* Update the hash node reference, if appropriate */ + +#ifdef UNIV_SEARCH_PERF_STAT + btr_search_n_hash_fail++; +#endif /* UNIV_SEARCH_PERF_STAT */ + + btr_search_update_hash_ref(info, block, cursor); + } + + if (build_index) { + /* Note that since we did not protect block->n_fields etc. + with any semaphore, the values can be inconsistent. We have + to check inside the function call that they make sense. */ + btr_search_build_page_hash_index(cursor->index(), block, + ahi_latch, + block->n_fields, + block->n_bytes, + block->left_side); + } +} + +/** Move or delete hash entries for moved records, usually in a page split. +If new_block is already hashed, then any hash index for block is dropped. +If new_block is not hashed, and block is hashed, then a new hash index is +built to new_block with the same parameters as block. +@param[in,out] new_block destination page +@param[in,out] block source page (subject to deletion later) */ +void +btr_search_move_or_delete_hash_entries( + buf_block_t* new_block, + buf_block_t* block) +{ + ut_ad(block->page.lock.have_x()); + ut_ad(new_block->page.lock.have_x()); + + if (!btr_search_enabled) { + return; + } + + dict_index_t* index = block->index; + if (!index) { + index = new_block->index; + } else { + ut_ad(!new_block->index || index == new_block->index); + } + assert_block_ahi_valid(block); + assert_block_ahi_valid(new_block); + + srw_spin_lock* ahi_latch = index + ? 
&btr_search_sys.get_part(*index)->latch + : nullptr; + + if (new_block->index) { +drop_exit: + btr_search_drop_page_hash_index(block, false); + return; + } + + if (!index) { + return; + } + + ahi_latch->rd_lock(SRW_LOCK_CALL); + + if (index->freed()) { + ahi_latch->rd_unlock(); + goto drop_exit; + } + + if (block->index) { + uint16_t n_fields = block->curr_n_fields; + uint16_t n_bytes = block->curr_n_bytes; + bool left_side = block->curr_left_side; + + new_block->n_fields = block->curr_n_fields; + new_block->n_bytes = block->curr_n_bytes; + new_block->left_side = left_side; + + ahi_latch->rd_unlock(); + + ut_a(n_fields > 0 || n_bytes > 0); + + btr_search_build_page_hash_index( + index, new_block, ahi_latch, + n_fields, n_bytes, left_side); + ut_ad(n_fields == block->curr_n_fields); + ut_ad(n_bytes == block->curr_n_bytes); + ut_ad(left_side == block->curr_left_side); + return; + } + + ahi_latch->rd_unlock(); +} + +/** Updates the page hash index when a single record is deleted from a page. +@param[in] cursor cursor which was positioned on the record to delete + using btr_cur_search_, the record is not yet deleted.*/ +void btr_search_update_hash_on_delete(btr_cur_t *cursor) +{ + buf_block_t* block; + const rec_t* rec; + ulint fold; + dict_index_t* index; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + mem_heap_t* heap = NULL; + rec_offs_init(offsets_); + + ut_ad(page_is_leaf(btr_cur_get_page(cursor))); + + if (!btr_search_enabled) { + return; + } + + block = btr_cur_get_block(cursor); + + ut_ad(block->page.lock.have_x()); + + assert_block_ahi_valid(block); + index = block->index; + + if (!index) { + + return; + } + + ut_ad(!cursor->index()->table->is_temporary()); + + if (index != cursor->index()) { + btr_search_drop_page_hash_index(block, false); + return; + } + + ut_ad(block->page.id().space() == index->table->space_id); + ut_a(index == cursor->index()); + ut_a(block->curr_n_fields > 0 || block->curr_n_bytes > 0); + ut_ad(!dict_index_is_ibuf(index)); + + rec = btr_cur_get_rec(cursor); + + fold = rec_fold(rec, rec_get_offsets(rec, index, offsets_, + index->n_core_fields, + ULINT_UNDEFINED, &heap), + block->curr_n_fields, block->curr_n_bytes, index->id); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + auto part = btr_search_sys.get_part(*index); + + part->latch.wr_lock(SRW_LOCK_CALL); + assert_block_ahi_valid(block); + + if (block->index && btr_search_enabled) { + ut_a(block->index == index); + + if (ha_search_and_delete_if_found(&part->table, part->heap, + fold, rec)) { + MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_REMOVED); + } else { + MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_REMOVE_NOT_FOUND); + } + + assert_block_ahi_valid(block); + } + + part->latch.wr_unlock(); +} + +/** Updates the page hash index when a single record is inserted on a page. +@param[in] cursor cursor which was positioned to the place to insert + using btr_cur_search_, and the new record has been + inserted next to the cursor. 
+@param[in] ahi_latch the adaptive hash index latch */ +void btr_search_update_hash_node_on_insert(btr_cur_t *cursor, + srw_spin_lock *ahi_latch) +{ + buf_block_t* block; + dict_index_t* index; + rec_t* rec; + + ut_ad(ahi_latch == &btr_search_sys.get_part(*cursor->index())->latch); + + if (!btr_search_enabled) { + return; + } + + rec = btr_cur_get_rec(cursor); + + block = btr_cur_get_block(cursor); + + ut_ad(block->page.lock.have_x()); + + index = block->index; + + if (!index) { + + return; + } + + ut_ad(!cursor->index()->table->is_temporary()); + + if (index != cursor->index()) { + ut_ad(index->id == cursor->index()->id); + btr_search_drop_page_hash_index(block, false); + return; + } + + ut_a(cursor->index() == index); + ut_ad(!dict_index_is_ibuf(index)); + ahi_latch->wr_lock(SRW_LOCK_CALL); + + if (!block->index || !btr_search_enabled) { + + goto func_exit; + } + + ut_a(block->index == index); + + if ((cursor->flag == BTR_CUR_HASH) + && (cursor->n_fields == block->curr_n_fields) + && (cursor->n_bytes == block->curr_n_bytes) + && !block->curr_left_side) { + if (const rec_t *new_rec = page_rec_get_next_const(rec)) { + if (ha_search_and_update_if_found( + &btr_search_sys.get_part(*cursor->index()) + ->table, + cursor->fold, rec, block, new_rec)) { + MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_UPDATED); + } + } else { + ut_ad("corrupted page" == 0); + } + +func_exit: + assert_block_ahi_valid(block); + ahi_latch->wr_unlock(); + } else { + ahi_latch->wr_unlock(); + + btr_search_update_hash_on_insert(cursor, ahi_latch); + } +} + +/** Updates the page hash index when a single record is inserted on a page. +@param[in,out] cursor cursor which was positioned to the + place to insert using btr_cur_search_..., + and the new record has been inserted next + to the cursor +@param[in] ahi_latch the adaptive hash index latch */ +void btr_search_update_hash_on_insert(btr_cur_t *cursor, + srw_spin_lock *ahi_latch) +{ + buf_block_t* block; + dict_index_t* index; + const rec_t* rec; + const rec_t* ins_rec; + const rec_t* next_rec; + ulint fold; + ulint ins_fold; + ulint next_fold = 0; /* remove warning (??? bug ???) 
*/ + ulint n_fields; + ulint n_bytes; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + + ut_ad(ahi_latch == &btr_search_sys.get_part(*cursor->index())->latch); + ut_ad(page_is_leaf(btr_cur_get_page(cursor))); + + if (!btr_search_enabled) { + return; + } + + block = btr_cur_get_block(cursor); + + ut_ad(block->page.lock.have_x()); + assert_block_ahi_valid(block); + + index = block->index; + + if (!index) { + + return; + } + + ut_ad(block->page.id().space() == index->table->space_id); + btr_search_check_free_space_in_heap(index); + + rec = btr_cur_get_rec(cursor); + + ut_ad(!cursor->index()->table->is_temporary()); + + if (index != cursor->index()) { + ut_ad(index->id == cursor->index()->id); +drop: + btr_search_drop_page_hash_index(block, false); + return; + } + + ut_a(index == cursor->index()); + ut_ad(!dict_index_is_ibuf(index)); + + n_fields = block->curr_n_fields; + n_bytes = block->curr_n_bytes; + const bool left_side = block->curr_left_side; + + ins_rec = page_rec_get_next_const(rec); + if (UNIV_UNLIKELY(!ins_rec)) goto drop; + next_rec = page_rec_get_next_const(ins_rec); + if (UNIV_UNLIKELY(!next_rec)) goto drop; + + offsets = rec_get_offsets(ins_rec, index, offsets, + index->n_core_fields, + ULINT_UNDEFINED, &heap); + ins_fold = rec_fold(ins_rec, offsets, n_fields, n_bytes, index->id); + + if (!page_rec_is_supremum(next_rec)) { + offsets = rec_get_offsets( + next_rec, index, offsets, index->n_core_fields, + btr_search_get_n_fields(n_fields, n_bytes), &heap); + next_fold = rec_fold(next_rec, offsets, n_fields, + n_bytes, index->id); + } + + /* We must not look up "part" before acquiring ahi_latch. */ + btr_search_sys_t::partition* part= nullptr; + bool locked = false; + + if (!page_rec_is_infimum(rec) && !rec_is_metadata(rec, *index)) { + offsets = rec_get_offsets( + rec, index, offsets, index->n_core_fields, + btr_search_get_n_fields(n_fields, n_bytes), &heap); + fold = rec_fold(rec, offsets, n_fields, n_bytes, index->id); + } else { + if (left_side) { + locked = true; + ahi_latch->wr_lock(SRW_LOCK_CALL); + + if (!btr_search_enabled || !block->index) { + goto function_exit; + } + + part = btr_search_sys.get_part(*index); + ha_insert_for_fold(&part->table, part->heap, + ins_fold, block, ins_rec); + MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED); + } + + goto check_next_rec; + } + + if (fold != ins_fold) { + + if (!locked) { + locked = true; + ahi_latch->wr_lock(SRW_LOCK_CALL); + + if (!btr_search_enabled || !block->index) { + goto function_exit; + } + + part = btr_search_sys.get_part(*index); + } + + if (!left_side) { + ha_insert_for_fold(&part->table, part->heap, + fold, block, rec); + } else { + ha_insert_for_fold(&part->table, part->heap, + ins_fold, block, ins_rec); + } + MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED); + } + +check_next_rec: + if (page_rec_is_supremum(next_rec)) { + + if (!left_side) { + if (!locked) { + locked = true; + ahi_latch->wr_lock(SRW_LOCK_CALL); + + if (!btr_search_enabled || !block->index) { + goto function_exit; + } + + part = btr_search_sys.get_part(*index); + } + + ha_insert_for_fold(&part->table, part->heap, + ins_fold, block, ins_rec); + MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED); + } + + goto function_exit; + } + + if (ins_fold != next_fold) { + if (!locked) { + locked = true; + ahi_latch->wr_lock(SRW_LOCK_CALL); + + if (!btr_search_enabled || !block->index) { + goto function_exit; + } + + part = btr_search_sys.get_part(*index); + } + + if (!left_side) { + 
ha_insert_for_fold(&part->table, part->heap,
+ ins_fold, block, ins_rec);
+ } else {
+ ha_insert_for_fold(&part->table, part->heap,
+ next_fold, block, next_rec);
+ }
+ MONITOR_INC(MONITOR_ADAPTIVE_HASH_ROW_ADDED);
+ }
+
+function_exit:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ if (locked) {
+ ahi_latch->wr_unlock();
+ }
+}
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+__attribute__((nonnull))
+/** @return whether a range of the cells is valid */
+static bool ha_validate(const hash_table_t *table,
+ ulint start_index, ulint end_index)
+{
+ ut_a(start_index <= end_index);
+ ut_a(end_index < table->n_cells);
+
+ bool ok= true;
+
+ for (ulint i= start_index; i <= end_index; i++)
+ {
+ for (auto node= static_cast<const ha_node_t*>(table->array[i].node); node;
+ node= node->next)
+ {
+ if (table->calc_hash(node->fold) != i) {
+ ib::error() << "Hash table node fold value " << node->fold
+ << " does not match the cell number " << i;
+ ok= false;
+ }
+ }
+ }
+
+ return ok;
+}
+
+/** Validates the search system for given hash table.
+@param thd connection, for checking if CHECK TABLE has been killed
+@param hash_table_id hash table to validate
+@return true if ok */
+static bool btr_search_hash_table_validate(THD *thd, ulint hash_table_id)
+{
+ ha_node_t* node;
+ bool ok = true;
+ ulint i;
+ ulint cell_count;
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+
+ btr_search_x_lock_all();
+ if (!btr_search_enabled || (thd && thd_kill_level(thd))) {
+func_exit:
+ btr_search_x_unlock_all();
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ return ok;
+ }
+
+ /* How many cells to check before temporarily releasing
+ search latches. */
+ ulint chunk_size = 10000;
+
+ rec_offs_init(offsets_);
+
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ auto &part = btr_search_sys.parts[hash_table_id];
+
+ cell_count = part.table.n_cells;
+
+ for (i = 0; i < cell_count; i++) {
+ /* We release search latches every once in a while to
+ give other queries a chance to run. */
+ if ((i != 0) && ((i % chunk_size) == 0)) {
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+ btr_search_x_unlock_all();
+
+ std::this_thread::yield();
+
+ btr_search_x_lock_all();
+
+ if (!btr_search_enabled
+ || (thd && thd_kill_level(thd))) {
+ goto func_exit;
+ }
+
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ ulint curr_cell_count = part.table.n_cells;
+
+ if (cell_count != curr_cell_count) {
+
+ cell_count = curr_cell_count;
+
+ if (i >= cell_count) {
+ break;
+ }
+ }
+ }
+
+ node = static_cast<ha_node_t*>(part.table.array[i].node);
+
+ for (; node != NULL; node = node->next) {
+ const buf_block_t* block
+ = buf_pool.block_from_ahi((byte*) node->data);
+ index_id_t page_index_id;
+
+ if (UNIV_LIKELY(block->page.in_file())) {
+ /* The space and offset are only valid
+ for file blocks. It is possible that
+ the block is being freed
+ (BUF_BLOCK_REMOVE_HASH, see the
+ assertion and the comment below) */
+ const page_id_t id(block->page.id());
+ if (const buf_page_t* hash_page
+ = buf_pool.page_hash.get(
+ id, buf_pool.page_hash.cell_get(
+ id.fold()))) {
+ ut_ad(hash_page == &block->page);
+ goto state_ok;
+ }
+ }
+
+ /* When a block is being freed,
+ buf_LRU_search_and_free_block() first removes
+ the block from buf_pool.page_hash by calling
+ buf_LRU_block_remove_hashed_page(). Then it
+ invokes btr_search_drop_page_hash_index().
*/
+ ut_a(block->page.state() == buf_page_t::REMOVE_HASH);
+state_ok:
+ ut_ad(!dict_index_is_ibuf(block->index));
+ ut_ad(block->page.id().space()
+ == block->index->table->space_id);
+
+ const page_t* page = block->page.frame;
+
+ page_index_id = btr_page_get_index_id(page);
+
+ offsets = rec_get_offsets(
+ node->data, block->index, offsets,
+ block->index->n_core_fields,
+ btr_search_get_n_fields(block->curr_n_fields,
+ block->curr_n_bytes),
+ &heap);
+
+ const ulint fold = rec_fold(
+ node->data, offsets,
+ block->curr_n_fields,
+ block->curr_n_bytes,
+ page_index_id);
+
+ if (node->fold != fold) {
+ ok = FALSE;
+
+ ib::error() << "Error in an adaptive hash"
+ << " index pointer to page "
+ << block->page.id()
+ << ", ptr mem address "
+ << reinterpret_cast<const void*>(
+ node->data)
+ << ", index id " << page_index_id
+ << ", node fold " << node->fold
+ << ", rec fold " << fold;
+
+ fputs("InnoDB: Record ", stderr);
+ rec_print_new(stderr, node->data, offsets);
+ fprintf(stderr, "\nInnoDB: on that page."
+ " Page mem address %p, is hashed %p,"
+ " n fields %lu\n"
+ "InnoDB: side %lu\n",
+ (void*) page, (void*) block->index,
+ (ulong) block->curr_n_fields,
+ (ulong) block->curr_left_side);
+ ut_ad(0);
+ }
+ }
+ }
+
+ for (i = 0; i < cell_count; i += chunk_size) {
+ /* We release search latches every once in a while to
+ give other queries a chance to run. */
+ if (i != 0) {
+ mysql_mutex_unlock(&buf_pool.mutex);
+ btr_search_x_unlock_all();
+
+ std::this_thread::yield();
+
+ btr_search_x_lock_all();
+
+ if (!btr_search_enabled
+ || (thd && thd_kill_level(thd))) {
+ goto func_exit;
+ }
+
+ mysql_mutex_lock(&buf_pool.mutex);
+
+ ulint curr_cell_count = part.table.n_cells;
+
+ if (cell_count != curr_cell_count) {
+
+ cell_count = curr_cell_count;
+
+ if (i >= cell_count) {
+ break;
+ }
+ }
+ }
+
+ ulint end_index = ut_min(i + chunk_size - 1, cell_count - 1);
+
+ if (!ha_validate(&part.table, i, end_index)) {
+ ok = false;
+ }
+ }
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+ goto func_exit;
+}
+
+/** Validates the search system.
+@param thd connection, for checking if CHECK TABLE has been killed
+@return true if ok */
+bool btr_search_validate(THD *thd)
+{
+ for (ulint i= 0; i < btr_ahi_parts; ++i)
+ if (!btr_search_hash_table_validate(thd, i))
+ return(false);
+ return true;
+}
+
+#ifdef UNIV_DEBUG
+bool btr_search_check_marked_free_index(const buf_block_t *block)
+{
+ const index_id_t index_id= btr_page_get_index_id(block->page.frame);
+ auto part= btr_search_sys.get_part(index_id, block->page.id().space());
+
+ part->latch.rd_lock(SRW_LOCK_CALL);
+
+ bool is_freed= block->index && block->index->freed();
+
+ part->latch.rd_unlock();
+
+ return is_freed;
+}
+#endif /* UNIV_DEBUG */
+#endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */
+#endif /* BTR_CUR_HASH_ADAPT */
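+
+/* Illustration, not from the upstream source: btr_search_hash_table_validate()
+above scans the hash table in chunks of 10000 cells, dropping all latches
+between chunks so that other threads can run, and re-reading the cell count
+afterwards because the table may have been resized in the meantime. A
+self-contained model of that pattern; the per-cell check is a stand-in, and
+the sketch assumes <vector>, <mutex>, <thread>, <algorithm> and <cassert>.
+
+  void validate_in_chunks(std::vector<int> &cells, std::mutex &latch)
+  {
+    const size_t chunk_size= 10000;
+    for (size_t i= 0; ; i+= chunk_size)
+    {
+      latch.lock();
+      const size_t n= cells.size();    // re-read: may have changed meanwhile
+      if (i >= n) { latch.unlock(); break; }
+      const size_t end= std::min(i + chunk_size, n);
+      for (size_t j= i; j < end; j++)
+        assert(cells[j] >= 0);         // stand-in for the per-cell check
+      latch.unlock();
+      std::this_thread::yield();       // let waiting threads make progress
+    }
+  }
+*/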
diff --git a/storage/innobase/buf/buf0block_hint.cc b/storage/innobase/buf/buf0block_hint.cc
new file mode 100644
index 00000000..6bd01faa
--- /dev/null
+++ b/storage/innobase/buf/buf0block_hint.cc
@@ -0,0 +1,59 @@
+/*****************************************************************************
+
+Copyright (c) 2020, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2020, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License, version 2.0, as published by the
+Free Software Foundation.
+
+This program is also distributed with certain software (including but not
+limited to OpenSSL) that is licensed under separate terms, as designated in a
+particular file or component or in included license documentation. The authors
+of MySQL hereby grant you an additional permission to link the program and
+your derivative works with the separately licensed software that they have
+included with MySQL.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0,
+for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+*****************************************************************************/
+
+#include "buf0block_hint.h"
+namespace buf {
+
+TRANSACTIONAL_TARGET
+void Block_hint::buffer_fix_block_if_still_valid()
+{
+ /* To check if m_block belongs to the current buf_pool, we must
+ prevent freeing memory while we check, and until we buffer-fix the
+ block. For this purpose it is enough to latch any of the many
+ latches taken by buf_pool_t::resize().
+
+ Similar to buf_page_optimistic_get(), we must validate
+ m_block->page.id() after acquiring the hash_lock, because the object
+ may have been freed and not actually attached to buf_pool.page_hash
+ at the moment. (The block could have been reused to store a
+ different page, and that slice of buf_pool.page_hash could be protected
+ by another hash_lock that we are not holding.)
+
+ Finally, we must ensure that the block is not being freed. */
+ if (m_block)
+ {
+ auto &cell= buf_pool.page_hash.cell_get(m_page_id.fold());
+ transactional_shared_lock_guard<page_hash_latch> g
+ {buf_pool.page_hash.lock_get(cell)};
+ if (buf_pool.is_uncompressed(m_block) && m_page_id == m_block->page.id() &&
+ m_block->page.frame && m_block->page.in_file())
+ m_block->page.fix();
+ else
+ clear();
+ }
+}
+} // namespace buf
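+
+/* Illustration, not from the upstream source: buffer_fix_block_if_still_valid()
+above follows a common optimistic-caching pattern: a cached pointer may have
+gone stale, so it is re-validated under the latch that protects the current
+mapping before the object is pinned. Entry, the map and pin_count are
+hypothetical simplifications; assumes <map>, <mutex> and <cstdint>.
+
+  struct Entry { uint64_t id; int pin_count; };
+
+  // cache maps id -> Entry*; mtx protects the map.
+  Entry *fix_if_still_valid(std::map<uint64_t, Entry*> &cache,
+                            std::mutex &mtx, Entry *hint, uint64_t id)
+  {
+    std::lock_guard<std::mutex> g(mtx);
+    // The hinted entry may have been evicted and reused for another id;
+    // only the mapping read under the latch can confirm it is current.
+    auto it= cache.find(id);
+    if (hint && it != cache.end() && it->second == hint)
+    {
+      hint->pin_count++;      // pin before the latch is released
+      return hint;
+    }
+    return nullptr;           // stale hint: caller falls back to a slow path
+  }
+*/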
diff --git a/storage/innobase/buf/buf0buddy.cc b/storage/innobase/buf/buf0buddy.cc
new file mode 100644
index 00000000..85a698bc
--- /dev/null
+++ b/storage/innobase/buf/buf0buddy.cc
@@ -0,0 +1,769 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0buddy.cc
+Binary buddy allocator for compressed pages
+
+Created December 2006 by Marko Makela
+*******************************************************/
+
+#include "buf0buddy.h"
+#include "buf0buf.h"
+#include "buf0lru.h"
+#include "buf0flu.h"
+#include "page0zip.h"
+#include "srv0start.h"
+
+/** When freeing a block we attempt to coalesce by looking at its buddy
+and deciding whether it is free or not. To ascertain if the buddy is
+free we look for BUF_BUDDY_STAMP_FREE at BUF_BUDDY_STAMP_OFFSET
+within the buddy. The question is how we can be sure that it is
+safe to look at BUF_BUDDY_STAMP_OFFSET.
+The answer lies in the following invariants:
+* All blocks allocated by the buddy allocator are used for compressed
+page frames.
+* A compressed table always has space_id < SRV_SPACE_ID_UPPER_BOUND
+* BUF_BUDDY_STAMP_OFFSET always points to the space_id field in
+a frame.
+ -- The above is true because we look at these fields when the
+ corresponding buddy block is free which implies that:
+ * The block we are looking at must have an address aligned at
+ the same size that its free buddy has. For example, if we have
+ a free block of 8K then its buddy's address must be aligned at
+ 8K as well.
+ * It is possible that the block we are looking at may have been
+ further divided into smaller sized blocks but its starting
+ address must still remain the start of a page frame i.e.: it
+ cannot be the middle of a block. For example, if we have a free
+ block of size 8K then its buddy may be divided into blocks
+ of, say, 1K, 1K, 2K, 4K but the buddy's address will still be
+ the starting address of the first 1K compressed page.
+ * What is important to note is that for any given block, the
+ buddy's address cannot be in the middle of a larger block i.e.:
+ in the above example, our 8K block cannot have a buddy whose address
+ is aligned on 8K but is part of a larger 16K block.
+*/
+
+/** Offset within buf_buddy_free_t where free or non_free stamps
+are written.*/
+#define BUF_BUDDY_STAMP_OFFSET FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID
+
+/** Value that we stamp on all buffers that are currently on the zip_free
+list. This value is stamped at BUF_BUDDY_STAMP_OFFSET offset */
+#define BUF_BUDDY_STAMP_FREE SRV_SPACE_ID_UPPER_BOUND
+
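+/* Illustration, not from the upstream source: the stamps defined here make
+every free buddy block self-describing: BUF_BUDDY_STAMP_FREE is written at
+BUF_BUDDY_STAMP_OFFSET, a position where (by the invariants listed above) a
+compressed page can never store that value. A minimal model with
+hypothetical offset and sentinel values; assumes <cstring>, <cstdint> and
+<cstddef>.
+
+  const uint32_t STAMP_FREE= 0xFFFFFFF0u; // no valid payload uses this
+  const size_t STAMP_OFFSET= 34;          // fixed offset inside each block
+
+  void stamp_free(unsigned char *block)
+  { std::memcpy(block + STAMP_OFFSET, &STAMP_FREE, 4); }
+
+  bool looks_free(const unsigned char *block)
+  {
+    uint32_t v;
+    std::memcpy(&v, block + STAMP_OFFSET, 4);
+    return v == STAMP_FREE;
+  }
+*/
+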
+/** Stamp value for non-free buffers. Will be overwritten by a non-zero
+value by the consumer of the block */
+#define BUF_BUDDY_STAMP_NONFREE 0XFFFFFFFFUL
+
+/** Return type of buf_buddy_is_free() */
+enum buf_buddy_state_t {
+ BUF_BUDDY_STATE_FREE, /*!< The buddy is completely free */
+ BUF_BUDDY_STATE_USED, /*!< The buddy is currently in use */
+ BUF_BUDDY_STATE_PARTIALLY_USED/*!< Some sub-blocks in the buddy
+ are in use */
+};
+
+/**********************************************************************//**
+Invalidate memory area that we won't access while page is free */
+UNIV_INLINE
+void
+buf_buddy_mem_invalid(
+/*==================*/
+ buf_buddy_free_t* buf, /*!< in: block to check */
+ ulint i) /*!< in: index of zip_free[] */
+{
+ ut_ad(i <= BUF_BUDDY_SIZES);
+
+ MEM_CHECK_ADDRESSABLE(buf, BUF_BUDDY_LOW << i);
+ MEM_UNDEFINED(buf, BUF_BUDDY_LOW << i);
+}
+
+/**********************************************************************//**
+Check if a buddy is stamped free.
+@return whether the buddy is free */
+UNIV_INLINE MY_ATTRIBUTE((warn_unused_result))
+bool
+buf_buddy_stamp_is_free(
+/*====================*/
+ const buf_buddy_free_t* buf) /*!< in: block to check */
+{
+ compile_time_assert(BUF_BUDDY_STAMP_FREE < BUF_BUDDY_STAMP_NONFREE);
+ return(mach_read_from_4(buf->stamp.bytes + BUF_BUDDY_STAMP_OFFSET)
+ == BUF_BUDDY_STAMP_FREE);
+}
+
+/**********************************************************************//**
+Stamps a buddy free. */
+UNIV_INLINE
+void
+buf_buddy_stamp_free(
+/*=================*/
+ buf_buddy_free_t* buf, /*!< in/out: block to stamp */
+ ulint i) /*!< in: block size */
+{
+ ut_d(memset(&buf->stamp.bytes, int(i), BUF_BUDDY_LOW << i));
+ buf_buddy_mem_invalid(buf, i);
+ mach_write_to_4(buf->stamp.bytes + BUF_BUDDY_STAMP_OFFSET,
+ BUF_BUDDY_STAMP_FREE);
+ buf->stamp.size = i;
+}
+
+/**********************************************************************//**
+Stamps a buddy nonfree.
+@param[in,out] buf block to stamp
+@param[in] i block size */
+static inline void buf_buddy_stamp_nonfree(buf_buddy_free_t* buf, ulint i)
+{
+ buf_buddy_mem_invalid(buf, i);
+ compile_time_assert(BUF_BUDDY_STAMP_NONFREE == 0xffffffffU);
+ memset(buf->stamp.bytes + BUF_BUDDY_STAMP_OFFSET, 0xff, 4);
+}
+
+/**********************************************************************//**
+Get the buddy of a compressed page frame.
+@return the buddy of the given page */
+UNIV_INLINE
+void*
+buf_buddy_get(
+/*==========*/
+ byte* page, /*!< in: compressed page */
+ ulint size) /*!< in: page size in bytes */
+{
+ ut_ad(ut_is_2pow(size));
+ ut_ad(size >= BUF_BUDDY_LOW);
+ ut_ad(BUF_BUDDY_LOW <= UNIV_ZIP_SIZE_MIN);
+ ut_ad(size < BUF_BUDDY_HIGH);
+ ut_ad(BUF_BUDDY_HIGH == srv_page_size);
+ ut_ad(!ut_align_offset(page, size));
+
+ if (((ulint) page) & size) {
+ return(page - size);
+ } else {
+ return(page + size);
+ }
+}
+
+#ifdef UNIV_DEBUG
+/** Validate a given zip_free list. */
+struct CheckZipFree {
+ CheckZipFree(ulint i) : m_i(i) {}
+
+ void operator()(const buf_buddy_free_t* elem) const
+ {
+ ut_ad(buf_buddy_stamp_is_free(elem));
+ ut_ad(elem->stamp.size <= m_i);
+ }
+
+ const ulint m_i;
+};
+
+/** Validate a buddy list.
+@param[in] i buddy size to validate */
+static void buf_buddy_list_validate(ulint i)
+{
+ ut_list_validate(buf_pool.zip_free[i], CheckZipFree(i));
+}
+
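+/* Illustration, not from the upstream source: buf_buddy_get() above locates
+a block's buddy by testing the size bit of the block's address. Both halves
+of a power-of-two pair are aligned to their own size, so that single bit
+tells whether the block is the low or the high half. A self-contained
+model; assumes <cassert>, <cstdint> and <cstddef>.
+
+  unsigned char *buddy_of(unsigned char *block, size_t size)
+  {
+    assert(size && !(size & (size - 1)));                       // power of two
+    assert(!(reinterpret_cast<uintptr_t>(block) & (size - 1))); // aligned
+    return (reinterpret_cast<uintptr_t>(block) & size)
+      ? block - size  // high half: the buddy lies below
+      : block + size; // low half: the buddy lies above
+  }
+*/
+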
+/**********************************************************************//**
+Debug function to validate that a buffer is indeed free i.e.: in the
+zip_free[].
+@param[in] buf block to check
+@param[in] i index of buf_pool.zip_free[]
+@return true if free */
+static bool buf_buddy_check_free(const buf_buddy_free_t* buf, ulint i)
+{
+ const ulint size = BUF_BUDDY_LOW << i;
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(!ut_align_offset(buf, size));
+ ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+
+ buf_buddy_free_t* itr;
+
+ for (itr = UT_LIST_GET_FIRST(buf_pool.zip_free[i]);
+ itr && itr != buf;
+ itr = UT_LIST_GET_NEXT(list, itr)) {
+ }
+
+ return(itr == buf);
+}
+#endif /* UNIV_DEBUG */
+
+/**********************************************************************//**
+Checks if a buf is free i.e.: in the zip_free[].
+@retval BUF_BUDDY_STATE_FREE if fully free
+@retval BUF_BUDDY_STATE_USED if currently in use
+@retval BUF_BUDDY_STATE_PARTIALLY_USED if partially in use. */
+static MY_ATTRIBUTE((warn_unused_result))
+buf_buddy_state_t
+buf_buddy_is_free(
+/*==============*/
+ buf_buddy_free_t* buf, /*!< in: block to check */
+ ulint i) /*!< in: index of
+ buf_pool.zip_free[] */
+{
+#ifdef UNIV_DEBUG
+ const ulint size = BUF_BUDDY_LOW << i;
+ ut_ad(!ut_align_offset(buf, size));
+ ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+#endif /* UNIV_DEBUG */
+
+ /* We assume that all memory from buf_buddy_alloc()
+ is used for compressed page frames. */
+
+ /* We look inside the allocated objects returned by
+ buf_buddy_alloc() and assume that each block is a compressed
+ page that contains one of the following in space_id.
+ * BUF_BUDDY_STAMP_FREE if the block is in a zip_free list or
+ * BUF_BUDDY_STAMP_NONFREE if the block has been allocated but
+ not initialized yet or
+ * A valid space_id of a compressed tablespace
+
+ The call below attempts to read from free memory. The memory
+ is "owned" by the buddy allocator (and it has been allocated
+ from the buffer pool), so there is nothing wrong about this. */
+ if (!buf_buddy_stamp_is_free(buf)) {
+ return(BUF_BUDDY_STATE_USED);
+ }
+
+ /* A block may be free but a fragment of it may still be in use.
+ To guard against that we write the free block size in terms of
+ zip_free index at start of stamped block. Note that we can
+ safely rely on this value only if the buf is free. */
+ ut_ad(buf->stamp.size <= i);
+ return(buf->stamp.size == i
+ ? BUF_BUDDY_STATE_FREE
+ : BUF_BUDDY_STATE_PARTIALLY_USED);
+}
+
+/** Add a block to the head of the appropriate buddy free list.
+@param[in,out] buf block to be freed
+@param[in] i index of buf_pool.zip_free[] */
+UNIV_INLINE
+void
+buf_buddy_add_to_free(buf_buddy_free_t* buf, ulint i)
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(buf_pool.zip_free[i].start != buf);
+
+ buf_buddy_stamp_free(buf, i);
+ UT_LIST_ADD_FIRST(buf_pool.zip_free[i], buf);
+ ut_d(buf_buddy_list_validate(i));
+}
+
+/** Remove a block from the appropriate buddy free list.
+@param[in,out] buf block to be freed
+@param[in] i index of buf_pool.zip_free[] */
+UNIV_INLINE
+void
+buf_buddy_remove_from_free(buf_buddy_free_t* buf, ulint i)
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(buf_buddy_check_free(buf, i));
+
+ UT_LIST_REMOVE(buf_pool.zip_free[i], buf);
+ buf_buddy_stamp_nonfree(buf, i);
+}
+
+/** Try to allocate a block from buf_pool.zip_free[].
+@param[in] i index of buf_pool.zip_free[]
+@return allocated block, or NULL if buf_pool.zip_free[] was empty */
+static buf_buddy_free_t* buf_buddy_alloc_zip(ulint i)
+{
+ buf_buddy_free_t* buf;
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_a(i < BUF_BUDDY_SIZES);
+ ut_a(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+
+ ut_d(buf_buddy_list_validate(i));
+
+ buf = UT_LIST_GET_FIRST(buf_pool.zip_free[i]);
+
+ if (buf_pool.is_shrinking()
+ && UT_LIST_GET_LEN(buf_pool.withdraw)
+ < buf_pool.withdraw_target) {
+
+ while (buf != NULL
+ && buf_pool.will_be_withdrawn(
+ reinterpret_cast<byte*>(buf))) {
+ /* This should be withdrawn, not to be allocated */
+ buf = UT_LIST_GET_NEXT(list, buf);
+ }
+ }
+
+ if (buf) {
+ buf_buddy_remove_from_free(buf, i);
+ } else if (i + 1 < BUF_BUDDY_SIZES) {
+ /* Attempt to split. */
+ buf = buf_buddy_alloc_zip(i + 1);
+
+ if (buf) {
+ buf_buddy_free_t* buddy =
+ reinterpret_cast<buf_buddy_free_t*>(
+ reinterpret_cast<byte*>(buf)
+ + (BUF_BUDDY_LOW << i));
+ ut_ad(!buf_pool.contains_zip(buddy));
+ buf_buddy_add_to_free(buddy, i);
+ }
+ }
+
+ if (buf) {
+ /* Trash the page other than the BUF_BUDDY_STAMP_NONFREE. */
+ MEM_UNDEFINED(buf, BUF_BUDDY_STAMP_OFFSET);
+ MEM_UNDEFINED(BUF_BUDDY_STAMP_OFFSET + 4 + buf->stamp.bytes,
+ (BUF_BUDDY_LOW << i)
+ - (BUF_BUDDY_STAMP_OFFSET + 4));
+ ut_ad(mach_read_from_4(buf->stamp.bytes
+ + BUF_BUDDY_STAMP_OFFSET)
+ == BUF_BUDDY_STAMP_NONFREE);
+ }
+
+ return(buf);
+}
+
+/** Deallocate a buffer frame of srv_page_size.
+@param[in] buf buffer frame to deallocate */
+static
+void
+buf_buddy_block_free(void* buf)
+{
+ const ulint fold = BUF_POOL_ZIP_FOLD_PTR(buf);
+ buf_page_t* bpage;
+ buf_block_t* block;
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_a(!ut_align_offset(buf, srv_page_size));
+
+ HASH_SEARCH(hash, &buf_pool.zip_hash, fold, buf_page_t*, bpage,
+ ut_ad(bpage->state() == buf_page_t::MEMORY
+ && bpage->in_zip_hash),
+ bpage->frame == buf);
+ ut_a(bpage);
+ ut_a(bpage->state() == buf_page_t::MEMORY);
+ ut_ad(bpage->in_zip_hash);
+ ut_d(bpage->in_zip_hash = false);
+ HASH_DELETE(buf_page_t, hash, &buf_pool.zip_hash, fold, bpage);
+ bpage->hash = nullptr;
+
+ ut_d(memset(buf, 0, srv_page_size));
+ MEM_UNDEFINED(buf, srv_page_size);
+
+ block = (buf_block_t*) bpage;
+ buf_LRU_block_free_non_file_page(block);
+
+ ut_ad(buf_pool.buddy_n_frames > 0);
+ ut_d(buf_pool.buddy_n_frames--);
+}
+
+/**********************************************************************//**
+Allocate a buffer block to the buddy allocator. */
+static
+void
+buf_buddy_block_register(
+/*=====================*/
+ buf_block_t* block) /*!< in: buffer frame to allocate */
+{
+ const ulint fold = BUF_POOL_ZIP_FOLD(block);
+ ut_ad(block->page.state() == buf_page_t::MEMORY);
+
+ ut_a(block->page.frame);
+ ut_a(!ut_align_offset(block->page.frame, srv_page_size));
+
+ ut_ad(!block->page.in_zip_hash);
+ ut_d(block->page.in_zip_hash = true);
+ HASH_INSERT(buf_page_t, hash, &buf_pool.zip_hash, fold, &block->page);
+
+ ut_d(buf_pool.buddy_n_frames++);
+}
+
+/** Allocate a block from a bigger object.
+@param[in] buf a block that is free to use
+@param[in] i index of buf_pool.zip_free[]
+@param[in] j size of buf as an index of buf_pool.zip_free[]
+@return allocated block */
+static
+void*
+buf_buddy_alloc_from(void* buf, ulint i, ulint j)
+{
+ ulint offs = BUF_BUDDY_LOW << j;
+ ut_ad(j <= BUF_BUDDY_SIZES);
+ ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+ ut_ad(j >= i);
+ ut_ad(!ut_align_offset(buf, offs));
+
+ /* Add the unused parts of the block to the free lists.
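+For example (an illustrative walk-through, taking BUF_BUDDY_LOW to be
+1024 bytes, the smallest buddy block size): splitting a j = 3 block of
+8192 bytes down to i = 1 (2048 bytes) first adds the upper 4096-byte
+half at offset 4096 to zip_free[2], then the 2048-byte quarter at
+offset 2048 to zip_free[1], and hands the lowest 2048 bytes to the
+caller.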
*/
+ while (j > i) {
+ buf_buddy_free_t* zip_buf;
+
+ offs >>= 1;
+ j--;
+
+ zip_buf = reinterpret_cast<buf_buddy_free_t*>(
+ reinterpret_cast<byte*>(buf) + offs);
+ buf_buddy_add_to_free(zip_buf, j);
+ }
+
+ buf_buddy_stamp_nonfree(reinterpret_cast<buf_buddy_free_t*>(buf), i);
+ return(buf);
+}
+
+/** Allocate a ROW_FORMAT=COMPRESSED block.
+@param i index of buf_pool.zip_free[] or BUF_BUDDY_SIZES
+@param lru assigned to true if buf_pool.mutex was temporarily released
+@return allocated block, never NULL */
+byte *buf_buddy_alloc_low(ulint i, bool *lru)
+{
+ buf_block_t* block;
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+
+ if (i < BUF_BUDDY_SIZES) {
+ /* Try to allocate from the buddy system. */
+ block = (buf_block_t*) buf_buddy_alloc_zip(i);
+
+ if (block) {
+ goto func_exit;
+ }
+ }
+
+ /* Try allocating from the buf_pool.free list. */
+ block = buf_LRU_get_free_only();
+
+ if (block) {
+ goto alloc_big;
+ }
+
+ /* Try replacing an uncompressed page in the buffer pool. */
+ block = buf_LRU_get_free_block(true);
+ if (lru) {
+ *lru = true;
+ }
+
+alloc_big:
+ buf_buddy_block_register(block);
+
+ block = reinterpret_cast<buf_block_t*>(
+ buf_buddy_alloc_from(block->page.frame, i, BUF_BUDDY_SIZES));
+
+func_exit:
+ buf_pool.buddy_stat[i].used++;
+ return reinterpret_cast<byte*>(block);
+}
+
+/** Try to relocate a block. The caller must hold buf_pool.mutex.
+@param[in] src block to relocate
+@param[in] dst free block to relocate to
+@param[in] i index of buf_pool.zip_free[]
+@param[in] force true if we must always relocate
+@return true if relocated */
+static bool buf_buddy_relocate(void* src, void* dst, ulint i, bool force)
+{
+ buf_page_t* bpage;
+ const ulint size = BUF_BUDDY_LOW << i;
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(!ut_align_offset(src, size));
+ ut_ad(!ut_align_offset(dst, size));
+ ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+ MEM_CHECK_ADDRESSABLE(dst, size);
+
+ uint32_t space = mach_read_from_4(static_cast<const byte*>(src)
+ + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+ uint32_t offset = mach_read_from_4(static_cast<const byte*>(src)
+ + FIL_PAGE_OFFSET);
+
+ /* Suppress Valgrind or MSAN warnings. */
+ MEM_MAKE_DEFINED(&space, sizeof space);
+ MEM_MAKE_DEFINED(&offset, sizeof offset);
+
+ ut_ad(space != BUF_BUDDY_STAMP_FREE);
+
+ const page_id_t page_id(space, offset);
+ /* FIXME: we are computing this while holding buf_pool.mutex */
+ auto &cell= buf_pool.page_hash.cell_get(page_id.fold());
+
+ bpage = buf_pool.page_hash.get(page_id, cell);
+
+ if (!bpage || bpage->zip.data != src) {
+ /* The block has probably been freshly
+ allocated by buf_LRU_get_free_block() but not
+ added to buf_pool.page_hash yet. Obviously,
+ it cannot be relocated. */
+
+ if (!force || space != 0 || offset != 0) {
+ return(false);
+ }
+
+ /* It might be just an uninitialized page.
+ We should also search the LRU list. */
+
+ bpage = UT_LIST_GET_FIRST(buf_pool.LRU);
+ while (bpage != NULL) {
+ if (bpage->zip.data == src) {
+ ut_ad(bpage->id() == page_id);
+ break;
+ }
+ bpage = UT_LIST_GET_NEXT(LRU, bpage);
+ }
+
+ if (bpage == NULL) {
+ return(false);
+ }
+ }
+
+ if (page_zip_get_size(&bpage->zip) != size) {
+ /* The block is of different size. We would
+ have to relocate all blocks covered by src.
+ For the sake of simplicity, give up. */
+ ut_ad(page_zip_get_size(&bpage->zip) < size);
+ return(false);
+ }
+
+ /* The block must have been allocated, but it may
+ contain uninitialized data.
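+ Only the page identifier that was read above can be trusted;
+ the rest of the frame may be garbage, so it is merely checked
+ to be addressable before it is copied.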
*/
+ MEM_CHECK_ADDRESSABLE(src, size);
+
+ if (!bpage->can_relocate()) {
+ return false;
+ }
+
+ page_hash_latch &hash_lock = buf_pool.page_hash.lock_get(cell);
+ /* It does not make sense to use transactional_lock_guard here,
+ because the memcpy() of 1024 to 16384 bytes would likely make the
+ memory transaction too large. */
+ hash_lock.lock();
+
+ if (bpage->can_relocate()) {
+ /* Relocate the compressed page. */
+ const ulonglong ns = my_interval_timer();
+
+ ut_a(bpage->zip.data == src);
+
+ memcpy(dst, src, size);
+ bpage->zip.data = reinterpret_cast<page_zip_t*>(dst);
+
+ hash_lock.unlock();
+
+ buf_buddy_mem_invalid(
+ reinterpret_cast<buf_buddy_free_t*>(src), i);
+
+ buf_buddy_stat_t* buddy_stat = &buf_pool.buddy_stat[i];
+ buddy_stat->relocated++;
+ buddy_stat->relocated_usec+= (my_interval_timer() - ns) / 1000;
+ return(true);
+ }
+
+ hash_lock.unlock();
+
+ return(false);
+}
+
+/** Deallocate a block.
+@param[in] buf block to be freed, must not be pointed to
+ by the buffer pool
+@param[in] i index of buf_pool.zip_free[], or BUF_BUDDY_SIZES */
+void buf_buddy_free_low(void* buf, ulint i)
+{
+ buf_buddy_free_t* buddy;
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(i <= BUF_BUDDY_SIZES);
+ ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+ ut_ad(buf_pool.buddy_stat[i].used > 0);
+
+ buf_pool.buddy_stat[i].used--;
+recombine:
+ MEM_UNDEFINED(buf, BUF_BUDDY_LOW << i);
+
+ if (i == BUF_BUDDY_SIZES) {
+ buf_buddy_block_free(buf);
+ return;
+ }
+
+ ut_ad(i < BUF_BUDDY_SIZES);
+ ut_ad(buf == ut_align_down(buf, BUF_BUDDY_LOW << i));
+ ut_ad(!buf_pool.contains_zip(buf));
+
+ /* Do not recombine blocks if there are few free blocks.
+ We may waste up to 15360*max_len bytes to free blocks
+ (1024 + 2048 + 4096 + 8192 = 15360) */
+ if (UT_LIST_GET_LEN(buf_pool.zip_free[i]) < 16
+ && !buf_pool.is_shrinking()) {
+ goto func_exit;
+ }
+
+ /* Try to combine adjacent blocks. */
+ buddy = reinterpret_cast<buf_buddy_free_t*>(
+ buf_buddy_get(reinterpret_cast<byte*>(buf),
+ BUF_BUDDY_LOW << i));
+
+ switch (buf_buddy_is_free(buddy, i)) {
+ case BUF_BUDDY_STATE_FREE:
+ /* The buddy is free: recombine */
+ buf_buddy_remove_from_free(buddy, i);
+buddy_is_free:
+ ut_ad(!buf_pool.contains_zip(buddy));
+ i++;
+ buf = ut_align_down(buf, BUF_BUDDY_LOW << i);
+
+ goto recombine;
+
+ case BUF_BUDDY_STATE_USED:
+ ut_d(buf_buddy_list_validate(i));
+
+ /* The buddy is not free. Is there a free block of
+ this size? */
+ if (buf_buddy_free_t* zip_buf =
+ UT_LIST_GET_FIRST(buf_pool.zip_free[i])) {
+
+ /* Remove the block from the free list, because
+ a successful buf_buddy_relocate() will overwrite
+ zip_free->list. */
+ buf_buddy_remove_from_free(zip_buf, i);
+
+ /* Try to relocate the buddy of buf to the free
+ block. */
+ if (buf_buddy_relocate(buddy, zip_buf, i, false)) {
+ goto buddy_is_free;
+ }
+
+ buf_buddy_add_to_free(zip_buf, i);
+ }
+
+ break;
+ case BUF_BUDDY_STATE_PARTIALLY_USED:
+ /* Some sub-blocks in the buddy are still in use.
+ Relocation will fail. No need to try. */
+ break;
+ }
+
+func_exit:
+ /* Free the block to the buddy list. */
+ buf_buddy_add_to_free(reinterpret_cast<buf_buddy_free_t*>(buf), i);
+}
+
+/** Try to reallocate a block.
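+A replacement block of the same size is allocated from zip_free[] or,
+failing that, carved out of a frame from buf_pool.free; the contents of
+buf are then moved to it with buf_buddy_relocate(force=true), and
+whichever block ends up unused is released via buf_buddy_free_low().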
+@param[in] buf buf_pool block to be reallocated
+@param[in] size block size, up to srv_page_size
+@return whether the reallocation succeeded */
+bool
+buf_buddy_realloc(void* buf, ulint size)
+{
+ buf_block_t* block = NULL;
+ ulint i = buf_buddy_get_slot(size);
+
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(i <= BUF_BUDDY_SIZES);
+ ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN));
+
+ if (i < BUF_BUDDY_SIZES) {
+ /* Try to allocate from the buddy system. */
+ block = reinterpret_cast<buf_block_t*>(buf_buddy_alloc_zip(i));
+ }
+
+ if (block == NULL) {
+ /* Try allocating from the buf_pool.free list. */
+ block = buf_LRU_get_free_only();
+
+ if (block == NULL) {
+ return(false); /* free_list was not enough */
+ }
+
+ buf_buddy_block_register(block);
+
+ block = reinterpret_cast<buf_block_t*>(
+ buf_buddy_alloc_from(
+ block->page.frame, i, BUF_BUDDY_SIZES));
+ }
+
+ buf_pool.buddy_stat[i].used++;
+
+ /* Try to relocate the buddy of buf to the free block. */
+ if (buf_buddy_relocate(buf, block, i, true)) {
+ /* succeeded */
+ buf_buddy_free_low(buf, i);
+ } else {
+ /* failed */
+ buf_buddy_free_low(block, i);
+ }
+
+ return(true); /* free_list was enough */
+}
+
+/** Combine all pairs of free buddies. */
+void buf_buddy_condense_free()
+{
+ mysql_mutex_assert_owner(&buf_pool.mutex);
+ ut_ad(buf_pool.is_shrinking());
+
+ for (ulint i = 0; i < UT_ARR_SIZE(buf_pool.zip_free); ++i) {
+ buf_buddy_free_t* buf =
+ UT_LIST_GET_FIRST(buf_pool.zip_free[i]);
+
+ /* seek to withdraw target */
+ while (buf != NULL
+ && !buf_pool.will_be_withdrawn(
+ reinterpret_cast<byte*>(buf))) {
+ buf = UT_LIST_GET_NEXT(list, buf);
+ }
+
+ while (buf != NULL) {
+ buf_buddy_free_t* next =
+ UT_LIST_GET_NEXT(list, buf);
+
+ buf_buddy_free_t* buddy =
+ reinterpret_cast<buf_buddy_free_t*>(
+ buf_buddy_get(
+ reinterpret_cast<byte*>(buf),
+ BUF_BUDDY_LOW << i));
+
+ /* seek to the next withdraw target */
+ while (true) {
+ while (next != NULL
+ && !buf_pool.will_be_withdrawn(
+ reinterpret_cast<byte*>(next))) {
+ next = UT_LIST_GET_NEXT(list, next);
+ }
+
+ if (buddy != next) {
+ break;
+ }
+
+ next = UT_LIST_GET_NEXT(list, next);
+ }
+
+ if (buf_buddy_is_free(buddy, i)
+ == BUF_BUDDY_STATE_FREE) {
+ /* Both buf and buddy are free.
+ Try to combine them. */
+ buf_buddy_remove_from_free(buf, i);
+ buf_pool.buddy_stat[i].used++;
+
+ buf_buddy_free_low(buf, i);
+ }
+
+ buf = next;
+ }
+ }
+}
diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc
new file mode 100644
index 00000000..8ef18ee0
--- /dev/null
+++ b/storage/innobase/buf/buf0buf.cc
@@ -0,0 +1,4180 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2018, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file buf/buf0buf.cc +The database buffer buf_pool + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#include "assume_aligned.h" +#include "mtr0types.h" +#include "mach0data.h" +#include "buf0checksum.h" +#include "mariadb_stats.h" +#include + +#ifdef UNIV_INNOCHECKSUM +# include "my_sys.h" +# include "buf0buf.h" +#else +#include "my_cpu.h" +#include "mem0mem.h" +#include "btr0btr.h" +#include "fil0fil.h" +#include "fil0crypt.h" +#include "buf0rea.h" +#include "buf0flu.h" +#include "buf0buddy.h" +#include "buf0dblwr.h" +#include "lock0lock.h" +#include "btr0sea.h" +#include "ibuf0ibuf.h" +#include "trx0undo.h" +#include "trx0purge.h" +#include "log0log.h" +#include "dict0stats_bg.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "dict0dict.h" +#include "log0recv.h" +#include "srv0mon.h" +#include "log0crypt.h" +#include "fil0pagecompress.h" +#endif /* !UNIV_INNOCHECKSUM */ +#include "page0zip.h" +#include "buf0dump.h" +#include +#include +#include "log.h" + +using st_::span; + +#ifdef HAVE_LIBNUMA +#include +#include +struct set_numa_interleave_t +{ + set_numa_interleave_t() + { + if (srv_numa_interleave) { + + struct bitmask *numa_mems_allowed = numa_get_mems_allowed(); + ib::info() << "Setting NUMA memory policy to" + " MPOL_INTERLEAVE"; + if (set_mempolicy(MPOL_INTERLEAVE, + numa_mems_allowed->maskp, + numa_mems_allowed->size) != 0) { + + ib::warn() << "Failed to set NUMA memory" + " policy to MPOL_INTERLEAVE: " + << strerror(errno); + } + numa_bitmask_free(numa_mems_allowed); + } + } + + ~set_numa_interleave_t() + { + if (srv_numa_interleave) { + + ib::info() << "Setting NUMA memory policy to" + " MPOL_DEFAULT"; + if (set_mempolicy(MPOL_DEFAULT, NULL, 0) != 0) { + ib::warn() << "Failed to set NUMA memory" + " policy to MPOL_DEFAULT: " + << strerror(errno); + } + } + } +}; + +#define NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE set_numa_interleave_t scoped_numa +#else +#define NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE +#endif /* HAVE_LIBNUMA */ + +/* + IMPLEMENTATION OF THE BUFFER POOL + ================================= + + Buffer frames and blocks + ------------------------ +Following the terminology of Gray and Reuter, we call the memory +blocks where file pages are loaded buffer frames. For each buffer +frame there is a control block, or shortly, a block, in the buffer +control array. The control info which does not need to be stored +in the file along with the file page, resides in the control block. + + Buffer pool struct + ------------------ +The buffer buf_pool contains a single mutex which protects all the +control data structures of the buf_pool. The content of a buffer frame is +protected by a separate read-write lock in its control block, though. +These locks can be locked and unlocked without owning the buf_pool.mutex. +The OS events in the buf_pool struct can be waited for without owning the +buf_pool.mutex. + +The buf_pool.mutex is a hot-spot in main memory, causing a lot of +memory bus traffic on multiprocessor systems when processors +alternately access the mutex. On our Pentium, the mutex is accessed +maybe every 10 microseconds. 
We gave up the solution to have mutexes +for each control block, for instance, because it seemed to be +complicated. + +A solution to reduce mutex contention of the buf_pool.mutex is to +create a separate mutex for the page hash table. On Pentium, +accessing the hash table takes 2 microseconds, about half +of the total buf_pool.mutex hold time. + + Control blocks + -------------- + +The control block contains, for instance, the bufferfix count +which is incremented when a thread wants a file page to be fixed +in a buffer frame. The bufferfix operation does not lock the +contents of the frame, however. For this purpose, the control +block contains a read-write lock. + +The buffer frames have to be aligned so that the start memory +address of a frame is divisible by the universal page size, which +is a power of two. + +The control blocks containing file pages are put to a hash table +according to the file address of the page. +We could speed up the access to an individual page by using +"pointer swizzling": we could replace the page references on +non-leaf index pages by direct pointers to the page, if it exists +in the buf_pool. We could make a separate hash table where we could +chain all the page references in non-leaf pages residing in the buf_pool, +using the page reference as the hash key, +and at the time of reading of a page update the pointers accordingly. +Drawbacks of this solution are added complexity and, +possibly, extra space required on non-leaf pages for memory pointers. +A simpler solution is just to speed up the hash table mechanism +in the database, using tables whose size is a power of 2. + + Lists of blocks + --------------- + +There are several lists of control blocks. + +The free list (buf_pool.free) contains blocks which are currently not +used. + +The common LRU list contains all the blocks holding a file page +except those for which the bufferfix count is non-zero. +The pages are in the LRU list roughly in the order of the last +access to the page, so that the oldest pages are at the end of the +list. We also keep a pointer to near the end of the LRU list, +which we can use when we want to artificially age a page in the +buf_pool. This is used if we know that some page is not needed +again for some time: we insert the block right after the pointer, +causing it to be replaced sooner than would normally be the case. +Currently this aging mechanism is used for read-ahead mechanism +of pages, and it can also be used when there is a scan of a full +table which cannot fit in the memory. Putting the pages near the +end of the LRU list, we make sure that most of the buf_pool stays +in the main memory, undisturbed. + +The unzip_LRU list contains a subset of the common LRU list. The +blocks on the unzip_LRU list hold a compressed file page and the +corresponding uncompressed page frame. A block is in unzip_LRU if and +only if the predicate block->page.belongs_to_unzip_LRU() +holds. The blocks in unzip_LRU will be in same order as they are in +the common LRU list. That is, each manipulation of the common LRU +list will result in the same manipulation of the unzip_LRU list. + +The chain of modified blocks (buf_pool.flush_list) contains the blocks +holding persistent file pages that have been modified in the memory +but not written to disk yet. The block with the oldest modification +which has not yet been written to disk is at the end of the chain. +The access to this list is protected by buf_pool.flush_list_mutex. 
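+
+To summarize the locking conventions described above: buf_pool.mutex
+protects the free, LRU, unzip_LRU and zip_free[] lists, while
+buf_pool.flush_list_mutex protects only the flush_list.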
+ +The control blocks for uncompressed pages are accessible via +buf_block_t objects that are reachable via buf_pool.chunks[]. +The control blocks (buf_page_t) of those ROW_FORMAT=COMPRESSED pages +that are not in buf_pool.flush_list and for which no uncompressed +page has been allocated in buf_pool are only accessible via +buf_pool.LRU. + +The chains of free memory blocks (buf_pool.zip_free[]) are used by +the buddy allocator (buf0buddy.cc) to keep track of currently unused +memory blocks of size 1024..innodb_page_size / 2. These +blocks are inside the memory blocks of size innodb_page_size and type +BUF_BLOCK_MEMORY that the buddy allocator requests from the buffer +pool. The buddy allocator is solely used for allocating +ROW_FORMAT=COMPRESSED page frames. + + Loading a file page + ------------------- + +First, a victim block for replacement has to be found in the +buf_pool. It is taken from the free list or searched for from the +end of the LRU-list. An exclusive lock is reserved for the frame, +the io_fix is set in the block fixing the block in buf_pool, +and the io-operation for loading the page is queued. The io-handler thread +releases the X-lock on the frame and releases the io_fix +when the io operation completes. + +A thread may request the above operation using the function +buf_page_get(). It may then continue to request a lock on the frame. +The lock is granted when the io-handler releases the x-lock. + + Read-ahead + ---------- + +The read-ahead mechanism is intended to be intelligent and +isolated from the semantically higher levels of the database +index management. From the higher level we only need the +information if a file page has a natural successor or +predecessor page. On the leaf level of a B-tree index, +these are the next and previous pages in the natural +order of the pages. + +Let us first explain the read-ahead mechanism when the leafs +of a B-tree are scanned in an ascending or descending order. +When a read page is the first time referenced in the buf_pool, +the buffer manager checks if it is at the border of a so-called +linear read-ahead area. The tablespace is divided into these +areas of size 64 blocks, for example. So if the page is at the +border of such an area, the read-ahead mechanism checks if +all the other blocks in the area have been accessed in an +ascending or descending order. If this is the case, the system +looks at the natural successor or predecessor of the page, +checks if that is at the border of another area, and in this case +issues read-requests for all the pages in that area. Maybe +we could relax the condition that all the pages in the area +have to be accessed: if data is deleted from a table, there may +appear holes of unused pages in the area. + +A different read-ahead mechanism is used when there appears +to be a random access pattern to a file. +If a new page is referenced in the buf_pool, and several pages +of its random access area (for instance, 32 consecutive pages +in a tablespace) have recently been referenced, we may predict +that the whole area may be needed in the near future, and issue +the read requests for the whole area. +*/ + +#ifndef UNIV_INNOCHECKSUM +# ifdef SUX_LOCK_GENERIC +void page_hash_latch::read_lock_wait() +{ + /* First, try busy spinning for a while. */ + for (auto spin= srv_n_spin_wait_rounds; spin--; ) + { + LF_BACKOFF(); + if (read_trylock()) + return; + } + /* Fall back to yielding to other threads. 
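+After srv_n_spin_wait_rounds fruitless spin rounds it is cheaper to let
+the scheduler run the lock holder than to keep burning CPU cycles.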
*/ + do + std::this_thread::yield(); + while (!read_trylock()); +} + +void page_hash_latch::write_lock_wait() +{ + write_lock_wait_start(); + + /* First, try busy spinning for a while. */ + for (auto spin= srv_n_spin_wait_rounds; spin--; ) + { + if (write_lock_poll()) + return; + LF_BACKOFF(); + } + + /* Fall back to yielding to other threads. */ + do + std::this_thread::yield(); + while (!write_lock_poll()); +} +# endif + +/** Number of attempts made to read in a page in the buffer pool */ +constexpr ulint BUF_PAGE_READ_MAX_RETRIES= 100; +/** The maximum portion of the buffer pool that can be used for the +read-ahead buffer. (Divide buf_pool size by this amount) */ +constexpr uint32_t BUF_READ_AHEAD_PORTION= 32; + +/** A 64KiB buffer of NUL bytes, for use in assertions and checks, +and dummy default values of instantly dropped columns. +Initially, BLOB field references are set to NUL bytes, in +dtuple_convert_big_rec(). */ +const byte *field_ref_zero; + +/** The InnoDB buffer pool */ +buf_pool_t buf_pool; +buf_pool_t::chunk_t::map *buf_pool_t::chunk_t::map_reg; +buf_pool_t::chunk_t::map *buf_pool_t::chunk_t::map_ref; + +#ifdef UNIV_DEBUG +/** This is used to insert validation operations in execution +in the debug version */ +static Atomic_counter buf_dbg_counter; +#endif /* UNIV_DEBUG */ + +/** Macro to determine whether the read of write counter is used depending +on the io_type */ +#define MONITOR_RW_COUNTER(read, counter) \ + (read ? (counter##_READ) : (counter##_WRITTEN)) + +/** Decrypt a page for temporary tablespace. +@param[in,out] tmp_frame Temporary buffer +@param[in] src_frame Page to decrypt +@return true if temporary tablespace decrypted, false if not */ +static bool buf_tmp_page_decrypt(byte* tmp_frame, byte* src_frame) +{ + if (buf_is_zeroes(span(src_frame, srv_page_size))) { + return true; + } + + /* read space & lsn */ + uint header_len = FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION; + + /* Copy FIL page header, it is not encrypted */ + memcpy(tmp_frame, src_frame, header_len); + + /* Calculate the offset where decryption starts */ + const byte* src = src_frame + header_len; + byte* dst = tmp_frame + header_len; + uint srclen = uint(srv_page_size) + - (header_len + FIL_PAGE_FCRC32_CHECKSUM); + ulint offset = mach_read_from_4(src_frame + FIL_PAGE_OFFSET); + + if (!log_tmp_block_decrypt(src, srclen, dst, + (offset * srv_page_size))) { + return false; + } + + static_assert(FIL_PAGE_FCRC32_CHECKSUM == 4, "alignment"); + memcpy_aligned<4>(tmp_frame + srv_page_size - FIL_PAGE_FCRC32_CHECKSUM, + src_frame + srv_page_size - FIL_PAGE_FCRC32_CHECKSUM, + FIL_PAGE_FCRC32_CHECKSUM); + + memcpy_aligned(src_frame, tmp_frame, + srv_page_size); + srv_stats.pages_decrypted.inc(); + srv_stats.n_temp_blocks_decrypted.inc(); + + return true; /* page was decrypted */ +} + +/** Decrypt a page. +@param[in,out] bpage Page control block +@param[in] node data file +@return whether the operation was successful */ +static bool buf_page_decrypt_after_read(buf_page_t *bpage, + const fil_node_t &node) +{ + ut_ad(node.space->referenced()); + ut_ad(node.space->id == bpage->id().space()); + const auto flags = node.space->flags; + + byte* dst_frame = bpage->zip.data ? 
bpage->zip.data : bpage->frame; + bool page_compressed = node.space->is_compressed() + && buf_page_is_compressed(dst_frame, flags); + const page_id_t id(bpage->id()); + + if (id.page_no() == 0) { + /* File header pages are not encrypted/compressed */ + return (true); + } + + buf_tmp_buffer_t* slot; + + if (id.space() == SRV_TMP_SPACE_ID + && innodb_encrypt_temporary_tables) { + slot = buf_pool.io_buf_reserve(); + slot->allocate(); + bool ok = buf_tmp_page_decrypt(slot->crypt_buf, dst_frame); + slot->release(); + return ok; + } + + /* Page is encrypted if encryption information is found from + tablespace and page contains used key_version. This is true + also for pages first compressed and then encrypted. */ + + uint key_version = buf_page_get_key_version(dst_frame, flags); + + if (page_compressed && !key_version) { + /* the page we read is unencrypted */ + /* Find free slot from temporary memory array */ +decompress: + if (fil_space_t::full_crc32(flags) + && buf_page_is_corrupted(true, dst_frame, flags)) { + return false; + } + + slot = buf_pool.io_buf_reserve(); + slot->allocate(); + +decompress_with_slot: + ulint write_size = fil_page_decompress( + slot->crypt_buf, dst_frame, flags); + slot->release(); + ut_ad(node.space->referenced()); + return write_size != 0; + } + + if (key_version && node.space->crypt_data) { + /* Verify encryption checksum before we even try to + decrypt. */ + if (!buf_page_verify_crypt_checksum(dst_frame, flags)) { +decrypt_failed: + ib::error() << "Encrypted page " << id + << " in file " << node.name + << " looks corrupted; key_version=" + << key_version; + return false; + } + + slot = buf_pool.io_buf_reserve(); + slot->allocate(); + + /* decrypt using crypt_buf to dst_frame */ + if (!fil_space_decrypt(node.space, slot->crypt_buf, dst_frame)) { + slot->release(); + goto decrypt_failed; + } + + if ((fil_space_t::full_crc32(flags) && page_compressed) + || fil_page_get_type(dst_frame) + == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) { + goto decompress_with_slot; + } + + slot->release(); + } else if (fil_page_get_type(dst_frame) + == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) { + goto decompress; + } + + ut_ad(node.space->referenced()); + return true; +} +#endif /* !UNIV_INNOCHECKSUM */ + +/** Checks if the page is in crc32 checksum format. +@param[in] read_buf database page +@param[in] checksum_field1 new checksum field +@param[in] checksum_field2 old checksum field +@return true if the page is in crc32 checksum format. */ +static +bool +buf_page_is_checksum_valid_crc32( + const byte* read_buf, + ulint checksum_field1, + ulint checksum_field2) +{ + const uint32_t crc32 = buf_calc_page_crc32(read_buf); + +#ifdef UNIV_INNOCHECKSUM + extern FILE* log_file; + extern uint32_t cur_page_num; + if (log_file) { + fprintf(log_file, "page::" UINT32PF ";" + " crc32 calculated = " UINT32PF ";" + " recorded checksum field1 = " ULINTPF " recorded" + " checksum field2 =" ULINTPF "\n", cur_page_num, + crc32, checksum_field1, checksum_field2); + } +#endif /* UNIV_INNOCHECKSUM */ + + if (checksum_field1 != checksum_field2) { + return false; + } + + return checksum_field1 == crc32; +} + +/** Checks whether the lsn present in the page is lesser than the +peek current lsn. +@param[in] check_lsn lsn to check +@param[in] read_buf page. 
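+The check is performed only when check_lsn is true and
+recv_lsn_checks_on is set; in UNIV_INNOCHECKSUM builds it is a no-op.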
*/ +static void buf_page_check_lsn(bool check_lsn, const byte* read_buf) +{ +#ifndef UNIV_INNOCHECKSUM + if (check_lsn && recv_lsn_checks_on) { + const lsn_t current_lsn = log_sys.get_lsn(); + const lsn_t page_lsn + = mach_read_from_8(read_buf + FIL_PAGE_LSN); + + /* Since we are going to reset the page LSN during the import + phase it makes no sense to spam the log with error messages. */ + if (current_lsn < page_lsn) { + + const uint32_t space_id = mach_read_from_4( + read_buf + FIL_PAGE_SPACE_ID); + const uint32_t page_no = mach_read_from_4( + read_buf + FIL_PAGE_OFFSET); + + ib::error() << "Page " << page_id_t(space_id, page_no) + << " log sequence number " << page_lsn + << " is in the future! Current system" + << " log sequence number " + << current_lsn << "."; + + ib::error() << "Your database may be corrupt or" + " you may have copied the InnoDB" + " tablespace but not the InnoDB" + " log files. " + << FORCE_RECOVERY_MSG; + + } + } +#endif /* !UNIV_INNOCHECKSUM */ +} + + +/** Check if a buffer is all zeroes. +@param[in] buf data to check +@return whether the buffer is all zeroes */ +bool buf_is_zeroes(span buf) +{ + ut_ad(buf.size() <= UNIV_PAGE_SIZE_MAX); + return memcmp(buf.data(), field_ref_zero, buf.size()) == 0; +} + +/** Check if a page is corrupt. +@param check_lsn whether FIL_PAGE_LSN should be checked +@param read_buf database page +@param fsp_flags contents of FIL_SPACE_FLAGS +@return whether the page is corrupted */ +bool buf_page_is_corrupted(bool check_lsn, const byte *read_buf, + uint32_t fsp_flags) +{ + if (fil_space_t::full_crc32(fsp_flags)) { + bool compressed = false, corrupted = false; + const uint size = buf_page_full_crc32_size( + read_buf, &compressed, &corrupted); + if (corrupted) { + return true; + } + const byte* end = read_buf + (size - FIL_PAGE_FCRC32_CHECKSUM); + uint crc32 = mach_read_from_4(end); + + if (!crc32 && size == srv_page_size + && buf_is_zeroes(span(read_buf, size))) { + return false; + } + + DBUG_EXECUTE_IF( + "page_intermittent_checksum_mismatch", { + static int page_counter; + if (page_counter++ == 3) { + crc32++; + } + }); + + if (crc32 != my_crc32c(0, read_buf, + size - FIL_PAGE_FCRC32_CHECKSUM)) { + return true; + } + static_assert(FIL_PAGE_FCRC32_KEY_VERSION == 0, "alignment"); + static_assert(FIL_PAGE_LSN % 4 == 0, "alignment"); + static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "alignment"); + if (!compressed + && !mach_read_from_4(FIL_PAGE_FCRC32_KEY_VERSION + + read_buf) + && memcmp_aligned<4>(read_buf + (FIL_PAGE_LSN + 4), + end - (FIL_PAGE_FCRC32_END_LSN + - FIL_PAGE_FCRC32_CHECKSUM), + 4)) { + return true; + } + + buf_page_check_lsn(check_lsn, read_buf); + return false; + } + + const ulint zip_size = fil_space_t::zip_size(fsp_flags); + const uint16_t page_type = fil_page_get_type(read_buf); + + /* We can trust page type if page compression is set on tablespace + flags because page compression flag means file must have been + created with 10.1 (later than 5.5 code base). In 10.1 page + compressed tables do not contain post compression checksum and + FIL_PAGE_END_LSN_OLD_CHKSUM field stored. Note that space can + be null if we are in fil_check_first_page() and first page + is not compressed or encrypted. Page checksum is verified + after decompression (i.e. normally pages are already + decompressed at this stage). 
*/ + if ((page_type == FIL_PAGE_PAGE_COMPRESSED || + page_type == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) +#ifndef UNIV_INNOCHECKSUM + && FSP_FLAGS_HAS_PAGE_COMPRESSION(fsp_flags) +#endif + ) { + return(false); + } + + static_assert(FIL_PAGE_LSN % 4 == 0, "alignment"); + static_assert(FIL_PAGE_END_LSN_OLD_CHKSUM % 4 == 0, "alignment"); + + if (!zip_size + && memcmp_aligned<4>(read_buf + FIL_PAGE_LSN + 4, + read_buf + srv_page_size + - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)) { + /* Stored log sequence numbers at the start and the end + of page do not match */ + + return(true); + } + + buf_page_check_lsn(check_lsn, read_buf); + + /* Check whether the checksum fields have correct values */ + + if (zip_size) { + return !page_zip_verify_checksum(read_buf, zip_size); + } + + const uint32_t checksum_field1 = mach_read_from_4( + read_buf + FIL_PAGE_SPACE_OR_CHKSUM); + + const uint32_t checksum_field2 = mach_read_from_4( + read_buf + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM); + + static_assert(FIL_PAGE_LSN % 8 == 0, "alignment"); + + /* A page filled with NUL bytes is considered not corrupted. + Before MariaDB Server 10.1.25 (MDEV-12113) or 10.2.2 (or MySQL 5.7), + the FIL_PAGE_FILE_FLUSH_LSN field may have been written nonzero + for the first page of each file of the system tablespace. + We want to ignore it for the system tablespace, but because + we do not know the expected tablespace here, we ignore the + field for all data files, except for + innodb_checksum_algorithm=full_crc32 which we handled above. */ + if (!checksum_field1 && !checksum_field2) { + /* Checksum fields can have valid value as zero. + If the page is not empty then do the checksum + calculation for the page. */ + bool all_zeroes = true; + for (size_t i = 0; i < srv_page_size; i++) { +#ifndef UNIV_INNOCHECKSUM + if (i == FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION) { + i += 8; + } +#endif + if (read_buf[i]) { + all_zeroes = false; + break; + } + } + + if (all_zeroes) { + return false; + } + } + +#ifndef UNIV_INNOCHECKSUM + switch (srv_checksum_algorithm) { + case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32: + case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32: +#endif /* !UNIV_INNOCHECKSUM */ + return !buf_page_is_checksum_valid_crc32( + read_buf, checksum_field1, checksum_field2); +#ifndef UNIV_INNOCHECKSUM + default: + if (checksum_field1 == BUF_NO_CHECKSUM_MAGIC + && checksum_field2 == BUF_NO_CHECKSUM_MAGIC) { + return false; + } + + const uint32_t crc32 = buf_calc_page_crc32(read_buf); + + /* Very old versions of InnoDB only stored 8 byte lsn to the + start and the end of the page. 
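+ For such pages the old checksum field at the end of the page may
+ hold the most significant four bytes of FIL_PAGE_LSN rather than
+ a checksum, which the comparison against
+ mach_read_from_4(read_buf + FIL_PAGE_LSN) below accepts.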
*/ + + /* Since innodb_checksum_algorithm is not strict_* allow + any of the algos to match for the old field */ + + if (checksum_field2 + != mach_read_from_4(read_buf + FIL_PAGE_LSN) + && checksum_field2 != BUF_NO_CHECKSUM_MAGIC) { + + DBUG_EXECUTE_IF( + "page_intermittent_checksum_mismatch", { + static int page_counter; + if (page_counter++ == 3) return true; + }); + + if ((checksum_field1 != crc32 + || checksum_field2 != crc32) + && checksum_field2 + != buf_calc_page_old_checksum(read_buf)) { + return true; + } + } + + switch (checksum_field1) { + case 0: + case BUF_NO_CHECKSUM_MAGIC: + return false; + } + return (checksum_field1 != crc32 || checksum_field2 != crc32) + && checksum_field1 + != buf_calc_page_new_checksum(read_buf); + } +#endif /* !UNIV_INNOCHECKSUM */ +} + +#ifndef UNIV_INNOCHECKSUM + +#if defined(DBUG_OFF) && defined(HAVE_MADVISE) && defined(MADV_DODUMP) +/** Enable buffers to be dumped to core files + +A convience function, not called anyhwere directly however +it is left available for gdb or any debugger to call +in the event that you want all of the memory to be dumped +to a core file. + +Returns number of errors found in madvise calls. */ +MY_ATTRIBUTE((used)) +int +buf_madvise_do_dump() +{ + int ret= 0; + + /* mirrors allocation in log_t::create() */ + if (log_sys.buf) { + ret += madvise(log_sys.buf, log_sys.buf_size, MADV_DODUMP); + ret += madvise(log_sys.flush_buf, log_sys.buf_size, + MADV_DODUMP); + } + + mysql_mutex_lock(&buf_pool.mutex); + auto chunk = buf_pool.chunks; + + for (ulint n = buf_pool.n_chunks; n--; chunk++) { + ret+= madvise(chunk->mem, chunk->mem_size(), MADV_DODUMP); + } + + mysql_mutex_unlock(&buf_pool.mutex); + return ret; +} +#endif + +#ifndef UNIV_DEBUG +static inline byte hex_to_ascii(byte hex_digit) +{ + const int offset= hex_digit <= 9 ? '0' : 'a' - 10; + return byte(hex_digit + offset); +} +#endif + +/** Dump a page to stderr. +@param[in] read_buf database page +@param[in] zip_size compressed page size, or 0 */ +ATTRIBUTE_COLD +void buf_page_print(const byte *read_buf, ulint zip_size) +{ +#ifndef UNIV_DEBUG + const size_t size = zip_size ? zip_size : srv_page_size; + const byte * const end= read_buf + size; + sql_print_information("InnoDB: Page dump (%zu bytes):", size); + + do + { + byte row[64]; + + for (byte *r= row; r != &row[64]; r+= 2, read_buf++) + { + r[0]= hex_to_ascii(byte(*read_buf >> 4)); + r[1]= hex_to_ascii(*read_buf & 15); + } + + sql_print_information("InnoDB: %.*s", 64, row); + } + while (read_buf != end); + + sql_print_information("InnoDB: End of page dump"); +#endif +} + +/** Initialize a buffer page descriptor. +@param[in,out] block buffer page descriptor +@param[in] frame buffer page frame */ +static +void +buf_block_init(buf_block_t* block, byte* frame) +{ + /* This function should only be executed at database startup or by + buf_pool.resize(). Either way, adaptive hash index must not exist. 
*/ + assert_block_ahi_empty_on_init(block); + + block->page.frame = frame; + + MEM_MAKE_DEFINED(&block->modify_clock, sizeof block->modify_clock); + ut_ad(!block->modify_clock); + MEM_MAKE_DEFINED(&block->page.lock, sizeof block->page.lock); + block->page.init(buf_page_t::NOT_USED, page_id_t(~0ULL)); +#ifdef BTR_CUR_HASH_ADAPT + MEM_MAKE_DEFINED(&block->index, sizeof block->index); + ut_ad(!block->index); +#endif /* BTR_CUR_HASH_ADAPT */ + ut_d(block->in_unzip_LRU_list = false); + ut_d(block->in_withdraw_list = false); + + page_zip_des_init(&block->page.zip); + + MEM_MAKE_DEFINED(&block->page.hash, sizeof block->page.hash); + ut_ad(!block->page.hash); +} + +/** Allocate a chunk of buffer frames. +@param bytes requested size +@return whether the allocation succeeded */ +inline bool buf_pool_t::chunk_t::create(size_t bytes) +{ + DBUG_EXECUTE_IF("ib_buf_chunk_init_fails", return false;); + /* Round down to a multiple of page size, although it already should be. */ + bytes= ut_2pow_round(bytes, srv_page_size); + + mem= buf_pool.allocator.allocate_large_dontdump(bytes, &mem_pfx); + + if (UNIV_UNLIKELY(!mem)) + return false; + + MEM_UNDEFINED(mem, mem_size()); + +#ifdef HAVE_LIBNUMA + if (srv_numa_interleave) + { + struct bitmask *numa_mems_allowed= numa_get_mems_allowed(); + if (mbind(mem, mem_size(), MPOL_INTERLEAVE, + numa_mems_allowed->maskp, numa_mems_allowed->size, + MPOL_MF_MOVE)) + { + ib::warn() << "Failed to set NUMA memory policy of" + " buffer pool page frames to MPOL_INTERLEAVE" + " (error: " << strerror(errno) << ")."; + } + numa_bitmask_free(numa_mems_allowed); + } +#endif /* HAVE_LIBNUMA */ + + + /* Allocate the block descriptors from + the start of the memory block. */ + blocks= reinterpret_cast(mem); + + /* Align a pointer to the first frame. Note that when + opt_large_page_size is smaller than srv_page_size, + (with max srv_page_size at 64k don't think any hardware + makes this true), + we may allocate one fewer block than requested. When + it is bigger, we may allocate more blocks than requested. */ + static_assert(sizeof(byte*) == sizeof(ulint), "pointer size"); + + byte *frame= reinterpret_cast((reinterpret_cast(mem) + + srv_page_size - 1) & + ~ulint{srv_page_size - 1}); + size= (mem_pfx.m_size >> srv_page_size_shift) - (frame != mem); + + /* Subtract the space needed for block descriptors. */ + { + ulint s= size; + + while (frame < reinterpret_cast(blocks + s)) + { + frame+= srv_page_size; + s--; + } + + size= s; + } + + /* Init block structs and assign frames for them. Then we assign the + frames to the first blocks (we already mapped the memory above). */ + + buf_block_t *block= blocks; + + for (auto i= size; i--; ) { + buf_block_init(block, frame); + MEM_UNDEFINED(block->page.frame, srv_page_size); + /* Add the block to the free list */ + UT_LIST_ADD_LAST(buf_pool.free, &block->page); + + ut_d(block->page.in_free_list = TRUE); + block++; + frame+= srv_page_size; + } + + reg(); + + return true; +} + +#ifdef UNIV_DEBUG +/** Check that all file pages in the buffer chunk are in a replaceable state. +@return address of a non-free block +@retval nullptr if all freed */ +inline const buf_block_t *buf_pool_t::chunk_t::not_freed() const +{ + buf_block_t *block= blocks; + for (auto i= size; i--; block++) + { + if (block->page.in_file()) + { + /* The uncompressed buffer pool should never + contain ROW_FORMAT=COMPRESSED block descriptors. 
*/ + ut_ad(block->page.frame); + const lsn_t lsn= block->page.oldest_modification(); + + if (srv_read_only_mode) + { + /* The page cleaner is disabled in read-only mode. No pages + can be dirtied, so all of them must be clean. */ + ut_ad(lsn == 0 || lsn == recv_sys.lsn || + srv_force_recovery == SRV_FORCE_NO_LOG_REDO); + break; + } + + if (fsp_is_system_temporary(block->page.id().space())) + { + ut_ad(lsn == 0 || lsn == 2); + break; + } + + if (lsn > 1 || !block->page.can_relocate()) + return block; + + break; + } + } + + return nullptr; +} +#endif /* UNIV_DEBUG */ + +/** Create the hash table. +@param n the lower bound of n_cells */ +void buf_pool_t::page_hash_table::create(ulint n) +{ + n_cells= ut_find_prime(n); + const size_t size= MY_ALIGN(pad(n_cells) * sizeof *array, + CPU_LEVEL1_DCACHE_LINESIZE); + void *v= aligned_malloc(size, CPU_LEVEL1_DCACHE_LINESIZE); + memset_aligned(v, 0, size); + array= static_cast(v); +} + +/** Create the buffer pool. +@return whether the creation failed */ +bool buf_pool_t::create() +{ + ut_ad(this == &buf_pool); + ut_ad(srv_buf_pool_size % srv_buf_pool_chunk_unit == 0); + ut_ad(!is_initialised()); + ut_ad(srv_buf_pool_size > 0); + ut_ad(!resizing); + ut_ad(!chunks_old); + /* mariabackup loads tablespaces, and it requires field_ref_zero to be + allocated before innodb initialization */ + ut_ad(srv_operation >= SRV_OPERATION_RESTORE || !field_ref_zero); + + NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE; + + if (!field_ref_zero) { + if (auto b= aligned_malloc(UNIV_PAGE_SIZE_MAX, 4096)) + field_ref_zero= static_cast + (memset_aligned<4096>(b, 0, UNIV_PAGE_SIZE_MAX)); + else + return true; + } + + chunk_t::map_reg= UT_NEW_NOKEY(chunk_t::map()); + + new(&allocator) ut_allocator(mem_key_buf_buf_pool); + + n_chunks= srv_buf_pool_size / srv_buf_pool_chunk_unit; + const size_t chunk_size= srv_buf_pool_chunk_unit; + + chunks= static_cast(ut_zalloc_nokey(n_chunks * sizeof *chunks)); + UT_LIST_INIT(free, &buf_page_t::list); + curr_size= 0; + auto chunk= chunks; + + do + { + if (!chunk->create(chunk_size)) + { + while (--chunk >= chunks) + { + buf_block_t* block= chunk->blocks; + + for (auto i= chunk->size; i--; block++) + block->page.lock.free(); + + allocator.deallocate_large_dodump(chunk->mem, &chunk->mem_pfx); + } + ut_free(chunks); + chunks= nullptr; + UT_DELETE(chunk_t::map_reg); + chunk_t::map_reg= nullptr; + aligned_free(const_cast(field_ref_zero)); + field_ref_zero= nullptr; + ut_ad(!is_initialised()); + return true; + } + + curr_size+= chunk->size; + } + while (++chunk < chunks + n_chunks); + + ut_ad(is_initialised()); +#if defined(__aarch64__) + mysql_mutex_init(buf_pool_mutex_key, &mutex, MY_MUTEX_INIT_FAST); +#else + mysql_mutex_init(buf_pool_mutex_key, &mutex, nullptr); +#endif + + UT_LIST_INIT(LRU, &buf_page_t::LRU); + UT_LIST_INIT(withdraw, &buf_page_t::list); + withdraw_target= 0; + UT_LIST_INIT(flush_list, &buf_page_t::list); + UT_LIST_INIT(unzip_LRU, &buf_block_t::unzip_LRU); + + for (size_t i= 0; i < UT_ARR_SIZE(zip_free); ++i) + UT_LIST_INIT(zip_free[i], &buf_buddy_free_t::list); + ulint s= curr_size; + s/= BUF_READ_AHEAD_PORTION; + read_ahead_area= s >= READ_AHEAD_PAGES + ? 
READ_AHEAD_PAGES + : my_round_up_to_next_power(static_cast(s)); + curr_pool_size= srv_buf_pool_size; + + n_chunks_new= n_chunks; + + page_hash.create(2 * curr_size); + zip_hash.create(2 * curr_size); + last_printout_time= time(NULL); + + mysql_mutex_init(flush_list_mutex_key, &flush_list_mutex, + MY_MUTEX_INIT_FAST); + + pthread_cond_init(&done_flush_LRU, nullptr); + pthread_cond_init(&done_flush_list, nullptr); + pthread_cond_init(&do_flush_list, nullptr); + pthread_cond_init(&done_free, nullptr); + + try_LRU_scan= true; + + ut_d(flush_hp.m_mutex= &flush_list_mutex;); + ut_d(lru_hp.m_mutex= &mutex); + ut_d(lru_scan_itr.m_mutex= &mutex); + + io_buf.create((srv_n_read_io_threads + srv_n_write_io_threads) * + OS_AIO_N_PENDING_IOS_PER_THREAD); + + /* FIXME: remove some of these variables */ + srv_buf_pool_curr_size= curr_pool_size; + srv_buf_pool_old_size= srv_buf_pool_size; + srv_buf_pool_base_size= srv_buf_pool_size; + + last_activity_count= srv_get_activity_count(); + + chunk_t::map_ref= chunk_t::map_reg; + buf_LRU_old_ratio_update(100 * 3 / 8, false); + btr_search_sys_create(); + ut_ad(is_initialised()); + return false; +} + +/** Clean up after successful create() */ +void buf_pool_t::close() +{ + ut_ad(this == &buf_pool); + if (!is_initialised()) + return; + + mysql_mutex_destroy(&mutex); + mysql_mutex_destroy(&flush_list_mutex); + + for (buf_page_t *bpage= UT_LIST_GET_LAST(LRU), *prev_bpage= nullptr; bpage; + bpage= prev_bpage) + { + prev_bpage= UT_LIST_GET_PREV(LRU, bpage); + ut_ad(bpage->in_file()); + ut_ad(bpage->in_LRU_list); + /* The buffer pool must be clean during normal shutdown. + Only on aborted startup (with recovery) or with innodb_fast_shutdown=2 + we may discard changes. */ + ut_d(const lsn_t oldest= bpage->oldest_modification();) + ut_ad(fsp_is_system_temporary(bpage->id().space()) + ? (oldest == 0 || oldest == 2) + : oldest <= 1 || srv_is_being_started || srv_fast_shutdown == 2); + + if (UNIV_UNLIKELY(!bpage->frame)) + { + bpage->lock.free(); + ut_free(bpage); + } + } + + for (auto chunk= chunks + n_chunks; --chunk >= chunks; ) + { + buf_block_t *block= chunk->blocks; + + for (auto i= chunk->size; i--; block++) + block->page.lock.free(); + + allocator.deallocate_large_dodump(chunk->mem, &chunk->mem_pfx); + } + + pthread_cond_destroy(&done_flush_LRU); + pthread_cond_destroy(&done_flush_list); + pthread_cond_destroy(&do_flush_list); + pthread_cond_destroy(&done_free); + + ut_free(chunks); + chunks= nullptr; + page_hash.free(); + zip_hash.free(); + + io_buf.close(); + UT_DELETE(chunk_t::map_reg); + chunk_t::map_reg= chunk_t::map_ref= nullptr; + aligned_free(const_cast(field_ref_zero)); + field_ref_zero= nullptr; +} + +/** Try to reallocate a control block. 
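+On success the page is copied into a block taken from buf_pool.free and
+the LRU, unzip_LRU, flush_list and page_hash references are moved to the
+new block; the old block is then freed as a non-file page.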
+@param block control block to reallocate +@return whether the reallocation succeeded */ +inline bool buf_pool_t::realloc(buf_block_t *block) +{ + buf_block_t* new_block; + + mysql_mutex_assert_owner(&mutex); + ut_ad(block->page.in_file()); + ut_ad(block->page.frame); + + new_block = buf_LRU_get_free_only(); + + if (new_block == NULL) { + mysql_mutex_lock(&buf_pool.flush_list_mutex); + page_cleaner_wakeup(); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + return(false); /* free list was not enough */ + } + + const page_id_t id{block->page.id()}; + hash_chain& chain = page_hash.cell_get(id.fold()); + page_hash_latch& hash_lock = page_hash.lock_get(chain); + /* It does not make sense to use transactional_lock_guard + here, because copying innodb_page_size (4096 to 65536) bytes + as well as other changes would likely make the memory + transaction too large. */ + hash_lock.lock(); + + if (block->page.can_relocate()) { + memcpy_aligned( + new_block->page.frame, block->page.frame, + srv_page_size); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + const auto frame = new_block->page.frame; + new_block->page.lock.free(); + new (&new_block->page) buf_page_t(block->page); + new_block->page.frame = frame; + + /* relocate LRU list */ + if (buf_page_t* prev_b = buf_pool.LRU_remove(&block->page)) { + UT_LIST_INSERT_AFTER(LRU, prev_b, &new_block->page); + } else { + UT_LIST_ADD_FIRST(LRU, &new_block->page); + } + + if (LRU_old == &block->page) { + LRU_old = &new_block->page; + } + + ut_ad(new_block->page.in_LRU_list); + + /* relocate unzip_LRU list */ + if (block->page.zip.data != NULL) { + ut_ad(block->in_unzip_LRU_list); + ut_d(new_block->in_unzip_LRU_list = true); + + buf_block_t* prev_block = UT_LIST_GET_PREV(unzip_LRU, block); + UT_LIST_REMOVE(unzip_LRU, block); + + ut_d(block->in_unzip_LRU_list = false); + block->page.zip.data = NULL; + page_zip_set_size(&block->page.zip, 0); + + if (prev_block != NULL) { + UT_LIST_INSERT_AFTER(unzip_LRU, prev_block, new_block); + } else { + UT_LIST_ADD_FIRST(unzip_LRU, new_block); + } + } else { + ut_ad(!block->in_unzip_LRU_list); + ut_d(new_block->in_unzip_LRU_list = false); + } + + /* relocate page_hash */ + hash_chain& chain = page_hash.cell_get(id.fold()); + ut_ad(&block->page == page_hash.get(id, chain)); + buf_pool.page_hash.replace(chain, &block->page, + &new_block->page); + buf_block_modify_clock_inc(block); + static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment"); + memset_aligned<4>(block->page.frame + + FIL_PAGE_OFFSET, 0xff, 4); + static_assert(FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID % 4 == 2, + "not perfect alignment"); + memset_aligned<2>(block->page.frame + + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xff, 4); + MEM_UNDEFINED(block->page.frame, srv_page_size); + block->page.set_state(buf_page_t::REMOVE_HASH); + if (!fsp_is_system_temporary(id.space())) { + buf_flush_relocate_on_flush_list(&block->page, + &new_block->page); + } + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + block->page.set_corrupt_id(); + + /* set other flags of buf_block_t */ + +#ifdef BTR_CUR_HASH_ADAPT + /* This code should only be executed by resize(), + while the adaptive hash index is disabled. 
*/ + assert_block_ahi_empty(block); + assert_block_ahi_empty_on_init(new_block); + ut_ad(!block->index); + new_block->index = NULL; + new_block->n_hash_helps = 0; + new_block->n_fields = 1; + new_block->left_side = TRUE; +#endif /* BTR_CUR_HASH_ADAPT */ + ut_d(block->page.set_state(buf_page_t::MEMORY)); + /* free block */ + new_block = block; + } + + hash_lock.unlock(); + buf_LRU_block_free_non_file_page(new_block); + return(true); /* free_list was enough */ +} + +void buf_pool_t::io_buf_t::create(ulint n_slots) +{ + this->n_slots= n_slots; + slots= static_cast + (ut_malloc_nokey(n_slots * sizeof *slots)); + memset((void*) slots, 0, n_slots * sizeof *slots); +} + +void buf_pool_t::io_buf_t::close() +{ + for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++) + { + aligned_free(s->crypt_buf); + aligned_free(s->comp_buf); + } + ut_free(slots); + slots= nullptr; + n_slots= 0; +} + +buf_tmp_buffer_t *buf_pool_t::io_buf_t::reserve() +{ + for (;;) + { + for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++) + if (s->acquire()) + return s; + os_aio_wait_until_no_pending_writes(true); + for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++) + if (s->acquire()) + return s; + os_aio_wait_until_no_pending_reads(true); + } +} + +/** Sets the global variable that feeds MySQL's innodb_buffer_pool_resize_status +to the specified string. The format and the following parameters are the +same as the ones used for printf(3). +@param[in] fmt format +@param[in] ... extra parameters according to fmt */ +static +void +buf_resize_status( + const char* fmt, + ...) +{ + va_list ap; + + va_start(ap, fmt); + + vsnprintf( + export_vars.innodb_buffer_pool_resize_status, + sizeof(export_vars.innodb_buffer_pool_resize_status), + fmt, ap); + + va_end(ap); + + ib::info() << export_vars.innodb_buffer_pool_resize_status; +} + +/** Withdraw blocks from the buffer pool until meeting withdraw_target. 
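+Each loop iteration moves blocks of the withdrawn area from buf_pool.free
+to the withdraw list, issues an LRU flush to refill the free list, and
+relocates frames and compressed-page buddies that still reside in the
+area; after 10 iterations the function gives up and asks to be retried.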
+@return whether retry is needed */ +inline bool buf_pool_t::withdraw_blocks() +{ + buf_block_t* block; + ulint loop_count = 0; + + ib::info() << "Start to withdraw the last " + << withdraw_target << " blocks."; + + while (UT_LIST_GET_LEN(withdraw) < withdraw_target) { + + /* try to withdraw from free_list */ + ulint count1 = 0; + + mysql_mutex_lock(&mutex); + buf_buddy_condense_free(); + block = reinterpret_cast( + UT_LIST_GET_FIRST(free)); + while (block != NULL + && UT_LIST_GET_LEN(withdraw) < withdraw_target) { + ut_ad(block->page.in_free_list); + ut_ad(!block->page.oldest_modification()); + ut_ad(!block->page.in_LRU_list); + ut_a(!block->page.in_file()); + + buf_block_t* next_block; + next_block = reinterpret_cast( + UT_LIST_GET_NEXT( + list, &block->page)); + + if (will_be_withdrawn(block->page)) { + /* This should be withdrawn */ + UT_LIST_REMOVE(free, &block->page); + UT_LIST_ADD_LAST(withdraw, &block->page); + ut_d(block->in_withdraw_list = true); + count1++; + } + + block = next_block; + } + + /* reserve free_list length */ + if (UT_LIST_GET_LEN(withdraw) < withdraw_target) { + buf_flush_LRU( + std::max(withdraw_target + - UT_LIST_GET_LEN(withdraw), + srv_LRU_scan_depth), + true); + mysql_mutex_unlock(&buf_pool.mutex); + buf_dblwr.flush_buffered_writes(); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_flush_wait_LRU_batch_end(); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + mysql_mutex_lock(&buf_pool.mutex); + } + + /* relocate blocks/buddies in withdrawn area */ + ulint count2 = 0; + + buf_pool_mutex_exit_forbid(); + for (buf_page_t* bpage = UT_LIST_GET_FIRST(LRU), *next_bpage; + bpage; bpage = next_bpage) { + ut_ad(bpage->in_file()); + next_bpage = UT_LIST_GET_NEXT(LRU, bpage); + if (UNIV_LIKELY_NULL(bpage->zip.data) + && will_be_withdrawn(bpage->zip.data) + && bpage->can_relocate()) { + if (!buf_buddy_realloc( + bpage->zip.data, + page_zip_get_size(&bpage->zip))) { + /* failed to allocate block */ + break; + } + count2++; + if (bpage->frame) { + goto realloc_frame; + } + } + + if (bpage->frame && will_be_withdrawn(*bpage) + && bpage->can_relocate()) { +realloc_frame: + if (!realloc(reinterpret_cast( + bpage))) { + /* failed to allocate block */ + break; + } + count2++; + } + } + buf_pool_mutex_exit_allow(); + mysql_mutex_unlock(&mutex); + + buf_resize_status( + "Withdrawing blocks. (" ULINTPF "/" ULINTPF ").", + UT_LIST_GET_LEN(withdraw), + withdraw_target); + + ib::info() << "Withdrew " + << count1 << " blocks from free list." + << " Tried to relocate " << count2 << " blocks (" + << UT_LIST_GET_LEN(withdraw) << "/" + << withdraw_target << ")."; + + if (++loop_count >= 10) { + /* give up for now. + retried after user threads paused. 
*/ + + ib::info() << "will retry to withdraw later"; + + /* need retry later */ + return(true); + } + } + + /* confirm withdrawn enough */ + for (const chunk_t* chunk = chunks + n_chunks_new, + * const echunk = chunks + n_chunks; chunk != echunk; chunk++) { + block = chunk->blocks; + for (ulint j = chunk->size; j--; block++) { + ut_a(block->page.state() == buf_page_t::NOT_USED); + ut_ad(block->in_withdraw_list); + } + } + + ib::info() << "Withdrawn target: " << UT_LIST_GET_LEN(withdraw) + << " blocks."; + + return(false); +} + + + +inline void buf_pool_t::page_hash_table::write_lock_all() +{ + for (auto n= pad(n_cells) & ~ELEMENTS_PER_LATCH;; n-= ELEMENTS_PER_LATCH + 1) + { + reinterpret_cast(array[n]).lock(); + if (!n) + break; + } +} + + +inline void buf_pool_t::page_hash_table::write_unlock_all() +{ + for (auto n= pad(n_cells) & ~ELEMENTS_PER_LATCH;; n-= ELEMENTS_PER_LATCH + 1) + { + reinterpret_cast(array[n]).unlock(); + if (!n) + break; + } +} + + +namespace +{ + +struct find_interesting_trx +{ + void operator()(const trx_t &trx) + { + if (trx.state == TRX_STATE_NOT_STARTED) + return; + if (trx.mysql_thd == nullptr) + return; + if (withdraw_started <= trx.start_time_micro) + return; + + if (!found) + { + ib::warn() << "The following trx might hold " + "the blocks in buffer pool to " + "be withdrawn. Buffer pool " + "resizing can complete only " + "after all the transactions " + "below release the blocks."; + found= true; + } + + lock_trx_print_wait_and_mvcc_state(stderr, &trx, current_time); + } + + bool &found; + /** microsecond_interval_timer() */ + const ulonglong withdraw_started; + const my_hrtime_t current_time; +}; + +} // namespace + +/** Resize from srv_buf_pool_old_size to srv_buf_pool_size. */ +inline void buf_pool_t::resize() +{ + ut_ad(this == &buf_pool); + + bool warning = false; + + NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE; + + ut_ad(!resize_in_progress()); + ut_ad(srv_buf_pool_chunk_unit > 0); + + ulint new_instance_size = srv_buf_pool_size >> srv_page_size_shift; + std::ostringstream str_old_size, str_new_size, str_chunk_size; + str_old_size << ib::bytes_iec{srv_buf_pool_old_size}; + str_new_size << ib::bytes_iec{srv_buf_pool_size}; + str_chunk_size << ib::bytes_iec{srv_buf_pool_chunk_unit}; + + buf_resize_status("Resizing buffer pool from %s to %s (unit = %s).", + str_old_size.str().c_str(), + str_new_size.str().c_str(), + str_chunk_size.str().c_str()); + +#ifdef BTR_CUR_HASH_ADAPT + /* disable AHI if needed */ + buf_resize_status("Disabling adaptive hash index."); + + btr_search_s_lock_all(); + const bool btr_search_disabled = btr_search_enabled; + btr_search_s_unlock_all(); + + btr_search_disable(); + + if (btr_search_disabled) { + ib::info() << "disabled adaptive hash index."; + } +#endif /* BTR_CUR_HASH_ADAPT */ + + mysql_mutex_lock(&mutex); + ut_ad(n_chunks_new == n_chunks); + ut_ad(UT_LIST_GET_LEN(withdraw) == 0); + + n_chunks_new = (new_instance_size << srv_page_size_shift) + / srv_buf_pool_chunk_unit; + curr_size = n_chunks_new * chunks->size; + mysql_mutex_unlock(&mutex); + + if (is_shrinking()) { + /* set withdraw target */ + size_t w = 0; + + for (const chunk_t* chunk = chunks + n_chunks_new, + * const echunk = chunks + n_chunks; + chunk != echunk; chunk++) + w += chunk->size; + + ut_ad(withdraw_target == 0); + withdraw_target = w; + } + + buf_resize_status("Withdrawing blocks to be shrunken."); + + ulonglong withdraw_started = microsecond_interval_timer(); + ulonglong message_interval = 60ULL * 1000 * 1000; + ulint retry_interval = 1; + +withdraw_retry: + /* 
wait for the number of blocks fit to the new size (if needed)*/ + bool should_retry_withdraw = is_shrinking() + && withdraw_blocks(); + + if (srv_shutdown_state != SRV_SHUTDOWN_NONE) { + /* abort to resize for shutdown. */ + return; + } + + /* abort buffer pool load */ + buf_load_abort(); + + const ulonglong current_time = microsecond_interval_timer(); + + if (should_retry_withdraw + && current_time - withdraw_started >= message_interval) { + + if (message_interval > 900000000) { + message_interval = 1800000000; + } else { + message_interval *= 2; + } + + bool found= false; + find_interesting_trx f + {found, withdraw_started, my_hrtime_coarse()}; + withdraw_started = current_time; + + /* This is going to exceed the maximum size of a + memory transaction. */ + LockMutexGuard g{SRW_LOCK_CALL}; + trx_sys.trx_list.for_each(f); + } + + if (should_retry_withdraw) { + ib::info() << "Will retry to withdraw " << retry_interval + << " seconds later."; + std::this_thread::sleep_for( + std::chrono::seconds(retry_interval)); + + if (retry_interval > 5) { + retry_interval = 10; + } else { + retry_interval *= 2; + } + + goto withdraw_retry; + } + + buf_resize_status("Latching entire buffer pool."); + +#ifndef DBUG_OFF + { + bool should_wait = true; + + while (should_wait) { + should_wait = false; + DBUG_EXECUTE_IF( + "ib_buf_pool_resize_wait_before_resize", + should_wait = true; + std::this_thread::sleep_for( + std::chrono::milliseconds(10));); + } + } +#endif /* !DBUG_OFF */ + + if (srv_shutdown_state != SRV_SHUTDOWN_NONE) { + return; + } + + /* Indicate critical path */ + resizing.store(true, std::memory_order_relaxed); + + mysql_mutex_lock(&mutex); + page_hash.write_lock_all(); + + chunk_t::map_reg = UT_NEW_NOKEY(chunk_t::map()); + + /* add/delete chunks */ + + buf_resize_status("Resizing buffer pool from " + ULINTPF " chunks to " ULINTPF " chunks.", + n_chunks, n_chunks_new); + + if (is_shrinking()) { + /* delete chunks */ + chunk_t* chunk = chunks + n_chunks_new; + const chunk_t* const echunk = chunks + n_chunks; + + ulint sum_freed = 0; + + while (chunk < echunk) { + /* buf_LRU_block_free_non_file_page() invokes + MEM_NOACCESS() on any buf_pool.free blocks. + We must cancel the effect of that. In + MemorySanitizer, MEM_NOACCESS() is no-op, so + we must not do anything special for it here. 
*/
+#ifdef HAVE_valgrind
+# if !__has_feature(memory_sanitizer)
+			MEM_MAKE_DEFINED(chunk->mem, chunk->mem_size());
+# endif
+#else
+			MEM_MAKE_ADDRESSABLE(chunk->mem, chunk->size);
+#endif
+
+			buf_block_t*	block = chunk->blocks;
+
+			for (ulint j = chunk->size; j--; block++) {
+				block->page.lock.free();
+			}
+
+			allocator.deallocate_large_dodump(
+				chunk->mem, &chunk->mem_pfx);
+			sum_freed += chunk->size;
+			++chunk;
+		}
+
+		/* discard withdraw list */
+		UT_LIST_INIT(withdraw, &buf_page_t::list);
+		withdraw_target = 0;
+
+		ib::info() << n_chunks - n_chunks_new
+			<< " Chunks (" << sum_freed
+			<< " blocks) were freed.";
+
+		n_chunks = n_chunks_new;
+	}
+
+	{
+		/* reallocate chunks */
+		const size_t	new_chunks_size
+			= n_chunks_new * sizeof(chunk_t);
+
+		chunk_t*	new_chunks = static_cast<chunk_t*>(
+			ut_zalloc_nokey_nofatal(new_chunks_size));
+
+		DBUG_EXECUTE_IF("buf_pool_resize_chunk_null",
+				ut_free(new_chunks); new_chunks= nullptr; );
+
+		if (!new_chunks) {
+			ib::error() << "failed to allocate"
+				" the chunk array.";
+			n_chunks_new = n_chunks;
+			warning = true;
+			chunks_old = NULL;
+			goto calc_buf_pool_size;
+		}
+
+		ulint	n_chunks_copy = ut_min(n_chunks_new, n_chunks);
+
+		memcpy(new_chunks, chunks,
+		       n_chunks_copy * sizeof *new_chunks);
+
+		for (ulint j = 0; j < n_chunks_copy; j++) {
+			new_chunks[j].reg();
+		}
+
+		chunks_old = chunks;
+		chunks = new_chunks;
+	}
+
+	if (n_chunks_new > n_chunks) {
+		/* add chunks */
+		ulint	sum_added = 0;
+		ulint	n = n_chunks;
+		const size_t unit = srv_buf_pool_chunk_unit;
+
+		for (chunk_t* chunk = chunks + n_chunks,
+		     * const echunk = chunks + n_chunks_new;
+		     chunk != echunk; chunk++) {
+			if (!chunk->create(unit)) {
+				ib::error() << "failed to allocate"
+					" memory for buffer pool chunk";
+
+				warning = true;
+				n_chunks_new = n_chunks;
+				break;
+			}
+
+			sum_added += chunk->size;
+			++n;
+		}
+
+		ib::info() << n_chunks_new - n_chunks
+			<< " chunks (" << sum_added
+			<< " blocks) were added.";
+
+		n_chunks = n;
+	}
+calc_buf_pool_size:
+	/* recalc curr_size */
+	ulint	new_size = 0;
+
+	{
+		chunk_t* chunk = chunks;
+		const chunk_t* const echunk = chunk + n_chunks;
+		do {
+			new_size += chunk->size;
+		} while (++chunk != echunk);
+	}
+
+	curr_size = new_size;
+	n_chunks_new = n_chunks;
+
+	if (chunks_old) {
+		ut_free(chunks_old);
+		chunks_old = NULL;
+	}
+
+	chunk_t::map* chunk_map_old = chunk_t::map_ref;
+	chunk_t::map_ref = chunk_t::map_reg;
+
+	/* set size */
+	ut_ad(UT_LIST_GET_LEN(withdraw) == 0);
+	ulint s= curr_size;
+	s/= BUF_READ_AHEAD_PORTION;
+	read_ahead_area= s >= READ_AHEAD_PAGES
+		? READ_AHEAD_PAGES
+		: my_round_up_to_next_power(static_cast<uint32>(s));
+	curr_pool_size= n_chunks * srv_buf_pool_chunk_unit;
+	srv_buf_pool_curr_size= curr_pool_size;/* FIXME: remove*/
+	extern ulonglong innobase_buffer_pool_size;
+	innobase_buffer_pool_size= buf_pool_size_align(srv_buf_pool_curr_size);
+
+	const bool new_size_too_diff
+		= srv_buf_pool_base_size > srv_buf_pool_size * 2
+		|| srv_buf_pool_base_size * 2 < srv_buf_pool_size;
+
+	mysql_mutex_unlock(&mutex);
+	page_hash.write_unlock_all();
+
+	UT_DELETE(chunk_map_old);
+
+	resizing.store(false, std::memory_order_relaxed);
+
+	/* Normalize other components, if the new size is too different */
+	if (!warning && new_size_too_diff) {
+		srv_buf_pool_base_size = srv_buf_pool_size;
+
+		buf_resize_status("Resizing other hash tables.");
+
+		srv_lock_table_size = 5
+			* (srv_buf_pool_size >> srv_page_size_shift);
+		lock_sys.resize(srv_lock_table_size);
+		dict_sys.resize();
+
+		ib::info() << "Resized hash tables: lock_sys,"
+#ifdef BTR_CUR_HASH_ADAPT
+			" adaptive hash index,"
+#endif /* BTR_CUR_HASH_ADAPT */
+			" and dictionary.";
+	}
+
+	/* normalize ibuf.max_size */
+	ibuf_max_size_update(srv_change_buffer_max_size);
+
+	if (srv_buf_pool_old_size != srv_buf_pool_size) {
+		buf_resize_status("Completed resizing buffer pool from %zu"
+				  " to %zu bytes.",
+				  srv_buf_pool_old_size, srv_buf_pool_size);
+		srv_buf_pool_old_size = srv_buf_pool_size;
+	}
+
+#ifdef BTR_CUR_HASH_ADAPT
+	/* enable AHI if needed */
+	if (btr_search_disabled) {
+		btr_search_enable(true);
+		ib::info() << "Re-enabled adaptive hash index.";
+	}
+#endif /* BTR_CUR_HASH_ADAPT */
+
+	if (warning)
+		buf_resize_status("Resizing buffer pool failed");
+
+	ut_d(validate());
+
+	return;
+}
+
+/** Thread pool task invoked by innodb_buffer_pool_size changes. */
+static void buf_resize_callback(void *)
+{
+  DBUG_ENTER("buf_resize_callback");
+  ut_ad(srv_shutdown_state < SRV_SHUTDOWN_CLEANUP);
+  mysql_mutex_lock(&buf_pool.mutex);
+  const auto size= srv_buf_pool_size;
+  const bool work= srv_buf_pool_old_size != size;
+  mysql_mutex_unlock(&buf_pool.mutex);
+
+  if (work)
+    buf_pool.resize();
+  else
+  {
+    std::ostringstream sout;
+    sout << "Size did not change: old size = new size = " << size;
+    buf_resize_status(sout.str().c_str());
+  }
+  DBUG_VOID_RETURN;
+}
+
+/* Ensure that the task does not run in parallel, by setting
+max_concurrency to 1 for the thread group */
+static tpool::task_group single_threaded_group(1);
+static tpool::waitable_task buf_resize_task(buf_resize_callback,
+                                            nullptr, &single_threaded_group);
+
+void buf_resize_start()
+{
+  srv_thread_pool->submit_task(&buf_resize_task);
+}
+
+void buf_resize_shutdown()
+{
+  buf_resize_task.wait();
+}
+
+/** Relocate a ROW_FORMAT=COMPRESSED block in the LRU list and
+buf_pool.page_hash.
+The caller must relocate bpage->list.
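+A minimal sketch of the caller's obligations (assuming buf_pool.mutex and
+the page_hash latch for the page are already held; this mirrors how the
+function is invoked later in this file):
+@code
+  // relocate a compressed-only page and fix up the flush list
+  mysql_mutex_lock(&buf_pool.flush_list_mutex);
+  buf_relocate(bpage, &new_block->page);
+  buf_flush_relocate_on_flush_list(bpage, &new_block->page);
+  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+@endcode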
+@param bpage ROW_FORMAT=COMPRESSED only block +@param dpage destination control block */ +static void buf_relocate(buf_page_t *bpage, buf_page_t *dpage) +{ + const page_id_t id{bpage->id()}; + buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(id.fold()); + ut_ad(!bpage->frame); + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_ad(buf_pool.page_hash.lock_get(chain).is_write_locked()); + ut_ad(bpage == buf_pool.page_hash.get(id, chain)); + ut_ad(!buf_pool.watch_is_sentinel(*bpage)); + ut_d(const auto state= bpage->state()); + ut_ad(state >= buf_page_t::FREED); + ut_ad(state <= buf_page_t::READ_FIX); + ut_ad(bpage->lock.is_write_locked()); + const auto frame= dpage->frame; + + dpage->lock.free(); + new (dpage) buf_page_t(*bpage); + + dpage->frame= frame; + + /* Important that we adjust the hazard pointer before + removing bpage from LRU list. */ + if (buf_page_t *b= buf_pool.LRU_remove(bpage)) + UT_LIST_INSERT_AFTER(buf_pool.LRU, b, dpage); + else + UT_LIST_ADD_FIRST(buf_pool.LRU, dpage); + + if (UNIV_UNLIKELY(buf_pool.LRU_old == bpage)) + { + buf_pool.LRU_old= dpage; +#ifdef UNIV_LRU_DEBUG + /* buf_pool.LRU_old must be the first item in the LRU list + whose "old" flag is set. */ + ut_a(buf_pool.LRU_old->old); + ut_a(!UT_LIST_GET_PREV(LRU, buf_pool.LRU_old) || + !UT_LIST_GET_PREV(LRU, buf_pool.LRU_old)->old); + ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old) || + UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old)->old); + } + else + { + /* Check that the "old" flag is consistent in + the block and its neighbours. */ + dpage->set_old(dpage->is_old()); +#endif /* UNIV_LRU_DEBUG */ + } + + ut_d(CheckInLRUList::validate()); + + buf_pool.page_hash.replace(chain, bpage, dpage); +} + +buf_page_t *buf_pool_t::watch_set(const page_id_t id, + buf_pool_t::hash_chain &chain) +{ + ut_ad(&chain == &page_hash.cell_get(id.fold())); + page_hash.lock_get(chain).lock(); + + buf_page_t *bpage= page_hash.get(id, chain); + + if (bpage) + { +got_block: + bpage->fix(); + if (watch_is_sentinel(*bpage)) + bpage= nullptr; + page_hash.lock_get(chain).unlock(); + return bpage; + } + + page_hash.lock_get(chain).unlock(); + /* Allocate a watch[] and then try to insert it into the page_hash. */ + mysql_mutex_lock(&mutex); + + /* The maximum number of purge tasks should never exceed + the UT_ARR_SIZE(watch) - 1, and there is no way for a purge task to hold a + watch when setting another watch. */ + for (buf_page_t *w= &watch[UT_ARR_SIZE(watch)]; w-- >= watch; ) + { + ut_ad(w->access_time == 0); + ut_ad(!w->oldest_modification()); + ut_ad(!w->zip.data); + ut_ad(!w->in_zip_hash); + static_assert(buf_page_t::NOT_USED == 0, "efficiency"); + if (ut_d(auto s=) w->state()) + { + /* This watch may be in use for some other page. */ + ut_ad(s >= buf_page_t::UNFIXED); + continue; + } + /* w is pointing to watch[], which is protected by mutex. + Normally, buf_page_t::id for objects that are reachable by + page_hash.get(id, chain) are protected by hash_lock. */ + w->set_state(buf_page_t::UNFIXED + 1); + w->id_= id; + + page_hash.lock_get(chain).lock(); + bpage= page_hash.get(id, chain); + if (UNIV_LIKELY_NULL(bpage)) + { + w->set_state(buf_page_t::NOT_USED); + mysql_mutex_unlock(&mutex); + goto got_block; + } + + ut_ad(w->state() == buf_page_t::UNFIXED + 1); + buf_pool.page_hash.append(chain, w); + mysql_mutex_unlock(&mutex); + page_hash.lock_get(chain).unlock(); + return nullptr; + } + + ut_error; +} + +/** Stop watching whether a page has been read in. +watch_set(id) must have returned nullptr before. 
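+A sketch of the typical pairing with watch_set() (assuming the caller
+computed chain as buf_pool.page_hash.cell_get(id.fold())):
+@code
+  if (!buf_pool.watch_set(id, chain))
+  {
+    /* the page was not yet read in; a watch sentinel is now armed */
+    buf_pool.watch_unset(id, chain); /* stop watching, free the sentinel */
+  }
+@endcode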
+@param id	page identifier
+@param chain	unlocked hash table chain */
+TRANSACTIONAL_TARGET
+void buf_pool_t::watch_unset(const page_id_t id, buf_pool_t::hash_chain &chain)
+{
+  mysql_mutex_assert_not_owner(&mutex);
+  buf_page_t *w;
+  {
+    transactional_lock_guard<page_hash_latch> g{page_hash.lock_get(chain)};
+    /* The page must exist because watch_set() did fix(). */
+    w= page_hash.get(id, chain);
+    ut_ad(w->in_page_hash);
+    if (!watch_is_sentinel(*w))
+    {
+no_watch:
+      w->unfix();
+      w= nullptr;
+    }
+    else
+    {
+      const auto state= w->state();
+      ut_ad(~buf_page_t::LRU_MASK & state);
+      ut_ad(state >= buf_page_t::UNFIXED + 1);
+      if (state != buf_page_t::UNFIXED + 1)
+        goto no_watch;
+    }
+  }
+
+  if (!w)
+    return;
+
+  const auto old= w;
+  /* The following is based on buf_pool_t::watch_remove(). */
+  mysql_mutex_lock(&mutex);
+  w= page_hash.get(id, chain);
+
+  {
+    transactional_lock_guard<page_hash_latch> g
+      {buf_pool.page_hash.lock_get(chain)};
+    auto f= w->unfix();
+    ut_ad(f < buf_page_t::READ_FIX || w != old);
+
+    if (f == buf_page_t::UNFIXED && w == old)
+    {
+      page_hash.remove(chain, w);
+      // Now that w is detached from page_hash, release it to watch[].
+      ut_ad(w->id_ == id);
+      ut_ad(!w->frame);
+      ut_ad(!w->zip.data);
+      w->set_state(buf_page_t::NOT_USED);
+    }
+  }
+
+  mysql_mutex_unlock(&mutex);
+}
+
+/** Mark the page status as FREED for the given tablespace and page number.
+@param[in,out]	space	tablespace
+@param[in]	page	page number
+@param[in,out]	mtr	mini-transaction */
+TRANSACTIONAL_TARGET
+void buf_page_free(fil_space_t *space, uint32_t page, mtr_t *mtr)
+{
+  ut_ad(mtr);
+  ut_ad(mtr->is_active());
+
+  if (srv_immediate_scrub_data_uncompressed
+#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32
+      || space->is_compressed()
+#endif
+      )
+    mtr->add_freed_offset(space, page);
+
+  ++buf_pool.stat.n_page_gets;
+  const page_id_t page_id(space->id, page);
+  buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold());
+  uint32_t fix;
+  buf_block_t *block;
+  {
+    transactional_shared_lock_guard<page_hash_latch> g
+      {buf_pool.page_hash.lock_get(chain)};
+    block= reinterpret_cast<buf_block_t*>
+      (buf_pool.page_hash.get(page_id, chain));
+    if (!block || !block->page.frame)
+      /* FIXME: convert ROW_FORMAT=COMPRESSED, without buf_zip_decompress() */
+      return;
+    /* To avoid a deadlock with buf_LRU_free_page() of some other page
+    and buf_page_write_complete() of this page, we must not wait for a
+    page latch while holding a page_hash latch. */
+    fix= block->page.fix();
+  }
+
+  if (UNIV_UNLIKELY(fix < buf_page_t::UNFIXED))
+  {
+    block->page.unfix();
+    return;
+  }
+
+  block->page.lock.x_lock();
+  if (block->page.is_ibuf_exist())
+    ibuf_merge_or_delete_for_page(nullptr, page_id, block->page.zip_size());
+#ifdef BTR_CUR_HASH_ADAPT
+  if (block->index)
+    btr_search_drop_page_hash_index(block, false);
+#endif /* BTR_CUR_HASH_ADAPT */
+  block->page.set_freed(block->page.state());
+  mtr->memo_push(block, MTR_MEMO_PAGE_X_MODIFY);
+}
+
+/** Get read access to a compressed page (usually of type
+FIL_PAGE_TYPE_ZBLOB or FIL_PAGE_TYPE_ZBLOB2).
+The page must be released with unfix().
+NOTE: the page is not protected by any latch. Mutual exclusion has to
+be implemented at a higher level. In other words, all possible
+accesses to a given page through this function must be protected by
+the same set of mutexes or latches.
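+A sketch of a typical call (the higher-level mutual exclusion mentioned
+above is the caller's duty):
+@code
+  if (buf_page_t *bpage= buf_page_get_zip(page_id, zip_size))
+  {
+    /* bpage->zip.data is S-latched and buffer-fixed here */
+    bpage->lock.s_unlock();
+    bpage->unfix();
+  }
+@endcode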
+@param page_id page identifier +@param zip_size ROW_FORMAT=COMPRESSED page size in bytes +@return pointer to the block, s-latched */ +TRANSACTIONAL_TARGET +buf_page_t* buf_page_get_zip(const page_id_t page_id, ulint zip_size) +{ + ut_ad(zip_size); + ut_ad(ut_is_2pow(zip_size)); + ++buf_pool.stat.n_page_gets; + mariadb_increment_pages_accessed(); + + buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold()); + page_hash_latch &hash_lock= buf_pool.page_hash.lock_get(chain); + buf_page_t *bpage; + +lookup: + for (bool discard_attempted= false;;) + { +#ifndef NO_ELISION + if (xbegin()) + { + if (hash_lock.is_locked()) + xabort(); + bpage= buf_pool.page_hash.get(page_id, chain); + if (!bpage || buf_pool.watch_is_sentinel(*bpage)) + { + xend(); + goto must_read_page; + } + if (!bpage->zip.data) + { + /* There is no ROW_FORMAT=COMPRESSED page. */ + xend(); + return nullptr; + } + if (discard_attempted || !bpage->frame) + { + if (!bpage->lock.s_lock_try()) + xabort(); + xend(); + break; + } + xend(); + } + else +#endif + { + hash_lock.lock_shared(); + bpage= buf_pool.page_hash.get(page_id, chain); + if (!bpage || buf_pool.watch_is_sentinel(*bpage)) + { + hash_lock.unlock_shared(); + goto must_read_page; + } + + ut_ad(bpage->in_file()); + ut_ad(page_id == bpage->id()); + + if (!bpage->zip.data) + { + /* There is no ROW_FORMAT=COMPRESSED page. */ + hash_lock.unlock_shared(); + return nullptr; + } + + if (discard_attempted || !bpage->frame) + { + /* Even when we are holding a hash_lock, it should be + acceptable to wait for a page S-latch here, because + buf_page_t::read_complete() will not wait for buf_pool.mutex, + and because S-latch would not conflict with a U-latch + that would be protecting buf_page_t::write_complete(). */ + bpage->lock.s_lock(); + hash_lock.unlock_shared(); + break; + } + + hash_lock.unlock_shared(); + } + + discard_attempted= true; + mysql_mutex_lock(&buf_pool.mutex); + if (buf_page_t *bpage= buf_pool.page_hash.get(page_id, chain)) + buf_LRU_free_page(bpage, false); + mysql_mutex_unlock(&buf_pool.mutex); + } + + { + ut_d(const auto s=) bpage->fix(); + ut_ad(s >= buf_page_t::UNFIXED); + ut_ad(s < buf_page_t::READ_FIX || s >= buf_page_t::WRITE_FIX); + } + + bpage->set_accessed(); + buf_page_make_young_if_needed(bpage); + +#ifdef UNIV_DEBUG + if (!(++buf_dbg_counter % 5771)) buf_pool.validate(); +#endif /* UNIV_DEBUG */ + return bpage; + +must_read_page: + switch (dberr_t err= buf_read_page(page_id, zip_size)) { + case DB_SUCCESS: + case DB_SUCCESS_LOCKED_REC: + mariadb_increment_pages_read(); + goto lookup; + default: + ib::error() << "Reading compressed page " << page_id + << " failed with error: " << err; + return nullptr; + } +} + +/********************************************************************//** +Initialize some fields of a control block. */ +UNIV_INLINE +void +buf_block_init_low( +/*===============*/ + buf_block_t* block) /*!< in: block to init */ +{ +#ifdef BTR_CUR_HASH_ADAPT + /* No adaptive hash index entries may point to a previously + unused (and now freshly allocated) block. */ + assert_block_ahi_empty_on_init(block); + block->index = NULL; + + block->n_hash_helps = 0; + block->n_fields = 1; + block->n_bytes = 0; + block->left_side = TRUE; +#endif /* BTR_CUR_HASH_ADAPT */ +} + +/********************************************************************//** +Decompress a block. 
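+A sketch of the typical call, as in buf_page_get_low() below
+(check=false skips the checksum verification):
+@code
+  const bool ok= buf_zip_decompress(block, false);
+  if (!ok)
+    err= DB_PAGE_CORRUPTED; /* the caller evicts the page or fails the read */
+@endcode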
+@return TRUE if successful */ +ibool +buf_zip_decompress( +/*===============*/ + buf_block_t* block, /*!< in/out: block */ + ibool check) /*!< in: TRUE=verify the page checksum */ +{ + const byte* frame = block->page.zip.data; + ulint size = page_zip_get_size(&block->page.zip); + /* The tablespace will not be found if this function is called + during IMPORT. */ + fil_space_t* space= fil_space_t::get(block->page.id().space()); + const unsigned key_version = mach_read_from_4( + frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION); + fil_space_crypt_t* crypt_data = space ? space->crypt_data : NULL; + const bool encrypted = crypt_data + && crypt_data->type != CRYPT_SCHEME_UNENCRYPTED + && (!crypt_data->is_default_encryption() + || srv_encrypt_tables); + + ut_ad(block->zip_size()); + ut_a(block->page.id().space() != 0); + + if (UNIV_UNLIKELY(check && !page_zip_verify_checksum(frame, size))) { + + ib::error() << "Compressed page checksum mismatch for " + << (space ? space->chain.start->name : "") + << block->page.id() << ": stored: " + << mach_read_from_4(frame + FIL_PAGE_SPACE_OR_CHKSUM) + << ", crc32: " + << page_zip_calc_checksum(frame, size, false) + << " adler32: " + << page_zip_calc_checksum(frame, size, true); + goto err_exit; + } + + switch (fil_page_get_type(frame)) { + case FIL_PAGE_INDEX: + case FIL_PAGE_RTREE: + if (page_zip_decompress(&block->page.zip, + block->page.frame, TRUE)) { +func_exit: + if (space) { + space->release(); + } + return(TRUE); + } + + ib::error() << "Unable to decompress " + << (space ? space->chain.start->name : "") + << block->page.id(); + goto err_exit; + case FIL_PAGE_TYPE_ALLOCATED: + case FIL_PAGE_INODE: + case FIL_PAGE_IBUF_BITMAP: + case FIL_PAGE_TYPE_FSP_HDR: + case FIL_PAGE_TYPE_XDES: + case FIL_PAGE_TYPE_ZBLOB: + case FIL_PAGE_TYPE_ZBLOB2: + /* Copy to uncompressed storage. */ + memcpy(block->page.frame, frame, block->zip_size()); + goto func_exit; + } + + ib::error() << "Unknown compressed page type " + << fil_page_get_type(frame) + << " in " << (space ? space->chain.start->name : "") + << block->page.id(); + +err_exit: + if (encrypted) { + ib::info() << "Row compressed page could be encrypted" + " with key_version " << key_version; + } + + if (space) { + space->release(); + } + + return(FALSE); +} + +/** Low level function used to get access to a database page. +@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH +@param[in] guess guessed block or NULL +@param[in] mode BUF_GET, BUF_GET_IF_IN_POOL, +BUF_PEEK_IF_IN_POOL, or BUF_GET_IF_IN_POOL_OR_WATCH +@param[in] mtr mini-transaction +@param[out] err DB_SUCCESS or error code +@param[in] allow_ibuf_merge Allow change buffer merge to happen +while reading the page from file +then it makes sure that it does merging of change buffer changes while +reading the page from file. 
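+Most callers go through buf_page_get_gen(); a sketch of a plain latched
+read follows (the local variable names are illustrative):
+@code
+  dberr_t err;
+  if (buf_block_t *block= buf_page_get_gen(page_id, zip_size, RW_S_LATCH,
+                                           nullptr, BUF_GET, &mtr,
+                                           &err, false))
+  {
+    /* block->page.frame is S-latched until the mtr commits */
+  }
+@endcode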
+@return pointer to the block or NULL */
+TRANSACTIONAL_TARGET
+buf_block_t*
+buf_page_get_low(
+	const page_id_t		page_id,
+	ulint			zip_size,
+	ulint			rw_latch,
+	buf_block_t*		guess,
+	ulint			mode,
+	mtr_t*			mtr,
+	dberr_t*		err,
+	bool			allow_ibuf_merge)
+{
+	unsigned	access_time;
+	ulint		retries = 0;
+
+	ut_ad(!mtr || mtr->is_active());
+	ut_ad(mtr || mode == BUF_PEEK_IF_IN_POOL);
+	ut_ad((rw_latch == RW_S_LATCH)
+	      || (rw_latch == RW_X_LATCH)
+	      || (rw_latch == RW_SX_LATCH)
+	      || (rw_latch == RW_NO_LATCH));
+
+	if (err) {
+		*err = DB_SUCCESS;
+	}
+
+#ifdef UNIV_DEBUG
+	switch (mode) {
+	default:
+		ut_ad(!allow_ibuf_merge);
+		ut_ad(mode == BUF_PEEK_IF_IN_POOL);
+		break;
+	case BUF_GET_POSSIBLY_FREED:
+	case BUF_GET_IF_IN_POOL:
+		/* The caller may pass a dummy page size,
+		because it does not really matter. */
+		break;
+	case BUF_GET:
+	case BUF_GET_IF_IN_POOL_OR_WATCH:
+		ut_ad(!mtr->is_freeing_tree());
+		fil_space_t* s = fil_space_get(page_id.space());
+		ut_ad(s);
+		ut_ad(s->zip_size() == zip_size);
+	}
+#endif /* UNIV_DEBUG */
+
+	ut_ad(!mtr || !ibuf_inside(mtr)
+	      || ibuf_page_low(page_id, zip_size, FALSE, NULL));
+
+	++buf_pool.stat.n_page_gets;
+	mariadb_increment_pages_accessed();
+
+	auto& chain= buf_pool.page_hash.cell_get(page_id.fold());
+	page_hash_latch& hash_lock = buf_pool.page_hash.lock_get(chain);
+loop:
+	buf_block_t* block = guess;
+	uint32_t state;
+
+	if (block) {
+		transactional_shared_lock_guard<page_hash_latch> g{hash_lock};
+		if (buf_pool.is_uncompressed(block)
+		    && page_id == block->page.id()) {
+			ut_ad(!block->page.in_zip_hash);
+			state = block->page.state();
+			/* Ignore guesses that point to read-fixed blocks.
+			We can only avoid a race condition by
+			looking up the block via buf_pool.page_hash. */
+			if ((state >= buf_page_t::FREED
+			     && state < buf_page_t::READ_FIX)
+			    || state >= buf_page_t::WRITE_FIX) {
+				state = block->page.fix();
+				goto got_block;
+			}
+		}
+	}
+
+	guess = nullptr;
+
+	/* A memory transaction would frequently be aborted here. */
+	hash_lock.lock_shared();
+	block = reinterpret_cast<buf_block_t*>(
+		buf_pool.page_hash.get(page_id, chain));
+	if (UNIV_LIKELY(block
+			&& !buf_pool.watch_is_sentinel(block->page))) {
+		state = block->page.fix();
+		hash_lock.unlock_shared();
+		goto got_block;
+	}
+	hash_lock.unlock_shared();
+
+	/* Page not in buf_pool: needs to be read from file */
+	switch (mode) {
+	case BUF_GET_IF_IN_POOL:
+	case BUF_PEEK_IF_IN_POOL:
+		return nullptr;
+	case BUF_GET_IF_IN_POOL_OR_WATCH:
+		/* Buffer-fixing inside watch_set() will prevent eviction */
+		block = reinterpret_cast<buf_block_t*>
+			(buf_pool.watch_set(page_id, chain));
+
+		if (block) {
+			state = block->page.state();
+			goto got_block_fixed;
+		}
+
+		return nullptr;
+	}
+
+	/* The call path is buf_read_page() ->
+	buf_read_page_low() (fil_space_t::io()) ->
+	buf_page_t::read_complete() ->
+	buf_decrypt_after_read(). Here fil_space_t* is used
+	and we decrypt -> buf_page_check_corrupt() where page
+	checksums are compared. Decryption, decompression as
+	well as error handling takes place at a lower level.
+	Here we only need to know whether the page really is
+	corrupted, or if an encrypted page with a valid
+	checksum cannot be decrypted.
*/ + + switch (dberr_t local_err = buf_read_page(page_id, zip_size)) { + case DB_SUCCESS: + case DB_SUCCESS_LOCKED_REC: + mariadb_increment_pages_read(); + buf_read_ahead_random(page_id, zip_size, ibuf_inside(mtr)); + break; + default: + if (mode != BUF_GET_POSSIBLY_FREED + && retries++ < BUF_PAGE_READ_MAX_RETRIES) { + DBUG_EXECUTE_IF("intermittent_read_failure", + retries = BUF_PAGE_READ_MAX_RETRIES;); + } + /* fall through */ + case DB_PAGE_CORRUPTED: + if (err) { + *err = local_err; + } + return nullptr; + } + + ut_d(if (!(++buf_dbg_counter % 5771)) buf_pool.validate()); + goto loop; + +got_block: + ut_ad(!block->page.in_zip_hash); + state++; +got_block_fixed: + ut_ad(state > buf_page_t::FREED); + + if (state > buf_page_t::READ_FIX && state < buf_page_t::WRITE_FIX) { + if (mode == BUF_PEEK_IF_IN_POOL) { +ignore_block: + ut_ad(mode == BUF_GET_POSSIBLY_FREED + || mode == BUF_PEEK_IF_IN_POOL); + block->unfix(); + if (err) { + *err = DB_CORRUPTION; + } + return nullptr; + } + + if (UNIV_UNLIKELY(!block->page.frame)) { + goto wait_for_unzip; + } + /* A read-fix is released after block->page.lock + in buf_page_t::read_complete() or + buf_pool_t::corrupted_evict(), or + after buf_zip_decompress() in this function. */ + block->page.lock.s_lock(); + state = block->page.state(); + ut_ad(state < buf_page_t::READ_FIX + || state >= buf_page_t::WRITE_FIX); + const page_id_t id{block->page.id()}; + block->page.lock.s_unlock(); + + if (UNIV_UNLIKELY(id != page_id)) { + ut_ad(id == page_id_t{~0ULL}); + block->page.unfix(); + if (++retries < BUF_PAGE_READ_MAX_RETRIES) { + goto loop; + } + + if (err) { + *err = DB_PAGE_CORRUPTED; + } + + return nullptr; + } + } else if (mode != BUF_PEEK_IF_IN_POOL) { + } else if (!mtr) { + ut_ad(!block->page.oldest_modification()); + mysql_mutex_lock(&buf_pool.mutex); + block->unfix(); + +free_unfixed_block: + if (!buf_LRU_free_page(&block->page, true)) { + ut_ad(0); + } + + mysql_mutex_unlock(&buf_pool.mutex); + return nullptr; + } else if (UNIV_UNLIKELY(!block->page.frame)) { + /* The BUF_PEEK_IF_IN_POOL mode is mainly used for dropping an + adaptive hash index. There cannot be an + adaptive hash index for a compressed-only page. */ + goto ignore_block; + } + + ut_ad(mode == BUF_GET_IF_IN_POOL || mode == BUF_PEEK_IF_IN_POOL + || block->zip_size() == zip_size); + + if (UNIV_UNLIKELY(!block->page.frame)) { + if (!block->page.lock.x_lock_try()) { +wait_for_unzip: + /* The page is being read or written, or + another thread is executing buf_zip_decompress() + in buf_page_get_low() on it. */ + block->page.unfix(); + std::this_thread::sleep_for( + std::chrono::microseconds(100)); + goto loop; + } + + buf_block_t *new_block = buf_LRU_get_free_block(false); + buf_block_init_low(new_block); + +wait_for_unfix: + mysql_mutex_lock(&buf_pool.mutex); + page_hash_latch& hash_lock=buf_pool.page_hash.lock_get(chain); + + /* It does not make sense to use + transactional_lock_guard here, because buf_relocate() + would likely make a memory transaction too large. */ + hash_lock.lock(); + + /* block->page.lock implies !block->page.can_relocate() */ + ut_ad(&block->page == buf_pool.page_hash.get(page_id, chain)); + + /* Wait for any other threads to release their buffer-fix + on the compressed-only block descriptor. + FIXME: Never fix() before acquiring the lock. + Only in buf_page_get_gen(), buf_page_get_low(), buf_page_free() + we are violating that principle. 
*/
+	state = block->page.state();
+
+	switch (state) {
+	case buf_page_t::UNFIXED + 1:
+	case buf_page_t::IBUF_EXIST + 1:
+	case buf_page_t::REINIT + 1:
+		break;
+	default:
+		ut_ad(state < buf_page_t::READ_FIX);
+
+		if (state < buf_page_t::UNFIXED + 1) {
+			ut_ad(state > buf_page_t::FREED);
+			block->page.lock.x_unlock();
+			hash_lock.unlock();
+			buf_LRU_block_free_non_file_page(new_block);
+			mysql_mutex_unlock(&buf_pool.mutex);
+			goto ignore_block;
+		}
+
+		mysql_mutex_unlock(&buf_pool.mutex);
+		hash_lock.unlock();
+		std::this_thread::sleep_for(
+			std::chrono::microseconds(100));
+		goto wait_for_unfix;
+	}
+
+	/* Ensure that another buf_page_get_low() will wait for
+	new_block->page.lock.x_unlock(). */
+	block->page.set_state(buf_page_t::READ_FIX);
+
+	/* Move the compressed page from block->page to new_block,
+	and uncompress it. */
+
+	mysql_mutex_lock(&buf_pool.flush_list_mutex);
+	buf_relocate(&block->page, &new_block->page);
+
+	/* X-latch the block for the duration of the decompression. */
+	new_block->page.lock.x_lock();
+	ut_d(block->page.lock.x_unlock());
+
+	buf_flush_relocate_on_flush_list(&block->page,
+					 &new_block->page);
+	mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+	/* Insert at the front of unzip_LRU list */
+	buf_unzip_LRU_add_block(new_block, FALSE);
+
+	mysql_mutex_unlock(&buf_pool.mutex);
+	hash_lock.unlock();
+
+#if defined SUX_LOCK_GENERIC || defined UNIV_DEBUG
+	block->page.lock.free();
+#endif
+	ut_free(reinterpret_cast<buf_page_t*>(block));
+	block = new_block;
+
+	buf_pool.n_pend_unzip++;
+
+	access_time = block->page.is_accessed();
+
+	if (!access_time && !recv_no_ibuf_operations
+	    && ibuf_page_exists(block->page.id(), block->zip_size())) {
+		state = buf_page_t::IBUF_EXIST + 1;
+	}
+
+	/* Decompress the page while not holding
+	buf_pool.mutex. */
+	const auto ok = buf_zip_decompress(block, false);
+	--buf_pool.n_pend_unzip;
+	if (!ok) {
+		if (err) {
+			*err = DB_PAGE_CORRUPTED;
+		}
+		mysql_mutex_lock(&buf_pool.mutex);
+	}
+	state = block->page.read_unfix(state);
+	block->page.lock.x_unlock();
+
+	if (!ok) {
+		goto free_unfixed_block;
+	}
+	}
+
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+re_evict:
+	if (mode != BUF_GET_IF_IN_POOL
+	    && mode != BUF_GET_IF_IN_POOL_OR_WATCH) {
+	} else if (!ibuf_debug || recv_recovery_is_on()) {
+	} else if (fil_space_t* space = fil_space_t::get(page_id.space())) {
+		for (ulint i = 0; i < mtr->get_savepoint(); i++) {
+			if (buf_block_t* b = mtr->block_at_savepoint(i)) {
+				if (b->page.oldest_modification() > 2
+				    && b->page.lock.have_any()) {
+					/* We are holding a dirty page latch
+					that would hang buf_flush_sync(). */
+					space->release();
+					goto re_evict_fail;
+				}
+			}
+		}
+
+		/* Try to evict the block from the buffer pool, to use the
+		insert buffer (change buffer) as much as possible. */
+
+		mysql_mutex_lock(&buf_pool.mutex);
+
+		block->unfix();
+
+		/* Blocks cannot be relocated or enter or exit the
+		buf_pool while we are holding the buf_pool.mutex.
*/ + const bool evicted = buf_LRU_free_page(&block->page, true); + space->release(); + + if (!evicted) { + block->fix(); + } + + mysql_mutex_unlock(&buf_pool.mutex); + + if (evicted) { + if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) { + buf_pool.watch_set(page_id, chain); + } + return(NULL); + } + + buf_flush_sync(); + + state = block->page.state(); + + if (state == buf_page_t::UNFIXED + 1 + && !block->page.oldest_modification()) { + goto re_evict; + } + + /* Failed to evict the page; change it directly */ + } +re_evict_fail: +#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ + + if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED)) { + goto ignore_block; + } + ut_ad((~buf_page_t::LRU_MASK) & state); + ut_ad(state > buf_page_t::WRITE_FIX || state < buf_page_t::READ_FIX); + +#ifdef UNIV_DEBUG + if (!(++buf_dbg_counter % 5771)) buf_pool.validate(); +#endif /* UNIV_DEBUG */ + ut_ad(block->page.frame); + + if (state >= buf_page_t::UNFIXED + && allow_ibuf_merge + && fil_page_get_type(block->page.frame) == FIL_PAGE_INDEX + && page_is_leaf(block->page.frame)) { + block->page.lock.x_lock(); + ut_ad(block->page.id() == page_id + || (state >= buf_page_t::READ_FIX + && state < buf_page_t::WRITE_FIX)); + +#ifdef BTR_CUR_HASH_ADAPT + btr_search_drop_page_hash_index(block, true); +#endif /* BTR_CUR_HASH_ADAPT */ + + dberr_t e; + + if (UNIV_UNLIKELY(block->page.id() != page_id)) { +page_id_mismatch: + state = block->page.state(); + e = DB_CORRUPTION; +ibuf_merge_corrupted: + if (err) { + *err = e; + } + + if (block->page.id().is_corrupted()) { + buf_pool.corrupted_evict(&block->page, state); + } + return nullptr; + } + + state = block->page.state(); + ut_ad(state < buf_page_t::READ_FIX); + + if (state >= buf_page_t::IBUF_EXIST + && state < buf_page_t::REINIT) { + block->page.clear_ibuf_exist(); + e = ibuf_merge_or_delete_for_page(block, page_id, + block->zip_size()); + if (UNIV_UNLIKELY(e != DB_SUCCESS)) { + goto ibuf_merge_corrupted; + } + } + + if (rw_latch == RW_X_LATCH) { + goto get_latch_valid; + } else { + block->page.lock.x_unlock(); + goto get_latch; + } + } else { +get_latch: + switch (rw_latch) { + case RW_NO_LATCH: + mtr->memo_push(block, MTR_MEMO_BUF_FIX); + return block; + case RW_S_LATCH: + block->page.lock.s_lock(); + ut_ad(!block->page.is_read_fixed()); + if (UNIV_UNLIKELY(block->page.id() != page_id)) { + block->page.lock.s_unlock(); + block->page.lock.x_lock(); + goto page_id_mismatch; + } +get_latch_valid: + mtr->memo_push(block, mtr_memo_type_t(rw_latch)); +#ifdef BTR_CUR_HASH_ADAPT + btr_search_drop_page_hash_index(block, true); +#endif /* BTR_CUR_HASH_ADAPT */ + break; + case RW_SX_LATCH: + block->page.lock.u_lock(); + ut_ad(!block->page.is_io_fixed()); + if (UNIV_UNLIKELY(block->page.id() != page_id)) { + block->page.lock.u_x_upgrade(); + goto page_id_mismatch; + } + goto get_latch_valid; + default: + ut_ad(rw_latch == RW_X_LATCH); + if (block->page.lock.x_lock_upgraded()) { + ut_ad(block->page.id() == page_id); + block->unfix(); + mtr->page_lock_upgrade(*block); + return block; + } + if (UNIV_UNLIKELY(block->page.id() != page_id)) { + goto page_id_mismatch; + } + goto get_latch_valid; + } + + ut_ad(page_id_t(page_get_space_id(block->page.frame), + page_get_page_no(block->page.frame)) + == page_id); + + if (mode == BUF_GET_POSSIBLY_FREED + || mode == BUF_PEEK_IF_IN_POOL) { + return block; + } + + const bool not_first_access{block->page.set_accessed()}; + buf_page_make_young_if_needed(&block->page); + if (!not_first_access) { + buf_read_ahead_linear(page_id, block->zip_size(), + ibuf_inside(mtr)); + 
}
+	}
+
+	return block;
+}
+
+/** Get access to a database page. Buffered redo log may be applied.
+@param[in]	page_id			page id
+@param[in]	zip_size		ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	rw_latch		RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH
+@param[in]	guess			guessed block or NULL
+@param[in]	mode			BUF_GET, BUF_GET_IF_IN_POOL,
+BUF_PEEK_IF_IN_POOL, or BUF_GET_IF_IN_POOL_OR_WATCH
+@param[in,out]	mtr			mini-transaction, or NULL
+@param[out]	err			DB_SUCCESS or error code
+@param[in]	allow_ibuf_merge	Allow change buffer merge while
+reading the pages from file.
+@return pointer to the block or NULL */
+buf_block_t*
+buf_page_get_gen(
+	const page_id_t		page_id,
+	ulint			zip_size,
+	ulint			rw_latch,
+	buf_block_t*		guess,
+	ulint			mode,
+	mtr_t*			mtr,
+	dberr_t*		err,
+	bool			allow_ibuf_merge)
+{
+  buf_block_t *block= recv_sys.recover(page_id);
+  if (UNIV_LIKELY(!block))
+    return buf_page_get_low(page_id, zip_size, rw_latch,
+                            guess, mode, mtr, err, allow_ibuf_merge);
+  else if (UNIV_UNLIKELY(block == reinterpret_cast<buf_block_t*>(-1)))
+  {
+  corrupted:
+    if (err)
+      *err= DB_CORRUPTION;
+    return nullptr;
+  }
+  /* Recovery is a special case; we fix() before acquiring lock. */
+  auto s= block->page.fix();
+  ut_ad(s >= buf_page_t::FREED);
+  /* The block may be write-fixed at this point because we are not
+  holding a lock, but it must not be read-fixed. */
+  ut_ad(s < buf_page_t::READ_FIX || s >= buf_page_t::WRITE_FIX);
+  if (err)
+    *err= DB_SUCCESS;
+  const bool must_merge= allow_ibuf_merge &&
+    ibuf_page_exists(page_id, block->zip_size());
+  if (s < buf_page_t::UNFIXED)
+  {
+  got_freed_page:
+    ut_ad(mode == BUF_GET_POSSIBLY_FREED || mode == BUF_PEEK_IF_IN_POOL);
+    mysql_mutex_lock(&buf_pool.mutex);
+    block->page.unfix();
+    buf_LRU_free_page(&block->page, true);
+    mysql_mutex_unlock(&buf_pool.mutex);
+    goto corrupted;
+  }
+  else if (must_merge &&
+           fil_page_get_type(block->page.frame) == FIL_PAGE_INDEX &&
+           page_is_leaf(block->page.frame))
+  {
+    block->page.lock.x_lock();
+    s= block->page.state();
+    ut_ad(s > buf_page_t::FREED);
+    ut_ad(s < buf_page_t::READ_FIX);
+    if (s < buf_page_t::UNFIXED)
+    {
+      block->page.lock.x_unlock();
+      goto got_freed_page;
+    }
+    else
+    {
+      if (block->page.is_ibuf_exist())
+        block->page.clear_ibuf_exist();
+      if (dberr_t e=
+          ibuf_merge_or_delete_for_page(block, page_id, block->zip_size()))
+      {
+        if (err)
+          *err= e;
+        buf_pool.corrupted_evict(&block->page, s);
+        return nullptr;
+      }
+    }
+
+    if (rw_latch == RW_X_LATCH)
+    {
+      mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX);
+      return block;
+    }
+    block->page.lock.x_unlock();
+  }
+  mtr->page_lock(block, rw_latch);
+  return block;
+}
+
+/********************************************************************//**
+This is the general function used to get optimistic access to a database
+page.
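+The optimistic pattern, sketched (the caller remembered block and
+block->modify_clock while it previously held a latch on the page):
+@code
+  if (buf_page_optimistic_get(RW_S_LATCH, block, saved_modify_clock, &mtr))
+  {
+    /* the guess was still valid; the page is latched again */
+  }
+  else
+  {
+    /* restart with a pessimistic buf_page_get_gen() lookup */
+  }
+@endcode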
+@return TRUE if success */
+TRANSACTIONAL_TARGET
+bool buf_page_optimistic_get(ulint rw_latch, buf_block_t *block,
+                             uint64_t modify_clock, mtr_t *mtr)
+{
+  ut_ad(block);
+  ut_ad(mtr);
+  ut_ad(mtr->is_active());
+  ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_X_LATCH);
+
+  if (have_transactional_memory);
+  else if (UNIV_UNLIKELY(!block->page.frame))
+    return false;
+  else
+  {
+    const auto state= block->page.state();
+    if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED ||
+                      state >= buf_page_t::READ_FIX))
+      return false;
+  }
+
+  bool success;
+  const page_id_t id{block->page.id()};
+  buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(id.fold());
+  bool have_u_not_x= false;
+
+  {
+    transactional_shared_lock_guard<page_hash_latch> g
+      {buf_pool.page_hash.lock_get(chain)};
+    if (UNIV_UNLIKELY(id != block->page.id() || !block->page.frame))
+      return false;
+    const auto state= block->page.state();
+    if (UNIV_UNLIKELY(state < buf_page_t::UNFIXED ||
+                      state >= buf_page_t::READ_FIX))
+      return false;
+
+    if (rw_latch == RW_S_LATCH)
+      success= block->page.lock.s_lock_try();
+    else
+    {
+      have_u_not_x= block->page.lock.have_u_not_x();
+      success= have_u_not_x || block->page.lock.x_lock_try();
+    }
+  }
+
+  if (!success)
+    return false;
+
+  if (have_u_not_x)
+  {
+    block->page.lock.u_x_upgrade();
+    mtr->page_lock_upgrade(*block);
+    ut_ad(id == block->page.id());
+    ut_ad(modify_clock == block->modify_clock);
+  }
+  else
+  {
+    ut_ad(rw_latch == RW_S_LATCH || !block->page.is_io_fixed());
+    ut_ad(id == block->page.id());
+    ut_ad(!ibuf_inside(mtr) || ibuf_page(id, block->zip_size(), nullptr));
+
+    if (modify_clock != block->modify_clock || block->page.is_freed())
+    {
+      if (rw_latch == RW_S_LATCH)
+        block->page.lock.s_unlock();
+      else
+        block->page.lock.x_unlock();
+      return false;
+    }
+
+    block->page.fix();
+    ut_ad(!block->page.is_read_fixed());
+    block->page.set_accessed();
+    buf_page_make_young_if_needed(&block->page);
+    mtr->memo_push(block, mtr_memo_type_t(rw_latch));
+  }
+
+  ut_d(if (!(++buf_dbg_counter % 5771)) buf_pool.validate());
+  ut_d(const auto state = block->page.state());
+  ut_ad(state > buf_page_t::UNFIXED);
+  ut_ad(state < buf_page_t::READ_FIX || state > buf_page_t::WRITE_FIX);
+  ut_ad(~buf_page_t::LRU_MASK & state);
+  ut_ad(block->page.frame);
+
+  return true;
+}
+
+/** Try to S-latch a page.
+Suitable for using when holding the lock_sys latches (as it avoids deadlock).
+@param[in]	page_id	page identifier
+@param[in,out]	mtr	mini-transaction
+@return the block
+@retval nullptr if an S-latch cannot be granted immediately */
+TRANSACTIONAL_TARGET
+buf_block_t *buf_page_try_get(const page_id_t page_id, mtr_t *mtr)
+{
+  ut_ad(mtr);
+  ut_ad(mtr->is_active());
+  buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold());
+  buf_block_t *block;
+
+  {
+    transactional_shared_lock_guard<page_hash_latch> g
+      {buf_pool.page_hash.lock_get(chain)};
+    block= reinterpret_cast<buf_block_t*>
+      (buf_pool.page_hash.get(page_id, chain));
+    if (!block || !block->page.frame || !block->page.lock.s_lock_try())
+      return nullptr;
+  }
+
+  block->page.fix();
+  ut_ad(!block->page.is_read_fixed());
+  mtr->memo_push(block, MTR_MEMO_PAGE_S_FIX);
+
+#ifdef UNIV_DEBUG
+  if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
+#endif /* UNIV_DEBUG */
+  ut_ad(block->page.buf_fix_count());
+  ut_ad(block->page.id() == page_id);
+
+  ++buf_pool.stat.n_page_gets;
+  mariadb_increment_pages_accessed();
+  return block;
+}
+
+/** Initialize the block.
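+Sketch of the call that is made before a block enters the page hash
+(this mirrors buf_page_create_low() below):
+@code
+  free_block->initialise(page_id, zip_size, buf_page_t::MEMORY);
+@endcode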
+@param page_id	page identifier
+@param zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param fix	initial buf_fix_count() */
+void buf_block_t::initialise(const page_id_t page_id, ulint zip_size,
+                             uint32_t fix)
+{
+  ut_ad(!page.in_file());
+  buf_block_init_low(this);
+  page.init(fix, page_id);
+  page.set_os_used();
+  page_zip_set_size(&page.zip, zip_size);
+}
+
+TRANSACTIONAL_TARGET
+static buf_block_t *buf_page_create_low(page_id_t page_id, ulint zip_size,
+                                        mtr_t *mtr, buf_block_t *free_block)
+{
+  ut_ad(mtr->is_active());
+  ut_ad(page_id.space() != 0 || !zip_size);
+
+  free_block->initialise(page_id, zip_size, buf_page_t::MEMORY);
+
+  buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold());
+retry:
+  mysql_mutex_lock(&buf_pool.mutex);
+
+  buf_page_t *bpage= buf_pool.page_hash.get(page_id, chain);
+
+  if (bpage && !buf_pool.watch_is_sentinel(*bpage))
+  {
+#ifdef BTR_CUR_HASH_ADAPT
+    const dict_index_t *drop_hash_entry= nullptr;
+#endif
+    bool ibuf_exist= false;
+
+    if (!mtr->have_x_latch(reinterpret_cast<buf_block_t&>(*bpage)))
+    {
+      const bool got= bpage->lock.x_lock_try();
+      if (!got)
+      {
+        mysql_mutex_unlock(&buf_pool.mutex);
+        bpage->lock.x_lock();
+        const page_id_t id{bpage->id()};
+        if (UNIV_UNLIKELY(id != page_id))
+        {
+          ut_ad(id.is_corrupted());
+          bpage->lock.x_unlock();
+          goto retry;
+        }
+        mysql_mutex_lock(&buf_pool.mutex);
+      }
+
+      auto state= bpage->fix();
+      ut_ad(state >= buf_page_t::FREED);
+      ut_ad(state < buf_page_t::READ_FIX);
+
+      if (state < buf_page_t::UNFIXED)
+        bpage->set_reinit(buf_page_t::FREED);
+      else
+      {
+        bpage->set_reinit(state & buf_page_t::LRU_MASK);
+        ibuf_exist= (state & buf_page_t::LRU_MASK) == buf_page_t::IBUF_EXIST;
+      }
+
+      if (UNIV_LIKELY(bpage->frame != nullptr))
+      {
+        mysql_mutex_unlock(&buf_pool.mutex);
+        buf_block_t *block= reinterpret_cast<buf_block_t*>(bpage);
+        mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX);
+#ifdef BTR_CUR_HASH_ADAPT
+        drop_hash_entry= block->index;
+#endif
+      }
+      else
+      {
+        auto state= bpage->state();
+        ut_ad(state >= buf_page_t::FREED);
+        ut_ad(state < buf_page_t::READ_FIX);
+
+        page_hash_latch &hash_lock= buf_pool.page_hash.lock_get(chain);
+        /* It does not make sense to use transactional_lock_guard here,
+        because buf_relocate() would likely make the memory transaction
+        too large. */
+        hash_lock.lock();
+
+        if (state < buf_page_t::UNFIXED)
+          bpage->set_reinit(buf_page_t::FREED);
+        else
+        {
+          bpage->set_reinit(state & buf_page_t::LRU_MASK);
+          ibuf_exist= (state & buf_page_t::LRU_MASK)
+            == buf_page_t::IBUF_EXIST;
+        }
+
+        mysql_mutex_lock(&buf_pool.flush_list_mutex);
+        buf_relocate(bpage, &free_block->page);
+        free_block->page.lock.x_lock();
+        buf_flush_relocate_on_flush_list(bpage, &free_block->page);
+        mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+        buf_unzip_LRU_add_block(free_block, FALSE);
+
+        mysql_mutex_unlock(&buf_pool.mutex);
+        hash_lock.unlock();
+#if defined SUX_LOCK_GENERIC || defined UNIV_DEBUG
+        bpage->lock.x_unlock();
+        bpage->lock.free();
+#endif
+        ut_free(bpage);
+        mtr->memo_push(free_block, MTR_MEMO_PAGE_X_FIX);
+        bpage= &free_block->page;
+      }
+    }
+    else
+    {
+      mysql_mutex_unlock(&buf_pool.mutex);
+      ut_ad(bpage->frame);
+#ifdef BTR_CUR_HASH_ADAPT
+      ut_ad(!reinterpret_cast<buf_block_t*>(bpage)->index);
+#endif
+      const auto state= bpage->state();
+      ut_ad(state >= buf_page_t::FREED);
+      bpage->set_reinit(state < buf_page_t::UNFIXED
+                        ? buf_page_t::FREED
+                        : state & buf_page_t::LRU_MASK);
+    }
+
+#ifdef BTR_CUR_HASH_ADAPT
+    if (drop_hash_entry)
+      btr_search_drop_page_hash_index(reinterpret_cast<buf_block_t*>(bpage),
+                                      false);
+#endif /* BTR_CUR_HASH_ADAPT */
+
+    if (ibuf_exist && !recv_recovery_is_on())
+      ibuf_merge_or_delete_for_page(nullptr, page_id, zip_size);
+
+    return reinterpret_cast<buf_block_t*>(bpage);
+  }
+
+  /* If we get here, the page was not in buf_pool: init it there */
+
+  DBUG_PRINT("ib_buf", ("create page %u:%u",
+                        page_id.space(), page_id.page_no()));
+
+  bpage= &free_block->page;
+
+  ut_ad(bpage->state() == buf_page_t::MEMORY);
+  bpage->lock.x_lock();
+
+  /* The block must be put to the LRU list */
+  buf_LRU_add_block(bpage, false);
+  {
+    transactional_lock_guard<page_hash_latch> g
+      {buf_pool.page_hash.lock_get(chain)};
+    bpage->set_state(buf_page_t::REINIT + 1);
+    buf_pool.page_hash.append(chain, bpage);
+  }
+
+  if (UNIV_UNLIKELY(zip_size))
+  {
+    bpage->zip.data= buf_buddy_alloc(zip_size);
+
+    /* To maintain the invariant block->in_unzip_LRU_list ==
+    block->page.belongs_to_unzip_LRU() we have to add this
+    block to unzip_LRU after block->page.zip.data is set. */
+    ut_ad(bpage->belongs_to_unzip_LRU());
+    buf_unzip_LRU_add_block(reinterpret_cast<buf_block_t*>(bpage), FALSE);
+  }
+
+  buf_pool.stat.n_pages_created++;
+  mysql_mutex_unlock(&buf_pool.mutex);
+
+  mtr->memo_push(reinterpret_cast<buf_block_t*>(bpage), MTR_MEMO_PAGE_X_FIX);
+
+  bpage->set_accessed();
+
+  /* Delete possible entries for the page from the insert buffer:
+  such can exist if the page belonged to an index which was dropped */
+  if (page_id < page_id_t{SRV_SPACE_ID_UPPER_BOUND, 0} &&
+      !srv_is_undo_tablespace(page_id.space()) &&
+      !recv_recovery_is_on())
+    ibuf_merge_or_delete_for_page(nullptr, page_id, zip_size);
+
+  static_assert(FIL_PAGE_PREV + 4 == FIL_PAGE_NEXT, "adjacent");
+  memset_aligned<8>(bpage->frame + FIL_PAGE_PREV, 0xff, 8);
+  mach_write_to_2(bpage->frame + FIL_PAGE_TYPE, FIL_PAGE_TYPE_ALLOCATED);
+
+  /* FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION is only used on the
+  following pages:
+  (1) The first page of the InnoDB system tablespace (page 0:0)
+  (2) FIL_RTREE_SPLIT_SEQ_NUM on R-tree pages
+  (3) key_version on encrypted pages (not page 0:0) */
+
+  memset(bpage->frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, 0, 8);
+  memset_aligned<8>(bpage->frame + FIL_PAGE_LSN, 0, 8);
+
+#ifdef UNIV_DEBUG
+  if (!(++buf_dbg_counter % 5771)) buf_pool.validate();
+#endif /* UNIV_DEBUG */
+  return reinterpret_cast<buf_block_t*>(bpage);
+}
+
+/** Initialize a page in the buffer pool. The page is usually not read
+from a file even if it cannot be found in the buffer buf_pool. This is one
+of the functions which perform to a block a state transition NOT_USED =>
+FILE_PAGE (the other is buf_page_get_gen).
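+Sketch of a typical allocation of a fresh page within a mini-transaction
+(free_block must come from buf_LRU_get_free_block(), as elsewhere in this
+file):
+@code
+  buf_block_t *free_block= buf_LRU_get_free_block(false);
+  buf_block_t *block= buf_page_create(space, offset, space->zip_size(),
+                                      &mtr, free_block);
+@endcode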
+@param[in,out] space space object +@param[in] offset offset of the tablespace + or deferred space id if space + object is null +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in,out] mtr mini-transaction +@param[in,out] free_block pre-allocated buffer block +@return pointer to the block, page bufferfixed */ +buf_block_t* +buf_page_create(fil_space_t *space, uint32_t offset, + ulint zip_size, mtr_t *mtr, buf_block_t *free_block) +{ + space->free_page(offset, false); + return buf_page_create_low({space->id, offset}, zip_size, mtr, free_block); +} + +/** Initialize a page in buffer pool while initializing the +deferred tablespace +@param space_id space identfier +@param zip_size ROW_FORMAT=COMPRESSED page size or 0 +@param mtr mini-transaction +@param free_block pre-allocated buffer block +@return pointer to the block, page bufferfixed */ +buf_block_t* buf_page_create_deferred(uint32_t space_id, ulint zip_size, + mtr_t *mtr, buf_block_t *free_block) +{ + return buf_page_create_low({space_id, 0}, zip_size, mtr, free_block); +} + +/** Monitor the buffer page read/write activity, and increment corresponding +counter value in MONITOR_MODULE_BUF_PAGE. +@param bpage buffer page whose read or write was completed +@param read true=read, false=write */ +ATTRIBUTE_COLD void buf_page_monitor(const buf_page_t &bpage, bool read) +{ + monitor_id_t counter; + + const byte* frame = bpage.zip.data ? bpage.zip.data : bpage.frame; + + switch (fil_page_get_type(frame)) { + ulint level; + case FIL_PAGE_TYPE_INSTANT: + case FIL_PAGE_INDEX: + case FIL_PAGE_RTREE: + level = btr_page_get_level(frame); + + /* Check if it is an index page for insert buffer */ + if (fil_page_get_type(frame) == FIL_PAGE_INDEX + && btr_page_get_index_id(frame) + == (index_id_t)(DICT_IBUF_ID_MIN + IBUF_SPACE_ID)) { + if (level == 0) { + counter = MONITOR_RW_COUNTER( + read, MONITOR_INDEX_IBUF_LEAF_PAGE); + } else { + counter = MONITOR_RW_COUNTER( + read, + MONITOR_INDEX_IBUF_NON_LEAF_PAGE); + } + } else { + if (level == 0) { + counter = MONITOR_RW_COUNTER( + read, MONITOR_INDEX_LEAF_PAGE); + } else { + counter = MONITOR_RW_COUNTER( + read, MONITOR_INDEX_NON_LEAF_PAGE); + } + } + break; + + case FIL_PAGE_UNDO_LOG: + counter = MONITOR_RW_COUNTER(read, MONITOR_UNDO_LOG_PAGE); + break; + + case FIL_PAGE_INODE: + counter = MONITOR_RW_COUNTER(read, MONITOR_INODE_PAGE); + break; + + case FIL_PAGE_IBUF_FREE_LIST: + counter = MONITOR_RW_COUNTER(read, MONITOR_IBUF_FREELIST_PAGE); + break; + + case FIL_PAGE_IBUF_BITMAP: + counter = MONITOR_RW_COUNTER(read, MONITOR_IBUF_BITMAP_PAGE); + break; + + case FIL_PAGE_TYPE_SYS: + counter = MONITOR_RW_COUNTER(read, MONITOR_SYSTEM_PAGE); + break; + + case FIL_PAGE_TYPE_TRX_SYS: + counter = MONITOR_RW_COUNTER(read, MONITOR_TRX_SYSTEM_PAGE); + break; + + case FIL_PAGE_TYPE_FSP_HDR: + counter = MONITOR_RW_COUNTER(read, MONITOR_FSP_HDR_PAGE); + break; + + case FIL_PAGE_TYPE_XDES: + counter = MONITOR_RW_COUNTER(read, MONITOR_XDES_PAGE); + break; + + case FIL_PAGE_TYPE_BLOB: + counter = MONITOR_RW_COUNTER(read, MONITOR_BLOB_PAGE); + break; + + case FIL_PAGE_TYPE_ZBLOB: + counter = MONITOR_RW_COUNTER(read, MONITOR_ZBLOB_PAGE); + break; + + case FIL_PAGE_TYPE_ZBLOB2: + counter = MONITOR_RW_COUNTER(read, MONITOR_ZBLOB2_PAGE); + break; + + default: + counter = MONITOR_RW_COUNTER(read, MONITOR_OTHER_PAGE); + } + + MONITOR_INC_NOCHECK(counter); +} + +/** Check if the encrypted page is corrupted for the full crc32 format. 
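+In other words, a full_crc32 page is flagged as corrupted when the stored
+space id does not match, or (for uncompressed pages only) when the low 32
+bits of FIL_PAGE_LSN are not mirrored at the end of the page; a sketch of
+the same check:
+@code
+  corrupted= space_id != mach_read_from_4(d + FIL_PAGE_SPACE_ID)
+    || (!is_compressed
+        && memcmp(d + FIL_PAGE_LSN + 4,
+                  d + srv_page_size - FIL_PAGE_FCRC32_END_LSN, 4));
+@endcode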
+@param[in]	space_id	page belongs to space id
+@param[in]	d		page
+@param[in]	is_compressed	compressed page
+@return true if page is corrupted or false if it isn't */
+static bool buf_page_full_crc32_is_corrupted(ulint space_id, const byte* d,
+                                             bool is_compressed)
+{
+  if (space_id != mach_read_from_4(d + FIL_PAGE_SPACE_ID))
+    return true;
+
+  static_assert(FIL_PAGE_LSN % 4 == 0, "alignment");
+  static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "alignment");
+
+  return !is_compressed &&
+    memcmp_aligned<4>(FIL_PAGE_LSN + 4 + d,
+                      d + srv_page_size - FIL_PAGE_FCRC32_END_LSN, 4);
+}
+
+/** Check if page is maybe compressed, encrypted or both when we encounter
+corrupted page. Note that we can't be 100% sure if page is corrupted
+or decrypt/decompress just failed.
+@param[in,out]	bpage	page
+@param[in]	node	data file
+@return whether the operation succeeded
+@retval DB_SUCCESS if page has been read and is not corrupted
+@retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted
+@retval DB_DECRYPTION_FAILED if page post encryption checksum matches but
+after decryption normal page checksum does not match. */
+static dberr_t buf_page_check_corrupt(buf_page_t *bpage,
+                                      const fil_node_t &node)
+{
+	ut_ad(node.space->referenced());
+
+	byte* dst_frame = bpage->zip.data ? bpage->zip.data : bpage->frame;
+	dberr_t err = DB_SUCCESS;
+	uint key_version = buf_page_get_key_version(dst_frame,
+						    node.space->flags);
+
+	/* In buf_decrypt_after_read we have either decrypted the page if
+	page post encryption checksum matches and used key_id is found
+	from the encryption plugin. If checksum did not match page was
+	not decrypted and it could be either encrypted and corrupted
+	or corrupted or good page. If we decrypted, the page could
+	still be corrupted if used key does not match. */
+	const bool seems_encrypted = !node.space->full_crc32() && key_version
+		&& node.space->crypt_data
+		&& node.space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED;
+	ut_ad(node.space->purpose != FIL_TYPE_TEMPORARY ||
+	      node.space->full_crc32());
+
+	/* If traditional checksums match, we assume that page is
+	not anymore encrypted. */
+	if (node.space->full_crc32()
+	    && !buf_is_zeroes(span<const byte>(dst_frame,
+					       node.space->physical_size()))
+	    && (key_version || node.space->is_compressed()
+		|| node.space->purpose == FIL_TYPE_TEMPORARY)) {
+		if (buf_page_full_crc32_is_corrupted(
+			    bpage->id().space(), dst_frame,
+			    node.space->is_compressed())) {
+			err = DB_PAGE_CORRUPTED;
+		}
+	} else if (buf_page_is_corrupted(true, dst_frame, node.space->flags)) {
+		err = DB_PAGE_CORRUPTED;
+	}
+
+	if (seems_encrypted && err == DB_PAGE_CORRUPTED
+	    && bpage->id().page_no() != 0) {
+		err = DB_DECRYPTION_FAILED;
+
+		ib::error()
+			<< "The page " << bpage->id()
+			<< " in file '" << node.name
+			<< "' cannot be decrypted; key_version="
+			<< key_version;
+	}
+
+	return (err);
+}
+
+/** Complete a read of a page.
+@param node	data file
+@return whether the operation succeeded
+@retval DB_PAGE_CORRUPTED	if the checksum fails
+@retval DB_DECRYPTION_FAILED	if the page cannot be decrypted
+@retval DB_FAIL			if the page contains the wrong ID */
+dberr_t buf_page_t::read_complete(const fil_node_t &node)
+{
+  const page_id_t expected_id{id()};
+  ut_ad(is_read_fixed());
+  ut_ad(!buf_dblwr.is_inside(id()));
+  ut_ad(id().space() == node.space->id);
+  ut_ad(zip_size() == node.space->zip_size());
+  ut_ad(!!zip.ssize == !!zip.data);
+
+  const byte *read_frame= zip.data
+    ? zip.data : frame;
+  ut_ad(read_frame);
+
+  dberr_t err;
+  if (!buf_page_decrypt_after_read(this, node))
+  {
+    err= DB_DECRYPTION_FAILED;
+    goto database_corrupted;
+  }
+
+  if (belongs_to_unzip_LRU())
+  {
+    buf_pool.n_pend_unzip++;
+    auto ok= buf_zip_decompress(reinterpret_cast<buf_block_t*>(this), false);
+    buf_pool.n_pend_unzip--;
+
+    if (!ok)
+    {
+      ib::info() << "Page " << expected_id << " zip_decompress failure.";
+      err= DB_PAGE_CORRUPTED;
+      goto database_corrupted;
+    }
+  }
+
+  {
+    const page_id_t read_id(mach_read_from_4(read_frame + FIL_PAGE_SPACE_ID),
+                            mach_read_from_4(read_frame + FIL_PAGE_OFFSET));
+
+    if (read_id == expected_id);
+    else if (read_id == page_id_t(0, 0))
+    {
+      /* This is likely an uninitialized (all-zero) page. */
+      err= DB_FAIL;
+      goto release_page;
+    }
+    else if (!node.space->full_crc32() &&
+             page_id_t(0, read_id.page_no()) == expected_id)
+      /* FIL_PAGE_SPACE_ID was written as garbage in the system tablespace
+      before MySQL 4.1.1, which introduced innodb_file_per_table. */;
+    else if (node.space->full_crc32() &&
+             *reinterpret_cast<const uint32_t*>
+             (&read_frame[FIL_PAGE_FCRC32_KEY_VERSION]) &&
+             node.space->crypt_data &&
+             node.space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED)
+    {
+      ib::error() << "Cannot decrypt " << expected_id;
+      err= DB_DECRYPTION_FAILED;
+      goto release_page;
+    }
+    else
+    {
+      ib::error() << "Space id and page no stored in the page, read in are "
+                  << read_id << ", should be " << expected_id;
+      err= DB_PAGE_CORRUPTED;
+      goto release_page;
+    }
+  }
+
+  err= buf_page_check_corrupt(this, node);
+  if (UNIV_UNLIKELY(err != DB_SUCCESS))
+  {
+database_corrupted:
+    if (belongs_to_unzip_LRU())
+      memset_aligned<UNIV_PAGE_SIZE_MIN>(frame, 0, srv_page_size);
+
+    if (err == DB_PAGE_CORRUPTED)
+    {
+      ib::error() << "Database page corruption on disk"
+                     " or a failed read of file '"
+                  << node.name << "' page " << expected_id
+                  << ". You may have to recover from a backup.";
+
+      buf_page_print(read_frame, zip_size());
+
+      node.space->set_corrupted();
+
+      ib::info() << " You can use CHECK TABLE to scan"
+                    " your table for corruption. "
+                 << FORCE_RECOVERY_MSG;
+    }
+
+    if (!srv_force_recovery)
+      goto release_page;
+  }
+
+  if (err == DB_PAGE_CORRUPTED || err == DB_DECRYPTION_FAILED)
+  {
+release_page:
+    buf_pool.corrupted_evict(this, buf_page_t::READ_FIX);
+    return err;
+  }
+
+  const bool recovery= recv_recovery_is_on();
+
+  if (recovery && !recv_recover_page(node.space, this))
+    return DB_PAGE_CORRUPTED;
+
+  const bool ibuf_may_exist= frame && !recv_no_ibuf_operations &&
+    (!expected_id.space() || !is_predefined_tablespace(expected_id.space())) &&
+    fil_page_get_type(read_frame) == FIL_PAGE_INDEX &&
+    page_is_leaf(read_frame);
+
+  if (UNIV_UNLIKELY(MONITOR_IS_ON(MONITOR_MODULE_BUF_PAGE)))
+    buf_page_monitor(*this, true);
+  DBUG_PRINT("ib_buf", ("read page %u:%u", id().space(), id().page_no()));
+
+  if (!recovery)
+  {
+    ut_d(auto f=) zip.fix.fetch_sub(ibuf_may_exist
+                                    ? READ_FIX - IBUF_EXIST
+                                    : READ_FIX - UNFIXED);
+    ut_ad(f >= READ_FIX);
+    ut_ad(f < WRITE_FIX);
+  }
+  else if (ibuf_may_exist)
+    set_ibuf_exist();
+
+  lock.x_unlock(true);
+
+  return DB_SUCCESS;
+}
+
+#ifdef UNIV_DEBUG
+/** Check that all blocks are in a replaceable state.
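+Illustrative debug-build usage (as in buf_pool_invalidate() below):
+@code
+  ut_d(buf_pool.assert_all_freed());
+@endcode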
+@return address of a non-free block +@retval nullptr if all freed */ +void buf_pool_t::assert_all_freed() +{ + mysql_mutex_lock(&mutex); + const chunk_t *chunk= chunks; + for (auto i= n_chunks; i--; chunk++) + if (const buf_block_t* block= chunk->not_freed()) + ib::fatal() << "Page " << block->page.id() << " still fixed or dirty"; + mysql_mutex_unlock(&mutex); +} +#endif /* UNIV_DEBUG */ + +/** Refresh the statistics used to print per-second averages. */ +void buf_refresh_io_stats() +{ + buf_pool.last_printout_time = time(NULL); + buf_pool.old_stat = buf_pool.stat; +} + +/** Invalidate all pages in the buffer pool. +All pages must be in a replaceable state (not modified or latched). */ +void buf_pool_invalidate() +{ + mysql_mutex_lock(&buf_pool.mutex); + + /* It is possible that a write batch that has been posted + earlier is still not complete. For buffer pool invalidation to + proceed we must ensure there is NO write activity happening. */ + + ut_d(mysql_mutex_unlock(&buf_pool.mutex)); + ut_d(buf_pool.assert_all_freed()); + ut_d(mysql_mutex_lock(&buf_pool.mutex)); + + while (UT_LIST_GET_LEN(buf_pool.LRU)) { + buf_LRU_scan_and_free_block(); + } + + ut_ad(UT_LIST_GET_LEN(buf_pool.unzip_LRU) == 0); + + buf_pool.freed_page_clock = 0; + buf_pool.LRU_old = NULL; + buf_pool.LRU_old_len = 0; + buf_pool.stat.init(); + + buf_refresh_io_stats(); + mysql_mutex_unlock(&buf_pool.mutex); +} + +#ifdef UNIV_DEBUG +/** Validate the buffer pool. */ +void buf_pool_t::validate() +{ + ulint n_lru = 0; + ulint n_flushing = 0; + ulint n_free = 0; + ulint n_zip = 0; + + mysql_mutex_lock(&mutex); + + chunk_t* chunk = chunks; + + /* Check the uncompressed blocks. */ + + for (auto i = n_chunks; i--; chunk++) { + buf_block_t* block = chunk->blocks; + + for (auto j = chunk->size; j--; block++) { + ut_ad(block->page.frame); + switch (const auto f = block->page.state()) { + case buf_page_t::NOT_USED: + n_free++; + break; + + case buf_page_t::MEMORY: + case buf_page_t::REMOVE_HASH: + /* do nothing */ + break; + + default: + if (f >= buf_page_t::READ_FIX + && f < buf_page_t::WRITE_FIX) { + /* A read-fixed block is not + necessarily in the page_hash yet. */ + break; + } + ut_ad(f >= buf_page_t::FREED); + const page_id_t id{block->page.id()}; + ut_ad(page_hash.get( + id, + page_hash.cell_get(id.fold())) + == &block->page); + n_lru++; + } + } + } + + /* Check dirty blocks. */ + + mysql_mutex_lock(&flush_list_mutex); + for (buf_page_t* b = UT_LIST_GET_FIRST(flush_list); b; + b = UT_LIST_GET_NEXT(list, b)) { + ut_ad(b->in_file()); + ut_ad(b->oldest_modification()); + ut_ad(!fsp_is_system_temporary(b->id().space())); + n_flushing++; + + if (UNIV_UNLIKELY(!b->frame)) { + n_lru++; + n_zip++; + } + const page_id_t id{b->id()}; + ut_ad(page_hash.get(id, page_hash.cell_get(id.fold())) == b); + } + + ut_ad(UT_LIST_GET_LEN(flush_list) == n_flushing); + + mysql_mutex_unlock(&flush_list_mutex); + + if (n_chunks_new == n_chunks + && n_lru + n_free > curr_size + n_zip) { + + ib::fatal() << "n_LRU " << n_lru << ", n_free " << n_free + << ", pool " << curr_size + << " zip " << n_zip << ". Aborting..."; + } + + ut_ad(UT_LIST_GET_LEN(LRU) >= n_lru); + + if (n_chunks_new == n_chunks + && UT_LIST_GET_LEN(free) != n_free) { + + ib::fatal() << "Free list len " + << UT_LIST_GET_LEN(free) + << ", free blocks " << n_free << ". 
Aborting...";
+  }
+
+  mysql_mutex_unlock(&mutex);
+
+  ut_d(buf_LRU_validate());
+  ut_d(buf_flush_validate());
+}
+#endif /* UNIV_DEBUG */
+
+#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG
+/** Write information of the buf_pool to the error log. */
+void buf_pool_t::print()
+{
+  index_id_t*   index_ids;
+  ulint*        counts;
+  ulint         size;
+  ulint         i;
+  ulint         j;
+  index_id_t    id;
+  ulint         n_found;
+  chunk_t*      chunk;
+  dict_index_t* index;
+
+  size = curr_size;
+
+  index_ids = static_cast<index_id_t*>(
+    ut_malloc_nokey(size * sizeof *index_ids));
+
+  counts = static_cast<ulint*>(ut_malloc_nokey(sizeof(ulint) * size));
+
+  mysql_mutex_lock(&mutex);
+  mysql_mutex_lock(&flush_list_mutex);
+
+  ib::info()
+    << "[buffer pool: size=" << curr_size
+    << ", database pages=" << UT_LIST_GET_LEN(LRU)
+    << ", free pages=" << UT_LIST_GET_LEN(free)
+    << ", modified database pages="
+    << UT_LIST_GET_LEN(flush_list)
+    << ", n pending decompressions=" << n_pend_unzip
+    << ", n pending flush LRU=" << n_flush()
+    << " list=" << os_aio_pending_writes()
+    << ", pages made young=" << stat.n_pages_made_young
+    << ", not young=" << stat.n_pages_not_made_young
+    << ", pages read=" << stat.n_pages_read
+    << ", created=" << stat.n_pages_created
+    << ", written=" << stat.n_pages_written << "]";
+
+  mysql_mutex_unlock(&flush_list_mutex);
+
+  /* Count the number of blocks belonging to each index in the buffer */
+
+  n_found = 0;
+
+  chunk = chunks;
+
+  for (i = n_chunks; i--; chunk++) {
+    buf_block_t* block    = chunk->blocks;
+    ulint        n_blocks = chunk->size;
+
+    for (; n_blocks--; block++) {
+      const buf_frame_t* frame = block->page.frame;
+
+      if (fil_page_index_page_check(frame)) {
+
+        id = btr_page_get_index_id(frame);
+
+        /* Look for the id in the index_ids array */
+        j = 0;
+
+        while (j < n_found) {
+
+          if (index_ids[j] == id) {
+            counts[j]++;
+
+            break;
+          }
+          j++;
+        }
+
+        if (j == n_found) {
+          n_found++;
+          index_ids[j] = id;
+          counts[j] = 1;
+        }
+      }
+    }
+  }
+
+  mysql_mutex_unlock(&mutex);
+
+  for (i = 0; i < n_found; i++) {
+    index = dict_index_get_if_in_cache(index_ids[i]);
+
+    if (!index) {
+      ib::info() << "Block count for index "
+        << index_ids[i] << " in buffer is about "
+        << counts[i];
+    } else {
+      ib::info() << "Block count for index " << index_ids[i]
+        << " in buffer is about " << counts[i]
+        << ", index " << index->name
+        << " of table " << index->table->name;
+    }
+  }
+
+  ut_free(index_ids);
+  ut_free(counts);
+
+  validate();
+}
+#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG */
+
+#ifdef UNIV_DEBUG
+/** @return the number of latched pages in the buffer pool */
+ulint buf_get_latched_pages_number()
+{
+  ulint fixed_pages_number= 0;
+
+  mysql_mutex_lock(&buf_pool.mutex);
+
+  for (buf_page_t *b= UT_LIST_GET_FIRST(buf_pool.LRU); b;
+       b= UT_LIST_GET_NEXT(LRU, b))
+    if (b->state() > buf_page_t::UNFIXED)
+      fixed_pages_number++;
+
+  mysql_mutex_unlock(&buf_pool.mutex);
+
+  return fixed_pages_number;
+}
+#endif /* UNIV_DEBUG */
+
+/** Collect buffer pool metadata.
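+The per-second rates are computed from the counter deltas accumulated
+since the previous buf_refresh_io_stats() call; schematically:
+  pages_read_rate = (stat.n_pages_read - old_stat.n_pages_read)
+                    / time_elapsed
+where time_elapsed is padded by 0.001 s so the division is never by zero.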
+@param[out] pool_info buffer pool metadata */
+void buf_stats_get_pool_info(buf_pool_info_t *pool_info)
+{
+  time_t current_time;
+  double time_elapsed;
+
+  mysql_mutex_lock(&buf_pool.mutex);
+
+  pool_info->pool_size = buf_pool.curr_size;
+
+  pool_info->lru_len = UT_LIST_GET_LEN(buf_pool.LRU);
+
+  pool_info->old_lru_len = buf_pool.LRU_old_len;
+
+  pool_info->free_list_len = UT_LIST_GET_LEN(buf_pool.free);
+
+  mysql_mutex_lock(&buf_pool.flush_list_mutex);
+  pool_info->flush_list_len = UT_LIST_GET_LEN(buf_pool.flush_list);
+
+  pool_info->n_pend_unzip = UT_LIST_GET_LEN(buf_pool.unzip_LRU);
+
+  pool_info->n_pend_reads = os_aio_pending_reads_approx();
+
+  pool_info->n_pending_flush_lru = buf_pool.n_flush();
+
+  pool_info->n_pending_flush_list = os_aio_pending_writes();
+  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+  current_time = time(NULL);
+  time_elapsed = 0.001 + difftime(current_time,
+                                  buf_pool.last_printout_time);
+
+  pool_info->n_pages_made_young = buf_pool.stat.n_pages_made_young;
+
+  pool_info->n_pages_not_made_young =
+    buf_pool.stat.n_pages_not_made_young;
+
+  pool_info->n_pages_read = buf_pool.stat.n_pages_read;
+
+  pool_info->n_pages_created = buf_pool.stat.n_pages_created;
+
+  pool_info->n_pages_written = buf_pool.stat.n_pages_written;
+
+  pool_info->n_page_gets = buf_pool.stat.n_page_gets;
+
+  pool_info->n_ra_pages_read_rnd = buf_pool.stat.n_ra_pages_read_rnd;
+  pool_info->n_ra_pages_read = buf_pool.stat.n_ra_pages_read;
+
+  pool_info->n_ra_pages_evicted = buf_pool.stat.n_ra_pages_evicted;
+
+  pool_info->page_made_young_rate =
+    static_cast<double>(buf_pool.stat.n_pages_made_young
+                        - buf_pool.old_stat.n_pages_made_young)
+    / time_elapsed;
+
+  pool_info->page_not_made_young_rate =
+    static_cast<double>(buf_pool.stat.n_pages_not_made_young
+                        - buf_pool.old_stat.n_pages_not_made_young)
+    / time_elapsed;
+
+  pool_info->pages_read_rate =
+    static_cast<double>(buf_pool.stat.n_pages_read
+                        - buf_pool.old_stat.n_pages_read)
+    / time_elapsed;
+
+  pool_info->pages_created_rate =
+    static_cast<double>(buf_pool.stat.n_pages_created
+                        - buf_pool.old_stat.n_pages_created)
+    / time_elapsed;
+
+  pool_info->pages_written_rate =
+    static_cast<double>(buf_pool.stat.n_pages_written
+                        - buf_pool.old_stat.n_pages_written)
+    / time_elapsed;
+
+  pool_info->n_page_get_delta = buf_pool.stat.n_page_gets
+    - buf_pool.old_stat.n_page_gets;
+
+  if (pool_info->n_page_get_delta) {
+    pool_info->page_read_delta = buf_pool.stat.n_pages_read
+      - buf_pool.old_stat.n_pages_read;
+
+    pool_info->young_making_delta =
+      buf_pool.stat.n_pages_made_young
+      - buf_pool.old_stat.n_pages_made_young;
+
+    pool_info->not_young_making_delta =
+      buf_pool.stat.n_pages_not_made_young
+      - buf_pool.old_stat.n_pages_not_made_young;
+  }
+  pool_info->pages_readahead_rnd_rate =
+    static_cast<double>(buf_pool.stat.n_ra_pages_read_rnd
+                        - buf_pool.old_stat.n_ra_pages_read_rnd)
+    / time_elapsed;
+
+  pool_info->pages_readahead_rate =
+    static_cast<double>(buf_pool.stat.n_ra_pages_read
+                        - buf_pool.old_stat.n_ra_pages_read)
+    / time_elapsed;
+
+  pool_info->pages_evicted_rate =
+    static_cast<double>(buf_pool.stat.n_ra_pages_evicted
+                        - buf_pool.old_stat.n_ra_pages_evicted)
+    / time_elapsed;
+
+  pool_info->unzip_lru_len = UT_LIST_GET_LEN(buf_pool.unzip_LRU);
+
+  pool_info->io_sum = buf_LRU_stat_sum.io;
+
+  pool_info->io_cur = buf_LRU_stat_cur.io;
+
+  pool_info->unzip_sum = buf_LRU_stat_sum.unzip;
+
+  pool_info->unzip_cur = buf_LRU_stat_cur.unzip;
+
+  buf_refresh_io_stats();
+  mysql_mutex_unlock(&buf_pool.mutex);
+}
+
+/*********************************************************************//**
+Prints info of the buffer i/o. */
+static
+void
+buf_print_io_instance(
+/*==================*/
+  buf_pool_info_t* pool_info, /*!< in: buffer pool info */
+  FILE*            file)      /*!< in/out: buffer where to print */
+{
+  ut_ad(pool_info);
+
+  fprintf(file,
+          "Buffer pool size " ULINTPF "\n"
+          "Free buffers " ULINTPF "\n"
+          "Database pages " ULINTPF "\n"
+          "Old database pages " ULINTPF "\n"
+          "Modified db pages " ULINTPF "\n"
+          "Percent of dirty pages(LRU & free pages): %.3f\n"
+          "Max dirty pages percent: %.3f\n"
+          "Pending reads " ULINTPF "\n"
+          "Pending writes: LRU " ULINTPF ", flush list " ULINTPF "\n",
+          pool_info->pool_size,
+          pool_info->free_list_len,
+          pool_info->lru_len,
+          pool_info->old_lru_len,
+          pool_info->flush_list_len,
+          static_cast<double>(pool_info->flush_list_len)
+          / (static_cast<double>(pool_info->lru_len
+                                 + pool_info->free_list_len) + 1.0)
+          * 100.0,
+          srv_max_buf_pool_modified_pct,
+          pool_info->n_pend_reads,
+          pool_info->n_pending_flush_lru,
+          pool_info->n_pending_flush_list);
+
+  fprintf(file,
+          "Pages made young " ULINTPF ", not young " ULINTPF "\n"
+          "%.2f youngs/s, %.2f non-youngs/s\n"
+          "Pages read " ULINTPF ", created " ULINTPF
+          ", written " ULINTPF "\n"
+          "%.2f reads/s, %.2f creates/s, %.2f writes/s\n",
+          pool_info->n_pages_made_young,
+          pool_info->n_pages_not_made_young,
+          pool_info->page_made_young_rate,
+          pool_info->page_not_made_young_rate,
+          pool_info->n_pages_read,
+          pool_info->n_pages_created,
+          pool_info->n_pages_written,
+          pool_info->pages_read_rate,
+          pool_info->pages_created_rate,
+          pool_info->pages_written_rate);
+
+  if (pool_info->n_page_get_delta) {
+    double hit_rate = static_cast<double>(
+      pool_info->page_read_delta)
+      / static_cast<double>(pool_info->n_page_get_delta);
+
+    if (hit_rate > 1) {
+      hit_rate = 1;
+    }
+
+    fprintf(file,
+            "Buffer pool hit rate " ULINTPF " / 1000,"
+            " young-making rate " ULINTPF " / 1000 not "
+            ULINTPF " / 1000\n",
+            ulint(1000 * (1 - hit_rate)),
+            ulint(1000
+                  * double(pool_info->young_making_delta)
+                  / double(pool_info->n_page_get_delta)),
+            ulint(1000 * double(pool_info->not_young_making_delta)
+                  / double(pool_info->n_page_get_delta)));
+  } else {
+    fputs("No buffer pool page gets since the last printout\n",
+          file);
+  }
+
+  /* Statistics about read ahead algorithm */
+  fprintf(file, "Pages read ahead %.2f/s,"
+          " evicted without access %.2f/s,"
+          " Random read ahead %.2f/s\n",
+          pool_info->pages_readahead_rate,
+          pool_info->pages_evicted_rate,
+          pool_info->pages_readahead_rnd_rate);
+
+  /* Print some values to help us with visualizing what is
+  happening with LRU eviction. */
+  fprintf(file,
+          "LRU len: " ULINTPF ", unzip_LRU len: " ULINTPF "\n"
+          "I/O sum[" ULINTPF "]:cur[" ULINTPF "], "
+          "unzip sum[" ULINTPF "]:cur[" ULINTPF "]\n",
+          pool_info->lru_len, pool_info->unzip_lru_len,
+          pool_info->io_sum, pool_info->io_cur,
+          pool_info->unzip_sum, pool_info->unzip_cur);
+}
+
+/*********************************************************************//**
+Prints info of the buffer i/o. */
+void
+buf_print_io(
+/*=========*/
+  FILE* file) /*!< in/out: buffer where to print */
+{
+  buf_pool_info_t pool_info;
+
+  buf_stats_get_pool_info(&pool_info);
+  buf_print_io_instance(&pool_info, file);
+}
+
+/** Verify that the post-encryption checksum matches the calculated
+checksum. This function should be called only if the tablespace
+contains crypt data metadata.
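+A hypothetical caller sketch (the names read_buf and space are assumed
+here for illustration, not taken from this file):
+  if (!buf_page_verify_crypt_checksum(read_buf, space->flags))
+    err= DB_CORRUPTION;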
+@param page page frame +@param fsp_flags contents of FSP_SPACE_FLAGS +@return whether the page is encrypted and valid */ +bool buf_page_verify_crypt_checksum(const byte *page, uint32_t fsp_flags) +{ + if (!fil_space_t::full_crc32(fsp_flags)) { + return fil_space_verify_crypt_checksum( + page, fil_space_t::zip_size(fsp_flags)); + } + + return !buf_page_is_corrupted(true, page, fsp_flags); +} + +/** Print the given page_id_t object. +@param[in,out] out the output stream +@param[in] page_id the page_id_t object to be printed +@return the output stream */ +std::ostream& operator<<(std::ostream &out, const page_id_t page_id) +{ + out << "[page id: space=" << page_id.space() + << ", page number=" << page_id.page_no() << "]"; + return out; +} +#endif /* !UNIV_INNOCHECKSUM */ diff --git a/storage/innobase/buf/buf0checksum.cc b/storage/innobase/buf/buf0checksum.cc new file mode 100644 index 00000000..662343ae --- /dev/null +++ b/storage/innobase/buf/buf0checksum.cc @@ -0,0 +1,98 @@ +/***************************************************************************** + +Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file buf/buf0checksum.cc +Buffer pool checksum functions, also linked from /extra/innochecksum.cc + +Created Aug 11, 2011 Vasil Dimov +*******************************************************/ + +#include "buf0checksum.h" +#include "fil0fil.h" +#include "ut0rnd.h" + +#ifndef UNIV_INNOCHECKSUM +#include "srv0srv.h" +#endif /* !UNIV_INNOCHECKSUM */ + +/** Calculate the CRC32 checksum of a page. The value is stored to the page +when it is written to a file and also checked for a match when reading from +the file. Note that we must be careful to calculate the same value on all +architectures. +@param[in] page buffer page (srv_page_size bytes) +@return CRC-32C */ +uint32_t buf_calc_page_crc32(const byte* page) +{ + /* Note: innodb_checksum_algorithm=crc32 could and should have + included the entire page in the checksum, and CRC-32 values + should be combined with the CRC-32 function, not with + exclusive OR. We stick to the current algorithm in order to + remain compatible with old data files. */ + return my_crc32c(0, page + FIL_PAGE_OFFSET, + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + - FIL_PAGE_OFFSET) + ^ my_crc32c(0, page + FIL_PAGE_DATA, + srv_page_size + - (FIL_PAGE_DATA + FIL_PAGE_END_LSN_OLD_CHKSUM)); +} + +#ifndef UNIV_INNOCHECKSUM +/** Calculate a checksum which is stored to the page when it is written +to a file. Note that we must be careful to calculate the same value on +32-bit and 64-bit architectures. 
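+As the note at buf_calc_page_old_checksum() below explains, a writer
+must store the new-formula value first, because the old formula reads
+the FIL_PAGE_SPACE_OR_CHKSUM field as input; an illustrative sketch:
+  mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
+                  buf_calc_page_new_checksum(page));
+  mach_write_to_4(page + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM,
+                  buf_calc_page_old_checksum(page));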
+@param[in] page file page (srv_page_size bytes)
+@return checksum */
+uint32_t
+buf_calc_page_new_checksum(const byte* page)
+{
+  ulint checksum;
+
+  /* Since the field FIL_PAGE_FILE_FLUSH_LSN, and in versions <= 4.1.x
+  FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, are written outside the buffer pool
+  to the first pages of data files, we have to skip them in the page
+  checksum calculation.
+  We must also skip the field FIL_PAGE_SPACE_OR_CHKSUM where the
+  checksum is stored, and also the last 8 bytes of page because
+  there we store the old formula checksum. */
+
+  checksum = ut_fold_binary(page + FIL_PAGE_OFFSET,
+                            FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION
+                            - FIL_PAGE_OFFSET)
+    + ut_fold_binary(page + FIL_PAGE_DATA,
+                     srv_page_size - FIL_PAGE_DATA
+                     - FIL_PAGE_END_LSN_OLD_CHKSUM);
+  return(static_cast<uint32_t>(checksum));
+}
+
+/** In MySQL before 4.0.14 or 4.1.1 there was an InnoDB bug that
+the checksum only looked at the first few bytes of the page.
+This calculates that old checksum.
+NOTE: we must first store the new formula checksum to
+FIL_PAGE_SPACE_OR_CHKSUM before calculating and storing this old checksum
+because this takes that field as an input!
+@param[in] page file page (srv_page_size bytes)
+@return checksum */
+uint32_t
+buf_calc_page_old_checksum(const byte* page)
+{
+  return(static_cast<uint32_t>
+         (ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION)));
+}
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc
new file mode 100644
index 00000000..e9aea355
--- /dev/null
+++ b/storage/innobase/buf/buf0dblwr.cc
@@ -0,0 +1,779 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0dblwr.cc
+Doublewrite buffer module
+
+Created 2011/12/19
+*******************************************************/
+
+#include "buf0dblwr.h"
+#include "buf0flu.h"
+#include "buf0checksum.h"
+#include "srv0start.h"
+#include "srv0srv.h"
+#include "page0zip.h"
+#include "trx0sys.h"
+#include "fil0crypt.h"
+#include "fil0pagecompress.h"
+
+using st_::span;
+
+/** The doublewrite buffer */
+buf_dblwr_t buf_dblwr;
+
+/** @return the TRX_SYS page */
+inline buf_block_t *buf_dblwr_trx_sys_get(mtr_t *mtr)
+{
+  return buf_page_get(page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO),
+                      0, RW_X_LATCH, mtr);
+}
+
+void buf_dblwr_t::init()
+{
+  if (!active_slot)
+  {
+    active_slot= &slots[0];
+    mysql_mutex_init(buf_dblwr_mutex_key, &mutex, nullptr);
+    pthread_cond_init(&cond, nullptr);
+  }
+}
+
+/** Initialise the persistent storage of the doublewrite buffer.
+@param header doublewrite page header in the TRX_SYS page */
+inline void buf_dblwr_t::init(const byte *header)
+{
+  ut_ad(!active_slot->first_free);
+  ut_ad(!active_slot->reserved);
+  ut_ad(!batch_running);
+
+  block1= page_id_t(0, mach_read_from_4(header + TRX_SYS_DOUBLEWRITE_BLOCK1));
+  block2= page_id_t(0, mach_read_from_4(header + TRX_SYS_DOUBLEWRITE_BLOCK2));
+
+  const uint32_t buf_size= 2 * block_size();
+  for (int i= 0; i < 2; i++)
+  {
+    slots[i].write_buf= static_cast<byte*>
+      (aligned_malloc(buf_size << srv_page_size_shift, srv_page_size));
+    slots[i].buf_block_arr= static_cast<element*>
+      (ut_zalloc_nokey(buf_size * sizeof(element)));
+  }
+  active_slot= &slots[0];
+}
+
+/** Create or restore the doublewrite buffer in the TRX_SYS page.
+@return whether the operation succeeded */
+bool buf_dblwr_t::create()
+{
+  if (is_created())
+    return true;
+
+  mtr_t mtr;
+  const ulint size= block_size();
+
+start_again:
+  mtr.start();
+
+  dberr_t err;
+  buf_block_t *trx_sys_block= buf_dblwr_trx_sys_get(&mtr);
+  if (!trx_sys_block)
+  {
+    mtr.commit();
+    return false;
+  }
+
+  if (mach_read_from_4(TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC +
+                       trx_sys_block->page.frame) ==
+      TRX_SYS_DOUBLEWRITE_MAGIC_N)
+  {
+    /* The doublewrite buffer has already been created: just read in
+    some numbers */
+    init(TRX_SYS_DOUBLEWRITE + trx_sys_block->page.frame);
+    mtr.commit();
+    return true;
+  }
+
+  if (UT_LIST_GET_FIRST(fil_system.sys_space->chain)->size < 3 * size)
+  {
+    ib::error() << "Cannot create doublewrite buffer: "
+                   "the first file in innodb_data_file_path must be at least "
+                << (3 * (size >> (20U - srv_page_size_shift))) << "M.";
+fail:
+    mtr.commit();
+    return false;
+  }
+  else
+  {
+    buf_block_t *b= fseg_create(fil_system.sys_space,
+                                TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG,
+                                &mtr, &err, false, trx_sys_block);
+    if (!b)
+    {
+      ib::error() << "Cannot create doublewrite buffer: " << err;
+      goto fail;
+    }
+
+    ib::info() << "Doublewrite buffer not found: creating new";
+
+    /* FIXME: After this point, the doublewrite buffer creation
+    is not atomic. The doublewrite buffer should not exist in
+    the InnoDB system tablespace file in the first place.
+    It could be located in separate optional file(s) in a
+    user-specified location. */
+  }
+
+  byte *fseg_header= TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG +
+    trx_sys_block->page.frame;
+  for (uint32_t prev_page_no= 0, i= 0, extent_size= FSP_EXTENT_SIZE;
+       i < 2 * size + extent_size / 2; i++)
+  {
+    buf_block_t *new_block=
+      fseg_alloc_free_page_general(fseg_header, prev_page_no + 1, FSP_UP,
+                                   false, &mtr, &mtr, &err);
+    if (!new_block)
+    {
+      ib::error() << "Cannot create doublewrite buffer: "
+                     " you must increase your tablespace size."
+                     " Cannot continue operation.";
+      /* This may essentially corrupt the doublewrite
+      buffer. However, usually the doublewrite buffer
+      is created at database initialization, and it
+      should not matter (just remove all newly created
+      InnoDB files and restart). */
+      mtr.commit();
+      return false;
+    }
+
+    /* We read the allocated pages to the buffer pool; when they are
+    written to disk in a flush, the space id and page number fields
+    are also written to the pages. When we at database startup read
+    pages from the doublewrite buffer, we know that if the space id
+    and page number in them are the same as the page position in the
+    tablespace, then the page has not been written to in
+    doublewrite.
*/ + + ut_ad(new_block->page.lock.not_recursive()); + const page_id_t id= new_block->page.id(); + /* We only do this in the debug build, to ensure that the check in + buf_flush_init_for_writing() will see a valid page type. The + flushes of new_block are actually unnecessary here. */ + ut_d(mtr.write<2>(*new_block, FIL_PAGE_TYPE + new_block->page.frame, + FIL_PAGE_TYPE_SYS)); + + if (i == size / 2) + { + ut_a(id.page_no() == size); + mtr.write<4>(*trx_sys_block, + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_BLOCK1 + + trx_sys_block->page.frame, id.page_no()); + mtr.write<4>(*trx_sys_block, + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_REPEAT + + TRX_SYS_DOUBLEWRITE_BLOCK1 + trx_sys_block->page.frame, + id.page_no()); + } + else if (i == size / 2 + size) + { + ut_a(id.page_no() == 2 * size); + mtr.write<4>(*trx_sys_block, + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_BLOCK2 + + trx_sys_block->page.frame, id.page_no()); + mtr.write<4>(*trx_sys_block, + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_REPEAT + + TRX_SYS_DOUBLEWRITE_BLOCK2 + trx_sys_block->page.frame, + id.page_no()); + } + else if (i > size / 2) + ut_a(id.page_no() == prev_page_no + 1); + + if (((i + 1) & 15) == 0) { + /* rw_locks can only be recursively x-locked 2048 times. (on 32 + bit platforms, (lint) 0 - (X_LOCK_DECR * 2049) is no longer a + negative number, and thus lock_word becomes like a shared lock). + For 4k page size this loop will lock the fseg header too many + times. Since this code is not done while any other threads are + active, restart the MTR occasionally. */ + mtr.commit(); + mtr.start(); + trx_sys_block= buf_dblwr_trx_sys_get(&mtr); + fseg_header= TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG + + trx_sys_block->page.frame; + } + + prev_page_no= id.page_no(); + } + + mtr.write<4>(*trx_sys_block, + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC + + trx_sys_block->page.frame, TRX_SYS_DOUBLEWRITE_MAGIC_N); + mtr.write<4>(*trx_sys_block, + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC + + TRX_SYS_DOUBLEWRITE_REPEAT + trx_sys_block->page.frame, + TRX_SYS_DOUBLEWRITE_MAGIC_N); + + mtr.write<4>(*trx_sys_block, + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED + + trx_sys_block->page.frame, + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N); + mtr.commit(); + + buf_flush_wait_flushed(mtr.commit_lsn()); + + /* Remove doublewrite pages from LRU */ + buf_pool_invalidate(); + goto start_again; +} + +/** Initialize the doublewrite buffer memory structure on recovery. +If we are upgrading from a version before MySQL 4.1, then this +function performs the necessary update operations to support +innodb_file_per_table. If we are in a crash recovery, this function +loads the pages from double write buffer into memory. 
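+An illustrative startup sketch (assuming the first system tablespace
+file is already open; the names file and path are placeholders):
+  if (buf_dblwr.init_or_load_pages(file, path) != DB_SUCCESS)
+    /* treat this as a fatal startup error */;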
+@param file File handle +@param path Path name of file +@return DB_SUCCESS or error code */ +dberr_t buf_dblwr_t::init_or_load_pages(pfs_os_file_t file, const char *path) +{ + ut_ad(this == &buf_dblwr); + const uint32_t size= block_size(); + + /* We do the file i/o past the buffer pool */ + byte *read_buf= static_cast(aligned_malloc(srv_page_size, + srv_page_size)); + /* Read the TRX_SYS header to check if we are using the doublewrite buffer */ + dberr_t err= os_file_read(IORequestRead, file, read_buf, + TRX_SYS_PAGE_NO << srv_page_size_shift, + srv_page_size, nullptr); + + if (err != DB_SUCCESS) + { + ib::error() << "Failed to read the system tablespace header page"; +func_exit: + aligned_free(read_buf); + return err; + } + + /* TRX_SYS_PAGE_NO is not encrypted see fil_crypt_rotate_page() */ + if (mach_read_from_4(TRX_SYS_DOUBLEWRITE_MAGIC + TRX_SYS_DOUBLEWRITE + + read_buf) != TRX_SYS_DOUBLEWRITE_MAGIC_N) + { + /* There is no doublewrite buffer initialized in the TRX_SYS page. + This should normally not be possible; the doublewrite buffer should + be initialized when creating the database. */ + err= DB_SUCCESS; + goto func_exit; + } + + init(TRX_SYS_DOUBLEWRITE + read_buf); + + const bool upgrade_to_innodb_file_per_table= + mach_read_from_4(TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED + + TRX_SYS_DOUBLEWRITE + read_buf) != + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N; + + auto write_buf= active_slot->write_buf; + /* Read the pages from the doublewrite buffer to memory */ + err= os_file_read(IORequestRead, file, write_buf, + block1.page_no() << srv_page_size_shift, + size << srv_page_size_shift, nullptr); + + if (err != DB_SUCCESS) + { + ib::error() << "Failed to read the first double write buffer extent"; + goto func_exit; + } + + err= os_file_read(IORequestRead, file, + write_buf + (size << srv_page_size_shift), + block2.page_no() << srv_page_size_shift, + size << srv_page_size_shift, nullptr); + if (err != DB_SUCCESS) + { + ib::error() << "Failed to read the second double write buffer extent"; + goto func_exit; + } + + byte *page= write_buf; + + if (UNIV_UNLIKELY(upgrade_to_innodb_file_per_table)) + { + ib::info() << "Resetting space id's in the doublewrite buffer"; + + for (ulint i= 0; i < size * 2; i++, page += srv_page_size) + { + memset(page + FIL_PAGE_SPACE_ID, 0, 4); + /* For pre-MySQL-4.1 innodb_checksum_algorithm=innodb, we do not need to + calculate new checksums for the pages because the field + .._SPACE_ID does not affect them. Write the page back to where + we read it from. */ + const ulint source_page_no= i < size + ? block1.page_no() + i + : block2.page_no() + i - size; + err= os_file_write(IORequestWrite, path, file, page, + source_page_no << srv_page_size_shift, srv_page_size); + if (err != DB_SUCCESS) + { + ib::error() << "Failed to upgrade the double write buffer"; + goto func_exit; + } + } + os_file_flush(file); + } + else + for (ulint i= 0; i < size * 2; i++, page += srv_page_size) + if (mach_read_from_8(my_assume_aligned<8>(page + FIL_PAGE_LSN))) + /* Each valid page header must contain a nonzero FIL_PAGE_LSN field. */ + recv_sys.dblwr.add(page); + + err= DB_SUCCESS; + goto func_exit; +} + +/** Process and remove the double write buffer pages for all tablespaces. 
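+Only a copy whose FIL_PAGE_LSN lies in the window
+  log_sys.last_checkpoint_lsn <= lsn <= recv_sys.scanned_lsn
+is considered (see the checks below): an older copy predates the latest
+checkpoint, and a newer one would come from the future.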
*/
+void buf_dblwr_t::recover()
+{
+  ut_ad(log_sys.last_checkpoint_lsn);
+  if (!is_created())
+    return;
+
+  uint32_t page_no_dblwr= 0;
+  byte *read_buf= static_cast<byte*>(aligned_malloc(3 * srv_page_size,
+                                                    srv_page_size));
+  byte *const buf= read_buf + srv_page_size;
+
+  for (recv_dblwr_t::list::iterator i= recv_sys.dblwr.pages.begin();
+       i != recv_sys.dblwr.pages.end(); ++i, ++page_no_dblwr)
+  {
+    byte *page= *i;
+    const uint32_t page_no= page_get_page_no(page);
+    if (!page_no) /* recovered via recv_dblwr_t::restore_first_page() */
+      continue;
+
+    const lsn_t lsn= mach_read_from_8(page + FIL_PAGE_LSN);
+    if (log_sys.last_checkpoint_lsn > lsn)
+      /* Pages written before the checkpoint are not useful for recovery. */
+      continue;
+    const uint32_t space_id= page_get_space_id(page);
+    const page_id_t page_id(space_id, page_no);
+
+    if (recv_sys.scanned_lsn < lsn)
+    {
+      ib::info() << "Ignoring a doublewrite copy of page " << page_id
+                 << " with future log sequence number " << lsn;
+      continue;
+    }
+
+    fil_space_t *space= fil_space_t::get(space_id);
+
+    if (!space)
+      /* The tablespace that this page once belonged to does not exist */
+      continue;
+
+    if (UNIV_UNLIKELY(page_no >= space->get_size()))
+    {
+      /* Do not report the warning for undo tablespaces, because they
+      can be truncated in place. */
+      if (!srv_is_undo_tablespace(space_id))
+        ib::warn() << "A copy of page " << page_no
+                   << " in the doublewrite buffer slot " << page_no_dblwr
+                   << " is beyond the end of " << space->chain.start->name
+                   << " (" << space->size << " pages)";
+next_page:
+      space->release();
+      continue;
+    }
+
+    const ulint physical_size= space->physical_size();
+    ut_ad(!buf_is_zeroes(span<const byte>(page, physical_size)));
+
+    /* We want to ensure that for partial reads the unread portion of
+    the page is NUL. */
+    memset(read_buf, 0x0, physical_size);
+
+    /* Read in the actual page from the file */
+    fil_io_t fio= space->io(IORequest(IORequest::DBLWR_RECOVER),
+                            os_offset_t{page_no} * physical_size,
+                            physical_size, read_buf);
+
+    if (UNIV_UNLIKELY(fio.err != DB_SUCCESS))
+    {
+      ib::warn() << "Double write buffer recovery: " << page_id
+                 << " ('" << space->chain.start->name
+                 << "') read failed with error: " << fio.err;
+      continue;
+    }
+
+    if (buf_is_zeroes(span<const byte>(read_buf, physical_size)))
+    {
+      /* We will check if the copy in the doublewrite buffer is
+      valid. If not, we will ignore this page (there should be redo
+      log records to initialize it). */
+    }
+    else if (recv_sys.dblwr.validate_page(page_id, read_buf, space, buf))
+      goto next_page;
+    else
+      /* We intentionally skip this message for all-zero pages. */
+      ib::info() << "Trying to recover page " << page_id
+                 << " from the doublewrite buffer.";
+
+    page= recv_sys.dblwr.find_page(page_id, space, buf);
+
+    if (!page)
+      goto next_page;
+
+    /* Write the good page from the doublewrite buffer to the intended
+    position. */
+    space->reacquire();
+    fio= space->io(IORequestWrite,
+                   os_offset_t{page_id.page_no()} * physical_size,
+                   physical_size, page);
+
+    if (fio.err == DB_SUCCESS)
+      ib::info() << "Recovered page " << page_id << " to '" << fio.node->name
+                 << "' from the doublewrite buffer.";
+    goto next_page;
+  }
+
+  recv_sys.dblwr.pages.clear();
+  fil_flush_file_spaces();
+  aligned_free(read_buf);
+}
+
+/** Free the doublewrite buffer.
*/ +void buf_dblwr_t::close() +{ + if (!active_slot) + return; + + ut_ad(!active_slot->reserved); + ut_ad(!active_slot->first_free); + ut_ad(!batch_running); + + pthread_cond_destroy(&cond); + for (int i= 0; i < 2; i++) + { + aligned_free(slots[i].write_buf); + ut_free(slots[i].buf_block_arr); + } + mysql_mutex_destroy(&mutex); + + memset((void*) this, 0, sizeof *this); +} + +/** Update the doublewrite buffer on write completion. */ +void buf_dblwr_t::write_completed() +{ + ut_ad(this == &buf_dblwr); + ut_ad(!srv_read_only_mode); + + mysql_mutex_lock(&mutex); + + ut_ad(is_created()); + ut_ad(srv_use_doublewrite_buf); + ut_ad(batch_running); + slot *flush_slot= active_slot == &slots[0] ? &slots[1] : &slots[0]; + ut_ad(flush_slot->reserved); + ut_ad(flush_slot->reserved <= flush_slot->first_free); + + if (!--flush_slot->reserved) + { + mysql_mutex_unlock(&mutex); + /* This will finish the batch. Sync data files to the disk. */ + fil_flush_file_spaces(); + mysql_mutex_lock(&mutex); + + /* We can now reuse the doublewrite memory buffer: */ + flush_slot->first_free= 0; + batch_running= false; + pthread_cond_broadcast(&cond); + } + + mysql_mutex_unlock(&mutex); +} + +#ifdef UNIV_DEBUG +/** Check the LSN values on the page. +@param[in] page page to check +@param[in] s tablespace */ +static void buf_dblwr_check_page_lsn(const page_t* page, const fil_space_t& s) +{ + /* Ignore page_compressed or encrypted pages */ + if (s.is_compressed() || buf_page_get_key_version(page, s.flags)) + return; + const byte* lsn_start= FIL_PAGE_LSN + 4 + page; + const byte* lsn_end= page + srv_page_size - + (s.full_crc32() + ? FIL_PAGE_FCRC32_END_LSN + : FIL_PAGE_END_LSN_OLD_CHKSUM - 4); + static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "alignment"); + static_assert(FIL_PAGE_LSN % 4 == 0, "alignment"); + ut_ad(!memcmp_aligned<4>(lsn_start, lsn_end, 4)); +} + +static void buf_dblwr_check_page_lsn(const buf_page_t &b, const byte *page) +{ + if (fil_space_t *space= fil_space_t::get_for_write(b.id().space())) + { + buf_dblwr_check_page_lsn(page, *space); + space->release(); + } +} + +/** Check the LSN values on the page with which this block is associated. */ +static void buf_dblwr_check_block(const buf_page_t *bpage) +{ + ut_ad(bpage->in_file()); + const page_t *page= bpage->frame; + ut_ad(page); + + switch (fil_page_get_type(page)) { + case FIL_PAGE_INDEX: + case FIL_PAGE_TYPE_INSTANT: + case FIL_PAGE_RTREE: + if (page_is_comp(page)) + { + if (page_simple_validate_new(page)) + return; + } + else if (page_simple_validate_old(page)) + return; + /* While it is possible that this is not an index page but just + happens to have wrongly set FIL_PAGE_TYPE, such pages should never + be modified to without also adjusting the page type during page + allocation or buf_flush_init_for_writing() or + fil_block_reset_type(). */ + buf_page_print(page); + + ib::fatal() << "Apparent corruption of an index page " << bpage->id() + << " to be written to data file. We intentionally crash" + " the server to prevent corrupt data from ending up in" + " data files."; + } +} +#endif /* UNIV_DEBUG */ + +bool buf_dblwr_t::flush_buffered_writes(const ulint size) +{ + mysql_mutex_assert_owner(&mutex); + ut_ad(size == block_size()); + + for (;;) + { + if (!active_slot->first_free) + return false; + if (!batch_running) + break; + my_cond_wait(&cond, &mutex.m_mutex); + } + + ut_ad(active_slot->reserved == active_slot->first_free); + ut_ad(!flushing_buffered_writes); + + /* Disallow anyone else to start another batch of flushing. 
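+The two slots form a double buffer; the batch life cycle, as a sketch:
+  add_to_batch() fills active_slot;
+  flush_buffered_writes() swaps the slots and submits flush_slot
+  via os_aio();
+  flush_buffered_writes_completed() then writes the data pages;
+  write_completed() finally resets first_free so the slot can be reused.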
*/ + slot *flush_slot= active_slot; + /* Switch the active slot */ + active_slot= active_slot == &slots[0] ? &slots[1] : &slots[0]; + ut_a(active_slot->first_free == 0); + batch_running= true; + const ulint old_first_free= flush_slot->first_free; + auto write_buf= flush_slot->write_buf; + const bool multi_batch= block1 + static_cast(size) != block2 && + old_first_free > size; + flushing_buffered_writes= 1 + multi_batch; + /* Now safe to release the mutex. */ + mysql_mutex_unlock(&mutex); +#ifdef UNIV_DEBUG + for (ulint len2= 0, i= 0; i < old_first_free; len2 += srv_page_size, i++) + { + buf_page_t *bpage= flush_slot->buf_block_arr[i].request.bpage; + + if (bpage->zip.data) + /* No simple validate for ROW_FORMAT=COMPRESSED pages exists. */ + continue; + + /* Check that the actual page in the buffer pool is not corrupt + and the LSN values are sane. */ + buf_dblwr_check_block(bpage); + ut_d(buf_dblwr_check_page_lsn(*bpage, write_buf + len2)); + } +#endif /* UNIV_DEBUG */ + const IORequest request{nullptr, nullptr, fil_system.sys_space->chain.start, + IORequest::DBLWR_BATCH}; + ut_a(fil_system.sys_space->acquire()); + if (multi_batch) + { + fil_system.sys_space->reacquire(); + os_aio(request, write_buf, + os_offset_t{block1.page_no()} << srv_page_size_shift, + size << srv_page_size_shift); + os_aio(request, write_buf + (size << srv_page_size_shift), + os_offset_t{block2.page_no()} << srv_page_size_shift, + (old_first_free - size) << srv_page_size_shift); + } + else + os_aio(request, write_buf, + os_offset_t{block1.page_no()} << srv_page_size_shift, + old_first_free << srv_page_size_shift); + return true; +} + +static void *get_frame(const IORequest &request) +{ + if (request.slot) + return request.slot->out_buf; + const buf_page_t *bpage= request.bpage; + return bpage->zip.data ? bpage->zip.data : bpage->frame; +} + +void buf_dblwr_t::flush_buffered_writes_completed(const IORequest &request) +{ + ut_ad(this == &buf_dblwr); + ut_ad(srv_use_doublewrite_buf); + ut_ad(is_created()); + ut_ad(!srv_read_only_mode); + ut_ad(!request.bpage); + ut_ad(request.node == fil_system.sys_space->chain.start); + ut_ad(request.type == IORequest::DBLWR_BATCH); + mysql_mutex_lock(&mutex); + ut_ad(batch_running); + ut_ad(flushing_buffered_writes); + ut_ad(flushing_buffered_writes <= 2); + writes_completed++; + if (UNIV_UNLIKELY(--flushing_buffered_writes)) + { + mysql_mutex_unlock(&mutex); + return; + } + + slot *const flush_slot= active_slot == &slots[0] ? &slots[1] : &slots[0]; + ut_ad(flush_slot->reserved == flush_slot->first_free); + /* increment the doublewrite flushed pages counter */ + pages_written+= flush_slot->first_free; + mysql_mutex_unlock(&mutex); + + /* Now flush the doublewrite buffer data to disk */ + fil_system.sys_space->flush(); + + /* The writes have been flushed to disk now and in recovery we will + find them in the doublewrite buffer blocks. Next, write the data pages. 
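+This ordering is the doublewrite invariant: a data page is overwritten
+in place only after its copy is durable in the doublewrite area, so a
+torn in-place write can always be repaired by buf_dblwr_t::recover().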
*/
+  for (ulint i= 0, first_free= flush_slot->first_free; i < first_free; i++)
+  {
+    auto e= flush_slot->buf_block_arr[i];
+    buf_page_t* bpage= e.request.bpage;
+    ut_ad(bpage->in_file());
+
+    void *frame= get_frame(e.request);
+    ut_ad(frame);
+
+    auto e_size= e.size;
+
+    if (UNIV_LIKELY_NULL(bpage->zip.data))
+    {
+      e_size= bpage->zip_size();
+      ut_ad(e_size);
+    }
+    else
+    {
+      ut_ad(!bpage->zip_size());
+      ut_d(buf_dblwr_check_page_lsn(*bpage, static_cast<const byte*>(frame)));
+    }
+
+    const lsn_t lsn= mach_read_from_8(my_assume_aligned<8>
+                                      (FIL_PAGE_LSN +
+                                       static_cast<const byte*>(frame)));
+    ut_ad(lsn);
+    ut_ad(lsn >= bpage->oldest_modification());
+    log_write_up_to(lsn, true);
+    e.request.node->space->io(e.request, bpage->physical_offset(), e_size,
+                              frame, bpage);
+  }
+}
+
+/** Flush possible buffered writes to persistent storage.
+It is very important to call this function after a batch of writes has been
+posted, and also when we may have to wait for a page latch!
+Otherwise a deadlock of threads can occur. */
+void buf_dblwr_t::flush_buffered_writes()
+{
+  if (!is_created() || !srv_use_doublewrite_buf)
+  {
+    fil_flush_file_spaces();
+    return;
+  }
+
+  ut_ad(!srv_read_only_mode);
+  const ulint size= block_size();
+
+  mysql_mutex_lock(&mutex);
+  if (!flush_buffered_writes(size))
+    mysql_mutex_unlock(&mutex);
+}
+
+/** Schedule a page write. If the doublewrite memory buffer is full,
+flush_buffered_writes() will be invoked to make space.
+@param request asynchronous write request
+@param size    payload size in bytes */
+void buf_dblwr_t::add_to_batch(const IORequest &request, size_t size)
+{
+  ut_ad(request.is_async());
+  ut_ad(request.is_write());
+  ut_ad(request.bpage);
+  ut_ad(request.bpage->in_file());
+  ut_ad(request.node);
+  ut_ad(request.node->space->purpose == FIL_TYPE_TABLESPACE);
+  ut_ad(request.node->space->id == request.bpage->id().space());
+  ut_ad(request.node->space->referenced());
+  ut_ad(!srv_read_only_mode);
+
+  const ulint buf_size= 2 * block_size();
+
+  mysql_mutex_lock(&mutex);
+
+  for (;;)
+  {
+    ut_ad(active_slot->first_free <= buf_size);
+    if (active_slot->first_free != buf_size)
+      break;
+
+    if (flush_buffered_writes(buf_size / 2))
+      mysql_mutex_lock(&mutex);
+  }
+
+  byte *p= active_slot->write_buf + srv_page_size * active_slot->first_free;
+
+  /* "frame" is at least 1024-byte aligned for ROW_FORMAT=COMPRESSED pages,
+  and at least srv_page_size (4096-byte) for everything else. */
+  memcpy_aligned<UNIV_ZIP_SIZE_MIN>(p, get_frame(request), size);
+  /* fil_page_compress() for page_compressed guarantees 256-byte alignment */
+  memset_aligned<256>(p + size, 0, srv_page_size - size);
+  /* FIXME: Inform the compiler that "size" and "srv_page_size - size"
+  are integer multiples of 256, so the above can translate into simple
+  SIMD instructions. Currently, we make no such assumptions about the
+  non-pointer parameters that are passed to the _aligned templates.
*/
+  ut_ad(!request.bpage->zip_size() || request.bpage->zip_size() == size);
+  ut_ad(active_slot->reserved == active_slot->first_free);
+  ut_ad(active_slot->reserved < buf_size);
+  new (active_slot->buf_block_arr + active_slot->first_free++)
+    element{request, size};
+  active_slot->reserved= active_slot->first_free;
+
+  if (active_slot->first_free != buf_size ||
+      !flush_buffered_writes(buf_size / 2))
+    mysql_mutex_unlock(&mutex);
+}
diff --git a/storage/innobase/buf/buf0dump.cc b/storage/innobase/buf/buf0dump.cc
new file mode 100644
index 00000000..957632db
--- /dev/null
+++ b/storage/innobase/buf/buf0dump.cc
@@ -0,0 +1,765 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0dump.cc
+Implements a buffer pool dump/load.
+
+Created April 08, 2011 Vasil Dimov
+*******************************************************/
+
+#include "my_global.h"
+#include "mysqld.h"
+#include "my_sys.h"
+
+#include "mysql/psi/mysql_stage.h"
+#include "mysql/psi/psi.h"
+
+#include "buf0rea.h"
+#include "buf0dump.h"
+#include "dict0dict.h"
+#include "os0file.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "ut0byte.h"
+
+#include <algorithm>
+
+#include "mysql/service_wsrep.h" /* wsrep_recovery */
+#include <my_service_manager.h>
+
+static void buf_do_load_dump();
+
+enum status_severity {
+  STATUS_INFO,
+  STATUS_ERR
+};
+
+#define SHUTTING_DOWN() (srv_shutdown_state != SRV_SHUTDOWN_NONE)
+
+/* Flags that tell the buffer pool dump/load thread which action it
+should take after being woken up. */
+static volatile bool buf_dump_should_start;
+static volatile bool buf_load_should_start;
+
+static bool buf_load_abort_flag;
+
+/** Start the buffer pool dump/load task and instruct it to start a dump. */
+void buf_dump_start()
+{
+  buf_dump_should_start= true;
+  buf_do_load_dump();
+}
+
+/** Start the buffer pool dump/load task and instruct it to start a load. */
+void buf_load_start()
+{
+  buf_load_should_start= true;
+  buf_do_load_dump();
+}
+
+/*****************************************************************//**
+Sets the global variable that feeds MySQL's innodb_buffer_pool_dump_status
+to the specified string. The format and the following parameters are the
+same as the ones used for printf(3).
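+For example (this exact call appears in buf_dump() below):
+  buf_dump_status(STATUS_INFO, "Dumping buffer pool(s) to %s",
+                  full_filename);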
The value of this variable can be +retrieved by: +SELECT variable_value FROM information_schema.global_status WHERE +variable_name = 'INNODB_BUFFER_POOL_DUMP_STATUS'; +or by: +SHOW STATUS LIKE 'innodb_buffer_pool_dump_status'; */ +static MY_ATTRIBUTE((nonnull, format(printf, 2, 3))) +void +buf_dump_status( +/*============*/ + enum status_severity severity,/*!< in: status severity */ + const char* fmt, /*!< in: format */ + ...) /*!< in: extra parameters according + to fmt */ +{ + va_list ap; + + va_start(ap, fmt); + + vsnprintf( + export_vars.innodb_buffer_pool_dump_status, + sizeof(export_vars.innodb_buffer_pool_dump_status), + fmt, ap); + + switch (severity) { + case STATUS_INFO: + ib::info() << export_vars.innodb_buffer_pool_dump_status; + break; + + case STATUS_ERR: + ib::error() << export_vars.innodb_buffer_pool_dump_status; + break; + } + + va_end(ap); +} + +/*****************************************************************//** +Sets the global variable that feeds MySQL's innodb_buffer_pool_load_status +to the specified string. The format and the following parameters are the +same as the ones used for printf(3). The value of this variable can be +retrieved by: +SELECT variable_value FROM information_schema.global_status WHERE +variable_name = 'INNODB_BUFFER_POOL_LOAD_STATUS'; +or by: +SHOW STATUS LIKE 'innodb_buffer_pool_load_status'; */ +static MY_ATTRIBUTE((nonnull, format(printf, 2, 3))) +void +buf_load_status( +/*============*/ + enum status_severity severity,/*!< in: status severity */ + const char* fmt, /*!< in: format */ + ...) /*!< in: extra parameters according to fmt */ +{ + va_list ap; + + va_start(ap, fmt); + + vsnprintf( + export_vars.innodb_buffer_pool_load_status, + sizeof(export_vars.innodb_buffer_pool_load_status), + fmt, ap); + + switch (severity) { + case STATUS_INFO: + ib::info() << export_vars.innodb_buffer_pool_load_status; + break; + + case STATUS_ERR: + ib::error() << export_vars.innodb_buffer_pool_load_status; + break; + } + + va_end(ap); +} + +/** Returns the directory path where the buffer pool dump file will be created. +@return directory path */ +static +const char* +get_buf_dump_dir() +{ + const char* dump_dir; + + /* The dump file should be created in the default data directory if + innodb_data_home_dir is set as an empty string. */ + if (!*srv_data_home) { + dump_dir = fil_path_to_mysql_datadir; + } else { + dump_dir = srv_data_home; + } + + return(dump_dir); +} + +/** Generate the path to the buffer pool dump/load file. +@param[out] path generated path +@param[in] path_size size of 'path', used as in snprintf(3). */ +static void buf_dump_generate_path(char *path, size_t path_size) +{ + char buf[FN_REFLEN]; + + mysql_mutex_lock(&LOCK_global_system_variables); + snprintf(buf, sizeof buf, "%s/%s", get_buf_dump_dir(), + srv_buf_dump_filename); + mysql_mutex_unlock(&LOCK_global_system_variables); + + os_file_type_t type; + bool exists = false; + bool ret; + + ret = os_file_status(buf, &exists, &type); + + /* For realpath() to succeed the file must exist. */ + + if (ret && exists) { + /* my_realpath() assumes the destination buffer is big enough + to hold FN_REFLEN bytes. */ + ut_a(path_size >= FN_REFLEN); + + my_realpath(path, buf, 0); + } else { + /* If it does not exist, then resolve only srv_data_home + and append srv_buf_dump_filename to it. 
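+For example (illustrative, with the default
+innodb_buffer_pool_filename of "ib_buffer_pool"): a srv_data_home of
+"/var/lib/mysql" resolves to "/var/lib/mysql/ib_buffer_pool".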
*/ + char srv_data_home_full[FN_REFLEN]; + + my_realpath(srv_data_home_full, get_buf_dump_dir(), 0); + const char *format; + + switch (srv_data_home_full[strlen(srv_data_home_full) - 1]) { +#ifdef _WIN32 + case '\\': +#endif + case '/': + format = "%s%s"; + break; + default: + format = "%s/%s"; + } + + snprintf(path, path_size, format, + srv_data_home_full, srv_buf_dump_filename); + } +} + + +/*****************************************************************//** +Perform a buffer pool dump into the file specified by +innodb_buffer_pool_filename. If any errors occur then the value of +innodb_buffer_pool_dump_status will be set accordingly, see buf_dump_status(). +The dump filename can be specified by (relative to srv_data_home): +SET GLOBAL innodb_buffer_pool_filename='filename'; */ +static +void +buf_dump( +/*=====*/ + ibool obey_shutdown) /*!< in: quit if we are in a shutting down + state */ +{ +#define SHOULD_QUIT() (SHUTTING_DOWN() && obey_shutdown) + + char full_filename[OS_FILE_MAX_PATH]; + char tmp_filename[OS_FILE_MAX_PATH + sizeof "incomplete"]; + char now[32]; + FILE* f; + int ret; + + buf_dump_generate_path(full_filename, sizeof(full_filename)); + + snprintf(tmp_filename, sizeof(tmp_filename), + "%s.incomplete", full_filename); + + buf_dump_status(STATUS_INFO, "Dumping buffer pool(s) to %s", + full_filename); + +#ifdef _WIN32 + /* use my_fopen() for correct permissions during bootstrap*/ + f = my_fopen(tmp_filename, O_RDWR|O_TRUNC|O_CREAT, 0); +#elif defined(__GLIBC__) || O_CLOEXEC == 0 + f = fopen(tmp_filename, "w" STR_O_CLOEXEC); +#else + { + int fd; + fd = open(tmp_filename, O_CREAT | O_TRUNC | O_CLOEXEC | O_WRONLY, 0640); + if (fd >= 0) { + f = fdopen(fd, "w"); + } + else { + f = NULL; + } + } +#endif + if (f == NULL) { + buf_dump_status(STATUS_ERR, + "Cannot open '%s' for writing: %s", + tmp_filename, strerror(errno)); + return; + } + const buf_page_t* bpage; + page_id_t* dump; + ulint n_pages; + ulint j; + + mysql_mutex_lock(&buf_pool.mutex); + + n_pages = UT_LIST_GET_LEN(buf_pool.LRU); + + /* skip empty buffer pools */ + if (n_pages == 0) { + mysql_mutex_unlock(&buf_pool.mutex); + goto done; + } + + if (srv_buf_pool_dump_pct != 100) { + ulint t_pages; + + /* limit the number of total pages dumped to X% of the + total number of pages */ + t_pages = buf_pool.curr_size * srv_buf_pool_dump_pct / 100; + if (n_pages > t_pages) { + buf_dump_status(STATUS_INFO, + "Restricted to " ULINTPF + " pages due to " + "innodb_buf_pool_dump_pct=%lu", + t_pages, srv_buf_pool_dump_pct); + n_pages = t_pages; + } + + if (n_pages == 0) { + n_pages = 1; + } + } + + dump = static_cast(ut_malloc_nokey( + n_pages * sizeof(*dump))); + + if (dump == NULL) { + std::ostringstream str_bytes; + mysql_mutex_unlock(&buf_pool.mutex); + fclose(f); + str_bytes << ib::bytes_iec{n_pages * sizeof(*dump)}; + buf_dump_status(STATUS_ERR, + "Cannot allocate %s: %s", + str_bytes.str().c_str(), + strerror(errno)); + /* leave tmp_filename to exist */ + return; + } + + for (bpage = UT_LIST_GET_FIRST(buf_pool.LRU), j = 0; + bpage != NULL && j < n_pages; + bpage = UT_LIST_GET_NEXT(LRU, bpage)) { + const auto status = bpage->state(); + if (status < buf_page_t::UNFIXED) { + ut_a(status >= buf_page_t::FREED); + continue; + } + const page_id_t id{bpage->id()}; + + if (id.space() == SRV_TMP_SPACE_ID) { + /* Ignore the innodb_temporary tablespace. 
*/ + continue; + } + + dump[j++] = id; + } + + mysql_mutex_unlock(&buf_pool.mutex); + + ut_a(j <= n_pages); + n_pages = j; + + for (j = 0; j < n_pages && !SHOULD_QUIT(); j++) { + ret = fprintf(f, "%u,%u\n", + dump[j].space(), dump[j].page_no()); + if (ret < 0) { + ut_free(dump); + fclose(f); + buf_dump_status(STATUS_ERR, + "Cannot write to '%s': %s", + tmp_filename, strerror(errno)); + /* leave tmp_filename to exist */ + return; + } + if (SHUTTING_DOWN() && !(j & 1023)) { + service_manager_extend_timeout( + INNODB_EXTEND_TIMEOUT_INTERVAL, + "Dumping buffer pool page " + ULINTPF "/" ULINTPF, j + 1, n_pages); + } + } + + ut_free(dump); + +done: + ret = IF_WIN(my_fclose(f,0),fclose(f)); + if (ret != 0) { + buf_dump_status(STATUS_ERR, + "Cannot close '%s': %s", + tmp_filename, strerror(errno)); + return; + } + /* else */ + + ret = unlink(full_filename); + if (ret != 0 && errno != ENOENT) { + buf_dump_status(STATUS_ERR, + "Cannot delete '%s': %s", + full_filename, strerror(errno)); + /* leave tmp_filename to exist */ + return; + } + /* else */ + + ret = rename(tmp_filename, full_filename); + if (ret != 0) { + buf_dump_status(STATUS_ERR, + "Cannot rename '%s' to '%s': %s", + tmp_filename, full_filename, + strerror(errno)); + /* leave tmp_filename to exist */ + return; + } + /* else */ + + /* success */ + + ut_sprintf_timestamp(now); + + buf_dump_status(STATUS_INFO, + "Buffer pool(s) dump completed at %s", now); + + /* Though dumping doesn't related to an incomplete load, + we reset this to 0 here to indicate that a shutdown can also perform + a dump */ + export_vars.innodb_buffer_pool_load_incomplete = 0; +} + +/*****************************************************************//** +Perform a buffer pool load from the file specified by +innodb_buffer_pool_filename. If any errors occur then the value of +innodb_buffer_pool_load_status will be set accordingly, see buf_load_status(). +The dump filename can be specified by (relative to srv_data_home): +SET GLOBAL innodb_buffer_pool_filename='filename'; */ +static +void +buf_load() +/*======*/ +{ + char full_filename[OS_FILE_MAX_PATH]; + char now[32]; + FILE* f; + page_id_t* dump; + ulint dump_n; + ulint i; + uint32_t space_id; + uint32_t page_no; + int fscanf_ret; + + /* Ignore any leftovers from before */ + buf_load_abort_flag = false; + + buf_dump_generate_path(full_filename, sizeof(full_filename)); + + buf_load_status(STATUS_INFO, + "Loading buffer pool(s) from %s", full_filename); + + f = fopen(full_filename, "r" STR_O_CLOEXEC); + if (f == NULL) { + buf_load_status(STATUS_INFO, + "Cannot open '%s' for reading: %s", + full_filename, strerror(errno)); + return; + } + /* else */ + + /* First scan the file to estimate how many entries are in it. + This file is tiny (approx 500KB per 1GB buffer pool), reading it + two times is fine. */ + dump_n = 0; + while (fscanf(f, "%u,%u", &space_id, &page_no) == 2 + && !SHUTTING_DOWN()) { + dump_n++; + } + + if (!SHUTTING_DOWN() && !feof(f)) { + /* fscanf() returned != 2 */ + const char* what; + if (ferror(f)) { + what = "reading"; + } else { + what = "parsing"; + } + fclose(f); + buf_load_status(STATUS_ERR, "Error %s '%s'," + " unable to load buffer pool (stage 1)", + what, full_filename); + return; + } + + /* If dump is larger than the buffer pool(s), then we ignore the + extra trailing. This could happen if a dump is made, then buffer + pool is shrunk and then load is attempted. 
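+Each record in the file is a plain-text "space_id,page_no" pair, one
+per line, e.g. (values illustrative):
+  0,7
+  5,1
+so the clamp below simply ignores any trailing records.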
*/ + dump_n = std::min(dump_n, buf_pool.get_n_pages()); + + if (dump_n != 0) { + dump = static_cast(ut_malloc_nokey( + dump_n * sizeof(*dump))); + } else { + fclose(f); + ut_sprintf_timestamp(now); + buf_load_status(STATUS_INFO, + "Buffer pool(s) load completed at %s" + " (%s was empty)", now, full_filename); + return; + } + + if (dump == NULL) { + std::ostringstream str_bytes; + fclose(f); + str_bytes << ib::bytes_iec{dump_n * sizeof(*dump)}; + buf_dump_status(STATUS_ERR, + "Cannot allocate %s: %s", + str_bytes.str().c_str(), + strerror(errno)); + /* leave tmp_filename to exist */ + return; + } + + rewind(f); + + export_vars.innodb_buffer_pool_load_incomplete = 1; + + for (i = 0; i < dump_n && !SHUTTING_DOWN(); i++) { + fscanf_ret = fscanf(f, "%u,%u", &space_id, &page_no); + + if (fscanf_ret != 2) { + if (feof(f)) { + break; + } + /* else */ + + ut_free(dump); + fclose(f); + buf_load_status(STATUS_ERR, + "Error parsing '%s', unable" + " to load buffer pool (stage 2)", + full_filename); + return; + } + + if (space_id > ULINT32_MASK || page_no > ULINT32_MASK) { + ut_free(dump); + fclose(f); + buf_load_status(STATUS_ERR, + "Error parsing '%s': bogus" + " space,page %u,%u at line " ULINTPF + ", unable to load buffer pool", + full_filename, + space_id, page_no, + i); + return; + } + + dump[i] = page_id_t(space_id, page_no); + } + + /* Set dump_n to the actual number of initialized elements, + i could be smaller than dump_n here if the file got truncated after + we read it the first time. */ + dump_n = i; + + fclose(f); + + if (dump_n == 0) { + ut_free(dump); + ut_sprintf_timestamp(now); + buf_load_status(STATUS_INFO, + "Buffer pool(s) load completed at %s" + " (%s was empty or had errors)", now, full_filename); + return; + } + + if (!SHUTTING_DOWN()) { + std::sort(dump, dump + dump_n); + } + + /* Avoid calling the expensive fil_space_t::get() for each + page within the same tablespace. dump[] is sorted by (space, page), + so all pages from a given tablespace are consecutive. */ + uint32_t cur_space_id = dump[0].space(); + fil_space_t* space = fil_space_t::get(cur_space_id); + ulint zip_size = space ? space->zip_size() : 0; + + PSI_stage_progress* pfs_stage_progress __attribute__((unused)) + = mysql_set_stage(srv_stage_buffer_pool_load.m_key); + mysql_stage_set_work_estimated(pfs_stage_progress, dump_n); + mysql_stage_set_work_completed(pfs_stage_progress, 0); + + for (i = 0; i < dump_n && !SHUTTING_DOWN(); i++) { + + /* space_id for this iteration of the loop */ + const uint32_t this_space_id = dump[i].space(); + + if (this_space_id >= SRV_SPACE_ID_UPPER_BOUND) { + continue; + } + + if (this_space_id != cur_space_id) { + if (space) { + space->release(); + } + + cur_space_id = this_space_id; + space = fil_space_t::get(cur_space_id); + + if (!space) { + continue; + } + + zip_size = space->zip_size(); + } + + /* JAN: TODO: As we use background page read below, + if tablespace is encrypted we cant use it. 
*/ + if (!space || dump[i].page_no() >= space->get_size() || + (space->crypt_data && + space->crypt_data->encryption != FIL_ENCRYPTION_OFF && + space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED)) { + continue; + } + + if (space->is_stopping()) { + space->release(); + space = nullptr; + continue; + } + + space->reacquire(); + buf_read_page_background(space, dump[i], zip_size); + + if (buf_load_abort_flag) { + if (space) { + space->release(); + } + buf_load_abort_flag = false; + ut_free(dump); + buf_load_status( + STATUS_INFO, + "Buffer pool(s) load aborted on request"); + /* Premature end, set estimated = completed = i and + end the current stage event. */ + + mysql_stage_set_work_estimated(pfs_stage_progress, i); + mysql_stage_set_work_completed(pfs_stage_progress, i); + + mysql_end_stage(); + return; + } + +#ifdef UNIV_DEBUG + if ((i+1) >= srv_buf_pool_load_pages_abort) { + buf_load_abort_flag = true; + } +#endif + } + + if (space) { + space->release(); + } + + ut_free(dump); + + if (i == dump_n) { + os_aio_wait_until_no_pending_reads(true); + } + + ut_sprintf_timestamp(now); + + if (i == dump_n) { + buf_load_status(STATUS_INFO, + "Buffer pool(s) load completed at %s", now); + export_vars.innodb_buffer_pool_load_incomplete = 0; + } else if (!buf_load_abort_flag) { + buf_load_status(STATUS_INFO, + "Buffer pool(s) load aborted due to user instigated abort at %s", + now); + /* intentionally don't reset innodb_buffer_pool_load_incomplete + as we don't want a shutdown to save the buffer pool */ + } else { + buf_load_status(STATUS_INFO, + "Buffer pool(s) load aborted due to shutdown at %s", + now); + /* intentionally don't reset innodb_buffer_pool_load_incomplete + as we want to abort without saving the buffer pool */ + } + + /* Make sure that estimated = completed when we end. */ + mysql_stage_set_work_completed(pfs_stage_progress, dump_n); + /* End the stage progress event. */ + mysql_end_stage(); +} + +/** Abort a currently running buffer pool load. */ +void buf_load_abort() +{ + buf_load_abort_flag= true; +} + +/*****************************************************************//** +This is the main task for buffer pool dump/load. 
When scheduled, it
+either performs a dump or a load, depending on the server state, the state of
+the variables, etc. */
+static void buf_dump_load_func(void *)
+{
+ ut_ad(!srv_read_only_mode);
+ static bool first_time = true;
+ if (first_time && srv_buffer_pool_load_at_startup) {
+
+#ifdef WITH_WSREP
+ if (!get_wsrep_recovery()) {
+#endif /* WITH_WSREP */
+ srv_thread_pool->set_concurrency(srv_n_read_io_threads);
+ buf_load();
+ srv_thread_pool->set_concurrency();
+#ifdef WITH_WSREP
+ }
+#endif /* WITH_WSREP */
+ }
+ first_time = false;
+
+ while (!SHUTTING_DOWN()) {
+ if (buf_dump_should_start) {
+ buf_dump_should_start = false;
+ buf_dump(true);
+ }
+ if (buf_load_should_start) {
+ buf_load_should_start = false;
+ buf_load();
+ }
+
+ if (!buf_dump_should_start && !buf_load_should_start) {
+ return;
+ }
+ }
+
+ /* In shutdown */
+ if (srv_buffer_pool_dump_at_shutdown && srv_fast_shutdown != 2) {
+ if (export_vars.innodb_buffer_pool_load_incomplete) {
+ buf_dump_status(STATUS_INFO,
+ "Dumping of buffer pool not started"
+ " as load was incomplete");
+#ifdef WITH_WSREP
+ } else if (get_wsrep_recovery()) {
+#endif /* WITH_WSREP */
+ } else {
+ buf_dump(false/* do complete dump at shutdown */);
+ }
+ }
+}
+
+
+/* Execute the task with max. concurrency */
+static tpool::task_group tpool_group(1);
+static tpool::waitable_task buf_dump_load_task(buf_dump_load_func, &tpool_group);
+static bool load_dump_enabled;
+
+/** Start async buffer pool load, if srv_buffer_pool_load_at_startup was set. */
+void buf_load_at_startup()
+{
+ load_dump_enabled= true;
+ if (srv_buffer_pool_load_at_startup)
+ buf_do_load_dump();
+}
+
+static void buf_do_load_dump()
+{
+ if (load_dump_enabled && !buf_dump_load_task.is_running())
+ srv_thread_pool->submit_task(&buf_dump_load_task);
+}
+
+/** Wait for currently running load/dumps to finish */
+void buf_load_dump_end()
+{
+ ut_ad(SHUTTING_DOWN());
+ buf_dump_load_task.wait();
+}
diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc
new file mode 100644
index 00000000..b6357989
--- /dev/null
+++ b/storage/innobase/buf/buf0flu.cc
@@ -0,0 +1,2765 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2022, MariaDB Corporation.
+Copyright (c) 2013, 2014, Fusion-io
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0flu.cc
+The database buffer buf_pool flush algorithm
+
+Created 11/11/1995 Heikki Tuuri
+*******************************************************/
+
+#include "univ.i"
+#include <my_service_manager.h>
+#include <mysql/service_thd_wait.h>
+#include <sql_class.h>
+
+#include "buf0flu.h"
+#include "buf0buf.h"
+#include "buf0checksum.h"
+#include "buf0dblwr.h"
+#include "srv0start.h"
+#include "page0zip.h"
+#include "fil0fil.h"
+#include "log0crypt.h"
+#include "srv0mon.h"
+#include "fil0pagecompress.h"
+#include "lzo/lzo1x.h"
+#include "snappy-c.h"
+
+/** Number of pages flushed via LRU. Protected by buf_pool.mutex.
+Also included in buf_pool.stat.n_pages_written. */
+ulint buf_lru_flush_page_count;
+
+/** Number of pages freed without flushing. Protected by buf_pool.mutex. */
+ulint buf_lru_freed_page_count;
+
+/** Flag indicating if the page_cleaner is in active state. */
+Atomic_relaxed<bool> buf_page_cleaner_is_active;
+
+/** Factor for scan length to determine n_pages for intended oldest LSN
+progress */
+static constexpr ulint buf_flush_lsn_scan_factor = 3;
+
+/** Average redo generation rate */
+static lsn_t lsn_avg_rate = 0;
+
+/** Target oldest_modification for the page cleaner background flushing;
+writes are protected by buf_pool.flush_list_mutex */
+static Atomic_relaxed<lsn_t> buf_flush_async_lsn;
+/** Target oldest_modification for the page cleaner furious flushing;
+writes are protected by buf_pool.flush_list_mutex */
+static Atomic_relaxed<lsn_t> buf_flush_sync_lsn;
+
+#ifdef UNIV_PFS_THREAD
+mysql_pfs_key_t page_cleaner_thread_key;
+#endif /* UNIV_PFS_THREAD */
+
+/** Page cleaner structure */
+static struct
+{
+ /** total elapsed time in adaptive flushing, in seconds */
+ ulint flush_time;
+ /** number of adaptive flushing passes */
+ ulint flush_pass;
+} page_cleaner;
+
+/* @} */
+
+#ifdef UNIV_DEBUG
+/** Validate the flush list. */
+static void buf_flush_validate_low();
+
+/** Validates the flush list some of the time. */
+static void buf_flush_validate_skip()
+{
+/** Try buf_flush_validate_low() every this many times */
+# define BUF_FLUSH_VALIDATE_SKIP 23
+
+ /** The buf_flush_validate_low() call skip counter.
+ Use a signed type because of the race condition below. */
+ static int buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
+
+ /* There is a race condition below, but it does not matter,
+ because this call is only for heuristic purposes. We want to
+ reduce the call frequency of the costly buf_flush_validate_low()
+ check in debug builds. */
+ if (--buf_flush_validate_count > 0) {
+ return;
+ }
+
+ buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
+ buf_flush_validate_low();
+}
+#endif /* UNIV_DEBUG */
+
+void buf_pool_t::page_cleaner_wakeup(bool for_LRU)
+{
+ ut_d(buf_flush_validate_skip());
+ if (!page_cleaner_idle())
+ {
+ if (for_LRU)
+ /* Ensure that the page cleaner is not in a timed wait. */
+ pthread_cond_signal(&do_flush_list);
+ return;
+ }
+ double dirty_pct= double(UT_LIST_GET_LEN(buf_pool.flush_list)) * 100.0 /
+ double(UT_LIST_GET_LEN(buf_pool.LRU) + UT_LIST_GET_LEN(buf_pool.free));
+ double pct_lwm= srv_max_dirty_pages_pct_lwm;
+
+ /* if pct_lwm != 0.0, adaptive flushing is enabled.
+ signal the buf page cleaner thread:
+ - if pct_lwm <= dirty_pct then it will invoke the adaptive flushing flow;
+ - if pct_lwm > dirty_pct then it will invoke the idle flushing flow.
+
+ idle_flushing:
+ dirty_pct < innodb_max_dirty_pages_pct_lwm, so it could be an
+ idle flushing use-case.
+
+ Why is last_activity_count not always updated?
+ - let's first understand when the server activity count is updated:
+ - it is updated on commit of a transaction in trx_t::commit() and not
+ on adding a page to the flush list;
+ - page_cleaner_wakeup is called when a page is added to the flush list.
+
+ - now let's say the first user thread updates the count from X -> Y but
+ is yet to commit the transaction (so the activity count is still Y).
+ Follow-up user threads will see the updated count (Y) matching
+ the universal server activity count (Y), giving a false impression that
+ the server is idle.
+
+ How to avoid this?
+ - by allowing last_activity_count to be updated when the page cleaner is
+ made active and has work to do. This ensures that the last_activity signal
+ is consumed by the page cleaner before the next one is generated. */
+ if (for_LRU ||
+ (pct_lwm != 0.0 && (pct_lwm <= dirty_pct ||
+ last_activity_count == srv_get_activity_count())) ||
+ srv_max_buf_pool_modified_pct <= dirty_pct)
+ {
+ page_cleaner_status-= PAGE_CLEANER_IDLE;
+ pthread_cond_signal(&do_flush_list);
+ }
+}
+
+/** Remove a block from flush_list.
+@param bpage buffer pool page */
+void buf_pool_t::delete_from_flush_list(buf_page_t *bpage) noexcept
+{
+ ut_ad(!fsp_is_system_temporary(bpage->id().space()));
+ mysql_mutex_assert_owner(&flush_list_mutex);
+ flush_hp.adjust(bpage);
+ UT_LIST_REMOVE(flush_list, bpage);
+ flush_list_bytes-= bpage->physical_size();
+ bpage->clear_oldest_modification();
+#ifdef UNIV_DEBUG
+ buf_flush_validate_skip();
+#endif /* UNIV_DEBUG */
+}
+
+/** Remove all dirty pages belonging to a given tablespace when we are
+deleting the data file of that tablespace.
+The pages still remain a part of the LRU and are evicted from
+the list as they age towards the tail of the LRU.
+@param id tablespace identifier */
+void buf_flush_remove_pages(uint32_t id)
+{
+ const page_id_t first(id, 0), end(id + 1, 0);
+ ut_ad(id);
+
+ for (;;)
+ {
+ mysql_mutex_lock(&buf_pool.mutex);
+ bool deferred= false;
+
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+
+ for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list); bpage; )
+ {
+ const auto s= bpage->state();
+ ut_ad(s >= buf_page_t::REMOVE_HASH);
+ ut_ad(s < buf_page_t::READ_FIX || s >= buf_page_t::WRITE_FIX);
+ buf_page_t *prev= UT_LIST_GET_PREV(list, bpage);
+
+ const page_id_t bpage_id(bpage->id());
+
+ if (bpage_id < first || bpage_id >= end);
+ else if (s >= buf_page_t::WRITE_FIX)
+ deferred= true;
+ else
+ buf_pool.delete_from_flush_list(bpage);
+
+ bpage= prev;
+ }
+
+ mysql_mutex_unlock(&buf_pool.mutex);
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+ if (!deferred)
+ break;
+
+ os_aio_wait_until_no_pending_writes(true);
+ }
+}
+
+/*******************************************************************//**
+Relocates a buffer control block on the flush_list.
+Note that it is assumed that the contents of bpage have already been
+copied to dpage.
+IMPORTANT: When this function is called, bpage and dpage are not
+exact copies of each other. For example, they both will have different
+::state. Also the ::list pointers in dpage may be stale.
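The wake-up condition above folds three triggers into one test: LRU pressure, the adaptive low-water mark (or an idle server), and the hard dirty-page maximum. Extracted as a pure function for readability; this is only a sketch, the idle test via activity counters is reduced to a boolean, and a non-empty buffer pool is assumed, as in the original:

```cpp
// Sketch of the page_cleaner_wakeup() predicate. Inputs correspond to
// the lengths of buf_pool.flush_list, buf_pool.LRU and buf_pool.free
// plus the two dirty-page thresholds.
#include <cstddef>

bool page_cleaner_should_wake(bool for_LRU,
                              size_t flush_list_len,
                              size_t lru_len, size_t free_len,
                              double pct_lwm,       // adaptive low-water mark
                              double max_dirty_pct, // hard maximum
                              bool server_idle)
{
  const double dirty_pct =
      double(flush_list_len) * 100.0 / double(lru_len + free_len);
  if (for_LRU)
    return true;                      // eviction pressure: always wake
  if (pct_lwm != 0.0 && (pct_lwm <= dirty_pct || server_idle))
    return true;                      // adaptive or idle flushing
  return max_dirty_pct <= dirty_pct;  // "furious" flushing threshold
}
```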
We need to +use the current list node (bpage) to do the list manipulation because +the list pointers could have changed between the time that we copied +the contents of bpage to the dpage and the flush list manipulation +below. */ +ATTRIBUTE_COLD +void +buf_flush_relocate_on_flush_list( +/*=============================*/ + buf_page_t* bpage, /*!< in/out: control block being moved */ + buf_page_t* dpage) /*!< in/out: destination block */ +{ + buf_page_t* prev; + + mysql_mutex_assert_owner(&buf_pool.flush_list_mutex); + ut_ad(!fsp_is_system_temporary(bpage->id().space())); + + const lsn_t lsn = bpage->oldest_modification(); + + if (!lsn) { + return; + } + + ut_ad(lsn == 1 || lsn > 2); + ut_ad(dpage->oldest_modification() == lsn); + + /* Important that we adjust the hazard pointer before removing + the bpage from the flush list. */ + buf_pool.flush_hp.adjust(bpage); + + prev = UT_LIST_GET_PREV(list, bpage); + UT_LIST_REMOVE(buf_pool.flush_list, bpage); + + bpage->clear_oldest_modification(); + + if (lsn == 1) { + buf_pool.flush_list_bytes -= dpage->physical_size(); + dpage->list.prev = nullptr; + dpage->list.next = nullptr; + dpage->clear_oldest_modification(); + } else if (prev) { + ut_ad(prev->oldest_modification()); + UT_LIST_INSERT_AFTER(buf_pool.flush_list, prev, dpage); + } else { + UT_LIST_ADD_FIRST(buf_pool.flush_list, dpage); + } + + ut_d(buf_flush_validate_low()); +} + +/** Note that a block is no longer dirty, while not removing +it from buf_pool.flush_list +@param temporary whether the page belongs to the temporary tablespace +@param error whether an error may have occurred while writing */ +inline void buf_page_t::write_complete(bool temporary, bool error) +{ + ut_ad(temporary == fsp_is_system_temporary(id().space())); + if (UNIV_UNLIKELY(error)); + else if (temporary) + { + ut_ad(oldest_modification() == 2); + oldest_modification_= 0; + } + else + { + /* We use release memory order to guarantee that callers of + oldest_modification_acquire() will observe the block as + being detached from buf_pool.flush_list, after reading the value 0. */ + ut_ad(oldest_modification() > 2); + oldest_modification_.store(1, std::memory_order_release); + } + const auto s= state(); + ut_ad(s >= WRITE_FIX); + zip.fix.fetch_sub((s >= WRITE_FIX_REINIT) + ? (WRITE_FIX_REINIT - UNFIXED) + : (WRITE_FIX - UNFIXED)); + lock.u_unlock(true); +} + +inline void buf_pool_t::n_flush_inc() +{ + mysql_mutex_assert_owner(&flush_list_mutex); + page_cleaner_status+= LRU_FLUSH; +} + +inline void buf_pool_t::n_flush_dec() +{ + mysql_mutex_lock(&flush_list_mutex); + ut_ad(page_cleaner_status >= LRU_FLUSH); + if ((page_cleaner_status-= LRU_FLUSH) < LRU_FLUSH) + pthread_cond_broadcast(&done_flush_LRU); + mysql_mutex_unlock(&flush_list_mutex); +} + +inline void buf_pool_t::n_flush_dec_holding_mutex() +{ + mysql_mutex_assert_owner(&flush_list_mutex); + ut_ad(page_cleaner_status >= LRU_FLUSH); + page_cleaner_status-= LRU_FLUSH; +} + +/** Complete write of a file page from buf_pool. 
+@param request write request +@param error whether the write may have failed */ +void buf_page_write_complete(const IORequest &request, bool error) +{ + ut_ad(request.is_write()); + ut_ad(!srv_read_only_mode); + buf_page_t *bpage= request.bpage; + ut_ad(bpage); + const auto state= bpage->state(); + /* io-fix can only be cleared by buf_page_t::write_complete() + and buf_page_t::read_complete() */ + ut_ad(state >= buf_page_t::WRITE_FIX); + ut_ad(!buf_dblwr.is_inside(bpage->id())); + ut_ad(request.node->space->id == bpage->id().space()); + + if (request.slot) + request.slot->release(); + + if (UNIV_UNLIKELY(MONITOR_IS_ON(MONITOR_MODULE_BUF_PAGE))) + buf_page_monitor(*bpage, false); + DBUG_PRINT("ib_buf", ("write page %u:%u", + bpage->id().space(), bpage->id().page_no())); + + mysql_mutex_assert_not_owner(&buf_pool.mutex); + mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex); + + if (request.is_LRU()) + { + const bool temp= bpage->oldest_modification() == 2; + if (!temp && state < buf_page_t::WRITE_FIX_REINIT && + request.node->space->use_doublewrite()) + buf_dblwr.write_completed(); + /* We must hold buf_pool.mutex while releasing the block, so that + no other thread can access it before we have freed it. */ + mysql_mutex_lock(&buf_pool.mutex); + bpage->write_complete(temp, error); + if (!error) + buf_LRU_free_page(bpage, true); + mysql_mutex_unlock(&buf_pool.mutex); + + buf_pool.n_flush_dec(); + } + else + { + if (state < buf_page_t::WRITE_FIX_REINIT && + request.node->space->use_doublewrite()) + buf_dblwr.write_completed(); + bpage->write_complete(false, error); + } +} + +/** Calculate a ROW_FORMAT=COMPRESSED page checksum and update the page. +@param[in,out] page page to update +@param[in] size compressed page size */ +void buf_flush_update_zip_checksum(buf_frame_t *page, ulint size) +{ + ut_ad(size > 0); + mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, + page_zip_calc_checksum(page, size, false)); +} + +/** Assign the full crc32 checksum for non-compressed page. +@param[in,out] page page to be updated */ +void buf_flush_assign_full_crc32_checksum(byte* page) +{ + ut_d(bool compressed = false); + ut_d(bool corrupted = false); + ut_d(const uint size = buf_page_full_crc32_size(page, &compressed, + &corrupted)); + ut_ad(!compressed); + ut_ad(!corrupted); + ut_ad(size == uint(srv_page_size)); + const ulint payload = srv_page_size - FIL_PAGE_FCRC32_CHECKSUM; + mach_write_to_4(page + payload, my_crc32c(0, page, payload)); +} + +/** Initialize a page for writing to the tablespace. 
+@param[in] block buffer block; NULL if bypassing + the buffer pool +@param[in,out] page page frame +@param[in,out] page_zip_ compressed page, or NULL if + uncompressed +@param[in] use_full_checksum whether tablespace uses full checksum */ +void +buf_flush_init_for_writing( + const buf_block_t* block, + byte* page, + void* page_zip_, + bool use_full_checksum) +{ + if (block && block->page.frame != page) { + /* If page is encrypted in full crc32 format then + checksum stored already as a part of fil_encrypt_buf() */ + ut_ad(use_full_checksum); + return; + } + + ut_ad(!block || block->page.frame == page); + ut_ad(page); + + if (page_zip_) { + page_zip_des_t* page_zip; + ulint size; + + page_zip = static_cast(page_zip_); + ut_ad(!block || &block->page.zip == page_zip); + size = page_zip_get_size(page_zip); + + ut_ad(size); + ut_ad(ut_is_2pow(size)); + ut_ad(size <= UNIV_ZIP_SIZE_MAX); + + switch (fil_page_get_type(page)) { + case FIL_PAGE_TYPE_ALLOCATED: + case FIL_PAGE_INODE: + case FIL_PAGE_IBUF_BITMAP: + case FIL_PAGE_TYPE_FSP_HDR: + case FIL_PAGE_TYPE_XDES: + /* These are essentially uncompressed pages. */ + memcpy(page_zip->data, page, size); + /* fall through */ + case FIL_PAGE_TYPE_ZBLOB: + case FIL_PAGE_TYPE_ZBLOB2: + case FIL_PAGE_INDEX: + case FIL_PAGE_RTREE: + buf_flush_update_zip_checksum(page_zip->data, size); + return; + } + + ib::error() << "The compressed page to be written" + " seems corrupt:"; + ut_print_buf(stderr, page, size); + fputs("\nInnoDB: Possibly older version of the page:", stderr); + ut_print_buf(stderr, page_zip->data, size); + putc('\n', stderr); + ut_error; + } + + if (use_full_checksum) { + static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "aligned"); + static_assert(FIL_PAGE_LSN % 4 == 0, "aligned"); + memcpy_aligned<4>(page + srv_page_size + - FIL_PAGE_FCRC32_END_LSN, + FIL_PAGE_LSN + 4 + page, 4); + return buf_flush_assign_full_crc32_checksum(page); + } + + static_assert(FIL_PAGE_END_LSN_OLD_CHKSUM % 8 == 0, "aligned"); + static_assert(FIL_PAGE_LSN % 8 == 0, "aligned"); + memcpy_aligned<8>(page + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM, + FIL_PAGE_LSN + page, 8); + + if (block && srv_page_size == 16384) { + /* The page type could be garbage in old files + created before MySQL 5.5. Such files always + had a page size of 16 kilobytes. */ + ulint page_type = fil_page_get_type(page); + ulint reset_type = page_type; + + switch (block->page.id().page_no() % 16384) { + case 0: + reset_type = block->page.id().page_no() == 0 + ? FIL_PAGE_TYPE_FSP_HDR + : FIL_PAGE_TYPE_XDES; + break; + case 1: + reset_type = FIL_PAGE_IBUF_BITMAP; + break; + case FSP_TRX_SYS_PAGE_NO: + if (block->page.id() + == page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO)) { + reset_type = FIL_PAGE_TYPE_TRX_SYS; + break; + } + /* fall through */ + default: + switch (page_type) { + case FIL_PAGE_INDEX: + case FIL_PAGE_TYPE_INSTANT: + case FIL_PAGE_RTREE: + case FIL_PAGE_UNDO_LOG: + case FIL_PAGE_INODE: + case FIL_PAGE_IBUF_FREE_LIST: + case FIL_PAGE_TYPE_ALLOCATED: + case FIL_PAGE_TYPE_SYS: + case FIL_PAGE_TYPE_TRX_SYS: + case FIL_PAGE_TYPE_BLOB: + case FIL_PAGE_TYPE_ZBLOB: + case FIL_PAGE_TYPE_ZBLOB2: + break; + case FIL_PAGE_TYPE_FSP_HDR: + case FIL_PAGE_TYPE_XDES: + case FIL_PAGE_IBUF_BITMAP: + /* These pages should have + predetermined page numbers + (see above). 
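For full_crc32 tablespaces the rule applied above is simple: checksum everything except the trailing 4 bytes and store the result there, big-endian. A self-contained sketch, using a plain bitwise CRC-32C as a slow stand-in for my_crc32c():

```cpp
#include <cstddef>
#include <cstdint>

// Bitwise CRC-32C (Castagnoli), a stand-in for the real my_crc32c().
static uint32_t crc32c(uint32_t crc, const unsigned char* s, size_t len)
{
  crc = ~crc;
  while (len--)
  {
    crc ^= *s++;
    for (int k = 0; k < 8; k++)
      crc = (crc >> 1) ^ (0x82F63B78 & (0U - (crc & 1)));
  }
  return ~crc;
}

// Mirror of buf_flush_assign_full_crc32_checksum(): the final 4 bytes
// (FIL_PAGE_FCRC32_CHECKSUM) hold a CRC of the preceding payload.
void assign_full_crc32(unsigned char* page, size_t page_size)
{
  const size_t payload = page_size - 4;
  const uint32_t c = crc32c(0, page, payload);
  page[payload] = (unsigned char)(c >> 24);     // big-endian store,
  page[payload + 1] = (unsigned char)(c >> 16); // as mach_write_to_4() does
  page[payload + 2] = (unsigned char)(c >> 8);
  page[payload + 3] = (unsigned char)c;
}
```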
*/ + default: + reset_type = FIL_PAGE_TYPE_UNKNOWN; + break; + } + } + + if (UNIV_UNLIKELY(page_type != reset_type)) { + ib::info() + << "Resetting invalid page " + << block->page.id() << " type " + << page_type << " to " + << reset_type << " when flushing."; + fil_page_set_type(page, reset_type); + } + } + + const uint32_t checksum = buf_calc_page_crc32(page); + mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum); + mach_write_to_4(page + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM, + checksum); +} + +/** Reserve a buffer for compression. +@param[in,out] slot reserved slot */ +static void buf_tmp_reserve_compression_buf(buf_tmp_buffer_t* slot) +{ + if (slot->comp_buf) + return; + /* Both Snappy and LZO compression methods require that the output + buffer be bigger than input buffer. Adjust the allocated size. */ + ulint size= srv_page_size; + if (provider_service_lzo->is_loaded) + size= LZO1X_1_15_MEM_COMPRESS; + else if (provider_service_snappy->is_loaded) + size= snappy_max_compressed_length(size); + slot->comp_buf= static_cast(aligned_malloc(size, srv_page_size)); +} + +/** Encrypt a buffer of temporary tablespace +@param[in] offset Page offset +@param[in] s Page to encrypt +@param[in,out] d Output buffer +@return encrypted buffer or NULL */ +static byte* buf_tmp_page_encrypt(ulint offset, const byte* s, byte* d) +{ + /* Calculate the start offset in a page */ + uint srclen= static_cast(srv_page_size) - + (FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + + FIL_PAGE_FCRC32_CHECKSUM); + const byte* src= s + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION; + byte* dst= d + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION; + + memcpy(d, s, FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION); + + if (!log_tmp_block_encrypt(src, srclen, dst, (offset * srv_page_size), true)) + return NULL; + + const ulint payload= srv_page_size - FIL_PAGE_FCRC32_CHECKSUM; + mach_write_to_4(d + payload, my_crc32c(0, d, payload)); + + srv_stats.pages_encrypted.inc(); + srv_stats.n_temp_blocks_encrypted.inc(); + return d; +} + +/** Encryption and page_compression hook that is called just before +a page is written to disk. +@param[in,out] space tablespace +@param[in,out] bpage buffer page +@param[in] s physical page frame that is being encrypted +@param[in,out] size payload size in bytes +@return page frame to be written to file +(may be src_frame or an encrypted/compressed copy of it) */ +static byte *buf_page_encrypt(fil_space_t* space, buf_page_t* bpage, byte* s, + buf_tmp_buffer_t **slot, size_t *size) +{ + ut_ad(!bpage->is_freed()); + ut_ad(space->id == bpage->id().space()); + ut_ad(!*slot); + + const uint32_t page_no= bpage->id().page_no(); + + switch (page_no) { + case TRX_SYS_PAGE_NO: + if (bpage->id().space() != TRX_SYS_SPACE) + break; + /* The TRX_SYS page is neither encrypted nor compressed, because + it contains the address of the doublewrite buffer. 
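The switch around this point encodes a small invariant worth stating on its own: page 0 of any tablespace and the TRX_SYS page of the system tablespace are never encrypted or page-compressed, since the latter holds the doublewrite buffer address. A sketch of that predicate; the numeric constants are my assumed values of TRX_SYS_SPACE and TRX_SYS_PAGE_NO, not taken from this file:

```cpp
#include <cstdint>

constexpr uint32_t SYS_SPACE = 0;    // assumed value of TRX_SYS_SPACE
constexpr uint32_t SYS_TRX_PAGE = 5; // assumed value of TRX_SYS_PAGE_NO

bool page_may_be_transformed(uint32_t space_id, uint32_t page_no)
{
  if (page_no == 0)
    return false;                    // tablespace header page, written as-is
  if (space_id == SYS_SPACE && page_no == SYS_TRX_PAGE)
    return false;                    // holds the doublewrite buffer address
  return true;                       // eligible for encryption/compression
}
```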
*/ + /* fall through */ + case 0: + /* Page 0 of a tablespace is not encrypted/compressed */ + return s; + } + + fil_space_crypt_t *crypt_data= space->crypt_data; + bool encrypted, page_compressed; + if (space->purpose == FIL_TYPE_TEMPORARY) + { + ut_ad(!crypt_data); + encrypted= innodb_encrypt_temporary_tables; + page_compressed= false; + } + else + { + encrypted= crypt_data && !crypt_data->not_encrypted() && + crypt_data->type != CRYPT_SCHEME_UNENCRYPTED && + (!crypt_data->is_default_encryption() || srv_encrypt_tables); + page_compressed= space->is_compressed(); + } + + const bool full_crc32= space->full_crc32(); + + if (!encrypted && !page_compressed) + { + /* No need to encrypt or compress. Clear key-version & crypt-checksum. */ + static_assert(FIL_PAGE_FCRC32_KEY_VERSION % 4 == 0, "alignment"); + static_assert(FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION % 4 == 2, + "not perfect alignment"); + if (full_crc32) + memset_aligned<4>(s + FIL_PAGE_FCRC32_KEY_VERSION, 0, 4); + else + memset_aligned<2>(s + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, 0, 8); + return s; + } + + static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "alignment"); + static_assert(FIL_PAGE_LSN % 8 == 0, "alignment"); + if (full_crc32) + memcpy_aligned<4>(s + srv_page_size - FIL_PAGE_FCRC32_END_LSN, + FIL_PAGE_LSN + 4 + s, 4); + + ut_ad(!bpage->zip_size() || !page_compressed); + /* Find free slot from temporary memory array */ + *slot= buf_pool.io_buf_reserve(); + ut_a(*slot); + (*slot)->allocate(); + + byte *d= (*slot)->crypt_buf; + + if (!page_compressed) + { +not_compressed: + d= space->purpose == FIL_TYPE_TEMPORARY + ? buf_tmp_page_encrypt(page_no, s, d) + : fil_space_encrypt(space, page_no, s, d); + } + else + { + ut_ad(space->purpose != FIL_TYPE_TEMPORARY); + /* First we compress the page content */ + buf_tmp_reserve_compression_buf(*slot); + byte *tmp= (*slot)->comp_buf; + ulint len= fil_page_compress(s, tmp, space->flags, + fil_space_get_block_size(space, page_no), + encrypted); + + if (!len) + goto not_compressed; + + *size= len; + + if (full_crc32) + { + ut_d(bool compressed = false); + len= buf_page_full_crc32_size(tmp, +#ifdef UNIV_DEBUG + &compressed, +#else + NULL, +#endif + NULL); + ut_ad(compressed); + } + + /* Workaround for MDEV-15527. */ + memset(tmp + len, 0 , srv_page_size - len); + + if (encrypted) + tmp= fil_space_encrypt(space, page_no, tmp, d); + + if (full_crc32) + { + static_assert(FIL_PAGE_FCRC32_CHECKSUM == 4, "alignment"); + mach_write_to_4(tmp + len - 4, my_crc32c(0, tmp, len - 4)); + ut_ad(!buf_page_is_corrupted(true, tmp, space->flags)); + } + + d= tmp; + } + + (*slot)->out_buf= d; + return d; +} + +/** Free a page whose underlying file page has been freed. */ +ATTRIBUTE_COLD void buf_pool_t::release_freed_page(buf_page_t *bpage) noexcept +{ + mysql_mutex_assert_owner(&mutex); + ut_d(const lsn_t oldest_modification= bpage->oldest_modification();) + if (fsp_is_system_temporary(bpage->id().space())) + { + ut_ad(bpage->frame); + ut_ad(oldest_modification == 2); + bpage->clear_oldest_modification(); + } + else + { + mysql_mutex_lock(&flush_list_mutex); + ut_ad(oldest_modification > 2); + delete_from_flush_list(bpage); + mysql_mutex_unlock(&flush_list_mutex); + } + + bpage->lock.u_unlock(true); + buf_LRU_free_page(bpage, true); +} + +/** Write a flushable page to a file or free a freeable block. 
+@param evict whether to evict the page on write completion +@param space tablespace +@return whether a page write was initiated and buf_pool.mutex released */ +bool buf_page_t::flush(bool evict, fil_space_t *space) +{ + mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex); + ut_ad(in_file()); + ut_ad(in_LRU_list); + ut_ad((space->purpose == FIL_TYPE_TEMPORARY) == + (space == fil_system.temp_space)); + ut_ad(evict || space != fil_system.temp_space); + ut_ad(space->referenced()); + + const auto s= state(); + ut_a(s >= FREED); + + if (s < UNFIXED) + { + if (UNIV_LIKELY(space->purpose == FIL_TYPE_TABLESPACE)) + { + const lsn_t lsn= + mach_read_from_8(my_assume_aligned<8> + (FIL_PAGE_LSN + (zip.data ? zip.data : frame))); + ut_ad(lsn >= oldest_modification()); + if (lsn > log_sys.get_flushed_lsn()) + { + mysql_mutex_unlock(&buf_pool.mutex); + log_write_up_to(lsn, true); + mysql_mutex_lock(&buf_pool.mutex); + } + } + buf_pool.release_freed_page(this); + return false; + } + + ut_d(const auto f=) zip.fix.fetch_add(WRITE_FIX - UNFIXED); + ut_ad(f >= UNFIXED); + ut_ad(f < READ_FIX); + ut_ad((space == fil_system.temp_space) + ? oldest_modification() == 2 + : oldest_modification() > 2); + + /* Increment the I/O operation count used for selecting LRU policy. */ + buf_LRU_stat_inc_io(); + mysql_mutex_unlock(&buf_pool.mutex); + + IORequest::Type type= IORequest::WRITE_ASYNC; + if (UNIV_UNLIKELY(evict)) + { + type= IORequest::WRITE_LRU; + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_pool.n_flush_inc(); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + } + + /* Apart from the U-lock, this block will also be protected by + is_write_fixed() and oldest_modification()>1. + Thus, it cannot be relocated or removed. */ + + DBUG_PRINT("ib_buf", ("%s %u page %u:%u", + evict ? "LRU" : "flush_list", + id().space(), id().page_no())); + + buf_block_t *block= reinterpret_cast(this); + page_t *write_frame= zip.data; + + space->reacquire(); + size_t size; +#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32 + size_t orig_size; +#endif + buf_tmp_buffer_t *slot= nullptr; + + if (UNIV_UNLIKELY(!frame)) /* ROW_FORMAT=COMPRESSED */ + { + ut_ad(!space->full_crc32()); + ut_ad(!space->is_compressed()); /* not page_compressed */ + size= zip_size(); +#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32 + orig_size= size; +#endif + buf_flush_update_zip_checksum(write_frame, size); + write_frame= buf_page_encrypt(space, this, write_frame, &slot, &size); + ut_ad(size == zip_size()); + } + else + { + byte *page= frame; + size= block->physical_size(); +#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32 + orig_size= size; +#endif + + if (space->full_crc32()) + { + /* innodb_checksum_algorithm=full_crc32 is not implemented for + ROW_FORMAT=COMPRESSED pages. */ + ut_ad(!write_frame); + page= buf_page_encrypt(space, this, page, &slot, &size); + buf_flush_init_for_writing(block, page, nullptr, true); + } + else + { + buf_flush_init_for_writing(block, page, write_frame ? &zip : nullptr, + false); + page= buf_page_encrypt(space, this, write_frame ? 
write_frame : page, + &slot, &size); + } + +#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32 + if (size != orig_size) + { + switch (space->chain.start->punch_hole) { + case 1: + static_assert(IORequest::PUNCH_LRU - IORequest::PUNCH == + IORequest::WRITE_LRU - IORequest::WRITE_ASYNC, ""); + type= + IORequest::Type(type + (IORequest::PUNCH - IORequest::WRITE_ASYNC)); + break; + case 2: + size= orig_size; + } + } +#endif + write_frame= page; + } + + if ((s & LRU_MASK) == REINIT || !space->use_doublewrite()) + { + if (UNIV_LIKELY(space->purpose == FIL_TYPE_TABLESPACE)) + { + const lsn_t lsn= + mach_read_from_8(my_assume_aligned<8>(FIL_PAGE_LSN + + (write_frame ? write_frame + : frame))); + ut_ad(lsn >= oldest_modification()); + log_write_up_to(lsn, true); + } + space->io(IORequest{type, this, slot}, physical_offset(), size, + write_frame, this); + } + else + buf_dblwr.add_to_batch(IORequest{this, slot, space->chain.start, type}, + size); + return true; +} + +/** Check whether a page can be flushed from the buf_pool. +@param id page identifier +@param fold id.fold() +@param evict true=buf_pool.LRU; false=buf_pool.flush_list +@return whether the page can be flushed */ +static bool buf_flush_check_neighbor(const page_id_t id, ulint fold, + bool evict) +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_ad(fold == id.fold()); + + /* FIXME: cell_get() is being invoked while holding buf_pool.mutex */ + const buf_page_t *bpage= + buf_pool.page_hash.get(id, buf_pool.page_hash.cell_get(fold)); + + if (!bpage || buf_pool.watch_is_sentinel(*bpage)) + return false; + + /* We avoid flushing 'non-old' blocks in an eviction flush, because the + flushed blocks are soon freed */ + if (evict && !bpage->is_old()) + return false; + + return bpage->oldest_modification() > 1 && !bpage->is_io_fixed(); +} + +/** Check which neighbors of a page can be flushed from the buf_pool. +@param space tablespace +@param id page identifier of a dirty page +@param contiguous whether to consider contiguous areas of pages +@param evict true=buf_pool.LRU; false=buf_pool.flush_list +@return last page number that can be flushed */ +static page_id_t buf_flush_check_neighbors(const fil_space_t &space, + page_id_t &id, bool contiguous, + bool evict) +{ + ut_ad(id.page_no() < space.size + + (space.physical_size() == 2048 ? 1 + : space.physical_size() == 1024 ? 3 : 0)); + /* When flushed, dirty blocks are searched in neighborhoods of this + size, and flushed along with the original page. */ + const ulint s= buf_pool.curr_size / 16; + const uint32_t read_ahead= buf_pool.read_ahead_area; + const uint32_t buf_flush_area= read_ahead > s + ? static_cast(s) : read_ahead; + page_id_t low= id - (id.page_no() % buf_flush_area); + page_id_t high= low + buf_flush_area; + high.set_page_no(std::min(high.page_no(), space.last_page_number())); + + if (!contiguous) + { + high= std::max(id + 1, high); + id= low; + return high; + } + + /* Determine the contiguous dirty area around id. */ + const ulint id_fold= id.fold(); + + mysql_mutex_lock(&buf_pool.mutex); + + if (id > low) + { + ulint fold= id_fold; + for (page_id_t i= id - 1;; --i) + { + fold--; + if (!buf_flush_check_neighbor(i, fold, evict)) + { + low= i + 1; + break; + } + if (i == low) + break; + } + } + + page_id_t i= id; + id= low; + ulint fold= id_fold; + while (++i < high) + { + ++fold; + if (!buf_flush_check_neighbor(i, fold, evict)) + break; + } + + mysql_mutex_unlock(&buf_pool.mutex); + return i; +} + +MY_ATTRIBUTE((warn_unused_result)) +/** Apply freed_ranges to the file. 
+@param writable whether the file is writable +@return number of pages written or hole-punched */ +uint32_t fil_space_t::flush_freed(bool writable) +{ + const bool punch_hole= chain.start->punch_hole == 1; + if (!punch_hole && !srv_immediate_scrub_data_uncompressed) + return 0; + + mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex); + mysql_mutex_assert_not_owner(&buf_pool.mutex); + + for (;;) + { + freed_range_mutex.lock(); + if (freed_ranges.empty()) + { + freed_range_mutex.unlock(); + return 0; + } + const lsn_t flush_lsn= last_freed_lsn; + if (log_sys.get_flushed_lsn() >= flush_lsn) + break; + freed_range_mutex.unlock(); + log_write_up_to(flush_lsn, true); + } + + const unsigned physical{physical_size()}; + + range_set freed= std::move(freed_ranges); + uint32_t written= 0; + + if (!writable); + else if (punch_hole) + { + for (const auto &range : freed) + { + written+= range.last - range.first + 1; + reacquire(); + io(IORequest(IORequest::PUNCH_RANGE), + os_offset_t{range.first} * physical, + (range.last - range.first + 1) * physical, nullptr); + } + } + else + { + for (const auto &range : freed) + { + written+= range.last - range.first + 1; + for (os_offset_t i= range.first; i <= range.last; i++) + { + reacquire(); + io(IORequest(IORequest::WRITE_ASYNC), i * physical, physical, + const_cast(field_ref_zero)); + } + } + } + + freed_range_mutex.unlock(); + return written; +} + +/** Flushes to disk all flushable pages within the flush area +and also write zeroes or punch the hole for the freed ranges of pages. +@param space tablespace +@param page_id page identifier +@param bpage buffer page +@param contiguous whether to consider contiguous areas of pages +@param evict true=buf_pool.LRU; false=buf_pool.flush_list +@param n_flushed number of pages flushed so far in this batch +@param n_to_flush maximum number of pages we are allowed to flush +@return number of pages flushed */ +static ulint buf_flush_try_neighbors(fil_space_t *space, + const page_id_t page_id, + buf_page_t *bpage, + bool contiguous, bool evict, + ulint n_flushed, ulint n_to_flush) +{ + mysql_mutex_unlock(&buf_pool.mutex); + + ut_ad(space->id == page_id.space()); + ut_ad(bpage->id() == page_id); + + ulint count= 0; + page_id_t id= page_id; + page_id_t high= buf_flush_check_neighbors(*space, id, contiguous, evict); + + ut_ad(page_id >= id); + ut_ad(page_id < high); + + for (ulint id_fold= id.fold(); id < high; ++id, ++id_fold) + { + if (UNIV_UNLIKELY(space->is_stopping_writes())) + { + if (bpage) + bpage->lock.u_unlock(true); + break; + } + + if (count + n_flushed >= n_to_flush) + { + if (id > page_id) + break; + /* If the page whose neighbors we are flushing has not been + flushed yet, we must flush the page that we selected originally. 
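flush_freed() above turns each contiguous range of freed pages into a byte interval and either punches one hole per range or overwrites the pages one by one with zeroes. The offset arithmetic, reduced to a sketch with simplified types, where the prints stand in for the io() calls:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

struct range { uint32_t first, last; }; // inclusive page-number range

uint32_t flush_freed_ranges(const std::vector<range>& freed,
                            uint64_t physical_size, bool punch_hole)
{
  uint32_t written = 0;
  for (const range& r : freed)
  {
    const uint32_t n_pages = r.last - r.first + 1;
    written += n_pages;
    if (punch_hole)
      // one deallocation call covering the whole contiguous interval
      printf("punch off=%llu len=%llu\n",
             (unsigned long long)(r.first * physical_size),
             (unsigned long long)(n_pages * physical_size));
    else
      for (uint32_t i = r.first; i <= r.last; i++)
        // zero-fill one page at a time, as the WRITE_ASYNC branch does
        printf("zero  off=%llu len=%llu\n",
               (unsigned long long)(i * physical_size),
               (unsigned long long)physical_size);
  }
  return written; // number of pages written or hole-punched
}
```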
*/ + id= page_id; + id_fold= id.fold(); + } + + const buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(id_fold); + mysql_mutex_lock(&buf_pool.mutex); + + if (buf_page_t *b= buf_pool.page_hash.get(id, chain)) + { + ut_ad(b->in_file()); + if (id == page_id) + { + ut_ad(bpage == b); + bpage= nullptr; + ut_ad(!buf_pool.watch_is_sentinel(*b)); + ut_ad(b->oldest_modification() > 1); + flush: + if (b->flush(evict, space)) + { + ++count; + continue; + } + } + /* We avoid flushing 'non-old' blocks in an eviction flush, + because the flushed blocks are soon freed */ + else if ((!evict || b->is_old()) && !buf_pool.watch_is_sentinel(*b) && + b->oldest_modification() > 1 && b->lock.u_lock_try(true)) + { + if (b->oldest_modification() < 2) + b->lock.u_unlock(true); + else + goto flush; + } + } + + mysql_mutex_unlock(&buf_pool.mutex); + } + + if (count > 1) + { + MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE, + MONITOR_FLUSH_NEIGHBOR_COUNT, + MONITOR_FLUSH_NEIGHBOR_PAGES, count - 1); + } + + return count; +} + +/*******************************************************************//** +This utility moves the uncompressed frames of pages to the free list. +Note that this function does not actually flush any data to disk. It +just detaches the uncompressed frames from the compressed pages at the +tail of the unzip_LRU and puts those freed frames in the free list. +@return number of blocks moved to the free list. */ +static ulint buf_free_from_unzip_LRU_list_batch() +{ + ulint scanned = 0; + ulint count = 0; + + mysql_mutex_assert_owner(&buf_pool.mutex); + + buf_block_t* block = UT_LIST_GET_LAST(buf_pool.unzip_LRU); + + while (block + && UT_LIST_GET_LEN(buf_pool.free) < srv_LRU_scan_depth + && UT_LIST_GET_LEN(buf_pool.unzip_LRU) + > UT_LIST_GET_LEN(buf_pool.LRU) / 10) { + + ++scanned; + if (buf_LRU_free_page(&block->page, false)) { + /* Block was freed. buf_pool.mutex potentially + released and reacquired */ + ++count; + block = UT_LIST_GET_LAST(buf_pool.unzip_LRU); + } else { + block = UT_LIST_GET_PREV(unzip_LRU, block); + } + } + + mysql_mutex_assert_owner(&buf_pool.mutex); + + if (scanned) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_LRU_BATCH_SCANNED, + MONITOR_LRU_BATCH_SCANNED_NUM_CALL, + MONITOR_LRU_BATCH_SCANNED_PER_CALL, + scanned); + } + + return(count); +} + +/** Acquire a tablespace reference for writing. +@param id tablespace identifier +@return tablespace +@retval nullptr if the tablespace is missing or inaccessible */ +fil_space_t *fil_space_t::get_for_write(uint32_t id) +{ + mysql_mutex_lock(&fil_system.mutex); + fil_space_t *space= fil_space_get_by_id(id); + const uint32_t n= space ? space->acquire_low(STOPPING_WRITES) : 0; + + if (n & STOPPING_WRITES) + space= nullptr; + else if ((n & CLOSING) && !space->prepare_acquired()) + space= nullptr; + + mysql_mutex_unlock(&fil_system.mutex); + return space; +} + +/** Start writing out pages for a tablespace. +@param id tablespace identifier +@return tablespace and number of pages written */ +static std::pair buf_flush_space(const uint32_t id) +{ + if (fil_space_t *space= fil_space_t::get_for_write(id)) + return {space, space->flush_freed(true)}; + return {nullptr, 0}; +} + +struct flush_counters_t +{ + /** number of dirty pages flushed */ + ulint flushed; + /** number of clean pages evicted */ + ulint evicted; +}; + +/** Discard a dirty page, and release buf_pool.flush_list_mutex. 
+@param bpage dirty page whose tablespace is not accessible */ +static void buf_flush_discard_page(buf_page_t *bpage) +{ + ut_ad(bpage->in_file()); + ut_ad(bpage->oldest_modification()); + + buf_pool.delete_from_flush_list(bpage); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + ut_d(const auto state= bpage->state()); + ut_ad(state == buf_page_t::FREED || state == buf_page_t::UNFIXED || + state == buf_page_t::IBUF_EXIST || state == buf_page_t::REINIT); + bpage->lock.u_unlock(true); + buf_LRU_free_page(bpage, true); +} + +/** Flush dirty blocks from the end buf_pool.LRU, +and move clean blocks to buf_pool.free. +@param max maximum number of blocks to flush +@param evict whether dirty pages are to be evicted after flushing them +@param n counts of flushed and evicted pages */ +static void buf_flush_LRU_list_batch(ulint max, bool evict, + flush_counters_t *n) +{ + ulint scanned= 0; + ulint free_limit= srv_LRU_scan_depth; + + mysql_mutex_assert_owner(&buf_pool.mutex); + if (buf_pool.withdraw_target && buf_pool.is_shrinking()) + free_limit+= buf_pool.withdraw_target - UT_LIST_GET_LEN(buf_pool.withdraw); + + const auto neighbors= UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN + ? 0 : srv_flush_neighbors; + fil_space_t *space= nullptr; + uint32_t last_space_id= FIL_NULL; + static_assert(FIL_NULL > SRV_TMP_SPACE_ID, "consistency"); + static_assert(FIL_NULL > SRV_SPACE_ID_UPPER_BOUND, "consistency"); + + for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.LRU); + bpage && + ((UT_LIST_GET_LEN(buf_pool.LRU) > BUF_LRU_MIN_LEN && + UT_LIST_GET_LEN(buf_pool.free) < free_limit) || + recv_recovery_is_on()); + ++scanned, bpage= buf_pool.lru_hp.get()) + { + buf_page_t *prev= UT_LIST_GET_PREV(LRU, bpage); + buf_pool.lru_hp.set(prev); + auto state= bpage->state(); + ut_ad(state >= buf_page_t::FREED); + ut_ad(bpage->in_LRU_list); + + if (!bpage->oldest_modification()) + { + evict: + if (state != buf_page_t::FREED && + (state >= buf_page_t::READ_FIX || (~buf_page_t::LRU_MASK & state))) + continue; + buf_LRU_free_page(bpage, true); + ++n->evicted; + if (UNIV_LIKELY(scanned & 31)) + continue; + mysql_mutex_unlock(&buf_pool.mutex); + reacquire_mutex: + mysql_mutex_lock(&buf_pool.mutex); + continue; + } + + if (state < buf_page_t::READ_FIX && bpage->lock.u_lock_try(true)) + { + ut_ad(!bpage->is_io_fixed()); + bool do_evict= evict; + switch (bpage->oldest_modification()) { + case 1: + mysql_mutex_lock(&buf_pool.flush_list_mutex); + if (ut_d(lsn_t lsn=) bpage->oldest_modification()) + { + ut_ad(lsn == 1); /* It must be clean while we hold bpage->lock */ + buf_pool.delete_from_flush_list(bpage); + } + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + /* fall through */ + case 0: + bpage->lock.u_unlock(true); + goto evict; + case 2: + /* LRU flushing will always evict pages of the temporary tablespace. */ + do_evict= true; + } + /* Block is ready for flush. Dispatch an IO request. + If do_evict, the page may be evicted by buf_page_write_complete(). 
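The switch on oldest_modification() above relies on reserved small values rather than real LSNs. As used throughout this file (my summary of the surrounding assertions, e.g. `lsn == 1 || lsn > 2` and the `== 2` temporary-tablespace checks), the encoding can be sketched as:

```cpp
#include <cstdint>

// Classifier for buf_page_t::oldest_modification() sentinel values:
//   0  - clean page, not on buf_pool.flush_list
//   1  - write completed; still linked, pending detach from flush_list
//   2  - dirty page of the temporary tablespace (not LSN-ordered)
//  >2  - oldest LSN at which a persistent page was modified
enum class dirty_state { clean, detaching, dirty_temporary, dirty_persistent };

dirty_state classify(uint64_t oldest_modification) // lsn_t in InnoDB
{
  switch (oldest_modification)
  {
  case 0: return dirty_state::clean;
  case 1: return dirty_state::detaching;
  case 2: return dirty_state::dirty_temporary;
  default: return dirty_state::dirty_persistent;
  }
}
```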
*/ + const page_id_t page_id(bpage->id()); + const uint32_t space_id= page_id.space(); + if (!space || space->id != space_id) + { + if (last_space_id != space_id) + { + buf_pool.lru_hp.set(bpage); + mysql_mutex_unlock(&buf_pool.mutex); + if (space) + space->release(); + auto p= buf_flush_space(space_id); + space= p.first; + last_space_id= space_id; + if (!space) + { + mysql_mutex_lock(&buf_pool.mutex); + goto no_space; + } + mysql_mutex_lock(&buf_pool.mutex); + buf_pool.stat.n_pages_written+= p.second; + } + else + { + ut_ad(!space); + goto no_space; + } + } + else if (space->is_stopping_writes()) + { + space->release(); + space= nullptr; + no_space: + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_flush_discard_page(bpage); + continue; + } + + if (n->flushed >= max && !recv_recovery_is_on()) + { + bpage->lock.u_unlock(true); + break; + } + + if (neighbors && space->is_rotational()) + n->flushed+= buf_flush_try_neighbors(space, page_id, bpage, + neighbors == 1, + do_evict, n->flushed, max); + else if (bpage->flush(do_evict, space)) + ++n->flushed; + else + continue; + + goto reacquire_mutex; + } + else + /* Can't evict or dispatch this block. Go to previous. */ + ut_ad(buf_pool.lru_hp.is_hp(prev)); + } + + buf_pool.lru_hp.set(nullptr); + + if (space) + space->release(); + + if (scanned) + MONITOR_INC_VALUE_CUMULATIVE(MONITOR_LRU_BATCH_SCANNED, + MONITOR_LRU_BATCH_SCANNED_NUM_CALL, + MONITOR_LRU_BATCH_SCANNED_PER_CALL, + scanned); +} + +/** Flush and move pages from LRU or unzip_LRU list to the free list. +Whether LRU or unzip_LRU is used depends on the state of the system. +@param max maximum number of blocks to flush +@param evict whether dirty pages are to be evicted after flushing them +@param n counts of flushed and evicted pages */ +static void buf_do_LRU_batch(ulint max, bool evict, flush_counters_t *n) +{ + if (buf_LRU_evict_from_unzip_LRU()) + buf_free_from_unzip_LRU_list_batch(); + n->evicted= 0; + n->flushed= 0; + buf_flush_LRU_list_batch(max, evict, n); + + mysql_mutex_assert_owner(&buf_pool.mutex); + buf_lru_freed_page_count+= n->evicted; + buf_lru_flush_page_count+= n->flushed; + buf_pool.stat.n_pages_written+= n->flushed; +} + +/** This utility flushes dirty blocks from the end of the flush_list. +The calling thread is not allowed to own any latches on pages! +@param max_n maximum mumber of blocks to flush +@param lsn once an oldest_modification>=lsn is found, terminate the batch +@return number of blocks for which the write request was queued */ +static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn) +{ + ulint count= 0; + ulint scanned= 0; + + mysql_mutex_assert_owner(&buf_pool.mutex); + mysql_mutex_assert_owner(&buf_pool.flush_list_mutex); + + const auto neighbors= UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN + ? 0 : srv_flush_neighbors; + fil_space_t *space= nullptr; + uint32_t last_space_id= FIL_NULL; + static_assert(FIL_NULL > SRV_TMP_SPACE_ID, "consistency"); + static_assert(FIL_NULL > SRV_SPACE_ID_UPPER_BOUND, "consistency"); + + /* Start from the end of the list looking for a suitable block to be + flushed. 
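Both batch loops protect their scan position with a hazard pointer (buf_pool.lru_hp above, buf_pool.flush_hp below): before buf_pool.mutex is released, the next node is parked in a shared slot, and every code path that removes a node must redirect that slot so the scan never follows a dangling link. A minimal sketch of the protocol; illustrative only, InnoDB's real HazardPointer class is more elaborate, and all accesses are assumed to happen under the list's mutex:

```cpp
struct node { node* prev; };

struct hazard_pointer
{
  node* hp = nullptr;
  void set(node* n) { hp = n; }  // park the resume position
  node* get() const { return hp; }
  void adjust(node* removed)     // every remover must call this
  {
    if (hp == removed)
      hp = removed->prev;        // skip over the node being removed
  }
};
```

This is what keeps the scans O(n): after the mutex is reacquired, get() yields a node that is guaranteed to still be on the list, so the loop resumes instead of restarting from the tail.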
*/ + ulint len= UT_LIST_GET_LEN(buf_pool.flush_list); + + for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list); + bpage && len && count < max_n; ++scanned, len--) + { + const lsn_t oldest_modification= bpage->oldest_modification(); + if (oldest_modification >= lsn) + break; + ut_ad(bpage->in_file()); + + { + buf_page_t *prev= UT_LIST_GET_PREV(list, bpage); + + if (oldest_modification == 1) + { + clear: + buf_pool.delete_from_flush_list(bpage); + skip: + bpage= prev; + continue; + } + + ut_ad(oldest_modification > 2); + + if (!bpage->lock.u_lock_try(true)) + goto skip; + + ut_ad(!bpage->is_io_fixed()); + + if (bpage->oldest_modification() == 1) + { + bpage->lock.u_unlock(true); + goto clear; + } + + /* In order not to degenerate this scan to O(n*n) we attempt to + preserve the pointer position. Any thread that would remove 'prev' + from buf_pool.flush_list must adjust the hazard pointer. + + Note: A concurrent execution of buf_flush_list_space() may + terminate this scan prematurely. The buf_pool.flush_list_active + should prevent multiple threads from executing + buf_do_flush_list_batch() concurrently, + but buf_flush_list_space() is ignoring that. */ + buf_pool.flush_hp.set(prev); + } + + const page_id_t page_id(bpage->id()); + const uint32_t space_id= page_id.space(); + if (!space || space->id != space_id) + { + if (last_space_id != space_id) + { + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + mysql_mutex_unlock(&buf_pool.mutex); + if (space) + space->release(); + auto p= buf_flush_space(space_id); + space= p.first; + last_space_id= space_id; + mysql_mutex_lock(&buf_pool.mutex); + buf_pool.stat.n_pages_written+= p.second; + mysql_mutex_lock(&buf_pool.flush_list_mutex); + } + else + ut_ad(!space); + } + else if (space->is_stopping_writes()) + { + space->release(); + space= nullptr; + } + + if (!space) + buf_flush_discard_page(bpage); + else + { + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + do + { + if (neighbors && space->is_rotational()) + count+= buf_flush_try_neighbors(space, page_id, bpage, + neighbors == 1, false, count, max_n); + else if (bpage->flush(false, space)) + ++count; + else + continue; + mysql_mutex_lock(&buf_pool.mutex); + } + while (0); + } + + mysql_mutex_lock(&buf_pool.flush_list_mutex); + bpage= buf_pool.flush_hp.get(); + } + + buf_pool.flush_hp.set(nullptr); + + if (space) + space->release(); + + if (scanned) + MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_SCANNED, + MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL, + MONITOR_FLUSH_BATCH_SCANNED_PER_CALL, + scanned); + return count; +} + +/** Wait until a LRU flush batch ends. */ +void buf_flush_wait_LRU_batch_end() +{ + mysql_mutex_assert_owner(&buf_pool.flush_list_mutex); + mysql_mutex_assert_not_owner(&buf_pool.mutex); + + if (buf_pool.n_flush()) + { + tpool::tpool_wait_begin(); + thd_wait_begin(nullptr, THD_WAIT_DISKIO); + do + my_cond_wait(&buf_pool.done_flush_LRU, + &buf_pool.flush_list_mutex.m_mutex); + while (buf_pool.n_flush()); + tpool::tpool_wait_end(); + thd_wait_end(nullptr); + } +} + +/** Write out dirty blocks from buf_pool.flush_list. +The caller must invoke buf_dblwr.flush_buffered_writes() +after releasing buf_pool.mutex. 
+@param max_n wished maximum mumber of blocks flushed +@param lsn buf_pool.get_oldest_modification(LSN_MAX) target +@return the number of processed pages +@retval 0 if a buf_pool.flush_list batch is already running */ +static ulint buf_flush_list_holding_mutex(ulint max_n= ULINT_UNDEFINED, + lsn_t lsn= LSN_MAX) +{ + ut_ad(lsn); + mysql_mutex_assert_owner(&buf_pool.mutex); + + mysql_mutex_lock(&buf_pool.flush_list_mutex); + if (buf_pool.flush_list_active()) + { +nothing_to_do: + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + return 0; + } + if (!buf_pool.get_oldest_modification(0)) + { + pthread_cond_broadcast(&buf_pool.done_flush_list); + goto nothing_to_do; + } + buf_pool.flush_list_set_active(); + const ulint n_flushed= buf_do_flush_list_batch(max_n, lsn); + if (n_flushed) + buf_pool.stat.n_pages_written+= n_flushed; + buf_pool.flush_list_set_inactive(); + pthread_cond_broadcast(&buf_pool.done_flush_list); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + if (n_flushed) + MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_TOTAL_PAGE, + MONITOR_FLUSH_BATCH_COUNT, + MONITOR_FLUSH_BATCH_PAGES, + n_flushed); + + DBUG_PRINT("ib_buf", ("flush_list completed, " ULINTPF " pages", n_flushed)); + return n_flushed; +} + +/** Write out dirty blocks from buf_pool.flush_list. +@param max_n wished maximum mumber of blocks flushed +@param lsn buf_pool.get_oldest_modification(LSN_MAX) target +@return the number of processed pages +@retval 0 if a buf_pool.flush_list batch is already running */ +static ulint buf_flush_list(ulint max_n= ULINT_UNDEFINED, + lsn_t lsn= LSN_MAX) +{ + mysql_mutex_lock(&buf_pool.mutex); + ulint n= buf_flush_list_holding_mutex(max_n, lsn); + mysql_mutex_unlock(&buf_pool.mutex); + buf_dblwr.flush_buffered_writes(); + return n; +} + +/** Try to flush all the dirty pages that belong to a given tablespace. +@param space tablespace +@param n_flushed number of pages written +@return whether the flush for some pages might not have been initiated */ +bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed) +{ + const auto space_id= space->id; + ut_ad(space_id <= SRV_SPACE_ID_UPPER_BOUND); + + bool may_have_skipped= false; + ulint max_n_flush= srv_io_capacity; + ulint n_flush= 0; + + bool acquired= space->acquire_for_write(); + { + const uint32_t written{space->flush_freed(acquired)}; + mysql_mutex_lock(&buf_pool.mutex); + if (written) + buf_pool.stat.n_pages_written+= written; + } + mysql_mutex_lock(&buf_pool.flush_list_mutex); + + for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list); bpage; ) + { + ut_ad(bpage->oldest_modification()); + ut_ad(bpage->in_file()); + + buf_page_t *prev= UT_LIST_GET_PREV(list, bpage); + if (bpage->oldest_modification() == 1) + clear: + buf_pool.delete_from_flush_list(bpage); + else if (bpage->id().space() != space_id); + else if (!bpage->lock.u_lock_try(true)) + may_have_skipped= true; + else if (bpage->oldest_modification() == 1) + { + bpage->lock.u_unlock(true); + goto clear; + } + else + { + /* In order not to degenerate this scan to O(n*n) we attempt to + preserve the pointer position. Any thread that would remove 'prev' + from buf_pool.flush_list must adjust the hazard pointer. + + Note: Multiple executions of buf_flush_list_space() may be + interleaved, and also buf_do_flush_list_batch() may be running + concurrently. This may terminate our iteration prematurely, + leading us to return may_have_skipped=true. 
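Given that return contract, a hypothetical caller that must guarantee every dirty page of a tablespace was submitted would simply retry until buf_flush_list_space() reports no skips. This is a usage sketch under that assumption, not a call site from this patch; real callers also wait for pending writes:

```cpp
void flush_space_fully(fil_space_t* space)
{
  ulint n_flushed;
  while (buf_flush_list_space(space, &n_flushed))
  {
    /* Another thread interfered with the flush-list scan;
    rescan until every dirty page of the space was processed. */
  }
}
```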
*/ + buf_pool.flush_hp.set(prev); + + if (!acquired) + was_freed: + buf_flush_discard_page(bpage); + else + { + if (space->is_stopping_writes()) + { + space->release(); + acquired= false; + goto was_freed; + } + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + if (bpage->flush(false, space)) + { + ++n_flush; + if (!--max_n_flush) + { + mysql_mutex_lock(&buf_pool.mutex); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + may_have_skipped= true; + goto done; + } + mysql_mutex_lock(&buf_pool.mutex); + } + } + + mysql_mutex_lock(&buf_pool.flush_list_mutex); + if (!buf_pool.flush_hp.is_hp(prev)) + may_have_skipped= true; + bpage= buf_pool.flush_hp.get(); + continue; + } + + bpage= prev; + } + + /* Note: this loop may have been executed concurrently with + buf_do_flush_list_batch() as well as other threads executing + buf_flush_list_space(). We should always return true from + buf_flush_list_space() if that should be the case; in + buf_do_flush_list_batch() we will simply perform less work. */ +done: + buf_pool.flush_hp.set(nullptr); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + buf_pool.stat.n_pages_written+= n_flush; + + buf_pool.try_LRU_scan= true; + pthread_cond_broadcast(&buf_pool.done_free); + mysql_mutex_unlock(&buf_pool.mutex); + + if (n_flushed) + *n_flushed= n_flush; + + if (acquired) + space->release(); + + if (space->purpose == FIL_TYPE_IMPORT) + os_aio_wait_until_no_pending_writes(true); + else + buf_dblwr.flush_buffered_writes(); + + return may_have_skipped; +} + +/** Write out dirty blocks from buf_pool.LRU, +and move clean blocks to buf_pool.free. +The caller must invoke buf_dblwr.flush_buffered_writes() +after releasing buf_pool.mutex. +@param max_n wished maximum mumber of blocks flushed +@param evict whether to evict pages after flushing +@return evict ? number of processed pages : number of pages written */ +ulint buf_flush_LRU(ulint max_n, bool evict) +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + + flush_counters_t n; + buf_do_LRU_batch(max_n, evict, &n); + + ulint pages= n.flushed; + + if (n.evicted) + { + if (evict) + pages+= n.evicted; + buf_pool.try_LRU_scan= true; + pthread_cond_broadcast(&buf_pool.done_free); + } + + return pages; +} + +#ifdef HAVE_PMEM +# include +#endif + +/** Write checkpoint information to the log header and release mutex. +@param end_lsn start LSN of the FILE_CHECKPOINT mini-transaction */ +inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept +{ + ut_ad(!srv_read_only_mode); + ut_ad(end_lsn >= next_checkpoint_lsn); + ut_ad(end_lsn <= get_lsn()); + ut_ad(end_lsn + SIZE_OF_FILE_CHECKPOINT <= get_lsn() || + srv_shutdown_state > SRV_SHUTDOWN_INITIATED); + + DBUG_PRINT("ib_log", + ("checkpoint at " LSN_PF " written", next_checkpoint_lsn)); + + auto n= next_checkpoint_no; + const size_t offset{(n & 1) ? CHECKPOINT_2 : CHECKPOINT_1}; + static_assert(CPU_LEVEL1_DCACHE_LINESIZE >= 64, "efficiency"); + static_assert(CPU_LEVEL1_DCACHE_LINESIZE <= 4096, "compatibility"); + byte* c= my_assume_aligned + (is_pmem() ? 
buf + offset : checkpoint_buf); + memset_aligned(c, 0, CPU_LEVEL1_DCACHE_LINESIZE); + mach_write_to_8(my_assume_aligned<8>(c), next_checkpoint_lsn); + mach_write_to_8(my_assume_aligned<8>(c + 8), end_lsn); + mach_write_to_4(my_assume_aligned<4>(c + 60), my_crc32c(0, c, 60)); + + lsn_t resizing; + +#ifdef HAVE_PMEM + if (is_pmem()) + { + resizing= resize_lsn.load(std::memory_order_relaxed); + + if (resizing > 1 && resizing <= next_checkpoint_lsn) + { + memcpy_aligned<64>(resize_buf + CHECKPOINT_1, c, 64); + header_write(resize_buf, resizing, is_encrypted()); + pmem_persist(resize_buf, resize_target); + } + pmem_persist(c, 64); + } + else +#endif + { + ut_ad(!checkpoint_pending); + checkpoint_pending= true; + latch.wr_unlock(); + log_write_and_flush_prepare(); + resizing= resize_lsn.load(std::memory_order_relaxed); + /* FIXME: issue an asynchronous write */ + log.write(offset, {c, get_block_size()}); + if (resizing > 1 && resizing <= next_checkpoint_lsn) + { + byte *buf= static_cast(aligned_malloc(4096, 4096)); + memset_aligned<4096>(buf, 0, 4096); + header_write(buf, resizing, is_encrypted()); + resize_log.write(0, {buf, 4096}); + aligned_free(buf); + resize_log.write(CHECKPOINT_1, {c, get_block_size()}); + } + + if (srv_file_flush_method != SRV_O_DSYNC) + ut_a(log.flush()); + latch.wr_lock(SRW_LOCK_CALL); + ut_ad(checkpoint_pending); + checkpoint_pending= false; + resizing= resize_lsn.load(std::memory_order_relaxed); + } + + ut_ad(!checkpoint_pending); + next_checkpoint_no++; + const lsn_t checkpoint_lsn{next_checkpoint_lsn}; + last_checkpoint_lsn= checkpoint_lsn; + + DBUG_PRINT("ib_log", ("checkpoint ended at " LSN_PF ", flushed to " LSN_PF, + checkpoint_lsn, get_flushed_lsn())); + if (overwrite_warned) + { + sql_print_information("InnoDB: Crash recovery was broken " + "between LSN=" LSN_PF + " and checkpoint LSN=" LSN_PF ".", + overwrite_warned, checkpoint_lsn); + overwrite_warned= 0; + } + + lsn_t resizing_completed= 0; + + if (resizing > 1 && resizing <= checkpoint_lsn) + { + ut_ad(is_pmem() == !resize_flush_buf); + + if (!is_pmem()) + { + if (srv_file_flush_method != SRV_O_DSYNC) + ut_a(resize_log.flush()); + IF_WIN(log.close(),); + } + + if (resize_rename()) + { + /* Resizing failed. Discard the log_sys.resize_log. */ +#ifdef HAVE_PMEM + if (is_pmem()) + my_munmap(resize_buf, resize_target); + else +#endif + { + ut_free_dodump(resize_buf, buf_size); + ut_free_dodump(resize_flush_buf, buf_size); +#ifdef _WIN32 + ut_ad(!log.is_opened()); + bool success; + log.m_file= + os_file_create_func(get_log_file_path().c_str(), + OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT, + OS_FILE_NORMAL, OS_LOG_FILE, false, &success); + ut_a(success); + ut_a(log.is_opened()); +#endif + } + } + else + { + /* Adopt the resized log. 
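The checkpoint record that write_checkpoint() prepares above is a 64-byte block: the checkpoint LSN at offset 0, the end LSN at offset 8, zero padding, and a CRC of the first 60 bytes at offset 60. A sketch of that layout; crc32c() is assumed to behave like my_crc32c() (the bitwise stand-in shown earlier would do), and stores are big-endian as in mach_write_to_8()/mach_write_to_4():

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>

uint32_t crc32c(uint32_t crc, const unsigned char* s, size_t len); // assumed

static void store_be(unsigned char* p, uint64_t v, int n)
{
  for (int i = n; i--; v >>= 8)
    p[i] = (unsigned char)v; // most significant byte first
}

void make_checkpoint_block(unsigned char block[64],
                           uint64_t checkpoint_lsn, uint64_t end_lsn)
{
  memset(block, 0, 64);
  store_be(block, checkpoint_lsn, 8);            // offset 0: checkpoint LSN
  store_be(block + 8, end_lsn, 8);               // offset 8: end LSN
  store_be(block + 60, crc32c(0, block, 60), 4); // offset 60: CRC of bytes 0..59
}
```

The block lands at CHECKPOINT_1 or CHECKPOINT_2 depending on the low bit of the checkpoint number, which is what makes the two checkpoint slots alternate.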
*/ +#ifdef HAVE_PMEM + if (is_pmem()) + { + my_munmap(buf, file_size); + buf= resize_buf; + buf_free= START_OFFSET + (get_lsn() - resizing); + } + else +#endif + { + IF_WIN(,log.close()); + std::swap(log, resize_log); + ut_free_dodump(buf, buf_size); + ut_free_dodump(flush_buf, buf_size); + buf= resize_buf; + flush_buf= resize_flush_buf; + } + srv_log_file_size= resizing_completed= file_size= resize_target; + first_lsn= resizing; + set_capacity(); + } + ut_ad(!resize_log.is_opened()); + resize_buf= nullptr; + resize_flush_buf= nullptr; + resize_target= 0; + resize_lsn.store(0, std::memory_order_relaxed); + } + + log_resize_release(); + + if (UNIV_LIKELY(resizing <= 1)); + else if (resizing > checkpoint_lsn) + buf_flush_ahead(resizing, false); + else if (resizing_completed) + ib::info() << "Resized log to " << ib::bytes_iec{resizing_completed} + << "; start LSN=" << resizing; + else + buf_flush_ahead(end_lsn + 1, false); +} + +/** Initiate a log checkpoint, discarding the start of the log. +@param oldest_lsn the checkpoint LSN +@param end_lsn log_sys.get_lsn() +@return true if success, false if a checkpoint write was already running */ +static bool log_checkpoint_low(lsn_t oldest_lsn, lsn_t end_lsn) +{ + ut_ad(!srv_read_only_mode); +#ifndef SUX_LOCK_GENERIC + ut_ad(log_sys.latch.is_write_locked()); +#endif + ut_ad(oldest_lsn <= end_lsn); + ut_ad(end_lsn == log_sys.get_lsn()); + + if (oldest_lsn == log_sys.last_checkpoint_lsn || + (oldest_lsn == end_lsn && + !log_sys.resize_in_progress() && + oldest_lsn == log_sys.last_checkpoint_lsn + + (log_sys.is_encrypted() + ? SIZE_OF_FILE_CHECKPOINT + 8 : SIZE_OF_FILE_CHECKPOINT))) + { + /* Do nothing, because nothing was logged (other than a + FILE_CHECKPOINT record) since the previous checkpoint. */ + do_nothing: + log_sys.latch.wr_unlock(); + return true; + } + + ut_ad(!recv_no_log_write); + ut_ad(oldest_lsn > log_sys.last_checkpoint_lsn); + /* Repeat the FILE_MODIFY records after the checkpoint, in case some + log records between the checkpoint and log_sys.lsn need them. + Finally, write a FILE_CHECKPOINT record. Redo log apply expects to + see a FILE_CHECKPOINT after the checkpoint, except on clean + shutdown, where the log will be empty after the checkpoint. + + It is important that we write out the redo log before any further + dirty pages are flushed to the tablespace files. At this point, + because we hold exclusive log_sys.latch, + mtr_t::commit() in other threads will be blocked, + and no pages can be added to buf_pool.flush_list. */ + const lsn_t flush_lsn{fil_names_clear(oldest_lsn)}; + ut_ad(flush_lsn >= end_lsn + SIZE_OF_FILE_CHECKPOINT); + log_sys.latch.wr_unlock(); + log_write_up_to(flush_lsn, true); + log_sys.latch.wr_lock(SRW_LOCK_CALL); + if (log_sys.last_checkpoint_lsn >= oldest_lsn) + goto do_nothing; + + ut_ad(log_sys.get_flushed_lsn() >= flush_lsn); + + if (log_sys.checkpoint_pending) + { + /* A checkpoint write is running */ + log_sys.latch.wr_unlock(); + return false; + } + + log_sys.next_checkpoint_lsn= oldest_lsn; + log_sys.write_checkpoint(end_lsn); + + return true; +} + +/** Make a checkpoint. Note that this function does not flush dirty +blocks from the buffer pool: it only checks what is lsn of the oldest +modification in the pool, and writes information about the lsn in +log file. Use log_make_checkpoint() to flush also the pool. 
+@retval true if the checkpoint was or had been made +@retval false if a checkpoint write was already running */ +static bool log_checkpoint() +{ + if (recv_recovery_is_on()) + recv_sys.apply(true); + + switch (srv_file_flush_method) { + case SRV_NOSYNC: + case SRV_O_DIRECT_NO_FSYNC: + break; + default: + fil_flush_file_spaces(); + } + + log_sys.latch.wr_lock(SRW_LOCK_CALL); + const lsn_t end_lsn= log_sys.get_lsn(); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + const lsn_t oldest_lsn= buf_pool.get_oldest_modification(end_lsn); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + return log_checkpoint_low(oldest_lsn, end_lsn); +} + +/** Make a checkpoint. */ +ATTRIBUTE_COLD void log_make_checkpoint() +{ + buf_flush_wait_flushed(log_sys.get_lsn(std::memory_order_acquire)); + while (!log_checkpoint()); +} + +/** Wait for all dirty pages up to an LSN to be written out. +NOTE: The calling thread is not allowed to hold any buffer page latches! */ +static void buf_flush_wait(lsn_t lsn) +{ + ut_ad(lsn <= log_sys.get_lsn()); + + lsn_t oldest_lsn; + + while ((oldest_lsn= buf_pool.get_oldest_modification(lsn)) < lsn) + { + if (buf_flush_sync_lsn < lsn) + { + buf_flush_sync_lsn= lsn; + buf_pool.page_cleaner_set_idle(false); + pthread_cond_signal(&buf_pool.do_flush_list); + my_cond_wait(&buf_pool.done_flush_list, + &buf_pool.flush_list_mutex.m_mutex); + oldest_lsn= buf_pool.get_oldest_modification(lsn); + if (oldest_lsn >= lsn) + break; + } + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + os_aio_wait_until_no_pending_writes(false); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + } + + if (oldest_lsn >= buf_flush_sync_lsn) + { + buf_flush_sync_lsn= 0; + pthread_cond_broadcast(&buf_pool.done_flush_list); + } +} + +/** Wait until all persistent pages are flushed up to a limit. +@param sync_lsn buf_pool.get_oldest_modification(LSN_MAX) to wait for */ +ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn) +{ + ut_ad(sync_lsn); + ut_ad(sync_lsn < LSN_MAX); + ut_ad(!srv_read_only_mode); + + if (recv_recovery_is_on()) + recv_sys.apply(true); + + mysql_mutex_lock(&buf_pool.flush_list_mutex); + + if (buf_pool.get_oldest_modification(sync_lsn) < sync_lsn) + { + MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS); + +#if 1 /* FIXME: remove this, and guarantee that the page cleaner serves us */ + if (UNIV_UNLIKELY(!buf_page_cleaner_is_active)) + { + do + { + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + ulint n_pages= buf_flush_list(srv_max_io_capacity, sync_lsn); + if (n_pages) + { + MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_SYNC_TOTAL_PAGE, + MONITOR_FLUSH_SYNC_COUNT, + MONITOR_FLUSH_SYNC_PAGES, n_pages); + } + os_aio_wait_until_no_pending_writes(false); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + } + while (buf_pool.get_oldest_modification(sync_lsn) < sync_lsn); + } + else +#endif + { + thd_wait_begin(nullptr, THD_WAIT_DISKIO); + tpool::tpool_wait_begin(); + buf_flush_wait(sync_lsn); + tpool::tpool_wait_end(); + thd_wait_end(nullptr); + } + } + + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + if (UNIV_UNLIKELY(log_sys.last_checkpoint_lsn < sync_lsn)) + { + /* If the buffer pool was clean, no log write was guaranteed + to happen until now. There could be an outstanding FILE_CHECKPOINT + record from a previous fil_names_clear() call, which we must + write out before we can advance the checkpoint. */ + log_write_up_to(sync_lsn, true); + DBUG_EXECUTE_IF("ib_log_checkpoint_avoid_hard", return;); + log_checkpoint(); + } +} + +/** Initiate more eager page flushing if the log checkpoint age is too old. 
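+For orientation, a hedged sketch of the intended call pattern
+(hypothetical caller):
+@code
+  // Ask the page cleaner to advance the oldest modification past lsn,
+  // throttled by innodb_io_capacity:
+  buf_flush_ahead(lsn, false);
+  // ...or without a throttle when the log is about to overflow:
+  buf_flush_ahead(lsn, true);
+@endcode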
+@param lsn buf_pool.get_oldest_modification(LSN_MAX) target
+@param furious true=furious flushing, false=limit to innodb_io_capacity */
+ATTRIBUTE_COLD void buf_flush_ahead(lsn_t lsn, bool furious)
+{
+  ut_ad(!srv_read_only_mode);
+
+  if (recv_recovery_is_on())
+    recv_sys.apply(true);
+
+  DBUG_EXECUTE_IF("ib_log_checkpoint_avoid_hard", return;);
+
+  Atomic_relaxed<lsn_t> &limit= furious
+    ? buf_flush_sync_lsn : buf_flush_async_lsn;
+
+  if (limit < lsn)
+  {
+    mysql_mutex_lock(&buf_pool.flush_list_mutex);
+    if (limit < lsn)
+    {
+      limit= lsn;
+      buf_pool.page_cleaner_set_idle(false);
+      pthread_cond_signal(&buf_pool.do_flush_list);
+    }
+    mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+  }
+}
+
+/** Conduct checkpoint-related flushing for innodb_flush_sync=ON,
+and try to initiate checkpoints until the target is met.
+@param lsn minimum value of buf_pool.get_oldest_modification(LSN_MAX) */
+ATTRIBUTE_COLD static void buf_flush_sync_for_checkpoint(lsn_t lsn)
+{
+  ut_ad(!srv_read_only_mode);
+  mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);
+
+  for (;;)
+  {
+    if (ulint n_flushed= buf_flush_list(srv_max_io_capacity, lsn))
+    {
+      MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_SYNC_TOTAL_PAGE,
+                                   MONITOR_FLUSH_SYNC_COUNT,
+                                   MONITOR_FLUSH_SYNC_PAGES, n_flushed);
+    }
+
+    switch (srv_file_flush_method) {
+    case SRV_NOSYNC:
+    case SRV_O_DIRECT_NO_FSYNC:
+      break;
+    default:
+      fil_flush_file_spaces();
+    }
+
+    log_sys.latch.wr_lock(SRW_LOCK_CALL);
+    const lsn_t newest_lsn= log_sys.get_lsn();
+    mysql_mutex_lock(&buf_pool.flush_list_mutex);
+    lsn_t measure= buf_pool.get_oldest_modification(0);
+    const lsn_t checkpoint_lsn= measure ? measure : newest_lsn;
+
+    if (!recv_recovery_is_on() &&
+        checkpoint_lsn > log_sys.last_checkpoint_lsn + SIZE_OF_FILE_CHECKPOINT)
+    {
+      mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+      log_checkpoint_low(checkpoint_lsn, newest_lsn);
+      mysql_mutex_lock(&buf_pool.flush_list_mutex);
+      measure= buf_pool.get_oldest_modification(LSN_MAX);
+    }
+    else
+    {
+      log_sys.latch.wr_unlock();
+      if (!measure)
+        measure= LSN_MAX;
+    }
+
+    /* After attempting log checkpoint, check if we have reached our target. */
+    const lsn_t target= buf_flush_sync_lsn;
+
+    if (measure >= target)
+      buf_flush_sync_lsn= 0;
+    else if (measure >= buf_flush_async_lsn)
+      buf_flush_async_lsn= 0;
+
+    /* wake up buf_flush_wait() */
+    pthread_cond_broadcast(&buf_pool.done_flush_list);
+    mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+    lsn= std::max(lsn, target);
+
+    if (measure >= lsn)
+      return;
+  }
+}
+
+/** Check if the adaptive flushing threshold is recommended based on
+how much of the redo log capacity is filled.
+@param oldest_lsn buf_pool.get_oldest_modification()
+@return true if adaptive flushing is recommended. */
+static bool af_needed_for_redo(lsn_t oldest_lsn)
+{
+  lsn_t age= (log_sys.get_lsn() - oldest_lsn);
+  lsn_t af_lwm= static_cast<lsn_t>(srv_adaptive_flushing_lwm *
+                                   static_cast<double>(log_sys.log_capacity) / 100);
+
+  /* if age > af_lwm adaptive flushing is recommended */
+  return (age > af_lwm);
+}
+
+/*********************************************************************//**
+Calculates if flushing is required based on redo generation rate.
+@return percent of io_capacity to flush to manage redo space */
+static
+ulint
+af_get_pct_for_lsn(
+/*===============*/
+  lsn_t age) /*!< in: current age of LSN. */
+{
+  lsn_t af_lwm = static_cast<lsn_t>(
+    srv_adaptive_flushing_lwm
+    * static_cast<double>(log_sys.log_capacity) / 100);
+
+  if (age < af_lwm) {
+    /* No adaptive flushing.
+    */
+    return(0);
+  }
+
+  lsn_t lsn_age_factor = (age * 100) / log_sys.max_modified_age_async;
+
+  ut_ad(srv_max_io_capacity >= srv_io_capacity);
+  return static_cast<ulint>(
+    (static_cast<double>(srv_max_io_capacity / srv_io_capacity
+                         * lsn_age_factor)
+     * sqrt(static_cast<double>(lsn_age_factor))
+     / 7.5));
+}
+
+/** This function is called approximately once every second by
+buf_flush_page_cleaner() if innodb_max_dirty_pages_pct_lwm>0
+and innodb_adaptive_flushing=ON.
+Based on various factors it decides if there is a need to do flushing.
+@return number of pages recommended to be flushed
+@param last_pages_in number of pages flushed in previous batch
+@param oldest_lsn buf_pool.get_oldest_modification(0)
+@param pct_lwm innodb_max_dirty_pages_pct_lwm, or 0 to ignore it
+@param dirty_blocks UT_LIST_GET_LEN(buf_pool.flush_list)
+@param dirty_pct 100*flush_list.count / (LRU.count + free.count) */
+static ulint page_cleaner_flush_pages_recommendation(ulint last_pages_in,
+                                                     lsn_t oldest_lsn,
+                                                     double pct_lwm,
+                                                     ulint dirty_blocks,
+                                                     double dirty_pct)
+{
+  static lsn_t prev_lsn = 0;
+  static ulint sum_pages = 0;
+  static ulint avg_page_rate = 0;
+  static ulint n_iterations = 0;
+  static time_t prev_time;
+  lsn_t lsn_rate;
+  ulint n_pages = 0;
+
+  const lsn_t cur_lsn = log_sys.get_lsn();
+  ut_ad(oldest_lsn <= cur_lsn);
+  ulint pct_for_lsn = af_get_pct_for_lsn(cur_lsn - oldest_lsn);
+  time_t curr_time = time(nullptr);
+  const double max_pct = srv_max_buf_pool_modified_pct;
+
+  if (!prev_lsn || !pct_for_lsn) {
+    prev_time = curr_time;
+    prev_lsn = cur_lsn;
+    if (max_pct > 0.0) {
+      dirty_pct /= max_pct;
+    }
+
+    n_pages = ulint(dirty_pct * double(srv_io_capacity));
+    if (n_pages < dirty_blocks) {
+      n_pages= std::min<ulint>(srv_io_capacity, dirty_blocks);
+    }
+
+func_exit:
+    page_cleaner.flush_pass++;
+    return n_pages;
+  }
+
+  sum_pages += last_pages_in;
+
+  const ulint time_elapsed = std::max<ulint>(curr_time - prev_time, 1);
+
+  /* We update our variables every innodb_flushing_avg_loops
+  iterations to smooth out transitions in the workload. */
+  if (++n_iterations >= srv_flushing_avg_loops
+      || time_elapsed >= srv_flushing_avg_loops) {
+
+    avg_page_rate = (sum_pages / time_elapsed + avg_page_rate) / 2;
+
+    /* How much LSN we have generated since the last call.
*/ + lsn_rate = (cur_lsn - prev_lsn) / time_elapsed; + + lsn_avg_rate = (lsn_avg_rate + lsn_rate) / 2; + + if (page_cleaner.flush_pass) { + page_cleaner.flush_time /= page_cleaner.flush_pass; + } + + prev_lsn = cur_lsn; + prev_time = curr_time; + + MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME, + page_cleaner.flush_time); + MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_PASS, + page_cleaner.flush_pass); + + page_cleaner.flush_time = 0; + page_cleaner.flush_pass = 0; + + n_iterations = 0; + sum_pages = 0; + } + + MONITOR_SET(MONITOR_FLUSH_PCT_FOR_LSN, pct_for_lsn); + + double total_ratio; + if (pct_lwm == 0.0 || max_pct == 0.0) { + total_ratio = 1; + } else { + total_ratio = std::max(double(pct_for_lsn) / 100, + (dirty_pct / max_pct)); + } + + MONITOR_SET(MONITOR_FLUSH_PCT_FOR_DIRTY, ulint(total_ratio * 100)); + + /* Estimate pages to be flushed for the lsn progress */ + lsn_t target_lsn = oldest_lsn + + lsn_avg_rate * buf_flush_lsn_scan_factor; + ulint pages_for_lsn = 0; + + mysql_mutex_lock(&buf_pool.flush_list_mutex); + + for (buf_page_t* b = UT_LIST_GET_LAST(buf_pool.flush_list); + b != NULL; + b = UT_LIST_GET_PREV(list, b)) { + if (b->oldest_modification() > target_lsn) { + break; + } + if (++pages_for_lsn >= srv_max_io_capacity) { + break; + } + } + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + pages_for_lsn /= buf_flush_lsn_scan_factor; + if (pages_for_lsn < 1) { + pages_for_lsn = 1; + } + + n_pages = (ulint(double(srv_io_capacity) * total_ratio) + + avg_page_rate + pages_for_lsn) / 3; + + if (n_pages > srv_max_io_capacity) { + n_pages = srv_max_io_capacity; + } + + MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_REQUESTED, n_pages); + + MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_BY_AGE, pages_for_lsn); + + MONITOR_SET(MONITOR_FLUSH_AVG_PAGE_RATE, avg_page_rate); + MONITOR_SET(MONITOR_FLUSH_LSN_AVG_RATE, lsn_avg_rate); + + goto func_exit; +} + +#if defined __aarch64__&&defined __GNUC__&&__GNUC__==4&&!defined __clang__ +/* Avoid GCC 4.8.5 internal compiler error "could not split insn". +We would only need this for buf_flush_page_cleaner(), +but GCC 4.8.5 does not support pop_options. */ +# pragma GCC optimize ("O0") +#endif +/** page_cleaner thread tasked with flushing dirty pages from the buffer +pools. As of now we'll have only one coordinator. 
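+
+A simplified, hedged sketch of the loop below (locking, shutdown
+handling and monitor updates are elided; last_pages, oldest_lsn,
+pct_lwm, dirty_blocks and dirty_pct are placeholders for state that
+the real loop maintains):
+@code
+  for (;;)
+  {
+    if (buf_flush_sync_lsn && srv_flush_sync)
+      buf_flush_sync_for_checkpoint(buf_flush_sync_lsn); // furious flushing
+    else if (ulint n= page_cleaner_flush_pages_recommendation(
+               last_pages, oldest_lsn, pct_lwm, dirty_blocks, dirty_pct))
+      buf_flush_list(n);                                 // adaptive batch
+    else
+      my_cond_wait(&buf_pool.do_flush_list,
+                   &buf_pool.flush_list_mutex.m_mutex);  // idle
+  }
+@endcode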
*/ +static void buf_flush_page_cleaner() +{ + my_thread_init(); +#ifdef UNIV_PFS_THREAD + pfs_register_thread(page_cleaner_thread_key); +#endif /* UNIV_PFS_THREAD */ + ut_ad(!srv_read_only_mode); + ut_ad(buf_page_cleaner_is_active); + + ulint last_pages= 0; + timespec abstime; + set_timespec(abstime, 1); + + lsn_t lsn_limit; + ulint last_activity_count= srv_get_activity_count(); + + for (;;) + { + lsn_limit= buf_flush_sync_lsn; + + if (UNIV_UNLIKELY(lsn_limit != 0) && UNIV_LIKELY(srv_flush_sync)) + { + furious_flush: + buf_flush_sync_for_checkpoint(lsn_limit); + last_pages= 0; + set_timespec(abstime, 1); + continue; + } + + mysql_mutex_lock(&buf_pool.flush_list_mutex); + if (buf_pool.ran_out()) + goto no_wait; + else if (srv_shutdown_state > SRV_SHUTDOWN_INITIATED) + break; + + if (buf_pool.page_cleaner_idle() && + (!UT_LIST_GET_LEN(buf_pool.flush_list) || + srv_max_dirty_pages_pct_lwm == 0.0)) + /* We are idle; wait for buf_pool.page_cleaner_wakeup() */ + my_cond_wait(&buf_pool.do_flush_list, + &buf_pool.flush_list_mutex.m_mutex); + else + my_cond_timedwait(&buf_pool.do_flush_list, + &buf_pool.flush_list_mutex.m_mutex, &abstime); + no_wait: + set_timespec(abstime, 1); + + lsn_limit= buf_flush_sync_lsn; + lsn_t oldest_lsn= buf_pool.get_oldest_modification(0); + + if (!oldest_lsn) + { + fully_unemployed: + buf_flush_sync_lsn= 0; + set_idle: + buf_pool.page_cleaner_set_idle(true); + set_almost_idle: + pthread_cond_broadcast(&buf_pool.done_flush_LRU); + pthread_cond_broadcast(&buf_pool.done_flush_list); + if (UNIV_UNLIKELY(srv_shutdown_state > SRV_SHUTDOWN_INITIATED)) + break; + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + buf_dblwr.flush_buffered_writes(); + + do + { + DBUG_EXECUTE_IF("ib_log_checkpoint_avoid", continue;); + DBUG_EXECUTE_IF("ib_log_checkpoint_avoid_hard", continue;); + + if (!recv_recovery_is_on() && + !srv_startup_is_before_trx_rollback_phase && + srv_operation <= SRV_OPERATION_EXPORT_RESTORED) + log_checkpoint(); + } + while (false); + + if (!buf_pool.ran_out()) + continue; + mysql_mutex_lock(&buf_pool.flush_list_mutex); + oldest_lsn= buf_pool.get_oldest_modification(0); + } + + lsn_t soft_lsn_limit= buf_flush_async_lsn; + + if (UNIV_UNLIKELY(lsn_limit != 0)) + { + if (srv_flush_sync) + goto do_furious_flush; + if (oldest_lsn >= lsn_limit) + { + buf_flush_sync_lsn= 0; + pthread_cond_broadcast(&buf_pool.done_flush_list); + } + else if (lsn_limit > soft_lsn_limit) + soft_lsn_limit= lsn_limit; + } + + double pct_lwm= 0.0; + ulint n_flushed= 0, n; + + if (UNIV_UNLIKELY(soft_lsn_limit != 0)) + { + if (oldest_lsn >= soft_lsn_limit) + buf_flush_async_lsn= soft_lsn_limit= 0; + } + else if (buf_pool.ran_out()) + { + buf_pool.page_cleaner_set_idle(false); + buf_pool.n_flush_inc(); + /* Remove clean blocks from buf_pool.flush_list before the LRU scan. 
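+    Here a value of 1 from oldest_modification() is a sentinel: the
+    page write has already completed and the block merely awaits
+    removal from flush_list; anything greater than 2 is a real LSN,
+    which is what the assertion below checks.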
*/ + for (buf_page_t *p= UT_LIST_GET_FIRST(buf_pool.flush_list); p; ) + { + const lsn_t lsn{p->oldest_modification()}; + ut_ad(lsn > 2 || lsn == 1); + buf_page_t *n= UT_LIST_GET_NEXT(list, p); + if (lsn <= 1) + buf_pool.delete_from_flush_list(p); + p= n; + } + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + n= srv_max_io_capacity; + mysql_mutex_lock(&buf_pool.mutex); + LRU_flush: + n= buf_flush_LRU(n, false); + mysql_mutex_unlock(&buf_pool.mutex); + last_pages+= n; + check_oldest_and_set_idle: + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_pool.n_flush_dec_holding_mutex(); + oldest_lsn= buf_pool.get_oldest_modification(0); + if (!oldest_lsn) + goto fully_unemployed; + if (oldest_lsn >= buf_flush_async_lsn) + buf_flush_async_lsn= 0; + buf_pool.page_cleaner_set_idle(false); + goto set_almost_idle; + } + else if (UNIV_UNLIKELY(srv_shutdown_state > SRV_SHUTDOWN_INITIATED)) + break; + + const ulint dirty_blocks= UT_LIST_GET_LEN(buf_pool.flush_list); + /* We perform dirty reads of the LRU+free list lengths here. + Division by zero is not possible, because buf_pool.flush_list is + guaranteed to be nonempty, and it is a subset of buf_pool.LRU. */ + const double dirty_pct= double(dirty_blocks) * 100.0 / + double(UT_LIST_GET_LEN(buf_pool.LRU) + UT_LIST_GET_LEN(buf_pool.free)); + pct_lwm= srv_max_dirty_pages_pct_lwm; + if (pct_lwm != 0.0) + { + const ulint activity_count= srv_get_activity_count(); + if (activity_count != last_activity_count) + { + last_activity_count= activity_count; + goto maybe_unemployed; + } + else if (buf_pool.page_cleaner_idle() && !os_aio_pending_reads()) + { + /* reaching here means 3 things: + - last_activity_count == activity_count: suggesting server is idle + (no trx_t::commit() activity) + - page cleaner is idle (dirty_pct < srv_max_dirty_pages_pct_lwm) + - there are no pending reads but there are dirty pages to flush */ + buf_pool.update_last_activity_count(activity_count); + buf_pool.n_flush_inc(); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + goto idle_flush; + } + else + { + maybe_unemployed: + const bool below{dirty_pct < pct_lwm}; + pct_lwm= 0.0; + if (below) + goto possibly_unemployed; + } + } + else if (dirty_pct < srv_max_buf_pool_modified_pct) + possibly_unemployed: + if (!soft_lsn_limit && !af_needed_for_redo(oldest_lsn)) + goto set_idle; + + buf_pool.page_cleaner_set_idle(false); + buf_pool.n_flush_inc(); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + if (UNIV_UNLIKELY(soft_lsn_limit != 0)) + { + n= srv_max_io_capacity; + goto background_flush; + } + + if (!srv_adaptive_flushing) + { + idle_flush: + n= srv_io_capacity; + soft_lsn_limit= LSN_MAX; + background_flush: + mysql_mutex_lock(&buf_pool.mutex); + n_flushed= buf_flush_list_holding_mutex(n, soft_lsn_limit); + MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE, + MONITOR_FLUSH_BACKGROUND_COUNT, + MONITOR_FLUSH_BACKGROUND_PAGES, + n_flushed); + } + else if ((n= page_cleaner_flush_pages_recommendation(last_pages, + oldest_lsn, + pct_lwm, + dirty_blocks, + dirty_pct)) != 0) + { + const ulint tm= ut_time_ms(); + mysql_mutex_lock(&buf_pool.mutex); + last_pages= n_flushed= buf_flush_list_holding_mutex(n); + page_cleaner.flush_time+= ut_time_ms() - tm; + MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE, + MONITOR_FLUSH_ADAPTIVE_COUNT, + MONITOR_FLUSH_ADAPTIVE_PAGES, + n_flushed); + } + else if (buf_flush_async_lsn <= oldest_lsn) + goto check_oldest_and_set_idle; + + n= n >= n_flushed ? 
n - n_flushed : 0; + goto LRU_flush; + } + + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + if (srv_fast_shutdown != 2) + { + buf_dblwr.flush_buffered_writes(); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_flush_wait_LRU_batch_end(); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + os_aio_wait_until_no_pending_writes(false); + } + + mysql_mutex_lock(&buf_pool.flush_list_mutex); + lsn_limit= buf_flush_sync_lsn; + if (UNIV_UNLIKELY(lsn_limit != 0)) + { + do_furious_flush: + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + goto furious_flush; + } + buf_page_cleaner_is_active= false; + pthread_cond_broadcast(&buf_pool.done_flush_list); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + my_thread_end(); + +#ifdef UNIV_PFS_THREAD + pfs_delete_thread(); +#endif +} + +/** Initialize page_cleaner. */ +ATTRIBUTE_COLD void buf_flush_page_cleaner_init() +{ + ut_ad(!buf_page_cleaner_is_active); + ut_ad(srv_operation <= SRV_OPERATION_EXPORT_RESTORED || + srv_operation == SRV_OPERATION_RESTORE || + srv_operation == SRV_OPERATION_RESTORE_EXPORT); + buf_flush_async_lsn= 0; + buf_flush_sync_lsn= 0; + buf_page_cleaner_is_active= true; + std::thread(buf_flush_page_cleaner).detach(); +} + +/** Flush the buffer pool on shutdown. */ +ATTRIBUTE_COLD void buf_flush_buffer_pool() +{ + ut_ad(!os_aio_pending_reads()); + ut_ad(!buf_page_cleaner_is_active); + ut_ad(!buf_flush_sync_lsn); + + service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, + "Waiting to flush the buffer pool"); + + mysql_mutex_lock(&buf_pool.flush_list_mutex); + + while (buf_pool.get_oldest_modification(0)) + { + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + buf_flush_list(srv_max_io_capacity); + os_aio_wait_until_no_pending_writes(false); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, + "Waiting to flush " ULINTPF " pages", + UT_LIST_GET_LEN(buf_pool.flush_list)); + } + + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + ut_ad(!os_aio_pending_reads()); +} + +/** Synchronously flush dirty blocks during recv_sys_t::apply(). +NOTE: The calling thread is not allowed to hold any buffer page latches! */ +void buf_flush_sync_batch(lsn_t lsn) +{ + lsn= std::max(lsn, log_sys.get_lsn()); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_flush_wait(lsn); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); +} + +/** Synchronously flush dirty blocks. +NOTE: The calling thread is not allowed to hold any buffer page latches! */ +void buf_flush_sync() +{ + if (recv_recovery_is_on()) + { + mysql_mutex_lock(&recv_sys.mutex); + recv_sys.apply(true); + mysql_mutex_unlock(&recv_sys.mutex); + } + + thd_wait_begin(nullptr, THD_WAIT_DISKIO); + tpool::tpool_wait_begin(); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + for (;;) + { + const lsn_t lsn= log_sys.get_lsn(); + buf_flush_wait(lsn); + /* Wait for the page cleaner to be idle (for log resizing at startup) */ + while (buf_flush_sync_lsn) + my_cond_wait(&buf_pool.done_flush_list, + &buf_pool.flush_list_mutex.m_mutex); + if (lsn == log_sys.get_lsn()) + break; + } + + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + tpool::tpool_wait_end(); + thd_wait_end(nullptr); +} + +#ifdef UNIV_DEBUG +/** Functor to validate the flush list. */ +struct Check { + void operator()(const buf_page_t* elem) const + { + ut_ad(elem->oldest_modification()); + ut_ad(!fsp_is_system_temporary(elem->id().space())); + } +}; + +/** Validate the flush list. 
*/ +static void buf_flush_validate_low() +{ + buf_page_t* bpage; + + mysql_mutex_assert_owner(&buf_pool.flush_list_mutex); + + ut_list_validate(buf_pool.flush_list, Check()); + + bpage = UT_LIST_GET_FIRST(buf_pool.flush_list); + + while (bpage != NULL) { + const lsn_t om = bpage->oldest_modification(); + /* A page in buf_pool.flush_list can be in + BUF_BLOCK_REMOVE_HASH state. This happens when a page + is in the middle of being relocated. In that case the + original descriptor can have this state and still be + in the flush list waiting to acquire the + buf_pool.flush_list_mutex to complete the relocation. */ + ut_d(const auto s= bpage->state()); + ut_ad(s >= buf_page_t::REMOVE_HASH); + ut_ad(om == 1 || om > 2); + + bpage = UT_LIST_GET_NEXT(list, bpage); + ut_ad(om == 1 || !bpage || recv_recovery_is_on() + || om >= bpage->oldest_modification()); + } +} + +/** Validate the flush list. */ +void buf_flush_validate() +{ + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_flush_validate_low(); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); +} +#endif /* UNIV_DEBUG */ diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc new file mode 100644 index 00000000..65ee8fa3 --- /dev/null +++ b/storage/innobase/buf/buf0lru.cc @@ -0,0 +1,1452 @@ +/***************************************************************************** + +Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file buf/buf0lru.cc +The database buffer replacement algorithm + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#include "buf0lru.h" +#include "fil0fil.h" +#include "btr0btr.h" +#include "buf0buddy.h" +#include "buf0buf.h" +#include "buf0flu.h" +#include "buf0rea.h" +#include "btr0sea.h" +#include "os0file.h" +#include "page0zip.h" +#include "log0recv.h" +#include "srv0srv.h" +#include "srv0mon.h" +#include "my_cpu.h" + +/** Flush this many pages in buf_LRU_get_free_block() */ +size_t innodb_lru_flush_size; + +/** The number of blocks from the LRU_old pointer onward, including +the block pointed to, must be buf_pool.LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV +of the whole LRU list length, except that the tolerance defined below +is allowed. Note that the tolerance must be small enough such that for +even the BUF_LRU_OLD_MIN_LEN long LRU list, the LRU_old pointer is not +allowed to point to either end of the LRU list. */ + +static constexpr ulint BUF_LRU_OLD_TOLERANCE = 20; + +/** The minimum amount of non-old blocks when the LRU_old list exists +(that is, when there are more than BUF_LRU_OLD_MIN_LEN blocks). 
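+For example (hedged arithmetic): with the default
+innodb_old_blocks_pct=37 and an LRU list of 1000 blocks, about 370
+blocks would be kept "old", while at least BUF_LRU_NON_OLD_MIN_LEN
+blocks must always remain "new".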
+@see buf_LRU_old_adjust_len */
+#define BUF_LRU_NON_OLD_MIN_LEN 5
+
+/** If we switch on the InnoDB monitor because there are too few available
+frames in the buffer pool, we set this to TRUE */
+static bool buf_lru_switched_on_innodb_mon = false;
+
+/** True if the diagnostic message about it being difficult to find
+free blocks in the buffer pool has already been printed. */
+static bool buf_lru_free_blocks_error_printed;
+
+/******************************************************************//**
+These statistics are not 'of' LRU but 'for' LRU. We keep count of I/O
+and page_zip_decompress() operations. Based on the statistics,
+buf_LRU_evict_from_unzip_LRU() decides if we want to evict from
+unzip_LRU or the regular LRU. From unzip_LRU, we will only evict the
+uncompressed frame (meaning we can evict dirty blocks as well). From
+the regular LRU, we will evict the entire block (i.e.: both the
+uncompressed and compressed data), which must be clean. */
+
+/* @{ */
+
+/** Number of intervals for which we keep the history of these stats.
+Updated at SRV_MONITOR_INTERVAL (the buf_LRU_stat_update() call rate). */
+static constexpr ulint BUF_LRU_STAT_N_INTERVAL= 4;
+
+/** Coefficient with which we multiply I/O operations to equate them
+with page_zip_decompress() operations. */
+static constexpr ulint BUF_LRU_IO_TO_UNZIP_FACTOR= 50;
+
+/** Sampled values buf_LRU_stat_cur.
+Not protected by any mutex. Updated by buf_LRU_stat_update(). */
+static buf_LRU_stat_t buf_LRU_stat_arr[BUF_LRU_STAT_N_INTERVAL];
+
+/** Cursor to buf_LRU_stat_arr[] that is updated in a round-robin fashion. */
+static ulint buf_LRU_stat_arr_ind;
+
+/** Current operation counters. Not protected by any mutex. Cleared
+by buf_LRU_stat_update(). */
+buf_LRU_stat_t buf_LRU_stat_cur;
+
+/** Running sum of past values of buf_LRU_stat_cur.
+Updated by buf_LRU_stat_update(). Not protected by any mutex. */
+buf_LRU_stat_t buf_LRU_stat_sum;
+
+/* @} */
+
+/** @name Heuristics for detecting index scan @{ */
+/** Move blocks to "new" LRU list only if the first access was at
+least this many milliseconds ago. Not protected by any mutex or latch. */
+uint buf_LRU_old_threshold_ms;
+/* @} */
+
+/** Remove bpage from buf_pool.LRU and buf_pool.page_hash.
+
+If !bpage->frame && bpage->oldest_modification() <= 1,
+the object will be freed.
+
+@param bpage buffer block
+@param id page identifier
+@param chain locked buf_pool.page_hash chain (will be released here)
+@param zip whether bpage->zip of BUF_BLOCK_FILE_PAGE should be freed
+
+If a compressed page is freed, other compressed pages may be relocated.
+@retval true if bpage with bpage->frame was removed from page_hash. The
+caller needs to free the page to the free list
+@retval false if block without bpage->frame was removed from page_hash. In
+this case the block is already returned to the buddy allocator. */
+static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id,
+                                        buf_pool_t::hash_chain &chain,
+                                        bool zip);
+
+/** Free a block to buf_pool */
+static void buf_LRU_block_free_hashed_page(buf_block_t *block)
+{
+  block->page.free_file_page();
+  buf_LRU_block_free_non_file_page(block);
+}
+
+/** Increase LRU size in bytes by the page size.
+@param[in] bpage control block */ +static inline void incr_LRU_size_in_bytes(const buf_page_t* bpage) +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + + buf_pool.stat.LRU_bytes += bpage->physical_size(); + + ut_ad(buf_pool.stat.LRU_bytes <= buf_pool.curr_pool_size); +} + +/** @return whether the unzip_LRU list should be used for evicting a victim +instead of the general LRU list */ +bool buf_LRU_evict_from_unzip_LRU() +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + + /* If the unzip_LRU list is empty, we can only use the LRU. */ + if (UT_LIST_GET_LEN(buf_pool.unzip_LRU) == 0) { + return false; + } + + /* If unzip_LRU is at most 10% of the size of the LRU list, + then use the LRU. This slack allows us to keep hot + decompressed pages in the buffer pool. */ + if (UT_LIST_GET_LEN(buf_pool.unzip_LRU) + <= UT_LIST_GET_LEN(buf_pool.LRU) / 10) { + return false; + } + + /* If eviction hasn't started yet, we assume by default + that a workload is disk bound. */ + if (buf_pool.freed_page_clock == 0) { + return true; + } + + /* Calculate the average over past intervals, and add the values + of the current interval. */ + ulint io_avg = buf_LRU_stat_sum.io / BUF_LRU_STAT_N_INTERVAL + + buf_LRU_stat_cur.io; + + ulint unzip_avg = buf_LRU_stat_sum.unzip / BUF_LRU_STAT_N_INTERVAL + + buf_LRU_stat_cur.unzip; + + /* Decide based on our formula. If the load is I/O bound + (unzip_avg is smaller than the weighted io_avg), evict an + uncompressed frame from unzip_LRU. Otherwise we assume that + the load is CPU bound and evict from the regular LRU. */ + return(unzip_avg <= io_avg * BUF_LRU_IO_TO_UNZIP_FACTOR); +} + +/** Try to free an uncompressed page of a compressed block from the unzip +LRU list. The compressed page is preserved, and it need not be clean. +@param limit maximum number of blocks to scan +@return true if freed */ +static bool buf_LRU_free_from_unzip_LRU_list(ulint limit) +{ + if (!buf_LRU_evict_from_unzip_LRU()) { + return(false); + } + + ulint scanned = 0; + bool freed = false; + + for (buf_block_t* block = UT_LIST_GET_LAST(buf_pool.unzip_LRU); + block && scanned < limit; ++scanned) { + buf_block_t* prev_block = UT_LIST_GET_PREV(unzip_LRU, block); + + ut_ad(block->page.in_file()); + ut_ad(block->page.belongs_to_unzip_LRU()); + ut_ad(block->in_unzip_LRU_list); + ut_ad(block->page.in_LRU_list); + + freed = buf_LRU_free_page(&block->page, false); + if (freed) { + scanned++; + break; + } + + block = prev_block; + } + + if (scanned) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_LRU_UNZIP_SEARCH_SCANNED, + MONITOR_LRU_UNZIP_SEARCH_SCANNED_NUM_CALL, + MONITOR_LRU_UNZIP_SEARCH_SCANNED_PER_CALL, + scanned); + } + + return(freed); +} + +/** Try to free a clean page from the common LRU list. +@param limit maximum number of blocks to scan +@return whether a page was freed */ +static bool buf_LRU_free_from_common_LRU_list(ulint limit) +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + + ulint scanned = 0; + bool freed = false; + + for (buf_page_t* bpage = buf_pool.lru_scan_itr.start(); + bpage && scanned < limit; + ++scanned, bpage = buf_pool.lru_scan_itr.get()) { + buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage); + buf_pool.lru_scan_itr.set(prev); + + const auto accessed = bpage->is_accessed(); + + if (buf_LRU_free_page(bpage, true)) { + if (!accessed) { + /* Keep track of pages that are evicted without + ever being accessed. 
This gives us a measure of
+      the effectiveness of readahead */
+      ++buf_pool.stat.n_ra_pages_evicted;
+    }
+
+    freed = true;
+    scanned++;
+    break;
+  }
+}
+
+  MONITOR_INC_VALUE_CUMULATIVE(
+    MONITOR_LRU_SEARCH_SCANNED,
+    MONITOR_LRU_SEARCH_SCANNED_NUM_CALL,
+    MONITOR_LRU_SEARCH_SCANNED_PER_CALL,
+    scanned);
+
+  return(freed);
+}
+
+/** @return a buffer block from the buf_pool.free list
+@retval NULL if the free list is empty */
+buf_block_t* buf_LRU_get_free_only()
+{
+  buf_block_t* block;
+
+  mysql_mutex_assert_owner(&buf_pool.mutex);
+
+  block = reinterpret_cast<buf_block_t*>(
+    UT_LIST_GET_FIRST(buf_pool.free));
+
+  while (block != NULL) {
+    ut_ad(block->page.in_free_list);
+    ut_d(block->page.in_free_list = FALSE);
+    ut_ad(!block->page.oldest_modification());
+    ut_ad(!block->page.in_LRU_list);
+    ut_a(!block->page.in_file());
+    UT_LIST_REMOVE(buf_pool.free, &block->page);
+
+    if (!buf_pool.is_shrinking()
+        || UT_LIST_GET_LEN(buf_pool.withdraw)
+        >= buf_pool.withdraw_target
+        || !buf_pool.will_be_withdrawn(block->page)) {
+      /* No adaptive hash index entries may point to
+      a free block. */
+      assert_block_ahi_empty(block);
+
+      block->page.set_state(buf_page_t::MEMORY);
+      block->page.set_os_used();
+      break;
+    }
+
+    /* This should be withdrawn */
+    UT_LIST_ADD_LAST(buf_pool.withdraw, &block->page);
+    ut_d(block->in_withdraw_list = true);
+
+    block = reinterpret_cast<buf_block_t*>(
+      UT_LIST_GET_FIRST(buf_pool.free));
+  }
+
+  return(block);
+}
+
+/******************************************************************//**
+Checks how much of buf_pool is occupied by non-data objects like
+AHI, lock heaps etc. Depending on the size of non-data objects this
+function will either assert or issue a warning and switch on the
+status monitor. */
+static void buf_LRU_check_size_of_non_data_objects()
+{
+  mysql_mutex_assert_owner(&buf_pool.mutex);
+
+  if (recv_recovery_is_on() || buf_pool.n_chunks_new != buf_pool.n_chunks)
+    return;
+
+  const auto s= UT_LIST_GET_LEN(buf_pool.free) + UT_LIST_GET_LEN(buf_pool.LRU);
+
+  if (s < buf_pool.curr_size / 20)
+    ib::fatal() << "Over 95 percent of the buffer pool is"
+      " occupied by lock heaps"
+#ifdef BTR_CUR_HASH_ADAPT
+      " or the adaptive hash index"
+#endif /* BTR_CUR_HASH_ADAPT */
+      "! Check that your transactions do not set too many"
+      " row locks, or review if innodb_buffer_pool_size="
+      << (buf_pool.curr_size >> (20U - srv_page_size_shift))
+      << "M could be bigger.";
+
+  if (s < buf_pool.curr_size / 3)
+  {
+    if (!buf_lru_switched_on_innodb_mon && srv_monitor_timer)
+    {
+      /* Over 67 % of the buffer pool is occupied by lock heaps or
+      the adaptive hash index. This may be a memory leak! */
+      ib::warn() << "Over 67 percent of the buffer pool is"
+        " occupied by lock heaps"
+#ifdef BTR_CUR_HASH_ADAPT
+        " or the adaptive hash index"
+#endif /* BTR_CUR_HASH_ADAPT */
+        "! Check that your transactions do not set too many row locks."
+        " innodb_buffer_pool_size="
+        << (buf_pool.curr_size >> (20U - srv_page_size_shift))
+        << "M. Starting the InnoDB Monitor to print diagnostics.";
+      buf_lru_switched_on_innodb_mon= true;
+      srv_print_innodb_monitor= TRUE;
+      srv_monitor_timer_schedule_now();
+    }
+  }
+  else if (buf_lru_switched_on_innodb_mon)
+  {
+    /* Switch off the InnoDB Monitor; this is a simple way to stop the
+    monitor if the situation becomes less urgent, but may also
+    surprise users who did SET GLOBAL innodb_status_output=ON earlier! */
+    buf_lru_switched_on_innodb_mon= false;
+    srv_print_innodb_monitor= FALSE;
+  }
+}
+
+/** Get a block from the buf_pool.free list.
+If the list is empty, blocks will be moved from the end of buf_pool.LRU
+to buf_pool.free.
+
+This function is called from a user thread when it needs a clean
+block to read in a page. Note that we only ever get a block from
+the free list. Even when we flush a page or find a page in the LRU
+scan, we put it on the free list to be used.
+* iteration 0:
+  * get a block from the buf_pool.free list, success:done
+  * if buf_pool.try_LRU_scan is set
+    * scan LRU up to 100 pages to free a clean block
+    * success:retry the free list
+  * flush up to innodb_lru_flush_size LRU blocks to data files
+    (until UT_LIST_GET_LEN(buf_pool.free) < innodb_lru_scan_depth)
+    * on buf_page_write_complete() the blocks will be put on the
+      buf_pool.free list
+    * success: retry the free list
+* subsequent iterations: same as iteration 0 except:
+  * scan whole LRU list
+  * scan LRU list even if buf_pool.try_LRU_scan is not set
+
+@param have_mutex whether buf_pool.mutex is already being held
+@return the free control block, in state BUF_BLOCK_MEMORY */
+buf_block_t *buf_LRU_get_free_block(bool have_mutex)
+{
+  ulint n_iterations = 0;
+  ulint flush_failures = 0;
+  MONITOR_INC(MONITOR_LRU_GET_FREE_SEARCH);
+  if (have_mutex) {
+    mysql_mutex_assert_owner(&buf_pool.mutex);
+    goto got_mutex;
+  }
+  DBUG_EXECUTE_IF("recv_ran_out_of_buffer",
+                  if (recv_recovery_is_on()
+                      && recv_sys.apply_log_recs) {
+                    mysql_mutex_lock(&buf_pool.mutex);
+                    goto flush_lru;
+                  });
+get_mutex:
+  mysql_mutex_lock(&buf_pool.mutex);
+got_mutex:
+  buf_LRU_check_size_of_non_data_objects();
+  buf_block_t* block;
+
+  DBUG_EXECUTE_IF("ib_lru_force_no_free_page",
+                  if (!buf_lru_free_blocks_error_printed) {
+                    n_iterations = 21;
+                    goto not_found;});
+
+retry:
+  /* If there is a block in the free list, take it */
+  if ((block = buf_LRU_get_free_only()) != nullptr) {
+got_block:
+    if (!have_mutex) {
+      mysql_mutex_unlock(&buf_pool.mutex);
+    }
+    block->page.zip.clear();
+    return block;
+  }
+
+  MONITOR_INC( MONITOR_LRU_GET_FREE_LOOPS );
+  if (n_iterations || buf_pool.try_LRU_scan) {
+    /* If no block was in the free list, search from the
+    end of the LRU list and try to free a block there.
+    If we are doing this for the first time, we'll scan only
+    the tail of the LRU list; otherwise we scan the whole
+    LRU list. */
+    if (buf_LRU_scan_and_free_block(n_iterations
+                                    ? ULINT_UNDEFINED : 100)) {
+      goto retry;
+    }
+
+    /* Tell other threads that there is no point
+    in scanning the LRU list. */
+    buf_pool.try_LRU_scan = false;
+  }
+
+  for (;;) {
+    if ((block = buf_LRU_get_free_only()) != nullptr) {
+      goto got_block;
+    }
+    mysql_mutex_unlock(&buf_pool.mutex);
+    mysql_mutex_lock(&buf_pool.flush_list_mutex);
+    const auto n_flush = buf_pool.n_flush();
+    if (!buf_pool.try_LRU_scan) {
+      buf_pool.page_cleaner_wakeup(true);
+    }
+    mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+    mysql_mutex_lock(&buf_pool.mutex);
+    if (!n_flush) {
+      goto not_found;
+    }
+    if (!buf_pool.try_LRU_scan) {
+      my_cond_wait(&buf_pool.done_free,
+                   &buf_pool.mutex.m_mutex);
+    }
+  }
+
+not_found:
+  if (n_iterations > 1) {
+    MONITOR_INC( MONITOR_LRU_GET_FREE_WAITS );
+  }
+
+  if (n_iterations == 21 && !buf_lru_free_blocks_error_printed
+      && srv_buf_pool_old_size == srv_buf_pool_size) {
+    buf_lru_free_blocks_error_printed = true;
+    mysql_mutex_unlock(&buf_pool.mutex);
+    ib::warn() << "Difficult to find free blocks in the buffer pool"
+      " (" << n_iterations << " search iterations)! "
+      << flush_failures << " failed attempts to"
+      " flush a page!"
+      " Consider increasing innodb_buffer_pool_size."
+ " Pending flushes (fsync): " + << fil_n_pending_tablespace_flushes + << ". " << os_n_file_reads << " OS file reads, " + << os_n_file_writes << " OS file writes, " + << os_n_fsyncs + << " OS fsyncs."; + mysql_mutex_lock(&buf_pool.mutex); + } + + /* No free block was found: try to flush the LRU list. + The freed blocks will be up for grabs for all threads. + + TODO: A more elegant way would have been to return one freed + up block to the caller here but the code that deals with + removing the block from buf_pool.page_hash and buf_pool.LRU is fairly + involved (particularly in case of ROW_FORMAT=COMPRESSED pages). We + can do that in a separate patch sometime in future. */ +#ifndef DBUG_OFF +flush_lru: +#endif + if (!buf_flush_LRU(innodb_lru_flush_size, true)) { + MONITOR_INC(MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT); + ++flush_failures; + } + + n_iterations++; + buf_pool.stat.LRU_waits++; + mysql_mutex_unlock(&buf_pool.mutex); + buf_dblwr.flush_buffered_writes(); + goto get_mutex; +} + +/** Move the LRU_old pointer so that the length of the old blocks list +is inside the allowed limits. */ +static void buf_LRU_old_adjust_len() +{ + ulint old_len; + ulint new_len; + + ut_a(buf_pool.LRU_old); + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_ad(buf_pool.LRU_old_ratio >= BUF_LRU_OLD_RATIO_MIN); + ut_ad(buf_pool.LRU_old_ratio <= BUF_LRU_OLD_RATIO_MAX); + compile_time_assert(BUF_LRU_OLD_RATIO_MIN * BUF_LRU_OLD_MIN_LEN + > BUF_LRU_OLD_RATIO_DIV + * (BUF_LRU_OLD_TOLERANCE + 5)); + compile_time_assert(BUF_LRU_NON_OLD_MIN_LEN < BUF_LRU_OLD_MIN_LEN); + +#ifdef UNIV_LRU_DEBUG + /* buf_pool.LRU_old must be the first item in the LRU list + whose "old" flag is set. */ + ut_a(buf_pool.LRU_old->old); + ut_a(!UT_LIST_GET_PREV(LRU, buf_pool.LRU_old) + || !UT_LIST_GET_PREV(LRU, buf_pool.LRU_old)->old); + ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old) + || UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old)->old); +#endif /* UNIV_LRU_DEBUG */ + + old_len = buf_pool.LRU_old_len; + new_len = ut_min(UT_LIST_GET_LEN(buf_pool.LRU) + * buf_pool.LRU_old_ratio / BUF_LRU_OLD_RATIO_DIV, + UT_LIST_GET_LEN(buf_pool.LRU) + - (BUF_LRU_OLD_TOLERANCE + + BUF_LRU_NON_OLD_MIN_LEN)); + + for (;;) { + buf_page_t* LRU_old = buf_pool.LRU_old; + + ut_a(LRU_old); + ut_ad(LRU_old->in_LRU_list); +#ifdef UNIV_LRU_DEBUG + ut_a(LRU_old->old); +#endif /* UNIV_LRU_DEBUG */ + + /* Update the LRU_old pointer if necessary */ + + if (old_len + BUF_LRU_OLD_TOLERANCE < new_len) { + + buf_pool.LRU_old = LRU_old = UT_LIST_GET_PREV( + LRU, LRU_old); +#ifdef UNIV_LRU_DEBUG + ut_a(!LRU_old->old); +#endif /* UNIV_LRU_DEBUG */ + old_len = ++buf_pool.LRU_old_len; + LRU_old->set_old(true); + + } else if (old_len > new_len + BUF_LRU_OLD_TOLERANCE) { + + buf_pool.LRU_old = UT_LIST_GET_NEXT(LRU, LRU_old); + old_len = --buf_pool.LRU_old_len; + LRU_old->set_old(false); + } else { + return; + } + } +} + +/** Initialize the old blocks pointer in the LRU list. This function should be +called when the LRU list grows to BUF_LRU_OLD_MIN_LEN length. */ +static void buf_LRU_old_init() +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_a(UT_LIST_GET_LEN(buf_pool.LRU) == BUF_LRU_OLD_MIN_LEN); + + /* We first initialize all blocks in the LRU list as old and then use + the adjust function to move the LRU_old pointer to the right + position */ + + for (buf_page_t* bpage = UT_LIST_GET_LAST(buf_pool.LRU); + bpage != NULL; + bpage = UT_LIST_GET_PREV(LRU, bpage)) { + + ut_ad(bpage->in_LRU_list); + + /* This loop temporarily violates the + assertions of buf_page_t::set_old(). 
*/ + bpage->old = true; + } + + buf_pool.LRU_old = UT_LIST_GET_FIRST(buf_pool.LRU); + buf_pool.LRU_old_len = UT_LIST_GET_LEN(buf_pool.LRU); + + buf_LRU_old_adjust_len(); +} + +/** Remove a block from the unzip_LRU list if it belonged to the list. +@param[in] bpage control block */ +static void buf_unzip_LRU_remove_block_if_needed(buf_page_t* bpage) +{ + ut_ad(bpage->in_file()); + mysql_mutex_assert_owner(&buf_pool.mutex); + + if (bpage->belongs_to_unzip_LRU()) { + buf_block_t* block = reinterpret_cast(bpage); + + ut_ad(block->in_unzip_LRU_list); + ut_d(block->in_unzip_LRU_list = false); + + UT_LIST_REMOVE(buf_pool.unzip_LRU, block); + } +} + +/** Removes a block from the LRU list. +@param[in] bpage control block */ +static inline void buf_LRU_remove_block(buf_page_t* bpage) +{ + /* Important that we adjust the hazard pointers before removing + bpage from the LRU list. */ + buf_page_t* prev_bpage = buf_pool.LRU_remove(bpage); + + /* If the LRU_old pointer is defined and points to just this block, + move it backward one step */ + + if (bpage == buf_pool.LRU_old) { + + /* Below: the previous block is guaranteed to exist, + because the LRU_old pointer is only allowed to differ + by BUF_LRU_OLD_TOLERANCE from strict + buf_pool.LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV of the LRU + list length. */ + ut_a(prev_bpage); +#ifdef UNIV_LRU_DEBUG + ut_a(!prev_bpage->old); +#endif /* UNIV_LRU_DEBUG */ + buf_pool.LRU_old = prev_bpage; + prev_bpage->set_old(true); + + buf_pool.LRU_old_len++; + } + + buf_pool.stat.LRU_bytes -= bpage->physical_size(); + + buf_unzip_LRU_remove_block_if_needed(bpage); + + /* If the LRU list is so short that LRU_old is not defined, + clear the "old" flags and return */ + if (UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN) { + + for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool.LRU); + bpage != NULL; + bpage = UT_LIST_GET_NEXT(LRU, bpage)) { + + /* This loop temporarily violates the + assertions of buf_page_t::set_old(). */ + bpage->old = false; + } + + buf_pool.LRU_old = NULL; + buf_pool.LRU_old_len = 0; + + return; + } + + ut_ad(buf_pool.LRU_old); + + /* Update the LRU_old_len field if necessary */ + if (bpage->old) { + buf_pool.LRU_old_len--; + } + + /* Adjust the length of the old block list if necessary */ + buf_LRU_old_adjust_len(); +} + +/******************************************************************//** +Adds a block to the LRU list of decompressed zip pages. */ +void +buf_unzip_LRU_add_block( +/*====================*/ + buf_block_t* block, /*!< in: control block */ + ibool old) /*!< in: TRUE if should be put to the end + of the list, else put to the start */ +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_a(block->page.belongs_to_unzip_LRU()); + ut_ad(!block->in_unzip_LRU_list); + ut_d(block->in_unzip_LRU_list = true); + + if (old) { + UT_LIST_ADD_LAST(buf_pool.unzip_LRU, block); + } else { + UT_LIST_ADD_FIRST(buf_pool.unzip_LRU, block); + } +} + +/******************************************************************//** +Adds a block to the LRU list. 
Please make sure that the page_size is +already set when invoking the function, so that we can get correct +page_size from the buffer page when adding a block into LRU */ +void +buf_LRU_add_block( + buf_page_t* bpage, /*!< in: control block */ + bool old) /*!< in: true if should be put to the old blocks + in the LRU list, else put to the start; if the + LRU list is very short, the block is added to + the start, regardless of this parameter */ +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + ut_ad(!bpage->in_LRU_list); + + if (!old || (UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN)) { + + UT_LIST_ADD_FIRST(buf_pool.LRU, bpage); + + bpage->freed_page_clock = buf_pool.freed_page_clock + & ((1U << 31) - 1); + } else { +#ifdef UNIV_LRU_DEBUG + /* buf_pool.LRU_old must be the first item in the LRU list + whose "old" flag is set. */ + ut_a(buf_pool.LRU_old->old); + ut_a(!UT_LIST_GET_PREV(LRU, buf_pool.LRU_old) + || !UT_LIST_GET_PREV(LRU, buf_pool.LRU_old)->old); + ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old) + || UT_LIST_GET_NEXT(LRU, buf_pool.LRU_old)->old); +#endif /* UNIV_LRU_DEBUG */ + UT_LIST_INSERT_AFTER(buf_pool.LRU, buf_pool.LRU_old, + bpage); + + buf_pool.LRU_old_len++; + } + + ut_d(bpage->in_LRU_list = TRUE); + + incr_LRU_size_in_bytes(bpage); + + if (UT_LIST_GET_LEN(buf_pool.LRU) > BUF_LRU_OLD_MIN_LEN) { + + ut_ad(buf_pool.LRU_old); + + /* Adjust the length of the old block list if necessary */ + + bpage->set_old(old); + buf_LRU_old_adjust_len(); + + } else if (UT_LIST_GET_LEN(buf_pool.LRU) == BUF_LRU_OLD_MIN_LEN) { + + /* The LRU list is now long enough for LRU_old to become + defined: init it */ + + buf_LRU_old_init(); + } else { + bpage->set_old(buf_pool.LRU_old != NULL); + } + + /* If this is a zipped block with decompressed frame as well + then put it on the unzip_LRU list */ + if (bpage->belongs_to_unzip_LRU()) { + buf_unzip_LRU_add_block((buf_block_t*) bpage, old); + } +} + +/** Move a block to the start of the LRU list. */ +void buf_page_make_young(buf_page_t *bpage) +{ + if (bpage->is_read_fixed()) + return; + + ut_ad(bpage->in_file()); + + mysql_mutex_lock(&buf_pool.mutex); + + if (UNIV_UNLIKELY(bpage->old)) + buf_pool.stat.n_pages_made_young++; + + buf_LRU_remove_block(bpage); + buf_LRU_add_block(bpage, false); + + mysql_mutex_unlock(&buf_pool.mutex); +} + +/** Try to free a block. If bpage is a descriptor of a compressed-only +ROW_FORMAT=COMPRESSED page, the buf_page_t object will be freed as well. +The caller must hold buf_pool.mutex. +@param bpage block to be freed +@param zip whether to remove both copies of a ROW_FORMAT=COMPRESSED page +@retval true if freed and buf_pool.mutex may have been temporarily released +@retval false if the page was not freed */ +bool buf_LRU_free_page(buf_page_t *bpage, bool zip) +{ + const page_id_t id{bpage->id()}; + buf_page_t* b = nullptr; + + mysql_mutex_assert_owner(&buf_pool.mutex); + + /* First, perform a quick check before we acquire hash_lock. */ + if (!bpage->can_relocate()) { + return false; + } + + /* We must hold an exclusive hash_lock to prevent + bpage->can_relocate() from changing due to a concurrent + execution of buf_page_get_low(). */ + buf_pool_t::hash_chain& chain= buf_pool.page_hash.cell_get(id.fold()); + page_hash_latch& hash_lock = buf_pool.page_hash.lock_get(chain); + /* We cannot use transactional_lock_guard here, + because buf_buddy_relocate() in buf_buddy_free() could get stuck. 
+  */
+  hash_lock.lock();
+  const lsn_t oldest_modification = bpage->oldest_modification_acquire();
+
+  if (UNIV_UNLIKELY(!bpage->can_relocate())) {
+    /* Do not free buffer fixed and I/O-fixed blocks. */
+    goto func_exit;
+  }
+
+  switch (oldest_modification) {
+  case 2:
+    ut_ad(id.space() == SRV_TMP_SPACE_ID);
+    ut_ad(!bpage->zip.data);
+    if (!bpage->is_freed()) {
+      goto func_exit;
+    }
+    bpage->clear_oldest_modification();
+    break;
+  case 1:
+    mysql_mutex_lock(&buf_pool.flush_list_mutex);
+    if (const lsn_t om = bpage->oldest_modification()) {
+      ut_ad(om == 1);
+      buf_pool.delete_from_flush_list(bpage);
+    }
+    mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+    ut_ad(!bpage->oldest_modification());
+    /* fall through */
+  case 0:
+    if (zip || !bpage->zip.data || !bpage->frame) {
+      break;
+    }
+relocate_compressed:
+    b = static_cast<buf_page_t*>(ut_zalloc_nokey(sizeof *b));
+    ut_a(b);
+    mysql_mutex_lock(&buf_pool.flush_list_mutex);
+    new (b) buf_page_t(*bpage);
+    b->frame = nullptr;
+    {
+      ut_d(uint32_t s=) b->fix();
+      ut_ad(s == buf_page_t::FREED
+            || s == buf_page_t::UNFIXED
+            || s == buf_page_t::IBUF_EXIST
+            || s == buf_page_t::REINIT);
+    }
+    break;
+  default:
+    if (zip || !bpage->zip.data || !bpage->frame) {
+      /* This would completely free the block. */
+      /* Do not completely free dirty blocks. */
+func_exit:
+      hash_lock.unlock();
+      return(false);
+    }
+    goto relocate_compressed;
+  }
+
+  mysql_mutex_assert_owner(&buf_pool.mutex);
+
+  DBUG_PRINT("ib_buf", ("free page %u:%u", id.space(), id.page_no()));
+
+  ut_ad(bpage->can_relocate());
+
+  if (!buf_LRU_block_remove_hashed(bpage, id, chain, zip)) {
+    ut_ad(!b);
+    mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex);
+    return(true);
+  }
+
+  /* We have just freed a BUF_BLOCK_FILE_PAGE. If b != nullptr
+  then it was a compressed page with an uncompressed frame and
+  we are interested in freeing only the uncompressed frame.
+  Therefore we have to reinsert the compressed page descriptor
+  into the LRU and page_hash (and possibly flush_list).
+  if !b then it was a regular page that has been freed */
+
+  if (UNIV_LIKELY_NULL(b)) {
+    buf_page_t* prev_b = UT_LIST_GET_PREV(LRU, b);
+
+    ut_ad(!buf_pool.page_hash.get(id, chain));
+    ut_ad(b->zip_size());
+
+    /* The field in_LRU_list of
+    the to-be-freed block descriptor should have
+    been cleared in
+    buf_LRU_block_remove_hashed(), which
+    invokes buf_LRU_remove_block(). */
+    ut_ad(!bpage->in_LRU_list);
+    ut_ad(bpage->frame);
+    ut_ad(!((buf_block_t*) bpage)->in_unzip_LRU_list);
+
+    /* The fields of bpage were copied to b before
+    buf_LRU_block_remove_hashed() was invoked. */
+    ut_ad(!b->in_zip_hash);
+    ut_ad(b->in_LRU_list);
+    ut_ad(b->in_page_hash);
+    ut_d(b->in_page_hash = false);
+    b->hash = nullptr;
+
+    buf_pool.page_hash.append(chain, b);
+
+    /* Insert b where bpage was in the LRU list.
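+    (The exact position is preserved so that the "old" flag and the
+    buf_pool.LRU_old bookkeeping remain consistent; see
+    buf_LRU_old_adjust_len().)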
+    */
+    if (prev_b) {
+      ulint lru_len;
+
+      ut_ad(prev_b->in_LRU_list);
+      ut_ad(prev_b->in_file());
+
+      UT_LIST_INSERT_AFTER(buf_pool.LRU, prev_b, b);
+
+      incr_LRU_size_in_bytes(b);
+
+      if (b->is_old()) {
+        buf_pool.LRU_old_len++;
+        if (buf_pool.LRU_old
+            == UT_LIST_GET_NEXT(LRU, b)) {
+
+          buf_pool.LRU_old = b;
+        }
+      }
+
+      lru_len = UT_LIST_GET_LEN(buf_pool.LRU);
+
+      if (lru_len > BUF_LRU_OLD_MIN_LEN) {
+        ut_ad(buf_pool.LRU_old);
+        /* Adjust the length of the
+        old block list if necessary */
+        buf_LRU_old_adjust_len();
+      } else if (lru_len == BUF_LRU_OLD_MIN_LEN) {
+        /* The LRU list is now long
+        enough for LRU_old to become
+        defined: init it */
+        buf_LRU_old_init();
+      }
+#ifdef UNIV_LRU_DEBUG
+      /* Check that the "old" flag is consistent
+      in the block and its neighbours. */
+      b->set_old(b->is_old());
+#endif /* UNIV_LRU_DEBUG */
+    } else {
+      ut_d(b->in_LRU_list = FALSE);
+      buf_LRU_add_block(b, b->old);
+    }
+
+    buf_flush_relocate_on_flush_list(bpage, b);
+    mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+    bpage->zip.data = nullptr;
+
+    page_zip_set_size(&bpage->zip, 0);
+
+    b->lock.x_lock();
+    hash_lock.unlock();
+  } else if (!zip) {
+    hash_lock.unlock();
+  }
+
+  buf_block_t* block = reinterpret_cast<buf_block_t*>(bpage);
+
+#ifdef BTR_CUR_HASH_ADAPT
+  if (block->index) {
+    mysql_mutex_unlock(&buf_pool.mutex);
+
+    /* Remove the adaptive hash index on the page.
+    The page was declared uninitialized by
+    buf_LRU_block_remove_hashed(). We need to flag
+    the contents of the page valid (which it still is) in
+    order to avoid bogus Valgrind or MSAN warnings.*/
+
+    MEM_MAKE_DEFINED(block->page.frame, srv_page_size);
+    btr_search_drop_page_hash_index(block, false);
+    MEM_UNDEFINED(block->page.frame, srv_page_size);
+    mysql_mutex_lock(&buf_pool.mutex);
+  }
+#endif
+  if (UNIV_LIKELY_NULL(b)) {
+    ut_ad(b->zip_size());
+    b->lock.x_unlock();
+    b->unfix();
+  }
+
+  buf_LRU_block_free_hashed_page(block);
+
+  return(true);
+}
+
+/******************************************************************//**
+Puts a block back to the free list. */
+void
+buf_LRU_block_free_non_file_page(
+/*=============================*/
+  buf_block_t* block) /*!< in: block, must not contain a file page */
+{
+  void* data;
+
+  ut_ad(block->page.state() == buf_page_t::MEMORY);
+  assert_block_ahi_empty(block);
+  ut_ad(!block->page.in_free_list);
+  ut_ad(!block->page.oldest_modification());
+  ut_ad(!block->page.in_LRU_list);
+  ut_ad(!block->page.hash);
+
+  block->page.set_state(buf_page_t::NOT_USED);
+
+  MEM_UNDEFINED(block->page.frame, srv_page_size);
+  data = block->page.zip.data;
+
+  if (data != NULL) {
+    block->page.zip.data = NULL;
+    buf_pool_mutex_exit_forbid();
+
+    ut_ad(block->zip_size());
+
+    buf_buddy_free(data, block->zip_size());
+
+    buf_pool_mutex_exit_allow();
+    page_zip_set_size(&block->page.zip, 0);
+  }
+
+  if (buf_pool.is_shrinking()
+      && UT_LIST_GET_LEN(buf_pool.withdraw) < buf_pool.withdraw_target
+      && buf_pool.will_be_withdrawn(block->page)) {
+    /* This should be withdrawn */
+    UT_LIST_ADD_LAST(
+      buf_pool.withdraw,
+      &block->page);
+    ut_d(block->in_withdraw_list = true);
+  } else {
+    UT_LIST_ADD_FIRST(buf_pool.free, &block->page);
+    ut_d(block->page.in_free_list = true);
+    buf_pool.try_LRU_scan= true;
+    pthread_cond_broadcast(&buf_pool.done_free);
+  }
+
+  block->page.set_os_unused();
+}
+
+/** Release a memory block to the buffer pool.
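+
+A hedged usage sketch (hypothetical caller): a block obtained from
+buf_LRU_get_free_block() that turns out not to be needed can be
+returned this way.
+@code
+  buf_block_t *block= buf_LRU_get_free_block(false);
+  // ... the caller decides not to use the block after all ...
+  buf_pool.free_block(block);
+@endcode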
*/ +ATTRIBUTE_COLD void buf_pool_t::free_block(buf_block_t *block) +{ + ut_ad(this == &buf_pool); + mysql_mutex_lock(&mutex); + buf_LRU_block_free_non_file_page(block); + mysql_mutex_unlock(&mutex); +} + + +/** Remove bpage from buf_pool.LRU and buf_pool.page_hash. + +If !bpage->frame && !bpage->oldest_modification(), the object will be freed. + +@param bpage buffer block +@param id page identifier +@param chain locked buf_pool.page_hash chain (will be released here) +@param zip whether bpage->zip of BUF_BLOCK_FILE_PAGE should be freed + +If a compressed page is freed other compressed pages may be relocated. +@retval true if BUF_BLOCK_FILE_PAGE was removed from page_hash. The +caller needs to free the page to the free list +@retval false if BUF_BLOCK_ZIP_PAGE was removed from page_hash. In +this case the block is already returned to the buddy allocator. */ +static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id, + buf_pool_t::hash_chain &chain, + bool zip) +{ + ut_a(bpage->can_relocate()); + ut_ad(buf_pool.page_hash.lock_get(chain).is_write_locked()); + + buf_LRU_remove_block(bpage); + + buf_pool.freed_page_clock += 1; + + if (UNIV_LIKELY(!bpage->zip.data)) { + MEM_CHECK_ADDRESSABLE(bpage, sizeof(buf_block_t)); + MEM_CHECK_ADDRESSABLE(bpage->frame, srv_page_size); + buf_block_modify_clock_inc((buf_block_t*) bpage); + } else if (const page_t *page = bpage->frame) { + MEM_CHECK_ADDRESSABLE(bpage, sizeof(buf_block_t)); + MEM_CHECK_ADDRESSABLE(bpage->frame, srv_page_size); + buf_block_modify_clock_inc((buf_block_t*) bpage); + + ut_a(!zip || !bpage->oldest_modification()); + ut_ad(bpage->zip_size()); + /* Skip consistency checks if the page was freed. + In recovery, we could get a sole FREE_PAGE record + and nothing else, for a ROW_FORMAT=COMPRESSED page. + Its contents would be garbage. */ + if (!bpage->is_freed()) + switch (fil_page_get_type(page)) { + case FIL_PAGE_TYPE_ALLOCATED: + case FIL_PAGE_INODE: + case FIL_PAGE_IBUF_BITMAP: + case FIL_PAGE_TYPE_FSP_HDR: + case FIL_PAGE_TYPE_XDES: + /* These are essentially uncompressed pages. */ + if (!zip) { + /* InnoDB writes the data to the + uncompressed page frame. Copy it + to the compressed page, which will + be preserved. 
+          */
+          memcpy(bpage->zip.data, page,
+                 bpage->zip_size());
+        }
+        break;
+      case FIL_PAGE_TYPE_ZBLOB:
+      case FIL_PAGE_TYPE_ZBLOB2:
+      case FIL_PAGE_INDEX:
+      case FIL_PAGE_RTREE:
+        break;
+      default:
+        ib::error() << "The compressed page to be"
+          " evicted seems corrupt:";
+        ut_print_buf(stderr, page, srv_page_size);
+
+        ib::error() << "Possibly older version of"
+          " the page:";
+
+        ut_print_buf(stderr, bpage->zip.data,
+                     bpage->zip_size());
+        putc('\n', stderr);
+        ut_error;
+      }
+  } else {
+    ut_a(!bpage->oldest_modification());
+    MEM_CHECK_ADDRESSABLE(bpage->zip.data, bpage->zip_size());
+  }
+
+  ut_ad(!bpage->in_zip_hash);
+  buf_pool.page_hash.remove(chain, bpage);
+  page_hash_latch& hash_lock = buf_pool.page_hash.lock_get(chain);
+
+  if (UNIV_UNLIKELY(!bpage->frame)) {
+    ut_ad(!bpage->in_free_list);
+    ut_ad(!bpage->in_LRU_list);
+    ut_a(bpage->zip.data);
+    ut_a(bpage->zip.ssize);
+    ut_ad(!bpage->oldest_modification());
+
+    hash_lock.unlock();
+    buf_pool_mutex_exit_forbid();
+
+    buf_buddy_free(bpage->zip.data, bpage->zip_size());
+
+    buf_pool_mutex_exit_allow();
+    bpage->lock.free();
+    ut_free(bpage);
+    return false;
+  } else {
+    static_assert(FIL_NULL == 0xffffffffU, "fill pattern");
+    static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment");
+    memset_aligned<4>(bpage->frame + FIL_PAGE_OFFSET, 0xff, 4);
+    static_assert(FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID % 4 == 2,
+                  "not perfect alignment");
+    memset_aligned<2>(bpage->frame
+                      + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xff, 4);
+    MEM_UNDEFINED(bpage->frame, srv_page_size);
+    bpage->set_state(buf_page_t::REMOVE_HASH);
+
+    if (!zip) {
+      return true;
+    }
+
+    hash_lock.unlock();
+
+    if (bpage->zip.data) {
+      /* Free the compressed page. */
+      void* data = bpage->zip.data;
+      bpage->zip.data = NULL;
+
+      ut_ad(!bpage->in_free_list);
+      ut_ad(!bpage->oldest_modification());
+      ut_ad(!bpage->in_LRU_list);
+      buf_pool_mutex_exit_forbid();
+
+      buf_buddy_free(data, bpage->zip_size());
+
+      buf_pool_mutex_exit_allow();
+
+      page_zip_set_size(&bpage->zip, 0);
+    }
+
+    return true;
+  }
+}
+
+/** Release and evict a corrupted page.
+@param bpage x-latched page that was found corrupted
+@param state expected current state of the page */
+ATTRIBUTE_COLD
+void buf_pool_t::corrupted_evict(buf_page_t *bpage, uint32_t state)
+{
+  const page_id_t id{bpage->id()};
+  buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(id.fold());
+  page_hash_latch &hash_lock= buf_pool.page_hash.lock_get(chain);
+
+  recv_sys.free_corrupted_page(id);
+  mysql_mutex_lock(&mutex);
+  hash_lock.lock();
+
+  ut_ad(!bpage->oldest_modification());
+  bpage->set_corrupt_id();
+  auto unfix= state - buf_page_t::FREED;
+  auto s= bpage->zip.fix.fetch_sub(unfix) - unfix;
+  bpage->lock.x_unlock(true);
+
+  while (s != buf_page_t::FREED || bpage->lock.is_locked_or_waiting())
+  {
+    ut_ad(s >= buf_page_t::FREED);
+    ut_ad(s < buf_page_t::UNFIXED);
+    /* Wait for other threads to release the fix count
+    before releasing the bpage from LRU list. */
+    (void) LF_BACKOFF();
+    s= bpage->state();
+  }
+
+  /* remove from LRU and page_hash */
+  if (buf_LRU_block_remove_hashed(bpage, id, chain, true))
+    buf_LRU_block_free_hashed_page(reinterpret_cast<buf_block_t*>(bpage));
+
+  mysql_mutex_unlock(&mutex);
+}
+
+/** Update buf_pool.LRU_old_ratio.
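+
+For example (hedged arithmetic): old_pct=37 yields
+ratio = 37 * BUF_LRU_OLD_RATIO_DIV / 100; the function stores the
+clamped ratio and returns the rounded inverse mapping, so the caller
+sees the percentage that is actually in effect.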
+@param[in] old_pct Reserve this percentage of + the buffer pool for "old" blocks +@param[in] adjust true=adjust the LRU list; + false=just assign buf_pool.LRU_old_ratio + during the initialization of InnoDB +@return updated old_pct */ +uint buf_LRU_old_ratio_update(uint old_pct, bool adjust) +{ + uint ratio = old_pct * BUF_LRU_OLD_RATIO_DIV / 100; + if (ratio < BUF_LRU_OLD_RATIO_MIN) { + ratio = BUF_LRU_OLD_RATIO_MIN; + } else if (ratio > BUF_LRU_OLD_RATIO_MAX) { + ratio = BUF_LRU_OLD_RATIO_MAX; + } + + if (adjust) { + mysql_mutex_lock(&buf_pool.mutex); + + if (ratio != buf_pool.LRU_old_ratio) { + buf_pool.LRU_old_ratio = ratio; + + if (UT_LIST_GET_LEN(buf_pool.LRU) + >= BUF_LRU_OLD_MIN_LEN) { + buf_LRU_old_adjust_len(); + } + } + + mysql_mutex_unlock(&buf_pool.mutex); + } else { + buf_pool.LRU_old_ratio = ratio; + } + /* the reverse of + ratio = old_pct * BUF_LRU_OLD_RATIO_DIV / 100 */ + return((uint) (ratio * 100 / (double) BUF_LRU_OLD_RATIO_DIV + 0.5)); +} + +/********************************************************************//** +Update the historical stats that we are collecting for LRU eviction +policy at the end of each interval. */ +void +buf_LRU_stat_update() +{ + buf_LRU_stat_t* item; + buf_LRU_stat_t cur_stat; + + if (!buf_pool.freed_page_clock) { + goto func_exit; + } + + /* Update the index. */ + item = &buf_LRU_stat_arr[buf_LRU_stat_arr_ind]; + buf_LRU_stat_arr_ind++; + buf_LRU_stat_arr_ind %= BUF_LRU_STAT_N_INTERVAL; + + /* Add the current value and subtract the obsolete entry. + Since buf_LRU_stat_cur is not protected by any mutex, + it can be changing between adding to buf_LRU_stat_sum + and copying to item. Assign it to local variables to make + sure the same value assign to the buf_LRU_stat_sum + and item */ + cur_stat = buf_LRU_stat_cur; + + buf_LRU_stat_sum.io += cur_stat.io - item->io; + buf_LRU_stat_sum.unzip += cur_stat.unzip - item->unzip; + + /* Put current entry in the array. */ + memcpy(item, &cur_stat, sizeof *item); + +func_exit: + /* Clear the current entry. */ + memset(&buf_LRU_stat_cur, 0, sizeof buf_LRU_stat_cur); +} + +#if defined __aarch64__&&defined __GNUC__&&__GNUC__==4&&!defined __clang__ +/* Avoid GCC 4.8.5 internal compiler error "could not split insn". +We would only need this for buf_LRU_scan_and_free_block(), +but GCC 4.8.5 does not support pop_options. */ +# pragma GCC optimize ("O0") +#endif +/** Try to free a replaceable block. +@param limit maximum number of blocks to scan +@return true if found and freed */ +bool buf_LRU_scan_and_free_block(ulint limit) +{ + mysql_mutex_assert_owner(&buf_pool.mutex); + + return buf_LRU_free_from_unzip_LRU_list(limit) || + buf_LRU_free_from_common_LRU_list(limit); +} + +#ifdef UNIV_DEBUG +/** Validate the LRU list. 
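
The interval bookkeeping in buf_LRU_stat_update() above is a ring buffer that
maintains a moving sum in O(1) per interval: add the incoming sample, subtract
the slot it overwrites. A minimal self-contained sketch of that arithmetic
(the window size and stat fields are illustrative, not the InnoDB ones):

    #include <array>
    #include <cstddef>

    struct lru_stat { unsigned io = 0, unzip = 0; };

    template <std::size_t N>
    struct lru_stat_window
    {
      std::array<lru_stat, N> arr{};   // one slot per past interval
      lru_stat sum{};                  // sum over the last N slots
      std::size_t ind = 0;

      void update(lru_stat cur)
      {
        lru_stat& item = arr[ind];
        ind = (ind + 1) % N;
        sum.io += cur.io - item.io;          // modular arithmetic keeps
        sum.unzip += cur.unzip - item.unzip; // the running sum exact
        item = cur;                          // overwrite the obsolete slot
      }
    };
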
*/ +void buf_LRU_validate() +{ + ulint old_len; + ulint new_len; + + mysql_mutex_lock(&buf_pool.mutex); + + if (UT_LIST_GET_LEN(buf_pool.LRU) >= BUF_LRU_OLD_MIN_LEN) { + + ut_a(buf_pool.LRU_old); + old_len = buf_pool.LRU_old_len; + + new_len = ut_min(UT_LIST_GET_LEN(buf_pool.LRU) + * buf_pool.LRU_old_ratio + / BUF_LRU_OLD_RATIO_DIV, + UT_LIST_GET_LEN(buf_pool.LRU) + - (BUF_LRU_OLD_TOLERANCE + + BUF_LRU_NON_OLD_MIN_LEN)); + + ut_a(old_len >= new_len - BUF_LRU_OLD_TOLERANCE); + ut_a(old_len <= new_len + BUF_LRU_OLD_TOLERANCE); + } + + CheckInLRUList::validate(); + + old_len = 0; + + for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool.LRU); + bpage != NULL; + bpage = UT_LIST_GET_NEXT(LRU, bpage)) { + ut_ad(bpage->in_file()); + ut_ad(!bpage->frame + || reinterpret_cast(bpage) + ->in_unzip_LRU_list + == bpage->belongs_to_unzip_LRU()); + + if (bpage->is_old()) { + const buf_page_t* prev + = UT_LIST_GET_PREV(LRU, bpage); + const buf_page_t* next + = UT_LIST_GET_NEXT(LRU, bpage); + + if (!old_len++) { + ut_a(buf_pool.LRU_old == bpage); + } else { + ut_a(!prev || prev->is_old()); + } + + ut_a(!next || next->is_old()); + } + } + + ut_a(buf_pool.LRU_old_len == old_len); + + CheckInFreeList::validate(); + + for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool.free); + bpage != NULL; + bpage = UT_LIST_GET_NEXT(list, bpage)) { + + ut_a(bpage->state() == buf_page_t::NOT_USED); + } + + CheckUnzipLRUAndLRUList::validate(); + + for (buf_block_t* block = UT_LIST_GET_FIRST(buf_pool.unzip_LRU); + block != NULL; + block = UT_LIST_GET_NEXT(unzip_LRU, block)) { + + ut_ad(block->in_unzip_LRU_list); + ut_ad(block->page.in_LRU_list); + ut_a(block->page.belongs_to_unzip_LRU()); + } + + mysql_mutex_unlock(&buf_pool.mutex); +} +#endif /* UNIV_DEBUG */ + +#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG +/** Dump the LRU list to stderr. */ +void buf_LRU_print() +{ + mysql_mutex_lock(&buf_pool.mutex); + + for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool.LRU); + bpage != NULL; + bpage = UT_LIST_GET_NEXT(LRU, bpage)) { + const page_id_t id(bpage->id()); + + fprintf(stderr, "BLOCK space %u page %u ", + id.space(), id.page_no()); + + if (bpage->is_old()) { + fputs("old ", stderr); + } + + const unsigned s = bpage->state(); + if (s > buf_page_t::UNFIXED) { + fprintf(stderr, "fix %u ", s - buf_page_t::UNFIXED); + } else { + ut_ad(s == buf_page_t::UNFIXED + || s == buf_page_t::REMOVE_HASH); + } + + if (bpage->oldest_modification()) { + fputs("modif. ", stderr); + } + + if (const byte* frame = bpage->zip.data) { + fprintf(stderr, "\ntype %u size " ULINTPF + " index id " IB_ID_FMT "\n", + fil_page_get_type(frame), + bpage->zip_size(), + btr_page_get_index_id(frame)); + } else { + fprintf(stderr, "\ntype %u index id " IB_ID_FMT "\n", + fil_page_get_type(bpage->frame), + btr_page_get_index_id(bpage->frame)); + } + } + + mysql_mutex_unlock(&buf_pool.mutex); +} +#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG */ diff --git a/storage/innobase/buf/buf0rea.cc b/storage/innobase/buf/buf0rea.cc new file mode 100644 index 00000000..c4f07738 --- /dev/null +++ b/storage/innobase/buf/buf0rea.cc @@ -0,0 +1,710 @@ +/***************************************************************************** + +Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2015, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0rea.cc
+The database buffer read
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "univ.i"
+#include <mysql/service_thd_wait.h>
+
+#include "buf0rea.h"
+#include "fil0fil.h"
+#include "mtr0mtr.h"
+#include "buf0buf.h"
+#include "buf0flu.h"
+#include "buf0lru.h"
+#include "buf0buddy.h"
+#include "buf0dblwr.h"
+#include "ibuf0ibuf.h"
+#include "log0recv.h"
+#include "trx0sys.h"
+#include "os0file.h"
+#include "srv0start.h"
+#include "srv0srv.h"
+#include "log.h"
+#include "mariadb_stats.h"
+
+/** If there are buf_pool.curr_size per the number below pending reads, then
+read-ahead is not done: this is to prevent flooding the buffer pool with
+i/o-fixed buffer blocks */
+#define BUF_READ_AHEAD_PEND_LIMIT 2
+
+/** Remove the sentinel block for the watch before replacing it with a
+real block. watch_unset() or watch_occurred() will notice
+that the block has been replaced with the real block.
+@param w      sentinel
+@param chain  locked hash table chain
+@return w->state() */
+inline uint32_t buf_pool_t::watch_remove(buf_page_t *w,
+                                         buf_pool_t::hash_chain &chain)
+{
+  mysql_mutex_assert_owner(&buf_pool.mutex);
+  ut_ad(xtest() || page_hash.lock_get(chain).is_write_locked());
+  ut_ad(w >= &watch[0]);
+  ut_ad(w < &watch[array_elements(watch)]);
+  ut_ad(!w->in_zip_hash);
+  ut_ad(!w->zip.data);
+
+  uint32_t s{w->state()};
+  w->set_state(buf_page_t::NOT_USED);
+  ut_ad(s >= buf_page_t::UNFIXED);
+  ut_ad(s < buf_page_t::READ_FIX);
+
+  if (~buf_page_t::LRU_MASK & s)
+    page_hash.remove(chain, w);
+
+  ut_ad(!w->in_page_hash);
+  w->id_= page_id_t(~0ULL);
+  return s;
+}
+
+/** Initialize a page for read to the buffer buf_pool. If the page is
+(1) already in buf_pool, or
+(2) if we specify to read only ibuf pages and the page is not an ibuf page, or
+(3) if the space is deleted or being deleted,
+then this function does nothing.
+Sets the io_fix flag to BUF_IO_READ and sets a non-recursive exclusive lock
+on the buffer frame. The io-handler must take care that the flag is cleared
+and the lock released later.
+@param[in] mode BUF_READ_IBUF_PAGES_ONLY, ...
+@param[in] page_id  page id
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in] unzip    whether the uncompressed page is
+                    requested (for ROW_FORMAT=COMPRESSED)
+@return pointer to the block
+@retval NULL in case of an error */
+TRANSACTIONAL_TARGET
+static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id,
+                                          ulint zip_size, bool unzip)
+{
+  mtr_t mtr;
+
+  if (mode == BUF_READ_IBUF_PAGES_ONLY)
+  {
+    /* It is a read-ahead within an ibuf routine */
+    ut_ad(!ibuf_bitmap_page(page_id, zip_size));
+    ibuf_mtr_start(&mtr);
+
+    if (!recv_no_ibuf_operations && !ibuf_page(page_id, zip_size, &mtr))
+    {
+      ibuf_mtr_commit(&mtr);
+      return nullptr;
+    }
+  }
+  else
+    ut_ad(mode == BUF_READ_ANY_PAGE);
+
+  buf_page_t *bpage= nullptr;
+  buf_block_t *block= nullptr;
+  if (!zip_size || unzip || recv_recovery_is_on())
+  {
+    block= buf_LRU_get_free_block(false);
+    block->initialise(page_id, zip_size, buf_page_t::READ_FIX);
+    /* x_unlock() will be invoked
+    in buf_page_t::read_complete() by the io-handler thread. */
+    block->page.lock.x_lock(true);
+  }
+
+  buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold());
+
+  mysql_mutex_lock(&buf_pool.mutex);
+
+  buf_page_t *hash_page= buf_pool.page_hash.get(page_id, chain);
+  if (hash_page && !buf_pool.watch_is_sentinel(*hash_page))
+  {
+    /* The page is already in the buffer pool. */
+    if (block)
+    {
+      block->page.lock.x_unlock(true);
+      ut_d(block->page.set_state(buf_page_t::MEMORY));
+      buf_LRU_block_free_non_file_page(block);
+    }
+    goto func_exit;
+  }
+
+  if (UNIV_LIKELY(block != nullptr))
+  {
+    bpage= &block->page;
+
+    /* Insert into the hash table of file pages */
+    if (hash_page)
+    {
+      transactional_lock_guard<page_hash_latch> g
+        {buf_pool.page_hash.lock_get(chain)};
+      bpage->set_state(buf_pool.watch_remove(hash_page, chain) +
+                       (buf_page_t::READ_FIX - buf_page_t::UNFIXED));
+      buf_pool.page_hash.append(chain, &block->page);
+    }
+    else
+    {
+      transactional_lock_guard<page_hash_latch> g
+        {buf_pool.page_hash.lock_get(chain)};
+      buf_pool.page_hash.append(chain, &block->page);
+    }
+
+    /* The block must be put to the LRU list, to the old blocks */
+    buf_LRU_add_block(&block->page, true/* to old blocks */);
+
+    if (UNIV_UNLIKELY(zip_size))
+    {
+      /* buf_pool.mutex may be released and reacquired by
+      buf_buddy_alloc(). We must defer this operation until after the
+      block descriptor has been added to buf_pool.LRU and
+      buf_pool.page_hash. */
+      block->page.zip.data= static_cast<page_zip_t*>
+        (buf_buddy_alloc(zip_size));
+
+      /* To maintain the invariant
+      block->in_unzip_LRU_list == block->page.belongs_to_unzip_LRU()
+      we have to add this block to unzip_LRU
+      after block->page.zip.data is set. */
+      ut_ad(block->page.belongs_to_unzip_LRU());
+      buf_unzip_LRU_add_block(block, TRUE);
+    }
+  }
+  else
+  {
+    /* The compressed page must be allocated before the
+    control block (bpage), in order to avoid the
+    invocation of buf_buddy_relocate_block() on
+    uninitialized data. */
+    bool lru= false;
+    void *data= buf_buddy_alloc(zip_size, &lru);
+
+    /* If buf_buddy_alloc() allocated storage from the LRU list,
+    it released and reacquired buf_pool.mutex. Thus, we must
+    check the page_hash again, as it may have been modified. */
+    if (UNIV_UNLIKELY(lru))
+    {
+      hash_page= buf_pool.page_hash.get(page_id, chain);
+
+      if (UNIV_UNLIKELY(hash_page && !buf_pool.watch_is_sentinel(*hash_page)))
+      {
+        /* The block was added by some other thread.
*/
+        buf_buddy_free(data, zip_size);
+        goto func_exit;
+      }
+    }
+
+    bpage= static_cast<buf_page_t*>(ut_zalloc_nokey(sizeof *bpage));
+
+    page_zip_des_init(&bpage->zip);
+    page_zip_set_size(&bpage->zip, zip_size);
+    bpage->zip.data = (page_zip_t*) data;
+
+    bpage->init(buf_page_t::READ_FIX, page_id);
+    bpage->lock.x_lock(true);
+
+    {
+      transactional_lock_guard<page_hash_latch> g
+        {buf_pool.page_hash.lock_get(chain)};
+
+      if (hash_page)
+        bpage->set_state(buf_pool.watch_remove(hash_page, chain) +
+                         (buf_page_t::READ_FIX - buf_page_t::UNFIXED));
+
+      buf_pool.page_hash.append(chain, bpage);
+    }
+
+    /* The block must be put to the LRU list, to the old blocks.
+    The zip size is already set into the page zip */
+    buf_LRU_add_block(bpage, true/* to old blocks */);
+  }
+
+  buf_pool.stat.n_pages_read++;
+func_exit:
+  mysql_mutex_unlock(&buf_pool.mutex);
+
+  if (mode == BUF_READ_IBUF_PAGES_ONLY)
+    ibuf_mtr_commit(&mtr);
+
+  ut_ad(!bpage || bpage->in_file());
+
+  return bpage;
+}
+
+/** Low-level function which reads a page asynchronously from a file to the
+buffer buf_pool if it is not already there, in which case does nothing.
+Sets the io_fix flag and sets an exclusive lock on the buffer frame. The
+flag is cleared and the x-lock released by an i/o-handler thread.
+
+@param[in,out] space    tablespace
+@param[in]     sync     true if synchronous aio is desired
+@param[in]     mode     BUF_READ_IBUF_PAGES_ONLY, ...,
+@param[in]     page_id  page id
+@param[in]     zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]     unzip    true=request uncompressed page
+@return error code
+@retval DB_SUCCESS if the page was read
+@retval DB_SUCCESS_LOCKED_REC if the page exists in the buffer pool already */
+static
+dberr_t
+buf_read_page_low(
+        fil_space_t*            space,
+        bool                    sync,
+        ulint                   mode,
+        const page_id_t         page_id,
+        ulint                   zip_size,
+        bool                    unzip)
+{
+        buf_page_t*     bpage;
+
+        if (buf_dblwr.is_inside(page_id)) {
+                space->release();
+                return DB_PAGE_CORRUPTED;
+        }
+
+        if (sync) {
+        } else if (trx_sys_hdr_page(page_id)
+                   || ibuf_bitmap_page(page_id, zip_size)
+                   || (!recv_no_ibuf_operations
+                       && ibuf_page(page_id, zip_size, nullptr))) {
+
+                /* Trx sys header is so low in the latching order that we play
+                safe and do not leave the i/o-completion to an asynchronous
+                i/o-thread. Change buffer pages must always be read with
+                synchronous i/o, to make sure they do not get involved in
+                thread deadlocks. */
+                sync = true;
+        }
+
+        /* The following call will also check if the tablespace does not exist
+        or is being dropped; if we succeed in initing the page in the buffer
+        pool for read, then DISCARD cannot proceed until the read has
+        completed */
+        bpage = buf_page_init_for_read(mode, page_id, zip_size, unzip);
+
+        if (!bpage) {
+                space->release();
+                return DB_SUCCESS_LOCKED_REC;
+        }
+
+        ut_ad(bpage->in_file());
+        ulonglong mariadb_timer= 0;
+
+        if (sync) {
+                thd_wait_begin(nullptr, THD_WAIT_DISKIO);
+                if (mariadb_stats_active())
+                        mariadb_timer= mariadb_measure();
+        }
+
+        DBUG_LOG("ib_buf",
+                 "read page " << page_id << " zip_size=" << zip_size
+                 << " unzip=" << unzip << ',' << (sync ? "sync" : "async"));
+
+        void*   dst = zip_size ? bpage->zip.data : bpage->frame;
+        const ulint len = zip_size ? zip_size : srv_page_size;
+
+        auto fio = space->io(IORequest(sync
+                                       ? 
IORequest::READ_SYNC + : IORequest::READ_ASYNC), + os_offset_t{page_id.page_no()} * len, len, + dst, bpage); + + if (UNIV_UNLIKELY(fio.err != DB_SUCCESS)) { + buf_pool.corrupted_evict(bpage, buf_page_t::READ_FIX); + } else if (sync) { + thd_wait_end(NULL); + /* The i/o was already completed in space->io() */ + fio.err = bpage->read_complete(*fio.node); + space->release(); + if (fio.err == DB_FAIL) { + fio.err = DB_PAGE_CORRUPTED; + } + if (mariadb_timer) + mariadb_increment_pages_read_time(mariadb_timer); + } + + return fio.err; +} + +/** Applies a random read-ahead in buf_pool if there are at least a threshold +value of accessed pages from the random read-ahead area. Does not read any +page, not even the one at the position (space, offset), if the read-ahead +mechanism is not activated. NOTE 1: the calling thread may own latches on +pages: to avoid deadlocks this function must be written such that it cannot +end up waiting for these latches! NOTE 2: the calling thread must want +access to the page given: this rule is set to prevent unintended read-aheads +performed by ibuf routines, a situation which could result in a deadlock if +the OS does not support asynchronous i/o. +@param[in] page_id page id of a page which the current thread +wants to access +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] ibuf whether we are inside ibuf routine +@return number of page read requests issued; NOTE that if we read ibuf +pages, it may happen that the page at the given page number does not +get read even if we return a positive value! */ +TRANSACTIONAL_TARGET +ulint +buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf) +{ + if (!srv_random_read_ahead || page_id.space() >= SRV_TMP_SPACE_ID) + /* Disable the read-ahead for temporary tablespace */ + return 0; + + if (srv_startup_is_before_trx_rollback_phase) + /* No read-ahead to avoid thread deadlocks */ + return 0; + + if (ibuf_bitmap_page(page_id, zip_size) || trx_sys_hdr_page(page_id)) + /* If it is an ibuf bitmap page or trx sys hdr, we do no + read-ahead, as that could break the ibuf page access order */ + return 0; + + if (os_aio_pending_reads_approx() > + buf_pool.curr_size / BUF_READ_AHEAD_PEND_LIMIT) + return 0; + + fil_space_t* space= fil_space_t::get(page_id.space()); + if (!space) + return 0; + + const uint32_t buf_read_ahead_area= buf_pool.read_ahead_area; + ulint count= 5 + buf_read_ahead_area / 8; + const page_id_t low= page_id - (page_id.page_no() % buf_read_ahead_area); + page_id_t high= low + buf_read_ahead_area; + high.set_page_no(std::min(high.page_no(), space->last_page_number())); + + /* Count how many blocks in the area have been recently accessed, + that is, reside near the start of the LRU list. */ + + for (page_id_t i= low; i < high; ++i) + { + buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(i.fold()); + transactional_shared_lock_guard g + {buf_pool.page_hash.lock_get(chain)}; + if (const buf_page_t *bpage= buf_pool.page_hash.get(i, chain)) + if (bpage->is_accessed() && buf_page_peek_if_young(bpage) && !--count) + goto read_ahead; + } + +no_read_ahead: + space->release(); + return 0; + +read_ahead: + if (space->is_stopping()) + goto no_read_ahead; + + /* Read all the suitable blocks within the area */ + const ulint ibuf_mode= ibuf ? 
BUF_READ_IBUF_PAGES_ONLY : BUF_READ_ANY_PAGE; + + for (page_id_t i= low; i < high; ++i) + { + if (ibuf_bitmap_page(i, zip_size)) + continue; + if (space->is_stopping()) + break; + space->reacquire(); + if (buf_read_page_low(space, false, ibuf_mode, i, zip_size, false) == + DB_SUCCESS) + count++; + } + + if (count) + { + DBUG_PRINT("ib_buf", ("random read-ahead %zu pages from %s: %u", + count, space->chain.start->name, + low.page_no())); + mysql_mutex_lock(&buf_pool.mutex); + /* Read ahead is considered one I/O operation for the purpose of + LRU policy decision. */ + buf_LRU_stat_inc_io(); + buf_pool.stat.n_ra_pages_read_rnd+= count; + mysql_mutex_unlock(&buf_pool.mutex); + } + + space->release(); + return count; +} + +/** High-level function which reads a page from a file to buf_pool +if it is not already there. Sets the io_fix and an exclusive lock +on the buffer frame. The flag is cleared and the x-lock +released by the i/o-handler thread. +@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@retval DB_SUCCESS if the page was read and is not corrupted +@retval DB_SUCCESS_LOCKED_REC if the page was not read +@retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted +@retval DB_DECRYPTION_FAILED if page post encryption checksum matches but +after decryption normal page checksum does not match. +@retval DB_TABLESPACE_DELETED if tablespace .ibd file is missing */ +dberr_t buf_read_page(const page_id_t page_id, ulint zip_size) +{ + fil_space_t *space= fil_space_t::get(page_id.space()); + if (!space) + { + ib::info() << "trying to read page " << page_id + << " in nonexisting or being-dropped tablespace"; + return DB_TABLESPACE_DELETED; + } + + buf_LRU_stat_inc_io(); /* NOT protected by buf_pool.mutex */ + return buf_read_page_low(space, true, BUF_READ_ANY_PAGE, + page_id, zip_size, false); +} + +/** High-level function which reads a page asynchronously from a file to the +buffer buf_pool if it is not already there. Sets the io_fix flag and sets +an exclusive lock on the buffer frame. The flag is cleared and the x-lock +released by the i/o-handler thread. +@param[in,out] space tablespace +@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 */ +void buf_read_page_background(fil_space_t *space, const page_id_t page_id, + ulint zip_size) +{ + buf_read_page_low(space, false, BUF_READ_ANY_PAGE, + page_id, zip_size, false); + + /* We do not increment number of I/O operations used for LRU policy + here (buf_LRU_stat_inc_io()). We use this in heuristics to decide + about evicting uncompressed version of compressed pages from the + buffer pool. Since this function is called from buffer pool load + these IOs are deliberate and are not part of normal workload we can + ignore these in our heuristics. */ +} + +/** Applies linear read-ahead if in the buf_pool the page is a border page of +a linear read-ahead area and all the pages in the area have been accessed. +Does not read any page if the read-ahead mechanism is not activated. Note +that the algorithm looks at the 'natural' adjacent successor and +predecessor of the page, which on the leaf level of a B-tree are the next +and previous page in the chain of leaves. To know these, the page specified +in (space, offset) must already be present in the buf_pool. Thus, the +natural way to use this function is to call it when a page in the buf_pool +is accessed the first time, calling this function just after it has been +bufferfixed. 
+NOTE 1: as this function looks at the natural predecessor and successor +fields on the page, what happens, if these are not initialized to any +sensible value? No problem, before applying read-ahead we check that the +area to read is within the span of the space, if not, read-ahead is not +applied. An uninitialized value may result in a useless read operation, but +only very improbably. +NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this +function must be written such that it cannot end up waiting for these +latches! +NOTE 3: the calling thread must want access to the page given: this rule is +set to prevent unintended read-aheads performed by ibuf routines, a situation +which could result in a deadlock if the OS does not support asynchronous io. +@param[in] page_id page id; see NOTE 3 above +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] ibuf whether if we are inside ibuf routine +@return number of page read requests issued */ +TRANSACTIONAL_TARGET +ulint +buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf) +{ + /* check if readahead is disabled. + Disable the read ahead logic for temporary tablespace */ + if (!srv_read_ahead_threshold || page_id.space() >= SRV_TMP_SPACE_ID) + return 0; + + if (srv_startup_is_before_trx_rollback_phase) + /* No read-ahead to avoid thread deadlocks */ + return 0; + + if (os_aio_pending_reads_approx() > + buf_pool.curr_size / BUF_READ_AHEAD_PEND_LIMIT) + return 0; + + const uint32_t buf_read_ahead_area= buf_pool.read_ahead_area; + const page_id_t low= page_id - (page_id.page_no() % buf_read_ahead_area); + const page_id_t high_1= low + (buf_read_ahead_area - 1); + + /* We will check that almost all pages in the area have been accessed + in the desired order. */ + const bool descending= page_id != low; + + if (!descending && page_id != high_1) + /* This is not a border page of the area */ + return 0; + + if (ibuf_bitmap_page(page_id, zip_size) || trx_sys_hdr_page(page_id)) + /* If it is an ibuf bitmap page or trx sys hdr, we do no + read-ahead, as that could break the ibuf page access order */ + return 0; + + fil_space_t *space= fil_space_t::get(page_id.space()); + if (!space) + return 0; + + if (high_1.page_no() > space->last_page_number()) + { + /* The area is not whole. */ +fail: + space->release(); + return 0; + } + + /* How many out of order accessed pages can we ignore + when working out the access pattern for linear readahead */ + ulint count= std::min(buf_pool_t::READ_AHEAD_PAGES - + srv_read_ahead_threshold, + uint32_t{buf_pool.read_ahead_area}); + page_id_t new_low= low, new_high_1= high_1; + unsigned prev_accessed= 0; + for (page_id_t i= low; i <= high_1; ++i) + { + buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(i.fold()); + page_hash_latch &hash_lock= buf_pool.page_hash.lock_get(chain); + /* It does not make sense to use transactional_lock_guard here, + because we would have many complex conditions inside the memory + transaction. */ + hash_lock.lock_shared(); + + const buf_page_t* bpage= buf_pool.page_hash.get(i, chain); + if (!bpage) + { + hash_lock.unlock_shared(); + if (i == page_id) + goto fail; +failed: + if (--count) + continue; + goto fail; + } + const unsigned accessed= bpage->is_accessed(); + if (i == page_id) + { + /* Read the natural predecessor and successor page addresses from + the page; NOTE that because the calling thread may have an x-latch + on the page, we do not acquire an s-latch on the page, this is to + prevent deadlocks. 
The hash_lock is only protecting the
+      buf_pool.page_hash for page i, not the bpage contents itself. */
+      const byte *f= bpage->frame ? bpage->frame : bpage->zip.data;
+      uint32_t prev= mach_read_from_4(my_assume_aligned<4>(f + FIL_PAGE_PREV));
+      uint32_t next= mach_read_from_4(my_assume_aligned<4>(f + FIL_PAGE_NEXT));
+      hash_lock.unlock_shared();
+      if (prev == FIL_NULL || next == FIL_NULL)
+        goto fail;
+      page_id_t id= page_id;
+      if (descending)
+      {
+        if (id == high_1)
+          ++id;
+        else if (next - 1 != page_id.page_no())
+          goto fail;
+        else
+          id.set_page_no(prev);
+      }
+      else
+      {
+        if (prev + 1 != page_id.page_no())
+          goto fail;
+        id.set_page_no(next);
+      }
+
+      new_low= id - (id.page_no() % buf_read_ahead_area);
+      new_high_1= new_low + (buf_read_ahead_area - 1);
+
+      if (id != new_low && id != new_high_1)
+        /* This is not a border page of the area: return */
+        goto fail;
+      if (new_high_1.page_no() > space->last_page_number())
+        /* The area is not whole */
+        goto fail;
+    }
+    else
+      hash_lock.unlock_shared();
+
+    if (!accessed)
+      goto failed;
+    /* Note that buf_page_t::is_accessed() returns the time of the
+    first access. If some blocks of the extent existed in the buffer
+    pool at the time of a linear access pattern, the first access
+    times may be nonmonotonic, even though the latest access times
+    were linear. The threshold (srv_read_ahead_factor) should help a
+    little against this. */
+    bool fail= prev_accessed &&
+      (descending ? prev_accessed > accessed : prev_accessed < accessed);
+    prev_accessed= accessed;
+    if (fail)
+      goto failed;
+  }
+
+  /* If we got this far, read-ahead can be sensible: do it */
+  count= 0;
+  for (ulint ibuf_mode= ibuf ? BUF_READ_IBUF_PAGES_ONLY : BUF_READ_ANY_PAGE;
+       new_low <= new_high_1; ++new_low)
+  {
+    if (ibuf_bitmap_page(new_low, zip_size))
+      continue;
+    if (space->is_stopping())
+      break;
+    space->reacquire();
+    if (buf_read_page_low(space, false, ibuf_mode, new_low, zip_size, false) ==
+        DB_SUCCESS)
+      count++;
+  }
+
+  if (count)
+  {
+    DBUG_PRINT("ib_buf", ("linear read-ahead %zu pages from %s: %u",
+                          count, space->chain.start->name,
+                          new_low.page_no()));
+    mysql_mutex_lock(&buf_pool.mutex);
+    /* Read ahead is considered one I/O operation for the purpose of
+    LRU policy decision. */
+    buf_LRU_stat_inc_io();
+    buf_pool.stat.n_ra_pages_read+= count;
+    mysql_mutex_unlock(&buf_pool.mutex);
+  }
+
+  space->release();
+  return count;
+}
+
+/** Schedule a page for recovery.
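
Both read-ahead functions above share the same window arithmetic: round the
page number down to a multiple of buf_pool.read_ahead_area and, for the
linear variant, trigger only on a border page of that window. A standalone
sketch (the area size of 64 is only an example value):

    #include <cstdint>

    struct window { uint32_t low, high_1; };   // inclusive bounds

    window read_ahead_window(uint32_t page_no, uint32_t area /* e.g. 64 */)
    {
      uint32_t low = page_no - (page_no % area);  // round down to the area
      return {low, low + (area - 1)};
    }

    bool is_border_page(uint32_t page_no, uint32_t area)
    {
      // Linear read-ahead considers only the first and last page of the area.
      window w = read_ahead_window(page_no, area);
      return page_no == w.low || page_no == w.high_1;
    }
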
+@param space tablespace +@param page_id page identifier +@param recs log records +@param init page initialization, or nullptr if the page needs to be read */ +void buf_read_recover(fil_space_t *space, const page_id_t page_id, + page_recv_t &recs, recv_init *init) +{ + ut_ad(space->id == page_id.space()); + space->reacquire(); + const ulint zip_size= space->zip_size(); + + if (init) + { + if (buf_page_t *bpage= buf_page_init_for_read(BUF_READ_ANY_PAGE, page_id, + zip_size, true)) + { + ut_ad(bpage->in_file()); + os_fake_read(IORequest{bpage, (buf_tmp_buffer_t*) &recs, + UT_LIST_GET_FIRST(space->chain), + IORequest::READ_ASYNC}, ptrdiff_t(init)); + } + } + else if (dberr_t err= buf_read_page_low(space, false, BUF_READ_ANY_PAGE, + page_id, zip_size, true)) + { + if (err != DB_SUCCESS_LOCKED_REC) + sql_print_error("InnoDB: Recovery failed to read page " + UINT32PF " from %s", + page_id.page_no(), space->chain.start->name); + } +} diff --git a/storage/innobase/data/data0data.cc b/storage/innobase/data/data0data.cc new file mode 100644 index 00000000..9a7eff21 --- /dev/null +++ b/storage/innobase/data/data0data.cc @@ -0,0 +1,820 @@ +/***************************************************************************** + +Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file data/data0data.cc +SQL data field and tuple + +Created 5/30/1994 Heikki Tuuri +*************************************************************************/ + +#include "data0data.h" +#include "rem0rec.h" +#include "rem0cmp.h" +#include "page0page.h" +#include "page0zip.h" +#include "dict0dict.h" +#include "btr0cur.h" +#include "row0upd.h" + +#ifdef UNIV_DEBUG +/** Dummy variable to catch access to uninitialized fields. In the +debug version, dtuple_create() will make all fields of dtuple_t point +to data_error. */ +ut_d(byte data_error); +#endif /* UNIV_DEBUG */ + +/** Trim the tail of an index tuple before insert or update. +After instant ADD COLUMN, if the last fields of a clustered index tuple +match the default values that were explicitly specified or implied during +ADD COLUMN, there will be no need to store them. +NOTE: A page latch in the index must be held, so that the index +may not lose 'instantness' before the trimmed tuple has been +inserted or updated. 
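
The backwards scan described here can be sketched independently of
dict_index_t: walk from the end of the tuple and drop every trailing field
whose length and bytes equal its column default, stopping at the first
mismatch or at the core-field boundary. The types below are illustrative
stand-ins; dropped-column and SQL NULL handling is omitted.

    #include <cstring>
    #include <vector>

    struct field { const void* data; std::size_t len; };

    std::size_t trim_trailing_defaults(const std::vector<field>& tuple,
                                       const std::vector<field>& defaults,
                                       std::size_t n_core)
    {
      std::size_t i = tuple.size();
      for (; i > n_core; i--)
      {
        const field& f = tuple[i - 1];
        const field& d = defaults[i - 1];
        if (f.len != d.len)
          break;                  // length differs: field must be stored
        if (f.len && f.data != d.data && std::memcmp(f.data, d.data, f.len))
          break;                  // contents differ: field must be stored
      }
      return i;                   // new field count
    }
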
+@param[in] index index possibly with instantly added columns */ +void dtuple_t::trim(const dict_index_t& index) +{ + ut_ad(n_fields >= index.n_core_fields); + ut_ad(n_fields <= index.n_fields); + ut_ad(index.is_instant()); + + ulint i = n_fields; + for (; i > index.n_core_fields; i--) { + const dfield_t* dfield = dtuple_get_nth_field(this, i - 1); + const dict_col_t* col = dict_index_get_nth_col(&index, i - 1); + + if (col->is_dropped()) { + continue; + } + + ut_ad(col->is_added()); + ulint len = dfield_get_len(dfield); + if (len != col->def_val.len) { + break; + } + + if (len != 0 && len != UNIV_SQL_NULL + && dfield->data != col->def_val.data + && memcmp(dfield->data, col->def_val.data, len)) { + break; + } + } + + n_fields = i; +} + +/*********************************************************************//** +Sets number of fields used in a tuple. Normally this is set in +dtuple_create, but if you want later to set it smaller, you can use this. */ +void +dtuple_set_n_fields( +/*================*/ + dtuple_t* tuple, /*!< in: tuple */ + ulint n_fields) /*!< in: number of fields */ +{ + tuple->n_fields = n_fields; + tuple->n_fields_cmp = n_fields; +} + +/**********************************************************//** +Checks that a data field is typed. +@return TRUE if ok */ +static +ibool +dfield_check_typed_no_assert( +/*=========================*/ + const dfield_t* field) /*!< in: data field */ +{ + if (dfield_get_type(field)->mtype > DATA_MTYPE_CURRENT_MAX + || dfield_get_type(field)->mtype < DATA_MTYPE_CURRENT_MIN) { + + ib::error() << "Data field type " + << dfield_get_type(field)->mtype + << ", len " << dfield_get_len(field); + + return(FALSE); + } + + return(TRUE); +} + +/**********************************************************//** +Checks that a data tuple is typed. +@return TRUE if ok */ +static +ibool +dtuple_check_typed_no_assert( +/*=========================*/ + const dtuple_t* tuple) /*!< in: tuple */ +{ + const dfield_t* field; + ulint i; + + if (dtuple_get_n_fields(tuple) > REC_MAX_N_FIELDS) { + ib::error() << "Index entry has " + << dtuple_get_n_fields(tuple) << " fields"; +dump: + fputs("InnoDB: Tuple contents: ", stderr); + dtuple_print(stderr, tuple); + putc('\n', stderr); + + return(FALSE); + } + + for (i = 0; i < dtuple_get_n_fields(tuple); i++) { + + field = dtuple_get_nth_field(tuple, i); + + if (!dfield_check_typed_no_assert(field)) { + goto dump; + } + } + + return(TRUE); +} + +#ifdef UNIV_DEBUG +/**********************************************************//** +Checks that a data field is typed. Asserts an error if not. +@return TRUE if ok */ +ibool +dfield_check_typed( +/*===============*/ + const dfield_t* field) /*!< in: data field */ +{ + if (dfield_get_type(field)->mtype > DATA_MTYPE_CURRENT_MAX + || dfield_get_type(field)->mtype < DATA_MTYPE_CURRENT_MIN) { + + ib::fatal() << "Data field type " + << dfield_get_type(field)->mtype + << ", len " << dfield_get_len(field); + } + + return(TRUE); +} + +/**********************************************************//** +Checks that a data tuple is typed. Asserts an error if not. 
+@return TRUE if ok */ +ibool +dtuple_check_typed( +/*===============*/ + const dtuple_t* tuple) /*!< in: tuple */ +{ + const dfield_t* field; + ulint i; + + for (i = 0; i < dtuple_get_n_fields(tuple); i++) { + + field = dtuple_get_nth_field(tuple, i); + + ut_a(dfield_check_typed(field)); + } + + return(TRUE); +} + +/**********************************************************//** +Validates the consistency of a tuple which must be complete, i.e, +all fields must have been set. +@return TRUE if ok */ +ibool +dtuple_validate( +/*============*/ + const dtuple_t* tuple) /*!< in: tuple */ +{ + ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N); +#ifdef HAVE_valgrind + const ulint n_fields = dtuple_get_n_fields(tuple); + + for (ulint i = 0; i < n_fields; i++) { + const dfield_t* field = dtuple_get_nth_field(tuple, i); + + if (!dfield_is_null(field)) { + MEM_CHECK_DEFINED(dfield_get_data(field), + dfield_get_len(field)); + } + } +#endif /* HAVE_valgrind */ + ut_ad(dtuple_check_typed(tuple)); + + return(TRUE); +} +#endif /* UNIV_DEBUG */ + +/*************************************************************//** +Pretty prints a dfield value according to its data type. */ +void +dfield_print( +/*=========*/ + const dfield_t* dfield) /*!< in: dfield */ +{ + const byte* data; + ulint len; + ulint i; + + len = dfield_get_len(dfield); + data = static_cast(dfield_get_data(dfield)); + + if (dfield_is_null(dfield)) { + fputs("NULL", stderr); + + return; + } + + switch (dtype_get_mtype(dfield_get_type(dfield))) { + case DATA_CHAR: + case DATA_VARCHAR: + for (i = 0; i < len; i++) { + int c = *data++; + putc(isprint(c) ? c : ' ', stderr); + } + + if (dfield_is_ext(dfield)) { + fputs("(external)", stderr); + } + break; + case DATA_INT: + ut_a(len == 4); /* only works for 32-bit integers */ + fprintf(stderr, "%d", (int) mach_read_from_4(data)); + break; + default: + ut_error; + } +} + +/*************************************************************//** +Pretty prints a dfield value according to its data type. Also the hex string +is printed if a string contains non-printable characters. 
*/
+void
+dfield_print_also_hex(
+/*==================*/
+        const dfield_t* dfield) /*!< in: dfield */
+{
+        const byte*     data;
+        ulint           len;
+        ulint           prtype;
+        ulint           i;
+        ibool           print_also_hex;
+
+        len = dfield_get_len(dfield);
+        data = static_cast<const byte*>(dfield_get_data(dfield));
+
+        if (dfield_is_null(dfield)) {
+                fputs("NULL", stderr);
+
+                return;
+        }
+
+        prtype = dtype_get_prtype(dfield_get_type(dfield));
+
+        switch (dtype_get_mtype(dfield_get_type(dfield))) {
+        ib_id_t id;
+        case DATA_INT:
+                switch (len) {
+                ulint   val;
+                case 1:
+                        val = mach_read_from_1(data);
+
+                        if (!(prtype & DATA_UNSIGNED)) {
+                                val &= ~0x80U;
+                                fprintf(stderr, "%ld", (long) val);
+                        } else {
+                                fprintf(stderr, "%lu", (ulong) val);
+                        }
+                        break;
+
+                case 2:
+                        val = mach_read_from_2(data);
+
+                        if (!(prtype & DATA_UNSIGNED)) {
+                                val &= ~0x8000U;
+                                fprintf(stderr, "%ld", (long) val);
+                        } else {
+                                fprintf(stderr, "%lu", (ulong) val);
+                        }
+                        break;
+
+                case 3:
+                        val = mach_read_from_3(data);
+
+                        if (!(prtype & DATA_UNSIGNED)) {
+                                val &= ~0x800000U;
+                                fprintf(stderr, "%ld", (long) val);
+                        } else {
+                                fprintf(stderr, "%lu", (ulong) val);
+                        }
+                        break;
+
+                case 4:
+                        val = mach_read_from_4(data);
+
+                        if (!(prtype & DATA_UNSIGNED)) {
+                                val &= ~0x80000000;
+                                fprintf(stderr, "%ld", (long) val);
+                        } else {
+                                fprintf(stderr, "%lu", (ulong) val);
+                        }
+                        break;
+
+                case 6:
+                        id = mach_read_from_6(data);
+                        fprintf(stderr, IB_ID_FMT, id);
+                        break;
+
+                case 7:
+                        id = mach_read_from_7(data);
+                        fprintf(stderr, IB_ID_FMT, id);
+                        break;
+                case 8:
+                        id = mach_read_from_8(data);
+                        fprintf(stderr, IB_ID_FMT, id);
+                        break;
+                default:
+                        goto print_hex;
+                }
+                break;
+
+        case DATA_SYS:
+                switch (prtype & DATA_SYS_PRTYPE_MASK) {
+                case DATA_TRX_ID:
+                        id = mach_read_from_6(data);
+
+                        fprintf(stderr, "trx_id " TRX_ID_FMT, id);
+                        break;
+
+                case DATA_ROLL_PTR:
+                        id = mach_read_from_7(data);
+
+                        fprintf(stderr, "roll_ptr " TRX_ID_FMT, id);
+                        break;
+
+                case DATA_ROW_ID:
+                        id = mach_read_from_6(data);
+
+                        fprintf(stderr, "row_id " TRX_ID_FMT, id);
+                        break;
+
+                default:
+                        goto print_hex;
+                }
+                break;
+
+        case DATA_CHAR:
+        case DATA_VARCHAR:
+                print_also_hex = FALSE;
+
+                for (i = 0; i < len; i++) {
+                        int c = *data++;
+
+                        if (!isprint(c)) {
+                                print_also_hex = TRUE;
+
+                                fprintf(stderr, "\\x%02x", (unsigned char) c);
+                        } else {
+                                putc(c, stderr);
+                        }
+                }
+
+                if (dfield_is_ext(dfield)) {
+                        fputs("(external)", stderr);
+                }
+
+                if (!print_also_hex) {
+                        break;
+                }
+
+                data = static_cast<const byte*>(dfield_get_data(dfield));
+                /* fall through */
+
+        case DATA_BINARY:
+        default:
+print_hex:
+                fputs(" Hex: ",stderr);
+
+                for (i = 0; i < len; i++) {
+                        fprintf(stderr, "%02x", *data++);
+                }
+
+                if (dfield_is_ext(dfield)) {
+                        fputs("(external)", stderr);
+                }
+        }
+}
+
+/*************************************************************//**
+Print a dfield value using ut_print_buf. */
+static
+void
+dfield_print_raw(
+/*=============*/
+        FILE*           f,      /*!< in: output stream */
+        const dfield_t* dfield) /*!< in: dfield */
+{
+        ulint   len = dfield_get_len(dfield);
+        if (!dfield_is_null(dfield)) {
+                ulint   print_len = ut_min(len, static_cast<ulint>(1000));
+                ut_print_buf(f, dfield_get_data(dfield), print_len);
+                if (len != print_len) {
+                        std::ostringstream str_bytes;
+                        str_bytes << ib::bytes_iec{len};
+                        fprintf(f, "(total %s%s)",
+                                str_bytes.str().c_str(),
+                                dfield_is_ext(dfield) ? ", external" : "");
+                }
+        } else {
+                fputs(" SQL NULL", f);
+        }
+}
+
+/**********************************************************//**
+The following function prints the contents of a tuple.
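
The masking above (val &= ~0x80U and friends) strips the flipped sign bit to
recover non-negative values: InnoDB stores signed integers big-endian with
the sign bit inverted, so that unsigned byte-wise comparison of the stored
form matches numeric order. A round-trip sketch for the 4-byte case:

    #include <cstdint>

    void store_int32(unsigned char* b, int32_t v)
    {
      uint32_t u = static_cast<uint32_t>(v) ^ 0x80000000U;  // flip sign bit
      b[0] = static_cast<unsigned char>(u >> 24);           // big-endian
      b[1] = static_cast<unsigned char>(u >> 16);
      b[2] = static_cast<unsigned char>(u >> 8);
      b[3] = static_cast<unsigned char>(u);
    }

    int32_t load_int32(const unsigned char* b)
    {
      uint32_t u = (uint32_t(b[0]) << 24) | (uint32_t(b[1]) << 16)
                 | (uint32_t(b[2]) << 8) | uint32_t(b[3]);
      return static_cast<int32_t>(u ^ 0x80000000U);         // flip back
    }
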
*/ +void +dtuple_print( +/*=========*/ + FILE* f, /*!< in: output stream */ + const dtuple_t* tuple) /*!< in: tuple */ +{ + ulint n_fields; + ulint i; + + n_fields = dtuple_get_n_fields(tuple); + + fprintf(f, "DATA TUPLE: %lu fields;\n", (ulong) n_fields); + + for (i = 0; i < n_fields; i++) { + fprintf(f, " %lu:", (ulong) i); + + dfield_print_raw(f, dtuple_get_nth_field(tuple, i)); + + putc(';', f); + putc('\n', f); + } + + ut_ad(dtuple_validate(tuple)); +} + +/** Print the contents of a tuple. +@param[out] o output stream +@param[in] field array of data fields +@param[in] n number of data fields */ +void +dfield_print( + std::ostream& o, + const dfield_t* field, + ulint n) +{ + for (ulint i = 0; i < n; i++, field++) { + const void* data = dfield_get_data(field); + const ulint len = dfield_get_len(field); + + if (i) { + o << ','; + } + + if (dfield_is_null(field)) { + o << "NULL"; + } else if (dfield_is_ext(field)) { + ulint local_len = len - BTR_EXTERN_FIELD_REF_SIZE; + ut_ad(len >= BTR_EXTERN_FIELD_REF_SIZE); + + o << '[' + << local_len + << '+' << BTR_EXTERN_FIELD_REF_SIZE << ']'; + ut_print_buf(o, data, local_len); + ut_print_buf_hex(o, static_cast(data) + + local_len, + BTR_EXTERN_FIELD_REF_SIZE); + } else { + o << '[' << len << ']'; + ut_print_buf(o, data, len); + } + } +} + +/** Print the contents of a tuple. +@param[out] o output stream +@param[in] tuple data tuple */ +void +dtuple_print( + std::ostream& o, + const dtuple_t* tuple) +{ + const ulint n = dtuple_get_n_fields(tuple); + + o << "TUPLE (info_bits=" << dtuple_get_info_bits(tuple) + << ", " << n << " fields): {"; + + dfield_print(o, tuple->fields, n); + + o << "}"; +} + +/**************************************************************//** +Moves parts of long fields in entry to the big record vector so that +the size of tuple drops below the maximum record size allowed in the +database. Moves data only from those fields which are not necessary +to determine uniquely the insertion place of the tuple in the index. 
+@return own: created big record vector, NULL if we are not able to +shorten the entry enough, i.e., if there are too many fixed-length or +short fields in entry or the index is clustered */ +big_rec_t* +dtuple_convert_big_rec( +/*===================*/ + dict_index_t* index, /*!< in: index */ + upd_t* upd, /*!< in/out: update vector */ + dtuple_t* entry, /*!< in/out: index entry */ + ulint* n_ext) /*!< in/out: number of + externally stored columns */ +{ + mem_heap_t* heap; + big_rec_t* vector; + dfield_t* dfield; + ulint size; + ulint local_prefix_len; + + if (!dict_index_is_clust(index)) { + return(NULL); + } + + if (!index->table->space) { + return NULL; + } + + ulint local_len = index->table->get_overflow_field_local_len(); + const auto zip_size = index->table->space->zip_size(); + + ut_ad(index->n_uniq > 0); + + ut_a(dtuple_check_typed_no_assert(entry)); + + size = rec_get_converted_size(index, entry, *n_ext); + + if (UNIV_UNLIKELY(size > 1000000000)) { + ib::warn() << "Tuple size is very big: " << ib::bytes_iec{size}; + fputs("InnoDB: Tuple contents: ", stderr); + dtuple_print(stderr, entry); + putc('\n', stderr); + } + + heap = mem_heap_create(size + dtuple_get_n_fields(entry) + * sizeof(big_rec_field_t) + 1000); + + vector = big_rec_t::alloc(heap, dtuple_get_n_fields(entry)); + + /* Decide which fields to shorten: the algorithm is to look for + a variable-length field that yields the biggest savings when + stored externally */ + + ut_d(ulint n_fields = 0); + uint16_t longest_i; + ulint longest; + + const bool mblob = entry->is_alter_metadata(); + ut_ad(entry->n_fields - mblob >= index->first_user_field()); + ut_ad(entry->n_fields - mblob <= index->n_fields); + + if (mblob) { + longest_i = index->first_user_field(); + dfield = dtuple_get_nth_field(entry, longest_i); + local_len = BTR_EXTERN_FIELD_REF_SIZE; + ut_ad(!dfield_is_ext(dfield)); + goto ext_write; + } + + if (!dict_table_has_atomic_blobs(index->table)) { + /* up to MySQL 5.1: store a 768-byte prefix locally */ + local_len = BTR_EXTERN_FIELD_REF_SIZE + + DICT_ANTELOPE_MAX_INDEX_COL_LEN; + } else { + /* new-format table: do not store any BLOB prefix locally */ + local_len = BTR_EXTERN_FIELD_REF_SIZE; + } + + while (page_zip_rec_needs_ext(rec_get_converted_size(index, entry, + *n_ext), + index->table->not_redundant(), + dict_index_get_n_fields(index), + zip_size)) { + longest_i = 0; + longest = 0; + for (uint16_t i = index->first_user_field(); + i < entry->n_fields - mblob; i++) { + ulint savings; + dfield = dtuple_get_nth_field(entry, i + mblob); + + const dict_field_t* ifield = dict_index_get_nth_field( + index, i); + + /* Skip fixed-length, NULL, externally stored, + or short columns */ + + if (ifield->fixed_len + || dfield_is_null(dfield) + || dfield_is_ext(dfield) + || dfield_get_len(dfield) <= local_len + || dfield_get_len(dfield) + <= BTR_EXTERN_LOCAL_STORED_MAX_SIZE) { + goto skip_field; + } + + savings = dfield_get_len(dfield) - local_len; + + /* Check that there would be savings */ + if (longest >= savings) { + goto skip_field; + } + + /* In DYNAMIC and COMPRESSED format, store + locally any non-BLOB columns whose maximum + length does not exceed 256 bytes. This is + because there is no room for the "external + storage" flag when the maximum length is 255 + bytes or less. This restriction trivially + holds in REDUNDANT and COMPACT format, because + there we always store locally columns whose + length is up to local_len == 788 bytes. 
+                        @see rec_init_offsets_comp_ordinary */
+                        if (!DATA_BIG_COL(ifield->col)) {
+                                goto skip_field;
+                        }
+
+                        longest_i = uint16_t(i + mblob);
+                        longest = savings;
+
+skip_field:
+                        continue;
+                }
+
+                if (!longest_i) {
+                        /* Cannot shorten more */
+
+                        mem_heap_free(heap);
+
+                        return(NULL);
+                }
+
+                /* Move data from field longest_i to big rec vector.
+
+                We store the first bytes locally to the record. Then
+                we can calculate all ordering fields in all indexes
+                from locally stored data. */
+                dfield = dtuple_get_nth_field(entry, longest_i);
+ext_write:
+                local_prefix_len = local_len - BTR_EXTERN_FIELD_REF_SIZE;
+
+                vector->append(
+                        big_rec_field_t(
+                                longest_i,
+                                dfield_get_len(dfield) - local_prefix_len,
+                                static_cast<char*>(dfield_get_data(dfield))
+                                + local_prefix_len));
+
+                /* Allocate the locally stored part of the column. */
+                byte*   data = static_cast<byte*>(
+                        mem_heap_alloc(heap, local_len));
+
+                /* Copy the local prefix. */
+                memcpy(data, dfield_get_data(dfield), local_prefix_len);
+                /* Clear the extern field reference (BLOB pointer). */
+                memset(data + local_prefix_len, 0, BTR_EXTERN_FIELD_REF_SIZE);
+
+                dfield_set_data(dfield, data, local_len);
+                dfield_set_ext(dfield);
+
+                (*n_ext)++;
+                ut_ad(++n_fields < dtuple_get_n_fields(entry));
+
+                if (upd && !upd->is_modified(longest_i)) {
+
+                        DEBUG_SYNC_C("ib_mv_nonupdated_column_offpage");
+
+                        upd_field_t     upd_field;
+                        upd_field.field_no = longest_i;
+                        upd_field.orig_len = 0;
+                        upd_field.exp = NULL;
+                        upd_field.old_v_val = NULL;
+                        dfield_copy(&upd_field.new_val,
+                                    dfield->clone(upd->heap));
+                        upd->append(upd_field);
+                        ut_ad(upd->is_modified(longest_i));
+
+                        ut_ad(upd_field.new_val.len
+                              >= BTR_EXTERN_FIELD_REF_SIZE);
+                        ut_ad(upd_field.new_val.len == local_len);
+                        ut_ad(upd_field.new_val.len == dfield_get_len(dfield));
+                }
+        }
+
+        ut_ad(n_fields == vector->n_fields);
+
+        return(vector);
+}
+
+/**************************************************************//**
+Puts back to entry the data stored in vector. Note that to ensure the
+fields in entry can accommodate the data, vector must have been created
+from entry with dtuple_convert_big_rec. */
+void
+dtuple_convert_back_big_rec(
+/*========================*/
+        dict_index_t*   index MY_ATTRIBUTE((unused)),  /*!< in: index */
+        dtuple_t*       entry,  /*!< in/out: entry whose data was put to vector */
+        big_rec_t*      vector) /*!< in, own: big rec vector; it is
+                                freed in this function */
+{
+        big_rec_field_t* b = vector->fields;
+        const big_rec_field_t* const end = b + vector->n_fields;
+
+        for (; b < end; b++) {
+                dfield_t*       dfield;
+                ulint           local_len;
+
+                dfield = dtuple_get_nth_field(entry, b->field_no);
+                local_len = dfield_get_len(dfield);
+
+                ut_ad(dfield_is_ext(dfield));
+                ut_ad(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+                local_len -= BTR_EXTERN_FIELD_REF_SIZE;
+
+                /* Only in REDUNDANT and COMPACT format, we store
+                up to DICT_ANTELOPE_MAX_INDEX_COL_LEN (768) bytes
+                locally */
+                ut_ad(local_len <= DICT_ANTELOPE_MAX_INDEX_COL_LEN);
+
+                dfield_set_data(dfield,
+                                (char*) b->data - local_len,
+                                b->len + local_len);
+        }
+
+        mem_heap_free(vector->heap);
+}
+
+/** Allocate a big_rec_t object in the given memory heap, and for storing
+n_fld number of fields.
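
The selection loop in dtuple_convert_big_rec() above is greedy: while the
record is still too large, pick the variable-length field whose move off-page
saves the most bytes, keep only a local prefix plus the 20-byte extern
reference, and repeat. A compact sketch of the selection step; the column
type and some qualification checks (e.g. DATA_BIG_COL) are simplified here:

    #include <cstddef>
    #include <vector>

    struct col { std::size_t len; bool fixed, null, ext; };

    // Returns the index of the best candidate, or -1 if none qualifies.
    int pick_biggest_savings(const std::vector<col>& cols,
                             std::size_t local_len)
    {
      int best = -1;
      std::size_t best_savings = 0;
      for (std::size_t i = 0; i < cols.size(); i++)
      {
        const col& c = cols[i];
        if (c.fixed || c.null || c.ext || c.len <= local_len)
          continue;               // cannot or need not be externalized
        std::size_t savings = c.len - local_len;
        if (savings > best_savings)
        {
          best_savings = savings;
          best = int(i);
        }
      }
      return best;
    }
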
+@param[in]      heap    memory heap in which this object is allocated
+@param[in]      n_fld   maximum number of fields that can be stored in
+                        this object
+
+@return the allocated object */
+big_rec_t*
+big_rec_t::alloc(
+        mem_heap_t*     heap,
+        ulint           n_fld)
+{
+        big_rec_t*      rec = static_cast<big_rec_t*>(
+                mem_heap_alloc(heap, sizeof(big_rec_t)));
+
+        new(rec) big_rec_t(n_fld);
+
+        rec->heap = heap;
+        rec->fields = static_cast<big_rec_field_t*>(
+                mem_heap_alloc(heap,
+                               n_fld * sizeof(big_rec_field_t)));
+
+        rec->n_fields = 0;
+        return(rec);
+}
+
+/** Create a deep copy of this object.
+@param[in,out]  heap    memory heap in which the clone will be created
+@return the cloned object */
+dfield_t*
+dfield_t::clone(mem_heap_t* heap) const
+{
+        const ulint size = len == UNIV_SQL_NULL ? 0 : len;
+        dfield_t* obj = static_cast<dfield_t*>(
+                mem_heap_alloc(heap, sizeof(dfield_t) + size));
+
+        ut_ad(len != UNIV_SQL_DEFAULT);
+        obj->ext = ext;
+        obj->len = len;
+        obj->type = type;
+        obj->spatial_status = spatial_status;
+
+        if (len != UNIV_SQL_NULL) {
+                obj->data = obj + 1;
+                memcpy(obj->data, data, len);
+        } else {
+                obj->data = 0;
+        }
+
+        return(obj);
+}
diff --git a/storage/innobase/data/data0type.cc b/storage/innobase/data/data0type.cc
new file mode 100644
index 00000000..b1952bcc
--- /dev/null
+++ b/storage/innobase/data/data0type.cc
@@ -0,0 +1,212 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file data/data0type.cc
+Data types
+
+Created 1/16/1996 Heikki Tuuri
+*******************************************************/
+
+#include "dict0mem.h"
+#include "my_sys.h"
+
+/** The DB_TRX_ID,DB_ROLL_PTR values for "no history is available" */
+const byte reset_trx_id[DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN] = {
+        0, 0, 0, 0, 0, 0,
+        0x80, 0, 0, 0, 0, 0, 0
+};
+
+/* At the database startup we store the default-charset collation number of
+this MySQL installation to this global variable. If we have < 4.1.2 format
+column definitions, or records in the insert buffer, we use this
+charset-collation code for them. */
+
+ulint   data_mysql_default_charset_coll;
+
+/*********************************************************************//**
+Determine how many bytes the first n characters of the given string occupy.
+If the string is shorter than n characters, returns the number of bytes
+the characters in the string occupy.
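
dfield_t::clone() above uses a single-allocation idiom: the descriptor and
its payload come from one mem_heap_alloc() call, and the data pointer is set
to the byte right past the struct (obj + 1), so the clone is freed as one
unit. A generic sketch of that layout trick, with malloc standing in for
mem_heap_alloc:

    #include <cstdlib>
    #include <cstring>

    struct blob { void* data; std::size_t len; };

    blob* clone_blob(const void* src, std::size_t len)
    {
      blob* obj = static_cast<blob*>(std::malloc(sizeof(blob) + len));
      if (!obj)
        return nullptr;
      obj->len = len;
      obj->data = obj + 1;        // payload lives right after the header
      std::memcpy(obj->data, src, len);
      return obj;                 // released with a single std::free()
    }
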
+@return length of the prefix, in bytes */ +ulint +dtype_get_at_most_n_mbchars( +/*========================*/ + ulint prtype, /*!< in: precise type */ + ulint mbminlen, /*!< in: minimum length of + a multi-byte character, in bytes */ + ulint mbmaxlen, /*!< in: maximum length of + a multi-byte character, in bytes */ + ulint prefix_len, /*!< in: length of the requested + prefix, in characters, multiplied by + dtype_get_mbmaxlen(dtype) */ + ulint data_len, /*!< in: length of str (in bytes) */ + const char* str) /*!< in: the string whose prefix + length is being determined */ +{ + ut_a(len_is_stored(data_len)); + ut_ad(!mbmaxlen || !(prefix_len % mbmaxlen) || !(prefix_len % 4)); + + if (mbminlen != mbmaxlen) { + ut_a(!(prefix_len % mbmaxlen) || !(prefix_len % 4)); + return(innobase_get_at_most_n_mbchars( + dtype_get_charset_coll(prtype), + prefix_len, data_len, str)); + } + + if (prefix_len < data_len) { + + return(prefix_len); + + } + + return(data_len); +} + +/*********************************************************************//** +Validates a data type structure. +@return TRUE if ok */ +ibool +dtype_validate( +/*===========*/ + const dtype_t* type) /*!< in: type struct to validate */ +{ + ut_a(type); + ut_a(type->mtype >= DATA_VARCHAR); + ut_a(type->mtype <= DATA_MTYPE_MAX); + + if (type->mtype == DATA_SYS) { + ut_a((type->prtype & DATA_MYSQL_TYPE_MASK) < DATA_N_SYS_COLS); + } + + ut_a(dtype_get_mbminlen(type) <= dtype_get_mbmaxlen(type)); + + return(TRUE); +} + +#ifdef UNIV_DEBUG +/** Print a data type structure. +@param[in] type data type */ +void +dtype_print(const dtype_t* type) +{ + ulint mtype; + ulint prtype; + ulint len; + + ut_a(type); + + mtype = type->mtype; + prtype = type->prtype; + + switch (mtype) { + case DATA_VARCHAR: + fputs("DATA_VARCHAR", stderr); + break; + + case DATA_CHAR: + fputs("DATA_CHAR", stderr); + break; + + case DATA_BINARY: + fputs("DATA_BINARY", stderr); + break; + + case DATA_FIXBINARY: + fputs("DATA_FIXBINARY", stderr); + break; + + case DATA_BLOB: + fputs("DATA_BLOB", stderr); + break; + + case DATA_GEOMETRY: + fputs("DATA_GEOMETRY", stderr); + break; + + case DATA_INT: + fputs("DATA_INT", stderr); + break; + + case DATA_MYSQL: + fputs("DATA_MYSQL", stderr); + break; + + case DATA_SYS: + fputs("DATA_SYS", stderr); + break; + + case DATA_FLOAT: + fputs("DATA_FLOAT", stderr); + break; + + case DATA_DOUBLE: + fputs("DATA_DOUBLE", stderr); + break; + + case DATA_DECIMAL: + fputs("DATA_DECIMAL", stderr); + break; + + case DATA_VARMYSQL: + fputs("DATA_VARMYSQL", stderr); + break; + + default: + fprintf(stderr, "type %lu", (ulong) mtype); + break; + } + + len = type->len; + + if ((type->mtype == DATA_SYS) + || (type->mtype == DATA_VARCHAR) + || (type->mtype == DATA_CHAR)) { + putc(' ', stderr); + if (prtype == DATA_ROW_ID) { + fputs("DATA_ROW_ID", stderr); + len = DATA_ROW_ID_LEN; + } else if (prtype == DATA_ROLL_PTR) { + fputs("DATA_ROLL_PTR", stderr); + len = DATA_ROLL_PTR_LEN; + } else if (prtype == DATA_TRX_ID) { + fputs("DATA_TRX_ID", stderr); + len = DATA_TRX_ID_LEN; + } else if (prtype == DATA_ENGLISH) { + fputs("DATA_ENGLISH", stderr); + } else { + fprintf(stderr, "prtype %lu", (ulong) prtype); + } + } else { + if (prtype & DATA_UNSIGNED) { + fputs(" DATA_UNSIGNED", stderr); + } + + if (prtype & DATA_BINARY_TYPE) { + fputs(" DATA_BINARY_TYPE", stderr); + } + + if (prtype & DATA_NOT_NULL) { + fputs(" DATA_NOT_NULL", stderr); + } + } + + fprintf(stderr, " len %lu", (ulong) len); +} +#endif /* UNIV_DEBUG */ diff --git a/storage/innobase/dict/dict0boot.cc 
b/storage/innobase/dict/dict0boot.cc new file mode 100644 index 00000000..5516bce9 --- /dev/null +++ b/storage/innobase/dict/dict0boot.cc @@ -0,0 +1,440 @@ +/***************************************************************************** + +Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2016, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file dict/dict0boot.cc +Data dictionary creation and booting + +Created 4/18/1996 Heikki Tuuri +*******************************************************/ + +#include "dict0boot.h" +#include "dict0crea.h" +#include "btr0btr.h" +#include "dict0load.h" +#include "trx0trx.h" +#include "srv0srv.h" +#include "ibuf0ibuf.h" +#include "buf0flu.h" +#include "log0recv.h" +#include "os0file.h" + +/** The DICT_HDR page identifier */ +static constexpr page_id_t hdr_page_id{DICT_HDR_SPACE, DICT_HDR_PAGE_NO}; + +/** @return the DICT_HDR block, x-latched */ +static buf_block_t *dict_hdr_get(mtr_t *mtr) +{ + /* We assume that the DICT_HDR page is always readable and available. */ + return buf_page_get_gen(hdr_page_id, 0, RW_X_LATCH, nullptr, BUF_GET, mtr); +} + +/**********************************************************************//** +Returns a new table, index, or space id. */ +void +dict_hdr_get_new_id( +/*================*/ + table_id_t* table_id, /*!< out: table id + (not assigned if NULL) */ + index_id_t* index_id, /*!< out: index id + (not assigned if NULL) */ + uint32_t* space_id) /*!< out: space id + (not assigned if NULL) */ +{ + ib_id_t id; + mtr_t mtr; + + mtr.start(); + buf_block_t* dict_hdr = dict_hdr_get(&mtr); + + if (table_id) { + id = mach_read_from_8(DICT_HDR + DICT_HDR_TABLE_ID + + dict_hdr->page.frame); + id++; + mtr.write<8>(*dict_hdr, DICT_HDR + DICT_HDR_TABLE_ID + + dict_hdr->page.frame, id); + *table_id = id; + } + + if (index_id) { + id = mach_read_from_8(DICT_HDR + DICT_HDR_INDEX_ID + + dict_hdr->page.frame); + id++; + mtr.write<8>(*dict_hdr, DICT_HDR + DICT_HDR_INDEX_ID + + dict_hdr->page.frame, id); + *index_id = id; + } + + if (space_id) { + *space_id = mach_read_from_4(DICT_HDR + DICT_HDR_MAX_SPACE_ID + + dict_hdr->page.frame); + if (fil_assign_new_space_id(space_id)) { + mtr.write<4>(*dict_hdr, + DICT_HDR + DICT_HDR_MAX_SPACE_ID + + dict_hdr->page.frame, *space_id); + } + } + + mtr.commit(); +} + +/** Update dict_sys.row_id in the dictionary header file page. */ +void dict_hdr_flush_row_id(row_id_t id) +{ + mtr_t mtr; + mtr.start(); + buf_block_t* d= dict_hdr_get(&mtr); + byte *row_id= DICT_HDR + DICT_HDR_ROW_ID + d->page.frame; + if (mach_read_from_8(row_id) < id) + mtr.write<8>(*d, row_id, id); + mtr.commit(); +} + +/** Create the DICT_HDR page on database initialization. 
+@return error code */ +dberr_t dict_create() +{ + ulint root_page_no; + + dberr_t err; + mtr_t mtr; + mtr.start(); + compile_time_assert(DICT_HDR_SPACE == 0); + + /* Create the dictionary header file block in a new, allocated file + segment in the system tablespace */ + buf_block_t* d = fseg_create(fil_system.sys_space, + DICT_HDR + DICT_HDR_FSEG_HEADER, &mtr, + &err); + if (!d) { + goto func_exit; + } + ut_a(d->page.id() == hdr_page_id); + + /* Start counting row, table, index, and tree ids from + DICT_HDR_FIRST_ID */ + mtr.write<8>(*d, DICT_HDR + DICT_HDR_ROW_ID + d->page.frame, + DICT_HDR_FIRST_ID); + mtr.write<8>(*d, DICT_HDR + DICT_HDR_TABLE_ID + d->page.frame, + DICT_HDR_FIRST_ID); + mtr.write<8>(*d, DICT_HDR + DICT_HDR_INDEX_ID + d->page.frame, + DICT_HDR_FIRST_ID); + + ut_ad(!mach_read_from_4(DICT_HDR + DICT_HDR_MAX_SPACE_ID + + d->page.frame)); + + /* Obsolete, but we must initialize it anyway. */ + mtr.write<4>(*d, DICT_HDR + DICT_HDR_MIX_ID_LOW + d->page.frame, + DICT_HDR_FIRST_ID); + + /* Create the B-tree roots for the clustered indexes of the basic + system tables */ + + /*--------------------------*/ + root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, + fil_system.sys_space, DICT_TABLES_ID, + nullptr, &mtr, &err); + if (root_page_no == FIL_NULL) { + goto func_exit; + } + + mtr.write<4>(*d, DICT_HDR + DICT_HDR_TABLES + d->page.frame, + root_page_no); + /*--------------------------*/ + root_page_no = btr_create(DICT_UNIQUE, + fil_system.sys_space, DICT_TABLE_IDS_ID, + nullptr, &mtr, &err); + if (root_page_no == FIL_NULL) { + goto func_exit; + } + + mtr.write<4>(*d, DICT_HDR + DICT_HDR_TABLE_IDS + d->page.frame, + root_page_no); + /*--------------------------*/ + root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, + fil_system.sys_space, DICT_COLUMNS_ID, + nullptr, &mtr, &err); + if (root_page_no == FIL_NULL) { + goto func_exit; + } + + mtr.write<4>(*d, DICT_HDR + DICT_HDR_COLUMNS + d->page.frame, + root_page_no); + /*--------------------------*/ + root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, + fil_system.sys_space, DICT_INDEXES_ID, + nullptr, &mtr, &err); + if (root_page_no == FIL_NULL) { + goto func_exit; + } + + mtr.write<4>(*d, DICT_HDR + DICT_HDR_INDEXES + d->page.frame, + root_page_no); + /*--------------------------*/ + root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, + fil_system.sys_space, DICT_FIELDS_ID, + nullptr, &mtr, &err); + if (root_page_no == FIL_NULL) { + goto func_exit; + } + + mtr.write<4>(*d, DICT_HDR + DICT_HDR_FIELDS + d->page.frame, + root_page_no); +func_exit: + mtr.commit(); + return err ? err : dict_boot(); +} + +/*****************************************************************//** +Initializes the data dictionary memory structures when the database is +started. This function is also called when the data dictionary is created. +@return DB_SUCCESS or error code. 
*/ +dberr_t dict_boot() +{ + dict_table_t* table; + dict_index_t* index; + mem_heap_t* heap; + mtr_t mtr; + + static_assert(DICT_NUM_COLS__SYS_TABLES == 8, "compatibility"); + static_assert(DICT_NUM_FIELDS__SYS_TABLES == 10, "compatibility"); + static_assert(DICT_NUM_FIELDS__SYS_TABLE_IDS == 2, "compatibility"); + static_assert(DICT_NUM_COLS__SYS_COLUMNS == 7, "compatibility"); + static_assert(DICT_NUM_FIELDS__SYS_COLUMNS == 9, "compatibility"); + static_assert(DICT_NUM_COLS__SYS_INDEXES == 8, "compatibility"); + static_assert(DICT_NUM_FIELDS__SYS_INDEXES == 10, "compatibility"); + static_assert(DICT_NUM_COLS__SYS_FIELDS == 3, "compatibility"); + static_assert(DICT_NUM_FIELDS__SYS_FIELDS == 5, "compatibility"); + static_assert(DICT_NUM_COLS__SYS_FOREIGN == 4, "compatibility"); + static_assert(DICT_NUM_FIELDS__SYS_FOREIGN == 6, "compatibility"); + static_assert(DICT_NUM_FIELDS__SYS_FOREIGN_FOR_NAME == 2, + "compatibility"); + static_assert(DICT_NUM_COLS__SYS_FOREIGN_COLS == 4, "compatibility"); + static_assert(DICT_NUM_FIELDS__SYS_FOREIGN_COLS == 6, "compatibility"); + + mtr.start(); + /* Create the hash tables etc. */ + dict_sys.create(); + + dberr_t err; + const buf_block_t *d = buf_page_get_gen(hdr_page_id, 0, RW_X_LATCH, + nullptr, BUF_GET, &mtr, &err); + if (!d) { + mtr.commit(); + return err; + } + + heap = mem_heap_create(450); + + dict_sys.lock(SRW_LOCK_CALL); + + const byte* dict_hdr = &d->page.frame[DICT_HDR]; + + /* Because we only write new row ids to disk-based data structure + (dictionary header) when it is divisible by + DICT_HDR_ROW_ID_WRITE_MARGIN, in recovery we will not recover + the latest value of the row id counter. Therefore we advance + the counter at the database startup to avoid overlapping values. + Note that when a user after database startup first time asks for + a new row id, then because the counter is now divisible by + ..._MARGIN, it will immediately be updated to the disk-based + header. */ + + dict_sys.recover_row_id(mach_read_from_8(dict_hdr + DICT_HDR_ROW_ID)); + if (uint32_t max_space_id + = mach_read_from_4(dict_hdr + DICT_HDR_MAX_SPACE_ID)) { + max_space_id--; + fil_assign_new_space_id(&max_space_id); + } + + /* Insert into the dictionary cache the descriptions of the basic + system tables */ + /*-------------------------*/ + table = dict_table_t::create(dict_sys.SYS_TABLE[dict_sys.SYS_TABLES], + fil_system.sys_space, + DICT_NUM_COLS__SYS_TABLES, 0, 0, 0); + dict_mem_table_add_col(table, heap, "NAME", DATA_BINARY, 0, + MAX_FULL_NAME_LEN); + dict_mem_table_add_col(table, heap, "ID", DATA_BINARY, 0, 8); + /* ROW_FORMAT = (N_COLS >> 31) ? COMPACT : REDUNDANT */ + dict_mem_table_add_col(table, heap, "N_COLS", DATA_INT, 0, 4); + /* The low order bit of TYPE is always set to 1. If ROW_FORMAT + is not REDUNDANT or COMPACT, this field matches table->flags. */ + dict_mem_table_add_col(table, heap, "TYPE", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "MIX_ID", DATA_BINARY, 0, 0); + /* MIX_LEN may contain additional table flags when + ROW_FORMAT!=REDUNDANT. 
*/ + dict_mem_table_add_col(table, heap, "MIX_LEN", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "CLUSTER_NAME", DATA_BINARY, 0, 0); + dict_mem_table_add_col(table, heap, "SPACE", DATA_INT, 0, 4); + + table->id = DICT_TABLES_ID; + + dict_table_add_system_columns(table, heap); + table->add_to_cache(); + dict_sys.sys_tables = table; + mem_heap_empty(heap); + + index = dict_mem_index_create(table, "CLUST_IND", + DICT_UNIQUE | DICT_CLUSTERED, 1); + + dict_mem_index_add_field(index, "NAME", 0); + + index->id = DICT_TABLES_ID; + err = dict_index_add_to_cache( + index, mach_read_from_4(dict_hdr + DICT_HDR_TABLES)); + ut_a(err == DB_SUCCESS); + ut_ad(!table->is_instant()); + table->indexes.start->n_core_null_bytes = static_cast<uint8_t>( + UT_BITS_IN_BYTES(unsigned(table->indexes.start->n_nullable))); + + /*-------------------------*/ + index = dict_mem_index_create(table, "ID_IND", DICT_UNIQUE, 1); + dict_mem_index_add_field(index, "ID", 0); + + index->id = DICT_TABLE_IDS_ID; + err = dict_index_add_to_cache( + index, mach_read_from_4(dict_hdr + DICT_HDR_TABLE_IDS)); + ut_a(err == DB_SUCCESS); + + /*-------------------------*/ + table = dict_table_t::create(dict_sys.SYS_TABLE[dict_sys.SYS_COLUMNS], + fil_system.sys_space, + DICT_NUM_COLS__SYS_COLUMNS, 0, 0, 0); + dict_mem_table_add_col(table, heap, "TABLE_ID", DATA_BINARY, 0, 8); + dict_mem_table_add_col(table, heap, "POS", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "NAME", DATA_BINARY, 0, 0); + dict_mem_table_add_col(table, heap, "MTYPE", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "PRTYPE", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "LEN", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "PREC", DATA_INT, 0, 4); + + table->id = DICT_COLUMNS_ID; + + dict_table_add_system_columns(table, heap); + table->add_to_cache(); + dict_sys.sys_columns = table; + mem_heap_empty(heap); + + index = dict_mem_index_create(table, "CLUST_IND", + DICT_UNIQUE | DICT_CLUSTERED, 2); + + dict_mem_index_add_field(index, "TABLE_ID", 0); + dict_mem_index_add_field(index, "POS", 0); + + index->id = DICT_COLUMNS_ID; + err = dict_index_add_to_cache( + index, mach_read_from_4(dict_hdr + DICT_HDR_COLUMNS)); + ut_a(err == DB_SUCCESS); + ut_ad(!table->is_instant()); + table->indexes.start->n_core_null_bytes = static_cast<uint8_t>( + UT_BITS_IN_BYTES(unsigned(table->indexes.start->n_nullable))); + + /*-------------------------*/ + table = dict_table_t::create(dict_sys.SYS_TABLE[dict_sys.SYS_INDEXES], + fil_system.sys_space, + DICT_NUM_COLS__SYS_INDEXES, 0, 0, 0); + + dict_mem_table_add_col(table, heap, "TABLE_ID", DATA_BINARY, 0, 8); + dict_mem_table_add_col(table, heap, "ID", DATA_BINARY, 0, 8); + dict_mem_table_add_col(table, heap, "NAME", DATA_BINARY, 0, 0); + dict_mem_table_add_col(table, heap, "N_FIELDS", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "TYPE", DATA_INT, 0, 4); + /* SYS_INDEXES.SPACE is only read in dict_drop_index_tree() */ + dict_mem_table_add_col(table, heap, "SPACE", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "PAGE_NO", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "MERGE_THRESHOLD", DATA_INT, 0, 4); + + table->id = DICT_INDEXES_ID; + + dict_table_add_system_columns(table, heap); + /* The column SYS_INDEXES.MERGE_THRESHOLD was "instantly" + added in MySQL 5.7 and MariaDB 10.2.2. Assign it DEFAULT NULL. + Because of file format compatibility, we must treat SYS_INDEXES + as a special case, relaxing some debug assertions + for DICT_INDEXES_ID.
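+ (Records written before the column existed carry one field less, + which is why dict_drop_index_tree() accepts SYS_INDEXES records + with either field count.)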
*/ + dict_table_get_nth_col(table, DICT_COL__SYS_INDEXES__MERGE_THRESHOLD) + ->def_val.len = UNIV_SQL_NULL; + table->add_to_cache(); + dict_sys.sys_indexes = table; + mem_heap_empty(heap); + + index = dict_mem_index_create(table, "CLUST_IND", + DICT_UNIQUE | DICT_CLUSTERED, 2); + + dict_mem_index_add_field(index, "TABLE_ID", 0); + dict_mem_index_add_field(index, "ID", 0); + + index->id = DICT_INDEXES_ID; + err = dict_index_add_to_cache( + index, mach_read_from_4(dict_hdr + DICT_HDR_INDEXES)); + ut_a(err == DB_SUCCESS); + ut_ad(!table->is_instant()); + table->indexes.start->n_core_null_bytes = static_cast<uint8_t>( + UT_BITS_IN_BYTES(unsigned(table->indexes.start->n_nullable))); + + /*-------------------------*/ + table = dict_table_t::create(dict_sys.SYS_TABLE[dict_sys.SYS_FIELDS], + fil_system.sys_space, + DICT_NUM_COLS__SYS_FIELDS, 0, 0, 0); + dict_mem_table_add_col(table, heap, "INDEX_ID", DATA_BINARY, 0, 8); + dict_mem_table_add_col(table, heap, "POS", DATA_INT, 0, 4); + dict_mem_table_add_col(table, heap, "COL_NAME", DATA_BINARY, 0, 0); + + table->id = DICT_FIELDS_ID; + + dict_table_add_system_columns(table, heap); + table->add_to_cache(); + dict_sys.sys_fields = table; + mem_heap_free(heap); + + index = dict_mem_index_create(table, "CLUST_IND", + DICT_UNIQUE | DICT_CLUSTERED, 2); + + dict_mem_index_add_field(index, "INDEX_ID", 0); + dict_mem_index_add_field(index, "POS", 0); + + index->id = DICT_FIELDS_ID; + err = dict_index_add_to_cache( + index, mach_read_from_4(dict_hdr + DICT_HDR_FIELDS)); + ut_a(err == DB_SUCCESS); + ut_ad(!table->is_instant()); + table->indexes.start->n_core_null_bytes = static_cast<uint8_t>( + UT_BITS_IN_BYTES(unsigned(table->indexes.start->n_nullable))); + + mtr.commit(); + + err = ibuf_init_at_db_start(); + + if (err == DB_SUCCESS || srv_force_recovery >= SRV_FORCE_NO_DDL_UNDO) { + err = DB_SUCCESS; + /* Load definitions of other indexes on system tables */ + + dict_load_sys_table(dict_sys.sys_tables); + dict_load_sys_table(dict_sys.sys_columns); + dict_load_sys_table(dict_sys.sys_indexes); + dict_load_sys_table(dict_sys.sys_fields); + dict_sys.unlock(); + dict_sys.load_sys_tables(); + } else { + dict_sys.unlock(); + } + + return err; +} diff --git a/storage/innobase/dict/dict0crea.cc b/storage/innobase/dict/dict0crea.cc new file mode 100644 index 00000000..cce5f2f2 --- /dev/null +++ b/storage/innobase/dict/dict0crea.cc @@ -0,0 +1,1906 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file dict/dict0crea.cc +Database object creation + +Created 1/8/1996 Heikki Tuuri +*******************************************************/ + +#include "dict0crea.h" +#include "btr0pcur.h" +#ifdef BTR_CUR_HASH_ADAPT +# include "btr0sea.h" +#endif /* BTR_CUR_HASH_ADAPT */ +#include "page0page.h" +#include "mach0data.h" +#include "dict0boot.h" +#include "dict0dict.h" +#include "lock0lock.h" +#include "que0que.h" +#include "row0ins.h" +#include "row0mysql.h" +#include "pars0pars.h" +#include "trx0roll.h" +#include "trx0rseg.h" +#include "trx0undo.h" +#include "ut0vec.h" +#include "fts0priv.h" +#include "srv0start.h" +#include "log.h" + +/*****************************************************************//** +Based on a table object, this function builds the entry to be inserted +in the SYS_TABLES system table. +@return the tuple which should be inserted */ +static +dtuple_t* +dict_create_sys_tables_tuple( +/*=========================*/ + const dict_table_t* table, /*!< in: table */ + mem_heap_t* heap) /*!< in: memory heap from + which the memory for the built + tuple is allocated */ +{ + dtuple_t* entry; + dfield_t* dfield; + byte* ptr; + ulint type; + + ut_ad(table); + ut_ad(!table->space || table->space->id == table->space_id); + ut_ad(heap); + ut_ad(table->n_cols >= DATA_N_SYS_COLS); + + entry = dtuple_create(heap, 8 + DATA_N_SYS_COLS); + + dict_table_copy_types(entry, dict_sys.sys_tables); + + /* 0: NAME -----------------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_TABLES__NAME); + + dfield_set_data(dfield, + table->name.m_name, strlen(table->name.m_name)); + + /* 1: DB_TRX_ID added later */ + /* 2: DB_ROLL_PTR added later */ + /* 3: ID -------------------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_TABLES__ID); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 8)); + mach_write_to_8(ptr, table->id); + + dfield_set_data(dfield, ptr, 8); + + /* 4: N_COLS ---------------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_TABLES__N_COLS); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + + /* If there is any virtual column, encode it in N_COLS */ + mach_write_to_4(ptr, dict_table_encode_n_col( + ulint(table->n_cols - DATA_N_SYS_COLS), + ulint(table->n_v_def)) + | (ulint(table->flags & DICT_TF_COMPACT) << 31)); + dfield_set_data(dfield, ptr, 4); + + /* 5: TYPE (table flags) -----------------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_TABLES__TYPE); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + + /* Validate the table flags and convert them to what is saved in + SYS_TABLES.TYPE. Table flag values 0 and 1 are both written to + SYS_TABLES.TYPE as 1.
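+ For example, both a ROW_FORMAT=REDUNDANT table (flags 0) and a + ROW_FORMAT=COMPACT table (flags 1) end up with TYPE 1 on disk, + so the low-order bit of TYPE is set in every row that this + function produces.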
*/ + type = dict_tf_to_sys_tables_type(table->flags); + mach_write_to_4(ptr, type); + + dfield_set_data(dfield, ptr, 4); + + /* 6: MIX_ID (obsolete) ---------------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_TABLES__MIX_ID); + + ptr = static_cast<byte*>(mem_heap_zalloc(heap, 8)); + + dfield_set_data(dfield, ptr, 8); + + /* 7: MIX_LEN (additional flags) --------------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_TABLES__MIX_LEN); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + /* Be sure all non-used bits are zero. */ + ut_a(!(table->flags2 & DICT_TF2_UNUSED_BIT_MASK)); + mach_write_to_4(ptr, table->flags2); + + dfield_set_data(dfield, ptr, 4); + + /* 8: CLUSTER_NAME ---------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_TABLES__CLUSTER_ID); + dfield_set_null(dfield); /* not supported */ + + /* 9: SPACE ----------------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_TABLES__SPACE); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(ptr, table->space_id); + + dfield_set_data(dfield, ptr, 4); + /*----------------------------------*/ + + return(entry); +} + +/*****************************************************************//** +Based on a table object, this function builds the entry to be inserted +in the SYS_COLUMNS system table. +@return the tuple which should be inserted */ +static +dtuple_t* +dict_create_sys_columns_tuple( +/*==========================*/ + const dict_table_t* table, /*!< in: table */ + ulint i, /*!< in: column number */ + mem_heap_t* heap) /*!< in: memory heap from + which the memory for the built + tuple is allocated */ +{ + dtuple_t* entry; + const dict_col_t* column; + dfield_t* dfield; + byte* ptr; + const char* col_name; + ulint num_base = 0; + ulint v_col_no = ULINT_UNDEFINED; + + ut_ad(table); + ut_ad(heap); + + /* Any column beyond table->n_def would be virtual columns */ + if (i >= table->n_def) { + dict_v_col_t* v_col = dict_table_get_nth_v_col( + table, i - table->n_def); + column = &v_col->m_col; + num_base = v_col->num_base; + v_col_no = column->ind; + } else { + column = dict_table_get_nth_col(table, i); + ut_ad(!column->is_virtual()); + } + + entry = dtuple_create(heap, 7 + DATA_N_SYS_COLS); + + dict_table_copy_types(entry, dict_sys.sys_columns); + + /* 0: TABLE_ID -----------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__TABLE_ID); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 8)); + mach_write_to_8(ptr, table->id); + + dfield_set_data(dfield, ptr, 8); + + /* 1: POS ----------------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__POS); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + + if (v_col_no != ULINT_UNDEFINED) { + /* encode virtual column's position in MySQL table and InnoDB + table in "POS" */ + mach_write_to_4(ptr, dict_create_v_col_pos( + i - table->n_def, v_col_no)); + } else { + mach_write_to_4(ptr, i); + } + + dfield_set_data(dfield, ptr, 4); + + /* 2: DB_TRX_ID added later */ + /* 3: DB_ROLL_PTR added later */ + /* 4: NAME ---------------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__NAME); + + if (i >= table->n_def) { + col_name = dict_table_get_v_col_name(table, i - table->n_def); + } else { + col_name = dict_table_get_col_name(table, i); + } + + dfield_set_data(dfield, col_name, strlen(col_name)); + + /* 5: MTYPE --------------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__MTYPE); + + ptr =
static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(ptr, column->mtype); + + dfield_set_data(dfield, ptr, 4); + + /* 6: PRTYPE -------------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__PRTYPE); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(ptr, column->prtype); + + dfield_set_data(dfield, ptr, 4); + + /* 7: LEN ----------------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__LEN); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(ptr, column->len); + + dfield_set_data(dfield, ptr, 4); + + /* 8: PREC ---------------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_COLUMNS__PREC); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(ptr, num_base); + + dfield_set_data(dfield, ptr, 4); + /*---------------------------------*/ + + return(entry); +} + +/** Based on a table object, this function builds the entry to be inserted +in the SYS_VIRTUAL system table. Each row maps a virtual column to one of +its base columns. +@param[in] table table +@param[in] v_col_n virtual column number +@param[in] b_col_n base column sequence num +@param[in] heap memory heap +@return the tuple which should be inserted */ +static +dtuple_t* +dict_create_sys_virtual_tuple( + const dict_table_t* table, + ulint v_col_n, + ulint b_col_n, + mem_heap_t* heap) +{ + dtuple_t* entry; + const dict_col_t* base_column; + dfield_t* dfield; + byte* ptr; + + ut_ad(table); + ut_ad(heap); + + ut_ad(v_col_n < table->n_v_def); + dict_v_col_t* v_col = dict_table_get_nth_v_col(table, v_col_n); + base_column = v_col->base_col[b_col_n]; + + entry = dtuple_create(heap, DICT_NUM_COLS__SYS_VIRTUAL + + DATA_N_SYS_COLS); + + dict_table_copy_types(entry, dict_sys.sys_virtual); + + /* 0: TABLE_ID -----------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_VIRTUAL__TABLE_ID); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 8)); + mach_write_to_8(ptr, table->id); + + dfield_set_data(dfield, ptr, 8); + + /* 1: POS ---------------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_VIRTUAL__POS); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + ulint v_col_no = dict_create_v_col_pos(v_col_n, v_col->m_col.ind); + mach_write_to_4(ptr, v_col_no); + + dfield_set_data(dfield, ptr, 4); + + /* 2: BASE_POS ----------------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_VIRTUAL__BASE_POS); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(ptr, base_column->ind); + + dfield_set_data(dfield, ptr, 4); + + /* 3: DB_TRX_ID added later */ + /* 4: DB_ROLL_PTR added later */ + + /*---------------------------------*/ + return(entry); +} + +/***************************************************************//** +Builds a table definition to insert.
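+A table identifier is assigned here, and for a file-per-table table +also a fresh tablespace identifier; the SYS_TABLES row itself is +built by dict_create_sys_tables_tuple() and queued for insertion via +ins_node_set_new_row().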
+@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +dict_build_table_def_step( +/*======================*/ + que_thr_t* thr, /*!< in: query thread */ + tab_node_t* node) /*!< in: table create node */ +{ + ut_ad(dict_sys.locked()); + dict_table_t* table = node->table; + ut_ad(!table->is_temporary()); + ut_ad(!table->space); + ut_ad(table->space_id == UINT32_MAX); + dict_hdr_get_new_id(&table->id, nullptr, nullptr); + + /* Always set this bit for all new created tables */ + DICT_TF2_FLAG_SET(table, DICT_TF2_FTS_AUX_HEX_NAME); + DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name", + DICT_TF2_FLAG_UNSET(table, + DICT_TF2_FTS_AUX_HEX_NAME);); + + if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_USE_FILE_PER_TABLE)) { + /* This table will need a new tablespace. */ + + ut_ad(DICT_TF_GET_ZIP_SSIZE(table->flags) == 0 + || dict_table_has_atomic_blobs(table)); + /* Get a new tablespace ID */ + dict_hdr_get_new_id(NULL, NULL, &table->space_id); + + DBUG_EXECUTE_IF( + "ib_create_table_fail_out_of_space_ids", + table->space_id = UINT32_MAX; + ); + + if (table->space_id == UINT32_MAX) { + return DB_ERROR; + } + } else { + ut_ad(dict_tf_get_rec_format(table->flags) + != REC_FORMAT_COMPRESSED); + table->space = fil_system.sys_space; + table->space_id = TRX_SYS_SPACE; + } + + ins_node_set_new_row(node->tab_def, + dict_create_sys_tables_tuple(table, node->heap)); + return DB_SUCCESS; +} + +/** Builds a SYS_VIRTUAL row definition to insert. +@param[in] node table create node */ +static +void +dict_build_v_col_def_step( + tab_node_t* node) +{ + dtuple_t* row; + + row = dict_create_sys_virtual_tuple(node->table, node->col_no, + node->base_col_no, + node->heap); + ins_node_set_new_row(node->v_col_def, row); +} + +/*****************************************************************//** +Based on an index object, this function builds the entry to be inserted +in the SYS_INDEXES system table. 
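+The row layout is (TABLE_ID, ID, NAME, N_FIELDS, TYPE, SPACE, PAGE_NO, +MERGE_THRESHOLD). PAGE_NO is written as FIL_NULL at this point and is +only filled in by dict_create_index_tree_step() once the root page has +been allocated; the NAME of a not-yet-committed index is prefixed with +TEMP_INDEX_PREFIX_STR.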
+@return the tuple which should be inserted */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dtuple_t* +dict_create_sys_indexes_tuple( +/*==========================*/ + const dict_index_t* index, /*!< in: index */ + mem_heap_t* heap) /*!< in: memory heap from + which the memory for the built + tuple is allocated */ +{ + dtuple_t* entry; + dfield_t* dfield; + byte* ptr; + + ut_ad(dict_sys.locked()); + ut_ad(index); + ut_ad(index->table->space || !UT_LIST_GET_LEN(index->table->indexes) + || index->table->file_unreadable); + ut_ad(!index->table->space + || index->table->space->id == index->table->space_id); + ut_ad(heap); + + entry = dtuple_create( + heap, DICT_NUM_COLS__SYS_INDEXES + DATA_N_SYS_COLS); + + dict_table_copy_types(entry, dict_sys.sys_indexes); + + /* 0: TABLE_ID -----------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_INDEXES__TABLE_ID); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 8)); + mach_write_to_8(ptr, index->table->id); + + dfield_set_data(dfield, ptr, 8); + + /* 1: ID ----------------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_INDEXES__ID); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 8)); + mach_write_to_8(ptr, index->id); + + dfield_set_data(dfield, ptr, 8); + + /* 2: DB_TRX_ID added later */ + /* 3: DB_ROLL_PTR added later */ + /* 4: NAME --------------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_INDEXES__NAME); + + if (!index->is_committed()) { + ulint len = strlen(index->name) + 1; + char* name = static_cast<char*>( + mem_heap_alloc(heap, len)); + *name = *TEMP_INDEX_PREFIX_STR; + memcpy(name + 1, index->name, len - 1); + dfield_set_data(dfield, name, len); + } else { + dfield_set_data(dfield, index->name, strlen(index->name)); + } + + /* 5: N_FIELDS ----------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_INDEXES__N_FIELDS); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(ptr, index->n_fields); + + dfield_set_data(dfield, ptr, 4); + + /* 6: TYPE --------------------------*/ + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_INDEXES__TYPE); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(ptr, index->type); + + dfield_set_data(dfield, ptr, 4); + + /* 7: SPACE --------------------------*/ + + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_INDEXES__SPACE); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(ptr, index->table->space_id); + + dfield_set_data(dfield, ptr, 4); + + /* 8: PAGE_NO --------------------------*/ + + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_INDEXES__PAGE_NO); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(ptr, FIL_NULL); + + dfield_set_data(dfield, ptr, 4); + + /* 9: MERGE_THRESHOLD ----------------*/ + + dfield = dtuple_get_nth_field( + entry, DICT_COL__SYS_INDEXES__MERGE_THRESHOLD); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + mach_write_to_4(ptr, DICT_INDEX_MERGE_THRESHOLD_DEFAULT); + + dfield_set_data(dfield, ptr, 4); + + /*--------------------------------*/ + + return(entry); +} + +/*****************************************************************//** +Based on an index object, this function builds the entry to be inserted +in the SYS_FIELDS system table.
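+As a worked example of the POS encoding used below: if any field of the +index has a column prefix or descending order, then field number 2 with +DESC order and a 10-byte prefix is stored as +(2 << 16) | (1 << 15) | 10 = 0x2800A; otherwise POS holds the plain +field number.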
+@return the tuple which should be inserted */ +static +dtuple_t* +dict_create_sys_fields_tuple( +/*=========================*/ + const dict_index_t* index, /*!< in: index */ + ulint fld_no, /*!< in: field number */ + mem_heap_t* heap) /*!< in: memory heap from + which the memory for the built + tuple is allocated */ +{ + dtuple_t* entry; + dict_field_t* field; + dfield_t* dfield; + byte* ptr; + bool wide_pos = false; + + ut_ad(index); + ut_ad(heap); + + for (unsigned j = 0; j < index->n_fields; j++) { + const dict_field_t* f = dict_index_get_nth_field(index, j); + if (f->prefix_len || f->descending) { + wide_pos = true; + break; + } + } + + field = dict_index_get_nth_field(index, fld_no); + + entry = dtuple_create(heap, 3 + DATA_N_SYS_COLS); + + dict_table_copy_types(entry, dict_sys.sys_fields); + + /* 0: INDEX_ID -----------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_FIELDS__INDEX_ID); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 8)); + mach_write_to_8(ptr, index->id); + + dfield_set_data(dfield, ptr, 8); + + /* 1: POS; FIELD NUMBER & PREFIX LENGTH -----------------------*/ + + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_FIELDS__POS); + + ptr = static_cast<byte*>(mem_heap_alloc(heap, 4)); + + if (wide_pos) { + /* If there are column prefixes or columns with + descending order in the index, then we write the + field number to the 16 most significant bits, + the DESC flag to bit 15, and the prefix length + in the 15 least significant bits. */ + mach_write_to_4(ptr, (fld_no << 16) + | (!!field->descending) << 15 + | field->prefix_len); + } else { + /* Else we store just the field number in the 2 + low-order bytes. This keeps the storage format + compatible with InnoDB versions < 4.0.14. */ + + mach_write_to_4(ptr, fld_no); + } + + dfield_set_data(dfield, ptr, 4); + + /* 2: DB_TRX_ID added later */ + /* 3: DB_ROLL_PTR added later */ + /* 4: COL_NAME -------------------------*/ + dfield = dtuple_get_nth_field(entry, DICT_COL__SYS_FIELDS__COL_NAME); + + dfield_set_data(dfield, field->name, strlen(field->name)); + /*---------------------------------*/ + + return(entry); +} + +/*****************************************************************//** +Creates the tuple with which the index entry is searched for writing the index +tree root page number, if such a tree is created. +@return the tuple for search */ +static +dtuple_t* +dict_create_search_tuple( +/*=====================*/ + const dtuple_t* tuple, /*!< in: the tuple inserted in the SYS_INDEXES + table */ + mem_heap_t* heap) /*!< in: memory heap from which the memory for + the built tuple is allocated */ +{ + dtuple_t* search_tuple; + const dfield_t* field1; + dfield_t* field2; + + ut_ad(tuple && heap); + + search_tuple = dtuple_create(heap, 2); + + field1 = dtuple_get_nth_field(tuple, 0); + field2 = dtuple_get_nth_field(search_tuple, 0); + + dfield_copy(field2, field1); + + field1 = dtuple_get_nth_field(tuple, 1); + field2 = dtuple_get_nth_field(search_tuple, 1); + + dfield_copy(field2, field1); + + ut_ad(dtuple_validate(search_tuple)); + + return(search_tuple); +} + +/***************************************************************//** +Builds an index definition row to insert.
+@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +dict_build_index_def_step( +/*======================*/ + que_thr_t* thr, /*!< in: query thread */ + ind_node_t* node) /*!< in: index create node */ +{ + dict_table_t* table; + dict_index_t* index; + dtuple_t* row; + trx_t* trx; + + ut_ad(dict_sys.locked()); + + trx = thr_get_trx(thr); + + index = node->index; + + table = dict_table_open_on_name( + node->table_name, true, DICT_ERR_IGNORE_TABLESPACE); + + if (!table) { + return DB_TABLE_NOT_FOUND; + } + + index->table = table; + + ut_ad((UT_LIST_GET_LEN(table->indexes) > 0) + || dict_index_is_clust(index)); + + dict_hdr_get_new_id(NULL, &index->id, NULL); + + node->page_no = FIL_NULL; + row = dict_create_sys_indexes_tuple(index, node->heap); + node->ind_row = row; + + ins_node_set_new_row(node->ind_def, row); + + /* Note that the index was created by this transaction. */ + index->trx_id = trx->id; + ut_ad(table->def_trx_id <= trx->id); + table->def_trx_id = trx->id; + table->release(); + + return(DB_SUCCESS); +} + +/***************************************************************//** +Builds an index definition without updating SYSTEM TABLES. +@return DB_SUCCESS or error code */ +void +dict_build_index_def( +/*=================*/ + const dict_table_t* table, /*!< in: table */ + dict_index_t* index, /*!< in/out: index */ + trx_t* trx) /*!< in/out: InnoDB transaction handle */ +{ + ut_ad(dict_sys.locked()); + + ut_ad((UT_LIST_GET_LEN(table->indexes) > 0) + || dict_index_is_clust(index)); + + dict_hdr_get_new_id(NULL, &index->id, NULL); + + /* Note that the index was created by this transaction. */ + index->trx_id = trx->id; +} + +/***************************************************************//** +Builds a field definition row to insert. */ +static +void +dict_build_field_def_step( +/*======================*/ + ind_node_t* node) /*!< in: index create node */ +{ + dict_index_t* index; + dtuple_t* row; + + index = node->index; + + row = dict_create_sys_fields_tuple(index, node->field_no, node->heap); + + ins_node_set_new_row(node->field_def, row); +} + +/***************************************************************//** +Creates an index tree for the index. 
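+The root page is allocated with btr_create(), and its page number is +then written to the PAGE_NO column of the SYS_INDEXES record that is +located via a (TABLE_ID, ID) search tuple; FTS indexes are skipped +because they do not use an index tree.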
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +dict_create_index_tree_step( +/*========================*/ + ind_node_t* node) /*!< in: index create node */ +{ + mtr_t mtr; + btr_pcur_t pcur; + dict_index_t* index; + dtuple_t* search_tuple; + + ut_ad(dict_sys.locked()); + + index = node->index; + + if (index->type == DICT_FTS) { + /* FTS index does not need an index tree */ + return(DB_SUCCESS); + } + + /* Run a mini-transaction in which the index tree is allocated for + the index and its root address is written to the index entry in + sys_indexes */ + + mtr.start(); + + search_tuple = dict_create_search_tuple(node->ind_row, node->heap); + node->page_no = FIL_NULL; + pcur.btr_cur.page_cur.index = + UT_LIST_GET_FIRST(dict_sys.sys_indexes->indexes); + + dberr_t err = btr_pcur_open(search_tuple, PAGE_CUR_L, BTR_MODIFY_LEAF, + &pcur, &mtr); + + if (err != DB_SUCCESS) { +func_exit: + mtr.commit(); + return err; + } + + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + + if (UNIV_UNLIKELY(btr_pcur_is_after_last_on_page(&pcur))) { +corrupted: + err = DB_CORRUPTION; + goto func_exit; + } + + ulint len; + byte* data = rec_get_nth_field_old(btr_pcur_get_rec(&pcur), + DICT_FLD__SYS_INDEXES__ID, + &len); + if (UNIV_UNLIKELY(len != 8 || mach_read_from_8(data) != index->id)) { + goto corrupted; + } + + data = rec_get_nth_field_old(btr_pcur_get_rec(&pcur), + DICT_FLD__SYS_INDEXES__PAGE_NO, &len); + if (len != 4) { + goto corrupted; + } + + if (index->is_readable()) { + index->set_modified(mtr); + + node->page_no = btr_create( + index->type, index->table->space, + index->id, index, &mtr, &err); + + DBUG_EXECUTE_IF("ib_import_create_index_failure_1", + node->page_no = FIL_NULL; + err = DB_OUT_OF_FILE_SPACE; ); + } + + mtr.write<4,mtr_t::MAYBE_NOP>(*btr_pcur_get_block(&pcur), data, + node->page_no); + goto func_exit; +} + +/***************************************************************//** +Creates an index tree for the index if it is not a member of a cluster. +Don't update SYSTEM TABLES. +@return error code */ +dberr_t +dict_create_index_tree_in_mem( +/*==========================*/ + dict_index_t* index, /*!< in/out: index */ + const trx_t* trx) /*!< in: InnoDB transaction handle */ +{ + mtr_t mtr; + + ut_ad(dict_sys.locked()); + ut_ad(!(index->type & DICT_FTS)); + + mtr_start(&mtr); + mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO); + + /* Currently this function is being used by temp-tables only. + Import/Discard of temp-table is blocked and so this assert. */ + ut_ad(index->is_readable()); + ut_ad(!(index->table->flags2 & DICT_TF2_DISCARDED)); + + dberr_t err; + index->page = btr_create(index->type, index->table->space, + index->id, index, &mtr, &err); + mtr_commit(&mtr); + + index->trx_id = trx->id; + + return err; +} + +/** Drop the index tree associated with a row in SYS_INDEXES table. 
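+The SYS_INDEXES record is validated field by field, and any length or +offset mismatch is reported as corruption. Nothing is freed when +PAGE_NO is already FIL_NULL, and for a clustered index residing in its +own tablespace the whole tablespace is dropped instead, which is why +its id is returned to the caller.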
+@param[in,out] pcur persistent cursor on rec +@param[in,out] trx dictionary transaction +@param[in,out] mtr mini-transaction +@return tablespace ID to drop (if this is the clustered index) +@retval 0 if no tablespace is to be dropped */ +uint32_t dict_drop_index_tree(btr_pcur_t *pcur, trx_t *trx, mtr_t *mtr) +{ + rec_t *rec= btr_pcur_get_rec(pcur); + + ut_ad(!trx || dict_sys.locked()); + ut_ad(!dict_table_is_comp(dict_sys.sys_indexes)); + btr_pcur_store_position(pcur, mtr); + + static_assert(DICT_FLD__SYS_INDEXES__TABLE_ID == 0, "compatibility"); + static_assert(DICT_FLD__SYS_INDEXES__ID == 1, "compatibility"); + + ulint len= rec_get_n_fields_old(rec); + if (len < DICT_FLD__SYS_INDEXES__MERGE_THRESHOLD || + len > DICT_NUM_FIELDS__SYS_INDEXES) + { +rec_corrupted: + sql_print_error("InnoDB: Corrupted SYS_INDEXES record"); + return 0; + } + + if (rec_get_1byte_offs_flag(rec)) + { + if (rec_1_get_field_end_info(rec, 0) != 8 || + rec_1_get_field_end_info(rec, 1) != 8 + 8) + goto rec_corrupted; + } + else if (rec_2_get_field_end_info(rec, 0) != 8 || + rec_2_get_field_end_info(rec, 1) != 8 + 8) + goto rec_corrupted; + + const byte *p= rec_get_nth_field_old(rec, DICT_FLD__SYS_INDEXES__TYPE, &len); + if (len != 4) + goto rec_corrupted; + const uint32_t type= mach_read_from_4(p); + p= rec_get_nth_field_old(rec, DICT_FLD__SYS_INDEXES__PAGE_NO, &len); + if (len != 4) + goto rec_corrupted; + const uint32_t root_page_no= mach_read_from_4(p); + p= rec_get_nth_field_old(rec, DICT_FLD__SYS_INDEXES__SPACE, &len); + if (len != 4) + goto rec_corrupted; + + const uint32_t space_id= mach_read_from_4(p); + ut_ad(root_page_no == FIL_NULL || space_id <= SRV_SPACE_ID_UPPER_BOUND); + + if (space_id && (type & DICT_CLUSTERED)) + return space_id; + + if (root_page_no == FIL_NULL) + /* The tree has already been freed */; + else if (fil_space_t *s= fil_space_t::get(space_id)) + { + /* Ensure that the tablespace file exists + in order to avoid a crash in buf_page_get_gen(). */ + if (root_page_no < s->get_size()) + { + static_assert(FIL_NULL == 0xffffffff, "compatibility"); + static_assert(DICT_FLD__SYS_INDEXES__PAGE_NO == + DICT_FLD__SYS_INDEXES__SPACE + 1, "compatibility"); + mtr->memset(btr_pcur_get_block(pcur), page_offset(p + 4), 4, 0xff); + btr_free_if_exists(s, root_page_no, mach_read_from_8(rec + 8), mtr); + } + s->release(); + } + + return 0; +} + +/*********************************************************************//** +Creates a table create graph. +@return own: table create node */ +tab_node_t* +tab_create_graph_create( +/*====================*/ + dict_table_t* table, /*!< in: table to create, built as a memory data + structure */ + mem_heap_t* heap) /*!< in: heap where created */ +{ + tab_node_t* node; + + node = static_cast<tab_node_t*>( + mem_heap_alloc(heap, sizeof(tab_node_t))); + + node->common.type = QUE_NODE_CREATE_TABLE; + + node->table = table; + + node->state = TABLE_BUILD_TABLE_DEF; + node->heap = mem_heap_create(256); + + node->tab_def = ins_node_create(INS_DIRECT, dict_sys.sys_tables, + heap); + node->tab_def->common.parent = node; + + node->col_def = ins_node_create(INS_DIRECT, dict_sys.sys_columns, + heap); + node->col_def->common.parent = node; + + node->v_col_def = ins_node_create(INS_DIRECT, dict_sys.sys_virtual, + heap); + node->v_col_def->common.parent = node; + + return(node); +} + +/** Creates an index create graph.
+@param[in] index index to create, built as a memory data structure +@param[in] table table name +@param[in,out] heap heap where created +@param[in] mode encryption mode (for creating a table) +@param[in] key_id encryption key identifier (for creating a table) +@param[in] add_v new virtual columns added in the same clause with + add index +@return own: index create node */ +ind_node_t* +ind_create_graph_create( + dict_index_t* index, + const char* table, + mem_heap_t* heap, + fil_encryption_t mode, + uint32_t key_id, + const dict_add_v_col_t* add_v) +{ + ind_node_t* node; + + node = static_cast<ind_node_t*>( + mem_heap_alloc(heap, sizeof(ind_node_t))); + + node->common.type = QUE_NODE_CREATE_INDEX; + + node->index = index; + + node->table_name = table; + + node->key_id = key_id; + node->mode = mode; + node->add_v = add_v; + + node->state = INDEX_BUILD_INDEX_DEF; + node->page_no = FIL_NULL; + node->heap = mem_heap_create(256); + + node->ind_def = ins_node_create(INS_DIRECT, + dict_sys.sys_indexes, heap); + node->ind_def->common.parent = node; + + node->field_def = ins_node_create(INS_DIRECT, + dict_sys.sys_fields, heap); + node->field_def->common.parent = node; + + return(node); +} + +/***********************************************************//** +Creates a table. This is a high-level function used in SQL execution graphs. +@return query thread to run next or NULL */ +que_thr_t* +dict_create_table_step( +/*===================*/ + que_thr_t* thr) /*!< in: query thread */ +{ + tab_node_t* node; + dberr_t err = DB_ERROR; + trx_t* trx; + + ut_ad(thr); + ut_ad(dict_sys.locked()); + + trx = thr_get_trx(thr); + + node = static_cast<tab_node_t*>(thr->run_node); + + ut_ad(que_node_get_type(node) == QUE_NODE_CREATE_TABLE); + + if (thr->prev_node == que_node_get_parent(node)) { + node->state = TABLE_BUILD_TABLE_DEF; + } + + if (node->state == TABLE_BUILD_TABLE_DEF) { + + /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */ + + err = dict_build_table_def_step(thr, node); + if (err != DB_SUCCESS) { + + goto function_exit; + } + + node->state = TABLE_BUILD_COL_DEF; + node->col_no = 0; + + thr->run_node = node->tab_def; + + return(thr); + } + + if (node->state == TABLE_BUILD_COL_DEF) { + if (node->col_no + DATA_N_SYS_COLS + < (static_cast<ulint>(node->table->n_def) + + static_cast<ulint>(node->table->n_v_def))) { + + ulint i = node->col_no++; + if (i + DATA_N_SYS_COLS >= node->table->n_def) { + i += DATA_N_SYS_COLS; + } + + ins_node_set_new_row( + node->col_def, + dict_create_sys_columns_tuple(node->table, i, + node->heap)); + + thr->run_node = node->col_def; + + return(thr); + } else { + /* Move on to SYS_VIRTUAL table */ + node->col_no = 0; + node->base_col_no = 0; + node->state = TABLE_BUILD_V_COL_DEF; + } + } + + if (node->state == TABLE_BUILD_V_COL_DEF) { + + if (node->col_no < static_cast<ulint>(node->table->n_v_def)) { + dict_v_col_t* v_col = dict_table_get_nth_v_col( + node->table, node->col_no); + + /* If no base column */ + while (v_col->num_base == 0) { + node->col_no++; + if (node->col_no == static_cast<ulint>( + (node->table)->n_v_def)) { + node->state = TABLE_ADD_TO_CACHE; + break; + } + + v_col = dict_table_get_nth_v_col( + node->table, node->col_no); + node->base_col_no = 0; + } + + if (node->state != TABLE_ADD_TO_CACHE) { + ut_ad(node->col_no == v_col->v_pos); + dict_build_v_col_def_step(node); + + if (node->base_col_no + < unsigned{v_col->num_base} - 1) { + /* move on to next base column */ + node->base_col_no++; + } else { + /* move on to next virtual column */ + node->col_no++; + node->base_col_no = 0; + } + + thr->run_node =
node->v_col_def; + + return(thr); + } + } else { + node->state = TABLE_ADD_TO_CACHE; + } + } + + if (node->state == TABLE_ADD_TO_CACHE) { + node->table->can_be_evicted = !node->table->fts; + node->table->add_to_cache(); + + err = DB_SUCCESS; + } + +function_exit: + trx->error_state = err; + + if (err != DB_SUCCESS) { + return(NULL); + } + + thr->run_node = que_node_get_parent(node); + + return(thr); +} + +static dberr_t dict_create_index_space(const ind_node_t &node) +{ + dict_table_t *table= node.index->table; + if (table->space || (table->flags2 & DICT_TF2_DISCARDED)) + return DB_SUCCESS; + ut_ad(table->space_id); + ut_ad(table->space_id < SRV_TMP_SPACE_ID); + /* Determine the tablespace flags. */ + const bool has_data_dir= DICT_TF_HAS_DATA_DIR(table->flags); + ut_ad(!has_data_dir || table->data_dir_path); + char* filepath= fil_make_filepath(has_data_dir + ? table->data_dir_path : nullptr, + table->name, IBD, has_data_dir); + if (!filepath) + return DB_OUT_OF_MEMORY; + + /* We create a new single-table tablespace for the table. + We initially let it be 4 pages: + - page 0 is the fsp header and an extent descriptor page, + - page 1 is an ibuf bitmap page, + - page 2 is the first inode page, + - page 3 will contain the root of the clustered index of + the table we create here. */ + dberr_t err; + table->space= fil_ibd_create(table->space_id, table->name, filepath, + dict_tf_to_fsp_flags(table->flags), + FIL_IBD_FILE_INITIAL_SIZE, + node.mode, node.key_id, &err); + ut_ad((err != DB_SUCCESS) == !table->space); + ut_free(filepath); + + return err; +} + +/***********************************************************//** +Creates an index. This is a high-level function used in SQL execution +graphs. +@return query thread to run next or NULL */ +que_thr_t* +dict_create_index_step( +/*===================*/ + que_thr_t* thr) /*!< in: query thread */ +{ + ind_node_t* node; + dberr_t err = DB_ERROR; + trx_t* trx; + + ut_ad(thr); + ut_ad(dict_sys.locked()); + + trx = thr_get_trx(thr); + + node = static_cast<ind_node_t*>(thr->run_node); + + ut_ad(que_node_get_type(node) == QUE_NODE_CREATE_INDEX); + + if (thr->prev_node == que_node_get_parent(node)) { + node->state = INDEX_BUILD_INDEX_DEF; + } + + if (node->state == INDEX_BUILD_INDEX_DEF) { + /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */ + err = dict_build_index_def_step(thr, node); + + if (err != DB_SUCCESS) { + + goto function_exit; + } + + node->state = INDEX_BUILD_FIELD_DEF; + node->field_no = 0; + + thr->run_node = node->ind_def; + + return(thr); + } + + if (node->state == INDEX_BUILD_FIELD_DEF) { + err = dict_create_index_space(*node); + if (err != DB_SUCCESS) { + dict_mem_index_free(node->index); + node->index = nullptr; + goto function_exit; + } + + if (node->field_no < (node->index)->n_fields) { + + dict_build_field_def_step(node); + + node->field_no++; + + thr->run_node = node->field_def; + + return(thr); + } else { + node->state = INDEX_ADD_TO_CACHE; + } + } + + if (node->state == INDEX_ADD_TO_CACHE) { + err = dict_index_add_to_cache(node->index, FIL_NULL, + node->add_v); + + ut_ad(!node->index == (err != DB_SUCCESS)); + + if (!node->index) { + goto function_exit; + } + + ut_ad(!node->index->is_instant()); + ut_ad(node->index->n_core_null_bytes + == ((dict_index_is_clust(node->index) + && node->index->table->supports_instant()) + ?
dict_index_t::NO_CORE_NULL_BYTES + : UT_BITS_IN_BYTES( + unsigned(node->index->n_nullable)))); + node->index->n_core_null_bytes = static_cast<uint8_t>( + UT_BITS_IN_BYTES(unsigned(node->index->n_nullable))); + node->state = INDEX_CREATE_INDEX_TREE; + } + + if (node->state == INDEX_CREATE_INDEX_TREE) { + + err = dict_create_index_tree_step(node); + + DBUG_EXECUTE_IF("ib_dict_create_index_tree_fail", + err = DB_OUT_OF_MEMORY;); + + if (err != DB_SUCCESS) { + dict_table_t* table = node->index->table; + /* If this is a FTS index, we will need to remove + it from fts->cache->indexes list as well */ + if (!(node->index->type & DICT_FTS)) { + } else if (auto fts = table->fts) { + fts_index_cache_t* index_cache; + + mysql_mutex_lock(&fts->cache->init_lock); + + index_cache = (fts_index_cache_t*) + fts_find_index_cache( + fts->cache, + node->index); + + if (index_cache->words) { + rbt_free(index_cache->words); + index_cache->words = 0; + } + + ib_vector_remove( + fts->cache->indexes, + *reinterpret_cast<void**>(index_cache)); + + mysql_mutex_unlock(&fts->cache->init_lock); + } + +#ifdef BTR_CUR_HASH_ADAPT + ut_ad(!node->index->search_info->ref_count); +#endif /* BTR_CUR_HASH_ADAPT */ + dict_index_remove_from_cache(table, node->index); + node->index = NULL; + + goto function_exit; + } + + node->index->page = node->page_no; + /* These should have been set in + dict_build_index_def_step() and + dict_index_add_to_cache(). */ + ut_ad(node->index->trx_id == trx->id); + ut_ad(node->index->table->def_trx_id == trx->id); + } + +function_exit: + trx->error_state = err; + + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + return nullptr; + } + + thr->run_node = que_node_get_parent(node); + + return(thr); +} + +bool dict_sys_t::load_sys_tables() +{ + ut_ad(!srv_any_background_activity()); + bool mismatch= false; + lock(SRW_LOCK_CALL); + if (!(sys_foreign= load_table(SYS_TABLE[SYS_FOREIGN], + DICT_ERR_IGNORE_FK_NOKEY))); + else if (UT_LIST_GET_LEN(sys_foreign->indexes) == 3 && + sys_foreign->n_cols == DICT_NUM_COLS__SYS_FOREIGN + DATA_N_SYS_COLS) + prevent_eviction(sys_foreign); + else + { + sys_foreign= nullptr; + mismatch= true; + sql_print_error("InnoDB: Invalid definition of SYS_FOREIGN"); + } + if (!(sys_foreign_cols= load_table(SYS_TABLE[SYS_FOREIGN_COLS], + DICT_ERR_IGNORE_FK_NOKEY))); + else if (UT_LIST_GET_LEN(sys_foreign_cols->indexes) == 1 && + sys_foreign_cols->n_cols == + DICT_NUM_COLS__SYS_FOREIGN_COLS + DATA_N_SYS_COLS) + prevent_eviction(sys_foreign_cols); + else + { + sys_foreign_cols= nullptr; + mismatch= true; + sql_print_error("InnoDB: Invalid definition of SYS_FOREIGN_COLS"); + } + if (!(sys_virtual= load_table(SYS_TABLE[SYS_VIRTUAL], + DICT_ERR_IGNORE_FK_NOKEY))); + else if (UT_LIST_GET_LEN(sys_virtual->indexes) == 1 && + sys_virtual->n_cols == DICT_NUM_COLS__SYS_VIRTUAL + DATA_N_SYS_COLS) + prevent_eviction(sys_virtual); + else + { + sys_virtual= nullptr; + mismatch= true; + sql_print_error("InnoDB: Invalid definition of SYS_VIRTUAL"); + } + unlock(); + return mismatch; +} + +dberr_t dict_sys_t::create_or_check_sys_tables() +{ + if (sys_tables_exist()) + return DB_SUCCESS; + + if (srv_read_only_mode || srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO) + return DB_READ_ONLY; + + if (load_sys_tables()) + { + sql_print_information("InnoDB: Set innodb_read_only=1 " + "or innodb_force_recovery=3 to start up"); + return DB_CORRUPTION; + } + + if (sys_tables_exist()) + return DB_SUCCESS; + + trx_t *trx= trx_create(); + trx_start_for_ddl(trx); + + { + /* Do not bother with transactional memory; this is only + executed at
startup, with no conflicts present. */ + LockMutexGuard g{SRW_LOCK_CALL}; + trx->mutex_lock(); + lock_table_create(dict_sys.sys_tables, LOCK_X, trx); + lock_table_create(dict_sys.sys_columns, LOCK_X, trx); + lock_table_create(dict_sys.sys_indexes, LOCK_X, trx); + lock_table_create(dict_sys.sys_fields, LOCK_X, trx); + trx->mutex_unlock(); + } + + row_mysql_lock_data_dictionary(trx); + + /* NOTE: when designing InnoDB's foreign key support in 2001, Heikki Tuuri + made a mistake and defined table names and the foreign key id to be of type + CHAR (internally, really VARCHAR). The type should have been VARBINARY. */ + + /* System tables are always created inside the system tablespace. */ + const auto srv_file_per_table_backup= srv_file_per_table; + srv_file_per_table= 0; + dberr_t error; + span<const char> tablename; + + if (!sys_foreign) + { + error= que_eval_sql(nullptr, "PROCEDURE CREATE_FOREIGN() IS\n" + "BEGIN\n" + "CREATE TABLE\n" + "SYS_FOREIGN(ID CHAR, FOR_NAME CHAR," + " REF_NAME CHAR, N_COLS INT);\n" + "CREATE UNIQUE CLUSTERED INDEX ID_IND" + " ON SYS_FOREIGN (ID);\n" + "CREATE INDEX FOR_IND" + " ON SYS_FOREIGN (FOR_NAME);\n" + "CREATE INDEX REF_IND" + " ON SYS_FOREIGN (REF_NAME);\n" + "END;\n", trx); + if (UNIV_UNLIKELY(error != DB_SUCCESS)) + { + tablename= SYS_TABLE[SYS_FOREIGN]; +err_exit: + sql_print_error("InnoDB: Creation of %.*s failed: %s", + int(tablename.size()), tablename.data(), + ut_strerr(error)); + trx->rollback(); + row_mysql_unlock_data_dictionary(trx); + trx->free(); + srv_file_per_table= srv_file_per_table_backup; + return error; + } + } + if (!sys_foreign_cols) + { + error= que_eval_sql(nullptr, "PROCEDURE CREATE_FOREIGN_COLS() IS\n" + "BEGIN\n" + "CREATE TABLE\n" + "SYS_FOREIGN_COLS(ID CHAR, POS INT," + " FOR_COL_NAME CHAR, REF_COL_NAME CHAR);\n" + "CREATE UNIQUE CLUSTERED INDEX ID_IND" + " ON SYS_FOREIGN_COLS (ID, POS);\n" + "END;\n", trx); + if (UNIV_UNLIKELY(error != DB_SUCCESS)) + { + tablename= SYS_TABLE[SYS_FOREIGN_COLS]; + goto err_exit; + } + } + if (!sys_virtual) + { + error= que_eval_sql(nullptr, "PROCEDURE CREATE_VIRTUAL() IS\n" + "BEGIN\n" + "CREATE TABLE\n" + "SYS_VIRTUAL(TABLE_ID BIGINT,POS INT,BASE_POS INT);\n" + "CREATE UNIQUE CLUSTERED INDEX BASE_IDX" + " ON SYS_VIRTUAL(TABLE_ID, POS, BASE_POS);\n" + "END;\n", trx); + if (UNIV_UNLIKELY(error != DB_SUCCESS)) + { + tablename= SYS_TABLE[SYS_VIRTUAL]; + goto err_exit; + } + } + + trx->commit(); + row_mysql_unlock_data_dictionary(trx); + trx->free(); + srv_file_per_table= srv_file_per_table_backup; + + lock(SRW_LOCK_CALL); + if (sys_foreign); + else if (!(sys_foreign= load_table(SYS_TABLE[SYS_FOREIGN]))) + { + tablename= SYS_TABLE[SYS_FOREIGN]; +load_fail: + unlock(); + sql_print_error("InnoDB: Failed to CREATE TABLE %.*s", + int(tablename.size()), tablename.data()); + return DB_TABLE_NOT_FOUND; + } + else + prevent_eviction(sys_foreign); + + if (sys_foreign_cols); + else if (!(sys_foreign_cols= load_table(SYS_TABLE[SYS_FOREIGN_COLS]))) + { + tablename= SYS_TABLE[SYS_FOREIGN_COLS]; + goto load_fail; + } + else + prevent_eviction(sys_foreign_cols); + + if (sys_virtual); + else if (!(sys_virtual= load_table(SYS_TABLE[SYS_VIRTUAL]))) + { + tablename= SYS_TABLE[SYS_VIRTUAL]; + goto load_fail; + } + else + prevent_eviction(sys_virtual); + + unlock(); + return DB_SUCCESS; +} + +/****************************************************************//** +Evaluate the given foreign key SQL statement.
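+DB_DUPLICATE_KEY is explained at length in dict_foreign_err_file, since +it normally means that a constraint with the same (case-insensitive) +name already exists; any other failure is logged and passed back to the +caller.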
+@return error code or DB_SUCCESS */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +dict_foreign_eval_sql( +/*==================*/ + pars_info_t* info, /*!< in: info struct */ + const char* sql, /*!< in: SQL string to evaluate */ + const char* name, /*!< in: table name (for diagnostics) */ + const char* id, /*!< in: foreign key id */ + trx_t* trx) /*!< in/out: transaction */ +{ + FILE* ef = dict_foreign_err_file; + + dberr_t error = que_eval_sql(info, sql, trx); + + switch (error) { + case DB_SUCCESS: + break; + case DB_DUPLICATE_KEY: + mysql_mutex_lock(&dict_foreign_err_mutex); + rewind(ef); + ut_print_timestamp(ef); + fputs(" Error in foreign key constraint creation for table ", + ef); + ut_print_name(ef, trx, name); + fputs(".\nA foreign key constraint of name ", ef); + ut_print_name(ef, trx, id); + fputs("\nalready exists." + " (Note that internally InnoDB adds 'databasename'\n" + "in front of the user-defined constraint name.)\n" + "Note that InnoDB's FOREIGN KEY system tables store\n" + "constraint names as case-insensitive, with the\n" + "MariaDB standard latin1_swedish_ci collation. If you\n" + "create tables or databases whose names differ only in\n" + "the character case, then collisions in constraint\n" + "names can occur. Workaround: name your constraints\n" + "explicitly with unique names.\n", + ef); + goto release; + default: + sql_print_error("InnoDB: " + "Foreign key constraint creation failed: %s", + ut_strerr(error)); + + mysql_mutex_lock(&dict_foreign_err_mutex); + ut_print_timestamp(ef); + fputs(" Internal error in foreign key constraint creation" + " for table ", ef); + ut_print_name(ef, trx, name); + fputs(".\n" + "See the MariaDB .err log in the datadir" + " for more information.\n", ef); +release: + mysql_mutex_unlock(&dict_foreign_err_mutex); + } + + return error; +} + +/********************************************************************//** +Add a single foreign key field definition to the data dictionary tables in +the database. +@return error code or DB_SUCCESS */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +dict_create_add_foreign_field_to_dictionary( +/*========================================*/ + ulint field_nr, /*!< in: field number */ + const char* table_name, /*!< in: table name */ + const dict_foreign_t* foreign, /*!< in: foreign */ + trx_t* trx) /*!< in/out: transaction */ +{ + DBUG_ENTER("dict_create_add_foreign_field_to_dictionary"); + + pars_info_t* info = pars_info_create(); + + pars_info_add_str_literal(info, "id", foreign->id); + + pars_info_add_int4_literal(info, "pos", field_nr); + + pars_info_add_str_literal(info, "for_col_name", + foreign->foreign_col_names[field_nr]); + + pars_info_add_str_literal(info, "ref_col_name", + foreign->referenced_col_names[field_nr]); + + DBUG_RETURN(dict_foreign_eval_sql( + info, + "PROCEDURE P () IS\n" + "BEGIN\n" + "INSERT INTO SYS_FOREIGN_COLS VALUES" + "(:id, :pos, :for_col_name, :ref_col_name);\n" + "END;\n", + table_name, foreign->id, trx)); +} + +/********************************************************************//** +Construct foreign key constraint definition from data dictionary information.
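+For illustration, with made-up names the generated string looks like +CONSTRAINT fk1 FOREIGN KEY (a,b) REFERENCES t2 (x,y); all identifiers +are converted with innobase_convert_name() for display.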
+*/ +static +char* +dict_foreign_def_get( +/*=================*/ + dict_foreign_t* foreign,/*!< in: foreign */ + trx_t* trx) /*!< in: trx */ +{ + char* fk_def = (char *)mem_heap_alloc(foreign->heap, 4*1024); + const char* tbname; + char tablebuf[MAX_TABLE_NAME_LEN + 1] = ""; + unsigned i; + char* bufend; + + tbname = dict_remove_db_name(foreign->id); + bufend = innobase_convert_name(tablebuf, MAX_TABLE_NAME_LEN, + tbname, strlen(tbname), trx->mysql_thd); + tablebuf[bufend - tablebuf] = '\0'; + + sprintf(fk_def, + (char *)"CONSTRAINT %s FOREIGN KEY (", (char *)tablebuf); + + for(i = 0; i < foreign->n_fields; i++) { + char buf[MAX_TABLE_NAME_LEN + 1] = ""; + innobase_convert_name(buf, MAX_TABLE_NAME_LEN, + foreign->foreign_col_names[i], + strlen(foreign->foreign_col_names[i]), + trx->mysql_thd); + strcat(fk_def, buf); + if (i < static_cast<uint>(foreign->n_fields-1)) { + strcat(fk_def, (char *)","); + } + } + + strcat(fk_def,(char *)") REFERENCES "); + + bufend = innobase_convert_name(tablebuf, MAX_TABLE_NAME_LEN, + foreign->referenced_table_name, + strlen(foreign->referenced_table_name), + trx->mysql_thd); + tablebuf[bufend - tablebuf] = '\0'; + + strcat(fk_def, tablebuf); + strcat(fk_def, " ("); + + for(i = 0; i < foreign->n_fields; i++) { + char buf[MAX_TABLE_NAME_LEN + 1] = ""; + bufend = innobase_convert_name(buf, MAX_TABLE_NAME_LEN, + foreign->referenced_col_names[i], + strlen(foreign->referenced_col_names[i]), + trx->mysql_thd); + buf[bufend - buf] = '\0'; + strcat(fk_def, buf); + if (i < (uint)foreign->n_fields-1) { + strcat(fk_def, (char *)","); + } + } + strcat(fk_def, (char *)")"); + + return fk_def; +} + +/********************************************************************//** +Convert foreign key column names from data dictionary to SQL-layer. +*/ +static +void +dict_foreign_def_get_fields( +/*========================*/ + dict_foreign_t* foreign,/*!< in: foreign */ + trx_t* trx, /*!< in: trx */ + char** field, /*!< out: foreign column */ + char** field2, /*!< out: referenced column */ + ulint col_no) /*!< in: column number */ +{ + char* bufend; + char* fieldbuf = (char *)mem_heap_alloc(foreign->heap, MAX_TABLE_NAME_LEN+1); + char* fieldbuf2 = (char *)mem_heap_alloc(foreign->heap, MAX_TABLE_NAME_LEN+1); + + bufend = innobase_convert_name(fieldbuf, MAX_TABLE_NAME_LEN, + foreign->foreign_col_names[col_no], + strlen(foreign->foreign_col_names[col_no]), + trx->mysql_thd); + + fieldbuf[bufend - fieldbuf] = '\0'; + + bufend = innobase_convert_name(fieldbuf2, MAX_TABLE_NAME_LEN, + foreign->referenced_col_names[col_no], + strlen(foreign->referenced_col_names[col_no]), + trx->mysql_thd); + + fieldbuf2[bufend - fieldbuf2] = '\0'; + *field = fieldbuf; + *field2 = fieldbuf2; +} + +/********************************************************************//** +Add a foreign key definition to the data dictionary tables.
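+SYS_FOREIGN.N_COLS packs the column count into its low 24 bits and the +constraint type flags into the high 8 bits; for example, a two-column +constraint with DICT_FOREIGN_ON_DELETE_CASCADE (1) is stored as +2 | (1 << 24) = 0x1000002. One SYS_FOREIGN_COLS row is then inserted +per referencing/referenced column pair.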
+@return error code or DB_SUCCESS */
+dberr_t
+dict_create_add_foreign_to_dictionary(
+/*==================================*/
+ const char* name, /*!< in: table name */
+ const dict_foreign_t* foreign,/*!< in: foreign key */
+ trx_t* trx) /*!< in/out: dictionary transaction */
+{
+ dberr_t error;
+
+ DBUG_ENTER("dict_create_add_foreign_to_dictionary");
+
+ pars_info_t* info = pars_info_create();
+
+ pars_info_add_str_literal(info, "id", foreign->id);
+
+ pars_info_add_str_literal(info, "for_name", name);
+
+ pars_info_add_str_literal(info, "ref_name",
+ foreign->referenced_table_name);
+
+ pars_info_add_int4_literal(info, "n_cols",
+ ulint(foreign->n_fields)
+ | (ulint(foreign->type) << 24));
+
+ DBUG_PRINT("dict_create_add_foreign_to_dictionary",
+ ("'%s', '%s', '%s', %d", foreign->id, name,
+ foreign->referenced_table_name,
+ foreign->n_fields + (foreign->type << 24)));
+
+ error = dict_foreign_eval_sql(info,
+ "PROCEDURE P () IS\n"
+ "BEGIN\n"
+ "INSERT INTO SYS_FOREIGN VALUES"
+ "(:id, :for_name, :ref_name, :n_cols);\n"
+ "END;\n"
+ , name, foreign->id, trx);
+
+ if (error != DB_SUCCESS) {
+
+ if (error == DB_DUPLICATE_KEY) {
+ char buf[MAX_TABLE_NAME_LEN + 1] = "";
+ char tablename[MAX_TABLE_NAME_LEN + 1] = "";
+ char* fk_def;
+
+ innobase_convert_name(tablename, MAX_TABLE_NAME_LEN,
+ name, strlen(name), trx->mysql_thd);
+
+ innobase_convert_name(buf, MAX_TABLE_NAME_LEN,
+ foreign->id, strlen(foreign->id), trx->mysql_thd);
+
+ fk_def = dict_foreign_def_get((dict_foreign_t*)foreign, trx);
+
+ ib_push_warning(trx, error,
+ "Create or Alter table %s with foreign key constraint"
+ " failed. Foreign key constraint %s"
+ " already exists on data dictionary."
+ " Foreign key constraint names need to be unique in database."
+ " Error in foreign key definition: %s.",
+ tablename, buf, fk_def);
+ }
+
+ DBUG_RETURN(error);
+ }
+
+ for (ulint i = 0; i < foreign->n_fields; i++) {
+ error = dict_create_add_foreign_field_to_dictionary(
+ i, name, foreign, trx);
+
+ if (error != DB_SUCCESS) {
+ char buf[MAX_TABLE_NAME_LEN + 1] = "";
+ char tablename[MAX_TABLE_NAME_LEN + 1] = "";
+ char* field=NULL;
+ char* field2=NULL;
+ char* fk_def;
+
+ innobase_convert_name(tablename, MAX_TABLE_NAME_LEN,
+ name, strlen(name), trx->mysql_thd);
+ innobase_convert_name(buf, MAX_TABLE_NAME_LEN,
+ foreign->id, strlen(foreign->id), trx->mysql_thd);
+ fk_def = dict_foreign_def_get((dict_foreign_t*)foreign, trx);
+ dict_foreign_def_get_fields((dict_foreign_t*)foreign, trx, &field, &field2, i);
+
+ ib_push_warning(trx, error,
+ "Create or Alter table %s with foreign key constraint"
+ " failed. Error adding foreign key constraint name %s"
+ " fields %s or %s to the dictionary."
+ " Error in foreign key definition: %s.",
+ tablename, buf, field, field2, fk_def);
+
+ DBUG_RETURN(error);
+ }
+ }
+
+ DBUG_RETURN(error);
+}
+
+/** Check if a foreign constraint is on the given column name.
+@param[in] col_name column name to be searched for fk constraint
+@param[in] table table to which foreign key constraint belongs
+@return true if fk constraint is present on the table, false otherwise.
*/ +static +bool +dict_foreign_base_for_stored( + const char* col_name, + const dict_table_t* table) +{ + /* Loop through each stored column and check if its base column has + the same name as the column name being checked */ + dict_s_col_list::const_iterator it; + for (it = table->s_cols->begin(); + it != table->s_cols->end(); ++it) { + dict_s_col_t s_col = *it; + + for (ulint j = 0; j < s_col.num_base; j++) { + if (strcmp(col_name, dict_table_get_col_name( + table, + s_col.base_col[j]->ind)) == 0) { + return(true); + } + } + } + + return(false); +} + +/** Check if a foreign constraint is on columns served as base columns +of any stored column. This is to prevent creating SET NULL or CASCADE +constraint on such columns +@param[in] local_fk_set set of foreign key objects, to be added to +the dictionary tables +@param[in] table table to which the foreign key objects in +local_fk_set belong to +@return true if yes, otherwise, false */ +bool +dict_foreigns_has_s_base_col( + const dict_foreign_set& local_fk_set, + const dict_table_t* table) +{ + dict_foreign_t* foreign; + + if (table->s_cols == NULL) { + return (false); + } + + for (dict_foreign_set::const_iterator it = local_fk_set.begin(); + it != local_fk_set.end(); ++it) { + + foreign = *it; + ulint type = foreign->type; + + type &= ~(DICT_FOREIGN_ON_DELETE_NO_ACTION + | DICT_FOREIGN_ON_UPDATE_NO_ACTION); + + if (type == 0) { + continue; + } + + for (ulint i = 0; i < foreign->n_fields; i++) { + /* Check if the constraint is on a column that + is a base column of any stored column */ + if (dict_foreign_base_for_stored( + foreign->foreign_col_names[i], table)) { + return(true); + } + } + } + + return(false); +} + +/** Adds the given set of foreign key objects to the dictionary tables +in the database. This function does not modify the dictionary cache. The +caller must ensure that all foreign key objects contain a valid constraint +name in foreign->id. +@param[in] local_fk_set set of foreign key objects, to be added to +the dictionary tables +@param[in] table table to which the foreign key objects in +local_fk_set belong to +@param[in,out] trx transaction +@return error code or DB_SUCCESS */ +dberr_t +dict_create_add_foreigns_to_dictionary( +/*===================================*/ + const dict_foreign_set& local_fk_set, + const dict_table_t* table, + trx_t* trx) +{ + ut_ad(dict_sys.locked()); + + if (!dict_sys.sys_foreign) + { + sql_print_error("InnoDB: Table SYS_FOREIGN not found" + " in internal data dictionary"); + return DB_ERROR; + } + + for (auto fk : local_fk_set) + if (dberr_t error= + dict_create_add_foreign_to_dictionary(table->name.m_name, fk, trx)) + return error; + + return DB_SUCCESS; +} diff --git a/storage/innobase/dict/dict0defrag_bg.cc b/storage/innobase/dict/dict0defrag_bg.cc new file mode 100644 index 00000000..bec6da8e --- /dev/null +++ b/storage/innobase/dict/dict0defrag_bg.cc @@ -0,0 +1,434 @@ +/***************************************************************************** + +Copyright (c) 2016, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file dict/dict0defrag_bg.cc +Defragmentation routines. + +Created 25/08/2016 Jan Lindström +*******************************************************/ + +#include "dict0dict.h" +#include "dict0stats.h" +#include "dict0stats_bg.h" +#include "dict0defrag_bg.h" +#include "btr0btr.h" +#include "srv0start.h" +#include "trx0trx.h" +#include "lock0lock.h" +#include "row0mysql.h" + +static mysql_mutex_t defrag_pool_mutex; + +/** Iterator type for iterating over the elements of objects of type +defrag_pool_t. */ +typedef defrag_pool_t::iterator defrag_pool_iterator_t; + +/** Pool where we store information on which tables are to be processed +by background defragmentation. */ +defrag_pool_t defrag_pool; + + +/*****************************************************************//** +Initialize the defrag pool, called once during thread initialization. */ +void +dict_defrag_pool_init(void) +/*=======================*/ +{ + ut_ad(!srv_read_only_mode); + mysql_mutex_init(0, &defrag_pool_mutex, nullptr); +} + +/*****************************************************************//** +Free the resources occupied by the defrag pool, called once during +thread de-initialization. */ +void +dict_defrag_pool_deinit(void) +/*=========================*/ +{ + ut_ad(!srv_read_only_mode); + + mysql_mutex_destroy(&defrag_pool_mutex); +} + +/*****************************************************************//** +Get an index from the auto defrag pool. The returned index id is removed +from the pool. +@return true if the pool was non-empty and "id" was set, false otherwise */ +static +bool +dict_stats_defrag_pool_get( +/*=======================*/ + table_id_t* table_id, /*!< out: table id, or unmodified if + list is empty */ + index_id_t* index_id) /*!< out: index id, or unmodified if + list is empty */ +{ + ut_ad(!srv_read_only_mode); + + mysql_mutex_lock(&defrag_pool_mutex); + + if (defrag_pool.empty()) { + mysql_mutex_unlock(&defrag_pool_mutex); + return(false); + } + + defrag_pool_item_t& item = defrag_pool.back(); + *table_id = item.table_id; + *index_id = item.index_id; + + defrag_pool.pop_back(); + + mysql_mutex_unlock(&defrag_pool_mutex); + + return(true); +} + +/*****************************************************************//** +Add an index in a table to the defrag pool, which is processed by the +background stats gathering thread. Only the table id and index id are +added to the list, so the table can be closed after being enqueued and +it will be opened when needed. If the table or index does not exist later +(has been DROPped), then it will be removed from the pool and skipped. 
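+A sketch of the intended pairing between the producer and the background
+statistics thread:
+@code
+dict_stats_defrag_pool_add(index);                 // remember {table_id, index_id}
+// ... later, in the background thread:
+dict_defrag_process_entries_from_defrag_pool(thd); // drain pool, persist stats
+@endcode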
+*/
+void
+dict_stats_defrag_pool_add(
+/*=======================*/
+ const dict_index_t* index) /*!< in: index to add */
+{
+ defrag_pool_item_t item;
+
+ ut_ad(!srv_read_only_mode);
+
+ mysql_mutex_lock(&defrag_pool_mutex);
+
+ /* quit if already in the list */
+ for (defrag_pool_iterator_t iter = defrag_pool.begin();
+ iter != defrag_pool.end();
+ ++iter) {
+ if ((*iter).table_id == index->table->id
+ && (*iter).index_id == index->id) {
+ mysql_mutex_unlock(&defrag_pool_mutex);
+ return;
+ }
+ }
+
+ item.table_id = index->table->id;
+ item.index_id = index->id;
+ defrag_pool.push_back(item);
+ if (defrag_pool.size() == 1) {
+ /* Kick off dict stats optimizer work */
+ dict_stats_schedule_now();
+ }
+ mysql_mutex_unlock(&defrag_pool_mutex);
+}
+
+/*****************************************************************//**
+Delete a given index from the auto defrag pool. */
+void
+dict_stats_defrag_pool_del(
+/*=======================*/
+ const dict_table_t* table, /*!< in: table (NULL if index is given) */
+ const dict_index_t* index) /*!< in: index (NULL if table is given) */
+{
+ ut_a((table && !index) || (!table && index));
+ ut_ad(!srv_read_only_mode);
+
+ mysql_mutex_lock(&defrag_pool_mutex);
+
+ /* Search the defrag pool and delete all matching entries */
+ for (defrag_pool_iterator_t iter = defrag_pool.begin();
+ iter != defrag_pool.end();) {
+ if ((table && (*iter).table_id == table->id)
+ || (index
+ && (*iter).table_id == index->table->id
+ && (*iter).index_id == index->id)) {
+ /* erase() invalidates the iterator */
+ iter = defrag_pool.erase(iter);
+ if (index)
+ break;
+ } else {
+ iter++;
+ }
+ }
+
+ mysql_mutex_unlock(&defrag_pool_mutex);
+}
+
+/*****************************************************************//**
+Get the first index that has been added for updating persistent defrag
+stats and eventually save its stats. */
+static void dict_stats_process_entry_from_defrag_pool(THD *thd)
+{
+ table_id_t table_id;
+ index_id_t index_id;
+
+ ut_ad(!srv_read_only_mode);
+
+ /* pop the first index from the auto defrag pool */
+ if (!dict_stats_defrag_pool_get(&table_id, &index_id))
+ /* no index in defrag pool */
+ return;
+
+ /* If the table is no longer cached, we've already lost the in
+ memory stats so there's nothing really to write to disk. */
+ MDL_ticket *mdl= nullptr;
+ if (dict_table_t *table=
+ dict_table_open_on_id(table_id, false, DICT_TABLE_OP_OPEN_ONLY_IF_CACHED,
+ thd, &mdl))
+ {
+ if (dict_index_t *index= !table->corrupted
+ ? dict_table_find_index_on_id(table, index_id) : nullptr)
+ if (index->is_btree())
+ dict_stats_save_defrag_stats(index);
+ dict_table_close(table, false, thd, mdl);
+ }
+}
+
+/**
+Process all entries in the defrag pool: save the defragmentation
+statistics of every index that is still cached. */
+void dict_defrag_process_entries_from_defrag_pool(THD *thd)
+{
+ while (!defrag_pool.empty())
+ dict_stats_process_entry_from_defrag_pool(thd);
+}
+
+/*********************************************************************//**
+Save defragmentation result.
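+The result is written through dict_stats_save_index_stat() as a single
+persistent statistics row, conceptually:
+@code
+// stat_name='n_pages_freed', stat_value=index->stat_defrag_n_pages_freed
+@endcode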
+@return DB_SUCCESS or error code */ +dberr_t dict_stats_save_defrag_summary(dict_index_t *index, THD *thd) +{ + if (index->is_ibuf()) + return DB_SUCCESS; + + MDL_ticket *mdl_table= nullptr, *mdl_index= nullptr; + dict_table_t *table_stats= dict_table_open_on_name(TABLE_STATS_NAME, false, + DICT_ERR_IGNORE_NONE); + if (table_stats) + { + dict_sys.freeze(SRW_LOCK_CALL); + table_stats= dict_acquire_mdl_shared(table_stats, thd, &mdl_table); + dict_sys.unfreeze(); + } + if (!table_stats || strcmp(table_stats->name.m_name, TABLE_STATS_NAME)) + { +release_and_exit: + if (table_stats) + dict_table_close(table_stats, false, thd, mdl_table); + return DB_STATS_DO_NOT_EXIST; + } + + dict_table_t *index_stats= dict_table_open_on_name(INDEX_STATS_NAME, false, + DICT_ERR_IGNORE_NONE); + if (index_stats) + { + dict_sys.freeze(SRW_LOCK_CALL); + index_stats= dict_acquire_mdl_shared(index_stats, thd, &mdl_index); + dict_sys.unfreeze(); + } + if (!index_stats) + goto release_and_exit; + if (strcmp(index_stats->name.m_name, INDEX_STATS_NAME)) + { + dict_table_close(index_stats, false, thd, mdl_index); + goto release_and_exit; + } + + trx_t *trx= trx_create(); + trx->mysql_thd= thd; + trx_start_internal(trx); + dberr_t ret= trx->read_only + ? DB_READ_ONLY + : lock_table_for_trx(table_stats, trx, LOCK_X); + if (ret == DB_SUCCESS) + ret= lock_table_for_trx(index_stats, trx, LOCK_X); + row_mysql_lock_data_dictionary(trx); + if (ret == DB_SUCCESS) + ret= dict_stats_save_index_stat(index, time(nullptr), "n_pages_freed", + index->stat_defrag_n_pages_freed, + nullptr, + "Number of pages freed during" + " last defragmentation run.", + trx); + if (ret == DB_SUCCESS) + trx->commit(); + else + trx->rollback(); + + if (table_stats) + dict_table_close(table_stats, true, thd, mdl_table); + if (index_stats) + dict_table_close(index_stats, true, thd, mdl_index); + + row_mysql_unlock_data_dictionary(trx); + trx->free(); + + return ret; +} + +/**************************************************************//** +Gets the number of reserved and used pages in a B-tree. +@return number of pages reserved, or ULINT_UNDEFINED if the index +is unavailable */ +static +ulint +btr_get_size_and_reserved( + dict_index_t* index, /*!< in: index */ + ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */ + ulint* used, /*!< out: number of pages used (<= reserved) */ + mtr_t* mtr) /*!< in/out: mini-transaction where index + is s-latched */ +{ + ulint dummy; + + ut_ad(mtr->memo_contains(index->lock, MTR_MEMO_SX_LOCK)); + ut_a(flag == BTR_N_LEAF_PAGES || flag == BTR_TOTAL_SIZE); + + if (index->page == FIL_NULL + || dict_index_is_online_ddl(index) + || !index->is_committed() + || !index->table->space) { + return(ULINT_UNDEFINED); + } + + dberr_t err; + buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, mtr, &err); + *used = 0; + if (!root) { + return ULINT_UNDEFINED; + } + + mtr->x_lock_space(index->table->space); + + ulint n = fseg_n_reserved_pages(*root, PAGE_HEADER + PAGE_BTR_SEG_LEAF + + root->page.frame, used, mtr); + if (flag == BTR_TOTAL_SIZE) { + n += fseg_n_reserved_pages(*root, + PAGE_HEADER + PAGE_BTR_SEG_TOP + + root->page.frame, &dummy, mtr); + *used += dummy; + } + + return(n); +} + +/*********************************************************************//** +Save defragmentation stats for a given index. 
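+Three persistent statistics rows are written per call, conceptually:
+@code
+// stat_name='n_page_split'           leaf page splits since the last run
+// stat_name='n_leaf_pages_defrag'    number of leaf pages at save time
+// stat_name='n_leaf_pages_reserved'  pages reserved for the leaf segment
+@endcode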
+@return DB_SUCCESS or error code */ +dberr_t +dict_stats_save_defrag_stats( +/*============================*/ + dict_index_t* index) /*!< in: index */ +{ + if (index->is_ibuf()) + return DB_SUCCESS; + if (!index->is_readable()) + return dict_stats_report_error(index->table, true); + + const time_t now= time(nullptr); + mtr_t mtr; + ulint n_leaf_pages; + mtr.start(); + mtr_sx_lock_index(index, &mtr); + ulint n_leaf_reserved= btr_get_size_and_reserved(index, BTR_N_LEAF_PAGES, + &n_leaf_pages, &mtr); + mtr.commit(); + + if (n_leaf_reserved == ULINT_UNDEFINED) + return DB_SUCCESS; + + THD *thd= current_thd; + MDL_ticket *mdl_table= nullptr, *mdl_index= nullptr; + dict_table_t* table_stats= dict_table_open_on_name(TABLE_STATS_NAME, false, + DICT_ERR_IGNORE_NONE); + if (table_stats) + { + dict_sys.freeze(SRW_LOCK_CALL); + table_stats= dict_acquire_mdl_shared(table_stats, thd, &mdl_table); + dict_sys.unfreeze(); + } + if (!table_stats || strcmp(table_stats->name.m_name, TABLE_STATS_NAME)) + { +release_and_exit: + if (table_stats) + dict_table_close(table_stats, false, thd, mdl_table); + return DB_STATS_DO_NOT_EXIST; + } + + dict_table_t *index_stats= dict_table_open_on_name(INDEX_STATS_NAME, false, + DICT_ERR_IGNORE_NONE); + if (index_stats) + { + dict_sys.freeze(SRW_LOCK_CALL); + index_stats= dict_acquire_mdl_shared(index_stats, thd, &mdl_index); + dict_sys.unfreeze(); + } + if (!index_stats) + goto release_and_exit; + + if (strcmp(index_stats->name.m_name, INDEX_STATS_NAME)) + { + dict_table_close(index_stats, false, thd, mdl_index); + goto release_and_exit; + } + + trx_t *trx= trx_create(); + trx->mysql_thd= thd; + trx_start_internal(trx); + dberr_t ret= trx->read_only + ? DB_READ_ONLY + : lock_table_for_trx(table_stats, trx, LOCK_X); + if (ret == DB_SUCCESS) + ret= lock_table_for_trx(index_stats, trx, LOCK_X); + + row_mysql_lock_data_dictionary(trx); + + if (ret == DB_SUCCESS) + ret= dict_stats_save_index_stat(index, now, "n_page_split", + index->stat_defrag_n_page_split, nullptr, + "Number of new page splits on leaves" + " since last defragmentation.", trx); + + if (ret == DB_SUCCESS) + ret= dict_stats_save_index_stat(index, now, "n_leaf_pages_defrag", + n_leaf_pages, nullptr, + "Number of leaf pages when" + " this stat is saved to disk", trx); + + if (ret == DB_SUCCESS) + ret= dict_stats_save_index_stat(index, now, "n_leaf_pages_reserved", + n_leaf_reserved, nullptr, + "Number of pages reserved for" + " this index leaves" + " when this stat is saved to disk", trx); + + if (ret == DB_SUCCESS) + trx->commit(); + else + trx->rollback(); + + if (table_stats) + dict_table_close(table_stats, true, thd, mdl_table); + if (index_stats) + dict_table_close(index_stats, true, thd, mdl_index); + row_mysql_unlock_data_dictionary(trx); + trx->free(); + + return ret; +} diff --git a/storage/innobase/dict/dict0dict.cc b/storage/innobase/dict/dict0dict.cc new file mode 100644 index 00000000..5bc7ab6e --- /dev/null +++ b/storage/innobase/dict/dict0dict.cc @@ -0,0 +1,4859 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file dict/dict0dict.cc
+Data dictionary system
+
+Created 1/8/1996 Heikki Tuuri
+***********************************************************************/
+
+#include <my_global.h>
+#include <string>
+
+#include "ha_prototypes.h"
+#include <mysqld.h>
+#include <strfunc.h>
+
+#include "dict0dict.h"
+#include "fts0fts.h"
+#include "fil0fil.h"
+#include <algorithm>
+#include "sql_class.h"
+#include "sql_table.h"
+#include <mysql/service_thd_mdl.h>
+
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "btr0sea.h"
+#include "buf0buf.h"
+#include "data0type.h"
+#include "dict0boot.h"
+#include "dict0load.h"
+#include "dict0crea.h"
+#include "dict0mem.h"
+#include "dict0stats.h"
+#include "fts0fts.h"
+#include "fts0types.h"
+#include "lock0lock.h"
+#include "mach0data.h"
+#include "mem0mem.h"
+#include "page0page.h"
+#include "page0zip.h"
+#include "pars0pars.h"
+#include "pars0sym.h"
+#include "que0que.h"
+#include "rem0cmp.h"
+#include "row0log.h"
+#include "row0merge.h"
+#include "row0mysql.h"
+#include "row0upd.h"
+#include "srv0mon.h"
+#include "srv0start.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+
+#include <vector>
+#include <algorithm>
+
+/** the dictionary system */
+dict_sys_t dict_sys;
+
+/** System table names; @see dict_system_id_t */
+const span<const char> dict_sys_t::SYS_TABLE[]=
+{
+ {C_STRING_WITH_LEN("SYS_TABLES")},{C_STRING_WITH_LEN("SYS_INDEXES")},
+ {C_STRING_WITH_LEN("SYS_COLUMNS")},{C_STRING_WITH_LEN("SYS_FIELDS")},
+ {C_STRING_WITH_LEN("SYS_FOREIGN")},{C_STRING_WITH_LEN("SYS_FOREIGN_COLS")},
+ {C_STRING_WITH_LEN("SYS_VIRTUAL")}
+};
+
+/** Diagnostic message for exceeding the mutex_lock_wait() timeout */
+const char dict_sys_t::fatal_msg[]=
+ "innodb_fatal_semaphore_wait_threshold was exceeded for dict_sys.latch. "
+ "Please refer to "
+ "https://mariadb.com/kb/en/how-to-produce-a-full-stack-trace-for-mysqld/";
+
+/** Percentage of compression failures that are allowed in a single
+round */
+ulong zip_failure_threshold_pct = 5;
+
+/** Maximum percentage of a page that can be allowed as a pad to avoid
+compression failures */
+ulong zip_pad_max = 50;
+
+#define DICT_HEAP_SIZE 100 /*!< initial memory heap size when
+ creating a table or index object */
+#define DICT_POOL_PER_TABLE_HASH 512 /*!< buffer pool max size per table
+ hash table fixed size in bytes */
+#define DICT_POOL_PER_VARYING 4 /*!< buffer pool max size per data
+ dictionary varying size in bytes */
+
+/** Identifies generated InnoDB foreign key names */
+static char dict_ibfk[] = "_ibfk_";
+
+/*******************************************************************//**
+Tries to find column names for the index and sets the col field of the
+index.
+@param[in] index index +@param[in] add_v new virtual columns added along with an add index call +@return whether the column names were found */ +static +bool +dict_index_find_cols( + dict_index_t* index, + const dict_add_v_col_t* add_v); +/*******************************************************************//** +Builds the internal dictionary cache representation for a clustered +index, containing also system fields not defined by the user. +@return own: the internal representation of the clustered index */ +static +dict_index_t* +dict_index_build_internal_clust( +/*============================*/ + dict_index_t* index); /*!< in: user representation of + a clustered index */ +/*******************************************************************//** +Builds the internal dictionary cache representation for a non-clustered +index, containing also system fields not defined by the user. +@return own: the internal representation of the non-clustered index */ +static +dict_index_t* +dict_index_build_internal_non_clust( +/*================================*/ + dict_index_t* index); /*!< in: user representation of + a non-clustered index */ +/**********************************************************************//** +Builds the internal dictionary cache representation for an FTS index. +@return own: the internal representation of the FTS index */ +static +dict_index_t* +dict_index_build_internal_fts( +/*==========================*/ + dict_index_t* index); /*!< in: user representation of an FTS index */ + +/**********************************************************************//** +Removes an index from the dictionary cache. */ +static +void +dict_index_remove_from_cache_low( +/*=============================*/ + dict_table_t* table, /*!< in/out: table */ + dict_index_t* index, /*!< in, own: index */ + ibool lru_evict); /*!< in: TRUE if page being evicted + to make room in the table LRU list */ +#ifdef UNIV_DEBUG +/**********************************************************************//** +Validate the dictionary table LRU list. +@return TRUE if validate OK */ +static +ibool +dict_lru_validate(void); +/*===================*/ +#endif /* UNIV_DEBUG */ + +/* Stream for storing detailed information about the latest foreign key +and unique key errors. Only created if !srv_read_only_mode */ +FILE* dict_foreign_err_file = NULL; +/* mutex protecting the foreign and unique error buffers */ +mysql_mutex_t dict_foreign_err_mutex; + +/********************************************************************//** +Checks if the database name in two table names is the same. +@return TRUE if same db name */ +ibool +dict_tables_have_same_db( +/*=====================*/ + const char* name1, /*!< in: table name in the form + dbname '/' tablename */ + const char* name2) /*!< in: table name in the form + dbname '/' tablename */ +{ + for (; *name1 == *name2; name1++, name2++) { + if (*name1 == '/') { + return(TRUE); + } + ut_a(*name1); /* the names must contain '/' */ + } + return(FALSE); +} + +/********************************************************************//** +Return the end of table name where we have removed dbname and '/'. 
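+For example:
+@code
+dict_remove_db_name("test/t1"); // returns a pointer to "t1"
+@endcode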
+@return table name */
+const char*
+dict_remove_db_name(
+/*================*/
+ const char* name) /*!< in: table name in the form
+ dbname '/' tablename */
+{
+ const char* s = strchr(name, '/');
+ ut_a(s);
+
+ return(s + 1);
+}
+
+/** Decrement the count of open handles */
+void dict_table_close(dict_table_t *table)
+{
+ if (table->get_ref_count() == 1 &&
+ dict_stats_is_persistent_enabled(table) &&
+ strchr(table->name.m_name, '/'))
+ {
+ /* It looks like we are closing the last handle. The user could
+ have executed FLUSH TABLES in order to have the statistics reloaded
+ from the InnoDB persistent statistics tables. We must acquire
+ exclusive dict_sys.latch to prevent a race condition with another
+ thread concurrently acquiring a handle on the table. */
+ dict_sys.lock(SRW_LOCK_CALL);
+ if (table->release())
+ {
+ table->stats_mutex_lock();
+ if (table->get_ref_count() == 0)
+ dict_stats_deinit(table);
+ table->stats_mutex_unlock();
+ }
+ dict_sys.unlock();
+ }
+ else
+ table->release();
+}
+
+/** Decrements the count of open handles of a table.
+@param[in,out] table table
+@param[in] dict_locked whether dict_sys.latch is being held
+@param[in] thd thread to release MDL
+@param[in] mdl metadata lock or NULL if the thread
+ is a foreground one. */
+void
+dict_table_close(
+ dict_table_t* table,
+ bool dict_locked,
+ THD* thd,
+ MDL_ticket* mdl)
+{
+ if (!dict_locked)
+ dict_table_close(table);
+ else
+ {
+ if (table->release() && dict_stats_is_persistent_enabled(table) &&
+ strchr(table->name.m_name, '/'))
+ {
+ /* Force persistent stats re-read upon next open of the table so
+ that FLUSH TABLE can be used to forcibly fetch stats from disk if
+ they have been manually modified. */
+ table->stats_mutex_lock();
+ if (table->get_ref_count() == 0)
+ dict_stats_deinit(table);
+ table->stats_mutex_unlock();
+ }
+
+ ut_ad(dict_lru_validate());
+ ut_ad(dict_sys.find(table));
+ }
+
+ if (!thd || !mdl);
+ else if (MDL_context *mdl_context= static_cast<MDL_context*>
+ (thd_mdl_context(thd)))
+ mdl_context->release_lock(mdl);
+}
+
+/** Check if the table has a given (non_virtual) column.
+@param[in] table table object
+@param[in] col_name column name
+@param[in] col_nr column number guessed, 0 as default
+@return column number if the table has the specified column,
+otherwise table->n_def */
+ulint
+dict_table_has_column(
+ const dict_table_t* table,
+ const char* col_name,
+ ulint col_nr)
+{
+ ulint col_max = table->n_def;
+
+ ut_ad(table);
+ ut_ad(col_name);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ if (col_nr < col_max
+ && innobase_strcasecmp(
+ col_name, dict_table_get_col_name(table, col_nr)) == 0) {
+ return(col_nr);
+ }
+
+ /** The order of columns may have changed; check all the other columns */
+ for (ulint i = 0; i < col_max; i++) {
+ if (i != col_nr
+ && innobase_strcasecmp(
+ col_name, dict_table_get_col_name(table, i)) == 0) {
+
+ return(i);
+ }
+ }
+
+ return(col_max);
+}
+
+/** Retrieve the column name.
+@param[in] table the table of this column */
+const char* dict_col_t::name(const dict_table_t& table) const
+{
+ ut_ad(table.magic_n == DICT_TABLE_MAGIC_N);
+
+ size_t col_nr;
+ const char *s;
+
+ if (is_virtual()) {
+ col_nr = size_t(reinterpret_cast<const dict_v_col_t*>(this)
+ - table.v_cols);
+ ut_ad(col_nr < table.n_v_def);
+ s = table.v_col_names;
+ } else {
+ col_nr = size_t(this - table.cols);
+ ut_ad(col_nr < table.n_def);
+ s = table.col_names;
+ }
+
+ if (s) {
+ for (size_t i = 0; i < col_nr; i++) {
+ s += strlen(s) + 1;
+ }
+ }
+
+ return(s);
+}
+
+/** Returns a virtual column's name.
+@param[in] table target table
+@param[in] col_nr virtual column number (nth virtual column)
+@return column name or NULL if column number out of range. */
+const char*
+dict_table_get_v_col_name(
+ const dict_table_t* table,
+ ulint col_nr)
+{
+ const char* s;
+
+ ut_ad(table);
+ ut_ad(col_nr < table->n_v_def);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ if (col_nr >= table->n_v_def) {
+ return(NULL);
+ }
+
+ s = table->v_col_names;
+
+ if (s != NULL) {
+ for (ulint i = 0; i < col_nr; i++) {
+ s += strlen(s) + 1;
+ }
+ }
+
+ return(s);
+}
+
+/** Search for a virtual column's position in InnoDB according to its
+position in the original table
+@param[in] table target table
+@param[in] col_nr column number (nth column in the MySQL table)
+@return virtual column's position in InnoDB, ULINT_UNDEFINED if not found */
+static
+ulint
+dict_table_get_v_col_pos_for_mysql(
+ const dict_table_t* table,
+ ulint col_nr)
+{
+ ulint i;
+
+ ut_ad(table);
+ ut_ad(col_nr < static_cast<ulint>(table->n_t_def));
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ for (i = 0; i < table->n_v_def; i++) {
+ if (col_nr == dict_get_v_col_mysql_pos(
+ table->v_cols[i].m_col.ind)) {
+ break;
+ }
+ }
+
+ if (i == table->n_v_def) {
+ return(ULINT_UNDEFINED);
+ }
+
+ return(i);
+}
+
+/** Returns a virtual column's name according to its original
+MySQL table position.
+@param[in] table target table
+@param[in] col_nr column number (nth column in the table)
+@return column name. */
+static
+const char*
+dict_table_get_v_col_name_mysql(
+ const dict_table_t* table,
+ ulint col_nr)
+{
+ ulint i = dict_table_get_v_col_pos_for_mysql(table, col_nr);
+
+ if (i == ULINT_UNDEFINED) {
+ return(NULL);
+ }
+
+ return(dict_table_get_v_col_name(table, i));
+}
+
+/** Get nth virtual column according to its original MySQL table position
+@param[in] table target table
+@param[in] col_nr column number in MySQL Table definition
+@return dict_v_col_t ptr */
+dict_v_col_t*
+dict_table_get_nth_v_col_mysql(
+ const dict_table_t* table,
+ ulint col_nr)
+{
+ ulint i = dict_table_get_v_col_pos_for_mysql(table, col_nr);
+
+ if (i == ULINT_UNDEFINED) {
+ return(NULL);
+ }
+
+ return(dict_table_get_nth_v_col(table, i));
+}
+
+
+/** Get all the FTS indexes on a table.
+@param[in] table table
+@param[out] indexes all FTS indexes on this table
+@return number of FTS indexes */
+ulint
+dict_table_get_all_fts_indexes(
+ const dict_table_t* table,
+ ib_vector_t* indexes)
+{
+ dict_index_t* index;
+
+ ut_a(ib_vector_size(indexes) == 0);
+
+ for (index = dict_table_get_first_index(table);
+ index;
+ index = dict_table_get_next_index(index)) {
+
+ if (index->type == DICT_FTS) {
+ ib_vector_push(indexes, &index);
+ }
+ }
+
+ return(ib_vector_size(indexes));
+}
+
+/** Looks for column n in an index.
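+For example (sketch): with an index on (a(10), b), looking up column a with
+inc_prefix=false does not match the prefixed field, but its position is
+still reported:
+@code
+ulint prefix_pos;
+dict_index_get_nth_col_or_prefix_pos(index, n_of_a, false, false,
+                                     &prefix_pos);
+// returns ULINT_UNDEFINED and sets prefix_pos = 0
+@endcode
+Here n_of_a stands for the (hypothetical) table position of column a.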
+@param[in] index index +@param[in] n column number +@param[in] inc_prefix true=consider column prefixes too +@param[in] is_virtual true==virtual column +@param[out] prefix_col_pos col num if prefix +@return position in internal representation of the index; +ULINT_UNDEFINED if not contained */ +ulint +dict_index_get_nth_col_or_prefix_pos( + const dict_index_t* index, + ulint n, + bool inc_prefix, + bool is_virtual, + ulint* prefix_col_pos) +{ + const dict_field_t* field; + const dict_col_t* col; + ulint pos; + ulint n_fields; + + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + if (prefix_col_pos) { + *prefix_col_pos = ULINT_UNDEFINED; + } + + if (is_virtual) { + col = &(dict_table_get_nth_v_col(index->table, n)->m_col); + } else { + col = dict_table_get_nth_col(index->table, n); + } + + if (dict_index_is_clust(index)) { + + return(dict_col_get_clust_pos(col, index)); + } + + n_fields = dict_index_get_n_fields(index); + + for (pos = 0; pos < n_fields; pos++) { + field = dict_index_get_nth_field(index, pos); + + if (col == field->col) { + if (prefix_col_pos) { + *prefix_col_pos = pos; + } + if (inc_prefix || field->prefix_len == 0) { + return(pos); + } + } + } + + return(ULINT_UNDEFINED); +} + +/** Check if the index contains a column or a prefix of that column. +@param[in] n column number +@param[in] is_virtual whether it is a virtual col +@return whether the index contains the column or its prefix */ +bool dict_index_t::contains_col_or_prefix(ulint n, bool is_virtual) const +{ + ut_ad(magic_n == DICT_INDEX_MAGIC_N); + + if (is_primary()) { + return(!is_virtual); + } + + const dict_col_t* col = is_virtual + ? &dict_table_get_nth_v_col(table, n)->m_col + : dict_table_get_nth_col(table, n); + + for (ulint pos = 0; pos < n_fields; pos++) { + if (col == fields[pos].col) { + return true; + } + } + + return false; +} + +/********************************************************************//** +Looks for a matching field in an index. The column has to be the same. The +column in index must be complete, or must contain a prefix longer than the +column in index2. That is, we must be able to construct the prefix in index2 +from the prefix in index. +@return position in internal representation of the index; +ULINT_UNDEFINED if not contained */ +ulint +dict_index_get_nth_field_pos( +/*=========================*/ + const dict_index_t* index, /*!< in: index from which to search */ + const dict_index_t* index2, /*!< in: index */ + ulint n) /*!< in: field number in index2 */ +{ + const dict_field_t* field; + const dict_field_t* field2; + ulint n_fields; + ulint pos; + + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + field2 = dict_index_get_nth_field(index2, n); + + n_fields = dict_index_get_n_fields(index); + + /* Are we looking for a MBR (Minimum Bound Box) field of + a spatial index */ + bool is_mbr_fld = (n == 0 && dict_index_is_spatial(index2)); + + for (pos = 0; pos < n_fields; pos++) { + field = dict_index_get_nth_field(index, pos); + + /* The first field of a spatial index is a transformed + MBR (Minimum Bound Box) field made out of original column, + so its field->col still points to original cluster index + col, but the actual content is different. 
So we cannot
+ consider them equal if neither of them is an MBR field */
+ if (pos == 0 && dict_index_is_spatial(index) && !is_mbr_fld) {
+ continue;
+ }
+
+ if (field->col == field2->col
+ && (field->prefix_len == 0
+ || (field->prefix_len >= field2->prefix_len
+ && field2->prefix_len != 0))) {
+
+ return(pos);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/** Parse the table file name into table name and database name.
+@tparam dict_frozen whether the caller holds dict_sys.latch
+@param[in,out] db_name database name buffer
+@param[in,out] tbl_name table name buffer
+@param[out] db_name_len database name length
+@param[out] tbl_name_len table name length
+@return whether the table name is visible to SQL */
+template<bool dict_frozen>
+bool dict_table_t::parse_name(char (&db_name)[NAME_LEN + 1],
+ char (&tbl_name)[NAME_LEN + 1],
+ size_t *db_name_len, size_t *tbl_name_len) const
+{
+ char db_buf[MAX_DATABASE_NAME_LEN + 1];
+ char tbl_buf[MAX_TABLE_NAME_LEN + 1];
+
+ if (!dict_frozen)
+ dict_sys.freeze(SRW_LOCK_CALL); /* protect against renaming */
+ ut_ad(dict_sys.frozen());
+ const size_t db_len= name.dblen();
+ ut_ad(db_len <= MAX_DATABASE_NAME_LEN);
+
+ memcpy(db_buf, mdl_name.m_name, db_len);
+ db_buf[db_len]= 0;
+
+ size_t tbl_len= strlen(mdl_name.m_name + db_len + 1);
+ const bool is_temp= mdl_name.is_temporary();
+
+ if (is_temp);
+ else if (const char *is_part= static_cast<const char*>
+ (memchr(mdl_name.m_name + db_len + 1, '#', tbl_len)))
+ tbl_len= static_cast<size_t>(is_part - &mdl_name.m_name[db_len + 1]);
+
+ memcpy(tbl_buf, mdl_name.m_name + db_len + 1, tbl_len);
+ tbl_buf[tbl_len]= 0;
+
+ if (!dict_frozen)
+ dict_sys.unfreeze();
+
+ *db_name_len= filename_to_tablename(db_buf, db_name,
+ MAX_DATABASE_NAME_LEN + 1, true);
+
+ if (is_temp)
+ return false;
+
+ *tbl_name_len= filename_to_tablename(tbl_buf, tbl_name,
+ MAX_TABLE_NAME_LEN + 1, true);
+ return true;
+}
+
+template bool
+dict_table_t::parse_name<>(char(&)[NAME_LEN + 1], char(&)[NAME_LEN + 1],
+ size_t*, size_t*) const;
+
+/** Acquire MDL shared for the table name.
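+Background threads normally reach this function through
+dict_table_open_on_id(); a sketch based on the defragmentation code above
+(id is an assumed table_id_t):
+@code
+MDL_ticket *mdl= nullptr;
+if (dict_table_t *t= dict_table_open_on_id(id, false,
+                                           DICT_TABLE_OP_OPEN_ONLY_IF_CACHED,
+                                           thd, &mdl))
+{
+  // ... use t ...
+  dict_table_close(t, false, thd, mdl); // also releases the MDL ticket
+}
+@endcode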
+@tparam trylock whether to use non-blocking operation
+@param[in,out] table table object
+@param[in,out] thd background thread
+@param[out] mdl mdl ticket
+@param[in] table_op operation to perform when opening
+@return table object after locking MDL shared
+@retval nullptr if the table is not readable, or if trylock && MDL blocked */
+template<bool trylock>
+dict_table_t*
+dict_acquire_mdl_shared(dict_table_t *table,
+ THD *thd,
+ MDL_ticket **mdl,
+ dict_table_op_t table_op)
+{
+ if (!table || !mdl)
+ return table;
+
+ MDL_context *mdl_context= static_cast<MDL_context*>(thd_mdl_context(thd));
+ size_t db_len;
+ dict_table_t *not_found= nullptr;
+
+ if (trylock)
+ {
+ dict_sys.freeze(SRW_LOCK_CALL);
+ db_len= dict_get_db_name_len(table->name.m_name);
+ dict_sys.unfreeze();
+ }
+ else
+ {
+ ut_ad(dict_sys.frozen_not_locked());
+ db_len= dict_get_db_name_len(table->name.m_name);
+ }
+
+ if (db_len == 0)
+ return table; /* InnoDB system tables are not covered by MDL */
+
+ if (!mdl_context)
+ return nullptr;
+
+ table_id_t table_id= table->id;
+ char db_buf[NAME_LEN + 1], db_buf1[NAME_LEN + 1];
+ char tbl_buf[NAME_LEN + 1], tbl_buf1[NAME_LEN + 1];
+ size_t tbl_len;
+ bool unaccessible= false;
+
+ if (!table->parse_name(db_buf, tbl_buf, &db_len, &tbl_len))
+ /* The name of an intermediate table starts with #sql */
+ return table;
+
+retry:
+ if (!unaccessible && (!table->is_readable() || table->corrupted))
+ {
+ if (*mdl)
+ {
+ mdl_context->release_lock(*mdl);
+ *mdl= nullptr;
+ }
+ unaccessible= true;
+ }
+
+ if (!trylock)
+ table->release();
+
+ if (unaccessible)
+ return nullptr;
+
+ if (!trylock)
+ dict_sys.unfreeze();
+
+ {
+ MDL_request request;
+ MDL_REQUEST_INIT(&request, MDL_key::TABLE, db_buf, tbl_buf, MDL_SHARED,
+ MDL_EXPLICIT);
+ if (trylock
+ ? mdl_context->try_acquire_lock(&request)
+ : mdl_context->acquire_lock(&request,
+ /* FIXME: use compatible type, and maybe
+ remove this parameter altogether! */
+ static_cast<double>(global_system_variables
+ .lock_wait_timeout)))
+ {
+ *mdl= nullptr;
+ if (trylock)
+ return nullptr;
+ }
+ else
+ {
+ *mdl= request.ticket;
+ if (trylock && !*mdl)
+ return nullptr;
+ }
+ }
+
+ dict_sys.freeze(SRW_LOCK_CALL);
+ table= dict_sys.find_table(table_id);
+ if (table)
+ table->acquire();
+ if (!table && table_op != DICT_TABLE_OP_OPEN_ONLY_IF_CACHED)
+ {
+ dict_sys.unfreeze();
+ dict_sys.lock(SRW_LOCK_CALL);
+ table= dict_load_table_on_id(table_id,
+ table_op == DICT_TABLE_OP_LOAD_TABLESPACE
+ ? DICT_ERR_IGNORE_RECOVER_LOCK
+ : DICT_ERR_IGNORE_FK_NOKEY);
+ if (table)
+ table->acquire();
+ dict_sys.unlock();
+ dict_sys.freeze(SRW_LOCK_CALL);
+ }
+
+ if (!table || !table->is_accessible())
+ {
+ table= nullptr;
+return_without_mdl:
+ if (trylock)
+ dict_sys.unfreeze();
+ if (*mdl)
+ {
+ mdl_context->release_lock(*mdl);
+ *mdl= nullptr;
+ }
+ return not_found;
+ }
+
+ size_t db1_len, tbl1_len;
+
+ if (!table->parse_name<true>(db_buf1, tbl_buf1, &db1_len, &tbl1_len))
+ {
+ /* The table was renamed to #sql prefix.
+ Release MDL (if any) for the old name and return. */
+ goto return_without_mdl;
+ }
+
+ if (*mdl)
+ {
+ if (db_len == db1_len && tbl_len == tbl1_len &&
+ !memcmp(db_buf, db_buf1, db_len) &&
+ !memcmp(tbl_buf, tbl_buf1, tbl_len))
+ {
+ if (trylock)
+ dict_sys.unfreeze();
+ return table;
+ }
+
+ /* The table was renamed. Release MDL for the old name and
+ try to acquire MDL for the new name. */
+ mdl_context->release_lock(*mdl);
+ *mdl= nullptr;
+ }
+
+ db_len= db1_len;
+ tbl_len= tbl1_len;
+
+ memcpy(tbl_buf, tbl_buf1, tbl_len + 1);
+ memcpy(db_buf, db_buf1, db_len + 1);
+ goto retry;
+}
+
+template dict_table_t* dict_acquire_mdl_shared<false>
+(dict_table_t*,THD*,MDL_ticket**,dict_table_op_t);
+template dict_table_t* dict_acquire_mdl_shared<true>
+(dict_table_t*,THD*,MDL_ticket**,dict_table_op_t);
+
+/** Look up a table by numeric identifier.
+@param[in] table_id table identifier
+@param[in] dict_locked data dictionary locked
+@param[in] table_op operation to perform when opening
+@param[in,out] thd background thread, or NULL to not acquire MDL
+@param[out] mdl mdl ticket, or NULL
+@return table, NULL if does not exist */
+dict_table_t *dict_table_open_on_id(table_id_t table_id, bool dict_locked,
+ dict_table_op_t table_op, THD *thd,
+ MDL_ticket **mdl)
+{
+ if (!dict_locked)
+ dict_sys.freeze(SRW_LOCK_CALL);
+
+ dict_table_t *table= dict_sys.find_table(table_id);
+
+ if (table)
+ {
+ table->acquire();
+ if (thd && !dict_locked)
+ table= dict_acquire_mdl_shared(table, thd, mdl, table_op);
+ }
+ else if (table_op != DICT_TABLE_OP_OPEN_ONLY_IF_CACHED)
+ {
+ if (!dict_locked)
+ {
+ dict_sys.unfreeze();
+ dict_sys.lock(SRW_LOCK_CALL);
+ }
+ table= dict_load_table_on_id(table_id,
+ table_op == DICT_TABLE_OP_LOAD_TABLESPACE
+ ? DICT_ERR_IGNORE_RECOVER_LOCK
+ : DICT_ERR_IGNORE_FK_NOKEY);
+ if (table)
+ table->acquire();
+ if (!dict_locked)
+ {
+ dict_sys.unlock();
+ if (table && thd)
+ {
+ dict_sys.freeze(SRW_LOCK_CALL);
+ table= dict_acquire_mdl_shared(table, thd, mdl, table_op);
+ dict_sys.unfreeze();
+ }
+ return table;
+ }
+ }
+
+ if (!dict_locked)
+ dict_sys.unfreeze();
+
+ return table;
+}
+
+/********************************************************************//**
+Looks for column n position in the clustered index.
+@return position in internal representation of the clustered index */
+unsigned
+dict_table_get_nth_col_pos(
+/*=======================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint n, /*!< in: column number */
+ ulint* prefix_col_pos)
+{
+ ulint pos= dict_index_get_nth_col_pos(dict_table_get_first_index(table),
+ n, prefix_col_pos);
+ DBUG_ASSERT(pos <= dict_index_t::MAX_N_FIELDS);
+ return static_cast<unsigned>(pos);
+}
+
+/********************************************************************//**
+Checks if a column is in the ordering columns of the clustered index of a
+table. Column prefixes are treated like whole columns.
+@return TRUE if the column, or its prefix, is in the clustered key */
+ibool
+dict_table_col_in_clustered_key(
+/*============================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint n) /*!< in: column number */
+{
+ const dict_index_t* index;
+ const dict_field_t* field;
+ const dict_col_t* col;
+ ulint pos;
+ ulint n_fields;
+
+ col = dict_table_get_nth_col(table, n);
+
+ index = dict_table_get_first_index(table);
+
+ n_fields = dict_index_get_n_unique(index);
+
+ for (pos = 0; pos < n_fields; pos++) {
+ field = dict_index_get_nth_field(index, pos);
+
+ if (col == field->col) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/** Initialise the data dictionary cache.
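+After creation, dict_sys.latch is used in two modes (sketch; see the
+definitions below):
+@code
+dict_sys.freeze(SRW_LOCK_CALL); // shared: cache lookups
+dict_sys.unfreeze();
+dict_sys.lock(SRW_LOCK_CALL);   // exclusive: load, evict, rename
+dict_sys.unlock();
+@endcode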
+*/
+void dict_sys_t::create()
+{
+ ut_ad(this == &dict_sys);
+ ut_ad(!is_initialised());
+ m_initialised= true;
+ UT_LIST_INIT(table_LRU, &dict_table_t::table_LRU);
+ UT_LIST_INIT(table_non_LRU, &dict_table_t::table_LRU);
+
+ const ulint hash_size = buf_pool_get_curr_size()
+ / (DICT_POOL_PER_TABLE_HASH * UNIV_WORD_SIZE);
+
+ table_hash.create(hash_size);
+ table_id_hash.create(hash_size);
+ temp_id_hash.create(hash_size);
+
+ latch.SRW_LOCK_INIT(dict_operation_lock_key);
+
+ if (!srv_read_only_mode)
+ {
+ dict_foreign_err_file= os_file_create_tmpfile();
+ ut_a(dict_foreign_err_file);
+ }
+
+ mysql_mutex_init(dict_foreign_err_mutex_key, &dict_foreign_err_mutex,
+ nullptr);
+}
+
+
+void dict_sys_t::lock_wait(SRW_LOCK_ARGS(const char *file, unsigned line))
+{
+ ulonglong now= my_hrtime_coarse().val, old= 0;
+ if (latch_ex_wait_start.compare_exchange_strong
+ (old, now, std::memory_order_relaxed, std::memory_order_relaxed))
+ {
+ latch.wr_lock(SRW_LOCK_ARGS(file, line));
+ latch_ex_wait_start.store(0, std::memory_order_relaxed);
+ ut_ad(!latch_readers);
+ ut_ad(!latch_ex);
+ ut_d(latch_ex= pthread_self());
+ return;
+ }
+
+ ut_ad(old);
+ /* We could have old > now due to our use of my_hrtime_coarse(). */
+ ulong waited= old <= now ? static_cast<ulong>((now - old) / 1000000) : 0;
+ const ulong threshold= srv_fatal_semaphore_wait_threshold;
+
+ if (waited >= threshold)
+ ib::fatal() << fatal_msg;
+
+ if (waited > threshold / 4)
+ ib::warn() << "A long wait (" << waited
+ << " seconds) was observed for dict_sys.latch";
+ latch.wr_lock(SRW_LOCK_ARGS(file, line));
+ ut_ad(!latch_readers);
+ ut_ad(!latch_ex);
+ ut_d(latch_ex= pthread_self());
+}
+
+#ifdef UNIV_PFS_RWLOCK
+ATTRIBUTE_NOINLINE void dict_sys_t::unlock()
+{
+ ut_ad(latch_ex == pthread_self());
+ ut_ad(!latch_readers);
+ ut_d(latch_ex= 0);
+ latch.wr_unlock();
+}
+
+ATTRIBUTE_NOINLINE void dict_sys_t::freeze(const char *file, unsigned line)
+{
+ latch.rd_lock(file, line);
+ ut_ad(!latch_ex);
+ ut_d(latch_readers++);
+}
+
+ATTRIBUTE_NOINLINE void dict_sys_t::unfreeze()
+{
+ ut_ad(!latch_ex);
+ ut_ad(latch_readers--);
+ latch.rd_unlock();
+}
+#endif /* UNIV_PFS_RWLOCK */
+
+/**********************************************************************//**
+Returns a table object and increments its open handle count.
+NOTE! This is a high-level function to be used mainly from outside the
+'dict' directory. Inside this directory dict_table_get_low
+is usually the appropriate function.
+@param[in] table_name Table name
+@param[in] dict_locked whether dict_sys.latch is being held exclusively
+@param[in] ignore_err error to be ignored when loading the table
+@return table
+@retval nullptr if does not exist */
+dict_table_t*
+dict_table_open_on_name(
+ const char* table_name,
+ bool dict_locked,
+ dict_err_ignore_t ignore_err)
+{
+ dict_table_t *table;
+ DBUG_ENTER("dict_table_open_on_name");
+ DBUG_PRINT("dict_table_open_on_name", ("table: '%s'", table_name));
+
+ const span<const char> name{table_name, strlen(table_name)};
+
+ if (!dict_locked)
+ {
+ dict_sys.freeze(SRW_LOCK_CALL);
+ table= dict_sys.find_table(name);
+ if (table)
+ {
+ ut_ad(table->cached);
+ if (!(ignore_err & ~DICT_ERR_IGNORE_FK_NOKEY) &&
+ !table->is_readable() && table->corrupted)
+ {
+ ulint algo = table->space->get_compression_algo();
+ if (algo <= PAGE_ALGORITHM_LAST && !fil_comp_algo_loaded(algo)) {
+ my_printf_error(ER_PROVIDER_NOT_LOADED,
+ "Table %s is compressed with %s, which is not currently loaded. "
" + "Please load the %s provider plugin to open the table", + MYF(ME_ERROR_LOG), table->name, + page_compression_algorithms[algo], page_compression_algorithms[algo]); + } else { + my_printf_error(ER_TABLE_CORRUPT, + "Table %s is corrupted. Please drop the table and recreate.", + MYF(ME_ERROR_LOG), table->name); + } + dict_sys.unfreeze(); + DBUG_RETURN(nullptr); + } + table->acquire(); + dict_sys.unfreeze(); + DBUG_RETURN(table); + } + dict_sys.unfreeze(); + dict_sys.lock(SRW_LOCK_CALL); + } + + table= dict_sys.load_table(name, ignore_err); + + if (table) + { + ut_ad(table->cached); + if (!(ignore_err & ~DICT_ERR_IGNORE_FK_NOKEY) && + !table->is_readable() && table->corrupted) + { + ib::error() << "Table " << table->name + << " is corrupted. Please drop the table and recreate."; + if (!dict_locked) + dict_sys.unlock(); + DBUG_RETURN(nullptr); + } + + table->acquire(); + } + + ut_ad(dict_lru_validate()); + if (!dict_locked) + dict_sys.unlock(); + + DBUG_RETURN(table); +} + +/**********************************************************************//** +Adds system columns to a table object. */ +void +dict_table_add_system_columns( +/*==========================*/ + dict_table_t* table, /*!< in/out: table */ + mem_heap_t* heap) /*!< in: temporary heap */ +{ + ut_ad(table->n_def == table->n_cols - DATA_N_SYS_COLS); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + ut_ad(!table->cached); + + /* NOTE: the system columns MUST be added in the following order + (so that they can be indexed by the numerical value of DATA_ROW_ID, + etc.) and as the last columns of the table memory object. + The clustered index will not always physically contain all system + columns. */ + + dict_mem_table_add_col(table, heap, "DB_ROW_ID", DATA_SYS, + DATA_ROW_ID | DATA_NOT_NULL, + DATA_ROW_ID_LEN); + + compile_time_assert(DATA_ROW_ID == 0); + dict_mem_table_add_col(table, heap, "DB_TRX_ID", DATA_SYS, + DATA_TRX_ID | DATA_NOT_NULL, + DATA_TRX_ID_LEN); + compile_time_assert(DATA_TRX_ID == 1); + dict_mem_table_add_col(table, heap, "DB_ROLL_PTR", DATA_SYS, + DATA_ROLL_PTR | DATA_NOT_NULL, + DATA_ROLL_PTR_LEN); + compile_time_assert(DATA_ROLL_PTR == 2); + + /* This check reminds that if a new system column is added to + the program, it should be dealt with here */ + compile_time_assert(DATA_N_SYS_COLS == 3); +} + +/** Add the table definition to the data dictionary cache */ +void dict_table_t::add_to_cache() +{ + cached = TRUE; + + dict_sys.add(this); +} + +/** Add a table definition to the data dictionary cache */ +inline void dict_sys_t::add(dict_table_t* table) +{ + ut_ad(!find(table)); + + ulint fold = my_crc32c(0, table->name.m_name, + strlen(table->name.m_name)); + + table->autoinc_mutex.init(); + table->lock_mutex_init(); + + /* Look for a table with the same name: error if such exists */ + { + dict_table_t* table2; + HASH_SEARCH(name_hash, &table_hash, fold, + dict_table_t*, table2, ut_ad(table2->cached), + !strcmp(table2->name.m_name, table->name.m_name)); + ut_a(table2 == NULL); + +#ifdef UNIV_DEBUG + /* Look for the same table pointer with a different name */ + HASH_SEARCH_ALL(name_hash, &table_hash, + dict_table_t*, table2, ut_ad(table2->cached), + table2 == table); + ut_ad(table2 == NULL); +#endif /* UNIV_DEBUG */ + } + HASH_INSERT(dict_table_t, name_hash, &table_hash, fold, table); + + /* Look for a table with the same id: error if such exists */ + hash_table_t* id_hash = table->is_temporary() + ? 
&temp_id_hash : &table_id_hash;
+ const ulint id_fold = ut_fold_ull(table->id);
+ {
+ dict_table_t* table2;
+ HASH_SEARCH(id_hash, id_hash, id_fold,
+ dict_table_t*, table2, ut_ad(table2->cached),
+ table2->id == table->id);
+ ut_a(table2 == NULL);
+
+#ifdef UNIV_DEBUG
+ /* Look for the same table pointer with a different id */
+ HASH_SEARCH_ALL(id_hash, id_hash,
+ dict_table_t*, table2, ut_ad(table2->cached),
+ table2 == table);
+ ut_ad(table2 == NULL);
+#endif /* UNIV_DEBUG */
+
+ HASH_INSERT(dict_table_t, id_hash, id_hash, id_fold, table);
+ }
+
+ UT_LIST_ADD_FIRST(table->can_be_evicted ? table_LRU : table_non_LRU,
+ table);
+ ut_ad(dict_lru_validate());
+}
+
+/** Test whether a table can be evicted from dict_sys.table_LRU.
+@param table table to be considered for eviction
+@return whether the table can be evicted */
+TRANSACTIONAL_TARGET
+static bool dict_table_can_be_evicted(dict_table_t *table)
+{
+ ut_ad(dict_sys.locked());
+ ut_a(table->can_be_evicted);
+ ut_a(table->foreign_set.empty());
+ ut_a(table->referenced_set.empty());
+
+ if (table->get_ref_count() == 0) {
+ /* The transaction commit and rollback are called from
+ outside the handler interface. This means that there is
+ a window where the table->n_ref_count can be zero but
+ the table instance is in "use". */
+
+ if (lock_table_has_locks(table)) {
+ return false;
+ }
+
+#ifdef BTR_CUR_HASH_ADAPT
+ /* We cannot really evict the table if adaptive hash
+ index entries are pointing to any of its indexes. */
+ for (const dict_index_t* index
+ = dict_table_get_first_index(table);
+ index; index = dict_table_get_next_index(index)) {
+ if (index->n_ahi_pages()) {
+ return false;
+ }
+ }
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ ut_ad(!table->fts);
+ return true;
+ }
+
+ return false;
+}
+
+#ifdef BTR_CUR_HASH_ADAPT
+/** @return a clone of this */
+dict_index_t *dict_index_t::clone() const
+{
+ ut_ad(n_fields);
+ ut_ad(is_btree());
+ ut_ad(online_status == ONLINE_INDEX_COMPLETE);
+ ut_ad(is_committed());
+ ut_ad(!is_dummy);
+ ut_ad(!parser);
+ ut_ad(!online_log);
+ ut_ad(!rtr_track);
+
+ const size_t size= sizeof *this + n_fields * sizeof(*fields) +
+#ifdef BTR_CUR_ADAPT
+ sizeof *search_info +
+#endif
+ 1 + strlen(name) +
+ n_uniq * (sizeof *stat_n_diff_key_vals +
+ sizeof *stat_n_sample_sizes +
+ sizeof *stat_n_non_null_key_vals);
+
+ mem_heap_t* heap= mem_heap_create(size);
+ dict_index_t *index= static_cast<dict_index_t*>
+ (mem_heap_alloc(heap, sizeof *this));
+ *index= *this;
+ index->lock.SRW_LOCK_INIT(index_tree_rw_lock_key);
+ index->heap= heap;
+ index->name= mem_heap_strdup(heap, name);
+ index->fields= static_cast<dict_field_t*>
+ (mem_heap_dup(heap, fields, n_fields * sizeof *fields));
+#ifdef BTR_CUR_ADAPT
+ index->search_info= btr_search_info_create(index->heap);
+#endif /* BTR_CUR_ADAPT */
+ index->stat_n_diff_key_vals= static_cast<ib_uint64_t*>
+ (mem_heap_zalloc(heap, n_uniq * sizeof *stat_n_diff_key_vals));
+ index->stat_n_sample_sizes= static_cast<ib_uint64_t*>
+ (mem_heap_zalloc(heap, n_uniq * sizeof *stat_n_sample_sizes));
+ index->stat_n_non_null_key_vals= static_cast<ib_uint64_t*>
+ (mem_heap_zalloc(heap, n_uniq * sizeof *stat_n_non_null_key_vals));
+ new (&index->zip_pad.mutex) std::mutex();
+ return index;
+}
+
+/** Clone this index for lazy dropping of the adaptive hash.
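+(Sketch) A caller that must remove an index object while the adaptive hash
+index still references it swaps in the clone:
+@code
+index= index->clone_if_needed(); // old object parks on table->freed_indexes
+@endcode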
+@return this or a clone */
+dict_index_t *dict_index_t::clone_if_needed()
+{
+ if (!search_info->ref_count)
+ return this;
+ dict_index_t *prev= UT_LIST_GET_PREV(indexes, this);
+
+ table->autoinc_mutex.wr_lock();
+ UT_LIST_REMOVE(table->indexes, this);
+ UT_LIST_ADD_LAST(table->freed_indexes, this);
+ dict_index_t *index= clone();
+ set_freed();
+ if (prev)
+ UT_LIST_INSERT_AFTER(table->indexes, prev, index);
+ else
+ UT_LIST_ADD_FIRST(table->indexes, index);
+ table->autoinc_mutex.wr_unlock();
+ return index;
+}
+#endif /* BTR_CUR_HASH_ADAPT */
+
+/** Evict unused, unlocked tables from table_LRU.
+@param half whether to consider half the tables only (instead of all)
+@return number of tables evicted */
+ulint dict_sys_t::evict_table_LRU(bool half)
+{
+#ifdef MYSQL_DYNAMIC_PLUGIN
+ constexpr ulint max_tables = 400;
+#else
+ extern ulong tdc_size;
+ const ulint max_tables = tdc_size;
+#endif
+ ulint n_evicted = 0;
+
+ lock(SRW_LOCK_CALL);
+ ut_ad(dict_lru_validate());
+
+ const ulint len = UT_LIST_GET_LEN(table_LRU);
+
+ if (len < max_tables) {
+func_exit:
+ unlock();
+ return(n_evicted);
+ }
+
+ const ulint check_up_to = half ? len / 2 : 0;
+ ulint i = len;
+
+ /* Find a suitable candidate to evict from the cache. Don't scan the
+ entire LRU list; only scan down to check_up_to entries. */
+
+ for (dict_table_t *table = UT_LIST_GET_LAST(table_LRU);
+ table && i > check_up_to && (len - n_evicted) > max_tables; --i) {
+ dict_table_t* prev_table = UT_LIST_GET_PREV(table_LRU, table);
+
+ if (dict_table_can_be_evicted(table)) {
+ remove(table, true);
+ ++n_evicted;
+ }
+
+ table = prev_table;
+ }
+
+ goto func_exit;
+}
+
+/** Looks for an index with the given id given a table instance.
+@param[in] table table instance
+@param[in] id index id
+@return index or NULL */
+dict_index_t*
+dict_table_find_index_on_id(
+ const dict_table_t* table,
+ index_id_t id)
+{
+ dict_index_t* index;
+
+ for (index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+
+ if (id == index->id) {
+ /* Found */
+
+ return(index);
+ }
+ }
+
+ return(NULL);
+}
+
+/** Function object to remove a foreign key constraint from the
+referenced_set of the referenced table. The foreign key object is
+also removed from the dictionary cache. The foreign key constraint
+is not removed from the foreign_set of the table containing the
+constraint. */
+struct dict_foreign_remove_partial
+{
+ void operator()(dict_foreign_t* foreign) {
+ dict_table_t* table = foreign->referenced_table;
+ if (table != NULL) {
+ table->referenced_set.erase(foreign);
+ }
+ dict_foreign_free(foreign);
+ }
+};
+
+/** This function returns a new path name after replacing the basename
+in an old path with a new basename. The old_path is a full path
+name including the extension. The tablename is in the normal
+form "databasename/tablename". The new base name is found after
+the forward slash. Both input strings are null terminated.
+
+This function allocates memory to be returned. It is the caller's
+responsibility to free the return value after it is no longer needed.
+
+@param[in] old_path Pathname
+@param[in] tablename Contains new base name
+@return own: new full pathname */
+static char *dir_pathname(const char *old_path, span<const char> tablename)
+{
+ /* Split the tablename into its database and table name components.
+ They are separated by a '/'. */
+ const char *base_name= tablename.data();
+ for (const char *last= tablename.end(); last > tablename.data(); last--)
+ {
+ if (last[-1] == '/')
+ {
+ base_name= last;
+ break;
+ }
+ }
+ const size_t base_name_len= tablename.end() - base_name;
+
+ /* Find the offset of the last slash. We will strip off the
+ old basename.ibd which starts after that slash. */
+ const char *last_slash= strrchr(old_path, '/');
+#ifdef _WIN32
+ if (const char *last= strrchr(old_path, '\\'))
+ if (last > last_slash)
+ last_slash= last;
+#endif
+
+ size_t dir_len= last_slash
+ ? size_t(last_slash - old_path)
+ : strlen(old_path);
+
+ /* allocate a new path and move the old directory path to it. */
+ size_t new_path_len= dir_len + base_name_len + sizeof "/.ibd";
+ char *new_path= static_cast<char*>(ut_malloc_nokey(new_path_len));
+ memcpy(new_path, old_path, dir_len);
+ snprintf(new_path + dir_len, new_path_len - dir_len, "/%.*s.ibd",
+ int(base_name_len), base_name);
+ return new_path;
+}
+
+/** Rename the data file.
+@param new_name name of the table
+@param replace whether to replace the file with the new name
+ (as part of rolling back TRUNCATE) */
+dberr_t
+dict_table_t::rename_tablespace(span<const char> new_name, bool replace) const
+{
+ ut_ad(dict_table_is_file_per_table(this));
+ ut_ad(!is_temporary());
+
+ if (!space)
+ return DB_SUCCESS;
+
+ const char *old_path= UT_LIST_GET_FIRST(space->chain)->name;
+ const bool data_dir= DICT_TF_HAS_DATA_DIR(flags);
+ char *path= data_dir
+ ? dir_pathname(old_path, new_name)
+ : fil_make_filepath(nullptr, new_name, IBD, false);
+ dberr_t err;
+ if (!path)
+ err= DB_OUT_OF_MEMORY;
+ else if (!strcmp(path, old_path))
+ err= DB_SUCCESS;
+ else if (data_dir &&
+ DB_SUCCESS != RemoteDatafile::create_link_file(new_name, path))
+ err= DB_TABLESPACE_EXISTS;
+ else
+ {
+ space->x_lock();
+ err= space->rename(path, true, replace);
+ if (data_dir)
+ {
+ if (err == DB_SUCCESS)
+ new_name= {name.m_name, strlen(name.m_name)};
+ RemoteDatafile::delete_link_file(new_name);
+ }
+ space->x_unlock();
+ }
+
+ ut_free(path);
+ return err;
+}
+
+/**********************************************************************//**
+Renames a table object.
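+A typical call (sketch; the caller holds dict_sys.latch exclusively, and
+new_name is assumed to be a NUL-terminated "db/table" string):
+@code
+dberr_t err= dict_table_rename_in_cache(table,
+                                        {new_name, strlen(new_name)},
+                                        false);
+@endcode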
+@return DB_SUCCESS or error code */
+dberr_t
+dict_table_rename_in_cache(
+/*=======================*/
+	dict_table_t*	table,		/*!< in/out: table */
+	span<const char> new_name,	/*!< in: new name */
+	bool		replace_new_file)
+					/*!< in: whether to replace the
+					file with the new name
+					(as part of rolling back TRUNCATE) */
+{
+	dict_foreign_t*	foreign;
+	char		old_name[MAX_FULL_NAME_LEN + 1];
+
+	ut_ad(dict_sys.locked());
+
+	/* store the old/current name to an automatic variable */
+	const size_t old_name_len = strlen(table->name.m_name);
+	ut_a(old_name_len < sizeof old_name);
+	strcpy(old_name, table->name.m_name);
+
+	const uint32_t fold = my_crc32c(0, new_name.data(), new_name.size());
+	ut_a(!dict_sys.find_table(new_name));
+
+	if (!dict_table_is_file_per_table(table)) {
+	} else if (dberr_t err = table->rename_tablespace(new_name,
+							  replace_new_file)) {
+		return err;
+	}
+
+	/* Remove table from the hash tables of tables */
+	HASH_DELETE(dict_table_t, name_hash, &dict_sys.table_hash,
+		    my_crc32c(0, table->name.m_name, old_name_len), table);
+
+	bool keep_mdl_name = !table->name.is_temporary();
+
+	if (!keep_mdl_name) {
+	} else if (const char* s = static_cast<const char*>(
+			   memchr(new_name.data(), '/', new_name.size()))) {
+		keep_mdl_name = new_name.end() - s >= 5
+			&& !memcmp(s, "/#sql", 5);
+	}
+
+	if (keep_mdl_name) {
+		/* Preserve the original table name for
+		dict_table_t::parse_name() and dict_acquire_mdl_shared(). */
+		table->mdl_name.m_name = mem_heap_strdup(table->heap,
+							 table->name.m_name);
+	}
+
+	if (new_name.size() > strlen(table->name.m_name)) {
+		/* We allocate MAX_FULL_NAME_LEN + 1 bytes here to avoid
+		memory fragmentation; we assume that repeated calls of
+		ut_realloc() with the same size do not cause fragmentation */
+		ut_a(new_name.size() <= MAX_FULL_NAME_LEN);
+
+		table->name.m_name = static_cast<char*>(
+			ut_realloc(table->name.m_name, MAX_FULL_NAME_LEN + 1));
+	}
+	memcpy(table->name.m_name, new_name.data(), new_name.size());
+	table->name.m_name[new_name.size()] = '\0';
+
+	if (!keep_mdl_name) {
+		table->mdl_name.m_name = table->name.m_name;
+	}
+
+	/* Add table to hash table of tables */
+	HASH_INSERT(dict_table_t, name_hash, &dict_sys.table_hash, fold,
+		    table);
+
+	if (table->name.is_temporary()) {
+		/* In ALTER TABLE we think of the rename table operation
+		in the direction table -> temporary table (#sql...)
+		as dropping the table with the old name and creating
+		a new one with the new name. Thus we kind of drop the
+		constraints from the dictionary cache here. The foreign key
+		constraints will be inherited to the new table from the
+		system tables through a call to dict_load_foreigns. */
+
+		/* Remove the foreign constraints from the cache */
+		std::for_each(table->foreign_set.begin(),
+			      table->foreign_set.end(),
+			      dict_foreign_remove_partial());
+		table->foreign_set.clear();
+
+		/* Reset table field in referencing constraints */
+		for (dict_foreign_set::iterator it
+			= table->referenced_set.begin();
+		     it != table->referenced_set.end();
+		     ++it) {
+
+			foreign = *it;
+			foreign->referenced_table = NULL;
+			foreign->referenced_index = NULL;
+		}
+
+		/* Make the set of referencing constraints empty */
+		table->referenced_set.clear();
+
+		return(DB_SUCCESS);
+	}
+
+	/* Update the table name fields in foreign constraints, and update also
+	the constraint id of new format >= 4.0.18 constraints. Note that at
+	this point we have already changed table->name to the new name.
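+	Editorial example (not part of the upstream comment): a generated
+	constraint id "db/t1_ibfk_2" becomes "db/t2_ibfk_2" when the table
+	is renamed from db/t1 to db/t2, while a user-specified id only has
+	its database name prefix replaced.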
*/ + + dict_foreign_set fk_set; + + for (;;) { + + dict_foreign_set::iterator it + = table->foreign_set.begin(); + + if (it == table->foreign_set.end()) { + break; + } + + foreign = *it; + + if (foreign->referenced_table) { + foreign->referenced_table->referenced_set.erase(foreign); + } + + if (strlen(foreign->foreign_table_name) + < strlen(table->name.m_name)) { + /* Allocate a longer name buffer; + TODO: store buf len to save memory */ + + foreign->foreign_table_name = mem_heap_strdup( + foreign->heap, table->name.m_name); + dict_mem_foreign_table_name_lookup_set(foreign, TRUE); + } else { + strcpy(foreign->foreign_table_name, + table->name.m_name); + dict_mem_foreign_table_name_lookup_set(foreign, FALSE); + } + if (strchr(foreign->id, '/')) { + /* This is a >= 4.0.18 format id */ + + ulint db_len; + char* old_id; + char old_name_cs_filename[MAX_FULL_NAME_LEN+1]; + uint errors = 0; + + /* All table names are internally stored in charset + my_charset_filename (except the temp tables and the + partition identifier suffix in partition tables). The + foreign key constraint names are internally stored + in UTF-8 charset. The variable fkid here is used + to store foreign key constraint name in charset + my_charset_filename for comparison further below. */ + char fkid[MAX_TABLE_NAME_LEN * 2 + 20]; + + /* The old table name in my_charset_filename is stored + in old_name_cs_filename */ + + strcpy(old_name_cs_filename, old_name); + old_name_cs_filename[MAX_FULL_NAME_LEN] = '\0'; + if (!dict_table_t::is_temporary_name(old_name)) { + innobase_convert_to_system_charset( + strchr(old_name_cs_filename, '/') + 1, + strchr(old_name, '/') + 1, + MAX_TABLE_NAME_LEN, &errors); + + if (errors) { + /* There has been an error to convert + old table into UTF-8. This probably + means that the old table name is + actually in UTF-8. */ + innobase_convert_to_filename_charset( + strchr(old_name_cs_filename, + '/') + 1, + strchr(old_name, '/') + 1, + MAX_TABLE_NAME_LEN); + } else { + /* Old name already in + my_charset_filename */ + strcpy(old_name_cs_filename, old_name); + old_name_cs_filename[MAX_FULL_NAME_LEN] + = '\0'; + } + } + + strncpy(fkid, foreign->id, (sizeof fkid) - 1); + fkid[(sizeof fkid) - 1] = '\0'; + + const bool on_tmp = dict_table_t::is_temporary_name( + fkid); + + if (!on_tmp) { + innobase_convert_to_filename_charset( + strchr(fkid, '/') + 1, + strchr(foreign->id, '/') + 1, + MAX_TABLE_NAME_LEN+20); + } + + old_id = mem_strdup(foreign->id); + + if (strlen(fkid) > strlen(old_name_cs_filename) + + ((sizeof dict_ibfk) - 1) + && !memcmp(fkid, old_name_cs_filename, + strlen(old_name_cs_filename)) + && !memcmp(fkid + strlen(old_name_cs_filename), + dict_ibfk, (sizeof dict_ibfk) - 1)) { + + /* This is a generated >= 4.0.18 format id */ + + char table_name[MAX_TABLE_NAME_LEN + 1]; + uint errors = 0; + + if (strlen(table->name.m_name) + > strlen(old_name)) { + foreign->id = static_cast( + mem_heap_alloc( + foreign->heap, + strlen(table->name.m_name) + + strlen(old_id) + 1)); + } + + /* Convert the table name to UTF-8 */ + strncpy(table_name, table->name.m_name, + MAX_TABLE_NAME_LEN); + table_name[MAX_TABLE_NAME_LEN] = '\0'; + innobase_convert_to_system_charset( + strchr(table_name, '/') + 1, + strchr(table->name.m_name, '/') + 1, + MAX_TABLE_NAME_LEN, &errors); + + if (errors) { + /* Table name could not be converted + from charset my_charset_filename to + UTF-8. This means that the table name + is already in UTF-8 (#mysql50#). 
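+					(Editor's note: the #mysql50#
+					prefix marks pre-5.1 names that
+					were never converted to the
+					filename-safe encoding.)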
 */
+					strncpy(table_name, table->name.m_name,
+						MAX_TABLE_NAME_LEN);
+					table_name[MAX_TABLE_NAME_LEN] = '\0';
+				}
+
+				/* Replace the prefix 'databasename/tablename'
+				with the new names */
+				strcpy(foreign->id, table_name);
+				if (on_tmp) {
+					strcat(foreign->id,
+					       old_id + strlen(old_name));
+				} else {
+					sprintf(strchr(foreign->id, '/') + 1,
+						"%s%s",
+						strchr(table_name, '/') + 1,
+						strstr(old_id, "_ibfk_"));
+				}
+
+			} else {
+				/* This is a >= 4.0.18 format id where the user
+				gave the id name */
+				db_len = dict_get_db_name_len(
+					table->name.m_name) + 1;
+
+				if (db_len - 1
+				    > dict_get_db_name_len(foreign->id)) {
+
+					foreign->id = static_cast<char*>(
+						mem_heap_alloc(
+							foreign->heap,
+							db_len
+							+ strlen(old_id) + 1));
+				}
+
+				/* Replace the database prefix in id with the
+				one from table->name */
+
+				memcpy(foreign->id,
+				       table->name.m_name, db_len);
+
+				strcpy(foreign->id + db_len,
+				       dict_remove_db_name(old_id));
+			}
+
+			ut_free(old_id);
+		}
+
+		table->foreign_set.erase(it);
+		fk_set.insert(foreign);
+
+		if (foreign->referenced_table) {
+			foreign->referenced_table->referenced_set.insert(foreign);
+		}
+	}
+
+	ut_a(table->foreign_set.empty());
+	table->foreign_set.swap(fk_set);
+
+	for (dict_foreign_set::iterator it = table->referenced_set.begin();
+	     it != table->referenced_set.end();
+	     ++it) {
+
+		foreign = *it;
+
+		if (strlen(foreign->referenced_table_name)
+		    < strlen(table->name.m_name)) {
+			/* Allocate a longer name buffer;
+			TODO: store buf len to save memory */
+
+			foreign->referenced_table_name = mem_heap_strdup(
+				foreign->heap, table->name.m_name);
+
+			dict_mem_referenced_table_name_lookup_set(
+				foreign, TRUE);
+		} else {
+			/* Use the same buffer */
+			strcpy(foreign->referenced_table_name,
+			       table->name.m_name);
+
+			dict_mem_referenced_table_name_lookup_set(
+				foreign, FALSE);
+		}
+	}
+
+	return(DB_SUCCESS);
+}
+
+/**********************************************************************//**
+Change the id of a table object in the dictionary cache. This is used in
+DISCARD TABLESPACE. */
+void
+dict_table_change_id_in_cache(
+/*==========================*/
+	dict_table_t*	table,	/*!< in/out: table object already in cache */
+	table_id_t	new_id)	/*!< in: new id to set */
+{
+	ut_ad(dict_sys.locked());
+	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+	ut_ad(!table->is_temporary());
+
+	/* Remove the table from the hash table of id's */
+	HASH_DELETE(dict_table_t, id_hash, &dict_sys.table_id_hash,
+		    ut_fold_ull(table->id), table);
+	table->id = new_id;
+
+	/* Add the table back to the hash table */
+	HASH_INSERT(dict_table_t, id_hash, &dict_sys.table_id_hash,
+		    ut_fold_ull(table->id), table);
+}
+
+/** Evict a table definition from the InnoDB data dictionary cache.
+@param[in,out] table cached table definition to be evicted +@param[in] lru whether this is part of least-recently-used eviction +@param[in] keep whether to keep (not free) the object */ +void dict_sys_t::remove(dict_table_t* table, bool lru, bool keep) +{ + dict_foreign_t* foreign; + dict_index_t* index; + + ut_ad(dict_lru_validate()); + ut_a(table->get_ref_count() == 0); + ut_a(table->n_rec_locks == 0); + ut_ad(find(table)); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + + /* Remove the foreign constraints from the cache */ + std::for_each(table->foreign_set.begin(), table->foreign_set.end(), + dict_foreign_remove_partial()); + table->foreign_set.clear(); + + /* Reset table field in referencing constraints */ + for (dict_foreign_set::iterator it = table->referenced_set.begin(); + it != table->referenced_set.end(); + ++it) { + + foreign = *it; + foreign->referenced_table = NULL; + foreign->referenced_index = NULL; + } + + /* Remove the indexes from the cache */ + + for (index = UT_LIST_GET_LAST(table->indexes); + index != NULL; + index = UT_LIST_GET_LAST(table->indexes)) { + + dict_index_remove_from_cache_low(table, index, lru); + } + + /* Remove table from the hash tables of tables */ + + HASH_DELETE(dict_table_t, name_hash, &table_hash, + my_crc32c(0, table->name.m_name, + strlen(table->name.m_name)), + table); + + hash_table_t* id_hash = table->is_temporary() + ? &temp_id_hash : &table_id_hash; + const ulint id_fold = ut_fold_ull(table->id); + HASH_DELETE(dict_table_t, id_hash, id_hash, id_fold, table); + + /* Remove table from LRU or non-LRU list. */ + if (table->can_be_evicted) { + UT_LIST_REMOVE(table_LRU, table); + } else { + UT_LIST_REMOVE(table_non_LRU, table); + } + + /* Free virtual column template if any */ + if (table->vc_templ != NULL) { + dict_free_vc_templ(table->vc_templ); + UT_DELETE(table->vc_templ); + } + + table->lock_mutex_destroy(); + + if (keep) { + table->autoinc_mutex.destroy(); + return; + } + +#ifdef BTR_CUR_HASH_ADAPT + if (table->fts) { + fts_optimize_remove_table(table); + table->fts->~fts_t(); + table->fts = nullptr; + } + + table->autoinc_mutex.wr_lock(); + + ulint freed = UT_LIST_GET_LEN(table->freed_indexes); + + table->vc_templ = NULL; + table->id = 0; + table->autoinc_mutex.wr_unlock(); + + if (UNIV_UNLIKELY(freed != 0)) { + return; + } +#endif /* BTR_CUR_HASH_ADAPT */ + + table->autoinc_mutex.destroy(); + dict_mem_table_free(table); +} + +/****************************************************************//** +If the given column name is reserved for InnoDB system columns, return +TRUE. +@return TRUE if name is reserved */ +ibool +dict_col_name_is_reserved( +/*======================*/ + const char* name) /*!< in: column name */ +{ + static const char* reserved_names[] = { + "DB_ROW_ID", "DB_TRX_ID", "DB_ROLL_PTR" + }; + + compile_time_assert(UT_ARR_SIZE(reserved_names) == DATA_N_SYS_COLS); + + for (ulint i = 0; i < UT_ARR_SIZE(reserved_names); i++) { + if (innobase_strcasecmp(name, reserved_names[i]) == 0) { + + return(TRUE); + } + } + + return(FALSE); +} + +/** Adds an index to the dictionary cache, with possible indexing newly +added column. +@param[in,out] index index; NOTE! The index memory + object is freed in this function! 
+@param[in] page_no root page number of the index +@param[in] add_v virtual columns being added along with ADD INDEX +@return DB_SUCCESS, or DB_CORRUPTION */ +dberr_t +dict_index_add_to_cache( + dict_index_t*& index, + ulint page_no, + const dict_add_v_col_t* add_v) +{ + dict_index_t* new_index; + ulint n_ord; + ulint i; + + ut_ad(dict_sys.locked()); + ut_ad(index->n_def == index->n_fields); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + ut_ad(!dict_index_is_online_ddl(index)); + ut_ad(!dict_index_is_ibuf(index)); + + ut_d(mem_heap_validate(index->heap)); + ut_a(!dict_index_is_clust(index) + || UT_LIST_GET_LEN(index->table->indexes) == 0); + ut_ad(dict_index_is_clust(index) || !index->table->no_rollback()); + + if (!dict_index_find_cols(index, add_v)) { + + dict_mem_index_free(index); + index = NULL; + return DB_CORRUPTION; + } + + /* Build the cache internal representation of the index, + containing also the added system fields */ + + if (dict_index_is_clust(index)) { + new_index = dict_index_build_internal_clust(index); + } else { + new_index = (index->type & DICT_FTS) + ? dict_index_build_internal_fts(index) + : dict_index_build_internal_non_clust(index); + new_index->n_core_null_bytes = static_cast( + UT_BITS_IN_BYTES(unsigned(new_index->n_nullable))); + } + + /* Set the n_fields value in new_index to the actual defined + number of fields in the cache internal representation */ + + new_index->n_fields = new_index->n_def; + new_index->trx_id = index->trx_id; + new_index->set_committed(index->is_committed()); + new_index->nulls_equal = index->nulls_equal; + + n_ord = new_index->n_uniq; + /* Flag the ordering columns and also set column max_prefix */ + + for (i = 0; i < n_ord; i++) { + const dict_field_t* field + = dict_index_get_nth_field(new_index, i); + + /* Check the column being added in the index for + the first time and flag the ordering column. */ + if (field->col->ord_part == 0 ) { + field->col->max_prefix = field->prefix_len; + field->col->ord_part = 1; + } else if (field->prefix_len == 0) { + /* Set the max_prefix for a column to 0 if + its prefix length is 0 (for this index) + even if it was a part of any other index + with some prefix length. */ + field->col->max_prefix = 0; + } else if (field->col->max_prefix != 0 + && field->prefix_len + > field->col->max_prefix) { + /* Set the max_prefix value based on the + prefix_len. 
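+				Editorial example (not part of the
+				upstream comment): a column first
+				indexed with prefix_len 4 and later
+				with prefix_len 10 ends up with
+				max_prefix 10; once any index covers
+				the full column (prefix_len 0),
+				max_prefix is reset to 0 and stays 0.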
*/ + ut_ad(field->col->is_binary() + || field->prefix_len % field->col->mbmaxlen == 0 + || field->prefix_len % 4 == 0); + field->col->max_prefix = field->prefix_len; + } + ut_ad(field->col->ord_part == 1); + } + + new_index->stat_n_diff_key_vals = + static_cast(mem_heap_zalloc( + new_index->heap, + dict_index_get_n_unique(new_index) + * sizeof(*new_index->stat_n_diff_key_vals))); + + new_index->stat_n_sample_sizes = + static_cast(mem_heap_zalloc( + new_index->heap, + dict_index_get_n_unique(new_index) + * sizeof(*new_index->stat_n_sample_sizes))); + + new_index->stat_n_non_null_key_vals = + static_cast(mem_heap_zalloc( + new_index->heap, + dict_index_get_n_unique(new_index) + * sizeof(*new_index->stat_n_non_null_key_vals))); + + new_index->stat_index_size = 1; + new_index->stat_n_leaf_pages = 1; + + new_index->stat_defrag_n_pages_freed = 0; + new_index->stat_defrag_n_page_split = 0; + + new_index->stat_defrag_sample_next_slot = 0; + memset(&new_index->stat_defrag_data_size_sample, + 0x0, sizeof(ulint) * STAT_DEFRAG_DATA_SIZE_N_SAMPLE); + + /* Add the new index as the last index for the table */ + + UT_LIST_ADD_LAST(new_index->table->indexes, new_index); +#ifdef BTR_CUR_ADAPT + new_index->search_info = btr_search_info_create(new_index->heap); +#endif /* BTR_CUR_ADAPT */ + + new_index->page = unsigned(page_no); + new_index->lock.SRW_LOCK_INIT(index_tree_rw_lock_key); + + new_index->n_core_fields = new_index->n_fields; + + dict_mem_index_free(index); + index = new_index; + return DB_SUCCESS; +} + +/**********************************************************************//** +Removes an index from the dictionary cache. */ +TRANSACTIONAL_TARGET +static +void +dict_index_remove_from_cache_low( +/*=============================*/ + dict_table_t* table, /*!< in/out: table */ + dict_index_t* index, /*!< in, own: index */ + ibool lru_evict) /*!< in: TRUE if index being evicted + to make room in the table LRU list */ +{ + ut_ad(table && index); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + ut_ad(dict_sys.locked()); + ut_ad(table->id); +#ifdef BTR_CUR_HASH_ADAPT + ut_ad(!index->freed()); +#endif /* BTR_CUR_HASH_ADAPT */ + + /* No need to acquire the dict_index_t::lock here because + there can't be any active operations on this index (or table). */ + + if (index->online_log) { + row_log_free(index->online_log); + index->online_log = NULL; + } + + /* Remove the index from the list of indexes of the table */ + UT_LIST_REMOVE(table->indexes, index); + + /* The index is being dropped, remove any compression stats for it. */ + if (!lru_evict && DICT_TF_GET_ZIP_SSIZE(index->table->flags)) { + mysql_mutex_lock(&page_zip_stat_per_index_mutex); + page_zip_stat_per_index.erase(index->id); + mysql_mutex_unlock(&page_zip_stat_per_index_mutex); + } + + /* Remove the index from affected virtual column index list */ + index->detach_columns(); + +#ifdef BTR_CUR_HASH_ADAPT + /* We always create search info whether or not adaptive + hash index is enabled or not. */ + /* We are not allowed to free the in-memory index struct + dict_index_t until all entries in the adaptive hash index + that point to any of the page belonging to his b-tree index + are dropped. This is so because dropping of these entries + require access to dict_index_t struct. To avoid such scenario + We keep a count of number of such pages in the search_info and + only free the dict_index_t struct when this count drops to + zero. 
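+	(Editor's note: the count in question is assumed here to be
+	search_info->ref_count, which the adaptive hash index code
+	maintains as index pages are hashed and un-hashed.)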
See also: dict_table_can_be_evicted() */
+
+	if (index->n_ahi_pages()) {
+		table->autoinc_mutex.wr_lock();
+		index->set_freed();
+		UT_LIST_ADD_LAST(table->freed_indexes, index);
+		table->autoinc_mutex.wr_unlock();
+		return;
+	}
+#endif /* BTR_CUR_HASH_ADAPT */
+
+	index->lock.free();
+
+	dict_mem_index_free(index);
+}
+
+/**********************************************************************//**
+Removes an index from the dictionary cache. */
+void
+dict_index_remove_from_cache(
+/*=========================*/
+	dict_table_t*	table,	/*!< in/out: table */
+	dict_index_t*	index)	/*!< in, own: index */
+{
+	dict_index_remove_from_cache_low(table, index, FALSE);
+}
+
+/** Tries to find column names for the index and sets the col field of the
+index.
+@param[in,out]	index	index
+@param[in]	add_v	new virtual columns added along with an add index call
+@return whether the column names were found */
+static
+bool
+dict_index_find_cols(
+	dict_index_t*		index,
+	const dict_add_v_col_t*	add_v)
+{
+	std::vector<ulint, ut_allocator<ulint> >	col_added;
+	std::vector<ulint, ut_allocator<ulint> >	v_col_added;
+
+	const dict_table_t*	table = index->table;
+	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+	ut_ad(dict_sys.locked());
+
+	for (ulint i = 0; i < index->n_fields; i++) {
+		ulint		j;
+		dict_field_t*	field = dict_index_get_nth_field(index, i);
+
+		for (j = 0; j < table->n_cols; j++) {
+			if (!innobase_strcasecmp(
+				    dict_table_get_col_name(table, j),
+				    field->name)) {
+
+				/* Check if the same column is being
+				assigned again, which suggests that the
+				column has a duplicate name. */
+				bool	exists =
+					std::find(col_added.begin(),
+						  col_added.end(), j)
+					!= col_added.end();
+
+				if (exists) {
+					/* Duplicate column found. */
+					goto dup_err;
+				}
+
+				field->col = dict_table_get_nth_col(table, j);
+
+				col_added.push_back(j);
+
+				goto found;
+			}
+		}
+
+		/* Let's check if it is a virtual column */
+		for (j = 0; j < table->n_v_cols; j++) {
+			if (!strcmp(dict_table_get_v_col_name(table, j),
+				    field->name)) {
+
+				/* Check if the same column is being
+				assigned again, which suggests that the
+				column has a duplicate name. */
+				bool	exists =
+					std::find(v_col_added.begin(),
+						  v_col_added.end(), j)
+					!= v_col_added.end();
+
+				if (exists) {
+					/* Duplicate column found. */
+					break;
+				}
+
+				field->col = reinterpret_cast<dict_col_t*>(
+					dict_table_get_nth_v_col(table, j));
+
+				v_col_added.push_back(j);
+
+				goto found;
+			}
+		}
+
+		if (add_v) {
+			for (j = 0; j < add_v->n_v_col; j++) {
+				if (!strcmp(add_v->v_col_name[j],
+					    field->name)) {
+					field->col = const_cast<dict_col_t*>(
+						&add_v->v_col[j].m_col);
+					goto found;
+				}
+			}
+		}
+
+dup_err:
+#ifdef UNIV_DEBUG
+		/* It is an error not to find a matching column. */
+		ib::error() << "No matching column for " << field->name
+			<< " in index " << index->name
+			<< " of table " << table->name;
+#endif /* UNIV_DEBUG */
+		return(FALSE);
+
+found:
+		;
+	}
+
+	return(TRUE);
+}
+
+/** Add a column to an index.
+@param index index +@param table table +@param col column +@param prefix_len column prefix length +@param descending whether to use descending order */ +void dict_index_add_col(dict_index_t *index, const dict_table_t *table, + dict_col_t *col, ulint prefix_len, bool descending) +{ + dict_field_t* field; + const char* col_name; + + if (col->is_virtual()) { + dict_v_col_t* v_col = reinterpret_cast(col); + /* Register the index with the virtual column index list */ + v_col->v_indexes.push_front(dict_v_idx_t(index, index->n_def)); + col_name = dict_table_get_v_col_name_mysql( + table, dict_col_get_no(col)); + } else { + col_name = dict_table_get_col_name(table, dict_col_get_no(col)); + } + + dict_mem_index_add_field(index, col_name, prefix_len); + + field = dict_index_get_nth_field(index, unsigned(index->n_def) - 1); + + field->col = col; + field->fixed_len = static_cast( + dict_col_get_fixed_size( + col, dict_table_is_comp(table))) + & ((1U << 10) - 1); + + if (prefix_len && field->fixed_len > prefix_len) { + field->fixed_len = static_cast(prefix_len) + & ((1U << 10) - 1); + } + + /* Long fixed-length fields that need external storage are treated as + variable-length fields, so that the extern flag can be embedded in + the length word. */ + + if (field->fixed_len > DICT_MAX_FIXED_COL_LEN) { + field->fixed_len = 0; + } + + field->descending = descending; + + /* The comparison limit above must be constant. If it were + changed, the disk format of some fixed-length columns would + change, which would be a disaster. */ + compile_time_assert(DICT_MAX_FIXED_COL_LEN == 768); + + if (!(col->prtype & DATA_NOT_NULL)) { + index->n_nullable++; + } +} + +/*******************************************************************//** +Copies fields contained in index2 to index1. */ +static +void +dict_index_copy( +/*============*/ + dict_index_t* index1, /*!< in: index to copy to */ + const dict_index_t* index2, /*!< in: index to copy from */ + ulint start, /*!< in: first position to copy */ + ulint end) /*!< in: last position to copy */ +{ + dict_field_t* field; + ulint i; + + /* Copy fields contained in index2 */ + + for (i = start; i < end; i++) { + + field = dict_index_get_nth_field(index2, i); + + dict_index_add_col(index1, index2->table, field->col, + field->prefix_len, field->descending); + } +} + +/*******************************************************************//** +Copies types of fields contained in index to tuple. */ +void +dict_index_copy_types( +/*==================*/ + dtuple_t* tuple, /*!< in/out: data tuple */ + const dict_index_t* index, /*!< in: index */ + ulint n_fields) /*!< in: number of + field types to copy */ +{ + ulint i; + + if (dict_index_is_ibuf(index)) { + dtuple_set_types_binary(tuple, n_fields); + + return; + } + + for (i = 0; i < n_fields; i++) { + const dict_field_t* ifield; + dtype_t* dfield_type; + + ifield = dict_index_get_nth_field(index, i); + dfield_type = dfield_get_type(dtuple_get_nth_field(tuple, i)); + dict_col_copy_type(dict_field_get_col(ifield), dfield_type); + if (dict_index_is_spatial(index) + && DATA_GEOMETRY_MTYPE(dfield_type->mtype)) { + dfield_type->prtype |= DATA_GIS_MBR; + } + } +} + +/** Copies types of virtual columns contained in table to tuple and sets all +fields of the tuple to the SQL NULL value. This function should +be called right after dtuple_create(). 
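+(Editor's sketch, not part of the upstream comment: assuming the tuple
+was created with virtual fields, e.g.
+	dtuple_t*	entry = dtuple_create_with_vcol(heap, n_f, n_v);
+	dict_table_copy_v_types(entry, table);
+dict_table_copy_types() below also finishes by calling this function.)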
+@param[in,out] tuple data tuple +@param[in] table table +*/ +void +dict_table_copy_v_types( + dtuple_t* tuple, + const dict_table_t* table) +{ + /* tuple could have more virtual columns than existing table, + if we are calling this for creating index along with adding + virtual columns */ + ulint n_fields = ut_min(dtuple_get_n_v_fields(tuple), + static_cast(table->n_v_def)); + + for (ulint i = 0; i < n_fields; i++) { + + dfield_t* dfield = dtuple_get_nth_v_field(tuple, i); + dtype_t* dtype = dfield_get_type(dfield); + + dfield_set_null(dfield); + dict_col_copy_type( + &(dict_table_get_nth_v_col(table, i)->m_col), + dtype); + } +} +/*******************************************************************//** +Copies types of columns contained in table to tuple and sets all +fields of the tuple to the SQL NULL value. This function should +be called right after dtuple_create(). */ +void +dict_table_copy_types( +/*==================*/ + dtuple_t* tuple, /*!< in/out: data tuple */ + const dict_table_t* table) /*!< in: table */ +{ + ulint i; + + for (i = 0; i < dtuple_get_n_fields(tuple); i++) { + + dfield_t* dfield = dtuple_get_nth_field(tuple, i); + dtype_t* dtype = dfield_get_type(dfield); + + dfield_set_null(dfield); + dict_col_copy_type(dict_table_get_nth_col(table, i), dtype); + } + + dict_table_copy_v_types(tuple, table); +} + +/*******************************************************************//** +Builds the internal dictionary cache representation for a clustered +index, containing also system fields not defined by the user. +@return own: the internal representation of the clustered index */ +static +dict_index_t* +dict_index_build_internal_clust( +/*============================*/ + dict_index_t* index) /*!< in: user representation of + a clustered index */ +{ + dict_table_t* table = index->table; + dict_index_t* new_index; + dict_field_t* field; + ulint trx_id_pos; + ulint i; + ibool* indexed; + + ut_ad(index->is_primary()); + ut_ad(!index->has_virtual()); + + ut_ad(dict_sys.locked()); + + /* Create a new index object with certainly enough fields */ + new_index = dict_mem_index_create(index->table, index->name, + index->type, + unsigned(index->n_fields + + table->n_cols)); + + /* Copy other relevant data from the old index struct to the new + struct: it inherits the values */ + + new_index->n_user_defined_cols = index->n_fields; + + new_index->id = index->id; + + /* Copy the fields of index */ + dict_index_copy(new_index, index, 0, index->n_fields); + + if (dict_index_is_unique(index)) { + /* Only the fields defined so far are needed to identify + the index entry uniquely */ + + new_index->n_uniq = new_index->n_def; + } else { + /* Also the row id is needed to identify the entry */ + new_index->n_uniq = unsigned(new_index->n_def + 1) + & dict_index_t::MAX_N_FIELDS; + } + + new_index->trx_id_offset = 0; + + /* Add system columns, trx id first */ + + trx_id_pos = new_index->n_def; + + compile_time_assert(DATA_ROW_ID == 0); + compile_time_assert(DATA_TRX_ID == 1); + compile_time_assert(DATA_ROLL_PTR == 2); + + if (!dict_index_is_unique(index)) { + dict_index_add_col(new_index, table, + dict_table_get_sys_col( + table, DATA_ROW_ID), + 0); + trx_id_pos++; + } + + dict_index_add_col( + new_index, table, + dict_table_get_sys_col(table, DATA_TRX_ID), 0); + + for (i = 0; i < trx_id_pos; i++) { + + ulint fixed_size = dict_col_get_fixed_size( + dict_index_get_nth_col(new_index, i), + dict_table_is_comp(table)); + + if (fixed_size == 0) { + new_index->trx_id_offset = 0; + + break; + } + + 
dict_field_t* field = dict_index_get_nth_field( + new_index, i); + if (field->prefix_len > 0) { + new_index->trx_id_offset = 0; + + break; + } + + /* Add fixed_size to new_index->trx_id_offset. + Because the latter is a bit-field, an overflow + can theoretically occur. Check for it. */ + fixed_size += new_index->trx_id_offset; + + new_index->trx_id_offset = static_cast(fixed_size) + & ((1U << 12) - 1); + + if (new_index->trx_id_offset != fixed_size) { + /* Overflow. Pretend that this is a + variable-length PRIMARY KEY. */ + ut_ad(0); + new_index->trx_id_offset = 0; + break; + } + } + + dict_index_add_col( + new_index, table, + dict_table_get_sys_col(table, DATA_ROLL_PTR), 0); + + /* Remember the table columns already contained in new_index */ + indexed = static_cast( + ut_zalloc_nokey(table->n_cols * sizeof *indexed)); + + /* Mark the table columns already contained in new_index */ + for (i = 0; i < new_index->n_def; i++) { + + field = dict_index_get_nth_field(new_index, i); + + /* If there is only a prefix of the column in the index + field, do not mark the column as contained in the index */ + + if (field->prefix_len == 0) { + + indexed[field->col->ind] = TRUE; + } + } + + /* Add to new_index non-system columns of table not yet included + there */ + for (i = 0; i + DATA_N_SYS_COLS < ulint(table->n_cols); i++) { + dict_col_t* col = dict_table_get_nth_col(table, i); + ut_ad(col->mtype != DATA_SYS); + + if (!indexed[col->ind]) { + dict_index_add_col(new_index, table, col, 0); + } + } + + ut_free(indexed); + + ut_ad(UT_LIST_GET_LEN(table->indexes) == 0); + + new_index->n_core_null_bytes = table->supports_instant() + ? dict_index_t::NO_CORE_NULL_BYTES + : static_cast( + UT_BITS_IN_BYTES(unsigned(new_index->n_nullable))); + new_index->cached = TRUE; + + return(new_index); +} + +/*******************************************************************//** +Builds the internal dictionary cache representation for a non-clustered +index, containing also system fields not defined by the user. 
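+(Editor's example, not part of the upstream comment: for
+CREATE TABLE t(a INT PRIMARY KEY, b INT, KEY(b)), the cache-internal
+representation of the secondary index is (b, a); the clustered index
+key fields needed to identify the row are appended.)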
+@return own: the internal representation of the non-clustered index */ +static +dict_index_t* +dict_index_build_internal_non_clust( +/*================================*/ + dict_index_t* index) /*!< in: user representation of + a non-clustered index */ +{ + dict_field_t* field; + dict_index_t* new_index; + dict_index_t* clust_index; + dict_table_t* table = index->table; + ulint i; + ibool* indexed; + + ut_ad(table && index); + ut_ad(!dict_index_is_clust(index)); + ut_ad(!dict_index_is_ibuf(index)); + ut_ad(dict_sys.locked()); + + /* The clustered index should be the first in the list of indexes */ + clust_index = UT_LIST_GET_FIRST(table->indexes); + + ut_ad(clust_index); + ut_ad(dict_index_is_clust(clust_index)); + ut_ad(!dict_index_is_ibuf(clust_index)); + + /* Create a new index */ + new_index = dict_mem_index_create( + index->table, index->name, index->type, + ulint(index->n_fields + 1 + clust_index->n_uniq)); + + /* Copy other relevant data from the old index + struct to the new struct: it inherits the values */ + + new_index->n_user_defined_cols = index->n_fields; + + new_index->id = index->id; + + /* Copy fields from index to new_index */ + dict_index_copy(new_index, index, 0, index->n_fields); + + /* Remember the table columns already contained in new_index */ + indexed = static_cast( + ut_zalloc_nokey(table->n_cols * sizeof *indexed)); + + /* Mark the table columns already contained in new_index */ + for (i = 0; i < new_index->n_def; i++) { + + field = dict_index_get_nth_field(new_index, i); + + if (field->col->is_virtual()) { + continue; + } + + /* If there is only a prefix of the column in the index + field, do not mark the column as contained in the index */ + + if (field->prefix_len == 0) { + + indexed[field->col->ind] = TRUE; + } + } + + /* Add to new_index the columns necessary to determine the clustered + index entry uniquely */ + + for (i = 0; i < clust_index->n_uniq; i++) { + field = dict_index_get_nth_field(clust_index, i); + + if (!indexed[field->col->ind] || index->is_spatial()) { + dict_index_add_col(new_index, table, field->col, + field->prefix_len, + field->descending); + } + } + + ut_free(indexed); + + if (dict_index_is_unique(index)) { + new_index->n_uniq = index->n_fields; + } else { + new_index->n_uniq = new_index->n_def; + } + + /* Set the n_fields value in new_index to the actual defined + number of fields */ + + new_index->n_fields = new_index->n_def; + + new_index->cached = TRUE; + + return(new_index); +} + +/*********************************************************************** +Builds the internal dictionary cache representation for an FTS index. 
+@return own: the internal representation of the FTS index */ +static +dict_index_t* +dict_index_build_internal_fts( +/*==========================*/ + dict_index_t* index) /*!< in: user representation of an FTS index */ +{ + dict_index_t* new_index; + + ut_ad(index->type & DICT_FTS); + ut_ad(dict_sys.locked()); + + /* Create a new index */ + new_index = dict_mem_index_create(index->table, index->name, + index->type, index->n_fields); + + /* Copy other relevant data from the old index struct to the new + struct: it inherits the values */ + + new_index->n_user_defined_cols = index->n_fields; + + new_index->id = index->id; + + /* Copy fields from index to new_index */ + dict_index_copy(new_index, index, 0, index->n_fields); + + new_index->n_uniq = 0; + new_index->cached = TRUE; + + dict_table_t* table = index->table; + + if (table->fts->cache == NULL) { + table->fts->cache = fts_cache_create(table); + } + + mysql_mutex_lock(&table->fts->cache->init_lock); + /* Notify the FTS cache about this index. */ + fts_cache_index_cache_create(table, new_index); + mysql_mutex_unlock(&table->fts->cache->init_lock); + + return(new_index); +} +/*====================== FOREIGN KEY PROCESSING ========================*/ + +/**********************************************************************//** +Removes a foreign constraint struct from the dictionary cache. */ +void +dict_foreign_remove_from_cache( +/*===========================*/ + dict_foreign_t* foreign) /*!< in, own: foreign constraint */ +{ + ut_ad(dict_sys.locked()); + ut_a(foreign); + + if (foreign->referenced_table != NULL) { + foreign->referenced_table->referenced_set.erase(foreign); + } + + if (foreign->foreign_table != NULL) { + foreign->foreign_table->foreign_set.erase(foreign); + } + + dict_foreign_free(foreign); +} + +/**********************************************************************//** +Looks for the foreign constraint from the foreign and referenced lists +of a table. +@return foreign constraint */ +static +dict_foreign_t* +dict_foreign_find( +/*==============*/ + dict_table_t* table, /*!< in: table object */ + dict_foreign_t* foreign) /*!< in: foreign constraint */ +{ + ut_ad(dict_sys.frozen()); + + ut_ad(dict_foreign_set_validate(table->foreign_set)); + ut_ad(dict_foreign_set_validate(table->referenced_set)); + + dict_foreign_set::iterator it = table->foreign_set.find(foreign); + + if (it != table->foreign_set.end()) { + return(*it); + } + + it = table->referenced_set.find(foreign); + + if (it != table->referenced_set.end()) { + return(*it); + } + + return(NULL); +} + +/*********************************************************************//** +Tries to find an index whose first fields are the columns in the array, +in the same order and is not marked for deletion and is not the same +as types_idx. +@return matching index, NULL if not found */ +dict_index_t* +dict_foreign_find_index( +/*====================*/ + const dict_table_t* table, /*!< in: table */ + const char** col_names, + /*!< in: column names, or NULL + to use table->col_names */ + const char** columns,/*!< in: array of column names */ + ulint n_cols, /*!< in: number of columns */ + const dict_index_t* types_idx, + /*!< in: NULL or an index + whose types the column types + must match */ + bool check_charsets, + /*!< in: whether to check + charsets. 
only has an effect + if types_idx != NULL */ + ulint check_null, + /*!< in: nonzero if none of + the columns must be declared + NOT NULL */ + fkerr_t* error, /*!< out: error code */ + ulint* err_col_no, + /*!< out: column number where + error happened */ + dict_index_t** err_index) + /*!< out: index where error + happened */ +{ + ut_ad(dict_sys.frozen()); + + if (error) { + *error = FK_INDEX_NOT_FOUND; + } + + for (dict_index_t* index = dict_table_get_first_index(table); + index; + index = dict_table_get_next_index(index)) { + if (types_idx != index + && !index->to_be_dropped + && !dict_index_is_online_ddl(index) + && dict_foreign_qualify_index( + table, col_names, columns, n_cols, + index, types_idx, + check_charsets, check_null, + error, err_col_no, err_index)) { + if (error) { + *error = FK_SUCCESS; + } + + return(index); + } + } + + return(NULL); +} +/**********************************************************************//** +Report an error in a foreign key definition. */ +static +void +dict_foreign_error_report_low( +/*==========================*/ + FILE* file, /*!< in: output stream */ + const char* name) /*!< in: table name */ +{ + rewind(file); + ut_print_timestamp(file); + fprintf(file, " Error in foreign key constraint of table %s:\n", + name); +} + +/**********************************************************************//** +Report an error in a foreign key definition. */ +static +void +dict_foreign_error_report( +/*======================*/ + FILE* file, /*!< in: output stream */ + dict_foreign_t* fk, /*!< in: foreign key constraint */ + const char* msg) /*!< in: the error message */ +{ + std::string fk_str; + mysql_mutex_lock(&dict_foreign_err_mutex); + dict_foreign_error_report_low(file, fk->foreign_table_name); + fputs(msg, file); + fputs(" Constraint:\n", file); + fk_str = dict_print_info_on_foreign_key_in_create_format(NULL, fk, TRUE); + fputs(fk_str.c_str(), file); + putc('\n', file); + if (fk->foreign_index) { + fprintf(file, "The index in the foreign key in table is" + " %s\n%s\n", fk->foreign_index->name(), + FOREIGN_KEY_CONSTRAINTS_MSG); + } + mysql_mutex_unlock(&dict_foreign_err_mutex); +} + +/**********************************************************************//** +Adds a foreign key constraint object to the dictionary cache. May free +the object if there already is an object with the same identifier in. +At least one of the foreign table and the referenced table must already +be in the dictionary cache! 
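+(Editor's note: if an equal constraint is already in the cache, the
+object passed in is freed and the cached object is completed instead.)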
+@return DB_SUCCESS or error code */ +dberr_t +dict_foreign_add_to_cache( +/*======================*/ + dict_foreign_t* foreign, + /*!< in, own: foreign key constraint */ + const char** col_names, + /*!< in: column names, or NULL to use + foreign->foreign_table->col_names */ + bool check_charsets, + /*!< in: whether to check charset + compatibility */ + dict_err_ignore_t ignore_err) + /*!< in: error to be ignored */ +{ + dict_table_t* for_table; + dict_table_t* ref_table; + dict_foreign_t* for_in_cache = NULL; + dict_index_t* index; + ibool added_to_referenced_list= FALSE; + FILE* ef = dict_foreign_err_file; + + DBUG_ENTER("dict_foreign_add_to_cache"); + DBUG_PRINT("dict_foreign_add_to_cache", ("id: %s", foreign->id)); + + ut_ad(dict_sys.locked()); + + for_table = dict_sys.find_table( + {foreign->foreign_table_name_lookup, + strlen(foreign->foreign_table_name_lookup)}); + + ref_table = dict_sys.find_table( + {foreign->referenced_table_name_lookup, + strlen(foreign->referenced_table_name_lookup)}); + ut_a(for_table || ref_table); + + if (for_table) { + for_in_cache = dict_foreign_find(for_table, foreign); + } + + if (!for_in_cache && ref_table) { + for_in_cache = dict_foreign_find(ref_table, foreign); + } + + if (for_in_cache) { + dict_foreign_free(foreign); + } else { + for_in_cache = foreign; + + } + + if (ref_table && !for_in_cache->referenced_table) { + index = dict_foreign_find_index( + ref_table, NULL, + for_in_cache->referenced_col_names, + for_in_cache->n_fields, for_in_cache->foreign_index, + check_charsets, false); + + if (index == NULL + && !(ignore_err & DICT_ERR_IGNORE_FK_NOKEY)) { + dict_foreign_error_report( + ef, for_in_cache, + "there is no index in referenced table" + " which would contain\n" + "the columns as the first columns," + " or the data types in the\n" + "referenced table do not match" + " the ones in table."); + + if (for_in_cache == foreign) { + dict_foreign_free(foreign); + } + + DBUG_RETURN(DB_CANNOT_ADD_CONSTRAINT); + } + + for_in_cache->referenced_table = ref_table; + for_in_cache->referenced_index = index; + + std::pair ret + = ref_table->referenced_set.insert(for_in_cache); + + ut_a(ret.second); /* second is true if the insertion + took place */ + added_to_referenced_list = TRUE; + } + + if (for_table && !for_in_cache->foreign_table) { + index = dict_foreign_find_index( + for_table, col_names, + for_in_cache->foreign_col_names, + for_in_cache->n_fields, + for_in_cache->referenced_index, check_charsets, + for_in_cache->type + & (DICT_FOREIGN_ON_DELETE_SET_NULL + | DICT_FOREIGN_ON_UPDATE_SET_NULL)); + + if (index == NULL + && !(ignore_err & DICT_ERR_IGNORE_FK_NOKEY)) { + dict_foreign_error_report( + ef, for_in_cache, + "there is no index in the table" + " which would contain\n" + "the columns as the first columns," + " or the data types in the\n" + "table do not match" + " the ones in the referenced table\n" + "or one of the ON ... 
SET NULL columns" + " is declared NOT NULL."); + + if (for_in_cache == foreign) { + if (added_to_referenced_list) { + const dict_foreign_set::size_type + n = ref_table->referenced_set + .erase(for_in_cache); + + ut_a(n == 1); /* the number of + elements removed must + be one */ + } + + dict_foreign_free(foreign); + } + + DBUG_RETURN(DB_CANNOT_ADD_CONSTRAINT); + } + + for_in_cache->foreign_table = for_table; + for_in_cache->foreign_index = index; + + std::pair ret + = for_table->foreign_set.insert(for_in_cache); + + ut_a(ret.second); /* second is true if the insertion + took place */ + } + + /* We need to move the table to the non-LRU end of the table LRU + list. Otherwise it will be evicted from the cache. */ + + if (ref_table != NULL) { + dict_sys.prevent_eviction(ref_table); + } + + if (for_table != NULL) { + dict_sys.prevent_eviction(for_table); + } + + ut_ad(dict_lru_validate()); + DBUG_RETURN(DB_SUCCESS); +} + +/*********************************************************************//** +Scans from pointer onwards. Stops if is at the start of a copy of +'string' where characters are compared without case sensitivity, and +only outside `` or "" quotes. Stops also at NUL. +@return scanned up to this */ +static +const char* +dict_scan_to( +/*=========*/ + const char* ptr, /*!< in: scan from */ + const char* string) /*!< in: look for this */ +{ + char quote = '\0'; + bool escape = false; + + for (; *ptr; ptr++) { + if (*ptr == quote) { + /* Closing quote character: do not look for + starting quote or the keyword. */ + + /* If the quote character is escaped by a + backslash, ignore it. */ + if (escape) { + escape = false; + } else { + quote = '\0'; + } + } else if (quote) { + /* Within quotes: do nothing. */ + if (escape) { + escape = false; + } else if (*ptr == '\\') { + escape = true; + } + } else if (*ptr == '`' || *ptr == '"' || *ptr == '\'') { + /* Starting quote: remember the quote character. */ + quote = *ptr; + } else { + /* Outside quotes: look for the keyword. */ + ulint i; + for (i = 0; string[i]; i++) { + if (toupper((int)(unsigned char)(ptr[i])) + != toupper((int)(unsigned char) + (string[i]))) { + goto nomatch; + } + } + break; +nomatch: + ; + } + } + + return(ptr); +} + +/*********************************************************************//** +Accepts a specified string. Comparisons are case-insensitive. +@return if string was accepted, the pointer is moved after that, else +ptr is returned */ +static +const char* +dict_accept( +/*========*/ + CHARSET_INFO* cs, /*!< in: the character set of ptr */ + const char* ptr, /*!< in: scan from this */ + const char* string, /*!< in: accept only this string as the next + non-whitespace string */ + ibool* success)/*!< out: TRUE if accepted */ +{ + const char* old_ptr = ptr; + const char* old_ptr2; + + *success = FALSE; + + while (my_isspace(cs, *ptr)) { + ptr++; + } + + old_ptr2 = ptr; + + ptr = dict_scan_to(ptr, string); + + if (*ptr == '\0' || old_ptr2 != ptr) { + return(old_ptr); + } + + *success = TRUE; + + return ptr + strlen(string); +} + +/*********************************************************************//** +Scans an id. For the lexical definition of an 'id', see the code below. +Strips backquotes or double quotes from around the id. 
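+(Editor's example, not part of the upstream comment: scanning `fk``1`
+yields the id fk`1; a doubled quote character inside a quoted id
+stands for a single literal occurrence.)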
+@return scanned to */ +static +const char* +dict_scan_id( +/*=========*/ + CHARSET_INFO* cs, /*!< in: the character set of ptr */ + const char* ptr, /*!< in: scanned to */ + mem_heap_t* heap, /*!< in: heap where to allocate the id + (NULL=id will not be allocated, but it + will point to string near ptr) */ + const char** id, /*!< out,own: the id; NULL if no id was + scannable */ + ibool table_id,/*!< in: TRUE=convert the allocated id + as a table name; FALSE=convert to UTF-8 */ + ibool accept_also_dot) + /*!< in: TRUE if also a dot can appear in a + non-quoted id; in a quoted id it can appear + always */ +{ + char quote = '\0'; + ulint len = 0; + const char* s; + char* str; + char* dst; + + *id = NULL; + + while (my_isspace(cs, *ptr)) { + ptr++; + } + + if (*ptr == '\0') { + + return(ptr); + } + + if (*ptr == '`' || *ptr == '"') { + quote = *ptr++; + } + + s = ptr; + + if (quote) { + for (;;) { + if (!*ptr) { + /* Syntax error */ + return(ptr); + } + if (*ptr == quote) { + ptr++; + if (*ptr != quote) { + break; + } + } + ptr++; + len++; + } + } else { + while (!my_isspace(cs, *ptr) && *ptr != '(' && *ptr != ')' + && (accept_also_dot || *ptr != '.') + && *ptr != ',' && *ptr != '\0') { + + ptr++; + } + + len = ulint(ptr - s); + } + + if (heap == NULL) { + /* no heap given: id will point to source string */ + *id = s; + return(ptr); + } + + if (quote) { + char* d; + + str = d = static_cast( + mem_heap_alloc(heap, len + 1)); + + while (len--) { + if ((*d++ = *s++) == quote) { + s++; + } + } + *d++ = 0; + len = ulint(d - str); + ut_ad(*s == quote); + ut_ad(s + 1 == ptr); + } else { + str = mem_heap_strdupl(heap, s, len); + } + + if (!table_id) { +convert_id: + /* Convert the identifier from connection character set + to UTF-8. */ + len = 3 * len + 1; + *id = dst = static_cast(mem_heap_alloc(heap, len)); + + innobase_convert_from_id(cs, dst, str, len); + } else if (!strncmp(str, srv_mysql50_table_name_prefix, + sizeof(srv_mysql50_table_name_prefix) - 1)) { + /* This is a pre-5.1 table name + containing chars other than [A-Za-z0-9]. + Discard the prefix and use raw UTF-8 encoding. */ + str += sizeof(srv_mysql50_table_name_prefix) - 1; + len -= sizeof(srv_mysql50_table_name_prefix) - 1; + goto convert_id; + } else { + /* Encode using filename-safe characters. */ + len = 5 * len + 1; + *id = dst = static_cast(mem_heap_alloc(heap, len)); + + innobase_convert_from_table_id(cs, dst, str, len); + } + + return(ptr); +} + +/*********************************************************************//** +Open a table from its database and table name, this is currently used by +foreign constraint parser to get the referenced table. 
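+(Editor's note: the returned name is in the internal
+"databasename/tablename" form, e.g. test/child, case-folded according
+to lower_case_table_names.)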
+@return complete table name with database and table name, allocated from +heap memory passed in */ +char* +dict_get_referenced_table( + const char* name, /*!< in: foreign key table name */ + const char* database_name, /*!< in: table db name */ + ulint database_name_len, /*!< in: db name length */ + const char* table_name, /*!< in: table name */ + ulint table_name_len, /*!< in: table name length */ + dict_table_t** table, /*!< out: table object or NULL */ + mem_heap_t* heap, /*!< in/out: heap memory */ + CHARSET_INFO* from_cs) /*!< in: table name charset */ +{ + char* ref; + char db_name[MAX_DATABASE_NAME_LEN]; + char tbl_name[MAX_TABLE_NAME_LEN]; + CHARSET_INFO* to_cs = &my_charset_filename; + uint errors; + ut_ad(database_name || name); + ut_ad(table_name); + + if (!strncmp(table_name, srv_mysql50_table_name_prefix, + sizeof(srv_mysql50_table_name_prefix) - 1)) { + /* This is a pre-5.1 table name + containing chars other than [A-Za-z0-9]. + Discard the prefix and use raw UTF-8 encoding. */ + table_name += sizeof(srv_mysql50_table_name_prefix) - 1; + table_name_len -= sizeof(srv_mysql50_table_name_prefix) - 1; + + to_cs = system_charset_info; + } + + table_name_len = strconvert(from_cs, table_name, table_name_len, to_cs, + tbl_name, MAX_TABLE_NAME_LEN, &errors); + table_name = tbl_name; + + if (database_name) { + to_cs = &my_charset_filename; + if (!strncmp(database_name, srv_mysql50_table_name_prefix, + sizeof(srv_mysql50_table_name_prefix) - 1)) { + database_name + += sizeof(srv_mysql50_table_name_prefix) - 1; + database_name_len + -= sizeof(srv_mysql50_table_name_prefix) - 1; + to_cs = system_charset_info; + } + + database_name_len = strconvert( + from_cs, database_name, database_name_len, to_cs, + db_name, MAX_DATABASE_NAME_LEN, &errors); + database_name = db_name; + } else { + /* Use the database name of the foreign key table */ + + database_name = name; + database_name_len = dict_get_db_name_len(name); + } + + /* Copy database_name, '/', table_name, '\0' */ + const size_t len = database_name_len + table_name_len + 1; + ref = static_cast(mem_heap_alloc(heap, len + 1)); + memcpy(ref, database_name, database_name_len); + ref[database_name_len] = '/'; + memcpy(ref + database_name_len + 1, table_name, table_name_len + 1); + + /* Values; 0 = Store and compare as given; case sensitive + 1 = Store and compare in lower; case insensitive + 2 = Store as given, compare in lower; case semi-sensitive */ + if (lower_case_table_names == 2) { + innobase_casedn_str(ref); + *table = dict_sys.load_table({ref, len}); + memcpy(ref, database_name, database_name_len); + ref[database_name_len] = '/'; + memcpy(ref + database_name_len + 1, table_name, table_name_len + 1); + + } else { +#ifndef _WIN32 + if (lower_case_table_names == 1) { + innobase_casedn_str(ref); + } +#else + innobase_casedn_str(ref); +#endif /* !_WIN32 */ + *table = dict_sys.load_table({ref, len}); + } + + return(ref); +} + +/*********************************************************************//** +Removes MySQL comments from an SQL string. A comment is either +(a) '#' to the end of the line, +(b) '--[space]' to the end of the line, or +(c) '[slash][asterisk]' till the next '[asterisk][slash]' (like the familiar +C comment syntax). +@return own: SQL string stripped from comments; the caller must free +this with ut_free()! 
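+Editorial example (not part of the upstream comment):
+  "ALTER TABLE t /* c1 */ DROP FOREIGN KEY fk1 -- c2"
+reduces (modulo whitespace) to
+  "ALTER TABLE t DROP FOREIGN KEY fk1"
+while a '#' inside a quoted string is copied through unchanged.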
*/ +static +char* +dict_strip_comments( +/*================*/ + const char* sql_string, /*!< in: SQL string */ + size_t sql_length) /*!< in: length of sql_string */ +{ + char* str; + const char* sptr; + const char* eptr = sql_string + sql_length; + char* ptr; + /* unclosed quote character (0 if none) */ + char quote = 0; + bool escape = false; + + DBUG_ENTER("dict_strip_comments"); + + DBUG_PRINT("dict_strip_comments", ("%s", sql_string)); + + str = static_cast(ut_malloc_nokey(sql_length + 1)); + + sptr = sql_string; + ptr = str; + + for (;;) { +scan_more: + if (sptr >= eptr || *sptr == '\0') { +end_of_string: + *ptr = '\0'; + + ut_a(ptr <= str + sql_length); + + DBUG_PRINT("dict_strip_comments", ("%s", str)); + DBUG_RETURN(str); + } + + if (*sptr == quote) { + /* Closing quote character: do not look for + starting quote or comments. */ + + /* If the quote character is escaped by a + backslash, ignore it. */ + if (escape) { + escape = false; + } else { + quote = 0; + } + } else if (quote) { + /* Within quotes: do not look for + starting quotes or comments. */ + if (escape) { + escape = false; + } else if (*sptr == '\\') { + escape = true; + } + } else if (*sptr == '"' || *sptr == '`' || *sptr == '\'') { + /* Starting quote: remember the quote character. */ + quote = *sptr; + } else if (*sptr == '#' + || (sptr[0] == '-' && sptr[1] == '-' + && sptr[2] == ' ')) { + for (;;) { + if (++sptr >= eptr) { + goto end_of_string; + } + + /* In Unix a newline is 0x0A while in Windows + it is 0x0D followed by 0x0A */ + + switch (*sptr) { + case (char) 0X0A: + case (char) 0x0D: + case '\0': + goto scan_more; + } + } + } else if (!quote && *sptr == '/' && *(sptr + 1) == '*') { + sptr += 2; + for (;;) { + if (sptr >= eptr) { + goto end_of_string; + } + + switch (*sptr) { + case '\0': + goto scan_more; + case '*': + if (sptr[1] == '/') { + sptr += 2; + goto scan_more; + } + } + + sptr++; + } + } + + *ptr = *sptr; + + ptr++; + sptr++; + } +} + +/*********************************************************************//** +Finds the highest [number] for foreign key constraints of the table. Looks +only at the >= 4.0.18-format id's, which are of the form +databasename/tablename_ibfk_[number]. +@return highest number, 0 if table has no new format foreign key constraints */ +ulint +dict_table_get_highest_foreign_id( +/*==============================*/ + dict_table_t* table) /*!< in: table in the dictionary memory cache */ +{ + dict_foreign_t* foreign; + char* endp; + ulint biggest_id = 0; + ulint id; + ulint len; + + DBUG_ENTER("dict_table_get_highest_foreign_id"); + + ut_a(table); + + len = strlen(table->name.m_name); + + for (dict_foreign_set::iterator it = table->foreign_set.begin(); + it != table->foreign_set.end(); + ++it) { + char fkid[MAX_TABLE_NAME_LEN * 2 + 20]; + foreign = *it; + + strncpy(fkid, foreign->id, (sizeof fkid) - 1); + fkid[(sizeof fkid) - 1] = '\0'; + /* Convert foreign key identifier on dictionary memory + cache to filename charset. 
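+		(Editor's note: with ids test/t_ibfk_3 and test/t_ibfk_12
+		in the cache, the surrounding loop yields 12; ids that do
+		not match the databasename/tablename_ibfk_N pattern, or
+		whose number starts with 0, are ignored.)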
*/ + innobase_convert_to_filename_charset( + strchr(fkid, '/') + 1, + strchr(foreign->id, '/') + 1, + MAX_TABLE_NAME_LEN); + + if (strlen(fkid) > ((sizeof dict_ibfk) - 1) + len + && 0 == memcmp(fkid, table->name.m_name, len) + && 0 == memcmp(fkid + len, + dict_ibfk, (sizeof dict_ibfk) - 1) + && fkid[len + ((sizeof dict_ibfk) - 1)] != '0') { + /* It is of the >= 4.0.18 format */ + + id = strtoul(fkid + len + + ((sizeof dict_ibfk) - 1), + &endp, 10); + if (*endp == '\0') { + ut_a(id != biggest_id); + + if (id > biggest_id) { + biggest_id = id; + } + } + } + } + + DBUG_PRINT("dict_table_get_highest_foreign_id", + ("id: " ULINTPF, biggest_id)); + + DBUG_RETURN(biggest_id); +} + +/**********************************************************************//** +Parses the CONSTRAINT id's to be dropped in an ALTER TABLE statement. +@return DB_SUCCESS or DB_CANNOT_DROP_CONSTRAINT if syntax error or the +constraint id does not match */ +dberr_t +dict_foreign_parse_drop_constraints( +/*================================*/ + mem_heap_t* heap, /*!< in: heap from which we can + allocate memory */ + trx_t* trx, /*!< in: transaction */ + dict_table_t* table, /*!< in: table */ + ulint* n, /*!< out: number of constraints + to drop */ + const char*** constraints_to_drop) /*!< out: id's of the + constraints to drop */ +{ + ibool success; + char* str; + size_t len; + const char* ptr; + const char* ptr1; + const char* id; + CHARSET_INFO* cs; + + ut_a(trx->mysql_thd); + + cs = thd_charset(trx->mysql_thd); + + *n = 0; + + *constraints_to_drop = static_cast( + mem_heap_alloc(heap, 1000 * sizeof(char*))); + + ptr = innobase_get_stmt_unsafe(trx->mysql_thd, &len); + + str = dict_strip_comments(ptr, len); + + ptr = str; + + ut_ad(dict_sys.locked()); +loop: + ptr = dict_scan_to(ptr, "DROP"); + + if (*ptr == '\0') { + ut_free(str); + + return(DB_SUCCESS); + } + + ptr = dict_accept(cs, ptr, "DROP", &success); + + if (!my_isspace(cs, *ptr)) { + + goto loop; + } + + ptr = dict_accept(cs, ptr, "FOREIGN", &success); + + if (!success || !my_isspace(cs, *ptr)) { + + goto loop; + } + + ptr = dict_accept(cs, ptr, "KEY", &success); + + if (!success) { + + goto syntax_error; + } + + ptr1 = dict_accept(cs, ptr, "IF", &success); + + if (success && my_isspace(cs, *ptr1)) { + ptr1 = dict_accept(cs, ptr1, "EXISTS", &success); + if (success) { + ptr = ptr1; + } + } + + ptr = dict_scan_id(cs, ptr, heap, &id, FALSE, TRUE); + + if (id == NULL) { + + goto syntax_error; + } + + ut_a(*n < 1000); + (*constraints_to_drop)[*n] = id; + (*n)++; + + if (std::find_if(table->foreign_set.begin(), + table->foreign_set.end(), + dict_foreign_matches_id(id)) + == table->foreign_set.end()) { + + if (!srv_read_only_mode) { + FILE* ef = dict_foreign_err_file; + + mysql_mutex_lock(&dict_foreign_err_mutex); + rewind(ef); + ut_print_timestamp(ef); + fputs(" Error in dropping of a foreign key" + " constraint of table ", ef); + ut_print_name(ef, NULL, table->name.m_name); + fprintf(ef, ",\nin SQL command\n%s" + "\nCannot find a constraint with the" + " given id %s.\n", str, id); + mysql_mutex_unlock(&dict_foreign_err_mutex); + } + + ut_free(str); + + return(DB_CANNOT_DROP_CONSTRAINT); + } + + goto loop; + +syntax_error: + if (!srv_read_only_mode) { + FILE* ef = dict_foreign_err_file; + + mysql_mutex_lock(&dict_foreign_err_mutex); + rewind(ef); + ut_print_timestamp(ef); + fputs(" Syntax error in dropping of a" + " foreign key constraint of table ", ef); + ut_print_name(ef, NULL, table->name.m_name); + fprintf(ef, ",\n" + "close to:\n%s\n in SQL command\n%s\n", ptr, 
str); + mysql_mutex_unlock(&dict_foreign_err_mutex); + } + + ut_free(str); + + return(DB_CANNOT_DROP_CONSTRAINT); +} + +/*==================== END OF FOREIGN KEY PROCESSING ====================*/ + +/**********************************************************************//** +Returns an index object if it is found in the dictionary cache. +Assumes that dict_sys.latch is already being held. +@return index, NULL if not found */ +dict_index_t* +dict_index_get_if_in_cache_low( +/*===========================*/ + index_id_t index_id) /*!< in: index id */ +{ + ut_ad(dict_sys.frozen()); + + for (dict_table_t *table= UT_LIST_GET_FIRST(dict_sys.table_LRU); + table; table= UT_LIST_GET_NEXT(table_LRU, table)) + if (dict_index_t *index= dict_table_find_index_on_id(table, index_id)) + return index; + + for (dict_table_t *table = UT_LIST_GET_FIRST(dict_sys.table_non_LRU); + table; table= UT_LIST_GET_NEXT(table_LRU, table)) + if (dict_index_t *index= dict_table_find_index_on_id(table, index_id)) + return index; + + return nullptr; +} + +#ifdef UNIV_DEBUG +/**********************************************************************//** +Returns an index object if it is found in the dictionary cache. +@return index, NULL if not found */ +dict_index_t* +dict_index_get_if_in_cache( +/*=======================*/ + index_id_t index_id) /*!< in: index id */ +{ + dict_index_t* index; + + if (!dict_sys.is_initialised()) { + return(NULL); + } + + dict_sys.freeze(SRW_LOCK_CALL); + + index = dict_index_get_if_in_cache_low(index_id); + + dict_sys.unfreeze(); + + return(index); +} + +/**********************************************************************//** +Checks that a tuple has n_fields_cmp value in a sensible range, so that +no comparison can occur with the page number field in a node pointer. +@return TRUE if ok */ +ibool +dict_index_check_search_tuple( +/*==========================*/ + const dict_index_t* index, /*!< in: index tree */ + const dtuple_t* tuple) /*!< in: tuple used in a search */ +{ + ut_ad(dtuple_get_n_fields_cmp(tuple) + <= dict_index_get_n_unique_in_tree(index)); + return(TRUE); +} +#endif /* UNIV_DEBUG */ + +/**********************************************************************//** +Builds a node pointer out of a physical record and a page number. 
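+The tuple contains the first n_unique fields of rec followed by the child page number, and is used only on non-leaf levels of the tree.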
+@return own: node pointer */ +dtuple_t* +dict_index_build_node_ptr( +/*======================*/ + const dict_index_t* index, /*!< in: index */ + const rec_t* rec, /*!< in: record for which to build node + pointer */ + ulint page_no,/*!< in: page number to put in node + pointer */ + mem_heap_t* heap, /*!< in: memory heap where pointer + created */ + ulint level) /*!< in: level of rec in tree: + 0 means leaf level */ +{ + dtuple_t* tuple; + dfield_t* field; + byte* buf; + ulint n_unique; + + if (dict_index_is_ibuf(index)) { + /* In a universal index tree, we take the whole record as + the node pointer if the record is on the leaf level, + on non-leaf levels we remove the last field, which + contains the page number of the child page */ + + ut_a(!dict_table_is_comp(index->table)); + n_unique = rec_get_n_fields_old(rec); + + if (level > 0) { + ut_a(n_unique > 1); + n_unique--; + } + } else { + n_unique = dict_index_get_n_unique_in_tree_nonleaf(index); + } + + tuple = dtuple_create(heap, n_unique + 1); + + /* When searching in the tree for the node pointer, we must not do + comparison on the last field, the page number field, as on upper + levels in the tree there may be identical node pointers with a + different page number; therefore, we set the n_fields_cmp to one + less: */ + + dtuple_set_n_fields_cmp(tuple, n_unique); + + dict_index_copy_types(tuple, index, n_unique); + + buf = static_cast<byte*>(mem_heap_alloc(heap, 4)); + + mach_write_to_4(buf, page_no); + + field = dtuple_get_nth_field(tuple, n_unique); + dfield_set_data(field, buf, 4); + + dtype_set(dfield_get_type(field), DATA_SYS_CHILD, DATA_NOT_NULL, 4); + + rec_copy_prefix_to_dtuple(tuple, rec, index, + level ? 0 : index->n_core_fields, + n_unique, heap); + dtuple_set_info_bits(tuple, dtuple_get_info_bits(tuple) + | REC_STATUS_NODE_PTR); + + ut_ad(dtuple_check_typed(tuple)); + + return(tuple); +} + +/** Convert a physical record into a search tuple. +@param[in] rec index record (not necessarily in an index page) +@param[in] index index +@param[in] leaf whether rec is in a leaf page +@param[in] n_fields number of data fields +@param[in,out] heap memory heap for allocation +@return own: data tuple */ +dtuple_t* +dict_index_build_data_tuple( + const rec_t* rec, + const dict_index_t* index, + bool leaf, + ulint n_fields, + mem_heap_t* heap) +{ + ut_ad(!index->is_clust()); + + dtuple_t* tuple = dtuple_create(heap, n_fields); + + dict_index_copy_types(tuple, index, n_fields); + + rec_copy_prefix_to_dtuple(tuple, rec, index, + leaf ? n_fields : 0, n_fields, heap); + + ut_ad(dtuple_check_typed(tuple)); + + return(tuple); +} + +/*********************************************************************//** +Calculates the minimum record length in an index. */ +ulint +dict_index_calc_min_rec_len( +/*========================*/ + const dict_index_t* index) /*!< in: index */ +{ + ulint sum = 0; + ulint i; + ulint comp = dict_table_is_comp(index->table); + + if (comp) { + ulint nullable = 0; + sum = REC_N_NEW_EXTRA_BYTES; + for (i = 0; i < dict_index_get_n_fields(index); i++) { + const dict_col_t* col + = dict_index_get_nth_col(index, i); + ulint size = dict_col_get_fixed_size(col, comp); + sum += size; + if (!size) { + size = col->len; + sum += size < 128 ?
1 : 2; + } + if (!(col->prtype & DATA_NOT_NULL)) { + nullable++; + } + } + + /* round the NULL flags up to full bytes */ + sum += UT_BITS_IN_BYTES(nullable); + + return(sum); + } + + for (i = 0; i < dict_index_get_n_fields(index); i++) { + sum += dict_col_get_fixed_size( + dict_index_get_nth_col(index, i), comp); + } + + if (sum > 127) { + sum += 2 * dict_index_get_n_fields(index); + } else { + sum += dict_index_get_n_fields(index); + } + + sum += REC_N_OLD_EXTRA_BYTES; + + return(sum); +} + +/**********************************************************************//** +Outputs info on a foreign key of a table in a format suitable for +CREATE TABLE. */ +std::string +dict_print_info_on_foreign_key_in_create_format( +/*============================================*/ + trx_t* trx, /*!< in: transaction */ + dict_foreign_t* foreign, /*!< in: foreign key constraint */ + ibool add_newline) /*!< in: whether to add a newline */ +{ + const char* stripped_id; + ulint i; + std::string str; + + if (strchr(foreign->id, '/')) { + /* Strip the preceding database name from the constraint id */ + stripped_id = foreign->id + 1 + + dict_get_db_name_len(foreign->id); + } else { + stripped_id = foreign->id; + } + + str.append(","); + + if (add_newline) { + /* SHOW CREATE TABLE wants constraints each printed nicely + on its own line, while error messages want no newlines + inserted. */ + str.append("\n "); + } + + str.append(" CONSTRAINT "); + + str.append(innobase_quote_identifier(trx, stripped_id)); + str.append(" FOREIGN KEY ("); + + for (i = 0;;) { + str.append(innobase_quote_identifier(trx, foreign->foreign_col_names[i])); + + if (++i < foreign->n_fields) { + str.append(", "); + } else { + break; + } + } + + str.append(") REFERENCES "); + + if (dict_tables_have_same_db(foreign->foreign_table_name_lookup, + foreign->referenced_table_name_lookup)) { + /* Do not print the database name of the referenced table */ + str.append(ut_get_name(trx, + dict_remove_db_name( + foreign->referenced_table_name))); + } else { + str.append(ut_get_name(trx, + foreign->referenced_table_name)); + } + + str.append(" ("); + + for (i = 0;;) { + str.append(innobase_quote_identifier(trx, + foreign->referenced_col_names[i])); + + if (++i < foreign->n_fields) { + str.append(", "); + } else { + break; + } + } + + str.append(")"); + + if (foreign->type & DICT_FOREIGN_ON_DELETE_CASCADE) { + str.append(" ON DELETE CASCADE"); + } + + if (foreign->type & DICT_FOREIGN_ON_DELETE_SET_NULL) { + str.append(" ON DELETE SET NULL"); + } + + if (foreign->type & DICT_FOREIGN_ON_DELETE_NO_ACTION) { + str.append(" ON DELETE NO ACTION"); + } + + if (foreign->type & DICT_FOREIGN_ON_UPDATE_CASCADE) { + str.append(" ON UPDATE CASCADE"); + } + + if (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL) { + str.append(" ON UPDATE SET NULL"); + } + + if (foreign->type & DICT_FOREIGN_ON_UPDATE_NO_ACTION) { + str.append(" ON UPDATE NO ACTION"); + } + + return str; +} + +/**********************************************************************//** +Outputs info on foreign keys of a table. 
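+Depending on create_table_format, the output is suitable either for CREATE TABLE or for SHOW TABLE STATUS.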
*/ +std::string +dict_print_info_on_foreign_keys( +/*============================*/ + ibool create_table_format, /*!< in: if TRUE then print in + a format suitable to be inserted into + a CREATE TABLE, otherwise in the format + of SHOW TABLE STATUS */ + trx_t* trx, /*!< in: transaction */ + dict_table_t* table) /*!< in: table */ +{ + dict_foreign_t* foreign; + std::string str; + + dict_sys.freeze(SRW_LOCK_CALL); + + for (dict_foreign_set::iterator it = table->foreign_set.begin(); + it != table->foreign_set.end(); + ++it) { + + foreign = *it; + + if (create_table_format) { + str.append( + dict_print_info_on_foreign_key_in_create_format( + trx, foreign, TRUE)); + } else { + ulint i; + str.append("; ("); + + for (i = 0; i < foreign->n_fields; i++) { + if (i) { + str.append(" "); + } + + str.append(innobase_quote_identifier(trx, + foreign->foreign_col_names[i])); + } + + str.append(") REFER "); + str.append(ut_get_name(trx, + foreign->referenced_table_name)); + str.append("("); + + for (i = 0; i < foreign->n_fields; i++) { + if (i) { + str.append(" "); + } + str.append(innobase_quote_identifier( + trx, + foreign->referenced_col_names[i])); + } + + str.append(")"); + + if (foreign->type == DICT_FOREIGN_ON_DELETE_CASCADE) { + str.append(" ON DELETE CASCADE"); + } + + if (foreign->type == DICT_FOREIGN_ON_DELETE_SET_NULL) { + str.append(" ON DELETE SET NULL"); + } + + if (foreign->type & DICT_FOREIGN_ON_DELETE_NO_ACTION) { + str.append(" ON DELETE NO ACTION"); + } + + if (foreign->type & DICT_FOREIGN_ON_UPDATE_CASCADE) { + str.append(" ON UPDATE CASCADE"); + } + + if (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL) { + str.append(" ON UPDATE SET NULL"); + } + + if (foreign->type & DICT_FOREIGN_ON_UPDATE_NO_ACTION) { + str.append(" ON UPDATE NO ACTION"); + } + } + } + + dict_sys.unfreeze(); + return str; +} + +/**********************************************************************//** +Flags an index corrupted both in the data dictionary cache +and in the SYS_INDEXES */ +void dict_set_corrupted(dict_index_t *index, const char *ctx) +{ + mem_heap_t* heap; + mtr_t mtr; + dict_index_t* sys_index; + dtuple_t* tuple; + dfield_t* dfield; + byte* buf; + const char* status; + btr_cur_t cursor; + + dict_sys.lock(SRW_LOCK_CALL); + + ut_ad(!dict_table_is_comp(dict_sys.sys_tables)); + ut_ad(!dict_table_is_comp(dict_sys.sys_indexes)); + + /* Mark the table as corrupted only if the clustered index + is corrupted */ + if (dict_index_is_clust(index)) { + index->table->corrupted = TRUE; + goto func_exit; + } + + if (index->type & DICT_CORRUPT) { + /* The index was already flagged corrupted.
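+ There is no need to update SYS_INDEXES again; only assert consistency.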
*/ + ut_ad(!dict_index_is_clust(index) || index->table->corrupted); + goto func_exit; + } + + /* If this is read only mode, do not update SYS_INDEXES, just + mark it as corrupted in memory */ + if (high_level_read_only) { + index->type |= DICT_CORRUPT; + goto func_exit; + } + + heap = mem_heap_create(sizeof(dtuple_t) + 2 * (sizeof(dfield_t) + + sizeof(que_fork_t) + sizeof(upd_node_t) + + sizeof(upd_t) + 12)); + mtr_start(&mtr); + index->type |= DICT_CORRUPT; + + sys_index = UT_LIST_GET_FIRST(dict_sys.sys_indexes->indexes); + + /* Find the index row in SYS_INDEXES */ + tuple = dtuple_create(heap, 2); + + dfield = dtuple_get_nth_field(tuple, 0); + buf = static_cast<byte*>(mem_heap_alloc(heap, 8)); + mach_write_to_8(buf, index->table->id); + dfield_set_data(dfield, buf, 8); + + dfield = dtuple_get_nth_field(tuple, 1); + buf = static_cast<byte*>(mem_heap_alloc(heap, 8)); + mach_write_to_8(buf, index->id); + dfield_set_data(dfield, buf, 8); + + dict_index_copy_types(tuple, sys_index, 2); + cursor.page_cur.index = sys_index; + + if (cursor.search_leaf(tuple, PAGE_CUR_LE, BTR_MODIFY_LEAF, &mtr) + != DB_SUCCESS) { + goto fail; + } + + if (cursor.low_match == dtuple_get_n_fields(tuple)) { + /* UPDATE SYS_INDEXES SET TYPE=index->type + WHERE TABLE_ID=index->table->id AND INDEX_ID=index->id */ + ulint len; + byte* field = rec_get_nth_field_old( + btr_cur_get_rec(&cursor), + DICT_FLD__SYS_INDEXES__TYPE, &len); + if (len != 4) { + goto fail; + } + mtr.write<4>(*btr_cur_get_block(&cursor), field, index->type); + status = "Flagged"; + } else { +fail: + status = "Unable to flag"; + } + + mtr_commit(&mtr); + mem_heap_free(heap); + ib::error() << status << " corruption of " << index->name + << " in table " << index->table->name << " in " << ctx; + +func_exit: + dict_sys.unlock(); +} + +/** Sets merge_threshold in the SYS_INDEXES +@param[in,out] index index +@param[in] merge_threshold value to set */ +void +dict_index_set_merge_threshold( + dict_index_t* index, + ulint merge_threshold) +{ + mem_heap_t* heap; + mtr_t mtr; + dict_index_t* sys_index; + dtuple_t* tuple; + dfield_t* dfield; + byte* buf; + btr_cur_t cursor; + + ut_ad(index != NULL); + ut_ad(!dict_table_is_comp(dict_sys.sys_tables)); + ut_ad(!dict_table_is_comp(dict_sys.sys_indexes)); + + heap = mem_heap_create(sizeof(dtuple_t) + 2 * (sizeof(dfield_t) + + sizeof(que_fork_t) + sizeof(upd_node_t) + + sizeof(upd_t) + 12)); + + mtr.start(); + + sys_index = UT_LIST_GET_FIRST(dict_sys.sys_indexes->indexes); + + /* Find the index row in SYS_INDEXES */ + tuple = dtuple_create(heap, 2); + + dfield = dtuple_get_nth_field(tuple, 0); + buf = static_cast<byte*>(mem_heap_alloc(heap, 8)); + mach_write_to_8(buf, index->table->id); + dfield_set_data(dfield, buf, 8); + + dfield = dtuple_get_nth_field(tuple, 1); + buf = static_cast<byte*>(mem_heap_alloc(heap, 8)); + mach_write_to_8(buf, index->id); + dfield_set_data(dfield, buf, 8); + + dict_index_copy_types(tuple, sys_index, 2); + cursor.page_cur.index = sys_index; + + if (cursor.search_leaf(tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF, &mtr) + != DB_SUCCESS) { + goto func_exit; + } + + if (cursor.up_match == dtuple_get_n_fields(tuple) + && rec_get_n_fields_old(btr_cur_get_rec(&cursor)) + == DICT_NUM_FIELDS__SYS_INDEXES) { + ulint len; + byte* field = rec_get_nth_field_old( + btr_cur_get_rec(&cursor), + DICT_FLD__SYS_INDEXES__MERGE_THRESHOLD, &len); + + ut_ad(len == 4); + mtr.write<4,mtr_t::MAYBE_NOP>(*btr_cur_get_block(&cursor), + field, merge_threshold); + } + +func_exit: + mtr_commit(&mtr); + mem_heap_free(heap); +} + +#ifdef UNIV_DEBUG +/** Sets
merge_threshold for all indexes in the list of tables +@param[in] list pointer to the list of tables */ +inline +void +dict_set_merge_threshold_list_debug( + UT_LIST_BASE_NODE_T(dict_table_t)* list, + uint merge_threshold_all) +{ + for (dict_table_t* table = UT_LIST_GET_FIRST(*list); + table != NULL; + table = UT_LIST_GET_NEXT(table_LRU, table)) { + for (dict_index_t* index = UT_LIST_GET_FIRST(table->indexes); + index != NULL; + index = UT_LIST_GET_NEXT(indexes, index)) { + index->lock.x_lock(SRW_LOCK_CALL); + index->merge_threshold = merge_threshold_all + & ((1U << 6) - 1); + index->lock.x_unlock(); + } + } +} + +/** Sets merge_threshold for all indexes in dictionary cache for debug. +@param[in] merge_threshold_all value to set for all indexes */ +void +dict_set_merge_threshold_all_debug( + uint merge_threshold_all) +{ + dict_sys.freeze(SRW_LOCK_CALL); + + dict_set_merge_threshold_list_debug( + &dict_sys.table_LRU, merge_threshold_all); + dict_set_merge_threshold_list_debug( + &dict_sys.table_non_LRU, merge_threshold_all); + + dict_sys.unfreeze(); +} + +#endif /* UNIV_DEBUG */ + +/** Get an index by name. +@param[in] table the table where to look for the index +@param[in] name the index name to look for +@return index, NULL if does not exist */ +dict_index_t* +dict_table_get_index_on_name(dict_table_t* table, const char* name) +{ + dict_index_t* index; + + index = dict_table_get_first_index(table); + + while (index != NULL) { + if (index->is_committed() && !strcmp(index->name, name)) { + return(index); + } + + index = dict_table_get_next_index(index); + } + + return(NULL); +} + +/**********************************************************************//** +Replace the index passed in with another equivalent index in the +foreign key lists of the table. +@return whether all replacements were found */ +bool +dict_foreign_replace_index( +/*=======================*/ + dict_table_t* table, /*!< in/out: table */ + const char** col_names, + /*!< in: column names, or NULL + to use table->col_names */ + const dict_index_t* index) /*!< in: index to be replaced */ +{ + bool found = true; + dict_foreign_t* foreign; + + ut_ad(index->to_be_dropped); + ut_ad(index->table == table); + + for (dict_foreign_set::iterator it = table->foreign_set.begin(); + it != table->foreign_set.end(); + ++it) { + + foreign = *it; + if (foreign->foreign_index == index) { + ut_ad(foreign->foreign_table == index->table); + + dict_index_t* new_index = dict_foreign_find_index( + foreign->foreign_table, col_names, + foreign->foreign_col_names, + foreign->n_fields, index, + /*check_charsets=*/TRUE, /*check_null=*/FALSE, + NULL, NULL, NULL); + if (new_index) { + ut_ad(new_index->table == index->table); + ut_ad(!new_index->to_be_dropped); + } else { + found = false; + } + + foreign->foreign_index = new_index; + } + } + + for (dict_foreign_set::iterator it = table->referenced_set.begin(); + it != table->referenced_set.end(); + ++it) { + + foreign = *it; + if (foreign->referenced_index == index) { + ut_ad(foreign->referenced_table == index->table); + + dict_index_t* new_index = dict_foreign_find_index( + foreign->referenced_table, NULL, + foreign->referenced_col_names, + foreign->n_fields, index, + /*check_charsets=*/TRUE, /*check_null=*/FALSE, + NULL, NULL, NULL); + /* There must exist an alternative index, + since this must have been checked earlier. 
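+ If it cannot be found after all, return false to report the inconsistency to the caller.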
*/ + if (new_index) { + ut_ad(new_index->table == index->table); + ut_ad(!new_index->to_be_dropped); + } else { + found = false; + } + + foreign->referenced_index = new_index; + } + } + + return(found); +} + +#ifdef UNIV_DEBUG +/**********************************************************************//** +Check for duplicate index entries in a table [using the index name] */ +void +dict_table_check_for_dup_indexes( +/*=============================*/ + const dict_table_t* table, /*!< in: Check for dup indexes + in this table */ + enum check_name check) /*!< in: whether and when to allow + temporary index names */ +{ + /* Check for duplicates, ignoring indexes that are marked + as to be dropped */ + + const dict_index_t* index1; + const dict_index_t* index2; + + ut_ad(dict_sys.frozen()); + + /* The primary index _must_ exist */ + ut_a(UT_LIST_GET_LEN(table->indexes) > 0); + + index1 = UT_LIST_GET_FIRST(table->indexes); + + do { + if (!index1->is_committed()) { + ut_a(!dict_index_is_clust(index1)); + + switch (check) { + case CHECK_ALL_COMPLETE: + ut_error; + case CHECK_ABORTED_OK: + switch (dict_index_get_online_status(index1)) { + case ONLINE_INDEX_COMPLETE: + case ONLINE_INDEX_CREATION: + ut_error; + break; + case ONLINE_INDEX_ABORTED: + case ONLINE_INDEX_ABORTED_DROPPED: + break; + } + /* fall through */ + case CHECK_PARTIAL_OK: + break; + } + } + + for (index2 = UT_LIST_GET_NEXT(indexes, index1); + index2 != NULL; + index2 = UT_LIST_GET_NEXT(indexes, index2)) { + ut_ad(index1->is_committed() + != index2->is_committed() + || strcmp(index1->name, index2->name) != 0); + } + + index1 = UT_LIST_GET_NEXT(indexes, index1); + } while (index1); +} +#endif /* UNIV_DEBUG */ + +/*********************************************************************//** +Converts a database and table name from filesystem encoding +(e.g. d@i1b/a@q1b@1Kc, same format as used in dict_table_t::name) into two +strings in UTF8 encoding (e.g. dцb and aюbØc). The output buffers must be +at least MAX_DB_UTF8_LEN and MAX_TABLE_UTF8_LEN bytes. */ +void +dict_fs2utf8( +/*=========*/ + const char* db_and_table, /*!< in: database and table names, + e.g. d@i1b/a@q1b@1Kc */ + char* db_utf8, /*!< out: database name, e.g. dцb */ + size_t db_utf8_size, /*!< in: dbname_utf8 size */ + char* table_utf8, /*!< out: table name, e.g.
aюbØc */ + size_t table_utf8_size)/*!< in: table_utf8 size */ +{ + char db[MAX_DATABASE_NAME_LEN + 1]; + ulint db_len; + uint errors; + + db_len = dict_get_db_name_len(db_and_table); + + ut_a(db_len <= sizeof(db)); + + memcpy(db, db_and_table, db_len); + db[db_len] = '\0'; + + strconvert( + &my_charset_filename, db, uint(db_len), system_charset_info, + db_utf8, uint(db_utf8_size), &errors); + + /* convert each # to @0023 in table name and store the result in buf */ + const char* table = dict_remove_db_name(db_and_table); + const char* table_p; + char buf[MAX_TABLE_NAME_LEN * 5 + 1]; + char* buf_p; + for (table_p = table, buf_p = buf; table_p[0] != '\0'; table_p++) { + if (table_p[0] != '#') { + buf_p[0] = table_p[0]; + buf_p++; + } else { + buf_p[0] = '@'; + buf_p[1] = '0'; + buf_p[2] = '0'; + buf_p[3] = '2'; + buf_p[4] = '3'; + buf_p += 5; + } + ut_a((size_t) (buf_p - buf) < sizeof(buf)); + } + buf_p[0] = '\0'; + + errors = 0; + strconvert( + &my_charset_filename, buf, (uint) (buf_p - buf), + system_charset_info, + table_utf8, uint(table_utf8_size), + &errors); + + if (errors != 0) { + snprintf(table_utf8, table_utf8_size, "%s%s", + srv_mysql50_table_name_prefix, table); + } +} + +/** Resize the hash tables based on the current buffer pool size. */ +void dict_sys_t::resize() +{ + ut_ad(this == &dict_sys); + ut_ad(is_initialised()); + lock(SRW_LOCK_CALL); + + /* all table entries are in table_LRU and table_non_LRU lists */ + table_hash.free(); + table_id_hash.free(); + temp_id_hash.free(); + + const ulint hash_size = buf_pool_get_curr_size() + / (DICT_POOL_PER_TABLE_HASH * UNIV_WORD_SIZE); + table_hash.create(hash_size); + table_id_hash.create(hash_size); + temp_id_hash.create(hash_size); + + for (dict_table_t *table= UT_LIST_GET_FIRST(table_LRU); table; + table= UT_LIST_GET_NEXT(table_LRU, table)) + { + ut_ad(!table->is_temporary()); + ulint fold= my_crc32c(0, table->name.m_name, strlen(table->name.m_name)); + ulint id_fold= ut_fold_ull(table->id); + + HASH_INSERT(dict_table_t, name_hash, &table_hash, fold, table); + HASH_INSERT(dict_table_t, id_hash, &table_id_hash, id_fold, table); + } + + for (dict_table_t *table = UT_LIST_GET_FIRST(table_non_LRU); table; + table= UT_LIST_GET_NEXT(table_LRU, table)) + { + ulint fold= my_crc32c(0, table->name.m_name, strlen(table->name.m_name)); + ulint id_fold= ut_fold_ull(table->id); + + HASH_INSERT(dict_table_t, name_hash, &table_hash, fold, table); + + hash_table_t *id_hash= table->is_temporary() + ? &temp_id_hash : &table_id_hash; + + HASH_INSERT(dict_table_t, id_hash, id_hash, id_fold, table); + } + + unlock(); +} + +/** Close the data dictionary cache on shutdown. */ +void dict_sys_t::close() +{ + ut_ad(this == &dict_sys); + if (!is_initialised()) return; + + lock(SRW_LOCK_CALL); + + /* Free the hash elements. We don't remove them from table_hash + because we are invoking table_hash.free() below. */ + for (ulint i= table_hash.n_cells; i--; ) + while (dict_table_t *table= static_cast<dict_table_t*> + (HASH_GET_FIRST(&table_hash, i))) + dict_sys.remove(table); + + table_hash.free(); + + /* table_id_hash contains the same elements as in table_hash, + therefore we don't delete the individual elements. */ + table_id_hash.free(); + + /* No temporary tables should exist at this point.
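+ Only the hash array itself remains to be freed.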
*/ + temp_id_hash.free(); + + unlock(); + latch.destroy(); + + mysql_mutex_destroy(&dict_foreign_err_mutex); + + if (dict_foreign_err_file) + { + my_fclose(dict_foreign_err_file, MYF(MY_WME)); + dict_foreign_err_file = NULL; + } + + m_initialised= false; +} + +#ifdef UNIV_DEBUG +/**********************************************************************//** +Validate the dictionary table LRU list. +@return TRUE if valid */ +static +ibool +dict_lru_validate(void) +/*===================*/ +{ + dict_table_t* table; + + ut_ad(dict_sys.frozen()); + + for (table = UT_LIST_GET_FIRST(dict_sys.table_LRU); + table != NULL; + table = UT_LIST_GET_NEXT(table_LRU, table)) { + + ut_a(table->can_be_evicted); + } + + for (table = UT_LIST_GET_FIRST(dict_sys.table_non_LRU); + table != NULL; + table = UT_LIST_GET_NEXT(table_LRU, table)) { + + ut_a(!table->can_be_evicted); + } + + return(TRUE); +} +#endif /* UNIV_DEBUG */ +/*********************************************************************//** +Check an index to see whether its first fields are the columns in the array, +in the same order and is not marked for deletion and is not the same +as types_idx. +@return true if the index qualifies, otherwise false */ +bool +dict_foreign_qualify_index( +/*=======================*/ + const dict_table_t* table, /*!< in: table */ + const char** col_names, + /*!< in: column names, or NULL + to use table->col_names */ + const char** columns,/*!< in: array of column names */ + ulint n_cols, /*!< in: number of columns */ + const dict_index_t* index, /*!< in: index to check */ + const dict_index_t* types_idx, + /*!< in: NULL or an index + whose types the column types + must match */ + bool check_charsets, + /*!< in: whether to check + charsets. only has an effect + if types_idx != NULL */ + ulint check_null, + /*!< in: nonzero if none of + the columns must be declared + NOT NULL */ + fkerr_t* error, /*!< out: error code */ + ulint* err_col_no, + /*!< out: column number where + error happened */ + dict_index_t** err_index) + /*!< out: index where error + happened */ +{ + if (dict_index_get_n_fields(index) < n_cols) { + return(false); + } + + if (!index->is_btree()) { + return false; + } + + if (index->online_status >= ONLINE_INDEX_ABORTED) { + return false; + } + + for (ulint i = 0; i < n_cols; i++) { + dict_field_t* field; + const char* col_name; + ulint col_no; + + field = dict_index_get_nth_field(index, i); + col_no = dict_col_get_no(field->col); + + if (field->prefix_len != 0) { + /* We do not accept column prefix + indexes here */ + if (error && err_col_no && err_index) { + *error = FK_IS_PREFIX_INDEX; + *err_col_no = i; + *err_index = (dict_index_t*)index; + } + return(false); + } + + if (check_null + && (field->col->prtype & DATA_NOT_NULL)) { + if (error && err_col_no && err_index) { + *error = FK_COL_NOT_NULL; + *err_col_no = i; + *err_index = (dict_index_t*)index; + } + return(false); + } + + if (field->col->is_virtual()) { + col_name = ""; + for (ulint j = 0; j < table->n_v_def; j++) { + col_name = dict_table_get_v_col_name(table, j); + if (innobase_strcasecmp(field->name,col_name) == 0) { + break; + } + } + } else { + col_name = col_names + ? 
col_names[col_no] + : dict_table_get_col_name(table, col_no); + } + + if (0 != innobase_strcasecmp(columns[i], col_name)) { + return(false); + } + + if (types_idx && !cmp_cols_are_equal( + dict_index_get_nth_col(index, i), + dict_index_get_nth_col(types_idx, i), + check_charsets)) { + if (error && err_col_no && err_index) { + *error = FK_COLS_NOT_EQUAL; + *err_col_no = i; + *err_index = (dict_index_t*)index; + } + + return(false); + } + } + + return(true); +} + +/*********************************************************************//** +Update the state of compression failure padding heuristics. This is +called whenever a compression operation succeeds or fails. +The caller must be holding info->mutex */ +static +void +dict_index_zip_pad_update( +/*======================*/ + zip_pad_info_t* info, /*!< in/out: info to be updated */ + ulint zip_threshold) /*!< in: zip threshold value */ +{ + ulint total; + ulint fail_pct; + + ut_ad(info); + ut_ad(info->pad % ZIP_PAD_INCR == 0); + + total = info->success + info->failure; + + ut_ad(total > 0); + + if (zip_threshold == 0) { + /* User has just disabled the padding. */ + return; + } + + if (total < ZIP_PAD_ROUND_LEN) { + /* We are in middle of a round. Do nothing. */ + return; + } + + /* We are at a 'round' boundary. Reset the values but first + calculate fail rate for our heuristic. */ + fail_pct = (info->failure * 100) / total; + info->failure = 0; + info->success = 0; + + if (fail_pct > zip_threshold) { + /* Compression failures are more than the user-defined + threshold. Increase the pad size to reduce chances of + compression failures. + + Only do increment if it won't increase padding + beyond max pad size. */ + if (info->pad + ZIP_PAD_INCR + < (srv_page_size * zip_pad_max) / 100) { + info->pad.fetch_add(ZIP_PAD_INCR); + + MONITOR_INC(MONITOR_PAD_INCREMENTS); + } + + info->n_rounds = 0; + + } else { + /* Failure rate was OK. Another successful round + completed. */ + ++info->n_rounds; + + /* If enough successful rounds are completed with + compression failure rate in control, decrease the + padding. */ + if (info->n_rounds >= ZIP_PAD_SUCCESSFUL_ROUND_LIMIT + && info->pad > 0) { + info->pad.fetch_sub(ZIP_PAD_INCR); + + info->n_rounds = 0; + + MONITOR_INC(MONITOR_PAD_DECREMENTS); + } + } +} + +/*********************************************************************//** +This function should be called whenever a page is successfully +compressed. Updates the compression padding information. */ +void +dict_index_zip_success( +/*===================*/ + dict_index_t* index) /*!< in/out: index to be updated. */ +{ + ulint zip_threshold = zip_failure_threshold_pct; + if (!zip_threshold) { + /* Disabled by user. */ + return; + } + + index->zip_pad.mutex.lock(); + ++index->zip_pad.success; + dict_index_zip_pad_update(&index->zip_pad, zip_threshold); + index->zip_pad.mutex.unlock(); +} + +/*********************************************************************//** +This function should be called whenever a page compression attempt +fails. Updates the compression padding information. */ +void +dict_index_zip_failure( +/*===================*/ + dict_index_t* index) /*!< in/out: index to be updated. */ +{ + ulint zip_threshold = zip_failure_threshold_pct; + if (!zip_threshold) { + /* Disabled by user. */ + return; + } + + index->zip_pad.mutex.lock(); + ++index->zip_pad.failure; + dict_index_zip_pad_update(&index->zip_pad, zip_threshold); + index->zip_pad.mutex.unlock(); +} + +/*********************************************************************//** +Return the optimal page size, for which page will likely compress.
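+The estimate is srv_page_size minus the current padding, but never below the minimum size implied by zip_pad_max.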
+@return page size beyond which page might not compress */ +ulint +dict_index_zip_pad_optimal_page_size( +/*=================================*/ + dict_index_t* index) /*!< in: index for which page size + is requested */ +{ + ulint pad; + ulint min_sz; + ulint sz; + + if (!zip_failure_threshold_pct) { + /* Disabled by user. */ + return(srv_page_size); + } + + pad = index->zip_pad.pad; + + ut_ad(pad < srv_page_size); + sz = srv_page_size - pad; + + /* Min size allowed by user. */ + ut_ad(zip_pad_max < 100); + min_sz = (srv_page_size * (100 - zip_pad_max)) / 100; + + return(ut_max(sz, min_sz)); +} + +/*************************************************************//** +Convert table flag to row format string. +@return row format name. */ +const char* +dict_tf_to_row_format_string( +/*=========================*/ + ulint table_flag) /*!< in: row format setting */ +{ + switch (dict_tf_get_rec_format(table_flag)) { + case REC_FORMAT_REDUNDANT: + return("ROW_TYPE_REDUNDANT"); + case REC_FORMAT_COMPACT: + return("ROW_TYPE_COMPACT"); + case REC_FORMAT_COMPRESSED: + return("ROW_TYPE_COMPRESSED"); + case REC_FORMAT_DYNAMIC: + return("ROW_TYPE_DYNAMIC"); + } + + ut_error; + return(0); +} diff --git a/storage/innobase/dict/dict0load.cc b/storage/innobase/dict/dict0load.cc new file mode 100644 index 00000000..f769839d --- /dev/null +++ b/storage/innobase/dict/dict0load.cc @@ -0,0 +1,3213 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2016, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file dict/dict0load.cc +Loads to the memory cache database object definitions +from dictionary tables + +Created 4/24/1996 Heikki Tuuri +*******************************************************/ + +#include "dict0load.h" + +#include "log.h" +#include "btr0pcur.h" +#include "btr0btr.h" +#include "dict0boot.h" +#include "dict0crea.h" +#include "dict0dict.h" +#include "dict0mem.h" +#include "dict0stats.h" +#include "fsp0file.h" +#include "fts0priv.h" +#include "mach0data.h" +#include "page0page.h" +#include "rem0cmp.h" +#include "srv0start.h" +#include "srv0srv.h" +#include "fts0opt.h" +#include "row0vers.h" + +/** Loads a table definition and also all its index definitions. + +Loads those foreign key constraints whose referenced table is already in +dictionary cache. If a foreign key constraint is not loaded, then the +referenced table is pushed into the output stack (fk_tables), if it is not +NULL. These tables must be subsequently loaded so that all the foreign +key constraints are loaded into memory. 
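+Callers are expected to keep loading the tables named in fk_tables until the stack is empty.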
+ +@param[in] name Table name in the db/tablename format +@param[in] ignore_err Error to be ignored when loading table + and its index definition +@param[out] fk_tables Related table names that must also be + loaded to ensure that all foreign key + constraints are loaded. +@return table, possibly with file_unreadable flag set +@retval nullptr if the table does not exist */ +static dict_table_t *dict_load_table_one(const span<const char> &name, + dict_err_ignore_t ignore_err, + dict_names_t &fk_tables); + +/** Load an index definition from a SYS_INDEXES record to dict_index_t. +@return error message +@retval NULL on success */ +static +const char* +dict_load_index_low( + byte* table_id, /*!< in/out: table id (8 bytes), + an "in" value if mtr + and "out" when !mtr */ + bool uncommitted, /*!< in: false=READ COMMITTED, + true=READ UNCOMMITTED */ + mem_heap_t* heap, /*!< in/out: temporary memory heap */ + const rec_t* rec, /*!< in: SYS_INDEXES record */ + mtr_t* mtr, /*!< in/out: mini-transaction, + or nullptr if a pre-allocated + *index is to be filled in */ + dict_table_t* table, /*!< in/out: table, or NULL */ + dict_index_t** index); /*!< out,own: index, or NULL */ + +/** Load a table column definition from a SYS_COLUMNS record to dict_table_t. +@param table table, or nullptr if the output will be in column +@param use_uncommitted 0=READ COMMITTED, 1=detect, 2=READ UNCOMMITTED +@param heap memory heap for temporary storage +@param column pointer to output buffer, or nullptr if table!=nullptr +@param table_id table identifier +@param col_name column name +@param rec SYS_COLUMNS record +@param mtr mini-transaction +@param nth_v_col nullptr, or pointer to a counter of virtual columns +@return error message +@retval nullptr on success */ +static const char *dict_load_column_low(dict_table_t *table, + unsigned use_uncommitted, + mem_heap_t *heap, dict_col_t *column, + table_id_t *table_id, + const char **col_name, + const rec_t *rec, + mtr_t *mtr, + ulint *nth_v_col); + +/** Load a virtual column "mapping" (to base columns) information +from a SYS_VIRTUAL record +@param[in,out] table table +@param[in] uncommitted false=READ COMMITTED, true=READ UNCOMMITTED +@param[in,out] column mapped base column's dict_column_t +@param[in,out] table_id table id +@param[in,out] pos virtual column position +@param[in,out] base_pos base column position +@param[in] rec SYS_VIRTUAL record +@return error message +@retval NULL on success */ +static +const char* +dict_load_virtual_low( + dict_table_t* table, + bool uncommitted, + dict_col_t** column, + table_id_t* table_id, + ulint* pos, + ulint* base_pos, + const rec_t* rec); + +/** Load an index field definition from a SYS_FIELDS record to dict_index_t.
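+If index is NULL, only the output dict_field_t and position are filled in.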
+@return error message +@retval NULL on success */ +static +const char* +dict_load_field_low( + byte* index_id, /*!< in/out: index id (8 bytes) + an "in" value if index != NULL + and "out" if index == NULL */ + bool uncommitted, /*!< in: false=READ COMMITTED, + true=READ UNCOMMITTED */ + dict_index_t* index, /*!< in/out: index, could be NULL + if we just populate a dict_field_t + struct with information from + a SYS_FIELDS record */ + dict_field_t* sys_field, /*!< out: dict_field_t to be + filled */ + ulint* pos, /*!< out: Field position */ + byte* last_index_id, /*!< in: last index id */ + mem_heap_t* heap, /*!< in/out: memory heap + for temporary storage */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + const rec_t* rec); /*!< in: SYS_FIELDS record */ + +#ifdef UNIV_DEBUG +/****************************************************************//** +Compare the name of an index column. +@return TRUE if the i'th column of index is 'name'. */ +static +ibool +name_of_col_is( +/*===========*/ + const dict_table_t* table, /*!< in: table */ + const dict_index_t* index, /*!< in: index */ + ulint i, /*!< in: index field offset */ + const char* name) /*!< in: name to compare to */ +{ + ulint tmp = dict_col_get_no(dict_field_get_col( + dict_index_get_nth_field( + index, i))); + + return(strcmp(name, dict_table_get_col_name(table, tmp)) == 0); +} +#endif /* UNIV_DEBUG */ + +/********************************************************************//** +This function gets the next system table record as it scans the table. +@return the next record if found, NULL if end of scan */ +static +const rec_t* +dict_getnext_system_low( +/*====================*/ + btr_pcur_t* pcur, /*!< in/out: persistent cursor to the + record*/ + mtr_t* mtr) /*!< in: the mini-transaction */ +{ + rec_t* rec = NULL; + + while (!rec) { + btr_pcur_move_to_next_user_rec(pcur, mtr); + + rec = btr_pcur_get_rec(pcur); + + if (!btr_pcur_is_on_user_rec(pcur)) { + /* end of index */ + btr_pcur_close(pcur); + + return(NULL); + } + } + + /* Get a record, let's save the position */ + btr_pcur_store_position(pcur, mtr); + + return(rec); +} + +/********************************************************************//** +This function opens a system table, and returns the first record. +@return first record of the system table */ +const rec_t* +dict_startscan_system( +/*==================*/ + btr_pcur_t* pcur, /*!< out: persistent cursor to + the record */ + mtr_t* mtr, /*!< in: the mini-transaction */ + dict_table_t* table) /*!< in: system table */ +{ + btr_pcur_init(pcur); + if (pcur->open_leaf(true, table->indexes.start, BTR_SEARCH_LEAF, mtr) != + DB_SUCCESS) + return nullptr; + const rec_t *rec; + do + rec= dict_getnext_system_low(pcur, mtr); + while (rec && rec_get_deleted_flag(rec, 0)); + return rec; +} + +/********************************************************************//** +This function gets the next system table record as it scans the table. 
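+Delete-marked records are skipped.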
+@return the next record if found, NULL if end of scan */ +const rec_t* +dict_getnext_system( +/*================*/ + btr_pcur_t* pcur, /*!< in/out: persistent cursor + to the record */ + mtr_t* mtr) /*!< in: the mini-transaction */ +{ + const rec_t *rec=nullptr; + if (pcur->restore_position(BTR_SEARCH_LEAF, mtr) != btr_pcur_t::CORRUPTED) + do + rec= dict_getnext_system_low(pcur, mtr); + while (rec && rec_get_deleted_flag(rec, 0)); + return rec; +} + +/********************************************************************//** +This function parses a SYS_INDEXES record and populates a dict_index_t +structure with the information from the record. For detailed information +about SYS_INDEXES fields, please refer to dict_boot() function. +@return error message, or NULL on success */ +const char* +dict_process_sys_indexes_rec( +/*=========================*/ + mem_heap_t* heap, /*!< in/out: heap memory */ + const rec_t* rec, /*!< in: current SYS_INDEXES rec */ + dict_index_t* index, /*!< out: index to be filled */ + table_id_t* table_id) /*!< out: index table id */ +{ + byte buf[8]; + + ut_d(index->is_dummy = true); + ut_d(index->in_instant_init = false); + + /* Parse the record, and get "dict_index_t" struct filled */ + const char *err_msg= dict_load_index_low(buf, false, heap, rec, + nullptr, nullptr, &index); + *table_id= mach_read_from_8(buf); + return err_msg; +} + +/********************************************************************//** +This function parses a SYS_COLUMNS record and populates a dict_column_t +structure with the information from the record. +@return error message, or NULL on success */ +const char* +dict_process_sys_columns_rec( +/*=========================*/ + mem_heap_t* heap, /*!< in/out: heap memory */ + const rec_t* rec, /*!< in: current SYS_COLUMNS rec */ + dict_col_t* column, /*!< out: dict_col_t to be filled */ + table_id_t* table_id, /*!< out: table id */ + const char** col_name, /*!< out: column name */ + ulint* nth_v_col) /*!< out: if virtual col, this is + record's sequence number */ +{ + const char* err_msg; + + /* Parse the record, and get "dict_col_t" struct filled */ + err_msg = dict_load_column_low(NULL, 0, heap, column, + table_id, col_name, rec, nullptr, + nth_v_col); + + return(err_msg); +} + +/** This function parses a SYS_VIRTUAL record and extracts virtual column +information +@param[in] rec current SYS_VIRTUAL rec +@param[in,out] table_id table id +@param[in,out] pos virtual column position +@param[in,out] base_pos base column position +@return error message, or NULL on success */ +const char* +dict_process_sys_virtual_rec( + const rec_t* rec, + table_id_t* table_id, + ulint* pos, + ulint* base_pos) +{ + return dict_load_virtual_low(nullptr, false, nullptr, table_id, + pos, base_pos, rec); +} + +/********************************************************************//** +This function parses a SYS_FIELDS record and populates a dict_field_t +structure with the information from the record.
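+The id of the index that the field belongs to is returned in index_id.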
+@return error message, or NULL on success */ +const char* +dict_process_sys_fields_rec( +/*========================*/ + mem_heap_t* heap, /*!< in/out: heap memory */ + const rec_t* rec, /*!< in: current SYS_FIELDS rec */ + dict_field_t* sys_field, /*!< out: dict_field_t to be + filled */ + ulint* pos, /*!< out: Field position */ + index_id_t* index_id, /*!< out: current index id */ + index_id_t last_id) /*!< in: previous index id */ +{ + byte buf[8]; + byte last_index_id[8]; + const char* err_msg; + + mach_write_to_8(last_index_id, last_id); + + err_msg = dict_load_field_low(buf, false, nullptr, sys_field, + pos, last_index_id, heap, nullptr, rec); + + *index_id = mach_read_from_8(buf); + + return(err_msg); + +} + +/********************************************************************//** +This function parses a SYS_FOREIGN record and populates a dict_foreign_t +structure with the information from the record. For detailed information +about SYS_FOREIGN fields, please refer to dict_load_foreign() function. +@return error message, or NULL on success */ +const char* +dict_process_sys_foreign_rec( +/*=========================*/ + mem_heap_t* heap, /*!< in/out: heap memory */ + const rec_t* rec, /*!< in: current SYS_FOREIGN rec */ + dict_foreign_t* foreign) /*!< out: dict_foreign_t struct + to be filled */ +{ + ulint len; + const byte* field; + + if (rec_get_deleted_flag(rec, 0)) { + return("delete-marked record in SYS_FOREIGN"); + } + + if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_FOREIGN) { + return("wrong number of columns in SYS_FOREIGN record"); + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN__ID, &len); + if (len == 0 || len == UNIV_SQL_NULL) { +err_len: + return("incorrect column length in SYS_FOREIGN"); + } + + /* This receives a dict_foreign_t* that points to a stack variable. + So dict_foreign_free(foreign) is not used as elsewhere. + Since the heap used here is freed elsewhere, foreign->heap + is not assigned. */ + foreign->id = mem_heap_strdupl(heap, (const char*) field, len); + + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_FOREIGN__DB_TRX_ID, &len); + if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_FOREIGN__DB_ROLL_PTR, &len); + if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + + /* The _lookup versions of the referenced and foreign table names + are not assigned since they are not used in this dict_foreign_t */ + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN__FOR_NAME, &len); + if (len == 0 || len == UNIV_SQL_NULL) { + goto err_len; + } + foreign->foreign_table_name = mem_heap_strdupl( + heap, (const char*) field, len); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN__REF_NAME, &len); + if (len == 0 || len == UNIV_SQL_NULL) { + goto err_len; + } + foreign->referenced_table_name = mem_heap_strdupl( + heap, (const char*) field, len); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN__N_COLS, &len); + if (len != 4) { + goto err_len; + } + uint32_t n_fields_and_type = mach_read_from_4(field); + + foreign->type = n_fields_and_type >> 24 & ((1U << 6) - 1); + foreign->n_fields = n_fields_and_type & dict_index_t::MAX_N_FIELDS; + + return(NULL); +} + +/********************************************************************//** +This function parses a SYS_FOREIGN_COLS record and extracts the necessary +information from the record and returns it to the caller.
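+All returned strings are duplicated into the supplied heap.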
+@return error message, or NULL on success */ +const char* +dict_process_sys_foreign_col_rec( +/*=============================*/ + mem_heap_t* heap, /*!< in/out: heap memory */ + const rec_t* rec, /*!< in: current SYS_FOREIGN_COLS rec */ + const char** name, /*!< out: foreign key constraint name */ + const char** for_col_name, /*!< out: referencing column name */ + const char** ref_col_name, /*!< out: referenced column name + in referenced table */ + ulint* pos) /*!< out: column position */ +{ + ulint len; + const byte* field; + + if (rec_get_deleted_flag(rec, 0)) { + return("delete-marked record in SYS_FOREIGN_COLS"); + } + + if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_FOREIGN_COLS) { + return("wrong number of columns in SYS_FOREIGN_COLS record"); + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__ID, &len); + if (len == 0 || len == UNIV_SQL_NULL) { +err_len: + return("incorrect column length in SYS_FOREIGN_COLS"); + } + *name = mem_heap_strdupl(heap, (char*) field, len); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__POS, &len); + if (len != 4) { + goto err_len; + } + *pos = mach_read_from_4(field); + + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__DB_TRX_ID, &len); + if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__DB_ROLL_PTR, &len); + if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__FOR_COL_NAME, &len); + if (len == 0 || len == UNIV_SQL_NULL) { + goto err_len; + } + *for_col_name = mem_heap_strdupl(heap, (char*) field, len); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__REF_COL_NAME, &len); + if (len == 0 || len == UNIV_SQL_NULL) { + goto err_len; + } + *ref_col_name = mem_heap_strdupl(heap, (char*) field, len); + + return(NULL); +} + +/** Check the validity of a SYS_TABLES record +Make sure the fields are the right length and that they +do not contain invalid contents. 
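+Only the record structure is validated here; the flag values themselves are checked separately.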
+@param[in] rec SYS_TABLES record +@return error message, or NULL on success */ +static +const char* +dict_sys_tables_rec_check( + const rec_t* rec) +{ + const byte* field; + ulint len; + + ut_ad(dict_sys.locked()); + + if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_TABLES) { + return("wrong number of columns in SYS_TABLES record"); + } + + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_TABLES__NAME, &len); + if (len == 0 || len == UNIV_SQL_NULL) { +err_len: + return("incorrect column length in SYS_TABLES"); + } + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_TABLES__DB_TRX_ID, &len); + if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_TABLES__DB_ROLL_PTR, &len); + if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + + rec_get_nth_field_offs_old(rec, DICT_FLD__SYS_TABLES__ID, &len); + if (len != 8) { + goto err_len; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__N_COLS, &len); + if (field == NULL || len != 4) { + goto err_len; + } + + rec_get_nth_field_offs_old(rec, DICT_FLD__SYS_TABLES__TYPE, &len); + if (len != 4) { + goto err_len; + } + + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_TABLES__MIX_ID, &len); + if (len != 8) { + goto err_len; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__MIX_LEN, &len); + if (field == NULL || len != 4) { + goto err_len; + } + + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_TABLES__CLUSTER_ID, &len); + if (len != UNIV_SQL_NULL) { + goto err_len; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__SPACE, &len); + if (field == NULL || len != 4) { + goto err_len; + } + + return(NULL); +} + +/** Check if SYS_TABLES.TYPE is valid +@param[in] type SYS_TABLES.TYPE +@param[in] not_redundant whether ROW_FORMAT=REDUNDANT is not used +@return whether the SYS_TABLES.TYPE value is valid */ +static +bool +dict_sys_tables_type_valid(ulint type, bool not_redundant) +{ + /* The DATA_DIRECTORY flag can be assigned fully independently + of all other persistent table flags. */ + type &= ~DICT_TF_MASK_DATA_DIR; + + if (type == 1) { + return(true); /* ROW_FORMAT=REDUNDANT or ROW_FORMAT=COMPACT */ + } + + if (!(type & 1)) { + /* For ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT, + SYS_TABLES.TYPE=1. Else, it is the same as + dict_table_t::flags, and the least significant bit + would be set. So, the bit never can be 0. */ + return(false); + } + + if (!not_redundant) { + /* SYS_TABLES.TYPE must be 1 or 1|DICT_TF_MASK_NO_ROLLBACK + for ROW_FORMAT=REDUNDANT. */ + return !(type & ~(1U | DICT_TF_MASK_NO_ROLLBACK)); + } + + if (type >= 1U << DICT_TF_POS_UNUSED) { + /* Some unknown bits are set. */ + return(false); + } + + return(dict_tf_is_valid_not_redundant(type)); +} + +/** Convert SYS_TABLES.TYPE to dict_table_t::flags. +@param[in] type SYS_TABLES.TYPE +@param[in] not_redundant whether ROW_FORMAT=REDUNDANT is not used +@return table flags */ +static +uint32_t dict_sys_tables_type_to_tf(uint32_t type, bool not_redundant) +{ + ut_ad(dict_sys_tables_type_valid(type, not_redundant)); + uint32_t flags = not_redundant ? 1 : 0; + + /* ZIP_SSIZE, ATOMIC_BLOBS, DATA_DIR, PAGE_COMPRESSION, + PAGE_COMPRESSION_LEVEL are the same. 
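+ They occupy the same bit positions in SYS_TABLES.TYPE and in dict_table_t::flags, so those bits can be copied over directly.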
*/ + flags |= type & (DICT_TF_MASK_ZIP_SSIZE + | DICT_TF_MASK_ATOMIC_BLOBS + | DICT_TF_MASK_DATA_DIR + | DICT_TF_MASK_PAGE_COMPRESSION + | DICT_TF_MASK_PAGE_COMPRESSION_LEVEL + | DICT_TF_MASK_NO_ROLLBACK); + + ut_ad(dict_tf_is_valid(flags)); + return(flags); +} + +/** Outcome of dict_sys_tables_rec_read() */ +enum table_read_status { READ_OK= 0, READ_ERROR, READ_NOT_FOUND }; + +/** Read and return 5 integer fields from a SYS_TABLES record. +@param[in] rec A record of SYS_TABLES +@param[in] uncommitted true=use READ UNCOMMITTED, false=READ COMMITTED +@param[in] mtr mini-transaction +@param[out] table_id Pointer to the table_id for this table +@param[out] space_id Pointer to the space_id for this table +@param[out] n_cols Pointer to number of columns for this table. +@param[out] flags Pointer to table flags +@param[out] flags2 Pointer to table flags2 +@param[out] trx_id DB_TRX_ID of the committed SYS_TABLES record, + or nullptr to perform READ UNCOMMITTED +@return whether the record was read correctly */ +MY_ATTRIBUTE((warn_unused_result)) +static +table_read_status +dict_sys_tables_rec_read( + const rec_t* rec, + bool uncommitted, + mtr_t* mtr, + table_id_t* table_id, + uint32_t* space_id, + uint32_t* n_cols, + uint32_t* flags, + uint32_t* flags2, + trx_id_t* trx_id) +{ + const byte* field; + ulint len; + mem_heap_t* heap = nullptr; + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__DB_TRX_ID, &len); + ut_ad(len == 6 || len == UNIV_SQL_NULL); + trx_id_t id = len == 6 ? trx_read_trx_id(field) : 0; + if (id && !uncommitted && trx_sys.find(nullptr, id, false)) { + const auto savepoint = mtr->get_savepoint(); + heap = mem_heap_create(1024); + dict_index_t* index = UT_LIST_GET_FIRST( + dict_sys.sys_tables->indexes); + rec_offs* offsets = rec_get_offsets( + rec, index, nullptr, true, ULINT_UNDEFINED, &heap); + const rec_t* old_vers; + row_vers_build_for_semi_consistent_read( + nullptr, rec, mtr, index, &offsets, &heap, + heap, &old_vers, nullptr); + mtr->rollback_to_savepoint(savepoint); + rec = old_vers; + if (!rec) { + mem_heap_free(heap); + return READ_NOT_FOUND; + } + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__DB_TRX_ID, &len); + if (UNIV_UNLIKELY(len != 6)) { + mem_heap_free(heap); + return READ_ERROR; + } + id = trx_read_trx_id(field); + } + + if (rec_get_deleted_flag(rec, 0)) { + ut_ad(id); + if (trx_id) { + return READ_NOT_FOUND; + } + } + + if (trx_id) { + *trx_id = id; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__ID, &len); + ut_ad(len == 8); + *table_id = static_cast<table_id_t>(mach_read_from_8(field)); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__SPACE, &len); + ut_ad(len == 4); + *space_id = mach_read_from_4(field); + + /* Read the 4 byte flags from the TYPE field */ + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__TYPE, &len); + ut_a(len == 4); + uint32_t type = mach_read_from_4(field); + + /* Handle MDEV-12873 InnoDB SYS_TABLES.TYPE incompatibility + for PAGE_COMPRESSED=YES in MariaDB 10.2.2 to 10.2.6. + + MariaDB 10.2.2 introduced the SHARED_SPACE flag from MySQL 5.7, + shifting the flags PAGE_COMPRESSION, PAGE_COMPRESSION_LEVEL, + ATOMIC_WRITES (repurposed to NO_ROLLBACK in 10.3.1) by one bit. + The SHARED_SPACE flag would always + be written as 0 by MariaDB, because MariaDB does not support + CREATE TABLESPACE or CREATE TABLE...TABLESPACE for InnoDB. + + So, instead of the bits AALLLLCxxxxxxx we would have + AALLLLC0xxxxxxx if the table was created with MariaDB 10.2.2 + to 10.2.6.
(AA=ATOMIC_WRITES, LLLL=PAGE_COMPRESSION_LEVEL, + C=PAGE_COMPRESSED, xxxxxxx=7 bits that were not moved.) + + The case LLLLC=00000 is not a problem. The problem is the case + AALLLL10DB00001 where D is the (mostly ignored) DATA_DIRECTORY + flag and B is the ATOMIC_BLOBS flag (1 for ROW_FORMAT=DYNAMIC + and 0 for ROW_FORMAT=COMPACT in this case). Other low-order + bits must be so, because PAGE_COMPRESSED=YES is only allowed + for ROW_FORMAT=DYNAMIC and ROW_FORMAT=COMPACT, not for + ROW_FORMAT=REDUNDANT or ROW_FORMAT=COMPRESSED. + + Starting with MariaDB 10.2.4, the flags would be + 00LLLL10DB00001, because ATOMIC_WRITES is always written as 0. + + We will concentrate on the PAGE_COMPRESSION_LEVEL and + PAGE_COMPRESSED=YES. PAGE_COMPRESSED=NO implies + PAGE_COMPRESSION_LEVEL=0, and in that case all the affected + bits will be 0. For PAGE_COMPRESSED=YES, the values 1..9 are + allowed for PAGE_COMPRESSION_LEVEL. That is, we must interpret + the bits AALLLL10DB00001 as AALLLL1DB00001. + + If someone created a table in MariaDB 10.2.2 or 10.2.3 with + the attribute ATOMIC_WRITES=OFF (value 2) and without + PAGE_COMPRESSED=YES or PAGE_COMPRESSION_LEVEL, that should be + rejected. The value ATOMIC_WRITES=ON (1) would look like + ATOMIC_WRITES=OFF, but it would be ignored starting with + MariaDB 10.2.4. */ + compile_time_assert(DICT_TF_POS_PAGE_COMPRESSION == 7); + compile_time_assert(DICT_TF_POS_UNUSED == 14); + + if ((type & 0x19f) != 0x101) { + /* The table cannot have been created with MariaDB + 10.2.2 to 10.2.6, because they would write the + low-order bits of SYS_TABLES.TYPE as 0b10xx00001 for + PAGE_COMPRESSED=YES. No adjustment is applicable. */ + } else if (type >= 3 << 13) { + /* 10.2.2 and 10.2.3 write ATOMIC_WRITES less than 3, + and no other flags above that can be set for the + SYS_TABLES.TYPE to be in the 10.2.2..10.2.6 format. + This would in any case be an invalid format for 10.2 and + earlier releases. */ + ut_ad(!dict_sys_tables_type_valid(type, true)); + } else { + /* SYS_TABLES.TYPE is of the form AALLLL10DB00001. We + must still validate that the LLLL bits are between 1 + and 9 before we can discard the extraneous 0 bit. */ + ut_ad(!DICT_TF_GET_PAGE_COMPRESSION(type)); + + if ((((type >> 9) & 0xf) - 1) < 9) { + ut_ad(DICT_TF_GET_PAGE_COMPRESSION_LEVEL(type) & 1); + + type = (type & 0x7fU) | (type >> 1 & ~0x7fU); + + ut_ad(DICT_TF_GET_PAGE_COMPRESSION(type)); + ut_ad(DICT_TF_GET_PAGE_COMPRESSION_LEVEL(type) >= 1); + ut_ad(DICT_TF_GET_PAGE_COMPRESSION_LEVEL(type) <= 9); + } else { + ut_ad(!dict_sys_tables_type_valid(type, true)); + } + } + + /* The low order bit of SYS_TABLES.TYPE is always set to 1. But in + dict_table_t::flags the low order bit is used to determine if the + ROW_FORMAT=REDUNDANT (0) or anything else (1). + Read the 4 byte N_COLS field and look at the high order bit. It + should be set for COMPACT and later. It should not be set for + REDUNDANT. */ + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__N_COLS, &len); + ut_a(len == 4); + *n_cols = mach_read_from_4(field); + + const bool not_redundant = 0 != (*n_cols & DICT_N_COLS_COMPACT); + + if (!dict_sys_tables_type_valid(type, not_redundant)) { + sql_print_error("InnoDB: Table %.*s in InnoDB" + " data dictionary contains invalid flags."
+ " SYS_TABLES.TYPE=" UINT32PF + " SYS_TABLES.N_COLS=" UINT32PF, + int(rec_get_field_start_offs(rec, 1)), rec, + type, *n_cols); +err_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return READ_ERROR; + } + + *flags = dict_sys_tables_type_to_tf(type, not_redundant); + + /* For tables created before MySQL 4.1, there may be + garbage in SYS_TABLES.MIX_LEN where flags2 are found. Such tables + would always be in ROW_FORMAT=REDUNDANT which do not have the + high bit set in n_cols, and flags would be zero. + MySQL 4.1 was the first version to support innodb_file_per_table, + that is, *space_id != 0. */ + if (not_redundant || *space_id != 0 || *n_cols & DICT_N_COLS_COMPACT + || fil_system.sys_space->full_crc32()) { + + /* Get flags2 from SYS_TABLES.MIX_LEN */ + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLES__MIX_LEN, &len); + *flags2 = mach_read_from_4(field); + + if (!dict_tf2_is_valid(*flags, *flags2)) { + sql_print_error("InnoDB: Table %.*s in InnoDB" + " data dictionary" + " contains invalid flags." + " SYS_TABLES.TYPE=" UINT32PF + " SYS_TABLES.MIX_LEN=" UINT32PF, + int(rec_get_field_start_offs(rec, 1)), + rec, + type, *flags2); + goto err_exit; + } + + /* DICT_TF2_FTS will be set when indexes are being loaded */ + *flags2 &= ~DICT_TF2_FTS; + + /* Now that we have used this bit, unset it. */ + *n_cols &= ~DICT_N_COLS_COMPACT; + } else { + *flags2 = 0; + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + return READ_OK; +} + +/** Check each tablespace found in the data dictionary. +Then look at each table defined in SYS_TABLES that has a space_id > 0 +to find all the file-per-table tablespaces. + +In a crash recovery we already have some tablespace objects created from +processing the REDO log. We will compare the +space_id information in the data dictionary to what we find in the +tablespace file. In addition, more validation will be done if recovery +was needed and force_recovery is not set. + +We also scan the biggest space id, and store it to fil_system. */ +void dict_check_tablespaces_and_store_max_id() +{ + uint32_t max_space_id = 0; + btr_pcur_t pcur; + mtr_t mtr; + + DBUG_ENTER("dict_check_tablespaces_and_store_max_id"); + + mtr.start(); + + dict_sys.lock(SRW_LOCK_CALL); + + for (const rec_t *rec = dict_startscan_system(&pcur, &mtr, + dict_sys.sys_tables); + rec; rec = dict_getnext_system_low(&pcur, &mtr)) { + ulint len; + table_id_t table_id; + uint32_t space_id; + uint32_t n_cols; + uint32_t flags; + uint32_t flags2; + + /* If a table record is not useable, ignore it and continue + on to the next record. Error messages were logged. */ + if (dict_sys_tables_rec_check(rec)) { + continue; + } + + const char *field = reinterpret_cast( + rec_get_nth_field_old(rec, DICT_FLD__SYS_TABLES__NAME, + &len)); + + DBUG_PRINT("dict_check_sys_tables", + ("name: %*.s", static_cast(len), field)); + + if (dict_sys_tables_rec_read(rec, false, + &mtr, &table_id, &space_id, + &n_cols, &flags, &flags2, nullptr) + != READ_OK + || space_id == TRX_SYS_SPACE) { + continue; + } + + if (flags2 & DICT_TF2_DISCARDED) { + sql_print_information("InnoDB: Ignoring tablespace" + " for %.*s because " + "the DISCARD flag is set", + static_cast(len), field); + continue; + } + + /* For tables or partitions using .ibd files, the flag + DICT_TF2_USE_FILE_PER_TABLE was not set in MIX_LEN + before MySQL 5.6.5. The flag should not have been + introduced in persistent storage. 
+ +/** Check each tablespace found in the data dictionary. +Then look at each table defined in SYS_TABLES that has a space_id > 0 +to find all the file-per-table tablespaces. + +In a crash recovery we already have some tablespace objects created from +processing the REDO log. We will compare the +space_id information in the data dictionary to what we find in the +tablespace file. In addition, more validation will be done if recovery +was needed and force_recovery is not set. + +We also scan the biggest space id, and store it to fil_system. */ +void dict_check_tablespaces_and_store_max_id() +{ + uint32_t max_space_id = 0; + btr_pcur_t pcur; + mtr_t mtr; + + DBUG_ENTER("dict_check_tablespaces_and_store_max_id"); + + mtr.start(); + + dict_sys.lock(SRW_LOCK_CALL); + + for (const rec_t *rec = dict_startscan_system(&pcur, &mtr, + dict_sys.sys_tables); + rec; rec = dict_getnext_system_low(&pcur, &mtr)) { + ulint len; + table_id_t table_id; + uint32_t space_id; + uint32_t n_cols; + uint32_t flags; + uint32_t flags2; + + /* If a table record is not usable, ignore it and continue + on to the next record. Error messages were logged. */ + if (dict_sys_tables_rec_check(rec)) { + continue; + } + + const char *field = reinterpret_cast<const char*>( + rec_get_nth_field_old(rec, DICT_FLD__SYS_TABLES__NAME, + &len)); + + DBUG_PRINT("dict_check_sys_tables", + ("name: %.*s", static_cast<int>(len), field)); + + if (dict_sys_tables_rec_read(rec, false, + &mtr, &table_id, &space_id, + &n_cols, &flags, &flags2, nullptr) + != READ_OK + || space_id == TRX_SYS_SPACE) { + continue; + } + + if (flags2 & DICT_TF2_DISCARDED) { + sql_print_information("InnoDB: Ignoring tablespace" + " for %.*s because " + "the DISCARD flag is set", + static_cast<int>(len), field); + continue; + } + + /* For tables or partitions using .ibd files, the flag + DICT_TF2_USE_FILE_PER_TABLE was not set in MIX_LEN + before MySQL 5.6.5. The flag should not have been + introduced in persistent storage. MariaDB will keep + setting the flag when writing SYS_TABLES entries for + newly created or rebuilt tables or partitions, but + will otherwise ignore the flag. */ + + if (fil_space_for_table_exists_in_mem(space_id, flags)) { + continue; + } + + const span<const char> name{field, len}; + + char* filepath = fil_make_filepath(nullptr, name, + IBD, false); + + const bool not_dropped{!rec_get_deleted_flag(rec, 0)}; + + /* Check that the .ibd file exists. */ + if (fil_ibd_open(not_dropped, FIL_TYPE_TABLESPACE, + space_id, dict_tf_to_fsp_flags(flags), + name, filepath)) { + } else if (!not_dropped) { + } else if (srv_operation == SRV_OPERATION_NORMAL + && srv_start_after_restore + && srv_force_recovery < SRV_FORCE_NO_BACKGROUND + && dict_table_t::is_temporary_name(filepath)) { + /* Mariabackup will not copy files whose + names start with #sql-. This table ought to + be dropped by drop_garbage_tables_after_restore() + a little later. */ + } else { + sql_print_warning("InnoDB: Ignoring tablespace for" + " %.*s because it" + " could not be opened.", + static_cast<int>(len), field); + } + + max_space_id = ut_max(max_space_id, space_id); + + ut_free(filepath); + } + + mtr.commit(); + + fil_set_max_space_id_if_bigger(max_space_id); + + dict_sys.unlock(); + + DBUG_VOID_RETURN; +} + +/** Error message for a delete-marked record in dict_load_column_low() */ +static const char *dict_load_column_del= "delete-marked record in SYS_COLUMNS"; +/** Error message for a missing record in dict_load_column_low() */ +static const char *dict_load_column_none= "SYS_COLUMNS record not found"; +/** Message for incomplete instant ADD/DROP in dict_load_column_low() */ +static const char *dict_load_column_instant= "incomplete instant ADD/DROP"; + +/** Load a table column definition from a SYS_COLUMNS record to dict_table_t.
+@param table table, or nullptr if the output will be in column +@param use_uncommitted 0=READ COMMITTED, 1=detect, 2=READ UNCOMMITTED +@param heap memory heap for temporary storage +@param column pointer to output buffer, or nullptr if table!=nullptr +@param table_id table identifier +@param col_name column name +@param rec SYS_COLUMNS record +@param mtr mini-transaction +@param nth_v_col nullptr, or pointer to a counter of virtual columns +@return error message +@retval nullptr on success */ +static const char *dict_load_column_low(dict_table_t *table, + unsigned use_uncommitted, + mem_heap_t *heap, dict_col_t *column, + table_id_t *table_id, + const char **col_name, + const rec_t *rec, + mtr_t *mtr, + ulint *nth_v_col) +{ + char* name; + const byte* field; + ulint len; + ulint mtype; + ulint prtype; + ulint col_len; + ulint pos; + ulint num_base; + + ut_ad(!table == !!column); + + if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_COLUMNS) { + return("wrong number of columns in SYS_COLUMNS record"); + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_COLUMNS__TABLE_ID, &len); + if (len != 8) { +err_len: + return("incorrect column length in SYS_COLUMNS"); + } + + if (table_id) { + *table_id = mach_read_from_8(field); + } else if (table->id != mach_read_from_8(field)) { + return dict_load_column_none; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_COLUMNS__POS, &len); + if (len != 4) { + goto err_len; + } + + pos = mach_read_from_4(field); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_COLUMNS__DB_TRX_ID, &len); + if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + + const trx_id_t trx_id = trx_read_trx_id(field); + + if (trx_id && mtr && use_uncommitted < 2 + && trx_sys.find(nullptr, trx_id, false)) { + if (use_uncommitted) { + return dict_load_column_instant; + } + const auto savepoint = mtr->get_savepoint(); + dict_index_t* index = UT_LIST_GET_FIRST( + dict_sys.sys_columns->indexes); + rec_offs* offsets = rec_get_offsets( + rec, index, nullptr, true, ULINT_UNDEFINED, &heap); + const rec_t* old_vers; + row_vers_build_for_semi_consistent_read( + nullptr, rec, mtr, index, &offsets, &heap, + heap, &old_vers, nullptr); + mtr->rollback_to_savepoint(savepoint); + rec = old_vers; + if (!old_vers) { + return dict_load_column_none; + } + ut_ad(!rec_get_deleted_flag(rec, 0)); + } + + if (rec_get_deleted_flag(rec, 0)) { + ut_ad(trx_id); + return dict_load_column_del; + } + + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_COLUMNS__DB_ROLL_PTR, &len); + if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_COLUMNS__NAME, &len); + if (len == 0 || len == UNIV_SQL_NULL) { + goto err_len; + } + + *col_name = name = mem_heap_strdupl(heap, (const char*) field, len); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_COLUMNS__MTYPE, &len); + if (len != 4) { + goto err_len; + } + + mtype = mach_read_from_4(field); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_COLUMNS__PRTYPE, &len); + if (len != 4) { + goto err_len; + } + prtype = mach_read_from_4(field); + + if (dtype_get_charset_coll(prtype) == 0 + && dtype_is_string_type(mtype)) { + /* The table was created with < 4.1.2. */ + + if (dtype_is_binary_string_type(mtype, prtype)) { + /* Use the binary collation for + string columns of binary type. */ + + prtype = dtype_form_prtype( + prtype, + DATA_MYSQL_BINARY_CHARSET_COLL); + } else { + /* Use the default charset for + other than binary columns. 
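+ + (For illustration: a non-binary VARCHAR column written by a + pre-4.1.2 server carries no collation in its PRTYPE, so it is + loaded as if it had been created with + data_mysql_default_charset_coll; in a stock build that is the + latin1 default collation.)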
*/ + + prtype = dtype_form_prtype( + prtype, + data_mysql_default_charset_coll); + } + } + + if (table && table->n_def != pos && !(prtype & DATA_VIRTUAL)) { + return("SYS_COLUMNS.POS mismatch"); + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_COLUMNS__LEN, &len); + if (len != 4) { + goto err_len; + } + col_len = mach_read_from_4(field); + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_COLUMNS__PREC, &len); + if (len != 4) { + goto err_len; + } + num_base = mach_read_from_4(field); + + if (table) { + if (prtype & DATA_VIRTUAL) { +#ifdef UNIV_DEBUG + dict_v_col_t* vcol = +#endif + dict_mem_table_add_v_col( + table, heap, name, mtype, + prtype, col_len, + dict_get_v_col_mysql_pos(pos), num_base); + ut_ad(vcol->v_pos == dict_get_v_col_pos(pos)); + } else { + ut_ad(num_base == 0); + dict_mem_table_add_col(table, heap, name, mtype, + prtype, col_len); + } + + if (trx_id > table->def_trx_id) { + table->def_trx_id = trx_id; + } + } else { + dict_mem_fill_column_struct(column, pos, mtype, + prtype, col_len); + } + + /* Report the virtual column number */ + if ((prtype & DATA_VIRTUAL) && nth_v_col != NULL) { + *nth_v_col = dict_get_v_col_pos(pos); + } + + return(NULL); +} + +/** Error message for a delete-marked record in dict_load_virtual_low() */ +static const char *dict_load_virtual_del= "delete-marked record in SYS_VIRTUAL"; +static const char *dict_load_virtual_none= "SYS_VIRTUAL record not found"; + +/** Load a virtual column "mapping" (to base columns) information +from a SYS_VIRTUAL record +@param[in,out] table table +@param[in] uncommitted false=READ COMMITTED, true=READ UNCOMMITTED +@param[in,out] column mapped base column's dict_column_t +@param[in,out] table_id table id +@param[in,out] pos virtual column position +@param[in,out] base_pos base column position +@param[in] rec SYS_VIRTUAL record +@return error message +@retval NULL on success */ +static +const char* +dict_load_virtual_low( + dict_table_t* table, + bool uncommitted, + dict_col_t** column, + table_id_t* table_id, + ulint* pos, + ulint* base_pos, + const rec_t* rec) +{ + const byte* field; + ulint len; + ulint base; + + if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_VIRTUAL) { + return("wrong number of columns in SYS_VIRTUAL record"); + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_VIRTUAL__TABLE_ID, &len); + if (len != 8) { +err_len: + return("incorrect column length in SYS_VIRTUAL"); + } + + if (table_id != NULL) { + *table_id = mach_read_from_8(field); + } else if (table->id != mach_read_from_8(field)) { + return dict_load_virtual_none; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_VIRTUAL__POS, &len); + if (len != 4) { + goto err_len; + } + + if (pos != NULL) { + *pos = mach_read_from_4(field); + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_VIRTUAL__BASE_POS, &len); + if (len != 4) { + goto err_len; + } + + base = mach_read_from_4(field); + + if (base_pos != NULL) { + *base_pos = base; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_VIRTUAL__DB_TRX_ID, &len); + if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_VIRTUAL__DB_ROLL_PTR, &len); + if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + + const trx_id_t trx_id = trx_read_trx_id(field); + + if (trx_id && column && !uncommitted + && trx_sys.find(nullptr, trx_id, false)) { + if (!rec_get_deleted_flag(rec, 0)) { + return dict_load_virtual_none; + } + } else if (rec_get_deleted_flag(rec, 0)) { + 
ut_ad(trx_id != 0); + return dict_load_virtual_del; + } + + if (column != NULL) { + *column = dict_table_get_nth_col(table, base); + } + + return(NULL); +} + +/** Load the definitions for table columns. +@param table table +@param use_uncommitted 0=READ COMMITTED, 1=detect, 2=READ UNCOMMITTED +@param heap memory heap for temporary storage +@return error code +@retval DB_SUCCESS on success +@retval DB_SUCCESS_LOCKED_REC on success if use_uncommitted=1 +and instant ADD/DROP/reorder was detected */ +MY_ATTRIBUTE((nonnull, warn_unused_result)) +static dberr_t dict_load_columns(dict_table_t *table, unsigned use_uncommitted, + mem_heap_t *heap) +{ + btr_pcur_t pcur; + mtr_t mtr; + ulint n_skipped = 0; + + ut_ad(dict_sys.locked()); + + mtr.start(); + + dict_index_t* sys_index = dict_sys.sys_columns->indexes.start; + ut_ad(!dict_sys.sys_columns->not_redundant()); + + ut_ad(name_of_col_is(dict_sys.sys_columns, sys_index, + DICT_FLD__SYS_COLUMNS__NAME, "NAME")); + ut_ad(name_of_col_is(dict_sys.sys_columns, sys_index, + DICT_FLD__SYS_COLUMNS__PREC, "PREC")); + + dfield_t dfield; + dtuple_t tuple{ + 0,1,1,&dfield,0,nullptr +#ifdef UNIV_DEBUG + , DATA_TUPLE_MAGIC_N +#endif + }; + byte table_id[8]; + mach_write_to_8(table_id, table->id); + dfield_set_data(&dfield, table_id, 8); + dict_index_copy_types(&tuple, sys_index, 1); + pcur.btr_cur.page_cur.index = sys_index; + + dberr_t err = btr_pcur_open_on_user_rec(&tuple, + BTR_SEARCH_LEAF, &pcur, &mtr); + if (err != DB_SUCCESS) { + goto func_exit; + } + + ut_ad(table->n_t_cols == static_cast<ulint>( + table->n_cols) + static_cast<ulint>(table->n_v_cols)); + + for (ulint i = 0; + i + DATA_N_SYS_COLS < table->n_t_cols + n_skipped; + i++) { + const char* err_msg; + const char* name = NULL; + ulint nth_v_col = ULINT_UNDEFINED; + const rec_t* rec = btr_pcur_get_rec(&pcur); + + err_msg = btr_pcur_is_on_user_rec(&pcur) + ? dict_load_column_low(table, use_uncommitted, + heap, NULL, NULL, + &name, rec, &mtr, &nth_v_col) + : dict_load_column_none; + + if (!err_msg) { + } else if (err_msg == dict_load_column_del) { + n_skipped++; + goto next_rec; + } else if (err_msg == dict_load_column_instant) { + err = DB_SUCCESS_LOCKED_REC; + goto func_exit; + } else if (err_msg == dict_load_column_none + && strstr(table->name.m_name, + "/" TEMP_FILE_PREFIX_INNODB)) { + break; + } else { + ib::error() << err_msg << " for table " << table->name; + err = DB_CORRUPTION; + goto func_exit; + } + + /* Note: Currently we have one DOC_ID column that is + shared by all FTS indexes on a table, and only a non-virtual + column can be used for a FULLTEXT index */ + if (innobase_strcasecmp(name, + FTS_DOC_ID_COL_NAME) == 0 + && nth_v_col == ULINT_UNDEFINED) { + dict_col_t* col; + /* As part of normal loading of tables the + table->flag is not set for tables with FTS + till after the FTS indexes are loaded. So we + create the fts_t instance here if there isn't + one already created. + + This case does not arise for table create as + the flag is set before the table is created.
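+ + (Illustration: for CREATE TABLE t (a TEXT, FTS_DOC_ID BIGINT + UNSIGNED NOT NULL), the loaded column's PRTYPE carries + DATA_FTS_DOC_ID, so the code below sets + DICT_TF2_FTS_HAS_DOC_ID and records the column position in + table->fts->doc_col.)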
*/ + if (table->fts == NULL) { + table->fts = fts_create(table); + table->fts->cache = fts_cache_create(table); + DICT_TF2_FLAG_SET(table, DICT_TF2_FTS_AUX_HEX_NAME); + } + + ut_a(table->fts->doc_col == ULINT_UNDEFINED); + + col = dict_table_get_nth_col(table, i - n_skipped); + + ut_ad(col->len == sizeof(doc_id_t)); + + if (col->prtype & DATA_FTS_DOC_ID) { + DICT_TF2_FLAG_SET( + table, DICT_TF2_FTS_HAS_DOC_ID); + DICT_TF2_FLAG_UNSET( + table, DICT_TF2_FTS_ADD_DOC_ID); + } + + table->fts->doc_col = i - n_skipped; + } +next_rec: + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + } + +func_exit: + mtr.commit(); + return err; +} + +/** Loads SYS_VIRTUAL info for one virtual column +@param table table definition +@param uncommitted false=READ COMMITTED, true=READ UNCOMMITTED +@param nth_v_col virtual column position */ +MY_ATTRIBUTE((nonnull, warn_unused_result)) +static +dberr_t +dict_load_virtual_col(dict_table_t *table, bool uncommitted, ulint nth_v_col) +{ + const dict_v_col_t* v_col = dict_table_get_nth_v_col(table, nth_v_col); + + if (v_col->num_base == 0) { + return DB_SUCCESS; + } + + dict_index_t* sys_virtual_index; + btr_pcur_t pcur; + mtr_t mtr; + + ut_ad(dict_sys.locked()); + + mtr.start(); + + sys_virtual_index = dict_sys.sys_virtual->indexes.start; + ut_ad(!dict_sys.sys_virtual->not_redundant()); + + ut_ad(name_of_col_is(dict_sys.sys_virtual, sys_virtual_index, + DICT_FLD__SYS_VIRTUAL__POS, "POS")); + + dfield_t dfield[2]; + dtuple_t tuple{ + 0,2,2,dfield,0,nullptr +#ifdef UNIV_DEBUG + , DATA_TUPLE_MAGIC_N +#endif + }; + byte table_id[8], vcol_pos[4]; + mach_write_to_8(table_id, table->id); + dfield_set_data(&dfield[0], table_id, 8); + mach_write_to_4(vcol_pos, + dict_create_v_col_pos(nth_v_col, v_col->m_col.ind)); + dfield_set_data(&dfield[1], vcol_pos, 4); + + dict_index_copy_types(&tuple, sys_virtual_index, 2); + pcur.btr_cur.page_cur.index = sys_virtual_index; + + dberr_t err = btr_pcur_open_on_user_rec(&tuple, + BTR_SEARCH_LEAF, &pcur, &mtr); + if (err != DB_SUCCESS) { + goto func_exit; + } + + for (ulint i = 0, skipped = 0; + i < unsigned{v_col->num_base} + skipped; i++) { + ulint pos; + const char* err_msg + = btr_pcur_is_on_user_rec(&pcur) + ? dict_load_virtual_low(table, uncommitted, + &v_col->base_col[i - skipped], + NULL, + &pos, NULL, + btr_pcur_get_rec(&pcur)) + : dict_load_virtual_none; + + if (!err_msg) { + ut_ad(pos == mach_read_from_4(vcol_pos)); + } else if (err_msg == dict_load_virtual_del) { + skipped++; + } else if (err_msg == dict_load_virtual_none + && strstr(table->name.m_name, + "/" TEMP_FILE_PREFIX_INNODB)) { + break; + } else { + ib::error() << err_msg << " for table " << table->name; + err = DB_CORRUPTION; + break; + } + + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + } + +func_exit: + mtr.commit(); + return err; +} + +/** Loads info from SYS_VIRTUAL for virtual columns. +@param table table definition +@param uncommitted false=READ COMMITTED, true=READ UNCOMMITTED */ +MY_ATTRIBUTE((nonnull, warn_unused_result)) +static dberr_t dict_load_virtual(dict_table_t *table, bool uncommitted) +{ + for (ulint i= 0; i < table->n_v_cols; i++) + if (dberr_t err= dict_load_virtual_col(table, uncommitted, i)) + return err; + return DB_SUCCESS; +} + +/** Error message for a delete-marked record in dict_load_field_low() */ +static const char *dict_load_field_del= "delete-marked record in SYS_FIELDS"; + +static const char *dict_load_field_none= "SYS_FIELDS record not found"; + +/** Load an index field definition from a SYS_FIELDS record to dict_index_t. 
+@return error message +@retval NULL on success */ +static +const char* +dict_load_field_low( + byte* index_id, /*!< in/out: index id (8 bytes) + an "in" value if index != NULL + and "out" if index == NULL */ + bool uncommitted, /*!< in: false=READ COMMITTED, + true=READ UNCOMMITTED */ + dict_index_t* index, /*!< in/out: index, could be NULL + if we just populate a dict_field_t + struct with information from + a SYS_FIELDS record */ + dict_field_t* sys_field, /*!< out: dict_field_t to be + filled */ + ulint* pos, /*!< out: Field position */ + byte* last_index_id, /*!< in: last index id */ + mem_heap_t* heap, /*!< in/out: memory heap + for temporary storage */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + const rec_t* rec) /*!< in: SYS_FIELDS record */ +{ + const byte* field; + ulint len; + unsigned pos_and_prefix_len; + unsigned prefix_len; + bool descending; + bool first_field; + ulint position; + + /* Either index or sys_field is supplied, not both */ + ut_ad((!index) != (!sys_field)); + ut_ad((!index) == !mtr); + + if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_FIELDS) { + return("wrong number of columns in SYS_FIELDS record"); + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FIELDS__INDEX_ID, &len); + if (len != 8) { +err_len: + return("incorrect column length in SYS_FIELDS"); + } + + if (!index) { + ut_a(last_index_id); + memcpy(index_id, (const char*) field, 8); + first_field = memcmp(index_id, last_index_id, 8); + } else { + first_field = (index->n_def == 0); + if (memcmp(field, index_id, 8)) { + return dict_load_field_none; + } + } + + /* The next field stores the field position in the index and a + possible column prefix length if the index field does not + contain the whole column. The storage format is like this: if + there is at least one prefix field in the index, then the HIGH + 2 bytes contain the field number (index->n_def) and the low 2 + bytes the prefix length for the field. Otherwise the field + number (index->n_def) is contained in the 2 LOW bytes. 
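+ + For example (illustrative values): when the index contains a + prefix field, a field at position 2 defined as col(10) DESC is + stored as POS=0x0002800a: position 2 in the high 2 bytes, the + descending flag in bit 15, and the prefix length 10 in the low + 15 bits. An ascending full-column field in an index with no + prefix fields is stored simply as POS=2.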
*/ + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FIELDS__POS, &len); + if (len != 4) { + goto err_len; + } + + pos_and_prefix_len = mach_read_from_4(field); + + if (index && UNIV_UNLIKELY + ((pos_and_prefix_len & 0xFFFFUL) != index->n_def + && (pos_and_prefix_len >> 16 & 0xFFFF) != index->n_def)) { + return("SYS_FIELDS.POS mismatch"); + } + + if (first_field || pos_and_prefix_len > 0xFFFFUL) { + prefix_len = pos_and_prefix_len & 0x7FFFUL; + descending = (pos_and_prefix_len & 0x8000UL); + position = (pos_and_prefix_len & 0xFFFF0000UL) >> 16; + } else { + prefix_len = 0; + descending = false; + position = pos_and_prefix_len & 0xFFFFUL; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FIELDS__DB_TRX_ID, &len); + if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_FIELDS__DB_ROLL_PTR, &len); + if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + + const trx_id_t trx_id = trx_read_trx_id(field); + + if (!trx_id) { + ut_ad(!rec_get_deleted_flag(rec, 0)); + } else if (!mtr || uncommitted) { + } else if (trx_sys.find(nullptr, trx_id, false)) { + const auto savepoint = mtr->get_savepoint(); + dict_index_t* sys_field = UT_LIST_GET_FIRST( + dict_sys.sys_fields->indexes); + rec_offs* offsets = rec_get_offsets( + rec, sys_field, nullptr, true, ULINT_UNDEFINED, &heap); + const rec_t* old_vers; + row_vers_build_for_semi_consistent_read( + nullptr, rec, mtr, sys_field, &offsets, &heap, + heap, &old_vers, nullptr); + mtr->rollback_to_savepoint(savepoint); + rec = old_vers; + if (!old_vers || rec_get_deleted_flag(rec, 0)) { + return dict_load_field_none; + } + } + + if (rec_get_deleted_flag(rec, 0)) { + return(dict_load_field_del); + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FIELDS__COL_NAME, &len); + if (len == 0 || len == UNIV_SQL_NULL) { + goto err_len; + } + + if (index) { + dict_mem_index_add_field( + index, mem_heap_strdupl(heap, (const char*) field, len), + prefix_len, descending); + } else { + sys_field->name = mem_heap_strdupl( + heap, (const char*) field, len); + sys_field->prefix_len = prefix_len & ((1U << 12) - 1); + sys_field->descending = descending; + *pos = position; + } + + return(NULL); +} + +/** +Load definitions for index fields. +@param index index whose fields are to be loaded +@param uncommitted false=READ COMMITTED, true=READ UNCOMMITTED +@param heap memory heap for temporary storage +@return error code +@return DB_SUCCESS if the fields were loaded successfully */ +static dberr_t dict_load_fields(dict_index_t *index, bool uncommitted, + mem_heap_t *heap) +{ + btr_pcur_t pcur; + mtr_t mtr; + + ut_ad(dict_sys.locked()); + + mtr.start(); + + dict_index_t* sys_index = dict_sys.sys_fields->indexes.start; + ut_ad(!dict_sys.sys_fields->not_redundant()); + ut_ad(name_of_col_is(dict_sys.sys_fields, sys_index, + DICT_FLD__SYS_FIELDS__COL_NAME, "COL_NAME")); + + dfield_t dfield; + dtuple_t tuple{ + 0,1,1,&dfield,0,nullptr +#ifdef UNIV_DEBUG + , DATA_TUPLE_MAGIC_N +#endif + }; + byte index_id[8]; + mach_write_to_8(index_id, index->id); + dfield_set_data(&dfield, index_id, 8); + dict_index_copy_types(&tuple, sys_index, 1); + pcur.btr_cur.page_cur.index = sys_index; + + dberr_t error = btr_pcur_open_on_user_rec(&tuple, BTR_SEARCH_LEAF, + &pcur, &mtr); + if (error != DB_SUCCESS) { + goto func_exit; + } + + for (ulint i = 0; i < index->n_fields; i++) { + const char *err_msg = btr_pcur_is_on_user_rec(&pcur) + ? 
dict_load_field_low(index_id, uncommitted, index, + nullptr, nullptr, nullptr, + heap, &mtr, + btr_pcur_get_rec(&pcur)) + : dict_load_field_none; + + if (!err_msg) { + } else if (err_msg == dict_load_field_del) { + /* There could be delete marked records in + SYS_FIELDS because SYS_FIELDS.INDEX_ID can be + updated by ALTER TABLE ADD INDEX. */ + } else { + if (err_msg != dict_load_field_none + || strstr(index->table->name.m_name, + "/" TEMP_FILE_PREFIX_INNODB)) { + ib::error() << err_msg << " for index " + << index->name + << " of table " + << index->table->name; + } + error = DB_CORRUPTION; + break; + } + + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + } + +func_exit: + mtr.commit(); + return error; +} + +/** Error message for a delete-marked record in dict_load_index_low() */ +static const char *dict_load_index_del= "delete-marked record in SYS_INDEXES"; +/** Error message for table->id mismatch in dict_load_index_low() */ +static const char *dict_load_index_none= "SYS_INDEXES record not found"; +/** Error message for SYS_TABLES flags mismatch in dict_load_table_low() */ +static const char *dict_load_table_flags= "incorrect flags in SYS_TABLES"; + +/** Load an index definition from a SYS_INDEXES record to dict_index_t. +@return error message +@retval NULL on success */ +static +const char* +dict_load_index_low( + byte* table_id, /*!< in/out: table id (8 bytes), + an "in" value if mtr + and "out" when !mtr */ + bool uncommitted, /*!< in: false=READ COMMITTED, + true=READ UNCOMMITTED */ + mem_heap_t* heap, /*!< in/out: temporary memory heap */ + const rec_t* rec, /*!< in: SYS_INDEXES record */ + mtr_t* mtr, /*!< in/out: mini-transaction, + or nullptr if a pre-allocated + *index is to be filled in */ + dict_table_t* table, /*!< in/out: table, or NULL */ + dict_index_t** index) /*!< out,own: index, or NULL */ +{ + const byte* field; + ulint len; + index_id_t id; + ulint n_fields; + ulint type; + unsigned merge_threshold; + + if (mtr) { + *index = NULL; + } + + if (rec_get_n_fields_old(rec) == DICT_NUM_FIELDS__SYS_INDEXES) { + /* MERGE_THRESHOLD exists */ + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_INDEXES__MERGE_THRESHOLD, &len); + switch (len) { + case 4: + merge_threshold = mach_read_from_4(field); + break; + case UNIV_SQL_NULL: + merge_threshold = DICT_INDEX_MERGE_THRESHOLD_DEFAULT; + break; + default: + return("incorrect MERGE_THRESHOLD length" + " in SYS_INDEXES"); + } + } else if (rec_get_n_fields_old(rec) + == DICT_NUM_FIELDS__SYS_INDEXES - 1) { + /* MERGE_THRESHOLD doesn't exist */ + + merge_threshold = DICT_INDEX_MERGE_THRESHOLD_DEFAULT; + } else { + return("wrong number of columns in SYS_INDEXES record"); + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_INDEXES__TABLE_ID, &len); + if (len != 8) { +err_len: + return("incorrect column length in SYS_INDEXES"); + } + + if (!mtr) { + /* We are reading a SYS_INDEXES record. 
Copy the table_id */ + memcpy(table_id, (const char*) field, 8); + } else if (memcmp(field, table_id, 8)) { + /* Caller supplied table_id, verify it is the same + id as on the index record */ + return dict_load_index_none; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_INDEXES__ID, &len); + if (len != 8) { + goto err_len; + } + + id = mach_read_from_8(field); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_INDEXES__DB_TRX_ID, &len); + if (len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + rec_get_nth_field_offs_old( + rec, DICT_FLD__SYS_INDEXES__DB_ROLL_PTR, &len); + if (len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL) { + goto err_len; + } + + const trx_id_t trx_id = trx_read_trx_id(field); + if (!trx_id) { + ut_ad(!rec_get_deleted_flag(rec, 0)); + } else if (!mtr || uncommitted) { + } else if (trx_sys.find(nullptr, trx_id, false)) { + const auto savepoint = mtr->get_savepoint(); + dict_index_t* sys_index = UT_LIST_GET_FIRST( + dict_sys.sys_indexes->indexes); + rec_offs* offsets = rec_get_offsets( + rec, sys_index, nullptr, true, ULINT_UNDEFINED, &heap); + const rec_t* old_vers; + row_vers_build_for_semi_consistent_read( + nullptr, rec, mtr, sys_index, &offsets, &heap, + heap, &old_vers, nullptr); + mtr->rollback_to_savepoint(savepoint); + rec = old_vers; + if (!old_vers || rec_get_deleted_flag(rec, 0)) { + return dict_load_index_none; + } + } else if (rec_get_deleted_flag(rec, 0) + && rec[8 + 8 + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN] + != static_cast<byte>(*TEMP_INDEX_PREFIX_STR) + && table->def_trx_id < trx_id) { + table->def_trx_id = trx_id; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_INDEXES__N_FIELDS, &len); + if (len != 4) { + goto err_len; + } + n_fields = mach_read_from_4(field); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_INDEXES__TYPE, &len); + if (len != 4) { + goto err_len; + } + type = mach_read_from_4(field); + if (type & (~0U << DICT_IT_BITS)) { + return("unknown SYS_INDEXES.TYPE bits"); + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_INDEXES__PAGE_NO, &len); + if (len != 4) { + goto err_len; + } + + ut_d(const auto name_offs =) + rec_get_nth_field_offs_old(rec, DICT_FLD__SYS_INDEXES__NAME, &len); + ut_ad(name_offs == 8 + 8 + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN); + + if (len == 0 || len == UNIV_SQL_NULL) { + goto err_len; + } + + if (rec_get_deleted_flag(rec, 0)) { + return dict_load_index_del; + } + + char* name = mem_heap_strdupl(heap, reinterpret_cast<const char*>(rec) + + (8 + 8 + DATA_TRX_ID_LEN + + DATA_ROLL_PTR_LEN), + len); + + if (mtr) { + *index = dict_mem_index_create(table, name, type, n_fields); + } else { + dict_mem_fill_index_struct(*index, nullptr, name, + type, n_fields); + } + + (*index)->id = id; + (*index)->page = mach_read_from_4(field); + ut_ad((*index)->page); + (*index)->merge_threshold = merge_threshold & ((1U << 6) - 1); + + return(NULL); +} + +/** Load definitions for table indexes. Adds them to the data dictionary cache.
+@param table table definition +@param uncommitted false=READ COMMITTED, true=READ UNCOMMITTED +@param heap memory heap for temporary storage +@param ignore_err errors to be ignored when loading the index definition +@return error code +@retval DB_SUCCESS if all indexes were successfully loaded +@retval DB_CORRUPTION if corruption of dictionary table +@retval DB_UNSUPPORTED if table has unknown index type */ +static MY_ATTRIBUTE((nonnull)) +dberr_t dict_load_indexes(dict_table_t *table, bool uncommitted, + mem_heap_t *heap, dict_err_ignore_t ignore_err) +{ + dict_index_t* sys_index; + btr_pcur_t pcur; + byte table_id[8]; + mtr_t mtr; + + ut_ad(dict_sys.locked()); + + mtr.start(); + + sys_index = dict_sys.sys_indexes->indexes.start; + ut_ad(!dict_sys.sys_indexes->not_redundant()); + ut_ad(name_of_col_is(dict_sys.sys_indexes, sys_index, + DICT_FLD__SYS_INDEXES__NAME, "NAME")); + ut_ad(name_of_col_is(dict_sys.sys_indexes, sys_index, + DICT_FLD__SYS_INDEXES__PAGE_NO, "PAGE_NO")); + + dfield_t dfield; + dtuple_t tuple{ + 0,1,1,&dfield,0,nullptr +#ifdef UNIV_DEBUG + , DATA_TUPLE_MAGIC_N +#endif + }; + mach_write_to_8(table_id, table->id); + dfield_set_data(&dfield, table_id, 8); + dict_index_copy_types(&tuple, sys_index, 1); + pcur.btr_cur.page_cur.index = sys_index; + + dberr_t error = btr_pcur_open_on_user_rec(&tuple, BTR_SEARCH_LEAF, + &pcur, &mtr); + if (error != DB_SUCCESS) { + goto func_exit; + } + + while (btr_pcur_is_on_user_rec(&pcur)) { + dict_index_t* index = NULL; + const char* err_msg; + const rec_t* rec = btr_pcur_get_rec(&pcur); + if ((ignore_err & DICT_ERR_IGNORE_RECOVER_LOCK) + && (rec_get_n_fields_old(rec) + == DICT_NUM_FIELDS__SYS_INDEXES + /* a record for older SYS_INDEXES table + (missing merge_threshold column) is acceptable. */ + || rec_get_n_fields_old(rec) + == DICT_NUM_FIELDS__SYS_INDEXES - 1)) { + const byte* field; + ulint len; + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_INDEXES__NAME, &len); + + if (len != UNIV_SQL_NULL + && static_cast<char>(*field) + == static_cast<char>(*TEMP_INDEX_PREFIX_STR)) { + /* Skip indexes whose name starts with + TEMP_INDEX_PREFIX_STR, because they will + be dropped by row_merge_drop_temp_indexes() + during crash recovery. */ + goto next_rec; + } + } + + err_msg = dict_load_index_low(table_id, uncommitted, heap, rec, + &mtr, table, &index); + ut_ad(!index == !!err_msg); + + if (err_msg == dict_load_index_none) { + /* We have run out of index definitions for + the table. */ + break; + } + + if (err_msg == dict_load_index_del) { + goto next_rec; + } else if (err_msg) { + ib::error() << err_msg; + if (ignore_err & DICT_ERR_IGNORE_INDEX) { + goto next_rec; + } + error = DB_CORRUPTION; + goto func_exit; + } else if (rec[8 + 8 + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN] + == static_cast<byte>(*TEMP_INDEX_PREFIX_STR)) { + dict_mem_index_free(index); + goto next_rec; + } else { + const trx_id_t id = trx_read_trx_id(rec + 8 + 8); + if (id > table->def_trx_id) { + table->def_trx_id = id; + } + } + + ut_ad(index); + ut_ad(!dict_index_is_online_ddl(index)); + + /* Check whether the index is corrupted */ + if (ignore_err != DICT_ERR_IGNORE_DROP + && index->is_corrupted() && index->is_clust()) { + dict_mem_index_free(index); + error = DB_TABLE_CORRUPT; + goto func_exit; + } + + if (index->type & DICT_FTS + && !dict_table_has_fts_index(table)) { + /* This should have been created by now.
*/ + ut_a(table->fts != NULL); + DICT_TF2_FLAG_SET(table, DICT_TF2_FTS); + } + + /* We check for unsupported types first, so that the + subsequent checks are relevant for the supported types. */ + if (index->type & ~(DICT_CLUSTERED | DICT_UNIQUE + | DICT_CORRUPT | DICT_FTS + | DICT_SPATIAL | DICT_VIRTUAL)) { + + ib::error() << "Unknown type " << index->type + << " of index " << index->name + << " of table " << table->name; + + error = DB_UNSUPPORTED; + dict_mem_index_free(index); + goto func_exit; + } else if (index->page == FIL_NULL + && table->is_readable() + && (!(index->type & DICT_FTS))) { + if (!uncommitted + && ignore_err != DICT_ERR_IGNORE_DROP) { + ib::error_or_warn(!(ignore_err + & DICT_ERR_IGNORE_INDEX)) + << "Index " << index->name + << " for table " << table->name + << " has been freed!"; + } + + if (!(ignore_err & DICT_ERR_IGNORE_INDEX)) { +corrupted: + dict_mem_index_free(index); + error = DB_CORRUPTION; + goto func_exit; + } + /* If caller can tolerate this error, + we will continue to load the index and + let caller deal with this error. However + mark the index and table corrupted. We + only need to mark such in the index + dictionary cache for such metadata corruption, + since we would always be able to set it + when loading the dictionary cache */ + if (index->is_clust()) { + index->table->corrupted = true; + index->table->file_unreadable = true; + } + index->type |= DICT_CORRUPT; + } else if (!dict_index_is_clust(index) + && NULL == dict_table_get_first_index(table)) { + + ib::error() << "Trying to load index " << index->name + << " for table " << table->name + << ", but the first index is not clustered!"; + + goto corrupted; + } else if (dict_is_sys_table(table->id) + && (dict_index_is_clust(index) + || ((table == dict_sys.sys_tables) + && !strcmp("ID_IND", index->name)))) { + + /* The index was created in memory already at booting + of the database server */ + dict_mem_index_free(index); + } else { + error = dict_load_fields(index, uncommitted, heap); + if (error != DB_SUCCESS) { + goto func_exit; + } + + /* The data dictionary tables should never contain + invalid index definitions. If we ignored this error + and simply did not load this index definition, the + .frm file would disagree with the index definitions + inside InnoDB. */ + if ((error = dict_index_add_to_cache(index, + index->page)) + != DB_SUCCESS) { + goto func_exit; + } + +#ifdef UNIV_DEBUG + // The following assertion doesn't hold for FTS indexes + // as it may have prefix_len=1 with any charset + if (index->type != DICT_FTS) { + for (uint i = 0; i < index->n_fields; i++) { + dict_field_t &f = index->fields[i]; + ut_ad(f.col->mbmaxlen == 0 + || f.prefix_len + % f.col->mbmaxlen == 0); + } + } +#endif /* UNIV_DEBUG */ + } +next_rec: + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + } + + if (!dict_table_get_first_index(table) + && !(ignore_err & DICT_ERR_IGNORE_INDEX)) { + ib::warn() << "No indexes found for table " << table->name; + error = DB_CORRUPTION; + goto func_exit; + } + + ut_ad(table->fts_doc_id_index == NULL); + + if (table->fts != NULL) { + dict_index_t *idx = dict_table_get_index_on_name( + table, FTS_DOC_ID_INDEX_NAME); + if (idx && dict_index_is_unique(idx)) { + table->fts_doc_id_index = idx; + } + } + + /* If the table contains FTS indexes, populate table->fts->indexes */ + if (dict_table_has_fts_index(table)) { + ut_ad(table->fts_doc_id_index != NULL); + /* table->fts->indexes should have been created. 
*/ + ut_a(table->fts->indexes != NULL); + dict_table_get_all_fts_indexes(table, table->fts->indexes); + } + +func_exit: + mtr.commit(); + return error; +} + +/** Load a table definition from a SYS_TABLES record to dict_table_t. +Do not load any columns or indexes. +@param[in,out] mtr mini-transaction +@param[in] uncommitted whether to use READ UNCOMMITTED isolation level +@param[in] rec SYS_TABLES record +@param[out,own] table table, or nullptr +@return error message +@retval nullptr on success */ +const char *dict_load_table_low(mtr_t *mtr, bool uncommitted, + const rec_t *rec, dict_table_t **table) +{ + table_id_t table_id; + uint32_t space_id, t_num, flags, flags2; + ulint n_cols, n_v_col; + trx_id_t trx_id; + + if (const char* error_text = dict_sys_tables_rec_check(rec)) { + *table = NULL; + return(error_text); + } + + if (auto r = dict_sys_tables_rec_read(rec, uncommitted, mtr, + &table_id, &space_id, + &t_num, &flags, &flags2, + &trx_id)) { + *table = NULL; + return r == READ_ERROR ? dict_load_table_flags : nullptr; + } + + dict_table_decode_n_col(t_num, &n_cols, &n_v_col); + + *table = dict_table_t::create( + span<const char>(reinterpret_cast<const char*>(rec), + rec_get_field_start_offs(rec, 1)), + nullptr, n_cols + n_v_col, n_v_col, flags, flags2); + (*table)->space_id = space_id; + (*table)->id = table_id; + (*table)->file_unreadable = !!(flags2 & DICT_TF2_DISCARDED); + (*table)->def_trx_id = trx_id; + return(NULL); +} + +/** Make sure the data_file_name is saved in dict_table_t if needed. +@param[in,out] table Table object */ +void dict_get_and_save_data_dir_path(dict_table_t *table) +{ + ut_ad(!table->is_temporary()); + ut_ad(!table->space || table->space->id == table->space_id); + + if (!table->data_dir_path && table->space_id && table->space) + { + const char *filepath= table->space->chain.start->name; + if (strncmp(fil_path_to_mysql_datadir, filepath, + strlen(fil_path_to_mysql_datadir))) + { + table->lock_mutex_lock(); + table->flags|= 1 << DICT_TF_POS_DATA_DIR & ((1U << DICT_TF_BITS) - 1); + table->data_dir_path= mem_heap_strdup(table->heap, filepath); + os_file_make_data_dir_path(table->data_dir_path); + table->lock_mutex_unlock(); + } + } +} + +/** Opens a tablespace for dict_load_table_one() +@param[in,out] table A table that refers to the tablespace to open +@param[in] ignore_err Whether to ignore an error. */ +UNIV_INLINE +void +dict_load_tablespace( + dict_table_t* table, + dict_err_ignore_t ignore_err) +{ + ut_ad(!table->is_temporary()); + ut_ad(!table->space); + ut_ad(table->space_id < SRV_SPACE_ID_UPPER_BOUND); + ut_ad(fil_system.sys_space); + + if (table->space_id == TRX_SYS_SPACE) { + table->space = fil_system.sys_space; + return; + } + + if (table->flags2 & DICT_TF2_DISCARDED) { + ib::warn() << "Tablespace for table " << table->name + << " is set as discarded."; + table->file_unreadable = true; + return; + } + + /* The tablespace may already be open. */ + table->space = fil_space_for_table_exists_in_mem(table->space_id, + table->flags); + if (table->space) { + return; + } + + if (ignore_err >= DICT_ERR_IGNORE_TABLESPACE) { + table->file_unreadable = true; + return; + } + + if (!(ignore_err & DICT_ERR_IGNORE_RECOVER_LOCK)) { + ib::error() << "Failed to find tablespace for table " + << table->name << " in the cache. Attempting" + " to load the tablespace with space id " + << table->space_id; + } + + /* Use the remote filepath if needed. This parameter is optional + in the call to fil_ibd_open(). If not supplied, it will be built + from the table->name.
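+ For example (illustrative paths): a table db1/t1 created with + DATA DIRECTORY='/ssd' resolves to the remote path + /ssd/db1/t1.ibd, while without the attribute the default + datadir-relative path db1/t1.ibd is used.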
*/ + char* filepath = NULL; + if (DICT_TF_HAS_DATA_DIR(table->flags)) { + /* This will set table->data_dir_path from fil_system */ + dict_get_and_save_data_dir_path(table); + + if (table->data_dir_path) { + filepath = fil_make_filepath( + table->data_dir_path, table->name, IBD, true); + } + } + + table->space = fil_ibd_open( + 2, FIL_TYPE_TABLESPACE, table->space_id, + dict_tf_to_fsp_flags(table->flags), + {table->name.m_name, strlen(table->name.m_name)}, filepath); + + if (!table->space) { + /* We failed to find a sensible tablespace file */ + table->file_unreadable = true; + } + + ut_free(filepath); +} + +/** Loads a table definition and also all its index definitions. + +Loads those foreign key constraints whose referenced table is already in +dictionary cache. If a foreign key constraint is not loaded, then the +referenced table is pushed into the output stack (fk_tables), if it is not +NULL. These tables must be subsequently loaded so that all the foreign +key constraints are loaded into memory. + +@param[in] name Table name in the db/tablename format +@param[in] ignore_err Error to be ignored when loading table + and its index definition +@param[out] fk_tables Related table names that must also be + loaded to ensure that all foreign key + constraints are loaded. +@return table, possibly with file_unreadable flag set +@retval nullptr if the table does not exist */ +static dict_table_t *dict_load_table_one(const span<const char> &name, + dict_err_ignore_t ignore_err, + dict_names_t &fk_tables) +{ + btr_pcur_t pcur; + mtr_t mtr; + + DBUG_ENTER("dict_load_table_one"); + DBUG_PRINT("dict_load_table_one", + ("table: %.*s", int(name.size()), name.data())); + + ut_ad(dict_sys.locked()); + + dict_index_t *sys_index = dict_sys.sys_tables->indexes.start; + ut_ad(!dict_sys.sys_tables->not_redundant()); + ut_ad(name_of_col_is(dict_sys.sys_tables, sys_index, + DICT_FLD__SYS_TABLES__ID, "ID")); + ut_ad(name_of_col_is(dict_sys.sys_tables, sys_index, + DICT_FLD__SYS_TABLES__N_COLS, "N_COLS")); + ut_ad(name_of_col_is(dict_sys.sys_tables, sys_index, + DICT_FLD__SYS_TABLES__TYPE, "TYPE")); + ut_ad(name_of_col_is(dict_sys.sys_tables, sys_index, + DICT_FLD__SYS_TABLES__MIX_LEN, "MIX_LEN")); + ut_ad(name_of_col_is(dict_sys.sys_tables, sys_index, + DICT_FLD__SYS_TABLES__SPACE, "SPACE")); + + dfield_t dfield; + dtuple_t tuple{ + 0,1,1,&dfield,0,nullptr +#ifdef UNIV_DEBUG + , DATA_TUPLE_MAGIC_N +#endif + }; + dfield_set_data(&dfield, name.data(), name.size()); + dict_index_copy_types(&tuple, sys_index, 1); + pcur.btr_cur.page_cur.index = sys_index; + + bool uncommitted = false; +reload: + mtr.start(); + dberr_t err = btr_pcur_open_on_user_rec(&tuple, + BTR_SEARCH_LEAF, &pcur, &mtr); + + if (err != DB_SUCCESS || !btr_pcur_is_on_user_rec(&pcur)) { + /* Not found */ +err_exit: + mtr.commit(); + DBUG_RETURN(nullptr); + } + + const rec_t* rec = btr_pcur_get_rec(&pcur); + + /* Check if the table name in record is the searched one */ + if (rec_get_field_start_offs(rec, 1) != name.size() + || memcmp(name.data(), rec, name.size())) { + goto err_exit; + } + + dict_table_t* table; + if (const char* err_msg = + dict_load_table_low(&mtr, uncommitted, rec, &table)) { + if (err_msg != dict_load_table_flags) { + ib::error() << err_msg; + } + goto err_exit; + } + if (!table) { + goto err_exit; + } + + const unsigned use_uncommitted = uncommitted + ? 
2 + : table->id == mach_read_from_8( + rec + rec_get_field_start_offs( + rec, DICT_FLD__SYS_TABLES__ID)); + + mtr.commit(); + + mem_heap_t* heap = mem_heap_create(32000); + + dict_load_tablespace(table, ignore_err); + + switch (dict_load_columns(table, use_uncommitted, heap)) { + case DB_SUCCESS_LOCKED_REC: + ut_ad(!uncommitted); + uncommitted = true; + dict_mem_table_free(table); + mem_heap_free(heap); + goto reload; + case DB_SUCCESS: + if (!dict_load_virtual(table, uncommitted)) { + break; + } + /* fall through */ + default: + dict_mem_table_free(table); + mem_heap_free(heap); + DBUG_RETURN(nullptr); + } + + dict_table_add_system_columns(table, heap); + + table->can_be_evicted = true; + table->add_to_cache(); + + mem_heap_empty(heap); + + ut_ad(dict_tf2_is_valid(table->flags, table->flags2)); + + /* If there is no tablespace for the table then we only need to + load the index definitions. So that we can IMPORT the tablespace + later. When recovering table locks for resurrected incomplete + transactions, the tablespace should exist, because DDL operations + were not allowed while the table is being locked by a transaction. */ + dict_err_ignore_t index_load_err = + !(ignore_err & DICT_ERR_IGNORE_RECOVER_LOCK) + && !table->is_readable() + ? DICT_ERR_IGNORE_ALL + : ignore_err; + + err = dict_load_indexes(table, uncommitted, heap, index_load_err); + + if (err == DB_TABLE_CORRUPT) { + /* Refuse to load the table if the table has a corrupted + cluster index */ + ut_ad(index_load_err != DICT_ERR_IGNORE_DROP); + ib::error() << "Refusing to load corrupted table " + << table->name; +evict: + dict_sys.remove(table); + mem_heap_free(heap); + DBUG_RETURN(nullptr); + } + + if (err != DB_SUCCESS || !table->is_readable()) { + } else if (dict_index_t* pk = dict_table_get_first_index(table)) { + ut_ad(pk->is_primary()); + if (pk->is_corrupted() + || pk->page >= table->space->get_size()) { +corrupted: + table->corrupted = true; + table->file_unreadable = true; + err = DB_TABLE_CORRUPT; + } else if (table->space->id + && ignore_err == DICT_ERR_IGNORE_DROP) { + /* Do not bother to load data from .ibd files + only to delete the .ibd files. */ + goto corrupted; + } else { + const page_id_t page_id{table->space->id, pk->page}; + mtr.start(); + buf_block_t* block = buf_page_get( + page_id, table->space->zip_size(), + RW_S_LATCH, &mtr); + const bool corrupted = !block + || page_get_space_id(block->page.frame) + != page_id.space() + || page_get_page_no(block->page.frame) + != page_id.page_no() + || (mach_read_from_2(FIL_PAGE_TYPE + + block->page.frame) + != FIL_PAGE_INDEX + && mach_read_from_2(FIL_PAGE_TYPE + + block->page.frame) + != FIL_PAGE_TYPE_INSTANT); + mtr.commit(); + if (corrupted) { + goto corrupted; + } + + if (table->supports_instant()) { + err = btr_cur_instant_init(table); + } + } + } else { + ut_ad(ignore_err & DICT_ERR_IGNORE_INDEX); + if (ignore_err != DICT_ERR_IGNORE_DROP) { + err = DB_CORRUPTION; + goto evict; + } + } + + /* Initialize table foreign_child value. Its value could be + changed when dict_load_foreigns() is called below */ + table->fk_max_recusive_level = 0; + + /* We will load the foreign key information only if + all indexes were loaded. */ + if (!table->is_readable()) { + /* Don't attempt to load the indexes from disk. */ + } else if (err == DB_SUCCESS) { + err = dict_load_foreigns(table->name.m_name, nullptr, + 0, true, ignore_err, fk_tables); + + if (err != DB_SUCCESS) { + ib::warn() << "Load table " << table->name + << " failed, the table has missing" + " foreign key indexes. 
Turn off" + " 'foreign_key_checks' and try again."; + goto evict; + } else { + dict_mem_table_fill_foreign_vcol_set(table); + table->fk_max_recusive_level = 0; + } + } + + mem_heap_free(heap); + + ut_ad(!table + || (ignore_err & ~DICT_ERR_IGNORE_FK_NOKEY) + || !table->is_readable() + || !table->corrupted); + + if (table && table->fts) { + if (!(dict_table_has_fts_index(table) + || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID) + || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID))) { + /* the table->fts could be created in dict_load_column + when a user defined FTS_DOC_ID is present, but no + FTS */ + table->fts->~fts_t(); + table->fts = nullptr; + } else if (fts_optimize_wq) { + fts_optimize_add_table(table); + } else if (table->can_be_evicted) { + /* fts_optimize_thread is not started yet. + So make the table as non-evictable from cache. */ + dict_sys.prevent_eviction(table); + } + } + + ut_ad(err != DB_SUCCESS || dict_foreign_set_validate(*table)); + + DBUG_RETURN(table); +} + +dict_table_t *dict_sys_t::load_table(const span &name, + dict_err_ignore_t ignore) +{ + if (dict_table_t *table= find_table(name)) + return table; + dict_names_t fk_list; + dict_table_t *table= dict_load_table_one(name, ignore, fk_list); + while (!fk_list.empty()) + { + const char *f= fk_list.front(); + const span name{f, strlen(f)}; + if (!find_table(name)) + dict_load_table_one(name, ignore, fk_list); + fk_list.pop_front(); + } + + return table; +} + +/***********************************************************************//** +Loads a table object based on the table id. +@return table; NULL if table does not exist */ +dict_table_t* +dict_load_table_on_id( +/*==================*/ + table_id_t table_id, /*!< in: table id */ + dict_err_ignore_t ignore_err) /*!< in: errors to ignore + when loading the table */ +{ + byte id_buf[8]; + btr_pcur_t pcur; + const byte* field; + ulint len; + mtr_t mtr; + + ut_ad(dict_sys.locked()); + + /* NOTE that the operation of this function is protected by + dict_sys.latch, and therefore no deadlocks can occur + with other dictionary operations. 
+ +/***********************************************************************//** +Loads a table object based on the table id. +@return table; NULL if table does not exist */ +dict_table_t* +dict_load_table_on_id( +/*==================*/ + table_id_t table_id, /*!< in: table id */ + dict_err_ignore_t ignore_err) /*!< in: errors to ignore + when loading the table */ +{ + byte id_buf[8]; + btr_pcur_t pcur; + const byte* field; + ulint len; + mtr_t mtr; + + ut_ad(dict_sys.locked()); + + /* NOTE that the operation of this function is protected by + dict_sys.latch, and therefore no deadlocks can occur + with other dictionary operations. */ + mtr.start(); + /*---------------------------------------------------*/ + /* Get the secondary index based on ID for table SYS_TABLES */ + dict_index_t *sys_table_ids = + dict_sys.sys_tables->indexes.start->indexes.next; + + dfield_t dfield; + dtuple_t tuple{ + 0,1,1,&dfield,0,nullptr +#ifdef UNIV_DEBUG + , DATA_TUPLE_MAGIC_N +#endif + }; + + /* Write the table id in byte format to id_buf */ + mach_write_to_8(id_buf, table_id); + dfield_set_data(&dfield, id_buf, 8); + dict_index_copy_types(&tuple, sys_table_ids, 1); + pcur.btr_cur.page_cur.index = sys_table_ids; + + dict_table_t* table = nullptr; + + if (btr_pcur_open_on_user_rec(&tuple, BTR_SEARCH_LEAF, &pcur, &mtr) + == DB_SUCCESS + && btr_pcur_is_on_user_rec(&pcur)) { + /*---------------------------------------------------*/ + /* Now we have the record in the secondary index + containing the table ID and NAME */ + const rec_t* rec = btr_pcur_get_rec(&pcur); +check_rec: + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_TABLE_IDS__ID, &len); + ut_ad(len == 8); + + /* Check if the table id in record is the one searched for */ + if (table_id == mach_read_from_8(field)) { + field = rec_get_nth_field_old(rec, + DICT_FLD__SYS_TABLE_IDS__NAME, &len); + table = dict_sys.load_table( + {reinterpret_cast<const char*>(field), + len}, ignore_err); + if (table && table->id != table_id) { + ut_ad(rec_get_deleted_flag(rec, 0)); + table = nullptr; + } + if (!table) { + while (btr_pcur_move_to_next(&pcur, &mtr)) { + rec = btr_pcur_get_rec(&pcur); + + if (page_rec_is_user_rec(rec)) { + goto check_rec; + } + } + } + } + } + + mtr.commit(); + return table; +} + +/********************************************************************//** +This function is called when the database is booted. Loads system table +index definitions except for the clustered index which is added to the +dictionary cache at booting before calling this function. */ +void +dict_load_sys_table( +/*================*/ + dict_table_t* table) /*!< in: system table */ +{ + mem_heap_t* heap; + + ut_ad(dict_sys.locked()); + + heap = mem_heap_create(1000); + + dict_load_indexes(table, false, heap, DICT_ERR_IGNORE_NONE); + + mem_heap_free(heap); +} + +MY_ATTRIBUTE((nonnull, warn_unused_result)) +/********************************************************************//** +Loads foreign key constraint col names (also for the referenced table).
+Members that must be set (and valid) in foreign: +foreign->heap +foreign->n_fields +foreign->id ('\0'-terminated) +Members that will be created and set by this function: +foreign->foreign_col_names[i] +foreign->referenced_col_names[i] +(for i=0..foreign->n_fields-1) */ +static dberr_t dict_load_foreign_cols(dict_foreign_t *foreign, trx_id_t trx_id) +{ + btr_pcur_t pcur; + mtr_t mtr; + size_t id_len; + + ut_ad(dict_sys.locked()); + + id_len = strlen(foreign->id); + + foreign->foreign_col_names = static_cast<const char**>( + mem_heap_alloc(foreign->heap, + foreign->n_fields * sizeof(void*))); + + foreign->referenced_col_names = static_cast<const char**>( + mem_heap_alloc(foreign->heap, + foreign->n_fields * sizeof(void*))); + + mtr.start(); + + dict_index_t* sys_index = dict_sys.sys_foreign_cols->indexes.start; + ut_ad(!dict_sys.sys_foreign_cols->not_redundant()); + + dfield_t dfield; + dtuple_t tuple{ + 0,1,1,&dfield,0,nullptr +#ifdef UNIV_DEBUG + , DATA_TUPLE_MAGIC_N +#endif + }; + + dfield_set_data(&dfield, foreign->id, id_len); + dict_index_copy_types(&tuple, sys_index, 1); + pcur.btr_cur.page_cur.index = sys_index; + + mem_heap_t* heap = nullptr; + dberr_t err = btr_pcur_open_on_user_rec(&tuple, + BTR_SEARCH_LEAF, &pcur, &mtr); + if (err != DB_SUCCESS) { + goto func_exit; + } + for (ulint i = 0; i < foreign->n_fields; i++) { + ut_a(btr_pcur_is_on_user_rec(&pcur)); + + const rec_t* rec = btr_pcur_get_rec(&pcur); + ulint len; + const byte* field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__DB_TRX_ID, &len); + ut_a(len == DATA_TRX_ID_LEN); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_empty(heap); + } + + const trx_id_t id = trx_read_trx_id(field); + if (!id) { + } else if (id != trx_id && trx_sys.find(nullptr, id, false)) { + const auto savepoint = mtr.get_savepoint(); + rec_offs* offsets = rec_get_offsets( + rec, sys_index, nullptr, true, ULINT_UNDEFINED, + &heap); + const rec_t* old_vers; + row_vers_build_for_semi_consistent_read( + nullptr, rec, &mtr, sys_index, &offsets, &heap, + heap, &old_vers, nullptr); + mtr.rollback_to_savepoint(savepoint); + rec = old_vers; + if (!rec || rec_get_deleted_flag(rec, 0)) { + goto next; + } + } + + if (rec_get_deleted_flag(rec, 0)) { + ut_ad(id); + goto next; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__ID, &len); + + if (len != id_len || memcmp(foreign->id, field, len)) { + const rec_t* pos; + ulint pos_len; + const rec_t* for_col_name; + ulint for_col_name_len; + const rec_t* ref_col_name; + ulint ref_col_name_len; + + pos = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__POS, + &pos_len); + + for_col_name = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__FOR_COL_NAME, + &for_col_name_len); + + ref_col_name = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__REF_COL_NAME, + &ref_col_name_len); + + ib::error sout; + + sout << "Unable to load column names for foreign" + " key '" << foreign->id + << "' because it was not found in" + " InnoDB internal table SYS_FOREIGN_COLS. 
The" + " closest entry we found is:" + " (ID='"; + sout.write(field, len); + sout << "', POS=" << mach_read_from_4(pos) + << ", FOR_COL_NAME='"; + sout.write(for_col_name, for_col_name_len); + sout << "', REF_COL_NAME='"; + sout.write(ref_col_name, ref_col_name_len); + sout << "')"; + + err = DB_CORRUPTION; + break; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__POS, &len); + ut_a(len == 4); + ut_a(i == mach_read_from_4(field)); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__FOR_COL_NAME, &len); + foreign->foreign_col_names[i] = mem_heap_strdupl( + foreign->heap, (char*) field, len); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_COLS__REF_COL_NAME, &len); + foreign->referenced_col_names[i] = mem_heap_strdupl( + foreign->heap, (char*) field, len); + +next: + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + } +func_exit: + mtr.commit(); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return err; +} + +/***********************************************************************//** +Loads a foreign key constraint to the dictionary cache. If the referenced +table is not yet loaded, it is added in the output parameter (fk_tables). +@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((warn_unused_result)) +dberr_t +dict_load_foreign( +/*==============*/ + const char* table_name, /*!< in: table name */ + bool uncommitted, /*!< in: use READ UNCOMMITTED + transaction isolation level */ + const char** col_names, + /*!< in: column names, or NULL + to use foreign->foreign_table->col_names */ + trx_id_t trx_id, + /*!< in: current transaction id, or 0 */ + bool check_recursive, + /*!< in: whether to record the foreign table + parent count to avoid unlimited recursive + load of chained foreign tables */ + bool check_charsets, + /*!< in: whether to check charset + compatibility */ + span id, + /*!< in: foreign constraint id */ + dict_err_ignore_t ignore_err, + /*!< in: error to be ignored */ + dict_names_t& fk_tables) + /*!< out: the foreign key constraint is added + to the dictionary cache only if the referenced + table is already in cache. Otherwise, the + foreign key constraint is not added to cache, + and the referenced table is added to this + stack. 
*/ +{ + dict_foreign_t* foreign; + btr_pcur_t pcur; + const byte* field; + ulint len; + mtr_t mtr; + dict_table_t* for_table; + dict_table_t* ref_table; + + DBUG_ENTER("dict_load_foreign"); + DBUG_PRINT("dict_load_foreign", + ("id: '%.*s', check_recursive: %d", + int(id.size()), id.data(), check_recursive)); + + ut_ad(dict_sys.locked()); + + dict_index_t* sys_index = dict_sys.sys_foreign->indexes.start; + ut_ad(!dict_sys.sys_foreign->not_redundant()); + + dfield_t dfield; + dtuple_t tuple{ + 0,1,1,&dfield,0,nullptr +#ifdef UNIV_DEBUG + , DATA_TUPLE_MAGIC_N +#endif + }; + dfield_set_data(&dfield, id.data(), id.size()); + dict_index_copy_types(&tuple, sys_index, 1); + pcur.btr_cur.page_cur.index = sys_index; + + mtr.start(); + + mem_heap_t* heap = nullptr; + dberr_t err = btr_pcur_open_on_user_rec(&tuple, + BTR_SEARCH_LEAF, &pcur, &mtr); + if (err != DB_SUCCESS) { + goto err_exit; + } + + if (!btr_pcur_is_on_user_rec(&pcur)) { +not_found: + err = DB_NOT_FOUND; +err_exit: + mtr.commit(); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + DBUG_RETURN(err); + } + + const rec_t* rec = btr_pcur_get_rec(&pcur); + static_assert(DICT_FLD__SYS_FOREIGN__ID == 0, "compatibility"); + field = rec_get_nth_field_old(rec, DICT_FLD__SYS_FOREIGN__ID, &len); + + /* Check if the id in record is the searched one */ + if (len != id.size() || memcmp(id.data(), field, id.size())) { + goto not_found; + } + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN__DB_TRX_ID, &len); + ut_a(len == DATA_TRX_ID_LEN); + + const trx_id_t tid = trx_read_trx_id(field); + + if (tid && tid != trx_id && !uncommitted + && trx_sys.find(nullptr, tid, false)) { + const auto savepoint = mtr.get_savepoint(); + rec_offs* offsets = rec_get_offsets( + rec, sys_index, nullptr, true, ULINT_UNDEFINED, &heap); + const rec_t* old_vers; + row_vers_build_for_semi_consistent_read( + nullptr, rec, &mtr, sys_index, &offsets, &heap, + heap, &old_vers, nullptr); + mtr.rollback_to_savepoint(savepoint); + rec = old_vers; + if (!rec) { + goto not_found; + } + } + + if (rec_get_deleted_flag(rec, 0)) { + ut_ad(tid); + goto not_found; + } + + /* Read the table names and the number of columns associated + with the constraint */ + + foreign = dict_mem_foreign_create(); + + uint32_t n_fields_and_type = mach_read_from_4( + rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN__N_COLS, &len)); + + ut_a(len == 4); + + /* We store the type in the bits 24..29 of n_fields_and_type. 
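+ An illustrative decoding (example values, not taken from any
+ particular table): a two-column constraint defined with
+ ON DELETE CASCADE would have SYS_FOREIGN.N_COLS ==
+ (DICT_FOREIGN_ON_DELETE_CASCADE << 24) | 2, and the two assignments
+ below would recover that type and n_fields == 2.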
*/ + + foreign->type = (n_fields_and_type >> 24) & ((1U << 6) - 1); + foreign->n_fields = n_fields_and_type & dict_index_t::MAX_N_FIELDS; + + foreign->id = mem_heap_strdupl(foreign->heap, id.data(), id.size()); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN__FOR_NAME, &len); + + foreign->foreign_table_name = mem_heap_strdupl( + foreign->heap, (char*) field, len); + dict_mem_foreign_table_name_lookup_set(foreign, TRUE); + + const size_t foreign_table_name_len = len; + const size_t table_name_len = strlen(table_name); + + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN__REF_NAME, &len); + + if (!my_charset_latin1.strnncoll(table_name, table_name_len, + foreign->foreign_table_name, + foreign_table_name_len)) { + } else if (!check_recursive + && !my_charset_latin1.strnncoll(table_name, table_name_len, + (const char*) field, len)) { + } else { + dict_foreign_free(foreign); + goto not_found; + } + + foreign->referenced_table_name = mem_heap_strdupl( + foreign->heap, (const char*) field, len); + dict_mem_referenced_table_name_lookup_set(foreign, TRUE); + + mtr.commit(); + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + err = dict_load_foreign_cols(foreign, trx_id); + if (err != DB_SUCCESS) { + goto load_error; + } + + ref_table = dict_sys.find_table( + {foreign->referenced_table_name_lookup, + strlen(foreign->referenced_table_name_lookup)}); + for_table = dict_sys.find_table( + {foreign->foreign_table_name_lookup, + strlen(foreign->foreign_table_name_lookup)}); + + if (!for_table) { + /* To avoid recursively loading the tables related through + the foreign key constraints, the child table name is saved + here. The child table will be loaded later, along with its + foreign key constraint. */ + + ut_a(ref_table != NULL); + fk_tables.push_back( + mem_heap_strdupl(ref_table->heap, + foreign->foreign_table_name_lookup, + foreign_table_name_len)); +load_error: + dict_foreign_remove_from_cache(foreign); + DBUG_RETURN(err); + } + + ut_a(for_table || ref_table); + + /* Note that there may already be a foreign constraint object in + the dictionary cache for this constraint: then the following + call only sets the pointers in it to point to the appropriate table + and index objects and frees the newly created object foreign. + Adding to the cache should always succeed since we are not creating + a new foreign key constraint but loading one from the data + dictionary. */ + + DBUG_RETURN(dict_foreign_add_to_cache(foreign, col_names, + check_charsets, + ignore_err)); +} + +/***********************************************************************//** +Loads foreign key constraints where the table is either the foreign key +holder or where the table is referenced by a foreign key. Adds these +constraints to the data dictionary. + +The foreign key constraint is loaded only if the referenced table is also +in the dictionary cache. If the referenced table is not in dictionary +cache, then it is added to the output parameter (fk_tables). 
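+
+The implementation scans two secondary indexes of SYS_FOREIGN: first the
+index on FOR_NAME and then the index on REF_NAME, so that constraints are
+found both where this table is the child and where it is the parent.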
+ +@return DB_SUCCESS or error code */ +dberr_t +dict_load_foreigns( + const char* table_name, /*!< in: table name */ + const char** col_names, /*!< in: column names, or NULL + to use table->col_names */ + trx_id_t trx_id, /*!< in: DDL transaction id, + or 0 to check + recursive load of tables + chained by FK */ + bool check_charsets, /*!< in: whether to check + charset compatibility */ + dict_err_ignore_t ignore_err, /*!< in: error to be ignored */ + dict_names_t& fk_tables) + /*!< out: stack of table + names which must be loaded + subsequently to load all the + foreign key constraints. */ +{ + btr_pcur_t pcur; + mtr_t mtr; + + DBUG_ENTER("dict_load_foreigns"); + + ut_ad(dict_sys.locked()); + + if (!dict_sys.sys_foreign || !dict_sys.sys_foreign_cols) { + if (ignore_err & DICT_ERR_IGNORE_FK_NOKEY) { + DBUG_RETURN(DB_SUCCESS); + } + sql_print_information("InnoDB: No foreign key system tables" + " in the database"); + DBUG_RETURN(DB_ERROR); + } + + ut_ad(!dict_sys.sys_foreign->not_redundant()); + + dict_index_t *sec_index = dict_table_get_next_index( + dict_table_get_first_index(dict_sys.sys_foreign)); + ut_ad(!strcmp(sec_index->fields[0].name, "FOR_NAME")); + bool check_recursive = !trx_id; + dfield_t dfield; + dtuple_t tuple{ + 0,1,1,&dfield,0,nullptr +#ifdef UNIV_DEBUG + , DATA_TUPLE_MAGIC_N +#endif + }; + +start_load: + mtr.start(); + dfield_set_data(&dfield, table_name, strlen(table_name)); + dict_index_copy_types(&tuple, sec_index, 1); + pcur.btr_cur.page_cur.index = sec_index; + + dberr_t err = btr_pcur_open_on_user_rec(&tuple, + BTR_SEARCH_LEAF, &pcur, &mtr); + if (err != DB_SUCCESS) { + DBUG_RETURN(err); + } +loop: + const rec_t* rec = btr_pcur_get_rec(&pcur); + const byte* field; + const auto maybe_deleted = rec_get_deleted_flag(rec, 0); + + if (!btr_pcur_is_on_user_rec(&pcur)) { + /* End of index */ + + goto load_next_index; + } + + /* Now we have the record in the secondary index containing a table + name and a foreign constraint ID */ + + ulint len; + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_FOR_NAME__NAME, &len); + + /* Check if the table name in the record is the one searched for; the + following call does the comparison in the latin1_swedish_ci + charset-collation, in a case-insensitive way. */ + + if (cmp_data(dfield_get_type(&dfield)->mtype, + dfield_get_type(&dfield)->prtype, + false, + reinterpret_cast(table_name), + dfield_get_len(&dfield), + field, len)) { + goto load_next_index; + } + + /* Since table names in SYS_FOREIGN are stored in a case-insensitive + order, we have to check that the table name matches also in a binary + string comparison. On Unix, MySQL allows table names that only differ + in character case. If lower_case_table_names=2 then what is stored + may not be the same case, but the previous comparison showed that they + match with no-case. */ + + if (lower_case_table_names != 2 && memcmp(field, table_name, len)) { + goto next_rec; + } + + /* Now we get a foreign key constraint id */ + field = rec_get_nth_field_old( + rec, DICT_FLD__SYS_FOREIGN_FOR_NAME__ID, &len); + + /* Copy the string because the page may be modified or evicted + after mtr.commit() below. */ + char fk_id[MAX_TABLE_NAME_LEN + NAME_LEN]; + err = DB_SUCCESS; + if (UNIV_LIKELY(len < sizeof fk_id)) { + memcpy(fk_id, field, len); + } + + btr_pcur_store_position(&pcur, &mtr); + + mtr.commit(); + + /* Load the foreign constraint definition to the dictionary cache */ + + err = len < sizeof fk_id + ? 
dict_load_foreign(table_name, false, col_names, trx_id, + check_recursive, check_charsets, + {fk_id, len}, ignore_err, fk_tables) + : DB_CORRUPTION; + + switch (err) { + case DB_SUCCESS: + break; + case DB_NOT_FOUND: + if (maybe_deleted) { + break; + } + sql_print_error("InnoDB: Cannot load foreign constraint %.*s:" + " could not find the relevant record in " + "SYS_FOREIGN", int(len), fk_id); + /* fall through */ + default: +corrupted: + ut_free(pcur.old_rec_buf); + DBUG_RETURN(err); + } + + mtr.start(); + if (pcur.restore_position(BTR_SEARCH_LEAF, &mtr) + == btr_pcur_t::CORRUPTED) { + mtr.commit(); + goto corrupted; + } +next_rec: + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + + goto loop; + +load_next_index: + mtr.commit(); + + if ((sec_index = dict_table_get_next_index(sec_index))) { + /* Switch to scan index on REF_NAME, fk_max_recusive_level + already been updated when scanning FOR_NAME index, no need to + update again */ + check_recursive = false; + goto start_load; + } + + ut_free(pcur.old_rec_buf); + DBUG_RETURN(DB_SUCCESS); +} diff --git a/storage/innobase/dict/dict0mem.cc b/storage/innobase/dict/dict0mem.cc new file mode 100644 index 00000000..b8b2d583 --- /dev/null +++ b/storage/innobase/dict/dict0mem.cc @@ -0,0 +1,1379 @@ +/***************************************************************************** + +Copyright (c) 1996, 2018, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file dict/dict0mem.cc
+Data dictionary memory object creation
+
+Created 1/8/1996 Heikki Tuuri
+***********************************************************************/
+
+#include "ha_prototypes.h"
+#include <mysql_com.h>
+
+#include "dict0mem.h"
+#include "rem0rec.h"
+#include "data0type.h"
+#include "mach0data.h"
+#include "dict0dict.h"
+#include "fts0priv.h"
+#include "lock0lock.h"
+#include "row0row.h"
+#include "sql_string.h"
+#include <iostream>
+
+#define DICT_HEAP_SIZE 100 /*!< initial memory heap size when
+ creating a table or index object */
+
+/** System databases */
+static const char* innobase_system_databases[] = {
+ "mysql/",
+ "information_schema/",
+ "performance_schema/",
+ NullS
+};
+
+/** Determine if a table belongs to innobase_system_databases[]
+@param[in] name database_name/table_name
+@return whether the database_name is in innobase_system_databases[] */
+static bool dict_mem_table_is_system(const char *name)
+{
+ /* the name has the format database/table;
+ some system tables are of the form SYS_* */
+ if (!strchr(name, '/')) {
+ return true;
+ }
+ size_t table_len = strlen(name);
+ const char *system_db;
+ int i = 0;
+ while ((system_db = innobase_system_databases[i++])
+ && (system_db != NullS)) {
+ size_t len = strlen(system_db);
+ if (table_len > len && !strncmp(name, system_db, len)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+/** The start of the table basename suffix for partitioned tables */
+const char table_name_t::part_suffix[4]
+#ifdef _WIN32
+= "#p#";
+#else
+= "#P#";
+#endif
+
+/** Display an identifier.
+@param[in,out] s output stream
+@param[in] id_name SQL identifier (other than table name)
+@return the output stream */
+std::ostream&
+operator<<(
+ std::ostream& s,
+ const id_name_t& id_name)
+{
+ const char q = '`';
+ const char* c = id_name;
+ s << q;
+ for (; *c != 0; c++) {
+ if (*c == q) {
+ s << *c;
+ }
+ s << *c;
+ }
+ s << q;
+ return(s);
+}
+
+/** Display a table name.
+@param[in,out] s output stream
+@param[in] table_name table name
+@return the output stream */
+std::ostream&
+operator<<(
+ std::ostream& s,
+ const table_name_t& table_name)
+{
+ return(s << ut_get_name(NULL, table_name.m_name));
+}
+
+bool dict_col_t::same_encoding(uint16_t a, uint16_t b)
+{
+ if (const CHARSET_INFO *acs= get_charset(a, MYF(MY_WME)))
+ if (const CHARSET_INFO *bcs= get_charset(b, MYF(MY_WME)))
+ return Charset(bcs).encoding_allows_reinterpret_as(acs);
+ return false;
+}
+
+/** Create metadata.
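+A hypothetical invocation (the names and counts here are invented for
+illustration only): a table with two user columns and one virtual column
+could be created as
+dict_table_t::create({C_STRING_WITH_LEN("db1/t1")}, space, 3, 1, 0, 0);
+note that n_cols counts both virtual and non-virtual columns.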
+@param name table name
+@param space tablespace
+@param n_cols total number of columns (both virtual and non-virtual)
+@param n_v_cols number of virtual columns
+@param flags table flags
+@param flags2 table flags2
+@return newly allocated table object */
+dict_table_t *dict_table_t::create(const span<const char> &name,
+ fil_space_t *space,
+ ulint n_cols, ulint n_v_cols, ulint flags,
+ ulint flags2)
+{
+ ut_ad(!space || space->purpose == FIL_TYPE_TABLESPACE ||
+ space->purpose == FIL_TYPE_TEMPORARY ||
+ space->purpose == FIL_TYPE_IMPORT);
+ ut_a(dict_tf2_is_valid(flags, flags2));
+ ut_a(!(flags2 & DICT_TF2_UNUSED_BIT_MASK));
+
+ mem_heap_t *heap= mem_heap_create(DICT_HEAP_SIZE);
+
+ dict_table_t *table= static_cast<dict_table_t*>
+ (mem_heap_zalloc(heap, sizeof(*table)));
+
+ lock_table_lock_list_init(&table->locks);
+ UT_LIST_INIT(table->indexes, &dict_index_t::indexes);
+#ifdef BTR_CUR_HASH_ADAPT
+ UT_LIST_INIT(table->freed_indexes, &dict_index_t::indexes);
+#endif /* BTR_CUR_HASH_ADAPT */
+ table->heap= heap;
+
+ ut_d(table->magic_n= DICT_TABLE_MAGIC_N);
+
+ table->flags= static_cast<unsigned>(flags) & ((1U << DICT_TF_BITS) - 1);
+ table->flags2= static_cast<unsigned>(flags2) & ((1U << DICT_TF2_BITS) - 1);
+ table->name.m_name= mem_strdupl(name.data(), name.size());
+ table->mdl_name.m_name= table->name.m_name;
+ table->is_system_db= dict_mem_table_is_system(table->name.m_name);
+ table->space= space;
+ table->space_id= space ? space->id : UINT32_MAX;
+ table->n_t_cols= static_cast<unsigned>(n_cols + DATA_N_SYS_COLS) &
+ dict_index_t::MAX_N_FIELDS;
+ table->n_v_cols= static_cast<unsigned>(n_v_cols) &
+ dict_index_t::MAX_N_FIELDS;
+ table->n_cols= static_cast<unsigned>(table->n_t_cols - table->n_v_cols) &
+ dict_index_t::MAX_N_FIELDS;
+ table->cols= static_cast<dict_col_t*>
+ (mem_heap_alloc(heap, table->n_cols * sizeof *table->cols));
+ table->v_cols= static_cast<dict_v_col_t*>
+ (mem_heap_alloc(heap, n_v_cols * sizeof *table->v_cols));
+ for (ulint i = n_v_cols; i--; )
+ new (&table->v_cols[i]) dict_v_col_t();
+ table->autoinc_lock= static_cast<lock_t*>
+ (mem_heap_alloc(heap, sizeof *table->autoinc_lock));
+ /* If the table has an FTS index or we are in the process
+ of building one, create the table->fts */
+ if (dict_table_has_fts_index(table) ||
+ DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID |
+ DICT_TF2_FTS_ADD_DOC_ID))
+ {
+ table->fts= fts_create(table);
+ table->fts->cache= fts_cache_create(table);
+ }
+
+ new (&table->foreign_set) dict_foreign_set();
+ new (&table->referenced_set) dict_foreign_set();
+
+ return table;
+}
+
+/****************************************************************//**
+Free a table memory object.
*/ +void +dict_mem_table_free( +/*================*/ + dict_table_t* table) /*!< in: table */ +{ + ut_ad(table); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + ut_ad(UT_LIST_GET_LEN(table->indexes) == 0); +#ifdef BTR_CUR_HASH_ADAPT + ut_ad(UT_LIST_GET_LEN(table->freed_indexes) == 0); +#endif /* BTR_CUR_HASH_ADAPT */ + ut_d(table->cached = FALSE); + + if (dict_table_has_fts_index(table) + || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID) + || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID)) { + if (table->fts) { + table->fts->~fts_t(); + } + } + + dict_mem_table_free_foreign_vcol_set(table); + + table->foreign_set.~dict_foreign_set(); + table->referenced_set.~dict_foreign_set(); + + ut_free(table->name.m_name); + + /* Clean up virtual index info structures that are registered + with virtual columns */ + for (ulint i = 0; i < table->n_v_def; i++) { + dict_table_get_nth_v_col(table, i)->~dict_v_col_t(); + } + + UT_DELETE(table->s_cols); + + mem_heap_free(table->heap); +} + +/****************************************************************//** +Append 'name' to 'col_names'. @see dict_table_t::col_names +@return new column names array */ +static +const char* +dict_add_col_name( +/*==============*/ + const char* col_names, /*!< in: existing column names, or + NULL */ + ulint cols, /*!< in: number of existing columns */ + const char* name, /*!< in: new column name */ + mem_heap_t* heap) /*!< in: heap */ +{ + ulint old_len; + ulint new_len; + ulint total_len; + char* res; + + ut_ad(!cols == !col_names); + + /* Find out length of existing array. */ + if (col_names) { + const char* s = col_names; + ulint i; + + for (i = 0; i < cols; i++) { + s += strlen(s) + 1; + } + + old_len = unsigned(s - col_names); + } else { + old_len = 0; + } + + new_len = strlen(name) + 1; + total_len = old_len + new_len; + + res = static_cast(mem_heap_alloc(heap, total_len)); + + if (old_len > 0) { + memcpy(res, col_names, old_len); + } + + memcpy(res + old_len, name, new_len); + + return(res); +} + +/**********************************************************************//** +Adds a column definition to a table. */ +void +dict_mem_table_add_col( +/*===================*/ + dict_table_t* table, /*!< in: table */ + mem_heap_t* heap, /*!< in: temporary memory heap, or NULL */ + const char* name, /*!< in: column name, or NULL */ + ulint mtype, /*!< in: main datatype */ + ulint prtype, /*!< in: precise type */ + ulint len) /*!< in: precision */ +{ + dict_col_t* col; + unsigned i; + + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + ut_ad(!heap == !name); + + ut_ad(!(prtype & DATA_VIRTUAL)); + + i = table->n_def++; + + table->n_t_def++; + + if (name) { + if (table->n_def == table->n_cols) { + heap = table->heap; + } + if (i && !table->col_names) { + /* All preceding column names are empty. */ + char* s = static_cast( + mem_heap_zalloc(heap, table->n_def)); + + table->col_names = s; + } + + table->col_names = dict_add_col_name(table->col_names, + i, name, heap); + } + + col = dict_table_get_nth_col(table, i); + + dict_mem_fill_column_struct(col, i, mtype, prtype, len); + + switch (prtype & DATA_VERSIONED) { + case DATA_VERS_START: + ut_ad(!table->vers_start); + table->vers_start = i & dict_index_t::MAX_N_FIELDS; + break; + case DATA_VERS_END: + ut_ad(!table->vers_end); + table->vers_end = i & dict_index_t::MAX_N_FIELDS; + } +} + +/** Adds a virtual column definition to a table. +@param[in,out] table table +@param[in,out] heap temporary memory heap, or NULL. 
It is + used to store name when we have not finished + adding all columns. When all columns are + added, the whole name will copy to memory from + table->heap +@param[in] name column name +@param[in] mtype main datatype +@param[in] prtype precise type +@param[in] len length +@param[in] pos position in a table +@param[in] num_base number of base columns +@return the virtual column definition */ +dict_v_col_t* +dict_mem_table_add_v_col( + dict_table_t* table, + mem_heap_t* heap, + const char* name, + ulint mtype, + ulint prtype, + ulint len, + ulint pos, + ulint num_base) +{ + dict_v_col_t* v_col; + + ut_ad(table); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + ut_ad(!heap == !name); + + ut_ad(prtype & DATA_VIRTUAL); + + unsigned i = table->n_v_def++; + + table->n_t_def++; + + if (name != NULL) { + if (table->n_v_def == table->n_v_cols) { + heap = table->heap; + } + + if (i && !table->v_col_names) { + /* All preceding column names are empty. */ + char* s = static_cast( + mem_heap_zalloc(heap, table->n_v_def)); + + table->v_col_names = s; + } + + table->v_col_names = dict_add_col_name(table->v_col_names, + i, name, heap); + } + + v_col = &table->v_cols[i]; + + dict_mem_fill_column_struct(&v_col->m_col, pos, mtype, prtype, len); + v_col->v_pos = i & dict_index_t::MAX_N_FIELDS; + + if (num_base != 0) { + v_col->base_col = static_cast(mem_heap_zalloc( + table->heap, num_base * sizeof( + *v_col->base_col))); + } else { + v_col->base_col = NULL; + } + + v_col->num_base = static_cast(num_base) + & dict_index_t::MAX_N_FIELDS; + + /* Initialize the index list for virtual columns */ + ut_ad(v_col->v_indexes.empty()); + + return(v_col); +} + +/** Adds a stored column definition to a table. +@param[in] table table +@param[in] num_base number of base columns. */ +void +dict_mem_table_add_s_col( + dict_table_t* table, + ulint num_base) +{ + unsigned i = unsigned(table->n_def) - 1; + dict_col_t* col = dict_table_get_nth_col(table, i); + dict_s_col_t s_col; + + ut_ad(col != NULL); + + if (table->s_cols == NULL) { + table->s_cols = UT_NEW_NOKEY(dict_s_col_list()); + } + + s_col.m_col = col; + s_col.s_pos = i + table->n_v_def; + + if (num_base != 0) { + s_col.base_col = static_cast(mem_heap_zalloc( + table->heap, num_base * sizeof(dict_col_t*))); + } else { + s_col.base_col = NULL; + } + + s_col.num_base = num_base; + table->s_cols->push_front(s_col); +} + +/**********************************************************************//** +Renames a column of a table in the data dictionary cache. */ +static MY_ATTRIBUTE((nonnull)) +void +dict_mem_table_col_rename_low( +/*==========================*/ + dict_table_t* table, /*!< in/out: table */ + unsigned i, /*!< in: column offset corresponding to s */ + const char* to, /*!< in: new column name */ + const char* s, /*!< in: pointer to table->col_names */ + bool is_virtual) + /*!< in: if this is a virtual column */ +{ + char* t_col_names = const_cast( + is_virtual ? table->v_col_names : table->col_names); + ulint n_col = is_virtual ? table->n_v_def : table->n_def; + + size_t from_len = strlen(s), to_len = strlen(to); + + ut_ad(i < table->n_def || is_virtual); + ut_ad(i < table->n_v_def || !is_virtual); + + ut_ad(from_len <= NAME_LEN); + ut_ad(to_len <= NAME_LEN); + + char from[NAME_LEN + 1]; + strncpy(from, s, sizeof from - 1); + from[sizeof from - 1] = '\0'; + + if (from_len == to_len) { + /* The easy case: simply replace the column name in + table->col_names. 
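+ (For example, renaming a column "c1" to "d1" takes this branch:
+ both names occupy strlen + 1 == 3 bytes in the array, so the name
+ can be overwritten in place without moving anything else.)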
*/ + strcpy(const_cast(s), to); + } else { + /* We need to adjust all affected index->field + pointers, as in dict_index_add_col(). First, copy + table->col_names. */ + ulint prefix_len = ulint(s - t_col_names); + + for (; i < n_col; i++) { + s += strlen(s) + 1; + } + + ulint full_len = ulint(s - t_col_names); + char* col_names; + + if (to_len > from_len) { + col_names = static_cast( + mem_heap_alloc( + table->heap, + full_len + to_len - from_len)); + + memcpy(col_names, t_col_names, prefix_len); + } else { + col_names = const_cast(t_col_names); + } + + memcpy(col_names + prefix_len, to, to_len); + memmove(col_names + prefix_len + to_len, + t_col_names + (prefix_len + from_len), + full_len - (prefix_len + from_len)); + + /* Replace the field names in every index. */ + for (dict_index_t* index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + ulint n_fields = dict_index_get_n_fields(index); + + for (ulint i = 0; i < n_fields; i++) { + dict_field_t* field + = dict_index_get_nth_field( + index, i); + + ut_ad(!field->name + == field->col->is_dropped()); + if (!field->name) { + /* dropped columns lack a name */ + ut_ad(index->is_instant()); + continue; + } + + /* if is_virtual and that in field->col does + not match, continue */ + if ((!is_virtual) != + (!field->col->is_virtual())) { + continue; + } + + ulint name_ofs + = ulint(field->name - t_col_names); + if (name_ofs <= prefix_len) { + field->name = col_names + name_ofs; + } else { + ut_a(name_ofs < full_len); + field->name = col_names + + name_ofs + to_len - from_len; + } + } + } + + if (is_virtual) { + table->v_col_names = col_names; + } else { + table->col_names = col_names; + } + } + + /* Virtual columns are not allowed for foreign key */ + if (is_virtual) { + return; + } + + dict_foreign_t* foreign; + + /* Replace the field names in every foreign key constraint. */ + for (dict_foreign_set::iterator it = table->foreign_set.begin(); + it != table->foreign_set.end(); + ++it) { + + foreign = *it; + + if (foreign->foreign_index == NULL) { + /* We may go here when we set foreign_key_checks to 0, + and then try to rename a column and modify the + corresponding foreign key constraint. The index + would have been dropped, we have to find an equivalent + one */ + for (unsigned f = 0; f < foreign->n_fields; f++) { + if (strcmp(foreign->foreign_col_names[f], from) + == 0) { + + char** rc = const_cast( + foreign->foreign_col_names + + f); + + if (to_len <= strlen(*rc)) { + memcpy(*rc, to, to_len + 1); + } else { + *rc = static_cast( + mem_heap_dup( + foreign->heap, + to, + to_len + 1)); + } + } + } + + /* New index can be null if InnoDB already dropped + the foreign index when FOREIGN_KEY_CHECKS is + disabled */ + foreign->foreign_index = dict_foreign_find_index( + foreign->foreign_table, NULL, + foreign->foreign_col_names, + foreign->n_fields, NULL, true, false, + NULL, NULL, NULL); + + } else { + + for (unsigned f = 0; f < foreign->n_fields; f++) { + /* These can point straight to + table->col_names, because the foreign key + constraints will be freed at the same time + when the table object is freed. */ + foreign->foreign_col_names[f] + = dict_index_get_nth_field( + foreign->foreign_index, + f)->name; + } + } + } + + for (dict_foreign_set::iterator it = table->referenced_set.begin(); + it != table->referenced_set.end(); + ++it) { + + foreign = *it; + + if (!foreign->referenced_index) { + /* Referenced index could have been dropped + when foreign_key_checks is disabled. 
In that case, + rename the corresponding referenced_col_names and + find the equivalent referenced index also */ + for (unsigned f = 0; f < foreign->n_fields; f++) { + + const char*& rc = + foreign->referenced_col_names[f]; + if (strcmp(rc, from)) { + continue; + } + + if (to_len <= strlen(rc)) { + memcpy(const_cast(rc), to, + to_len + 1); + } else { + rc = static_cast( + mem_heap_dup( + foreign->heap, + to, to_len + 1)); + } + } + + /* New index can be null if InnoDB already dropped + the referenced index when FOREIGN_KEY_CHECKS is + disabled */ + foreign->referenced_index = dict_foreign_find_index( + foreign->referenced_table, NULL, + foreign->referenced_col_names, + foreign->n_fields, NULL, true, false, + NULL, NULL, NULL); + return; + } + + + for (unsigned f = 0; f < foreign->n_fields; f++) { + /* foreign->referenced_col_names[] need to be + copies, because the constraint may become + orphan when foreign_key_checks=0 and the + parent table is dropped. */ + + const char* col_name = dict_index_get_nth_field( + foreign->referenced_index, f)->name; + + if (strcmp(foreign->referenced_col_names[f], + col_name)) { + char** rc = const_cast( + foreign->referenced_col_names + f); + size_t col_name_len_1 = strlen(col_name) + 1; + + if (col_name_len_1 <= strlen(*rc) + 1) { + memcpy(*rc, col_name, col_name_len_1); + } else { + *rc = static_cast( + mem_heap_dup( + foreign->heap, + col_name, + col_name_len_1)); + } + } + } + } +} + +/**********************************************************************//** +Renames a column of a table in the data dictionary cache. */ +void +dict_mem_table_col_rename( +/*======================*/ + dict_table_t* table, /*!< in/out: table */ + ulint nth_col,/*!< in: column index */ + const char* from, /*!< in: old column name */ + const char* to, /*!< in: new column name */ + bool is_virtual) + /*!< in: if this is a virtual column */ +{ + const char* s = is_virtual ? table->v_col_names : table->col_names; + + ut_ad((!is_virtual && nth_col < table->n_def) + || (is_virtual && nth_col < table->n_v_def)); + + for (ulint i = 0; i < nth_col; i++) { + size_t len = strlen(s); + ut_ad(len > 0); + s += len + 1; + } + + ut_ad(!my_strcasecmp(system_charset_info, from, s)); + + dict_mem_table_col_rename_low(table, static_cast(nth_col), + to, s, is_virtual); +} + +/**********************************************************************//** +This function populates a dict_col_t memory structure with +supplied information. */ +void +dict_mem_fill_column_struct( +/*========================*/ + dict_col_t* column, /*!< out: column struct to be + filled */ + ulint col_pos, /*!< in: column position */ + ulint mtype, /*!< in: main data type */ + ulint prtype, /*!< in: precise type */ + ulint col_len) /*!< in: column length */ +{ + unsigned mbminlen, mbmaxlen; + + column->ind = static_cast(col_pos) + & dict_index_t::MAX_N_FIELDS; + column->ord_part = 0; + column->max_prefix = 0; + column->mtype = static_cast(mtype); + column->prtype = static_cast(prtype); + column->len = static_cast(col_len); + dtype_get_mblen(mtype, prtype, &mbminlen, &mbmaxlen); + column->mbminlen = mbminlen & 7; + column->mbmaxlen = mbmaxlen & 7; + column->def_val.data = NULL; + column->def_val.len = UNIV_SQL_DEFAULT; + ut_ad(!column->is_dropped()); +} + +/**********************************************************************//** +Creates an index memory object. 
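+A hypothetical call (illustrative values only):
+dict_mem_index_create(table, "idx_a", DICT_UNIQUE, 1) allocates a
+one-field unique index object named "idx_a" for the given table.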
+@return own: index object */ +dict_index_t* +dict_mem_index_create( +/*==================*/ + dict_table_t* table, /*!< in: table */ + const char* index_name, /*!< in: index name */ + ulint type, /*!< in: DICT_UNIQUE, + DICT_CLUSTERED, ... ORed */ + ulint n_fields) /*!< in: number of fields */ +{ + dict_index_t* index; + mem_heap_t* heap; + + ut_ad(!table || table->magic_n == DICT_TABLE_MAGIC_N); + ut_ad(index_name); + + heap = mem_heap_create(DICT_HEAP_SIZE); + + index = static_cast( + mem_heap_zalloc(heap, sizeof(*index))); + index->table = table; + + dict_mem_fill_index_struct(index, heap, index_name, type, n_fields); + + new (&index->zip_pad.mutex) std::mutex(); + + if (type & DICT_SPATIAL) { + index->rtr_track = new + (mem_heap_alloc(heap, sizeof *index->rtr_track)) + rtr_info_track_t(); + mysql_mutex_init(rtr_active_mutex_key, + &index->rtr_track->rtr_active_mutex, nullptr); + } + + return(index); +} + +/**********************************************************************//** +Creates and initializes a foreign constraint memory object. +@return own: foreign constraint struct */ +dict_foreign_t* +dict_mem_foreign_create(void) +/*=========================*/ +{ + dict_foreign_t* foreign; + mem_heap_t* heap; + DBUG_ENTER("dict_mem_foreign_create"); + + heap = mem_heap_create(100); + + foreign = static_cast( + mem_heap_zalloc(heap, sizeof(dict_foreign_t))); + + foreign->heap = heap; + + foreign->v_cols = NULL; + + DBUG_PRINT("dict_mem_foreign_create", ("heap: %p", heap)); + + DBUG_RETURN(foreign); +} + +/**********************************************************************//** +Sets the foreign_table_name_lookup pointer based on the value of +lower_case_table_names. If that is 0 or 1, foreign_table_name_lookup +will point to foreign_table_name. If 2, then another string is +allocated from foreign->heap and set to lower case. */ +void +dict_mem_foreign_table_name_lookup_set( +/*===================================*/ + dict_foreign_t* foreign, /*!< in/out: foreign struct */ + ibool do_alloc) /*!< in: is an alloc needed */ +{ + if (lower_case_table_names == 2) { + if (do_alloc) { + ulint len; + + len = strlen(foreign->foreign_table_name) + 1; + + foreign->foreign_table_name_lookup = + static_cast( + mem_heap_alloc(foreign->heap, len)); + } + strcpy(foreign->foreign_table_name_lookup, + foreign->foreign_table_name); + innobase_casedn_str(foreign->foreign_table_name_lookup); + } else { + foreign->foreign_table_name_lookup + = foreign->foreign_table_name; + } +} + +/**********************************************************************//** +Sets the referenced_table_name_lookup pointer based on the value of +lower_case_table_names. If that is 0 or 1, referenced_table_name_lookup +will point to referenced_table_name. If 2, then another string is +allocated from foreign->heap and set to lower case. 
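+For example (illustrative name): with lower_case_table_names=2, a
+referenced table name "db1/Parent" yields the lookup name "db1/parent";
+with 0 or 1, the lookup pointer simply aliases referenced_table_name.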
*/ +void +dict_mem_referenced_table_name_lookup_set( +/*======================================*/ + dict_foreign_t* foreign, /*!< in/out: foreign struct */ + ibool do_alloc) /*!< in: is an alloc needed */ +{ + if (lower_case_table_names == 2) { + if (do_alloc) { + ulint len; + + len = strlen(foreign->referenced_table_name) + 1; + + foreign->referenced_table_name_lookup = + static_cast( + mem_heap_alloc(foreign->heap, len)); + } + strcpy(foreign->referenced_table_name_lookup, + foreign->referenced_table_name); + innobase_casedn_str(foreign->referenced_table_name_lookup); + } else { + foreign->referenced_table_name_lookup + = foreign->referenced_table_name; + } +} + +/** Fill the virtual column set with virtual column information +present in the given virtual index. +@param[in] index virtual index +@param[out] v_cols virtual column set. */ +static +void +dict_mem_fill_vcol_has_index( + const dict_index_t* index, + dict_vcol_set** v_cols) +{ + for (ulint i = 0; i < index->table->n_v_cols; i++) { + dict_v_col_t* v_col = dict_table_get_nth_v_col( + index->table, i); + if (!v_col->m_col.ord_part) { + continue; + } + + for (const auto& v_idx : v_col->v_indexes) { + if (v_idx.index != index) { + continue; + } + + if (*v_cols == NULL) { + *v_cols = UT_NEW_NOKEY(dict_vcol_set()); + } + + (*v_cols)->insert(v_col); + } + } +} + +/** Fill the virtual column set with the virtual column of the index +if the index contains given column name. +@param[in] col_name column name +@param[in] table innodb table object +@param[out] v_cols set of virtual column information. */ +static +void +dict_mem_fill_vcol_from_v_indexes( + const char* col_name, + const dict_table_t* table, + dict_vcol_set** v_cols) +{ + /* virtual column can't be Primary Key, so start with + secondary index */ + for (dict_index_t* index = dict_table_get_next_index( + dict_table_get_first_index(table)); + index; + index = dict_table_get_next_index(index)) { + + /* Skip if the index have newly added + virtual column because field name is NULL. + Later virtual column set will be + refreshed during loading of table. */ + if (!dict_index_has_virtual(index) + || index->has_new_v_col()) { + continue; + } + + for (ulint i = 0; i < index->n_fields; i++) { + dict_field_t* field = + dict_index_get_nth_field(index, i); + + if (strcmp(field->name, col_name) == 0) { + dict_mem_fill_vcol_has_index( + index, v_cols); + } + } + } +} + +/** Fill the virtual column set with virtual columns which have base columns +as the given col_name +@param[in] col_name column name +@param[in] table table object +@param[out] v_cols set of virtual columns. */ +static +void +dict_mem_fill_vcol_set_for_base_col( + const char* col_name, + const dict_table_t* table, + dict_vcol_set** v_cols) +{ + for (ulint i = 0; i < table->n_v_cols; i++) { + dict_v_col_t* v_col = dict_table_get_nth_v_col(table, i); + + if (!v_col->m_col.ord_part) { + continue; + } + + for (ulint j = 0; j < unsigned{v_col->num_base}; j++) { + if (strcmp(col_name, dict_table_get_col_name( + table, + v_col->base_col[j]->ind)) == 0) { + + if (*v_cols == NULL) { + *v_cols = UT_NEW_NOKEY(dict_vcol_set()); + } + + (*v_cols)->insert(v_col); + } + } + } +} + +/** Fills the dependent virtual columns in a set. +Reason for being dependent are +1) FK can be present on base column of virtual columns +2) FK can be present on column which is a part of virtual index +@param[in,out] foreign foreign key information. 
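+An illustrative schema for case 1) (hypothetical, not from any test):
+CREATE TABLE t (a INT, v INT AS (a) VIRTUAL, KEY(v),
+FOREIGN KEY (a) REFERENCES p(a)) ENGINE=InnoDB;
+here the indexed virtual column v depends on the FK column a through its
+base column, so v is collected into foreign->v_cols.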
*/ +void +dict_mem_foreign_fill_vcol_set( + dict_foreign_t* foreign) +{ + ulint type = foreign->type; + + if (type == 0) { + return; + } + + for (ulint i = 0; i < foreign->n_fields; i++) { + /** FK can be present on base columns + of virtual columns. */ + dict_mem_fill_vcol_set_for_base_col( + foreign->foreign_col_names[i], + foreign->foreign_table, + &foreign->v_cols); + + /** FK can be present on the columns + which can be a part of virtual index. */ + dict_mem_fill_vcol_from_v_indexes( + foreign->foreign_col_names[i], + foreign->foreign_table, + &foreign->v_cols); + } +} + +/** Fill virtual columns set in each fk constraint present in the table. +@param[in,out] table innodb table object. */ +void +dict_mem_table_fill_foreign_vcol_set( + dict_table_t* table) +{ + dict_foreign_set fk_set = table->foreign_set; + dict_foreign_t* foreign; + + dict_foreign_set::iterator it; + for (it = fk_set.begin(); it != fk_set.end(); ++it) { + foreign = *it; + + dict_mem_foreign_fill_vcol_set(foreign); + } +} + +/** Free the vcol_set from all foreign key constraint on the table. +@param[in,out] table innodb table object. */ +void +dict_mem_table_free_foreign_vcol_set( + dict_table_t* table) +{ + dict_foreign_set fk_set = table->foreign_set; + dict_foreign_t* foreign; + + dict_foreign_set::iterator it; + for (it = fk_set.begin(); it != fk_set.end(); ++it) { + + foreign = *it; + + if (foreign->v_cols != NULL) { + UT_DELETE(foreign->v_cols); + foreign->v_cols = NULL; + } + } +} + +/**********************************************************************//** +Frees an index memory object. */ +void +dict_mem_index_free( +/*================*/ + dict_index_t* index) /*!< in: index */ +{ + ut_ad(index); + ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + + index->zip_pad.mutex.~mutex(); + + if (dict_index_is_spatial(index)) { + for (auto& rtr_info : index->rtr_track->rtr_active) { + rtr_info->index = NULL; + } + + mysql_mutex_destroy(&index->rtr_track->rtr_active_mutex); + index->rtr_track->~rtr_info_track_t(); + } + + index->detach_columns(); + mem_heap_free(index->heap); +} + +/** Create a temporary tablename like "#sql-ibNNN". +@param[in] heap A memory heap +@param[in] dbtab Table name in the form database/table name +@param[in] id Table id +@return A unique temporary tablename suitable for InnoDB use */ +char* +dict_mem_create_temporary_tablename( + mem_heap_t* heap, + const char* dbtab, + table_id_t id) +{ + size_t size; + char* name; + const char* dbend = strchr(dbtab, '/'); + ut_ad(dbend); + size_t dblen = size_t(dbend - dbtab) + 1; + + size = dblen + (sizeof(TEMP_FILE_PREFIX_INNODB) + 20); + name = static_cast(mem_heap_alloc(heap, size)); + memcpy(name, dbtab, dblen); + snprintf(name + dblen, size - dblen, + TEMP_FILE_PREFIX_INNODB UINT64PF, id); + + return(name); +} + +/** Validate the search order in the foreign key set. +@param[in] fk_set the foreign key set to be validated +@return true if search order is fine in the set, false otherwise. */ +bool +dict_foreign_set_validate( + const dict_foreign_set& fk_set) +{ + dict_foreign_not_exists not_exists(fk_set); + + dict_foreign_set::const_iterator it = std::find_if( + fk_set.begin(), fk_set.end(), not_exists); + + if (it == fk_set.end()) { + return(true); + } + + dict_foreign_t* foreign = *it; + std::cerr << "Foreign key lookup failed: " << *foreign; + std::cerr << fk_set; + ut_ad(0); + return(false); +} + +/** Validate the search order in the foreign key sets of the table +(foreign_set and referenced_set). 
+@param[in] table table whose foreign key sets are to be validated +@return true if foreign key sets are fine, false otherwise. */ +bool +dict_foreign_set_validate( + const dict_table_t& table) +{ + return(dict_foreign_set_validate(table.foreign_set) + && dict_foreign_set_validate(table.referenced_set)); +} + +std::ostream& +operator<< (std::ostream& out, const dict_foreign_t& foreign) +{ + out << "[dict_foreign_t: id='" << foreign.id << "'"; + + if (foreign.foreign_table_name != NULL) { + out << ",for: '" << foreign.foreign_table_name << "'"; + } + + out << "]"; + return(out); +} + +std::ostream& +operator<< (std::ostream& out, const dict_foreign_set& fk_set) +{ + out << "[dict_foreign_set:"; + std::for_each(fk_set.begin(), fk_set.end(), dict_foreign_print(out)); + out << "]" << std::endl; + return(out); +} + +/** Check whether fulltext index gets affected by foreign +key constraint. */ +bool dict_foreign_t::affects_fulltext() const +{ + if (foreign_table == referenced_table || !foreign_table->fts) + return false; + + for (ulint i= 0; i < n_fields; i++) + { + const dict_col_t *col= dict_index_get_nth_col(foreign_index, i); + if (dict_table_is_fts_column(foreign_table->fts->indexes, col->ind, + col->is_virtual()) != ULINT_UNDEFINED) + return true; + } + + return false; +} + +/** Reconstruct the clustered index fields. +@return whether metadata is incorrect */ +inline bool dict_index_t::reconstruct_fields() +{ + DBUG_ASSERT(is_primary()); + + const auto old_n_fields = n_fields; + + n_fields = (n_fields + table->instant->n_dropped) + & dict_index_t::MAX_N_FIELDS; + n_def = (n_def + table->instant->n_dropped) + & dict_index_t::MAX_N_FIELDS; + + const unsigned n_first = first_user_field(); + + dict_field_t* tfields = static_cast( + mem_heap_zalloc(heap, n_fields * sizeof *fields)); + + memcpy(tfields, fields, n_first * sizeof *fields); + + n_nullable = 0; + ulint n_core_null = 0; + const bool comp = dict_table_is_comp(table); + const auto* field_map_it = table->instant->field_map; + for (unsigned i = n_first, j = 0; i < n_fields; ) { + dict_field_t& f = tfields[i++]; + auto c = *field_map_it++; + if (c.is_dropped()) { + f.col = &table->instant->dropped[j++]; + DBUG_ASSERT(f.col->is_dropped()); + f.fixed_len = dict_col_get_fixed_size(f.col, comp) + & ((1U << 10) - 1); + } else { + DBUG_ASSERT(!c.is_not_null()); + const auto old = std::find_if( + fields + n_first, fields + old_n_fields, + [c](const dict_field_t& o) + { return o.col->ind == c.ind(); }); + + if (old >= fields + old_n_fields + || old->prefix_len + || old->col != &table->cols[c.ind()]) { + return true; + } + + ut_ad(old >= &fields[n_first]); + f = *old; + } + + f.col->clear_instant(); + if (f.col->is_nullable()) { + n_nullable++; + n_core_null += i <= n_core_fields; + } + } + + fields = tfields; + n_core_null_bytes = static_cast(UT_BITS_IN_BYTES(n_core_null)); + + return false; +} + +/** Reconstruct dropped or reordered columns. 
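+As implied by the parser below, the metadata consists of a 4-byte count
+of non-PK fields followed by one 2-byte field_map_element_t per such
+field; both integers are big-endian, as read by mach_read_from_4() and
+mach_read_from_2().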
+@param[in] metadata data from serialise_columns() +@param[in] len length of the metadata, in bytes +@return whether parsing the metadata failed */ +bool dict_table_t::deserialise_columns(const byte* metadata, ulint len) +{ + DBUG_ASSERT(!instant); + + unsigned num_non_pk_fields = mach_read_from_4(metadata); + metadata += 4; + + if (num_non_pk_fields >= REC_MAX_N_FIELDS - 3) { + return true; + } + + dict_index_t* index = UT_LIST_GET_FIRST(indexes); + + if (num_non_pk_fields < unsigned(index->n_fields) + - index->first_user_field()) { + return true; + } + + field_map_element_t* field_map = static_cast( + mem_heap_alloc(heap, + num_non_pk_fields * sizeof *field_map)); + + unsigned n_dropped_cols = 0; + + for (unsigned i = 0; i < num_non_pk_fields; i++) { + auto c = field_map[i] = mach_read_from_2(metadata); + metadata += 2; + + if (field_map[i].is_dropped()) { + if (c.ind() > DICT_MAX_FIXED_COL_LEN + 1) { + return true; + } + n_dropped_cols++; + } else if (c >= n_cols) { + return true; + } + } + + dict_col_t* dropped_cols = static_cast(mem_heap_zalloc( + heap, n_dropped_cols * sizeof(dict_col_t))); + instant = new (mem_heap_alloc(heap, sizeof *instant)) dict_instant_t(); + instant->n_dropped = n_dropped_cols; + instant->dropped = dropped_cols; + instant->field_map = field_map; + + dict_col_t* col = dropped_cols; + for (unsigned i = 0; i < num_non_pk_fields; i++) { + if (field_map[i].is_dropped()) { + auto fixed_len = field_map[i].ind(); + DBUG_ASSERT(fixed_len <= DICT_MAX_FIXED_COL_LEN + 1); + (col++)->set_dropped(field_map[i].is_not_null(), + fixed_len == 1, + fixed_len > 1 ? fixed_len - 1 + : 0); + } + } + DBUG_ASSERT(col == &dropped_cols[n_dropped_cols]); + + return UT_LIST_GET_FIRST(indexes)->reconstruct_fields(); +} + +/** Check if record in clustered index is historical row. +@param[in] rec clustered row +@param[in] offsets offsets +@return true if row is historical */ +bool +dict_index_t::vers_history_row( + const rec_t* rec, + const rec_offs* offsets) +{ + ut_ad(is_primary()); + + ulint len; + dict_col_t& col= table->cols[table->vers_end]; + ut_ad(col.vers_sys_end()); + ulint nfield = dict_col_get_clust_pos(&col, this); + const byte *data = rec_get_nth_field(rec, offsets, nfield, &len); + if (col.vers_native()) { + ut_ad(len == sizeof trx_id_max_bytes); + return 0 != memcmp(data, trx_id_max_bytes, len); + } + ut_ad(len == sizeof timestamp_max_bytes); + return 0 != memcmp(data, timestamp_max_bytes, len); +} + +/** Check if record in secondary index is historical row. +@param[in] rec record in a secondary index +@param[out] history_row true if row is historical +@return true on error */ +bool +dict_index_t::vers_history_row( + const rec_t* rec, + bool &history_row) +{ + ut_ad(!is_primary()); + + /* + Get row_end from clustered index + + TODO (optimization): row_end can be taken from unique secondary index + as well. For that dict_index_t::vers_end member should be added and + updated at index init (dict_index_build_internal_non_clust()). 
+
+ Test case:
+
+ create or replace table t1 (x int unique, y int unique,
+ foreign key r (y) references t1 (x))
+ with system versioning engine innodb;
+ insert into t1 values (1, 1);
+ */
+ bool error = false;
+ mem_heap_t* heap = NULL;
+ dict_index_t* clust_index = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ mtr_t mtr;
+ mtr.start();
+
+ rec_t* clust_rec =
+ row_get_clust_rec(BTR_SEARCH_LEAF, rec, this, &clust_index, &mtr);
+ if (clust_rec) {
+ offsets = rec_get_offsets(clust_rec, clust_index, offsets,
+ clust_index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ history_row = clust_index->vers_history_row(clust_rec, offsets);
+ } else {
+ ib::error() << "foreign constraints: secondary index is out of "
+ "sync";
+ ut_ad("secondary index is out of sync" == 0);
+ error = true;
+ }
+ mtr.commit();
+ if (heap) {
+ mem_heap_free(heap);
+ }
+ return(error);
+}
diff --git a/storage/innobase/dict/dict0stats.cc b/storage/innobase/dict/dict0stats.cc
new file mode 100644
index 00000000..40969335
--- /dev/null
+++ b/storage/innobase/dict/dict0stats.cc
@@ -0,0 +1,4724 @@
+/*****************************************************************************
+
+Copyright (c) 2009, 2019, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file dict/dict0stats.cc
+Code used for calculating and manipulating table statistics.
+
+Created Jan 06, 2010 Vasil Dimov
+*******************************************************/
+
+#include "dict0stats.h"
+#include "dyn0buf.h"
+#include "row0sel.h"
+#include "trx0trx.h"
+#include "lock0lock.h"
+#include "pars0pars.h"
+#include <mysql_com.h>
+#include "log.h"
+#include "btr0btr.h"
+#include "que0que.h"
+#include "scope.h"
+#include "debug_sync.h"
+
+#include <algorithm>
+#include <map>
+#include <vector>
+#include <thread>
+
+/* Sampling algorithm description @{
+
+The algorithm is controlled by one number - N_SAMPLE_PAGES(index),
+let it be A, which is the number of leaf pages to analyze for a given index
+for each n-prefix (if the index is on 3 columns, then 3*A leaf pages will be
+analyzed).
+
+Let the total number of leaf pages in the table be T.
+Level 0 - leaf pages, level H - root.
+
+Definition: an N-prefix-boring record is a record on a non-leaf page that
+equals the next (to the right, crossing page boundaries, skipping the
+supremum and infimum) record on the same level when looking at the first
+n-prefix columns. The last (user) record on a level is not boring (it does
+not match the non-existent user record to the right). We call the records
+boring because all the records on the page below a boring record are equal
+to that boring record.
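+
+For example (illustrative values): if the 1-prefix values on a level are
+5,5,5,9 then the first two records are 1-prefix-boring (each equals its
+right neighbour on the first column) and the last two are not.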
+
+We avoid diving below boring records when searching for a leaf page to
+estimate the number of distinct records because we know that such a leaf
+page will have number of distinct records == 1.
+
+For each n-prefix: start from the root level and fully scan subsequent
+lower levels until a level that contains at least A*10 distinct records
+is found. Let's call this level LA.
+As an optimization the search is canceled if it has reached level 1 (never
+descend to the level 0 (leaf)) and also if the next level to be scanned
+would contain more than A pages. The latter is because the user has asked
+to analyze A leaf pages and it does not make sense to scan much more than
+A non-leaf pages with the sole purpose of finding a good sample of A leaf
+pages.
+
+After finding the appropriate level LA with >A*10 distinct records (or less
+in the exceptions described above), divide it into groups of equal records
+and pick A such groups. Then pick the last record from each group. For
+example, let the level be:
+
+index: 0,1,2,3,4,5,6,7,8,9,10
+record: 1,1,1,2,2,7,7,7,7,7,9
+
+There are 4 groups of distinct records and if A=2 random ones are selected,
+e.g. 1,1,1 and 7,7,7,7,7, then records with indexes 2 and 9 will be selected.
+
+After selecting A records as described above, dive below them to find A leaf
+pages and analyze them, finding the total number of distinct records. The
+dive to the leaf level is performed by selecting a non-boring record from
+each page and diving below it.
+
+This way, a total of A leaf pages are analyzed for the given n-prefix.
+
+Let the number of different key values found in each leaf page i be Pi (i=1..A).
+Let N_DIFF_AVG_LEAF be (P1 + P2 + ... + PA) / A.
+Let the number of different key values on level LA be N_DIFF_LA.
+Let the total number of records on level LA be TOTAL_LA.
+Let R be N_DIFF_LA / TOTAL_LA, we assume this ratio is the same on the
+leaf level.
+Let the number of leaf pages be N.
+Then the total number of different key values on the leaf level is:
+N * R * N_DIFF_AVG_LEAF.
+See REF01 for the implementation.
+
+The above describes how to calculate the cardinality of an index.
+This algorithm is executed for each n-prefix of a multi-column index
+where n=1..n_uniq.
+@} */
+
+/* names of the tables from the persistent statistics storage */
+#define TABLE_STATS_NAME_PRINT "mysql.innodb_table_stats"
+#define INDEX_STATS_NAME_PRINT "mysql.innodb_index_stats"
+
+#ifdef UNIV_STATS_DEBUG
+#define DEBUG_PRINTF(fmt, ...) printf(fmt, ## __VA_ARGS__)
+#else /* UNIV_STATS_DEBUG */
+#define DEBUG_PRINTF(fmt, ...) /* noop */
+#endif /* UNIV_STATS_DEBUG */
+
+/* Gets the number of leaf pages to sample in persistent stats estimation */
+#define N_SAMPLE_PAGES(index) \
+ static_cast<ib_uint64_t>( \
+ (index)->table->stats_sample_pages != 0 \
+ ? (index)->table->stats_sample_pages \
+ : srv_stats_persistent_sample_pages)
+
+/* number of distinct records on a given level that are required to stop
+descending to lower levels and fetch N_SAMPLE_PAGES(index) records
+from that level */
+#define N_DIFF_REQUIRED(index) (N_SAMPLE_PAGES(index) * 10)
+
+/* A dynamic array where we store the boundaries of each distinct group
+of keys. For example if a btree level is:
+index: 0,1,2,3,4,5,6,7,8,9,10,11,12
+data: b,b,b,b,b,b,g,g,j,j,j, x, y
+then we would store 5,7,10,11,12 in the array. */
+typedef std::vector<ib_uint64_t, ut_allocator<ib_uint64_t> > boundaries_t;
+
+/** Allocator type used for index_map_t. */
+typedef ut_allocator<std::pair<const char* const, dict_index_t*> >
+ index_map_t_allocator;
+
+/** Auxiliary map used for sorting indexes by name in dict_stats_save(). */
+typedef std::map<const char*, dict_index_t*, ut_strcmp_functor,
+ index_map_t_allocator> index_map_t;
+
+bool dict_table_t::is_stats_table() const
+{
+ return !strcmp(name.m_name, TABLE_STATS_NAME) ||
+ !strcmp(name.m_name, INDEX_STATS_NAME);
+}
+
+bool trx_t::has_stats_table_lock() const
+{
+ for (const lock_t *l : lock.table_locks)
+ if (l && l->un_member.tab_lock.table->is_stats_table())
+ return true;
+ return false;
+}
+
+/*********************************************************************//**
+Checks whether an index should be ignored in stats manipulations:
+* stats fetch
+* stats recalc
+* stats save
+@return true if the index should be ignored */
+UNIV_INLINE
+bool
+dict_stats_should_ignore_index(
+/*===========================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ return !index->is_btree() || index->to_be_dropped || !index->is_committed();
+}
+
+
+/** expected column definition */
+struct dict_col_meta_t
+{
+ /** column name */
+ const char *name;
+ /** main type */
+ unsigned mtype;
+ /** prtype mask; all these bits have to be set in prtype */
+ unsigned prtype_mask;
+ /** column length in bytes */
+ unsigned len;
+};
+
+/** For checking whether a table exists and has a predefined schema */
+struct dict_table_schema_t
+{
+ /** table name */
+ span<const char> table_name;
+ /** table name in SQL */
+ const char *table_name_sql;
+ /** number of columns */
+ unsigned n_cols;
+ /** columns */
+ const dict_col_meta_t columns[8];
+};
+
+static const dict_table_schema_t table_stats_schema =
+{
+ {C_STRING_WITH_LEN(TABLE_STATS_NAME)}, TABLE_STATS_NAME_PRINT, 6,
+ {
+ {"database_name", DATA_VARMYSQL, DATA_NOT_NULL, 192},
+ {"table_name", DATA_VARMYSQL, DATA_NOT_NULL, 597},
+ /*
+ Don't check the DATA_UNSIGNED flag in last_update.
+ It is present if the server is running in a pure MariaDB installation,
+ because MariaDB's Field_timestampf::flags has UNSIGNED_FLAG.
+ But DATA_UNSIGNED is missing when the server starts on a MySQL-5.7
+ data directory (during a migration), because MySQL's
+ Field_timestampf::flags does not have UNSIGNED_FLAG.
+ It is fine not to check DATA_UNSIGNED, because Field_timestampf
+ in both MariaDB and MySQL supports only non-negative time_t values.
+ */
+ {"last_update", DATA_INT, DATA_NOT_NULL, 4},
+ {"n_rows", DATA_INT, DATA_NOT_NULL | DATA_UNSIGNED, 8},
+ {"clustered_index_size", DATA_INT, DATA_NOT_NULL | DATA_UNSIGNED, 8},
+ {"sum_of_other_index_sizes", DATA_INT, DATA_NOT_NULL | DATA_UNSIGNED, 8},
+ }
+};
+
+static const dict_table_schema_t index_stats_schema =
+{
+ {C_STRING_WITH_LEN(INDEX_STATS_NAME)}, INDEX_STATS_NAME_PRINT, 8,
+ {
+ {"database_name", DATA_VARMYSQL, DATA_NOT_NULL, 192},
+ {"table_name", DATA_VARMYSQL, DATA_NOT_NULL, 597},
+ {"index_name", DATA_VARMYSQL, DATA_NOT_NULL, 192},
+ /*
+ Don't check the DATA_UNSIGNED flag in last_update.
+ See comments about last_update in table_stats_schema above.
+ */
+ {"last_update", DATA_INT, DATA_NOT_NULL, 4},
+ {"stat_name", DATA_VARMYSQL, DATA_NOT_NULL, 64*3},
+ {"stat_value", DATA_INT, DATA_NOT_NULL | DATA_UNSIGNED, 8},
+ {"sample_size", DATA_INT, DATA_UNSIGNED, 8},
+ {"stat_description", DATA_VARMYSQL, DATA_NOT_NULL, 1024*3}
+ }
+};
+
+/** Construct the type's SQL name (e.g.
BIGINT UNSIGNED) +@param mtype InnoDB main type +@param prtype InnoDB precise type +@param len length of the column +@param name the SQL name +@param name_sz size of the name buffer +@return number of bytes written (excluding the terminating NUL byte) */ +static int dtype_sql_name(unsigned mtype, unsigned prtype, unsigned len, + char *name, size_t name_sz) +{ + const char *Unsigned= ""; + const char *Main= "UNKNOWN"; + + switch (mtype) { + case DATA_INT: + switch (len) { + case 1: + Main= "TINYINT"; + break; + case 2: + Main= "SMALLINT"; + break; + case 3: + Main= "MEDIUMINT"; + break; + case 4: + Main= "INT"; + break; + case 8: + Main= "BIGINT"; + break; + } + + append_unsigned: + if (prtype & DATA_UNSIGNED) + Unsigned= " UNSIGNED"; + len= 0; + break; + case DATA_FLOAT: + Main= "FLOAT"; + goto append_unsigned; + case DATA_DOUBLE: + Main= "DOUBLE"; + goto append_unsigned; + case DATA_FIXBINARY: + Main= "BINARY"; + break; + case DATA_CHAR: + case DATA_MYSQL: + Main= "CHAR"; + break; + case DATA_VARCHAR: + case DATA_VARMYSQL: + Main= "VARCHAR"; + break; + case DATA_BINARY: + Main= "VARBINARY"; + break; + case DATA_GEOMETRY: + Main= "GEOMETRY"; + len= 0; + break; + case DATA_BLOB: + switch (len) { + case 9: + Main= "TINYBLOB"; + break; + case 10: + Main= "BLOB"; + break; + case 11: + Main= "MEDIUMBLOB"; + break; + case 12: + Main= "LONGBLOB"; + break; + } + len= 0; + } + + const char* Not_null= (prtype & DATA_NOT_NULL) ? " NOT NULL" : ""; + if (len) + return snprintf(name, name_sz, "%s(%u)%s%s", Main, len, Unsigned, + Not_null); + else + return snprintf(name, name_sz, "%s%s%s", Main, Unsigned, Not_null); +} + +static bool innodb_table_stats_not_found; +static bool innodb_index_stats_not_found; +static bool innodb_table_stats_not_found_reported; +static bool innodb_index_stats_not_found_reported; + +/*********************************************************************//** +Checks whether a table exists and whether it has the given structure. +The table must have the same number of columns with the same names and +types. The order of the columns does not matter. 
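+For example (hypothetical mismatch): if n_rows had been created as INT
+NOT NULL instead of BIGINT UNSIGNED NOT NULL, this check would fail with
+an error text like "Column n_rows in table mysql.innodb_table_stats is
+INT NOT NULL but should be BIGINT UNSIGNED NOT NULL".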
+/*********************************************************************//**
+Checks whether a table exists and whether it has the given structure.
+The table must have the same number of columns with the same names and
+types. The order of the columns does not matter.
+dict_table_schema_check() @{
+@return DB_SUCCESS if the table exists and contains the necessary columns */
+static
+dberr_t
+dict_table_schema_check(
+/*====================*/
+ const dict_table_schema_t* req_schema, /*!< in: required table
+ schema */
+ char* errstr, /*!< out: human readable error
+ message if != DB_SUCCESS is
+ returned */
+ size_t errstr_sz) /*!< in: errstr size */
+{
+ const dict_table_t* table= dict_sys.load_table(req_schema->table_name);
+
+ if (!table) {
+ if (opt_bootstrap)
+ return DB_TABLE_NOT_FOUND;
+ if (req_schema == &table_stats_schema) {
+ if (innodb_table_stats_not_found_reported) {
+ return DB_STATS_DO_NOT_EXIST;
+ }
+ innodb_table_stats_not_found = true;
+ innodb_table_stats_not_found_reported = true;
+ } else {
+ ut_ad(req_schema == &index_stats_schema);
+ if (innodb_index_stats_not_found_reported) {
+ return DB_STATS_DO_NOT_EXIST;
+ }
+ innodb_index_stats_not_found = true;
+ innodb_index_stats_not_found_reported = true;
+ }
+
+ snprintf(errstr, errstr_sz, "Table %s not found.",
+ req_schema->table_name_sql);
+ return DB_TABLE_NOT_FOUND;
+ }
+
+ if (!table->is_readable() && !table->space) {
+ /* missing tablespace */
+ snprintf(errstr, errstr_sz,
+ "Tablespace for table %s is missing.",
+ req_schema->table_name_sql);
+ return DB_TABLE_NOT_FOUND;
+ }
+
+ if (unsigned(table->n_def - DATA_N_SYS_COLS) != req_schema->n_cols) {
+ /* the table has a different number of columns than required */
+ snprintf(errstr, errstr_sz,
+ "%s has %d columns but should have %u.",
+ req_schema->table_name_sql,
+ table->n_def - DATA_N_SYS_COLS,
+ req_schema->n_cols);
+ return DB_ERROR;
+ }
+
+ /* For each column from req_schema->columns[] search
+ whether it is present in table->cols[].
+ The following algorithm is O(n_cols^2), but is optimized to
+ be O(n_cols) if the columns are in the same order in both arrays. */
+
+ for (unsigned i = 0; i < req_schema->n_cols; i++) {
+ ulint j = dict_table_has_column(
+ table, req_schema->columns[i].name, i);
+
+ if (j == table->n_def) {
+ snprintf(errstr, errstr_sz,
+ "required column %s"
+ " not found in table %s.",
+ req_schema->columns[i].name,
+ req_schema->table_name_sql);
+
+ return(DB_ERROR);
+ }
+
+ /* we found a column with the same name at the j'th
+ position, compare column types and flags */
+
+ /* check length for exact match */
+ if (req_schema->columns[i].len != table->cols[j].len) {
+ sql_print_warning("InnoDB: Table %s has"
+ " a length mismatch in the"
+ " column %s."
+ " Please run mariadb-upgrade",
+ req_schema->table_name_sql,
+ req_schema->columns[i].name);
+ }
+
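+ /* For example, a table_name column created as VARCHAR(64)
+ by an older server version is 192 bytes long instead of the
+ expected 597 bytes, which triggers the warning above until
+ mariadb-upgrade widens the column. */
+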
+ /*
+ Check mtype for an exact match.
+ This check is relaxed to allow us to use TIMESTAMP
+ (i.e. INT) for last_update instead of DATA_BINARY.
+ We have to test for both values as the innodb_table_stats
+ table may come from MySQL and have the old type.
+ */
+ if (req_schema->columns[i].mtype != table->cols[j].mtype &&
+ !(req_schema->columns[i].mtype == DATA_INT &&
+ table->cols[j].mtype == DATA_FIXBINARY)) {
+ } else if ((~table->cols[j].prtype
+ & req_schema->columns[i].prtype_mask)) {
+ } else {
+ continue;
+ }
+
+ int s = snprintf(errstr, errstr_sz,
+ "Column %s in table %s is ",
+ req_schema->columns[i].name,
+ req_schema->table_name_sql);
+ if (s < 0 || static_cast<size_t>(s) >= errstr_sz) {
+ return DB_ERROR;
+ }
+ errstr += s;
+ errstr_sz -= s;
+ s = dtype_sql_name(table->cols[j].mtype, table->cols[j].prtype,
+ table->cols[j].len, errstr, errstr_sz);
+ if (s < 0 || static_cast<size_t>(s) + sizeof " but should be "
+ >= errstr_sz) {
+ return DB_ERROR;
+ }
+ errstr += s;
+ memcpy(errstr, " but should be ", sizeof " but should be ");
+ errstr += (sizeof " but should be ") - 1;
+ errstr_sz -= s + (sizeof " but should be ") - 1;
+ s = dtype_sql_name(req_schema->columns[i].mtype,
+ req_schema->columns[i].prtype_mask,
+ req_schema->columns[i].len,
+ errstr, errstr_sz);
+ return DB_ERROR;
+ }
+
+ if (size_t n_foreign = table->foreign_set.size()) {
+ snprintf(errstr, errstr_sz,
+ "Table %s has %zu foreign key(s) pointing"
+ " to other tables, but it must have 0.",
+ req_schema->table_name_sql, n_foreign);
+ return DB_ERROR;
+ }
+
+ if (size_t n_referenced = table->referenced_set.size()) {
+ snprintf(errstr, errstr_sz,
+ "There are %zu foreign key(s) pointing to %s, "
+ "but there must be 0.", n_referenced,
+ req_schema->table_name_sql);
+ return DB_ERROR;
+ }
+
+ return DB_SUCCESS;
+}
+
+/*********************************************************************//**
+Checks whether the persistent statistics storage exists and that all
+tables have the proper structure.
+@return true if exists and all tables are ok */
+static bool dict_stats_persistent_storage_check(bool dict_already_locked)
+{
+ char errstr[512];
+ dberr_t ret;
+
+ if (!dict_already_locked) {
+ dict_sys.lock(SRW_LOCK_CALL);
+ }
+
+ ut_ad(dict_sys.locked());
+
+ /* first check table_stats */
+ ret = dict_table_schema_check(&table_stats_schema, errstr,
+ sizeof(errstr));
+ if (ret == DB_SUCCESS) {
+ /* if it is ok, then check index_stats */
+ ret = dict_table_schema_check(&index_stats_schema, errstr,
+ sizeof(errstr));
+ }
+
+ if (!dict_already_locked) {
+ dict_sys.unlock();
+ }
+
+ switch (ret) {
+ case DB_SUCCESS:
+ return true;
+ default:
+ if (!opt_bootstrap) {
+ ib::error() << errstr;
+ }
+ /* fall through */
+ case DB_STATS_DO_NOT_EXIST:
+ return false;
+ }
+}
+
+/** Executes a given SQL statement using the InnoDB internal SQL parser.
+This function will free the pinfo object.
+@param[in,out] pinfo pinfo to pass to que_eval_sql() must already
+have any literals bound to it
+@param[in] sql SQL string to execute
+@param[in,out] trx transaction
+@return DB_SUCCESS or error code */
+static
+dberr_t dict_stats_exec_sql(pars_info_t *pinfo, const char* sql, trx_t *trx)
+{
+ ut_ad(dict_sys.locked());
+
+ if (!dict_stats_persistent_storage_check(true))
+ {
+ pars_info_free(pinfo);
+ return DB_STATS_DO_NOT_EXIST;
+ }
+
+ return que_eval_sql(pinfo, sql, trx);
+}
+
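+/* A hypothetical caller sketch (not from this file), showing the
+expected usage: bind all literals to pinfo first, then let
+dict_stats_exec_sql() verify the stats tables and run the statement.
+
+ pars_info_t* pinfo = pars_info_create();
+ pars_info_add_str_literal(pinfo, "database_name", db_utf8);
+ dberr_t err = dict_stats_exec_sql(
+  pinfo,
+  "PROCEDURE P () IS\n"
+  "BEGIN\n"
+  "DELETE FROM \"mysql\".\"innodb_table_stats\"\n"
+  "WHERE database_name = :database_name;\n"
+  "END;", trx);
+
+Here db_utf8 and trx are placeholders for the caller's own variables. */
+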
+/*********************************************************************//**
+Duplicate a table object and its indexes.
+This function creates a dummy dict_table_t object and initializes the
+following table and index members:
+dict_table_t::id (copied)
+dict_table_t::heap (newly created)
+dict_table_t::name (copied)
+dict_table_t::corrupted (copied)
+dict_table_t::indexes<> (newly created)
+dict_table_t::magic_n
+for each entry in dict_table_t::indexes, the following are initialized:
+(indexes that have DICT_FTS set in index->type are skipped)
+dict_index_t::id (copied)
+dict_index_t::name (copied)
+dict_index_t::table_name (points to the copied table name)
+dict_index_t::table (points to the above semi-initialized object)
+dict_index_t::type (copied)
+dict_index_t::to_be_dropped (copied)
+dict_index_t::online_status (copied)
+dict_index_t::n_uniq (copied)
+dict_index_t::fields[] (newly created, only first n_uniq, only fields[i].name)
+dict_index_t::indexes<> (newly created)
+dict_index_t::stat_n_diff_key_vals[] (only allocated, left uninitialized)
+dict_index_t::stat_n_sample_sizes[] (only allocated, left uninitialized)
+dict_index_t::stat_n_non_null_key_vals[] (only allocated, left uninitialized)
+dict_index_t::magic_n
+The returned object should be freed with dict_stats_table_clone_free()
+when no longer needed.
+@return incomplete table object */
+static
+dict_table_t*
+dict_stats_table_clone_create(
+/*==========================*/
+ const dict_table_t* table) /*!< in: table whose stats to copy */
+{
+ size_t heap_size;
+ dict_index_t* index;
+
+ /* Estimate the size needed for the table and all of its indexes */
+
+ heap_size = 0;
+ heap_size += sizeof(dict_table_t);
+ heap_size += strlen(table->name.m_name) + 1;
+
+ for (index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+
+ if (dict_stats_should_ignore_index(index)) {
+ continue;
+ }
+
+ ut_ad(!dict_index_is_ibuf(index));
+
+ ulint n_uniq = dict_index_get_n_unique(index);
+
+ heap_size += sizeof(dict_index_t);
+ heap_size += strlen(index->name) + 1;
+ heap_size += n_uniq * sizeof(index->fields[0]);
+ for (ulint i = 0; i < n_uniq; i++) {
+ heap_size += strlen(index->fields[i].name) + 1;
+ }
+ heap_size += n_uniq * sizeof(index->stat_n_diff_key_vals[0]);
+ heap_size += n_uniq * sizeof(index->stat_n_sample_sizes[0]);
+ heap_size += n_uniq * sizeof(index->stat_n_non_null_key_vals[0]);
+ }
+
+ /* Allocate the memory and copy the members */
+
+ mem_heap_t* heap;
+
+ heap = mem_heap_create(heap_size);
+
+ dict_table_t* t;
+
+ t = (dict_table_t*) mem_heap_zalloc(heap, sizeof(*t));
+
+ t->stats_mutex_init();
+
+ MEM_CHECK_DEFINED(&table->id, sizeof(table->id));
+ t->id = table->id;
+
+ t->heap = heap;
+
+ t->name.m_name = mem_heap_strdup(heap, table->name.m_name);
+ t->mdl_name.m_name = t->name.m_name;
+
+ t->corrupted = table->corrupted;
+
+ UT_LIST_INIT(t->indexes, &dict_index_t::indexes);
+#ifdef BTR_CUR_HASH_ADAPT
+ UT_LIST_INIT(t->freed_indexes, &dict_index_t::indexes);
+#endif /* BTR_CUR_HASH_ADAPT */
+
+ for (index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+
+ if (dict_stats_should_ignore_index(index)) {
+ continue;
+ }
+
+ ut_ad(!dict_index_is_ibuf(index));
+
+ dict_index_t* idx;
+
+ idx = (dict_index_t*) mem_heap_zalloc(heap, sizeof(*idx));
+
+ MEM_CHECK_DEFINED(&index->id, sizeof(index->id));
+ idx->id = index->id;
+
+ idx->name = mem_heap_strdup(heap, index->name);
+
+ idx->table = t;
+
+ idx->type = index->type;
+
+ idx->to_be_dropped = 0;
+
+ idx->online_status = ONLINE_INDEX_COMPLETE;
+ idx->set_committed(true);
+
+ idx->n_uniq = index->n_uniq;
+
+ idx->fields = (dict_field_t*) mem_heap_zalloc(
+ heap, idx->n_uniq * sizeof(idx->fields[0]));
+
+ for (ulint i = 0; i < idx->n_uniq; i++) {
+ idx->fields[i].name = mem_heap_strdup(
+ heap, index->fields[i].name);
+ }
+
+ /* hook idx into t->indexes */
+ UT_LIST_ADD_LAST(t->indexes, idx);
+
+ idx->stat_n_diff_key_vals = (ib_uint64_t*) mem_heap_zalloc(
+ heap,
+ idx->n_uniq * sizeof(idx->stat_n_diff_key_vals[0]));
+
+ idx->stat_n_sample_sizes = (ib_uint64_t*) mem_heap_zalloc(
+ heap,
+ idx->n_uniq * sizeof(idx->stat_n_sample_sizes[0]));
+
+ idx->stat_n_non_null_key_vals = (ib_uint64_t*) mem_heap_zalloc(
+ heap,
+ idx->n_uniq * sizeof(idx->stat_n_non_null_key_vals[0]));
+ ut_d(idx->magic_n = DICT_INDEX_MAGIC_N);
+
+ idx->stat_defrag_n_page_split = 0;
+ idx->stat_defrag_n_pages_freed = 0;
+ }
+
+ ut_d(t->magic_n = DICT_TABLE_MAGIC_N);
+
+ return(t);
+}
+
+/*********************************************************************//**
+Free the resources occupied by an object returned by
+dict_stats_table_clone_create(). */
+static
+void
+dict_stats_table_clone_free(
+/*========================*/
+ dict_table_t* t) /*!< in: dummy table object to free */
+{
+ t->stats_mutex_destroy();
+ mem_heap_free(t->heap);
+}
+
+/*********************************************************************//**
+Write all zeros (or 1 where it makes sense) into an index's
+statistics members. The resulting stats correspond to an empty index. */
+static
+void
+dict_stats_empty_index(
+/*===================*/
+ dict_index_t* index, /*!< in/out: index */
+ bool empty_defrag_stats)
+ /*!< in: whether to empty defrag stats */
+{
+ ut_ad(!(index->type & DICT_FTS));
+ ut_ad(!dict_index_is_ibuf(index));
+ ut_ad(index->table->stats_mutex_is_owner());
+
+ ulint n_uniq = index->n_uniq;
+
+ for (ulint i = 0; i < n_uniq; i++) {
+ index->stat_n_diff_key_vals[i] = 0;
+ index->stat_n_sample_sizes[i] = 1;
+ index->stat_n_non_null_key_vals[i] = 0;
+ }
+
+ index->stat_index_size = 1;
+ index->stat_n_leaf_pages = 1;
+
+ if (empty_defrag_stats) {
+ dict_stats_empty_defrag_stats(index);
+ dict_stats_empty_defrag_summary(index);
+ }
+}
+
+/*********************************************************************//**
+Write all zeros (or 1 where it makes sense) into a table's and its indexes'
+statistics members. The resulting stats correspond to an empty table. */
+static
+void
+dict_stats_empty_table(
+/*===================*/
+ dict_table_t* table, /*!< in/out: table */
+ bool empty_defrag_stats)
+ /*!< in: whether to empty defrag stats */
+{
+ /* Initializing table and index level stats is now protected
+ by the table level stats mutex. */
+ table->stats_mutex_lock();
+
+ /* Zero the stats members */
+ table->stat_n_rows = 0;
+ table->stat_clustered_index_size = 1;
+ /* 1 page for each index, not counting the clustered */
+ table->stat_sum_of_other_index_sizes
+ = UT_LIST_GET_LEN(table->indexes) - 1;
+ table->stat_modified_counter = 0;
+
+ dict_index_t* index;
+
+ for (index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+
+ if (index->type & DICT_FTS) {
+ continue;
+ }
+
+ ut_ad(!dict_index_is_ibuf(index));
+
+ dict_stats_empty_index(index, empty_defrag_stats);
+ }
+
+ table->stat_initialized = TRUE;
+ table->stats_mutex_unlock();
+}
+
+/*********************************************************************//**
+Check whether index's stats are initialized (assert if they are not).
*/ +static +void +dict_stats_assert_initialized_index( +/*================================*/ + const dict_index_t* index) /*!< in: index */ +{ + MEM_CHECK_DEFINED( + index->stat_n_diff_key_vals, + index->n_uniq * sizeof(index->stat_n_diff_key_vals[0])); + + MEM_CHECK_DEFINED( + index->stat_n_sample_sizes, + index->n_uniq * sizeof(index->stat_n_sample_sizes[0])); + + MEM_CHECK_DEFINED( + index->stat_n_non_null_key_vals, + index->n_uniq * sizeof(index->stat_n_non_null_key_vals[0])); + + MEM_CHECK_DEFINED( + &index->stat_index_size, + sizeof(index->stat_index_size)); + + MEM_CHECK_DEFINED( + &index->stat_n_leaf_pages, + sizeof(index->stat_n_leaf_pages)); +} + +/*********************************************************************//** +Check whether table's stats are initialized (assert if they are not). */ +static +void +dict_stats_assert_initialized( +/*==========================*/ + const dict_table_t* table) /*!< in: table */ +{ + ut_a(table->stat_initialized); + + MEM_CHECK_DEFINED(&table->stats_last_recalc, + sizeof table->stats_last_recalc); + + MEM_CHECK_DEFINED(&table->stat_persistent, + sizeof table->stat_persistent); + + MEM_CHECK_DEFINED(&table->stats_auto_recalc, + sizeof table->stats_auto_recalc); + + MEM_CHECK_DEFINED(&table->stats_sample_pages, + sizeof table->stats_sample_pages); + + MEM_CHECK_DEFINED(&table->stat_n_rows, + sizeof table->stat_n_rows); + + MEM_CHECK_DEFINED(&table->stat_clustered_index_size, + sizeof table->stat_clustered_index_size); + + MEM_CHECK_DEFINED(&table->stat_sum_of_other_index_sizes, + sizeof table->stat_sum_of_other_index_sizes); + + MEM_CHECK_DEFINED(&table->stat_modified_counter, + sizeof table->stat_modified_counter); + + for (dict_index_t* index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + + if (!dict_stats_should_ignore_index(index)) { + dict_stats_assert_initialized_index(index); + } + } +} + +#define INDEX_EQ(i1, i2) \ + ((i1) != NULL \ + && (i2) != NULL \ + && (i1)->id == (i2)->id \ + && strcmp((i1)->name, (i2)->name) == 0) + +/*********************************************************************//** +Copy table and index statistics from one table to another, including index +stats. Extra indexes in src are ignored and extra indexes in dst are +initialized to correspond to an empty index. 
*/ +static +void +dict_stats_copy( +/*============*/ + dict_table_t* dst, /*!< in/out: destination table */ + const dict_table_t* src, /*!< in: source table */ + bool reset_ignored_indexes) /*!< in: if true, set ignored indexes + to have the same statistics as if + the table was empty */ +{ + ut_ad(src->stats_mutex_is_owner()); + ut_ad(dst->stats_mutex_is_owner()); + + dst->stats_last_recalc = src->stats_last_recalc; + dst->stat_n_rows = src->stat_n_rows; + dst->stat_clustered_index_size = src->stat_clustered_index_size; + dst->stat_sum_of_other_index_sizes = src->stat_sum_of_other_index_sizes; + dst->stat_modified_counter = src->stat_modified_counter; + + dict_index_t* dst_idx; + dict_index_t* src_idx; + + for (dst_idx = dict_table_get_first_index(dst), + src_idx = dict_table_get_first_index(src); + dst_idx != NULL; + dst_idx = dict_table_get_next_index(dst_idx), + (src_idx != NULL + && (src_idx = dict_table_get_next_index(src_idx)))) { + + if (dict_stats_should_ignore_index(dst_idx)) { + if (reset_ignored_indexes) { + /* Reset index statistics for all ignored indexes, + unless they are FT indexes (these have no statistics)*/ + if (dst_idx->type & DICT_FTS) { + continue; + } + dict_stats_empty_index(dst_idx, true); + } else { + continue; + } + } + + ut_ad(!dict_index_is_ibuf(dst_idx)); + + if (!INDEX_EQ(src_idx, dst_idx)) { + for (src_idx = dict_table_get_first_index(src); + src_idx != NULL; + src_idx = dict_table_get_next_index(src_idx)) { + + if (INDEX_EQ(src_idx, dst_idx)) { + break; + } + } + } + + if (!INDEX_EQ(src_idx, dst_idx)) { + dict_stats_empty_index(dst_idx, true); + continue; + } + + ulint n_copy_el; + + if (dst_idx->n_uniq > src_idx->n_uniq) { + n_copy_el = src_idx->n_uniq; + /* Since src is smaller some elements in dst + will remain untouched by the following memmove(), + thus we init all of them here. */ + dict_stats_empty_index(dst_idx, true); + } else { + n_copy_el = dst_idx->n_uniq; + } + + memmove(dst_idx->stat_n_diff_key_vals, + src_idx->stat_n_diff_key_vals, + n_copy_el * sizeof(dst_idx->stat_n_diff_key_vals[0])); + + memmove(dst_idx->stat_n_sample_sizes, + src_idx->stat_n_sample_sizes, + n_copy_el * sizeof(dst_idx->stat_n_sample_sizes[0])); + + memmove(dst_idx->stat_n_non_null_key_vals, + src_idx->stat_n_non_null_key_vals, + n_copy_el * sizeof(dst_idx->stat_n_non_null_key_vals[0])); + + dst_idx->stat_index_size = src_idx->stat_index_size; + + dst_idx->stat_n_leaf_pages = src_idx->stat_n_leaf_pages; + + dst_idx->stat_defrag_modified_counter = + src_idx->stat_defrag_modified_counter; + dst_idx->stat_defrag_n_pages_freed = + src_idx->stat_defrag_n_pages_freed; + dst_idx->stat_defrag_n_page_split = + src_idx->stat_defrag_n_page_split; + } + + dst->stat_initialized = TRUE; +} + +/** Duplicate the stats of a table and its indexes. +This function creates a dummy dict_table_t object and copies the input +table's stats into it. The returned table object is not in the dictionary +cache and cannot be accessed by any other threads. 
In addition to the +members copied in dict_stats_table_clone_create() this function initializes +the following: +dict_table_t::stat_initialized +dict_table_t::stat_persistent +dict_table_t::stat_n_rows +dict_table_t::stat_clustered_index_size +dict_table_t::stat_sum_of_other_index_sizes +dict_table_t::stat_modified_counter +dict_index_t::stat_n_diff_key_vals[] +dict_index_t::stat_n_sample_sizes[] +dict_index_t::stat_n_non_null_key_vals[] +dict_index_t::stat_index_size +dict_index_t::stat_n_leaf_pages +dict_index_t::stat_defrag_modified_counter +dict_index_t::stat_defrag_n_pages_freed +dict_index_t::stat_defrag_n_page_split +The returned object should be freed with dict_stats_snapshot_free() +when no longer needed. +@param[in] table table whose stats to copy +@return incomplete table object */ +static +dict_table_t* +dict_stats_snapshot_create( + dict_table_t* table) +{ + dict_sys.lock(SRW_LOCK_CALL); + + dict_stats_assert_initialized(table); + + dict_table_t* t; + + t = dict_stats_table_clone_create(table); + + table->stats_mutex_lock(); + ut_d(t->stats_mutex_lock()); + + dict_stats_copy(t, table, false); + + ut_d(t->stats_mutex_unlock()); + table->stats_mutex_unlock(); + + t->stat_persistent = table->stat_persistent; + t->stats_auto_recalc = table->stats_auto_recalc; + t->stats_sample_pages = table->stats_sample_pages; + + dict_sys.unlock(); + + return(t); +} + +/*********************************************************************//** +Free the resources occupied by an object returned by +dict_stats_snapshot_create(). */ +static +void +dict_stats_snapshot_free( +/*=====================*/ + dict_table_t* t) /*!< in: dummy table object to free */ +{ + dict_stats_table_clone_free(t); +} + +/** Statistics for one field of an index. */ +struct index_field_stats_t +{ + ib_uint64_t n_diff_key_vals; + ib_uint64_t n_sample_sizes; + ib_uint64_t n_non_null_key_vals; + + index_field_stats_t(ib_uint64_t n_diff_key_vals= 0, + ib_uint64_t n_sample_sizes= 0, + ib_uint64_t n_non_null_key_vals= 0) + : n_diff_key_vals(n_diff_key_vals), n_sample_sizes(n_sample_sizes), + n_non_null_key_vals(n_non_null_key_vals) + { + } + + bool is_bulk_operation() const + { + return n_diff_key_vals == UINT64_MAX && + n_sample_sizes == UINT64_MAX && n_non_null_key_vals == UINT64_MAX; + } +}; + +/*******************************************************************//** +Record the number of non_null key values in a given index for +each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index). +The estimates are eventually stored in the array: +index->stat_n_non_null_key_vals[], which is indexed from 0 to n-1. 
 */
+static
+void
+btr_record_not_null_field_in_rec(
+/*=============================*/
+ ulint n_unique, /*!< in: dict_index_get_n_unique(index),
+ number of columns uniquely determine
+ an index entry */
+ const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index),
+ its size could be for all fields or
+ that of "n_unique" */
+ ib_uint64_t* n_not_null) /*!< in/out: array to record number of
+ not null rows for n-column prefix */
+{
+ ulint i;
+
+ ut_ad(rec_offs_n_fields(offsets) >= n_unique);
+
+ if (n_not_null == NULL) {
+ return;
+ }
+
+ for (i = 0; i < n_unique; i++) {
+ if (rec_offs_nth_sql_null(offsets, i)) {
+ break;
+ }
+
+ n_not_null[i]++;
+ }
+}
+
+inline dberr_t
+btr_cur_t::open_random_leaf(rec_offs *&offsets, mem_heap_t *&heap, mtr_t &mtr)
+{
+ ut_ad(!index()->is_spatial());
+ ut_ad(!mtr.get_savepoint());
+
+ mtr_s_lock_index(index(), &mtr);
+
+ if (index()->page == FIL_NULL)
+ return DB_CORRUPTION;
+
+ dberr_t err;
+ auto offset= index()->page;
+ bool merge= false;
+ ulint height= ULINT_UNDEFINED;
+
+ while (buf_block_t *block=
+ btr_block_get(*index(), offset, RW_S_LATCH, merge, &mtr, &err))
+ {
+ page_cur.block= block;
+
+ if (height == ULINT_UNDEFINED)
+ {
+ height= btr_page_get_level(block->page.frame);
+ if (height > BTR_MAX_LEVELS)
+ return DB_CORRUPTION;
+
+ if (height == 0)
+ goto got_leaf;
+ }
+
+ if (height == 0)
+ {
+ mtr.rollback_to_savepoint(0, mtr.get_savepoint() - 1);
+ got_leaf:
+ page_cur.rec= page_get_infimum_rec(block->page.frame);
+ return DB_SUCCESS;
+ }
+
+ if (!--height)
+ merge= !index()->is_clust();
+
+ page_cur_open_on_rnd_user_rec(&page_cur);
+
+ offsets= rec_get_offsets(page_cur.rec, page_cur.index, offsets, 0,
+ ULINT_UNDEFINED, &heap);
+
+ /* Go to the child node */
+ offset= btr_node_ptr_get_child_page_no(page_cur.rec, offsets);
+ }
+
+ return err;
+}
+
+/** Estimate table level stats from a sampled value.
+@param value sampled stats
+@param index index being sampled
+@param sample number of sampled rows
+@param ext_size external stored data size
+@param not_empty table not empty
+@return estimated table wide stats from sampled value */
+#define BTR_TABLE_STATS_FROM_SAMPLE(value, index, sample, ext_size, not_empty) \
+ (((value) * static_cast<ib_uint64_t>(index->stat_n_leaf_pages) \
+ + (sample) - 1 + (ext_size) + (not_empty)) / ((sample) + (ext_size)))
+
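+/* Worked example: if value=50 key-value borders were seen while
+sampling sample=10 leaf pages of an index with stat_n_leaf_pages=1000,
+ext_size=0 and not_empty=1, the scaled estimate is
+(50*1000 + 10 - 1 + 0 + 1) / (10 + 0) = 5001 distinct values. */
+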
+/** Estimates the number of different key values in a given index, for
+each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
+The estimates are stored in the array index->stat_n_diff_key_vals[] (indexed
+0..n_uniq-1) and the number of pages that were sampled is saved in
+result.n_sample_sizes[].
+If innodb_stats_method is nulls_ignored, we also record the number of
+non-null values for each prefix and store the estimates in
+the array result.n_non_null_key_vals.
+@param index B-tree index
+@param bulk_trx_id the value of index->table->bulk_trx_id at the start
+@return vector with statistics information;
+empty vector if the index is unavailable. */
+static
+std::vector<index_field_stats_t>
+btr_estimate_number_of_different_key_vals(dict_index_t* index,
+ trx_id_t bulk_trx_id)
+{
+ page_t* page;
+ rec_t* rec;
+ ulint n_cols;
+ ib_uint64_t* n_diff;
+ ib_uint64_t* n_not_null;
+ ibool stats_null_not_equal;
+ uintmax_t n_sample_pages=1; /* number of pages to sample */
+ ulint not_empty_flag = 0;
+ ulint total_external_size = 0;
+ uintmax_t add_on;
+ mtr_t mtr;
+ mem_heap_t* heap = NULL;
+ rec_offs* offsets_rec = NULL;
+ rec_offs* offsets_next_rec = NULL;
+
+ std::vector<index_field_stats_t> result;
+
+ ut_ad(index->is_btree());
+
+ n_cols = dict_index_get_n_unique(index);
+
+ heap = mem_heap_create((sizeof *n_diff + sizeof *n_not_null)
+ * n_cols
+ + dict_index_get_n_fields(index)
+ * (sizeof *offsets_rec
+ + sizeof *offsets_next_rec));
+
+ n_diff = (ib_uint64_t*) mem_heap_zalloc(
+ heap, n_cols * sizeof(n_diff[0]));
+
+ n_not_null = NULL;
+
+ /* Check the srv_innodb_stats_method setting, and decide whether we
+ need to record non-null values, and also decide whether NULLs are
+ considered equal (by setting the stats_null_not_equal value) */
+ switch (srv_innodb_stats_method) {
+ case SRV_STATS_NULLS_IGNORED:
+ n_not_null = (ib_uint64_t*) mem_heap_zalloc(
+ heap, n_cols * sizeof *n_not_null);
+ /* fall through */
+
+ case SRV_STATS_NULLS_UNEQUAL:
+ /* for both the SRV_STATS_NULLS_IGNORED and
+ SRV_STATS_NULLS_UNEQUAL cases, we will treat NULLs as
+ unequal values */
+ stats_null_not_equal = TRUE;
+ break;
+
+ case SRV_STATS_NULLS_EQUAL:
+ stats_null_not_equal = FALSE;
+ break;
+
+ default:
+ ut_error;
+ }
+
+ if (srv_stats_sample_traditional) {
+ /* It makes no sense to test more pages than are contained
+ in the index, thus we lower the number if it is too high */
+ if (srv_stats_transient_sample_pages > index->stat_index_size) {
+ if (index->stat_index_size > 0) {
+ n_sample_pages = index->stat_index_size;
+ }
+ } else {
+ n_sample_pages = srv_stats_transient_sample_pages;
+ }
+ } else {
+ /* New logarithmic number of pages that are estimated.
+ The number of pages estimated should be between 1 and
+ index->stat_index_size.
+
+ If we have only 0 or 1 index pages then we can only take 1
+ sample. We have already initialized n_sample_pages to 1.
+
+ So taking index size as I and sample as S and log(I)*S as L
+
+ requirement 1) we want the result of the expression to not exceed I;
+ requirement 2) we want the ideal pages to be at least S;
+ so the current expression is min(I, max(min(S,I), L))
+
+ looking for simplifications:
+
+ case 1: assume S < I
+ min(I, max(min(S,I), L)) -> min(I, max(S, L))
+
+ but since L = log2(I)*S and log2(I) >= 1, L > S always,
+ so max(S,L) = L.
+
+ so we have: min(I, L)
+
+ case 2: assume I < S
+ min(I, max(min(S,I), L)) -> min(I, max(I, L))
+
+ case 2a: L > I
+ min(I, max(I, L)) -> min(I, L) -> I
+
+ case 2b: when L < I
+ min(I, max(I, L)) -> min(I, I) -> I
+
+ so all case 2 paths yield I, and our expression becomes:
+ n_pages = S < I ? min(I,L) : I
+ */
+ if (index->stat_index_size > 1) {
+ n_sample_pages = (srv_stats_transient_sample_pages < index->stat_index_size)
+ ? ut_min(index->stat_index_size,
+ static_cast<ulint>(
+ log2(double(index->stat_index_size))
+ * double(srv_stats_transient_sample_pages)))
+ : index->stat_index_size;
+ }
+ }
+
+ /* Sanity check */
+ ut_ad(n_sample_pages > 0 && n_sample_pages <= (index->stat_index_size <= 1 ? 1 : index->stat_index_size));
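+
+ /* Worked example of the logarithmic branch: with
+ index->stat_index_size = 10000 and
+ srv_stats_transient_sample_pages = 8, L = log2(10000) * 8 =
+ 13.29 * 8 = 106 (truncated), so n_sample_pages =
+ min(10000, 106) = 106 leaf pages are sampled. */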
+
+ /* We sample some pages in the index to get an estimate */
+ btr_cur_t cursor;
+ cursor.page_cur.index = index;
+
+ for (ulint i = 0; i < n_sample_pages; i++) {
+ mtr.start();
+
+ if (cursor.open_random_leaf(offsets_rec, heap, mtr) !=
+ DB_SUCCESS
+ || index->table->bulk_trx_id != bulk_trx_id) {
+ mtr.commit();
+ goto exit_loop;
+ }
+
+ /* Count the number of different key values for each prefix of
+ the key on this index page. If the prefix does not determine
+ the index record uniquely in the B-tree, then we subtract one
+ because otherwise our algorithm would give a wrong estimate
+ for an index where there is just one key value. */
+
+ page = btr_cur_get_page(&cursor);
+
+ rec = page_rec_get_next(cursor.page_cur.rec);
+ const ulint n_core = index->n_core_fields;
+
+ if (rec && !page_rec_is_supremum(rec)) {
+ not_empty_flag = 1;
+ offsets_rec = rec_get_offsets(rec, index, offsets_rec,
+ n_core,
+ ULINT_UNDEFINED, &heap);
+
+ if (n_not_null != NULL) {
+ btr_record_not_null_field_in_rec(
+ n_cols, offsets_rec, n_not_null);
+ }
+ }
+
+ while (!page_rec_is_supremum(rec)) {
+ ulint matched_fields;
+ rec_t* next_rec = page_rec_get_next(rec);
+ if (!next_rec || page_rec_is_supremum(next_rec)) {
+ total_external_size +=
+ btr_rec_get_externally_stored_len(
+ rec, offsets_rec);
+ break;
+ }
+
+ offsets_next_rec = rec_get_offsets(next_rec, index,
+ offsets_next_rec,
+ n_core,
+ ULINT_UNDEFINED,
+ &heap);
+
+ cmp_rec_rec(rec, next_rec,
+ offsets_rec, offsets_next_rec,
+ index, stats_null_not_equal,
+ &matched_fields);
+
+ for (ulint j = matched_fields; j < n_cols; j++) {
+ /* We add one if this index record has
+ a different prefix from the previous */
+
+ n_diff[j]++;
+ }
+
+ if (n_not_null != NULL) {
+ btr_record_not_null_field_in_rec(
+ n_cols, offsets_next_rec, n_not_null);
+ }
+
+ total_external_size
+ += btr_rec_get_externally_stored_len(
+ rec, offsets_rec);
+
+ rec = next_rec;
+ /* Initialize offsets_rec for the next round
+ and assign the old offsets_rec buffer to
+ offsets_next_rec. */
+ {
+ rec_offs* offsets_tmp = offsets_rec;
+ offsets_rec = offsets_next_rec;
+ offsets_next_rec = offsets_tmp;
+ }
+ }
+
+ if (n_cols == dict_index_get_n_unique_in_tree(index)
+ && page_has_siblings(page)) {
+
+ /* If there is more than one leaf page in the tree,
+ we add one because we know that the first record
+ on the page certainly had a different prefix than the
+ last record on the previous index page in the
+ alphabetical order. Before this fix, if there was
+ just one big record on each clustered index page, the
+ algorithm grossly underestimated the number of rows
+ in the table. */
+
+ n_diff[n_cols - 1]++;
+ }
+
+ mtr.commit();
+ }
+
+exit_loop:
+ /* If we saw k borders between different key values on
+ n_sample_pages leaf pages, we can estimate how many
+ there will be in index->stat_n_leaf_pages */
+
+ /* We must take into account that our sample actually represents
+ also the pages used for external storage of fields (those pages are
+ included in index->stat_n_leaf_pages) */
+
+ result.reserve(n_cols);
+
+ for (ulint j = 0; j < n_cols; j++) {
+ index_field_stats_t stat;
+
+ stat.n_diff_key_vals
+ = BTR_TABLE_STATS_FROM_SAMPLE(
+ n_diff[j], index, n_sample_pages,
+ total_external_size, not_empty_flag);
+
+ /* If the tree is small, smaller than
+ 10 * n_sample_pages + total_external_size, then
+ the above estimate is ok. For bigger trees it is common that we
+ do not see any borders between key values in the few pages
+ we pick.
+ But still there may be n_sample_pages
+ different key values, or even more. Let us try to approximate
+ that:
+ */
+
+ add_on = index->stat_n_leaf_pages
+ / (10 * (n_sample_pages
+ + total_external_size));
+
+ if (add_on > n_sample_pages) {
+ add_on = n_sample_pages;
+ }
+
+ stat.n_diff_key_vals += add_on;
+
+ stat.n_sample_sizes = n_sample_pages;
+
+ if (n_not_null != NULL) {
+ stat.n_non_null_key_vals =
+ BTR_TABLE_STATS_FROM_SAMPLE(
+ n_not_null[j], index, n_sample_pages,
+ total_external_size, not_empty_flag);
+ }
+
+ result.push_back(stat);
+ }
+
+ mem_heap_free(heap);
+ return result;
+}
+
+/*********************************************************************//**
+Calculates new estimates for index statistics. This function is
+relatively quick and is used to calculate transient statistics that
+are not saved on disk. This was the only way to calculate statistics
+before the Persistent Statistics feature was introduced.
+This function doesn't update the defragmentation related stats.
+Only persistent statistics support defragmentation stats.
+@return error code
+@retval DB_SUCCESS_LOCKED_REC if the table is under a bulk insert operation */
+static
+dberr_t
+dict_stats_update_transient_for_index(
+/*==================================*/
+ dict_index_t* index) /*!< in/out: index */
+{
+ dberr_t err = DB_SUCCESS;
+ if (srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO
+ && (srv_force_recovery >= SRV_FORCE_NO_LOG_REDO
+ || !dict_index_is_clust(index))) {
+ /* If we have set a high innodb_force_recovery
+ level, do not calculate statistics, as a badly
+ corrupted index can cause a crash in it.
+ Initialize some bogus index cardinality
+ statistics, so that the data can be queried in
+ various means, also via secondary indexes. */
+dummy_empty:
+ index->table->stats_mutex_lock();
+ dict_stats_empty_index(index, false);
+ index->table->stats_mutex_unlock();
+ return err;
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+ } else if (ibuf_debug && !dict_index_is_clust(index)) {
+ goto dummy_empty;
+#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+ } else if (dict_index_is_online_ddl(index) || !index->is_committed()
+ || !index->table->space) {
+ goto dummy_empty;
+ } else {
+ mtr_t mtr;
+
+ mtr.start();
+ mtr_sx_lock_index(index, &mtr);
+
+ buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH,
+ &mtr, &err);
+ if (!root) {
+invalid:
+ mtr.commit();
+ goto dummy_empty;
+ }
+
+ const auto bulk_trx_id = index->table->bulk_trx_id;
+ if (bulk_trx_id && trx_sys.find(nullptr, bulk_trx_id, false)) {
+ err= DB_SUCCESS_LOCKED_REC;
+ goto invalid;
+ }
+
+ mtr.x_lock_space(index->table->space);
+
+ ulint dummy, size;
+ index->stat_index_size
+ = fseg_n_reserved_pages(*root, PAGE_HEADER
+ + PAGE_BTR_SEG_LEAF
+ + root->page.frame, &size,
+ &mtr)
+ + fseg_n_reserved_pages(*root, PAGE_HEADER
+ + PAGE_BTR_SEG_TOP
+ + root->page.frame, &dummy,
+ &mtr);
+
+ mtr.commit();
+
+ index->stat_n_leaf_pages = size ? size : 1;
+
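+ /* stat_index_size above is the sum of the pages reserved for
+ the leaf (PAGE_BTR_SEG_LEAF) and non-leaf (PAGE_BTR_SEG_TOP)
+ file segments of the B-tree, while stat_n_leaf_pages is the
+ number of pages actually used in the leaf segment (at least 1);
+ e.g. 3 pages reserved for the non-leaf segment and 120 for the
+ leaf segment give stat_index_size = 123. */
+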
+ /* Do not continue if table decryption has failed or
+ table is already marked as corrupted. */
+ if (index->is_readable()) {
+ std::vector<index_field_stats_t> stats
+ = btr_estimate_number_of_different_key_vals(
+ index, bulk_trx_id);
+
+ if (!stats.empty()) {
+ index->table->stats_mutex_lock();
+ for (size_t i = 0; i < stats.size(); ++i) {
+ index->stat_n_diff_key_vals[i]
+ = stats[i].n_diff_key_vals;
+ index->stat_n_sample_sizes[i]
+ = stats[i].n_sample_sizes;
+ index->stat_n_non_null_key_vals[i]
+ = stats[i].n_non_null_key_vals;
+ }
+ index->table->stats_mutex_unlock();
+ }
+ }
+ }
+
+ return err;
+}
+
+/*********************************************************************//**
+Calculates new estimates for table and index statistics. This function
+is relatively quick and is used to calculate transient statistics that
+are not saved on disk.
+This was the only way to calculate statistics before the
+Persistent Statistics feature was introduced.
+@return error code
+@retval DB_SUCCESS_LOCKED_REC if the table is under a bulk insert operation */
+static
+dberr_t
+dict_stats_update_transient(
+/*========================*/
+ dict_table_t* table) /*!< in/out: table */
+{
+ ut_ad(!table->stats_mutex_is_owner());
+
+ dict_index_t* index;
+ ulint sum_of_index_sizes = 0;
+ dberr_t err = DB_SUCCESS;
+
+ /* Find out the sizes of the indexes and how many different values
+ for the key they approximately have */
+
+ index = dict_table_get_first_index(table);
+
+ if (!table->space) {
+ /* Nothing to do. */
+empty_table:
+ dict_stats_empty_table(table, true);
+ return err;
+ } else if (index == NULL) {
+ /* Table definition is corrupt */
+
+ ib::warn() << "Table " << table->name
+ << " has no indexes. Cannot calculate statistics.";
+ goto empty_table;
+ }
+
+ for (; index != NULL; index = dict_table_get_next_index(index)) {
+
+ ut_ad(!dict_index_is_ibuf(index));
+
+ if (!index->is_btree()) {
+ continue;
+ }
+
+ if (dict_stats_should_ignore_index(index)
+ || !index->is_readable()
+ || err == DB_SUCCESS_LOCKED_REC) {
+ index->table->stats_mutex_lock();
+ dict_stats_empty_index(index, false);
+ index->table->stats_mutex_unlock();
+ continue;
+ }
+
+ err = dict_stats_update_transient_for_index(index);
+
+ sum_of_index_sizes += index->stat_index_size;
+ }
+
+ table->stats_mutex_lock();
+
+ index = dict_table_get_first_index(table);
+
+ table->stat_n_rows = index->stat_n_diff_key_vals[
+ dict_index_get_n_unique(index) - 1];
+
+ table->stat_clustered_index_size = index->stat_index_size;
+
+ table->stat_sum_of_other_index_sizes = sum_of_index_sizes
+ - index->stat_index_size;
+
+ table->stats_last_recalc = time(NULL);
+
+ table->stat_modified_counter = 0;
+
+ table->stat_initialized = TRUE;
+
+ table->stats_mutex_unlock();
+
+ return err;
+}
+
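+/* Call flow sketch for the transient path (for orientation only):
+dict_stats_update_transient(table)
+ -> dict_stats_update_transient_for_index(index), for each B-tree index
+ -> btr_estimate_number_of_different_key_vals(index, bulk_trx_id)
+after which table->stat_n_rows is taken from the clustered index's
+stat_n_diff_key_vals[] entry for its full unique prefix. */
+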
+/** Open a cursor at the first page in a tree level.
+@param page_cur cursor
+@param level level to search for (0=leaf)
+@param mtr mini-transaction */
+static dberr_t page_cur_open_level(page_cur_t *page_cur, ulint level,
+ mtr_t *mtr)
+{
+ mem_heap_t *heap= nullptr;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs *offsets= offsets_;
+ dberr_t err;
+
+ dict_index_t *const index= page_cur->index;
+
+ rec_offs_init(offsets_);
+ ut_ad(level != ULINT_UNDEFINED);
+ ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_SX_LOCK));
+ ut_ad(mtr->get_savepoint() == 1);
+
+ uint32_t page= index->page;
+
+ for (ulint height = ULINT_UNDEFINED;; height--)
+ {
+ buf_block_t* block=
+ btr_block_get(*index, page, RW_S_LATCH,
+ !height && !index->is_clust(), mtr, &err);
+ if (!block)
+ break;
+
+ const uint32_t l= btr_page_get_level(block->page.frame);
+
+ if (height == ULINT_UNDEFINED)
+ {
+ ut_ad(!heap);
+ /* We are in the root node */
+ height= l;
+ if (UNIV_UNLIKELY(height < level))
+ return DB_CORRUPTION;
+ }
+ else if (UNIV_UNLIKELY(height != l) || page_has_prev(block->page.frame))
+ {
+ err= DB_CORRUPTION;
+ break;
+ }
+
+ page_cur_set_before_first(block, page_cur);
+
+ if (height == level)
+ break;
+
+ ut_ad(height);
+
+ if (!page_cur_move_to_next(page_cur))
+ {
+ err= DB_CORRUPTION;
+ break;
+ }
+
+ offsets= rec_get_offsets(page_cur->rec, index, offsets, 0, ULINT_UNDEFINED,
+ &heap);
+ page= btr_node_ptr_get_child_page_no(page_cur->rec, offsets);
+ }
+
+ if (UNIV_LIKELY_NULL(heap))
+ mem_heap_free(heap);
+
+ /* Release all page latches except the one on the desired page. */
+ const auto end= mtr->get_savepoint();
+ if (end > 1)
+ mtr->rollback_to_savepoint(1, end - 1);
+
+ return err;
+}
+
+/** Open a persistent cursor at the first page in a tree level.
+@param pcur persistent cursor
+@param level level to search for (0=leaf)
+@param mtr mini-transaction
+@param index index tree */
+static dberr_t btr_pcur_open_level(btr_pcur_t *pcur, ulint level, mtr_t *mtr,
+ dict_index_t *index)
+{
+ pcur->latch_mode= BTR_SEARCH_LEAF;
+ pcur->search_mode= PAGE_CUR_G;
+ pcur->pos_state= BTR_PCUR_IS_POSITIONED;
+ pcur->btr_cur.page_cur.index= index;
+ return page_cur_open_level(&pcur->btr_cur.page_cur, level, mtr);
+}
+
+
+/* @{ Pseudo code about the relation between the following functions
+
+let N = N_SAMPLE_PAGES(index)
+
+dict_stats_analyze_index()
+ for each n_prefix
+ search for good enough level:
+ dict_stats_analyze_index_level() // only called if level has <= N pages
+ // full scan of the level in one mtr
+ collect statistics about the given level
+ if we are not satisfied with the level, search next lower level
+ we have found a good enough level here
+ dict_stats_analyze_index_for_n_prefix(that level, stats collected above)
+ // full scan of the level in one mtr
+ dive below some records and analyze the leaf page there:
+ dict_stats_analyze_index_below_cur()
+@} */
+
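+/* For example, with N = 20 the descent stops at the first level that
+fits in at most 20 pages; that level is scanned in full by
+dict_stats_analyze_index_level(), and the leaf-page dives of
+dict_stats_analyze_index_for_n_prefix() are then spread evenly across
+the groups of equal keys found there. */
+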
+/*********************************************************************//**
+Find the total number and the number of distinct keys on a given level in
+an index. Each of the 1..n_uniq prefixes is looked up and the results are
+saved in the array n_diff[0] .. n_diff[n_uniq - 1]. The total number of
+records on the level is saved in total_recs.
+Also, the index of the last record in each group of equal records is saved
+in n_diff_boundaries[0..n_uniq - 1]; record indexing starts from the
+leftmost record on the level and continues across page boundaries,
+counting from 0. */
+static
+void
+dict_stats_analyze_index_level(
+/*===========================*/
+ dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: level */
+ ib_uint64_t* n_diff, /*!< out: array for number of
+ distinct keys for all prefixes */
+ ib_uint64_t* total_recs, /*!< out: total number of records */
+ ib_uint64_t* total_pages, /*!< out: total number of pages */
+ boundaries_t* n_diff_boundaries,/*!< out: boundaries of the groups
+ of distinct keys */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint n_uniq;
+ mem_heap_t* heap;
+ btr_pcur_t pcur;
+ const page_t* page;
+ const rec_t* rec;
+ const rec_t* prev_rec;
+ bool prev_rec_is_copied;
+ byte* prev_rec_buf = NULL;
+ ulint prev_rec_buf_size = 0;
+ rec_offs* rec_offsets;
+ rec_offs* prev_rec_offsets;
+ ulint i;
+
+ DEBUG_PRINTF(" %s(table=%s, index=%s, level=" ULINTPF ")\n",
+ __func__, index->table->name, index->name, level);
+
+ *total_recs = 0;
+ *total_pages = 0;
+
+ n_uniq = dict_index_get_n_unique(index);
+
+ /* elements in the n_diff array are 0..n_uniq-1 (inclusive) */
+ memset(n_diff, 0x0, n_uniq * sizeof(n_diff[0]));
+
+ /* Allocate space for the offsets header (the allocation size at
+ offsets[0] and the REC_OFFS_HEADER_SIZE bytes), and n_uniq + 1,
+ so that this will never be less than the size calculated in
+ rec_get_offsets_func(). */
+ i = (REC_OFFS_HEADER_SIZE + 1 + 1) + n_uniq;
+
+ heap = mem_heap_create((2 * sizeof *rec_offsets) * i);
+ rec_offsets = static_cast<rec_offs*>(
+ mem_heap_alloc(heap, i * sizeof *rec_offsets));
+ prev_rec_offsets = static_cast<rec_offs*>(
+ mem_heap_alloc(heap, i * sizeof *prev_rec_offsets));
+ rec_offs_set_n_alloc(rec_offsets, i);
+ rec_offs_set_n_alloc(prev_rec_offsets, i);
+
+ /* reset the dynamic arrays n_diff_boundaries[0..n_uniq-1] */
+ if (n_diff_boundaries != NULL) {
+ for (i = 0; i < n_uniq; i++) {
+ n_diff_boundaries[i].erase(
+ n_diff_boundaries[i].begin(),
+ n_diff_boundaries[i].end());
+ }
+ }
+
+ /* Position pcur on the leftmost record on the leftmost page
+ on the desired level. */
+
+ if (btr_pcur_open_level(&pcur, level, mtr, index) != DB_SUCCESS
+ || !btr_pcur_move_to_next_on_page(&pcur)) {
+ goto func_exit;
+ }
+
+ page = btr_pcur_get_page(&pcur);
+
+ /* The page must not be empty, except when
+ it is the root page (and the whole index is empty). */
+ ut_ad(btr_pcur_is_on_user_rec(&pcur) || page_is_leaf(page));
+
+ prev_rec = NULL;
+ prev_rec_is_copied = false;
+
+ if (REC_INFO_MIN_REC_FLAG & rec_get_info_bits(
+ btr_pcur_get_rec(&pcur), page_is_comp(page))) {
+ ut_ad(btr_pcur_is_on_user_rec(&pcur));
+ if (level == 0) {
+ /* Skip the metadata pseudo-record */
+ ut_ad(index->is_instant());
+ btr_pcur_move_to_next_user_rec(&pcur, mtr);
+ }
+ } else if (UNIV_UNLIKELY(level != 0)) {
+ /* The first record on the leftmost page must be
+ marked as such on each level except the leaf level. */
+ goto func_exit;
+ }
+
+ /* iterate over all user records on this level
+ and compare each two adjacent ones, even the last on page
+ X and the first on page X+1 */
+ for (;
+ btr_pcur_is_on_user_rec(&pcur);
+ btr_pcur_move_to_next_user_rec(&pcur, mtr)) {
+
+ bool rec_is_last_on_page;
+
+ rec = btr_pcur_get_rec(&pcur);
+
+ /* If rec and prev_rec are on different pages, then prev_rec
+ must have been copied, because we hold a latch only on the page
+ where rec resides.
*/ + if (prev_rec != NULL + && page_align(rec) != page_align(prev_rec)) { + + ut_a(prev_rec_is_copied); + } + + rec_is_last_on_page = + page_rec_is_supremum(page_rec_get_next_const(rec)); + + /* increment the pages counter at the end of each page */ + if (rec_is_last_on_page) { + + (*total_pages)++; + } + + /* Skip delete-marked records on the leaf level. If we + do not skip them, then ANALYZE quickly after DELETE + could count them or not (purge may have already wiped + them away) which brings non-determinism. We skip only + leaf-level delete marks because delete marks on + non-leaf level do not make sense. */ + + if (level == 0 + && !srv_stats_include_delete_marked + && rec_get_deleted_flag(rec, page_rec_is_comp(rec))) { + if (rec_is_last_on_page + && !prev_rec_is_copied + && prev_rec != NULL) { + /* copy prev_rec */ + + prev_rec_offsets = rec_get_offsets( + prev_rec, index, prev_rec_offsets, + index->n_core_fields, + n_uniq, &heap); + + prev_rec = rec_copy_prefix_to_buf( + prev_rec, index, n_uniq, + &prev_rec_buf, &prev_rec_buf_size); + + prev_rec_is_copied = true; + } + + continue; + } + rec_offsets = rec_get_offsets(rec, index, rec_offsets, + level ? 0 : index->n_core_fields, + n_uniq, &heap); + + (*total_recs)++; + + if (prev_rec != NULL) { + ulint matched_fields; + + prev_rec_offsets = rec_get_offsets( + prev_rec, index, prev_rec_offsets, + level ? 0 : index->n_core_fields, + n_uniq, &heap); + + cmp_rec_rec(prev_rec, rec, + prev_rec_offsets, rec_offsets, index, + false, &matched_fields); + + for (i = matched_fields; i < n_uniq; i++) { + + if (n_diff_boundaries != NULL) { + /* push the index of the previous + record, that is - the last one from + a group of equal keys */ + + ib_uint64_t idx; + + /* the index of the current record + is total_recs - 1, the index of the + previous record is total_recs - 2; + we know that idx is not going to + become negative here because if we + are in this branch then there is a + previous record and thus + total_recs >= 2 */ + idx = *total_recs - 2; + + n_diff_boundaries[i].push_back(idx); + } + + /* increment the number of different keys + for n_prefix=i+1 (e.g. 
if i=0 then we increment
+ for n_prefix=1 which is stored in n_diff[0]) */
+ n_diff[i]++;
+ }
+ } else {
+ /* this is the first non-delete marked record */
+ for (i = 0; i < n_uniq; i++) {
+ n_diff[i] = 1;
+ }
+ }
+
+ if (rec_is_last_on_page) {
+ /* end of a page has been reached */
+
+ /* we need to copy the record instead of assigning
+ like prev_rec = rec; because when we traverse the
+ records on this level at some point we will jump from
+ one page to the next and then rec and prev_rec will
+ be on different pages and
+ btr_pcur_move_to_next_user_rec() will release the
+ latch on the page that prev_rec is on */
+ prev_rec = rec_copy_prefix_to_buf(
+ rec, index, n_uniq,
+ &prev_rec_buf, &prev_rec_buf_size);
+ prev_rec_is_copied = true;
+
+ } else {
+ /* still on the same page, the next call to
+ btr_pcur_move_to_next_user_rec() will not jump
+ on the next page, we can simply assign pointers
+ instead of copying the records like above */
+
+ prev_rec = rec;
+ prev_rec_is_copied = false;
+ }
+ }
+
+ /* if *total_pages is left untouched then the above loop was not
+ entered at all and there is one page in the whole tree which is
+ empty or the loop was entered but this is level 0, contains one page
+ and all records are delete-marked */
+ if (*total_pages == 0) {
+
+ ut_ad(level == 0);
+ ut_ad(*total_recs == 0);
+
+ *total_pages = 1;
+ }
+
+ /* if there are records on this level and boundaries
+ should be saved */
+ if (*total_recs > 0 && n_diff_boundaries != NULL) {
+
+ /* remember the index of the last record on the level as the
+ last one from the last group of equal keys; this holds for
+ all possible prefixes */
+ for (i = 0; i < n_uniq; i++) {
+ ib_uint64_t idx;
+
+ idx = *total_recs - 1;
+
+ n_diff_boundaries[i].push_back(idx);
+ }
+ }
+
+ /* now in n_diff_boundaries[i] there are exactly n_diff[i] integers,
+ for i=0..n_uniq-1 */
+
+#ifdef UNIV_STATS_DEBUG
+ for (i = 0; i < n_uniq; i++) {
+
+ DEBUG_PRINTF(" %s(): total recs: " UINT64PF
+ ", total pages: " UINT64PF
+ ", n_diff[" ULINTPF "]: " UINT64PF "\n",
+ __func__, *total_recs,
+ *total_pages,
+ i, n_diff[i]);
+
+#if 0
+ if (n_diff_boundaries != NULL) {
+ ib_uint64_t j;
+
+ DEBUG_PRINTF(" %s(): boundaries[%lu]: ",
+ __func__, i);
+
+ for (j = 0; j < n_diff[i]; j++) {
+ ib_uint64_t idx;
+
+ idx = n_diff_boundaries[i][j];
+
+ DEBUG_PRINTF(UINT64PF "=" UINT64PF ", ",
+ j, idx);
+ }
+ DEBUG_PRINTF("\n");
+ }
+#endif
+ }
+#endif /* UNIV_STATS_DEBUG */
+
+func_exit:
+ ut_free(prev_rec_buf);
+ mem_heap_free(heap);
+}
+
+
+/************************************************************//**
+Gets the pointer to the next non delete-marked record on the page.
+If all subsequent records are delete-marked, then this function
+will return the supremum record.
+@return pointer to next non delete-marked record or pointer to supremum */
+static
+const rec_t*
+page_rec_get_next_non_del_marked(
+/*=============================*/
+ const rec_t* rec) /*!< in: pointer to record */
+{
+ const page_t *const page= page_align(rec);
+
+ if (page_is_comp(page))
+ {
+ for (rec= page_rec_get_next_low(rec, TRUE);
+ rec && rec_get_deleted_flag(rec, TRUE);
+ rec= page_rec_get_next_low(rec, TRUE));
+ return rec ? rec : page + PAGE_NEW_SUPREMUM;
+ }
+ else
+ {
+ for (rec= page_rec_get_next_low(rec, FALSE);
+ rec && rec_get_deleted_flag(rec, FALSE);
+ rec= page_rec_get_next_low(rec, FALSE));
+ return rec ?
rec : page + PAGE_OLD_SUPREMUM; + } +} + +/** Scan a page, reading records from left to right and counting the number +of distinct records (looking only at the first n_prefix +columns) and the number of external pages pointed by records from this page. +If scan_method is QUIT_ON_FIRST_NON_BORING then the function +will return as soon as it finds a record that does not match its neighbor +to the right, which means that in the case of QUIT_ON_FIRST_NON_BORING the +returned n_diff can either be 0 (empty page), 1 (the whole page has all keys +equal) or 2 (the function found a non-boring record and returned). +@param[out] out_rec record, or NULL +@param[out] offsets1 rec_get_offsets() working space (must +be big enough) +@param[out] offsets2 rec_get_offsets() working space (must +be big enough) +@param[in] index index of the page +@param[in] page the page to scan +@param[in] n_prefix look at the first n_prefix columns +@param[in] n_core 0, or index->n_core_fields for leaf +@param[out] n_diff number of distinct records encountered +@param[out] n_external_pages if this is non-NULL then it will be set +to the number of externally stored pages which were encountered +@return offsets1 or offsets2 (the offsets of *out_rec), +or NULL if the page is empty and does not contain user records. */ +UNIV_INLINE +rec_offs* +dict_stats_scan_page( + const rec_t** out_rec, + rec_offs* offsets1, + rec_offs* offsets2, + const dict_index_t* index, + const page_t* page, + ulint n_prefix, + ulint n_core, + ib_uint64_t* n_diff, + ib_uint64_t* n_external_pages) +{ + rec_offs* offsets_rec = offsets1; + rec_offs* offsets_next_rec = offsets2; + const rec_t* rec; + const rec_t* next_rec; + /* A dummy heap, to be passed to rec_get_offsets(). + Because offsets1,offsets2 should be big enough, + this memory heap should never be used. */ + mem_heap_t* heap = NULL; + ut_ad(!!n_core == page_is_leaf(page)); + const rec_t* (*get_next)(const rec_t*) + = !n_core || srv_stats_include_delete_marked + ? page_rec_get_next_const + : page_rec_get_next_non_del_marked; + + const bool should_count_external_pages = n_external_pages != NULL; + + if (should_count_external_pages) { + *n_external_pages = 0; + } + + rec = get_next(page_get_infimum_rec(page)); + + if (!rec || page_rec_is_supremum(rec)) { + /* the page is empty or contains only delete-marked records */ + *n_diff = 0; + *out_rec = NULL; + return(NULL); + } + + offsets_rec = rec_get_offsets(rec, index, offsets_rec, n_core, + ULINT_UNDEFINED, &heap); + + if (should_count_external_pages) { + *n_external_pages += btr_rec_get_externally_stored_len( + rec, offsets_rec); + } + + next_rec = get_next(rec); + + *n_diff = 1; + + while (next_rec && !page_rec_is_supremum(next_rec)) { + + ulint matched_fields; + + offsets_next_rec = rec_get_offsets(next_rec, index, + offsets_next_rec, n_core, + ULINT_UNDEFINED, + &heap); + + /* check whether rec != next_rec when looking at + the first n_prefix fields */ + cmp_rec_rec(rec, next_rec, offsets_rec, offsets_next_rec, + index, false, &matched_fields); + + if (matched_fields < n_prefix) { + /* rec != next_rec, => rec is non-boring */ + + (*n_diff)++; + + if (!n_core) { + break; + } + } + + rec = next_rec; + /* Assign offsets_rec = offsets_next_rec so that + offsets_rec matches with rec which was just assigned + rec = next_rec above. 
Also need to point
+ offsets_next_rec to the place where offsets_rec was
+ pointing before because we have just 2 placeholders
+ where data is actually stored: offsets1 and offsets2
+ and we are using them in circular fashion
+ (offsets[_next]_rec are just pointers to those
+ placeholders). */
+ std::swap(offsets_rec, offsets_next_rec);
+
+ if (should_count_external_pages) {
+ *n_external_pages += btr_rec_get_externally_stored_len(
+ rec, offsets_rec);
+ }
+
+ next_rec = get_next(next_rec);
+ }
+
+ /* offsets1,offsets2 should have been big enough */
+ ut_a(heap == NULL);
+ *out_rec = rec;
+ return(offsets_rec);
+}
+
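+/* For example, scanning a leaf page with keys (2,2,3,3,5) and
+n_prefix=1 yields n_diff=3 (the distinct prefixes 2, 3 and 5), while
+with n_core=0 (non-leaf, QUIT_ON_FIRST_NON_BORING behaviour) the scan
+stops at the first 3 and returns n_diff=2. */
+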
+/** Dive below the current position of a cursor and calculate the number of
+distinct records on the leaf page, when looking at the first n_prefix
+columns. Also calculate the number of external pages pointed by records
+on the leaf page.
+@param[in] cur cursor
+@param[in] n_prefix look at the first n_prefix columns
+when comparing records
+@param[out] n_diff number of distinct records
+@param[out] n_external_pages number of external pages */
+static
+void
+dict_stats_analyze_index_below_cur(
+ const btr_cur_t* cur,
+ ulint n_prefix,
+ ib_uint64_t* n_diff,
+ ib_uint64_t* n_external_pages)
+{
+ dict_index_t* index;
+ buf_block_t* block;
+ const page_t* page;
+ mem_heap_t* heap;
+ const rec_t* rec;
+ rec_offs* offsets1;
+ rec_offs* offsets2;
+ rec_offs* offsets_rec;
+ ulint size;
+ mtr_t mtr;
+
+ index = btr_cur_get_index(cur);
+
+ /* Allocate offsets for the record and the node pointer, for
+ node pointer records. In a secondary index, the node pointer
+ record will consist of all index fields followed by a child
+ page number.
+ Allocate space for the offsets header (the allocation size at
+ offsets[0] and the REC_OFFS_HEADER_SIZE bytes), and n_fields + 1,
+ so that this will never be less than the size calculated in
+ rec_get_offsets_func(). */
+ size = (1 + REC_OFFS_HEADER_SIZE) + 1 + dict_index_get_n_fields(index);
+
+ heap = mem_heap_create(size * (sizeof *offsets1 + sizeof *offsets2));
+
+ offsets1 = static_cast<rec_offs*>(mem_heap_alloc(
+ heap, size * sizeof *offsets1));
+
+ offsets2 = static_cast<rec_offs*>(mem_heap_alloc(
+ heap, size * sizeof *offsets2));
+
+ rec_offs_set_n_alloc(offsets1, size);
+ rec_offs_set_n_alloc(offsets2, size);
+
+ rec = btr_cur_get_rec(cur);
+ page = page_align(rec);
+ ut_ad(!page_rec_is_leaf(rec));
+
+ offsets_rec = rec_get_offsets(rec, index, offsets1, 0,
+ ULINT_UNDEFINED, &heap);
+
+ page_id_t page_id(index->table->space_id,
+ btr_node_ptr_get_child_page_no(
+ rec, offsets_rec));
+ const ulint zip_size = index->table->space->zip_size();
+
+ /* assume no external pages by default - in case we quit from this
+ function without analyzing any leaf pages */
+ *n_external_pages = 0;
+
+ mtr_start(&mtr);
+
+ /* descend to the leaf level on the B-tree */
+ for (;;) {
+ dberr_t err;
+
+ block = buf_page_get_gen(page_id, zip_size,
+ RW_S_LATCH, NULL, BUF_GET,
+ &mtr, &err,
+ !index->is_clust()
+ && 1 == btr_page_get_level(page));
+ if (!block) {
+ goto func_exit;
+ }
+
+ page = block->page.frame;
+
+ if (page_is_leaf(page)) {
+ /* leaf level */
+ break;
+ }
+ /* else */
+
+ /* search for the first non-boring record on the page */
+ offsets_rec = dict_stats_scan_page(
+ &rec, offsets1, offsets2, index, page, n_prefix,
+ 0, n_diff, NULL);
+
+ /* pages on level > 0 are not allowed to be empty */
+ ut_a(offsets_rec != NULL);
+ /* if page is not empty (offsets_rec != NULL) then n_diff must
+ be > 0, otherwise there is a bug in dict_stats_scan_page() */
+ ut_a(*n_diff > 0);
+
+ if (*n_diff == 1) {
+ mtr_commit(&mtr);
+
+ /* page has all keys equal and the end of the page
+ was reached by dict_stats_scan_page(), no need to
+ descend to the leaf level */
+ mem_heap_free(heap);
+ /* can't get an estimate for n_external_pages here
+ because we do not dive to the leaf level, assume no
+ external pages (*n_external_pages was assigned to 0
+ above). */
+ return;
+ }
+ /* else */
+
+ /* when we instruct dict_stats_scan_page() to quit on the
+ first non-boring record it finds, then the returned n_diff
+ can either be 0 (empty page), 1 (page has all keys equal) or
+ 2 (non-boring record was found) */
+ ut_a(*n_diff == 2);
+
+ /* we have a non-boring record in rec, descend below it */
+
+ page_id.set_page_no(
+ btr_node_ptr_get_child_page_no(rec, offsets_rec));
+ }
+
+ /* make sure we got a leaf page as a result from the above loop */
+ ut_ad(page_is_leaf(page));
+
+ /* scan the leaf page and find the number of distinct keys,
+ when looking only at the first n_prefix columns; also estimate
+ the number of externally stored pages pointed by records on this
+ page */
+
+ offsets_rec = dict_stats_scan_page(
+ &rec, offsets1, offsets2, index, page, n_prefix,
+ index->n_core_fields, n_diff,
+ n_external_pages);
+
+#if 0
+ DEBUG_PRINTF(" %s(): n_diff below page_no=%lu: " UINT64PF "\n",
+ __func__, page_no, n_diff);
+#endif
+
+func_exit:
+ mtr_commit(&mtr);
+ mem_heap_free(heap);
+}
+
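+/* For example, while diving for n_prefix=1, a non-leaf page whose
+records all carry the same first-column value is "all boring": every
+record below it is equal on that prefix, so the dive above reports
+n_diff=1 without reading any leaf page. */
+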
This is >= 1 because we + avoid scanning the leaf level because it may contain too many + pages and doing so is useless when combined with the random dives - + if we are to scan the leaf level, this means a full scan and we can + simply do that instead of fiddling with picking random records higher + in the tree and diving below them. At the start of the analysis + we may decide to do a full scan of the leaf level, but then this + structure is not used in that code path. */ + ulint level; + + /** Number of records on the level where the descent through the btree + stopped. When we scan the btree from the root, we stop at some mid + level, choose some records from it and dive below them towards a leaf + page to analyze. */ + ib_uint64_t n_recs_on_level; + + /** Number of different key values that were found on the mid level. */ + ib_uint64_t n_diff_on_level; + + /** Number of leaf pages that are analyzed. This is also the same as + the number of records that we pick from the mid level and dive below + them. */ + ib_uint64_t n_leaf_pages_to_analyze; + + /** Cumulative sum of the number of different key values that were + found on all analyzed pages. */ + ib_uint64_t n_diff_all_analyzed_pages; + + /** Cumulative sum of the number of external pages (stored outside of + the btree but in the same file segment). */ + ib_uint64_t n_external_pages_sum; +}; + +/** Estimate the number of different key values in an index when looking at +the first n_prefix columns. For a given level in an index select +n_diff_data->n_leaf_pages_to_analyze records from that level and dive below +them to the corresponding leaf pages, then scan those leaf pages and save the +sampling results in n_diff_data->n_diff_all_analyzed_pages. +@param[in] index index +@param[in] n_prefix look at first 'n_prefix' columns when +comparing records +@param[in] boundaries a vector that contains +n_diff_data->n_diff_on_level integers each of which represents the index (on +level 'level', counting from left/smallest to right/biggest from 0) of the +last record from each group of distinct keys +@param[in,out] n_diff_data n_diff_all_analyzed_pages and +n_external_pages_sum in this structure will be set by this function. The +members level, n_diff_on_level and n_leaf_pages_to_analyze must be set by the +caller in advance - they are used by some calculations inside this function +@param[in,out] mtr mini-transaction */ +static +void +dict_stats_analyze_index_for_n_prefix( + dict_index_t* index, + ulint n_prefix, + const boundaries_t* boundaries, + n_diff_data_t* n_diff_data, + mtr_t* mtr) +{ + btr_pcur_t pcur; + const page_t* page; + ib_uint64_t rec_idx; + ib_uint64_t i; + +#if 0 + DEBUG_PRINTF(" %s(table=%s, index=%s, level=%lu, n_prefix=%lu," + " n_diff_on_level=" UINT64PF ")\n", + __func__, index->table->name, index->name, level, + n_prefix, n_diff_data->n_diff_on_level); +#endif + + ut_ad(n_diff_data->level); + + /* Position pcur on the leftmost record on the leftmost page + on the desired level. */ + + n_diff_data->n_diff_all_analyzed_pages = 0; + n_diff_data->n_external_pages_sum = 0; + + if (btr_pcur_open_level(&pcur, n_diff_data->level, mtr, index) + != DB_SUCCESS + || !btr_pcur_move_to_next_on_page(&pcur)) { + return; + } + + page = btr_pcur_get_page(&pcur); + + const rec_t* first_rec = btr_pcur_get_rec(&pcur); + + /* The page must not be empty, except when + it is the root page (and the whole index is empty).
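+ Note for the checks below: the first user record on the leftmost + page of each level above the leaf is expected to carry + REC_INFO_MIN_REC_FLAG in its info bits; if the flag is absent, pcur + is not positioned on the leftmost record of the desired level.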
*/ + if (page_has_prev(page) + || !btr_pcur_is_on_user_rec(&pcur) + || btr_page_get_level(page) != n_diff_data->level + || first_rec != page_rec_get_next_const(page_get_infimum_rec(page)) + || !(rec_get_info_bits(first_rec, page_is_comp(page)) + & REC_INFO_MIN_REC_FLAG)) { + return; + } + + const ib_uint64_t last_idx_on_level = boundaries->at( + static_cast<unsigned>(n_diff_data->n_diff_on_level - 1)); + + rec_idx = 0; + + for (i = 0; i < n_diff_data->n_leaf_pages_to_analyze; i++) { + /* there are n_diff_on_level elements + in 'boundaries' and we divide those elements + into n_leaf_pages_to_analyze segments, for example: + + let n_diff_on_level=100, n_leaf_pages_to_analyze=4, then: + segment i=0: [0, 24] + segment i=1: [25, 49] + segment i=2: [50, 74] + segment i=3: [75, 99] or + + let n_diff_on_level=1, n_leaf_pages_to_analyze=1, then: + segment i=0: [0, 0] or + + let n_diff_on_level=2, n_leaf_pages_to_analyze=2, then: + segment i=0: [0, 0] + segment i=1: [1, 1] or + + let n_diff_on_level=13, n_leaf_pages_to_analyze=7, then: + segment i=0: [0, 0] + segment i=1: [1, 2] + segment i=2: [3, 4] + segment i=3: [5, 6] + segment i=4: [7, 8] + segment i=5: [9, 10] + segment i=6: [11, 12] + + then we select a random record from each segment and dive + below it */ + const ib_uint64_t n_diff = n_diff_data->n_diff_on_level; + const ib_uint64_t n_pick + = n_diff_data->n_leaf_pages_to_analyze; + + const ib_uint64_t left = n_diff * i / n_pick; + const ib_uint64_t right = n_diff * (i + 1) / n_pick - 1; + + ut_a(left <= right); + ut_a(right <= last_idx_on_level); + + const ulint rnd = ut_rnd_interval( + static_cast<ulint>(right - left)); + + const ib_uint64_t dive_below_idx + = boundaries->at(static_cast<unsigned>(left + rnd)); + +#if 0 + DEBUG_PRINTF(" %s(): dive below record with index=" + UINT64PF "\n", __func__, dive_below_idx); +#endif + + /* seek to the record with index dive_below_idx */ + while (rec_idx < dive_below_idx + && btr_pcur_is_on_user_rec(&pcur)) { + + btr_pcur_move_to_next_user_rec(&pcur, mtr); + rec_idx++; + } + + /* if the level has finished before the record we are + searching for, this means that the B-tree has changed in + the meantime, quit our sampling and use whatever stats + we have collected so far */ + if (rec_idx < dive_below_idx) { + + ut_ad(!btr_pcur_is_on_user_rec(&pcur)); + break; + } + + /* it could be that the tree has changed in such a way that + the record under dive_below_idx is the supremum record, in + this case rec_idx == dive_below_idx and pcur is positioned + on the supremum, we do not want to dive below it */ + if (!btr_pcur_is_on_user_rec(&pcur)) { + break; + } + + ut_a(rec_idx == dive_below_idx); + + ib_uint64_t n_diff_on_leaf_page; + ib_uint64_t n_external_pages; + + dict_stats_analyze_index_below_cur(btr_pcur_get_btr_cur(&pcur), + n_prefix, + &n_diff_on_leaf_page, + &n_external_pages); + + /* We adjust n_diff_on_leaf_page here to avoid counting + one value twice - once as the last on some page and once + as the first on another page. Consider the following example: + Leaf level: + page: (2,2,2,2,3,3) + ... many pages like (3,3,3,3,3,3) ... + page: (3,3,3,3,5,5) + ... many pages like (5,5,5,5,5,5) ... + page: (5,5,5,5,8,8) + page: (8,8,8,8,9,9) + our algo would (correctly) get an estimate that there are + 2 distinct records per page (average). Having 4 pages below + non-boring records, it would (wrongly) estimate the number + of distinct records to 8.
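+ With the adjustment below each sampled page contributes one + distinct value instead of two, so the same example yields an + estimate of 4 - much closer to the actual 5 distinct values + (2, 3, 5, 8, 9).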
*/ + if (n_diff_on_leaf_page > 0) { + n_diff_on_leaf_page--; + } + + n_diff_data->n_diff_all_analyzed_pages += n_diff_on_leaf_page; + + n_diff_data->n_external_pages_sum += n_external_pages; + } +} + +/** statistics for an index */ +struct index_stats_t +{ + std::vector<index_field_stats_t> stats; + ulint index_size; + ulint n_leaf_pages; + + index_stats_t(ulint n_uniq) : index_size(1), n_leaf_pages(1) + { + stats.reserve(n_uniq); + for (ulint i= 0; i < n_uniq; ++i) + stats.push_back(index_field_stats_t{0, 1, 0}); + } + + void set_bulk_operation() + { + memset((void*) &stats[0], 0xff, stats.size() * sizeof stats[0]); + } + + bool is_bulk_operation() const + { + for (auto &s : stats) + if (!s.is_bulk_operation()) + return false; + return true; + } +}; + +/** Set dict_index_t::stat_n_diff_key_vals[] and stat_n_sample_sizes[]. +@param[in] n_diff_data input data to use to derive the results +@param[in,out] index_stats index stats to set */ +UNIV_INLINE +void +dict_stats_index_set_n_diff( + const n_diff_data_t* n_diff_data, + index_stats_t& index_stats) +{ + for (ulint n_prefix = index_stats.stats.size(); + n_prefix >= 1; + n_prefix--) { + /* n_diff_all_analyzed_pages can be 0 here if + all the leaf pages sampled contained only + delete-marked records. In this case we should assign + 0 to index->stat_n_diff_key_vals[n_prefix - 1], which + the formula below does. */ + + const n_diff_data_t* data = &n_diff_data[n_prefix - 1]; + + ut_ad(data->n_leaf_pages_to_analyze > 0); + ut_ad(data->n_recs_on_level > 0); + + ib_uint64_t n_ordinary_leaf_pages; + + if (data->level == 1) { + /* If we know the number of records on level 1, then + this number is the same as the number of pages on + level 0 (leaf). */ + n_ordinary_leaf_pages = data->n_recs_on_level; + } else { + /* If we analyzed D ordinary leaf pages and found E + external pages in total linked from those D ordinary + leaf pages, then this means that the ratio + ordinary/external is D/E. Then the ratio ordinary/total + is D / (D + E). Knowing that the total number of pages + is T (including ordinary and external) then we estimate + that the total number of ordinary leaf pages is + T * D / (D + E). */ + n_ordinary_leaf_pages + = index_stats.n_leaf_pages + * data->n_leaf_pages_to_analyze + / (data->n_leaf_pages_to_analyze + + data->n_external_pages_sum); + } + + /* See REF01 for an explanation of the algorithm */ + index_stats.stats[n_prefix - 1].n_diff_key_vals + = n_ordinary_leaf_pages + + * data->n_diff_on_level + / data->n_recs_on_level + + * data->n_diff_all_analyzed_pages + / data->n_leaf_pages_to_analyze; + + index_stats.stats[n_prefix - 1].n_sample_sizes + = data->n_leaf_pages_to_analyze; + + DEBUG_PRINTF(" %s(): n_diff=" UINT64PF + " for n_prefix=" ULINTPF + " (" ULINTPF + " * " UINT64PF " / " UINT64PF + " * " UINT64PF " / " UINT64PF ")\n", + __func__, + index_stats.stats[n_prefix - 1].n_diff_key_vals, + n_prefix, + index_stats.n_leaf_pages, + data->n_diff_on_level, + data->n_recs_on_level, + data->n_diff_all_analyzed_pages, + data->n_leaf_pages_to_analyze); + } +} + +/** Calculates new statistics for a given index and saves them to the index +members stat_n_diff_key_vals[], stat_n_sample_sizes[], stat_index_size and +stat_n_leaf_pages. This function can be slow.
+@param[in] index index to analyze +@return index stats */ +static index_stats_t dict_stats_analyze_index(dict_index_t* index) +{ + bool level_is_analyzed; + ulint n_uniq; + ulint n_prefix; + ib_uint64_t total_recs; + ib_uint64_t total_pages; + mtr_t mtr; + index_stats_t result(index->n_uniq); + DBUG_ENTER("dict_stats_analyze_index"); + + DBUG_PRINT("info", ("index: %s, online status: %d", index->name(), + dict_index_get_online_status(index))); + + ut_ad(!index->table->stats_mutex_is_owner()); + ut_ad(index->table->get_ref_count()); + + if (!index->is_btree()) { + DBUG_RETURN(result); + } + + DEBUG_PRINTF(" %s(index=%s)\n", __func__, index->name()); + + mtr.start(); + mtr_sx_lock_index(index, &mtr); + dberr_t err; + buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, &mtr, &err); + if (!root) { +empty_index: + mtr.commit(); + dict_stats_assert_initialized_index(index); + DBUG_RETURN(result); + } + + uint16_t root_level = btr_page_get_level(root->page.frame); + mtr.x_lock_space(index->table->space); + ulint dummy, size; + result.index_size + = fseg_n_reserved_pages(*root, PAGE_HEADER + PAGE_BTR_SEG_LEAF + + root->page.frame, &size, &mtr) + + fseg_n_reserved_pages(*root, PAGE_HEADER + PAGE_BTR_SEG_TOP + + root->page.frame, &dummy, &mtr); + result.n_leaf_pages = size ? size : 1; + + const auto bulk_trx_id = index->table->bulk_trx_id; + if (bulk_trx_id && trx_sys.find(nullptr, bulk_trx_id, false)) { + result.set_bulk_operation(); + goto empty_index; + } + + mtr.commit(); + + mtr.start(); + mtr_sx_lock_index(index, &mtr); + + n_uniq = dict_index_get_n_unique(index); + + /* If the tree has just one level (and one page) or if the user + has requested to sample too many pages then do full scan. + + For each n-column prefix (for n=1..n_uniq) N_SAMPLE_PAGES(index) + will be sampled, so in total N_SAMPLE_PAGES(index) * n_uniq leaf + pages will be sampled. If that number is bigger than the total + number of leaf pages then do full scan of the leaf level instead + since it will be faster and will give better results. */ + + if (root_level == 0 + || N_SAMPLE_PAGES(index) * n_uniq > result.n_leaf_pages) { + + if (root_level == 0) { + DEBUG_PRINTF(" %s(): just one page," + " doing full scan\n", __func__); + } else { + DEBUG_PRINTF(" %s(): too many pages requested for" + " sampling, doing full scan\n", __func__); + } + + /* do full scan of level 0; save results directly + into the index */ + + dict_stats_analyze_index_level(index, + 0 /* leaf level */, + index->stat_n_diff_key_vals, + &total_recs, + &total_pages, + NULL /* boundaries not needed */, + &mtr); + + mtr.commit(); + + index->table->stats_mutex_lock(); + for (ulint i = 0; i < n_uniq; i++) { + result.stats[i].n_diff_key_vals = index->stat_n_diff_key_vals[i]; + result.stats[i].n_sample_sizes = total_pages; + result.stats[i].n_non_null_key_vals = index->stat_n_non_null_key_vals[i]; + } + result.n_leaf_pages = index->stat_n_leaf_pages; + index->table->stats_mutex_unlock(); + + DBUG_RETURN(result); + } + + /* For each level that is being scanned in the btree, this contains the + number of different key values for all possible n-column prefixes. */ + ib_uint64_t* n_diff_on_level = UT_NEW_ARRAY( + ib_uint64_t, n_uniq, mem_key_dict_stats_n_diff_on_level); + + /* For each level that is being scanned in the btree, this contains the + index of the last record from each group of equal records (when + comparing only the first n columns, n=1..n_uniq). 
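+ For example (hypothetical keys): if a level contains the keys + (1, 1, 2, 2, 2, 3), then for n=1 there are 3 groups and the recorded + boundaries are 1, 4 and 5 - the 0-based indexes of the last record in + each group.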
*/ + boundaries_t* n_diff_boundaries = UT_NEW_ARRAY_NOKEY(boundaries_t, + n_uniq); + + /* For each n-column prefix this array contains the input data that is + used to calculate dict_index_t::stat_n_diff_key_vals[]. */ + n_diff_data_t* n_diff_data = UT_NEW_ARRAY_NOKEY(n_diff_data_t, n_uniq); + + /* total_recs is also used to estimate the number of pages on one + level below, so at the start we have 1 page (the root) */ + total_recs = 1; + + /* Here we use the following optimization: + If we find that level L is the first one (searching from the + root) that contains at least D distinct keys when looking at + the first n_prefix columns, then: + if we look at the first n_prefix-1 columns then the first + level that contains D distinct keys will be either L or a + lower one. + So if we find that the first level containing D distinct + keys (on n_prefix columns) is L, we continue from L when + searching for D distinct keys on n_prefix-1 columns. */ + auto level = root_level; + level_is_analyzed = false; + + for (n_prefix = n_uniq; n_prefix >= 1; n_prefix--) { + + DEBUG_PRINTF(" %s(): searching level with >=%llu " + "distinct records, n_prefix=" ULINTPF "\n", + __func__, N_DIFF_REQUIRED(index), n_prefix); + + /* Commit the mtr to release the tree S lock to allow + other threads to do some work too. */ + mtr.commit(); + mtr.start(); + mtr_sx_lock_index(index, &mtr); + ut_ad(mtr.get_savepoint() == 1); + buf_block_t *root = btr_root_block_get(index, RW_S_LATCH, + &mtr, &err); + if (!root || root_level != btr_page_get_level(root->page.frame) + || index->table->bulk_trx_id != bulk_trx_id) { + /* Just quit if the tree has changed beyond + recognition here. The old stats from previous + runs will remain in the values that we have + not calculated yet. Initially when the index + object is created the stats members are given + some sensible values so leaving them untouched + here even the first time will not cause us to + read uninitialized memory later. */ + break; + } + + mtr.rollback_to_savepoint(1); + + /* check whether we should pick the current level; + we pick level 1 even if it does not have enough + distinct records because we do not want to scan the + leaf level because it may contain too many records */ + if (level_is_analyzed + && (n_diff_on_level[n_prefix - 1] >= N_DIFF_REQUIRED(index) + || level == 1)) { + + goto found_level; + } + + /* search for a level that contains enough distinct records */ + + if (level_is_analyzed && level > 1) { + + /* if this does not hold we should be on + "found_level" instead of here */ + ut_ad(n_diff_on_level[n_prefix - 1] + < N_DIFF_REQUIRED(index)); + + level--; + level_is_analyzed = false; + } + + /* descend into the tree, searching for "good enough" level */ + for (;;) { + + /* make sure we do not scan the leaf level + accidentally, it may contain too many pages */ + ut_ad(level > 0); + + /* scanning the same level twice is an optimization + bug */ + ut_ad(!level_is_analyzed); + + /* Do not scan if this would read too many pages. + Here we use the following fact: + the number of pages on level L equals the number + of records on level L+1, thus we deduce that the + following call would scan total_recs pages, because + total_recs is left from the previous iteration when + we scanned one level upper or we have not scanned any + levels yet in which case total_recs is 1. 
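+ For example (hypothetical numbers): if the previously scanned level + held 1000 records, the level below it consists of 1000 pages, so + scanning that level would read about 1000 pages.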
*/ + if (total_recs > N_SAMPLE_PAGES(index)) { + + /* if the above cond is true then we are + not at the root level since on the root + level total_recs == 1 (set before we + enter the n-prefix loop) and cannot + be > N_SAMPLE_PAGES(index) */ + ut_a(level != root_level); + + /* step one level back and be satisfied with + whatever it contains */ + level++; + level_is_analyzed = true; + + break; + } + + mtr.rollback_to_savepoint(1); + dict_stats_analyze_index_level(index, + level, + n_diff_on_level, + &total_recs, + &total_pages, + n_diff_boundaries, + &mtr); + mtr.rollback_to_savepoint(1); + level_is_analyzed = true; + + if (level == 1 + || n_diff_on_level[n_prefix - 1] + >= N_DIFF_REQUIRED(index)) { + /* we have reached the last level we could scan + or we found a good level with many distinct + records */ + break; + } + + level--; + level_is_analyzed = false; + } +found_level: + + DEBUG_PRINTF(" %s(): found level " ULINTPF + " that has " UINT64PF + " distinct records for n_prefix=" ULINTPF "\n", + __func__, level, n_diff_on_level[n_prefix - 1], + n_prefix); + /* here we are either on level 1 or the level that we are on + contains >= N_DIFF_REQUIRED distinct keys or we did not scan + deeper levels because they would contain too many pages */ + + ut_ad(level > 0); + + ut_ad(level_is_analyzed); + + /* if any of these is 0 then there is exactly one page in the + B-tree and it is empty and we should have done full scan and + should not be here */ + ut_ad(total_recs > 0); + ut_ad(n_diff_on_level[n_prefix - 1] > 0); + + ut_ad(N_SAMPLE_PAGES(index) > 0); + + n_diff_data_t* data = &n_diff_data[n_prefix - 1]; + + data->level = level; + + data->n_recs_on_level = total_recs; + + data->n_diff_on_level = n_diff_on_level[n_prefix - 1]; + + data->n_leaf_pages_to_analyze = std::min( + N_SAMPLE_PAGES(index), + n_diff_on_level[n_prefix - 1]); + + /* pick some records from this level and dive below them for + the given n_prefix */ + + dict_stats_analyze_index_for_n_prefix( + index, n_prefix, &n_diff_boundaries[n_prefix - 1], + data, &mtr); + } + + mtr.commit(); + + UT_DELETE_ARRAY(n_diff_boundaries); + + UT_DELETE_ARRAY(n_diff_on_level); + + /* n_prefix == 0 means that the above loop did not end up prematurely + due to tree being changed and so n_diff_data[] is set up. */ + if (n_prefix == 0) { + dict_stats_index_set_n_diff(n_diff_data, result); + } + + UT_DELETE_ARRAY(n_diff_data); + + DBUG_RETURN(result); +} + +/*********************************************************************//** +Calculates new estimates for table and index statistics. This function +is relatively slow and is used to calculate persistent statistics that +will be saved on disk. 
+@return DB_SUCCESS or error code +@retval DB_SUCCESS_LOCKED_REC if the table is under a bulk insert operation */ +static +dberr_t +dict_stats_update_persistent( +/*=========================*/ + dict_table_t* table) /*!< in/out: table */ +{ + dict_index_t* index; + + DEBUG_PRINTF("%s(table=%s)\n", __func__, table->name); + + DEBUG_SYNC_C("dict_stats_update_persistent"); + + /* analyze the clustered index first */ + + index = dict_table_get_first_index(table); + + if (index == NULL + || index->is_corrupted() + || (index->type | DICT_UNIQUE) != (DICT_CLUSTERED | DICT_UNIQUE)) { + + /* Table definition is corrupt */ + dict_stats_empty_table(table, true); + + return(DB_CORRUPTION); + } + + ut_ad(!dict_index_is_ibuf(index)); + table->stats_mutex_lock(); + dict_stats_empty_index(index, false); + table->stats_mutex_unlock(); + + index_stats_t stats = dict_stats_analyze_index(index); + + if (stats.is_bulk_operation()) { + dict_stats_empty_table(table, false); + return DB_SUCCESS_LOCKED_REC; + } + + table->stats_mutex_lock(); + index->stat_index_size = stats.index_size; + index->stat_n_leaf_pages = stats.n_leaf_pages; + for (size_t i = 0; i < stats.stats.size(); ++i) { + index->stat_n_diff_key_vals[i] = stats.stats[i].n_diff_key_vals; + index->stat_n_sample_sizes[i] = stats.stats[i].n_sample_sizes; + index->stat_n_non_null_key_vals[i] = stats.stats[i].n_non_null_key_vals; + } + + ulint n_unique = dict_index_get_n_unique(index); + + table->stat_n_rows = index->stat_n_diff_key_vals[n_unique - 1]; + + table->stat_clustered_index_size = index->stat_index_size; + + /* analyze other indexes from the table, if any */ + + table->stat_sum_of_other_index_sizes = 0; + + for (index = dict_table_get_next_index(index); + index != NULL; + index = dict_table_get_next_index(index)) { + + if (!index->is_btree()) { + continue; + } + + dict_stats_empty_index(index, false); + + if (dict_stats_should_ignore_index(index)) { + continue; + } + + table->stats_mutex_unlock(); + stats = dict_stats_analyze_index(index); + table->stats_mutex_lock(); + + if (stats.is_bulk_operation()) { + table->stats_mutex_unlock(); + dict_stats_empty_table(table, false); + return DB_SUCCESS_LOCKED_REC; + } + + index->stat_index_size = stats.index_size; + index->stat_n_leaf_pages = stats.n_leaf_pages; + + for (size_t i = 0; i < stats.stats.size(); ++i) { + index->stat_n_diff_key_vals[i] + = stats.stats[i].n_diff_key_vals; + index->stat_n_sample_sizes[i] + = stats.stats[i].n_sample_sizes; + index->stat_n_non_null_key_vals[i] + = stats.stats[i].n_non_null_key_vals; + } + + table->stat_sum_of_other_index_sizes + += index->stat_index_size; + } + + table->stats_last_recalc = time(NULL); + + table->stat_modified_counter = 0; + + table->stat_initialized = TRUE; + + dict_stats_assert_initialized(table); + + table->stats_mutex_unlock(); + + return(DB_SUCCESS); +} + +#include "mysql_com.h" +/** Save an individual index's statistic into the persistent statistics +storage.
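+The stat_name is one of the per-index statistic names that the callers +below persist, e.g. "size", "n_leaf_pages" or "n_diff_pfx01".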
+@param[in] index index to be updated +@param[in] last_update timestamp of the stat +@param[in] stat_name name of the stat +@param[in] stat_value value of the stat +@param[in] sample_size n pages sampled or NULL +@param[in] stat_description description of the stat +@param[in,out] trx transaction +@return DB_SUCCESS or error code */ +dberr_t +dict_stats_save_index_stat( + dict_index_t* index, + time_t last_update, + const char* stat_name, + ib_uint64_t stat_value, + ib_uint64_t* sample_size, + const char* stat_description, + trx_t* trx) +{ + dberr_t ret; + pars_info_t* pinfo; + char db_utf8[MAX_DB_UTF8_LEN]; + char table_utf8[MAX_TABLE_UTF8_LEN]; + + ut_ad(dict_sys.locked()); + + dict_fs2utf8(index->table->name.m_name, db_utf8, sizeof(db_utf8), + table_utf8, sizeof(table_utf8)); + + pinfo = pars_info_create(); + pars_info_add_str_literal(pinfo, "database_name", db_utf8); + pars_info_add_str_literal(pinfo, "table_name", table_utf8); + pars_info_add_str_literal(pinfo, "index_name", index->name); + MEM_CHECK_DEFINED(&last_update, 4); + pars_info_add_int4_literal(pinfo, "last_update", uint32(last_update)); + MEM_CHECK_DEFINED(stat_name, strlen(stat_name)); + pars_info_add_str_literal(pinfo, "stat_name", stat_name); + MEM_CHECK_DEFINED(&stat_value, 8); + pars_info_add_ull_literal(pinfo, "stat_value", stat_value); + if (sample_size != NULL) { + MEM_CHECK_DEFINED(sample_size, 8); + pars_info_add_ull_literal(pinfo, "sample_size", *sample_size); + } else { + pars_info_add_literal(pinfo, "sample_size", NULL, + UNIV_SQL_NULL, DATA_FIXBINARY, 0); + } + pars_info_add_str_literal(pinfo, "stat_description", + stat_description); + + ret = dict_stats_exec_sql( + pinfo, + "PROCEDURE INDEX_STATS_SAVE () IS\n" + "BEGIN\n" + + "DELETE FROM \"" INDEX_STATS_NAME "\"\n" + "WHERE\n" + "database_name = :database_name AND\n" + "table_name = :table_name AND\n" + "index_name = :index_name AND\n" + "stat_name = :stat_name;\n" + + "INSERT INTO \"" INDEX_STATS_NAME "\"\n" + "VALUES\n" + "(\n" + ":database_name,\n" + ":table_name,\n" + ":index_name,\n" + ":last_update,\n" + ":stat_name,\n" + ":stat_value,\n" + ":sample_size,\n" + ":stat_description\n" + ");\n" + "END;", trx); + + if (UNIV_UNLIKELY(ret != DB_SUCCESS)) { + if (innodb_index_stats_not_found == false && + index->stats_error_printed == false) { + ib::error() << "Cannot save index statistics for table " + << index->table->name + << ", index " << index->name + << ", stat name \"" << stat_name << "\": " + << ret; + index->stats_error_printed = true; + } + } + + return(ret); +} + +/** Report an error if updating table statistics failed because +.ibd file is missing, table decryption failed or table is corrupted. +@param[in,out] table Table +@param[in] defragment true if statistics is for defragment +@retval DB_DECRYPTION_FAILED if decryption of the table failed +@retval DB_TABLESPACE_DELETED if .ibd file is missing +@retval DB_CORRUPTION if table is marked as corrupted */ +dberr_t +dict_stats_report_error(dict_table_t* table, bool defragment) +{ + dberr_t err; + + const char* df = defragment ? " defragment" : ""; + + if (!table->space) { + ib::warn() << "Cannot save" << df << " statistics for table " + << table->name + << " because the .ibd file is missing. " + << TROUBLESHOOTING_MSG; + err = DB_TABLESPACE_DELETED; + } else { + ib::warn() << "Cannot save" << df << " statistics for table " + << table->name + << " because file " + << table->space->chain.start->name + << (table->corrupted + ? " is corrupted." + : " cannot be decrypted."); + err = table->corrupted ? 
DB_CORRUPTION : DB_DECRYPTION_FAILED; + } + + dict_stats_empty_table(table, defragment); + return err; +} + +/** Save the table's statistics into the persistent statistics storage. +@param[in] table_orig table whose stats to save +@param[in] only_for_index if this is non-NULL, then stats for indexes +that are not equal to it will not be saved, if NULL, then all indexes' stats +are saved +@return DB_SUCCESS or error code */ +static +dberr_t +dict_stats_save( + dict_table_t* table_orig, + const index_id_t* only_for_index) +{ + pars_info_t* pinfo; + char db_utf8[MAX_DB_UTF8_LEN]; + char table_utf8[MAX_TABLE_UTF8_LEN]; + +#ifdef ENABLED_DEBUG_SYNC + DBUG_EXECUTE_IF("dict_stats_save_exit_notify", + SCOPE_EXIT([] { + debug_sync_set_action(current_thd, + STRING_WITH_LEN("now SIGNAL dict_stats_save_finished")); + }); + ); +#endif /* ENABLED_DEBUG_SYNC */ + + if (high_level_read_only) { + return DB_READ_ONLY; + } + + if (!table_orig->is_readable()) { + return (dict_stats_report_error(table_orig)); + } + + THD* thd = current_thd; + MDL_ticket *mdl_table = nullptr, *mdl_index = nullptr; + dict_table_t* table_stats = dict_table_open_on_name( + TABLE_STATS_NAME, false, DICT_ERR_IGNORE_NONE); + if (table_stats) { + dict_sys.freeze(SRW_LOCK_CALL); + table_stats = dict_acquire_mdl_shared(table_stats, thd, + &mdl_table); + dict_sys.unfreeze(); + } + if (!table_stats + || strcmp(table_stats->name.m_name, TABLE_STATS_NAME)) { +release_and_exit: + if (table_stats) { + dict_table_close(table_stats, false, thd, mdl_table); + } + return DB_STATS_DO_NOT_EXIST; + } + + dict_table_t* index_stats = dict_table_open_on_name( + INDEX_STATS_NAME, false, DICT_ERR_IGNORE_NONE); + if (index_stats) { + dict_sys.freeze(SRW_LOCK_CALL); + index_stats = dict_acquire_mdl_shared(index_stats, thd, + &mdl_index); + dict_sys.unfreeze(); + } + if (!index_stats) { + goto release_and_exit; + } + if (strcmp(index_stats->name.m_name, INDEX_STATS_NAME)) { + dict_table_close(index_stats, false, thd, mdl_index); + goto release_and_exit; + } + + dict_table_t* table = dict_stats_snapshot_create(table_orig); + + dict_fs2utf8(table->name.m_name, db_utf8, sizeof(db_utf8), + table_utf8, sizeof(table_utf8)); + const time_t now = time(NULL); + trx_t* trx = trx_create(); + trx->mysql_thd = thd; + trx_start_internal(trx); + dberr_t ret = trx->read_only + ? 
DB_READ_ONLY + : lock_table_for_trx(table_stats, trx, LOCK_X); + if (ret == DB_SUCCESS) { + ret = lock_table_for_trx(index_stats, trx, LOCK_X); + } + if (ret != DB_SUCCESS) { + if (trx->state != TRX_STATE_NOT_STARTED) { + trx->commit(); + } + goto unlocked_free_and_exit; + } + + pinfo = pars_info_create(); + + pars_info_add_str_literal(pinfo, "database_name", db_utf8); + pars_info_add_str_literal(pinfo, "table_name", table_utf8); + pars_info_add_int4_literal(pinfo, "last_update", uint32(now)); + pars_info_add_ull_literal(pinfo, "n_rows", table->stat_n_rows); + pars_info_add_ull_literal(pinfo, "clustered_index_size", + table->stat_clustered_index_size); + pars_info_add_ull_literal(pinfo, "sum_of_other_index_sizes", + table->stat_sum_of_other_index_sizes); + + dict_sys.lock(SRW_LOCK_CALL); + trx->dict_operation_lock_mode = true; + + ret = dict_stats_exec_sql( + pinfo, + "PROCEDURE TABLE_STATS_SAVE () IS\n" + "BEGIN\n" + + "DELETE FROM \"" TABLE_STATS_NAME "\"\n" + "WHERE\n" + "database_name = :database_name AND\n" + "table_name = :table_name;\n" + + "INSERT INTO \"" TABLE_STATS_NAME "\"\n" + "VALUES\n" + "(\n" + ":database_name,\n" + ":table_name,\n" + ":last_update,\n" + ":n_rows,\n" + ":clustered_index_size,\n" + ":sum_of_other_index_sizes\n" + ");\n" + "END;", trx); + + if (UNIV_UNLIKELY(ret != DB_SUCCESS)) { + ib::error() << "Cannot save table statistics for table " + << table->name << ": " << ret; +rollback_and_exit: + trx->rollback(); +free_and_exit: + trx->dict_operation_lock_mode = false; + dict_sys.unlock(); +unlocked_free_and_exit: + trx->free(); + dict_stats_snapshot_free(table); + dict_table_close(table_stats, false, thd, mdl_table); + dict_table_close(index_stats, false, thd, mdl_index); + return ret; + } + + dict_index_t* index; + index_map_t indexes( + (ut_strcmp_functor()), + index_map_t_allocator(mem_key_dict_stats_index_map_t)); + + /* Below we do all the modifications in innodb_index_stats in a single + transaction for performance reasons. Modifying more than one row in a + single transaction may deadlock with other transactions if they + lock the rows in a different order. Such another transaction could be, + for example, the + DELETE FROM innodb_index_stats WHERE database_name = '...' + AND table_name = '...'; that is executed when we DROP a table; it + affects more than one row. To + prevent deadlocks we always lock the rows in the same order - the + order of the PK, which is (database_name, table_name, index_name, + stat_name). This is why below we sort the indexes by name and then, + for each index, do the modifications ordered by stat_name.
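+ For example (index names hypothetical): with indexes PRIMARY and k1 + the rows are modified in the order (PRIMARY, n_diff_pfx01), ..., + (PRIMARY, n_leaf_pages), (PRIMARY, size), (k1, n_diff_pfx01), ... - + this follows the PK order because "n_diff_pfx.." < "n_leaf_pages" + < "size" in the stat_name ordering.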
*/ + + for (index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + + indexes[index->name] = index; + } + + index_map_t::const_iterator it; + + for (it = indexes.begin(); it != indexes.end(); ++it) { + + index = it->second; + + if (only_for_index != NULL && index->id != *only_for_index) { + continue; + } + + if (dict_stats_should_ignore_index(index)) { + continue; + } + + ut_ad(!dict_index_is_ibuf(index)); + + for (unsigned i = 0; i < index->n_uniq; i++) { + + char stat_name[16]; + char stat_description[1024]; + + snprintf(stat_name, sizeof(stat_name), + "n_diff_pfx%02u", i + 1); + + /* craft a string that contains the column names */ + snprintf(stat_description, sizeof(stat_description), + "%s", index->fields[0].name()); + for (unsigned j = 1; j <= i; j++) { + size_t len; + + len = strlen(stat_description); + + snprintf(stat_description + len, + sizeof(stat_description) - len, + ",%s", index->fields[j].name()); + } + + ret = dict_stats_save_index_stat( + index, now, stat_name, + index->stat_n_diff_key_vals[i], + &index->stat_n_sample_sizes[i], + stat_description, trx); + + if (ret != DB_SUCCESS) { + goto rollback_and_exit; + } + } + + ret = dict_stats_save_index_stat(index, now, "n_leaf_pages", + index->stat_n_leaf_pages, + NULL, + "Number of leaf pages " + "in the index", trx); + if (ret != DB_SUCCESS) { + goto rollback_and_exit; + } + + ret = dict_stats_save_index_stat(index, now, "size", + index->stat_index_size, + NULL, + "Number of pages " + "in the index", trx); + if (ret != DB_SUCCESS) { + goto rollback_and_exit; + } + } + + ret= trx->bulk_insert_apply(); + if (ret != DB_SUCCESS) { + goto rollback_and_exit; + } + + trx->commit(); + goto free_and_exit; +} + +/*********************************************************************//** +Called for the row that is selected by +SELECT ... FROM mysql.innodb_table_stats WHERE table='...' +The second argument is a pointer to the table and the fetched stats are +written to it. 
+@return non-NULL dummy */ +static +ibool +dict_stats_fetch_table_stats_step( +/*==============================*/ + void* node_void, /*!< in: select node */ + void* table_void) /*!< out: table */ +{ + sel_node_t* node = (sel_node_t*) node_void; + dict_table_t* table = (dict_table_t*) table_void; + que_common_t* cnode; + int i; + + /* this should loop exactly 3 times - for + n_rows,clustered_index_size,sum_of_other_index_sizes */ + for (cnode = static_cast<que_common_t*>(node->select_list), i = 0; + cnode != NULL; + cnode = static_cast<que_common_t*>(que_node_get_next(cnode)), + i++) { + + const byte* data; + dfield_t* dfield = que_node_get_val(cnode); + dtype_t* type = dfield_get_type(dfield); + ulint len = dfield_get_len(dfield); + + data = static_cast<const byte*>(dfield_get_data(dfield)); + + switch (i) { + case 0: /* mysql.innodb_table_stats.n_rows */ + + ut_a(dtype_get_mtype(type) == DATA_INT); + ut_a(len == 8); + + table->stat_n_rows = mach_read_from_8(data); + + break; + + case 1: /* mysql.innodb_table_stats.clustered_index_size */ + + ut_a(dtype_get_mtype(type) == DATA_INT); + ut_a(len == 8); + + table->stat_clustered_index_size + = (ulint) mach_read_from_8(data); + + break; + + case 2: /* mysql.innodb_table_stats.sum_of_other_index_sizes */ + + ut_a(dtype_get_mtype(type) == DATA_INT); + ut_a(len == 8); + + table->stat_sum_of_other_index_sizes + = (ulint) mach_read_from_8(data); + + break; + + default: + + /* someone changed SELECT + n_rows,clustered_index_size,sum_of_other_index_sizes + to select more columns from innodb_table_stats without + adjusting here */ + ut_error; + } + } + + /* if i < 3 this means someone changed the + SELECT n_rows,clustered_index_size,sum_of_other_index_sizes + to select fewer columns from innodb_table_stats without adjusting here; + if i > 3 we would have ut_error'ed earlier */ + ut_a(i == 3 /*n_rows,clustered_index_size,sum_of_other_index_sizes*/); + + /* XXX this is not used but returning non-NULL is necessary */ + return(TRUE); +} + +/** Aux struct used to pass a table and a boolean to +dict_stats_fetch_index_stats_step(). */ +struct index_fetch_t { + dict_table_t* table; /*!< table whose indexes are to be modified */ + bool stats_were_modified; /*!< will be set to true if at + least one index's stats were modified */ +}; + +/*********************************************************************//** +Called for the rows that are selected by +SELECT ... FROM mysql.innodb_index_stats WHERE table='...' +The second argument is a pointer to the table and the fetched stats are +written to its indexes. +Suppose a table has N indexes and each index has Ui unique columns for +i=1..N; then mysql.innodb_index_stats will have SUM(Ui) (i=1..N) rows for +that table. +So this function will be called SUM(Ui) times where SUM(Ui) is of magnitude +N*AVG(Ui). In each call it searches for the currently fetched index in +table->indexes linearly, assuming this list is not sorted. Thus, overall, +fetching all indexes' stats from mysql.innodb_index_stats is O(N^2) where N +is the number of indexes. +This can be improved if we sort table->indexes in a temporary area just once +and then search in that sorted list. Then the complexity will be O(N*log(N)). +We assume a table will not have more than 100 indexes, so we go with the +simpler N^2 algorithm.
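+For example, under the stated assumption of at most N=100 indexes with, +say, an average of 2 unique columns each, about 200 rows are fetched and +each row triggers a scan of up to 100 index names, i.e. on the order of +20000 name comparisons in total.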
+@return non-NULL dummy */ +static +ibool +dict_stats_fetch_index_stats_step( +/*==============================*/ + void* node_void, /*!< in: select node */ + void* arg_void) /*!< out: table + a flag that tells if we + modified anything */ +{ + sel_node_t* node = (sel_node_t*) node_void; + index_fetch_t* arg = (index_fetch_t*) arg_void; + dict_table_t* table = arg->table; + dict_index_t* index = NULL; + que_common_t* cnode; + const char* stat_name = NULL; + ulint stat_name_len = ULINT_UNDEFINED; + ib_uint64_t stat_value = UINT64_UNDEFINED; + ib_uint64_t sample_size = UINT64_UNDEFINED; + int i; + + /* this should loop exactly 4 times - for the columns that + were selected: index_name,stat_name,stat_value,sample_size */ + for (cnode = static_cast<que_common_t*>(node->select_list), i = 0; + cnode != NULL; + cnode = static_cast<que_common_t*>(que_node_get_next(cnode)), + i++) { + + const byte* data; + dfield_t* dfield = que_node_get_val(cnode); + dtype_t* type = dfield_get_type(dfield); + ulint len = dfield_get_len(dfield); + + data = static_cast<const byte*>(dfield_get_data(dfield)); + + switch (i) { + case 0: /* mysql.innodb_index_stats.index_name */ + + ut_a(dtype_get_mtype(type) == DATA_VARMYSQL); + + /* search for index in table's indexes whose name + matches data; the fetched index name is in data, + has no terminating '\0' and has length len */ + for (index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + + if (index->is_committed() + && strlen(index->name) == len + && memcmp(index->name, data, len) == 0) { + /* the corresponding index was found */ + break; + } + } + + /* if index is NULL here this means that + mysql.innodb_index_stats contains more rows than the + number of indexes in the table; this is ok, we just + return ignoring those extra rows; in other words + dict_stats_fetch_index_stats_step() has been called + for a row from index_stats with unknown index_name + column */ + if (index == NULL) { + + return(TRUE); + } + + break; + + case 1: /* mysql.innodb_index_stats.stat_name */ + + ut_a(dtype_get_mtype(type) == DATA_VARMYSQL); + + ut_a(index != NULL); + + stat_name = (const char*) data; + stat_name_len = len; + + break; + + case 2: /* mysql.innodb_index_stats.stat_value */ + + ut_a(dtype_get_mtype(type) == DATA_INT); + ut_a(len == 8); + + ut_a(index != NULL); + ut_a(stat_name != NULL); + ut_a(stat_name_len != ULINT_UNDEFINED); + + stat_value = mach_read_from_8(data); + + break; + + case 3: /* mysql.innodb_index_stats.sample_size */ + + ut_a(dtype_get_mtype(type) == DATA_INT); + ut_a(len == 8 || len == UNIV_SQL_NULL); + + ut_a(index != NULL); + ut_a(stat_name != NULL); + ut_a(stat_name_len != ULINT_UNDEFINED); + ut_a(stat_value != UINT64_UNDEFINED); + + if (len == UNIV_SQL_NULL) { + break; + } + /* else */ + + sample_size = mach_read_from_8(data); + + break; + + default: + + /* someone changed + SELECT index_name,stat_name,stat_value,sample_size + to select more columns from innodb_index_stats without + adjusting here */ + ut_error; + } + } + + /* if i < 4 this means someone changed the + SELECT index_name,stat_name,stat_value,sample_size + to select fewer columns from innodb_index_stats without adjusting here; + if i > 4 we would have ut_error'ed earlier */ + ut_a(i == 4 /* index_name,stat_name,stat_value,sample_size */); + + ut_a(index != NULL); + ut_a(stat_name != NULL); + ut_a(stat_name_len != ULINT_UNDEFINED); + ut_a(stat_value != UINT64_UNDEFINED); + /* sample_size could be UINT64_UNDEFINED here, if it is NULL */ + +#define PFX "n_diff_pfx" +#define PFX_LEN 10
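+ +/* A stat_name of the form "n_diff_pfxNN" (e.g. "n_diff_pfx03") refers +to the first NN columns of the index; the parsing further below relies on +exactly two decimal digits following the PFX prefix. */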
+ + if (stat_name_len == 4 /* strlen("size") */ + && strncasecmp("size", stat_name, stat_name_len) == 0) { + index->stat_index_size = (ulint) stat_value; + arg->stats_were_modified = true; + } else if (stat_name_len == 12 /* strlen("n_leaf_pages") */ + && strncasecmp("n_leaf_pages", stat_name, stat_name_len) + == 0) { + index->stat_n_leaf_pages = (ulint) stat_value; + arg->stats_were_modified = true; + } else if (stat_name_len == 12 /* strlen("n_page_split") */ + && strncasecmp("n_page_split", stat_name, stat_name_len) + == 0) { + index->stat_defrag_n_page_split = (ulint) stat_value; + arg->stats_were_modified = true; + } else if (stat_name_len == 13 /* strlen("n_pages_freed") */ + && strncasecmp("n_pages_freed", stat_name, stat_name_len) + == 0) { + index->stat_defrag_n_pages_freed = (ulint) stat_value; + arg->stats_were_modified = true; + } else if (stat_name_len > PFX_LEN /* e.g. stat_name=="n_diff_pfx01" */ + && strncasecmp(PFX, stat_name, PFX_LEN) == 0) { + + const char* num_ptr; + unsigned long n_pfx; + + /* point num_ptr into "1" from "n_diff_pfx12..." */ + num_ptr = stat_name + PFX_LEN; + + /* stat_name should have exactly 2 chars appended to PFX + and they should be digits */ + if (stat_name_len != PFX_LEN + 2 + || num_ptr[0] < '0' || num_ptr[0] > '9' + || num_ptr[1] < '0' || num_ptr[1] > '9') { + + char db_utf8[MAX_DB_UTF8_LEN]; + char table_utf8[MAX_TABLE_UTF8_LEN]; + + dict_fs2utf8(table->name.m_name, + db_utf8, sizeof(db_utf8), + table_utf8, sizeof(table_utf8)); + + ib::info out; + out << "Ignoring strange row from " + << INDEX_STATS_NAME_PRINT << " WHERE" + " database_name = '" << db_utf8 + << "' AND table_name = '" << table_utf8 + << "' AND index_name = '" << index->name() + << "' AND stat_name = '"; + out.write(stat_name, stat_name_len); + out << "'; because stat_name is malformed"; + return(TRUE); + } + /* else */ + + /* extract 12 from "n_diff_pfx12..." into n_pfx + note that stat_name does not have a terminating '\0' */ + n_pfx = ulong(num_ptr[0] - '0') * 10 + ulong(num_ptr[1] - '0'); + + ulint n_uniq = index->n_uniq; + + if (n_pfx == 0 || n_pfx > n_uniq) { + + char db_utf8[MAX_DB_UTF8_LEN]; + char table_utf8[MAX_TABLE_UTF8_LEN]; + + dict_fs2utf8(table->name.m_name, + db_utf8, sizeof(db_utf8), + table_utf8, sizeof(table_utf8)); + + ib::info out; + out << "Ignoring strange row from " + << INDEX_STATS_NAME_PRINT << " WHERE" + " database_name = '" << db_utf8 + << "' AND table_name = '" << table_utf8 + << "' AND index_name = '" << index->name() + << "' AND stat_name = '"; + out.write(stat_name, stat_name_len); + out << "'; because stat_name is out of range, the index" + " has " << n_uniq << " unique columns"; + + return(TRUE); + } + /* else */ + + index->stat_n_diff_key_vals[n_pfx - 1] = stat_value; + + if (sample_size != UINT64_UNDEFINED) { + index->stat_n_sample_sizes[n_pfx - 1] = sample_size; + } else { + /* hmm, strange... the user must have UPDATEd the + table manually and SET sample_size = NULL */ + index->stat_n_sample_sizes[n_pfx - 1] = 0; + } + + index->stat_n_non_null_key_vals[n_pfx - 1] = 0; + + arg->stats_were_modified = true; + } else { + /* silently ignore rows with unknown stat_name, the + user may have developed her own stats */ + } + + /* XXX this is not used but returning non-NULL is necessary */ + return(TRUE); +} + +/*********************************************************************//** +Read table's statistics from the persistent statistics storage. 
+@return DB_SUCCESS or error code */ +static +dberr_t +dict_stats_fetch_from_ps( +/*=====================*/ + dict_table_t* table) /*!< in/out: table */ +{ + index_fetch_t index_fetch_arg; + trx_t* trx; + pars_info_t* pinfo; + dberr_t ret; + char db_utf8[MAX_DB_UTF8_LEN]; + char table_utf8[MAX_TABLE_UTF8_LEN]; + + /* Initialize all stats to dummy values before fetching because if + the persistent storage contains incomplete stats (e.g. missing stats + for some index) then we would end up with (partially) uninitialized + stats. */ + dict_stats_empty_table(table, true); + + THD* thd = current_thd; + MDL_ticket *mdl_table = nullptr, *mdl_index = nullptr; + dict_table_t* table_stats = dict_table_open_on_name( + TABLE_STATS_NAME, false, DICT_ERR_IGNORE_NONE); + if (table_stats) { + dict_sys.freeze(SRW_LOCK_CALL); + table_stats = dict_acquire_mdl_shared(table_stats, thd, + &mdl_table); + dict_sys.unfreeze(); + } + if (!table_stats + || strcmp(table_stats->name.m_name, TABLE_STATS_NAME)) { +release_and_exit: + if (table_stats) { + dict_table_close(table_stats, false, thd, mdl_table); + } + return DB_STATS_DO_NOT_EXIST; + } + + dict_table_t* index_stats = dict_table_open_on_name( + INDEX_STATS_NAME, false, DICT_ERR_IGNORE_NONE); + if (index_stats) { + dict_sys.freeze(SRW_LOCK_CALL); + index_stats = dict_acquire_mdl_shared(index_stats, thd, + &mdl_index); + dict_sys.unfreeze(); + } + if (!index_stats) { + goto release_and_exit; + } + if (strcmp(index_stats->name.m_name, INDEX_STATS_NAME)) { + dict_table_close(index_stats, false, thd, mdl_index); + goto release_and_exit; + } + + trx = trx_create(); + + trx_start_internal_read_only(trx); + + dict_fs2utf8(table->name.m_name, db_utf8, sizeof(db_utf8), + table_utf8, sizeof(table_utf8)); + + pinfo = pars_info_create(); + + pars_info_add_str_literal(pinfo, "database_name", db_utf8); + + pars_info_add_str_literal(pinfo, "table_name", table_utf8); + + pars_info_bind_function(pinfo, + "fetch_table_stats_step", + dict_stats_fetch_table_stats_step, + table); + + index_fetch_arg.table = table; + index_fetch_arg.stats_were_modified = false; + pars_info_bind_function(pinfo, + "fetch_index_stats_step", + dict_stats_fetch_index_stats_step, + &index_fetch_arg); + dict_sys.lock(SRW_LOCK_CALL); /* FIXME: remove this */ + ret = que_eval_sql(pinfo, + "PROCEDURE FETCH_STATS () IS\n" + "found INT;\n" + "DECLARE FUNCTION fetch_table_stats_step;\n" + "DECLARE FUNCTION fetch_index_stats_step;\n" + "DECLARE CURSOR table_stats_cur IS\n" + " SELECT\n" + /* if you change the selected fields, be + sure to adjust + dict_stats_fetch_table_stats_step() */ + " n_rows,\n" + " clustered_index_size,\n" + " sum_of_other_index_sizes\n" + " FROM \"" TABLE_STATS_NAME "\"\n" + " WHERE\n" + " database_name = :database_name AND\n" + " table_name = :table_name;\n" + "DECLARE CURSOR index_stats_cur IS\n" + " SELECT\n" + /* if you change the selected fields, be + sure to adjust + dict_stats_fetch_index_stats_step() */ + " index_name,\n" + " stat_name,\n" + " stat_value,\n" + " sample_size\n" + " FROM \"" INDEX_STATS_NAME "\"\n" + " WHERE\n" + " database_name = :database_name AND\n" + " table_name = :table_name;\n" + + "BEGIN\n" + + "OPEN table_stats_cur;\n" + "FETCH table_stats_cur INTO\n" + " fetch_table_stats_step();\n" + "IF (SQL % NOTFOUND) THEN\n" + " CLOSE table_stats_cur;\n" + " RETURN;\n" + "END IF;\n" + "CLOSE table_stats_cur;\n" + + "OPEN index_stats_cur;\n" + "found := 1;\n" + "WHILE found = 1 LOOP\n" + " FETCH index_stats_cur INTO\n" + " fetch_index_stats_step();\n" + " IF (SQL % 
NOTFOUND) THEN\n" + " found := 0;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE index_stats_cur;\n" + + "END;", trx); + /* pinfo is freed by que_eval_sql() */ + dict_sys.unlock(); + + dict_table_close(table_stats, false, thd, mdl_table); + dict_table_close(index_stats, false, thd, mdl_index); + + trx_commit_for_mysql(trx); + + trx->free(); + + if (!index_fetch_arg.stats_were_modified) { + return(DB_STATS_DO_NOT_EXIST); + } + + return(ret); +} + +/*********************************************************************//** +Clear defragmentation stats modified counter for all indices in table. */ +static +void +dict_stats_empty_defrag_modified_counter( + dict_table_t* table) /*!< in: table */ +{ + dict_index_t* index; + ut_a(table); + for (index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + index->stat_defrag_modified_counter = 0; + } +} + +/*********************************************************************//** +Fetches or calculates new estimates for index statistics. */ +void +dict_stats_update_for_index( +/*========================*/ + dict_index_t* index) /*!< in/out: index */ +{ + DBUG_ENTER("dict_stats_update_for_index"); + + if (dict_stats_is_persistent_enabled(index->table)) { + + if (dict_stats_persistent_storage_check(false)) { + index_stats_t stats = dict_stats_analyze_index(index); + index->table->stats_mutex_lock(); + index->stat_index_size = stats.index_size; + index->stat_n_leaf_pages = stats.n_leaf_pages; + for (size_t i = 0; i < stats.stats.size(); ++i) { + index->stat_n_diff_key_vals[i] + = stats.stats[i].n_diff_key_vals; + index->stat_n_sample_sizes[i] + = stats.stats[i].n_sample_sizes; + index->stat_n_non_null_key_vals[i] + = stats.stats[i].n_non_null_key_vals; + } + index->table->stat_sum_of_other_index_sizes + += index->stat_index_size; + index->table->stats_mutex_unlock(); + + dict_stats_save(index->table, &index->id); + DBUG_VOID_RETURN; + } + /* else */ + + if (innodb_index_stats_not_found == false && + index->stats_error_printed == false) { + /* Fall back to transient stats since the persistent + storage is not present or is corrupted */ + + ib::info() << "Recalculation of persistent statistics" + " requested for table " << index->table->name + << " index " << index->name + << " but the required" + " persistent statistics storage is not present or is" + " corrupted. Using transient stats instead."; + index->stats_error_printed = true; + } + } + + dict_stats_update_transient_for_index(index); + + DBUG_VOID_RETURN; +} + +/*********************************************************************//** +Calculates new estimates for table and index statistics. The statistics +are used in query optimization. +@return DB_SUCCESS or error code +@retval DB_SUCCESS_LOCKED_REC if the table is under a bulk insert operation */ +dberr_t +dict_stats_update( +/*==============*/ + dict_table_t* table, /*!< in/out: table */ + dict_stats_upd_option_t stats_upd_option) + /*!< in: whether to (re) calc + the stats or to fetch them from + the persistent statistics + storage */ +{ + ut_ad(!table->stats_mutex_is_owner()); + + if (!table->is_readable()) { + return (dict_stats_report_error(table)); + } else if (srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN) { + /* If we have set a high innodb_force_recovery level, do + not calculate statistics, as a badly corrupted index can + cause a crash in it.
*/ + dict_stats_empty_table(table, false); + return(DB_SUCCESS); + } + + if (trx_id_t bulk_trx_id = table->bulk_trx_id) { + if (trx_sys.find(nullptr, bulk_trx_id, false)) { + dict_stats_empty_table(table, false); + return DB_SUCCESS_LOCKED_REC; + } + } + + switch (stats_upd_option) { + case DICT_STATS_RECALC_PERSISTENT: + + if (srv_read_only_mode) { + goto transient; + } + + /* Persistent recalculation requested, called from + 1) ANALYZE TABLE, or + 2) the auto recalculation background thread, or + 3) open table if stats do not exist on disk and auto recalc + is enabled */ + + /* InnoDB internal tables (e.g. SYS_TABLES) cannot have + persistent stats enabled */ + ut_a(strchr(table->name.m_name, '/') != NULL); + + /* check if the persistent statistics storage exists + before calling the potentially slow function + dict_stats_update_persistent(); that is a + prerequisite for dict_stats_save() succeeding */ + if (dict_stats_persistent_storage_check(false)) { + + dberr_t err; + + err = dict_stats_update_persistent(table); + + if (err != DB_SUCCESS) { + return(err); + } + + err = dict_stats_save(table, NULL); + + return(err); + } + + /* Fall back to transient stats since the persistent + storage is not present or is corrupted */ + + if (innodb_table_stats_not_found == false && + table->stats_error_printed == false) { + ib::warn() << "Recalculation of persistent statistics" + " requested for table " + << table->name + << " but the required persistent" + " statistics storage is not present or is corrupted." + " Using transient stats instead."; + table->stats_error_printed = true; + } + + goto transient; + + case DICT_STATS_RECALC_TRANSIENT: + + goto transient; + + case DICT_STATS_EMPTY_TABLE: + + dict_stats_empty_table(table, true); + + /* If table is using persistent stats, + then save the stats on disk */ + + if (dict_stats_is_persistent_enabled(table)) { + + if (dict_stats_persistent_storage_check(false)) { + + return(dict_stats_save(table, NULL)); + } + + return(DB_STATS_DO_NOT_EXIST); + } + + return(DB_SUCCESS); + + case DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY: + + /* fetch requested, either fetch from persistent statistics + storage or use the old method */ + + if (table->stat_initialized) { + return(DB_SUCCESS); + } + + /* InnoDB internal tables (e.g. SYS_TABLES) cannot have + persistent stats enabled */ + ut_a(strchr(table->name.m_name, '/') != NULL); + + if (!dict_stats_persistent_storage_check(false)) { + /* persistent statistics storage does not exist + or is corrupted, calculate the transient stats */ + + if (innodb_table_stats_not_found == false && + table->stats_error_printed == false && + !opt_bootstrap) { + ib::error() << "Fetch of persistent statistics" + " requested for table " + << table->name + << " but the required system tables " + << TABLE_STATS_NAME_PRINT + << " and " << INDEX_STATS_NAME_PRINT + << " are not present or have unexpected" + " structure. Using transient stats instead."; + table->stats_error_printed = true; + } + + goto transient; + } + + dict_table_t* t; + + /* Create a dummy table object with the same name and + indexes, suitable for fetching the stats into it. 
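+ Fetching into a separate object means that a failed or partial fetch + cannot leave the live table object with a mixture of old and new + values; the clone is copied into the real object only in the + DB_SUCCESS case below.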
*/ + t = dict_stats_table_clone_create(table); + + dberr_t err = dict_stats_fetch_from_ps(t); + + t->stats_last_recalc = table->stats_last_recalc; + t->stat_modified_counter = 0; + dict_stats_empty_defrag_modified_counter(t); + + switch (err) { + case DB_SUCCESS: + + table->stats_mutex_lock(); + /* t is localized to this thread so no need to + take stats mutex lock (limiting it to debug only) */ + ut_d(t->stats_mutex_lock()); + + /* Pass reset_ignored_indexes=true as parameter + to dict_stats_copy. This will cause statistics + for corrupted indexes to be set to empty values */ + dict_stats_copy(table, t, true); + + dict_stats_assert_initialized(table); + + ut_d(t->stats_mutex_unlock()); + table->stats_mutex_unlock(); + + dict_stats_table_clone_free(t); + + return(DB_SUCCESS); + case DB_STATS_DO_NOT_EXIST: + + dict_stats_table_clone_free(t); + + if (srv_read_only_mode) { + goto transient; + } + + if (dict_stats_auto_recalc_is_enabled(table)) { + return(dict_stats_update( + table, + DICT_STATS_RECALC_PERSISTENT)); + } + + ib::info() << "Trying to use table " << table->name + << " which has persistent statistics enabled," + " but auto recalculation turned off and the" + " statistics do not exist in " + TABLE_STATS_NAME_PRINT + " and " INDEX_STATS_NAME_PRINT + ". Please either run \"ANALYZE TABLE " + << table->name << ";\" manually or enable the" + " auto recalculation with \"ALTER TABLE " + << table->name << " STATS_AUTO_RECALC=1;\"." + " InnoDB will now use transient statistics for " + << table->name << "."; + + goto transient; + default: + + dict_stats_table_clone_free(t); + + if (innodb_table_stats_not_found == false && + table->stats_error_printed == false) { + ib::error() << "Error fetching persistent statistics" + " for table " + << table->name + << " from " TABLE_STATS_NAME_PRINT " and " + INDEX_STATS_NAME_PRINT ": " << err + << ".
Using transient stats method instead."; + } + + goto transient; + } + /* no "default:" in order to produce a compilation warning + about unhandled enumeration value */ + } + +transient: + return dict_stats_update_transient(table); +} + +/** Execute DELETE FROM mysql.innodb_table_stats +@param database_name database name +@param table_name table name +@param trx transaction (nullptr=start and commit a new one) +@return DB_SUCCESS or error code */ +dberr_t dict_stats_delete_from_table_stats(const char *database_name, + const char *table_name, trx_t *trx) +{ + pars_info_t* pinfo; + + ut_ad(dict_sys.locked()); + + pinfo = pars_info_create(); + + pars_info_add_str_literal(pinfo, "database_name", database_name); + pars_info_add_str_literal(pinfo, "table_name", table_name); + + return dict_stats_exec_sql( + pinfo, + "PROCEDURE DELETE_FROM_TABLE_STATS () IS\n" + "BEGIN\n" + "DELETE FROM \"" TABLE_STATS_NAME "\" WHERE\n" + "database_name = :database_name AND\n" + "table_name = :table_name;\n" + "END;\n", trx); +} + +/** Execute DELETE FROM mysql.innodb_index_stats +@param database_name database name +@param table_name table name +@param trx transaction +@return DB_SUCCESS or error code */ +dberr_t dict_stats_delete_from_index_stats(const char *database_name, + const char *table_name, trx_t *trx) +{ + pars_info_t* pinfo; + + ut_ad(dict_sys.locked()); + + pinfo = pars_info_create(); + + pars_info_add_str_literal(pinfo, "database_name", database_name); + pars_info_add_str_literal(pinfo, "table_name", table_name); + + return dict_stats_exec_sql( + pinfo, + "PROCEDURE DELETE_FROM_INDEX_STATS () IS\n" + "BEGIN\n" + "DELETE FROM \"" INDEX_STATS_NAME "\" WHERE\n" + "database_name = :database_name AND\n" + "table_name = :table_name;\n" + "END;\n", trx); +} + +/** Execute DELETE FROM mysql.innodb_index_stats +@param database_name database name +@param table_name table name +@param index_name name of the index +@param trx transaction +@return DB_SUCCESS or error code */ +dberr_t dict_stats_delete_from_index_stats(const char *database_name, + const char *table_name, + const char *index_name, trx_t *trx) +{ + pars_info_t* pinfo; + + ut_ad(dict_sys.locked()); + + pinfo = pars_info_create(); + + pars_info_add_str_literal(pinfo, "database_name", database_name); + pars_info_add_str_literal(pinfo, "table_name", table_name); + pars_info_add_str_literal(pinfo, "index_name", index_name); + + return dict_stats_exec_sql( + pinfo, + "PROCEDURE DELETE_FROM_INDEX_STATS () IS\n" + "BEGIN\n" + "DELETE FROM \"" INDEX_STATS_NAME "\" WHERE\n" + "database_name = :database_name AND\n" + "table_name = :table_name AND\n" + "index_name = :index_name;\n" + "END;\n", trx); +} + +/** Rename a table in InnoDB persistent stats storage. 
+@param old_name old table name
+@param new_name new table name
+@param trx transaction
+@return DB_SUCCESS or error code */
+dberr_t dict_stats_rename_table(const char *old_name, const char *new_name,
+                                trx_t *trx)
+{
+  /* skip the statistics tables themselves */
+  if (!strcmp(old_name, TABLE_STATS_NAME) ||
+      !strcmp(old_name, INDEX_STATS_NAME) ||
+      !strcmp(new_name, TABLE_STATS_NAME) ||
+      !strcmp(new_name, INDEX_STATS_NAME))
+    return DB_SUCCESS;
+
+  char old_db[MAX_DB_UTF8_LEN];
+  char new_db[MAX_DB_UTF8_LEN];
+  char old_table[MAX_TABLE_UTF8_LEN];
+  char new_table[MAX_TABLE_UTF8_LEN];
+
+  dict_fs2utf8(old_name, old_db, sizeof old_db, old_table, sizeof old_table);
+  dict_fs2utf8(new_name, new_db, sizeof new_db, new_table, sizeof new_table);
+
+  if (dict_table_t::is_temporary_name(old_name) ||
+      dict_table_t::is_temporary_name(new_name))
+  {
+    if (dberr_t e= dict_stats_delete_from_table_stats(old_db, old_table, trx))
+      return e;
+    return dict_stats_delete_from_index_stats(old_db, old_table, trx);
+  }
+
+  pars_info_t *pinfo= pars_info_create();
+  pars_info_add_str_literal(pinfo, "old_db", old_db);
+  pars_info_add_str_literal(pinfo, "old_table", old_table);
+  pars_info_add_str_literal(pinfo, "new_db", new_db);
+  pars_info_add_str_literal(pinfo, "new_table", new_table);
+
+  static const char sql[]=
+    "PROCEDURE RENAME_TABLE_IN_STATS() IS\n"
+    "BEGIN\n"
+    "UPDATE \"" TABLE_STATS_NAME "\" SET\n"
+    "database_name=:new_db, table_name=:new_table\n"
+    "WHERE database_name=:old_db AND table_name=:old_table;\n"
+    "UPDATE \"" INDEX_STATS_NAME "\" SET\n"
+    "database_name=:new_db, table_name=:new_table\n"
+    "WHERE database_name=:old_db AND table_name=:old_table;\n"
+    "END;\n";
+
+  return dict_stats_exec_sql(pinfo, sql, trx);
+}
+
+/** Rename an index in InnoDB persistent statistics.
+@param db database name
+@param table table name
+@param old_name old index name
+@param new_name new index name
+@param trx transaction
+@return DB_SUCCESS or error code */
+dberr_t dict_stats_rename_index(const char *db, const char *table,
+                                const char *old_name, const char *new_name,
+                                trx_t *trx)
+{
+  if (!dict_stats_persistent_storage_check(true))
+    return DB_STATS_DO_NOT_EXIST;
+  pars_info_t *pinfo= pars_info_create();
+
+  pars_info_add_str_literal(pinfo, "db", db);
+  pars_info_add_str_literal(pinfo, "table", table);
+  pars_info_add_str_literal(pinfo, "old", old_name);
+  pars_info_add_str_literal(pinfo, "new", new_name);
+
+  static const char sql[]=
+    "PROCEDURE RENAME_INDEX_IN_STATS() IS\n"
+    "BEGIN\n"
+    "UPDATE \"" INDEX_STATS_NAME "\" SET index_name=:new\n"
+    "WHERE database_name=:db AND table_name=:table AND index_name=:old;\n"
+    "END;\n";
+
+  return dict_stats_exec_sql(pinfo, sql, trx);
+}
+
+/** Delete all persistent statistics for a database.
+@param db database name +@param trx transaction +@return DB_SUCCESS or error code */ +dberr_t dict_stats_delete(const char *db, trx_t *trx) +{ + static const char sql[] = + "PROCEDURE DROP_DATABASE_STATS () IS\n" + "BEGIN\n" + "DELETE FROM \"" TABLE_STATS_NAME "\" WHERE database_name=:db;\n" + "DELETE FROM \"" INDEX_STATS_NAME "\" WHERE database_name=:db;\n" + "END;\n"; + + pars_info_t *pinfo= pars_info_create(); + pars_info_add_str_literal(pinfo, "db", db); + return dict_stats_exec_sql(pinfo, sql, trx); +} + +/* tests @{ */ +#ifdef UNIV_ENABLE_UNIT_TEST_DICT_STATS +/* save/fetch aux macros @{ */ +#define TEST_DATABASE_NAME "foobardb" +#define TEST_TABLE_NAME "test_dict_stats" + +#define TEST_N_ROWS 111 +#define TEST_CLUSTERED_INDEX_SIZE 222 +#define TEST_SUM_OF_OTHER_INDEX_SIZES 333 + +#define TEST_IDX1_NAME "tidx1" +#define TEST_IDX1_COL1_NAME "tidx1_col1" +#define TEST_IDX1_INDEX_SIZE 123 +#define TEST_IDX1_N_LEAF_PAGES 234 +#define TEST_IDX1_N_DIFF1 50 +#define TEST_IDX1_N_DIFF1_SAMPLE_SIZE 500 + +#define TEST_IDX2_NAME "tidx2" +#define TEST_IDX2_COL1_NAME "tidx2_col1" +#define TEST_IDX2_COL2_NAME "tidx2_col2" +#define TEST_IDX2_COL3_NAME "tidx2_col3" +#define TEST_IDX2_COL4_NAME "tidx2_col4" +#define TEST_IDX2_INDEX_SIZE 321 +#define TEST_IDX2_N_LEAF_PAGES 432 +#define TEST_IDX2_N_DIFF1 60 +#define TEST_IDX2_N_DIFF1_SAMPLE_SIZE 600 +#define TEST_IDX2_N_DIFF2 61 +#define TEST_IDX2_N_DIFF2_SAMPLE_SIZE 610 +#define TEST_IDX2_N_DIFF3 62 +#define TEST_IDX2_N_DIFF3_SAMPLE_SIZE 620 +#define TEST_IDX2_N_DIFF4 63 +#define TEST_IDX2_N_DIFF4_SAMPLE_SIZE 630 +/* @} */ + +/* test_dict_stats_save() @{ */ +void +test_dict_stats_save() +{ + dict_table_t table; + dict_index_t index1; + dict_field_t index1_fields[1]; + ib_uint64_t index1_stat_n_diff_key_vals[1]; + ib_uint64_t index1_stat_n_sample_sizes[1]; + dict_index_t index2; + dict_field_t index2_fields[4]; + ib_uint64_t index2_stat_n_diff_key_vals[4]; + ib_uint64_t index2_stat_n_sample_sizes[4]; + dberr_t ret; + + /* craft a dummy dict_table_t */ + table.name.m_name = (char*) (TEST_DATABASE_NAME "/" TEST_TABLE_NAME); + table.stat_n_rows = TEST_N_ROWS; + table.stat_clustered_index_size = TEST_CLUSTERED_INDEX_SIZE; + table.stat_sum_of_other_index_sizes = TEST_SUM_OF_OTHER_INDEX_SIZES; + UT_LIST_INIT(table.indexes, &dict_index_t::indexes); +#ifdef BTR_CUR_HASH_ADAPT + UT_LIST_INIT(table.freed_indexes, &dict_index_t::indexes); +#endif /* BTR_CUR_HASH_ADAPT */ + UT_LIST_ADD_LAST(table.indexes, &index1); + UT_LIST_ADD_LAST(table.indexes, &index2); + ut_d(table.magic_n = DICT_TABLE_MAGIC_N); + ut_d(index1.magic_n = DICT_INDEX_MAGIC_N); + + index1.name = TEST_IDX1_NAME; + index1.table = &table; + index1.cached = 1; + index1.n_uniq = 1; + index1.fields = index1_fields; + index1.stat_n_diff_key_vals = index1_stat_n_diff_key_vals; + index1.stat_n_sample_sizes = index1_stat_n_sample_sizes; + index1.stat_index_size = TEST_IDX1_INDEX_SIZE; + index1.stat_n_leaf_pages = TEST_IDX1_N_LEAF_PAGES; + index1_fields[0].name = TEST_IDX1_COL1_NAME; + index1_stat_n_diff_key_vals[0] = TEST_IDX1_N_DIFF1; + index1_stat_n_sample_sizes[0] = TEST_IDX1_N_DIFF1_SAMPLE_SIZE; + + ut_d(index2.magic_n = DICT_INDEX_MAGIC_N); + index2.name = TEST_IDX2_NAME; + index2.table = &table; + index2.cached = 1; + index2.n_uniq = 4; + index2.fields = index2_fields; + index2.stat_n_diff_key_vals = index2_stat_n_diff_key_vals; + index2.stat_n_sample_sizes = index2_stat_n_sample_sizes; + index2.stat_index_size = TEST_IDX2_INDEX_SIZE; + index2.stat_n_leaf_pages = TEST_IDX2_N_LEAF_PAGES; + 
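	/* Note: with n_uniq = 4, dict_stats_save() is expected to write
+	four rows n_diff_pfx01 .. n_diff_pfx04 into the index stats
+	table for this index, one per key prefix; the SELECTs printed
+	below check exactly those rows. */
+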
index2_fields[0].name = TEST_IDX2_COL1_NAME; + index2_fields[1].name = TEST_IDX2_COL2_NAME; + index2_fields[2].name = TEST_IDX2_COL3_NAME; + index2_fields[3].name = TEST_IDX2_COL4_NAME; + index2_stat_n_diff_key_vals[0] = TEST_IDX2_N_DIFF1; + index2_stat_n_diff_key_vals[1] = TEST_IDX2_N_DIFF2; + index2_stat_n_diff_key_vals[2] = TEST_IDX2_N_DIFF3; + index2_stat_n_diff_key_vals[3] = TEST_IDX2_N_DIFF4; + index2_stat_n_sample_sizes[0] = TEST_IDX2_N_DIFF1_SAMPLE_SIZE; + index2_stat_n_sample_sizes[1] = TEST_IDX2_N_DIFF2_SAMPLE_SIZE; + index2_stat_n_sample_sizes[2] = TEST_IDX2_N_DIFF3_SAMPLE_SIZE; + index2_stat_n_sample_sizes[3] = TEST_IDX2_N_DIFF4_SAMPLE_SIZE; + + ret = dict_stats_save(&table, NULL); + + ut_a(ret == DB_SUCCESS); + + printf("\nOK: stats saved successfully, now go ahead and read" + " what's inside %s and %s:\n\n", + TABLE_STATS_NAME_PRINT, + INDEX_STATS_NAME_PRINT); + + printf("SELECT COUNT(*) = 1 AS table_stats_saved_successfully\n" + "FROM %s\n" + "WHERE\n" + "database_name = '%s' AND\n" + "table_name = '%s' AND\n" + "n_rows = %d AND\n" + "clustered_index_size = %d AND\n" + "sum_of_other_index_sizes = %d;\n" + "\n", + TABLE_STATS_NAME_PRINT, + TEST_DATABASE_NAME, + TEST_TABLE_NAME, + TEST_N_ROWS, + TEST_CLUSTERED_INDEX_SIZE, + TEST_SUM_OF_OTHER_INDEX_SIZES); + + printf("SELECT COUNT(*) = 3 AS tidx1_stats_saved_successfully\n" + "FROM %s\n" + "WHERE\n" + "database_name = '%s' AND\n" + "table_name = '%s' AND\n" + "index_name = '%s' AND\n" + "(\n" + " (stat_name = 'size' AND stat_value = %d AND" + " sample_size IS NULL) OR\n" + " (stat_name = 'n_leaf_pages' AND stat_value = %d AND" + " sample_size IS NULL) OR\n" + " (stat_name = 'n_diff_pfx01' AND stat_value = %d AND" + " sample_size = '%d' AND stat_description = '%s')\n" + ");\n" + "\n", + INDEX_STATS_NAME_PRINT, + TEST_DATABASE_NAME, + TEST_TABLE_NAME, + TEST_IDX1_NAME, + TEST_IDX1_INDEX_SIZE, + TEST_IDX1_N_LEAF_PAGES, + TEST_IDX1_N_DIFF1, + TEST_IDX1_N_DIFF1_SAMPLE_SIZE, + TEST_IDX1_COL1_NAME); + + printf("SELECT COUNT(*) = 6 AS tidx2_stats_saved_successfully\n" + "FROM %s\n" + "WHERE\n" + "database_name = '%s' AND\n" + "table_name = '%s' AND\n" + "index_name = '%s' AND\n" + "(\n" + " (stat_name = 'size' AND stat_value = %d AND" + " sample_size IS NULL) OR\n" + " (stat_name = 'n_leaf_pages' AND stat_value = %d AND" + " sample_size IS NULL) OR\n" + " (stat_name = 'n_diff_pfx01' AND stat_value = %d AND" + " sample_size = '%d' AND stat_description = '%s') OR\n" + " (stat_name = 'n_diff_pfx02' AND stat_value = %d AND" + " sample_size = '%d' AND stat_description = '%s,%s') OR\n" + " (stat_name = 'n_diff_pfx03' AND stat_value = %d AND" + " sample_size = '%d' AND stat_description = '%s,%s,%s') OR\n" + " (stat_name = 'n_diff_pfx04' AND stat_value = %d AND" + " sample_size = '%d' AND stat_description = '%s,%s,%s,%s')\n" + ");\n" + "\n", + INDEX_STATS_NAME_PRINT, + TEST_DATABASE_NAME, + TEST_TABLE_NAME, + TEST_IDX2_NAME, + TEST_IDX2_INDEX_SIZE, + TEST_IDX2_N_LEAF_PAGES, + TEST_IDX2_N_DIFF1, + TEST_IDX2_N_DIFF1_SAMPLE_SIZE, TEST_IDX2_COL1_NAME, + TEST_IDX2_N_DIFF2, + TEST_IDX2_N_DIFF2_SAMPLE_SIZE, + TEST_IDX2_COL1_NAME, TEST_IDX2_COL2_NAME, + TEST_IDX2_N_DIFF3, + TEST_IDX2_N_DIFF3_SAMPLE_SIZE, + TEST_IDX2_COL1_NAME, TEST_IDX2_COL2_NAME, TEST_IDX2_COL3_NAME, + TEST_IDX2_N_DIFF4, + TEST_IDX2_N_DIFF4_SAMPLE_SIZE, + TEST_IDX2_COL1_NAME, TEST_IDX2_COL2_NAME, TEST_IDX2_COL3_NAME, + TEST_IDX2_COL4_NAME); +} +/* @} */ + +/* test_dict_stats_fetch_from_ps() @{ */ +void +test_dict_stats_fetch_from_ps() +{ + dict_table_t table; + dict_index_t index1; + 
ib_uint64_t index1_stat_n_diff_key_vals[1]; + ib_uint64_t index1_stat_n_sample_sizes[1]; + dict_index_t index2; + ib_uint64_t index2_stat_n_diff_key_vals[4]; + ib_uint64_t index2_stat_n_sample_sizes[4]; + dberr_t ret; + + /* craft a dummy dict_table_t */ + table.name.m_name = (char*) (TEST_DATABASE_NAME "/" TEST_TABLE_NAME); + UT_LIST_INIT(table.indexes, &dict_index_t::indexes); +#ifdef BTR_CUR_HASH_ADAPT + UT_LIST_INIT(table.freed_indexes, &dict_index_t::indexes); +#endif /* BTR_CUR_HASH_ADAPT */ + UT_LIST_ADD_LAST(table.indexes, &index1); + UT_LIST_ADD_LAST(table.indexes, &index2); + ut_d(table.magic_n = DICT_TABLE_MAGIC_N); + + index1.name = TEST_IDX1_NAME; + ut_d(index1.magic_n = DICT_INDEX_MAGIC_N); + index1.cached = 1; + index1.n_uniq = 1; + index1.stat_n_diff_key_vals = index1_stat_n_diff_key_vals; + index1.stat_n_sample_sizes = index1_stat_n_sample_sizes; + + index2.name = TEST_IDX2_NAME; + ut_d(index2.magic_n = DICT_INDEX_MAGIC_N); + index2.cached = 1; + index2.n_uniq = 4; + index2.stat_n_diff_key_vals = index2_stat_n_diff_key_vals; + index2.stat_n_sample_sizes = index2_stat_n_sample_sizes; + + ret = dict_stats_fetch_from_ps(&table); + + ut_a(ret == DB_SUCCESS); + + ut_a(table.stat_n_rows == TEST_N_ROWS); + ut_a(table.stat_clustered_index_size == TEST_CLUSTERED_INDEX_SIZE); + ut_a(table.stat_sum_of_other_index_sizes + == TEST_SUM_OF_OTHER_INDEX_SIZES); + + ut_a(index1.stat_index_size == TEST_IDX1_INDEX_SIZE); + ut_a(index1.stat_n_leaf_pages == TEST_IDX1_N_LEAF_PAGES); + ut_a(index1_stat_n_diff_key_vals[0] == TEST_IDX1_N_DIFF1); + ut_a(index1_stat_n_sample_sizes[0] == TEST_IDX1_N_DIFF1_SAMPLE_SIZE); + + ut_a(index2.stat_index_size == TEST_IDX2_INDEX_SIZE); + ut_a(index2.stat_n_leaf_pages == TEST_IDX2_N_LEAF_PAGES); + ut_a(index2_stat_n_diff_key_vals[0] == TEST_IDX2_N_DIFF1); + ut_a(index2_stat_n_sample_sizes[0] == TEST_IDX2_N_DIFF1_SAMPLE_SIZE); + ut_a(index2_stat_n_diff_key_vals[1] == TEST_IDX2_N_DIFF2); + ut_a(index2_stat_n_sample_sizes[1] == TEST_IDX2_N_DIFF2_SAMPLE_SIZE); + ut_a(index2_stat_n_diff_key_vals[2] == TEST_IDX2_N_DIFF3); + ut_a(index2_stat_n_sample_sizes[2] == TEST_IDX2_N_DIFF3_SAMPLE_SIZE); + ut_a(index2_stat_n_diff_key_vals[3] == TEST_IDX2_N_DIFF4); + ut_a(index2_stat_n_sample_sizes[3] == TEST_IDX2_N_DIFF4_SAMPLE_SIZE); + + printf("OK: fetch successful\n"); +} +/* @} */ + +/* test_dict_stats_all() @{ */ +void +test_dict_stats_all() +{ + test_dict_table_schema_check(); + + test_dict_stats_save(); + + test_dict_stats_fetch_from_ps(); +} +/* @} */ + +#endif /* UNIV_ENABLE_UNIT_TEST_DICT_STATS */ +/* @} */ diff --git a/storage/innobase/dict/dict0stats_bg.cc b/storage/innobase/dict/dict0stats_bg.cc new file mode 100644 index 00000000..a66aac22 --- /dev/null +++ b/storage/innobase/dict/dict0stats_bg.cc @@ -0,0 +1,424 @@ +/***************************************************************************** + +Copyright (c) 2012, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file dict/dict0stats_bg.cc
+Code used for background table and index stats gathering.
+
+Created Apr 25, 2012 Vasil Dimov
+*******************************************************/
+
+#include "dict0dict.h"
+#include "dict0stats.h"
+#include "dict0stats_bg.h"
+#include "dict0defrag_bg.h"
+#include "row0mysql.h"
+#include "srv0start.h"
+#include "fil0fil.h"
+#include "mysqld.h"
+#ifdef WITH_WSREP
+# include "trx0trx.h"
+# include "mysql/service_wsrep.h"
+# include "wsrep.h"
+# include "log.h"
+#endif
+
+#include <vector>
+
+/** Minimum time interval between stats recalc for a given table */
+#define MIN_RECALC_INTERVAL 10 /* seconds */
+static void dict_stats_schedule(int ms);
+
+/** Protects recalc_pool */
+static mysql_mutex_t recalc_pool_mutex;
+
+/** for signaling recalc::state */
+static pthread_cond_t recalc_pool_cond;
+
+/** Work item of the recalc_pool; protected by recalc_pool_mutex */
+struct recalc
+{
+  /** identifies a table with persistent statistics */
+  table_id_t id;
+  /** state of the entry */
+  enum { IDLE, IN_PROGRESS, IN_PROGRESS_DELETING, DELETING} state;
+};
+
+/** The multitude of tables whose stats are to be automatically recalculated */
+typedef std::vector<recalc, ut_allocator<recalc>> recalc_pool_t;
+
+/** Pool where we store information on which tables are to be processed
+by background statistics gathering. */
+static recalc_pool_t recalc_pool;
+/** Whether the global data structures have been initialized */
+static bool stats_initialised;
+
+/*****************************************************************//**
+Free the resources occupied by the recalc pool, called once during
+thread de-initialization. */
+static void dict_stats_recalc_pool_deinit()
+{
+  ut_ad(!srv_read_only_mode);
+
+  recalc_pool.clear();
+  defrag_pool.clear();
+  /*
+    recalc_pool may still have its buffer allocated. It will free it when
+    its destructor is called.
+    The problem is, memory leak detector is run before the recalc_pool's
+    destructor is invoked, and will report recalc_pool's buffer as leaked
+    memory. To avoid that, we force recalc_pool to surrender its buffer
+    to empty_pool object, which will free it when leaving this function:
+  */
+  recalc_pool_t recalc_empty_pool;
+  defrag_pool_t defrag_empty_pool;
+  recalc_pool.swap(recalc_empty_pool);
+  defrag_pool.swap(defrag_empty_pool);
+}
+
+/*****************************************************************//**
+Add a table to the recalc pool, which is processed by the
+background stats gathering thread. Only the table id is added to the
+list, so the table can be closed after being enqueued and it will be
+opened when needed. If the table does not exist later (has been DROPped),
+then it will be removed from the pool and skipped.
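+
+An illustrative enqueue/consume sequence (a sketch using names from
+this file only):
+
+	dict_stats_recalc_pool_add(table->id);	-- store the id only
+	... the dict_table_t object may be closed or evicted here ...
+	dict_stats_process_entry_from_recalc_pool() later reopens the
+	table via dict_table_open_on_id() and recalculates its
+	persistent statistics.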
 */
+static void dict_stats_recalc_pool_add(table_id_t id)
+{
+  ut_ad(!srv_read_only_mode);
+  ut_ad(id);
+  bool schedule = false;
+  mysql_mutex_lock(&recalc_pool_mutex);
+
+  const auto begin= recalc_pool.begin(), end= recalc_pool.end();
+  if (end == std::find_if(begin, end, [&](const recalc &r){return r.id == id;}))
+  {
+    recalc_pool.emplace_back(recalc{id, recalc::IDLE});
+    schedule = true;
+  }
+
+  mysql_mutex_unlock(&recalc_pool_mutex);
+  if (schedule)
+    dict_stats_schedule_now();
+}
+
+#ifdef WITH_WSREP
+/** Update the table modification counter and if necessary,
+schedule new estimates for table and index statistics to be calculated.
+@param[in,out] table persistent or temporary table
+@param[in] trx transaction */
+void dict_stats_update_if_needed(dict_table_t *table, const trx_t &trx)
+#else
+/** Update the table modification counter and if necessary,
+schedule new estimates for table and index statistics to be calculated.
+@param[in,out] table persistent or temporary table */
+void dict_stats_update_if_needed_func(dict_table_t *table)
+#endif
+{
+	if (UNIV_UNLIKELY(!table->stat_initialized)) {
+		/* The table may have been evicted from dict_sys
+		and reloaded internally by InnoDB for FOREIGN KEY
+		processing, but not reloaded by the SQL layer.
+
+		We can (re)compute the transient statistics when the
+		table is actually loaded by the SQL layer.
+
+		Note: If InnoDB persistent statistics are enabled,
+		we will skip the updates. We must do this, because
+		dict_table_get_n_rows() below assumes that the
+		statistics have been initialized. The DBA may have
+		to execute ANALYZE TABLE. */
+		return;
+	}
+
+	ulonglong counter = table->stat_modified_counter++;
+	ulonglong n_rows = dict_table_get_n_rows(table);
+
+	if (dict_stats_is_persistent_enabled(table)) {
+		if (table->name.is_temporary()) {
+			return;
+		}
+		if (counter > n_rows / 10 /* 10% */
+		    && dict_stats_auto_recalc_is_enabled(table)) {
+
+#ifdef WITH_WSREP
+			/* Do not add the table to the background
+			statistics calculation if this thread is not
+			an applier but is BF (all DDL that is
+			replicated, i.e. binlogged on the master node,
+			is executed with high priority, a.k.a. BF, on
+			slave nodes). This could again lead to BF lock
+			waits on the applier node, but that is better
+			than having no persistent index/table
+			statistics on applier nodes. TODO: allow BF
+			threads to wait for these InnoDB internal
+			SQL-parser generated row locks and allow BF
+			thread lock waits to be enqueued at the head
+			of the waiting queue. */
+			if (trx.is_wsrep()
+			    && !wsrep_thd_is_applying(trx.mysql_thd)
+			    && wsrep_thd_is_BF(trx.mysql_thd, 0)) {
+				WSREP_DEBUG("Avoiding background statistics"
+					    " calculation for table %s.",
+					    table->name.m_name);
+				return;
+			}
+#endif /* WITH_WSREP */
+
+			dict_stats_recalc_pool_add(table->id);
+			table->stat_modified_counter = 0;
+		}
+		return;
+	}
+
+	/* Calculate new statistics if 1 / 16 of table has been modified
+	since the last time a statistics batch was run.
+	We calculate statistics at most every 16th round, since we may have
+	a counter table which is very small and updated very often. */
+	ulonglong threshold = 16 + n_rows / 16; /* 6.25% */
+
+	if (srv_stats_modified_counter) {
+		threshold = std::min(srv_stats_modified_counter, threshold);
+	}
+
+	if (counter > threshold) {
+		/* this will reset table->stat_modified_counter to 0 */
+		dict_stats_update(table, DICT_STATS_RECALC_TRANSIENT);
+	}
+}
+
+/** Delete a table from the auto recalc pool, and ensure that
+no statistics are being updated on it.
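+
+State handshake, as implemented below (sketch):
+
+	IDLE                  -> entry is erased immediately
+	IN_PROGRESS           -> set to IN_PROGRESS_DELETING; unless we
+	                         hold exclusive MDL, wait on
+	                         recalc_pool_cond until the stats thread
+	                         acknowledges with DELETING
+	IN_PROGRESS_DELETING,
+	DELETING              -> some other thread finishes the deletion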
 */
+void dict_stats_recalc_pool_del(table_id_t id, bool have_mdl_exclusive)
+{
+  ut_ad(!srv_read_only_mode);
+  ut_ad(id);
+
+  mysql_mutex_lock(&recalc_pool_mutex);
+
+  auto end= recalc_pool.end();
+  auto i= std::find_if(recalc_pool.begin(), end,
+                       [&](const recalc &r){return r.id == id;});
+  if (i != end)
+  {
+    switch (i->state) {
+    case recalc::IN_PROGRESS:
+      if (!have_mdl_exclusive)
+      {
+        i->state= recalc::IN_PROGRESS_DELETING;
+        do
+        {
+          my_cond_wait(&recalc_pool_cond, &recalc_pool_mutex.m_mutex);
+          end= recalc_pool.end();
+          i= std::find_if(recalc_pool.begin(), end,
+                          [&](const recalc &r){return r.id == id;});
+          if (i == end)
+            goto done;
+        }
+        while (i->state == recalc::IN_PROGRESS_DELETING);
+      }
+      /* fall through */
+    case recalc::IDLE:
+      recalc_pool.erase(i);
+      break;
+    case recalc::IN_PROGRESS_DELETING:
+    case recalc::DELETING:
+      /* another thread will delete the entry in dict_stats_recalc_pool_del() */
+      break;
+    }
+  }
+
+done:
+  mysql_mutex_unlock(&recalc_pool_mutex);
+}
+
+/*****************************************************************//**
+Initialize global variables needed for the operation of dict_stats_thread().
+Must be called before dict_stats_thread() is started. */
+void dict_stats_init()
+{
+  ut_ad(!srv_read_only_mode);
+  mysql_mutex_init(recalc_pool_mutex_key, &recalc_pool_mutex, nullptr);
+  pthread_cond_init(&recalc_pool_cond, nullptr);
+  dict_defrag_pool_init();
+  stats_initialised= true;
+}
+
+/*****************************************************************//**
+Free resources allocated by dict_stats_init(), must be called
+after the dict_stats task has exited. */
+void dict_stats_deinit()
+{
+	if (!stats_initialised) {
+		return;
+	}
+
+	ut_ad(!srv_read_only_mode);
+	stats_initialised = false;
+
+	dict_stats_recalc_pool_deinit();
+	dict_defrag_pool_deinit();
+
+	mysql_mutex_destroy(&recalc_pool_mutex);
+	pthread_cond_destroy(&recalc_pool_cond);
+}
+
+/**
+Get the first table that has been added for auto recalc and eventually
+update its stats.
+@return whether the first entry can be processed immediately */
+static bool dict_stats_process_entry_from_recalc_pool(THD *thd)
+{
+  ut_ad(!srv_read_only_mode);
+  table_id_t table_id;
+  mysql_mutex_lock(&recalc_pool_mutex);
+next_table_id_with_mutex:
+  for (auto &r : recalc_pool)
+  {
+    if ((table_id= r.id) && r.state == recalc::IDLE)
+    {
+      r.state= recalc::IN_PROGRESS;
+      mysql_mutex_unlock(&recalc_pool_mutex);
+      goto process;
+    }
+  }
+  mysql_mutex_unlock(&recalc_pool_mutex);
+  return false;
+
+process:
+  MDL_ticket *mdl= nullptr;
+  dict_table_t *table= dict_table_open_on_id(table_id, false,
+                                             DICT_TABLE_OP_NORMAL, thd, &mdl);
+  if (!table)
+  {
+invalid_table_id:
+    mysql_mutex_lock(&recalc_pool_mutex);
+    auto i= std::find_if(recalc_pool.begin(), recalc_pool.end(),
+                         [&](const recalc &r){return r.id == table_id;});
+    if (i == recalc_pool.end());
+    else if (UNIV_LIKELY(i->state == recalc::IN_PROGRESS))
+      recalc_pool.erase(i);
+    else
+    {
+      ut_ad(i->state == recalc::IN_PROGRESS_DELETING);
+      i->state= recalc::DELETING;
+      pthread_cond_broadcast(&recalc_pool_cond);
+    }
+    goto next_table_id_with_mutex;
+  }
+
+  ut_ad(!table->is_temporary());
+
+  if (!mdl || !table->is_accessible())
+  {
+    dict_table_close(table, false, thd, mdl);
+    goto invalid_table_id;
+  }
+
+  /* time() could be expensive; the current function is called once
+  every time a table has been changed by more than 10%, and on a
+  system with lots of small tables this could become hot.
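+  For example (summarizing the logic below): with
+  MIN_RECALC_INTERVAL = 10, a continuously modified table is
+  recalculated at most once every 10 seconds; an entry that fires too
+  early is re-queued as IDLE and the timer is re-armed with a
+  MIN_RECALC_INTERVAL * 1000 ms delay.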
+  If we find out that this is a problem, then the check below could
+  eventually be replaced with something else, though a time interval is
+  the natural approach. */
+  const bool update_now=
+    difftime(time(nullptr), table->stats_last_recalc) >= MIN_RECALC_INTERVAL;
+
+  const dberr_t err= update_now
+    ? dict_stats_update(table, DICT_STATS_RECALC_PERSISTENT)
+    : DB_SUCCESS_LOCKED_REC;
+
+  dict_table_close(table, false, thd, mdl);
+
+  mysql_mutex_lock(&recalc_pool_mutex);
+  auto i= std::find_if(recalc_pool.begin(), recalc_pool.end(),
+                       [&](const recalc &r){return r.id == table_id;});
+  if (i == recalc_pool.end())
+    goto done;
+  else if (i->state == recalc::IN_PROGRESS_DELETING)
+  {
+    i->state= recalc::DELETING;
+    pthread_cond_broadcast(&recalc_pool_cond);
+done:
+    mysql_mutex_unlock(&recalc_pool_mutex);
+  }
+  else
+  {
+    ut_ad(i->state == recalc::IN_PROGRESS);
+    recalc_pool.erase(i);
+    const bool reschedule= !update_now && recalc_pool.empty();
+    if (err == DB_SUCCESS_LOCKED_REC)
+      recalc_pool.emplace_back(recalc{table_id, recalc::IDLE});
+    mysql_mutex_unlock(&recalc_pool_mutex);
+    if (reschedule)
+      dict_stats_schedule(MIN_RECALC_INTERVAL * 1000);
+  }
+
+  return update_now;
+}
+
+static tpool::timer* dict_stats_timer;
+static std::mutex dict_stats_mutex;
+
+static void dict_stats_func(void*)
+{
+  THD *thd= innobase_create_background_thd("InnoDB statistics");
+  set_current_thd(thd);
+  while (dict_stats_process_entry_from_recalc_pool(thd)) {}
+  dict_defrag_process_entries_from_defrag_pool(thd);
+  set_current_thd(nullptr);
+  destroy_background_thd(thd);
+}
+
+
+void dict_stats_start()
+{
+  std::lock_guard<std::mutex> lk(dict_stats_mutex);
+  if (!dict_stats_timer)
+    dict_stats_timer= srv_thread_pool->create_timer(dict_stats_func);
+}
+
+
+static void dict_stats_schedule(int ms)
+{
+  std::unique_lock<std::mutex> lk(dict_stats_mutex, std::defer_lock);
+  /*
+    Use try_lock() to avoid a deadlock in dict_stats_shutdown(), which
+    uses dict_stats_mutex too. If there is a simultaneous timer
+    reschedule, the first one will win, which is fine.
+  */
+  if (!lk.try_lock())
+  {
+    return;
+  }
+  if (dict_stats_timer)
+    dict_stats_timer->set_time(ms,0);
+}
+
+void dict_stats_schedule_now()
+{
+  dict_stats_schedule(0);
+}
+
+/** Shut down the dict_stats_thread. */
+void dict_stats_shutdown()
+{
+  std::lock_guard<std::mutex> lk(dict_stats_mutex);
+  delete dict_stats_timer;
+  dict_stats_timer= 0;
+}
diff --git a/storage/innobase/dict/drop.cc b/storage/innobase/dict/drop.cc
new file mode 100644
index 00000000..dce71974
--- /dev/null
+++ b/storage/innobase/dict/drop.cc
@@ -0,0 +1,297 @@
+/*****************************************************************************
+
+Copyright (c) 2021, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**
+@file dict/drop.cc
+Data Dictionary Language operations that delete .ibd files */
+
+/* We implement atomic data dictionary operations as follows.
+
+1. A data dictionary transaction is started.
+2. We acquire exclusive lock on all the tables that are to be dropped
+during the execution of the transaction.
+3. We lock the data dictionary cache.
+4. All metadata tables will be updated within the single DDL transaction,
+including deleting or renaming InnoDB persistent statistics.
+4b. If any lock wait would occur while we are holding the dict_sys latches,
+we will instantly report a timeout error and roll back the transaction.
+5. The transaction metadata is marked as committed.
+6. If any files were deleted, we will durably write FILE_DELETE
+to the redo log and start deleting the files.
+6b. Purge may also perform file deletion after a commit. This is also the
+recovery mechanism if the server was killed between step 5 and 6.
+7. We unlock the data dictionary cache.
+8. The file handles of the unlinked files will be closed. This will actually
+reclaim the space in the file system (delete-on-close semantics).
+
+Notes:
+
+(a) Purge will be locked out by MDL. For internal tables related to
+FULLTEXT INDEX, purge will not acquire MDL on the user table name,
+and therefore, when we are dropping any FTS_ tables, we must suspend
+and resume purge to prevent a race condition.
+
+(b) If a transaction needs to both drop and create a table by some
+name, it must rename the table in between. This is used by
+ha_innobase::truncate() and fts_drop_common_tables().
+
+(c) No data is ever destroyed before the transaction is committed,
+so we can trivially roll back the transaction at any time.
+Lock waits during a DDL operation are no longer a fatal error
+that would cause InnoDB to hang or to intentionally crash.
+(Only ALTER TABLE...DISCARD TABLESPACE may discard data before commit.)
+
+(d) The only changes to the data dictionary cache that are performed
+before transaction commit and must be rolled back explicitly are as follows:
+(d1) fts_optimize_add_table() to undo fts_optimize_remove_table()
+*/
+
+#include "trx0purge.h"
+#include "dict0dict.h"
+#include "dict0stats.h"
+#include "dict0stats_bg.h"
+
+#include "dict0defrag_bg.h"
+#include "btr0defragment.h"
+#include "ibuf0ibuf.h"
+#include "lock0lock.h"
+
+#include "que0que.h"
+#include "pars0pars.h"
+
+/** Try to drop the foreign key constraints for a persistent table.
+@param name name of persistent table
+@return error code */
+dberr_t trx_t::drop_table_foreign(const table_name_t &name)
+{
+  ut_ad(dict_sys.locked());
+  ut_ad(state == TRX_STATE_ACTIVE);
+  ut_ad(dict_operation);
+  ut_ad(dict_operation_lock_mode);
+
+  if (!dict_sys.sys_foreign || dict_sys.sys_foreign->corrupted)
+    return DB_SUCCESS;
+
+  if (!dict_sys.sys_foreign_cols || dict_sys.sys_foreign_cols->corrupted)
+    return DB_SUCCESS;
+
+  pars_info_t *info= pars_info_create();
+  pars_info_add_str_literal(info, "name", name.m_name);
+  return que_eval_sql(info,
+                      "PROCEDURE DROP_FOREIGN() IS\n"
+                      "fid CHAR;\n"
+
+                      "DECLARE CURSOR fk IS\n"
+                      "SELECT ID FROM SYS_FOREIGN\n"
+                      "WHERE FOR_NAME=:name\n"
+                      "AND TO_BINARY(FOR_NAME)=TO_BINARY(:name)\n"
+                      "FOR UPDATE;\n"
+
+                      "BEGIN\n"
+                      "OPEN fk;\n"
+                      "WHILE 1=1 LOOP\n"
+                      "  FETCH fk INTO fid;\n"
+                      "  IF (SQL % NOTFOUND)THEN RETURN;END IF;\n"
+                      "  DELETE FROM SYS_FOREIGN_COLS"
+                      " WHERE ID=fid;\n"
+                      "  DELETE FROM SYS_FOREIGN WHERE ID=fid;\n"
+                      "END LOOP;\n"
+                      "CLOSE fk;\n"
+                      "END;\n", this);
+}
+
+/** Try to drop the statistics for a persistent table.
+@param name name of persistent table
+@return error code */
+dberr_t trx_t::drop_table_statistics(const table_name_t &name)
+{
+  ut_ad(dict_sys.locked());
+  ut_ad(dict_operation_lock_mode);
+
+  if (strstr(name.m_name, "/" TEMP_FILE_PREFIX_INNODB) ||
+      !strcmp(name.m_name, TABLE_STATS_NAME) ||
+      !strcmp(name.m_name, INDEX_STATS_NAME))
+    return DB_SUCCESS;
+
+  char db[MAX_DB_UTF8_LEN], table[MAX_TABLE_UTF8_LEN];
+  dict_fs2utf8(name.m_name, db, sizeof db, table, sizeof table);
+
+  dberr_t err= dict_stats_delete_from_table_stats(db, table, this);
+  if (err == DB_SUCCESS || err == DB_STATS_DO_NOT_EXIST)
+  {
+    err= dict_stats_delete_from_index_stats(db, table, this);
+    if (err == DB_STATS_DO_NOT_EXIST)
+      err= DB_SUCCESS;
+  }
+  return err;
+}
+
+/** Try to drop a persistent table.
+@param table persistent table
+@return error code */
+dberr_t trx_t::drop_table(const dict_table_t &table)
+{
+  ut_ad(dict_sys.locked());
+  ut_ad(state == TRX_STATE_ACTIVE);
+  ut_ad(dict_operation);
+  ut_ad(dict_operation_lock_mode);
+  ut_ad(!table.is_temporary());
+  /* The table must be exclusively locked by this transaction. */
+  ut_ad(table.get_ref_count() <= 1);
+  ut_ad(table.n_lock_x_or_s == 1);
+  ut_ad(UT_LIST_GET_LEN(table.locks) >= 1);
+#ifdef UNIV_DEBUG
+  bool found_x= false;
+  for (lock_t *lock= UT_LIST_GET_FIRST(table.locks); lock;
+       lock= UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock))
+  {
+    ut_ad(lock->trx == this);
+    switch (lock->type_mode) {
+    case LOCK_TABLE | LOCK_X:
+      found_x= true;
+      break;
+    case LOCK_TABLE | LOCK_IX:
+    case LOCK_TABLE | LOCK_AUTO_INC:
+      break;
+    default:
+      ut_ad("unexpected lock type" == 0);
+    }
+  }
+  ut_ad(found_x);
+#endif
+
+  if (dict_sys.sys_virtual && !dict_sys.sys_virtual->corrupted)
+  {
+    pars_info_t *info= pars_info_create();
+    pars_info_add_ull_literal(info, "id", table.id);
+    if (dberr_t err= que_eval_sql(info,
+                                  "PROCEDURE DROP_VIRTUAL() IS\n"
+                                  "BEGIN\n"
+                                  "DELETE FROM SYS_VIRTUAL"
+                                  " WHERE TABLE_ID=:id;\n"
+                                  "END;\n", this))
+      return err;
+  }
+
+  /* Once DELETE FROM SYS_INDEXES is committed, purge may invoke
+  dict_drop_index_tree(). */
+
+  if (!(table.flags2 & (DICT_TF2_FTS_HAS_DOC_ID | DICT_TF2_FTS)));
+  else if (dberr_t err= fts_drop_tables(this, table))
+  {
+    ib::error() << "Unable to remove FTS tables for "
+                << table.name << ": " << err;
+    return err;
+  }
+
+  mod_tables.emplace(const_cast<dict_table_t*>(&table), undo_no).
+    first->second.set_dropped();
+
+  pars_info_t *info= pars_info_create();
+  pars_info_add_ull_literal(info, "id", table.id);
+  return que_eval_sql(info,
+                      "PROCEDURE DROP_TABLE() IS\n"
+                      "iid CHAR;\n"
+
+                      "DECLARE CURSOR idx IS\n"
+                      "SELECT ID FROM SYS_INDEXES\n"
+                      "WHERE TABLE_ID=:id FOR UPDATE;\n"
+
+                      "BEGIN\n"
+
+                      "DELETE FROM SYS_TABLES WHERE ID=:id;\n"
+                      "DELETE FROM SYS_COLUMNS WHERE TABLE_ID=:id;\n"
+
+                      "OPEN idx;\n"
+                      "WHILE 1 = 1 LOOP\n"
+                      "  FETCH idx INTO iid;\n"
+                      "  IF (SQL % NOTFOUND) THEN EXIT; END IF;\n"
+                      "  DELETE FROM SYS_INDEXES WHERE CURRENT OF idx;\n"
+                      "  DELETE FROM SYS_FIELDS WHERE INDEX_ID=iid;\n"
+                      "END LOOP;\n"
+                      "CLOSE idx;\n"
+
+                      "END;\n", this);
+}
+
+/** Commit the transaction, possibly after drop_table().
+@param deleted handles of data files that were deleted */
+void trx_t::commit(std::vector<pfs_os_file_t> &deleted)
+{
+  ut_ad(dict_operation);
+  flush_log_later= true;
+  commit_persist();
+  flush_log_later= false;
+  if (dict_operation)
+  {
+    std::vector<uint32_t> space_ids;
+    space_ids.reserve(mod_tables.size());
+    ut_ad(dict_sys.locked());
+    lock_sys.wr_lock(SRW_LOCK_CALL);
+    mutex_lock();
+    lock_release_on_drop(this);
+    ut_ad(UT_LIST_GET_LEN(lock.trx_locks) == 0);
+    ut_ad(ib_vector_is_empty(autoinc_locks));
+    mem_heap_empty(lock.lock_heap);
+    lock.table_locks.clear();
+    /* commit_persist() already reset this. */
+    ut_ad(!lock.was_chosen_as_deadlock_victim);
+    lock.n_rec_locks= 0;
+    while (dict_table_t *table= UT_LIST_GET_FIRST(lock.evicted_tables))
+    {
+      UT_LIST_REMOVE(lock.evicted_tables, table);
+      dict_mem_table_free(table);
+    }
+    dict_operation= false;
+    id= 0;
+    mutex_unlock();
+
+    for (const auto &p : mod_tables)
+    {
+      if (p.second.is_dropped())
+      {
+        dict_table_t *table= p.first;
+        dict_stats_recalc_pool_del(table->id, true);
+        dict_stats_defrag_pool_del(table, nullptr);
+        if (btr_defragment_active)
+          btr_defragment_remove_table(table);
+        const fil_space_t *space= table->space;
+        ut_ad(!p.second.is_aux_table() || purge_sys.must_wait_FTS());
+        dict_sys.remove(table);
+        if (const auto id= space ? space->id : 0)
+        {
+          space_ids.emplace_back(id);
+          pfs_os_file_t d= fil_delete_tablespace(id);
+          if (d != OS_FILE_CLOSED)
+            deleted.emplace_back(d);
+        }
+      }
+    }
+
+    lock_sys.wr_unlock();
+
+    mysql_mutex_lock(&lock_sys.wait_mutex);
+    lock_sys.deadlock_check();
+    mysql_mutex_unlock(&lock_sys.wait_mutex);
+
+    for (const auto id : space_ids)
+      ibuf_delete_for_discarded_space(id);
+  }
+  commit_cleanup();
+}
diff --git a/storage/innobase/eval/eval0eval.cc b/storage/innobase/eval/eval0eval.cc
new file mode 100644
index 00000000..bafb0b55
--- /dev/null
+++ b/storage/innobase/eval/eval0eval.cc
@@ -0,0 +1,643 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file eval/eval0eval.cc
+SQL evaluator: evaluates simple data structures, like expressions, in
+a query graph
+
+Created 12/29/1997 Heikki Tuuri
+*******************************************************/
+
+#include "eval0eval.h"
+#include "data0data.h"
+#include "row0sel.h"
+#include "rem0cmp.h"
+
+/** Dummy address used when we should allocate a buffer of size 0 in
+eval_node_alloc_val_buf */
+
+static byte	eval_dummy;
+
+/*************************************************************************
+Gets the like node from the node */
+UNIV_INLINE
+que_node_t*
+que_node_get_like_node(
+/*===================*/
+				/* out: like node */
+	que_node_t*	node)	/* in: node in a list */
+{
+	return(((sym_node_t*) node)->like_node);
+}
+
+/*****************************************************************//**
+Allocate a buffer from global dynamic memory for a value of a que_node.
+NOTE that this memory must be explicitly freed when the query graph is
+freed. If the node already has an allocated buffer, that buffer is freed
+here. NOTE that this is the only function where dynamic memory should be
+allocated for a query node val field.
+@return pointer to allocated buffer */
+byte*
+eval_node_alloc_val_buf(
+/*====================*/
+	que_node_t*	node,	/*!< in: query graph node; sets the val field
+				data field to point to the new buffer, and
+				len field equal to size */
+	ulint		size)	/*!< in: buffer size */
+{
+	dfield_t*	dfield;
+	byte*		data;
+
+	ut_ad(que_node_get_type(node) == QUE_NODE_SYMBOL
+	      || que_node_get_type(node) == QUE_NODE_FUNC);
+
+	dfield = que_node_get_val(node);
+
+	data = static_cast<byte*>(dfield_get_data(dfield));
+
+	if (data != &eval_dummy) {
+		ut_free(data);
+	}
+
+	if (size == 0) {
+		data = &eval_dummy;
+	} else {
+		data = static_cast<byte*>(ut_malloc_nokey(size));
+	}
+
+	que_node_set_val_buf_size(node, size);
+
+	dfield_set_data(dfield, data, size);
+
+	return(data);
+}
+
+/*****************************************************************//**
+Free the buffer from global dynamic memory for a value of a que_node,
+if it has been allocated in the above function. The freeing for pushed
+column values is done in sel_col_prefetch_buf_free. */
+void
+eval_node_free_val_buf(
+/*===================*/
+	que_node_t*	node)	/*!< in: query graph node */
+{
+	dfield_t*	dfield;
+	byte*		data;
+
+	ut_ad(que_node_get_type(node) == QUE_NODE_SYMBOL
+	      || que_node_get_type(node) == QUE_NODE_FUNC);
+
+	dfield = que_node_get_val(node);
+
+	data = static_cast<byte*>(dfield_get_data(dfield));
+
+	if (que_node_get_val_buf_size(node) > 0) {
+		ut_a(data);
+
+		ut_free(data);
+	}
+}
+
+/*********************************************************************
+Evaluates a LIKE comparison node.
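+For context, a sketch of the classification that the parser performs
+(the resulting tag is stored in the like node that is read below):
+
+	col LIKE 'abc'	-> IB_LIKE_EXACT  (plain equality compare)
+	col LIKE 'abc%'	-> IB_LIKE_PREFIX (prefix compare on 'abc')
+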
+@return the result of the comparison */
+UNIV_INLINE
+ibool
+eval_cmp_like(
+/*==========*/
+	que_node_t*	arg1,	/*!< in: left operand */
+	que_node_t*	arg2)	/*!< in: right operand */
+{
+	ib_like_t	op;
+	que_node_t*	arg3;
+	que_node_t*	arg4;
+	const dfield_t*	dfield;
+
+	arg3 = que_node_get_like_node(arg2);
+
+	/* Get the comparison type operator */
+	ut_a(arg3);
+
+	dfield = que_node_get_val(arg3);
+	ut_ad(dtype_get_mtype(dfield_get_type(dfield)) == DATA_INT);
+	op = static_cast<ib_like_t>(
+		mach_read_from_4(static_cast<const byte*>(
+			dfield_get_data(dfield))));
+
+	switch (op) {
+	case IB_LIKE_PREFIX:
+		arg4 = que_node_get_next(arg3);
+		return(cmp_dfield_dfield_eq_prefix(que_node_get_val(arg1),
+						   que_node_get_val(arg4)));
+	case IB_LIKE_EXACT:
+		return(!cmp_dfield_dfield(que_node_get_val(arg1),
+					  que_node_get_val(arg2)));
+	}
+
+	ut_error;
+	return(FALSE);
+}
+
+/*********************************************************************
+Evaluates a comparison node.
+@return the result of the comparison */
+ibool
+eval_cmp(
+/*=====*/
+	func_node_t*	cmp_node)	/*!< in: comparison node */
+{
+	que_node_t*	arg1;
+	que_node_t*	arg2;
+	int		res;
+	ibool		val = FALSE;	/* remove warning */
+
+	ut_ad(que_node_get_type(cmp_node) == QUE_NODE_FUNC);
+
+	arg1 = cmp_node->args;
+	arg2 = que_node_get_next(arg1);
+
+	switch (cmp_node->func) {
+	case '<':
+	case '=':
+	case '>':
+	case PARS_LE_TOKEN:
+	case PARS_NE_TOKEN:
+	case PARS_GE_TOKEN:
+		res = cmp_dfield_dfield(
+			que_node_get_val(arg1), que_node_get_val(arg2));
+
+		switch (cmp_node->func) {
+		case '<':
+			val = (res < 0);
+			break;
+		case '=':
+			val = (res == 0);
+			break;
+		case '>':
+			val = (res > 0);
+			break;
+		case PARS_LE_TOKEN:
+			val = (res <= 0);
+			break;
+		case PARS_NE_TOKEN:
+			val = (res != 0);
+			break;
+		case PARS_GE_TOKEN:
+			val = (res >= 0);
+			break;
+		}
+		break;
+	default:
+		val = eval_cmp_like(arg1, arg2);
+		break;
+	}
+
+	eval_node_set_ibool_val(cmp_node, val);
+
+	return(val);
+}
+
+/*****************************************************************//**
+Evaluates a logical operation node. */
+UNIV_INLINE
+void
+eval_logical(
+/*=========*/
+	func_node_t*	logical_node)	/*!< in: logical operation node */
+{
+	que_node_t*	arg1;
+	que_node_t*	arg2;
+	ibool		val1;
+	ibool		val2 = 0; /* remove warning */
+	ibool		val = 0; /* remove warning */
+	int		func;
+
+	ut_ad(que_node_get_type(logical_node) == QUE_NODE_FUNC);
+
+	arg1 = logical_node->args;
+	arg2 = que_node_get_next(arg1); /* arg2 is NULL if func is 'NOT' */
+
+	val1 = eval_node_get_ibool_val(arg1);
+
+	if (arg2) {
+		val2 = eval_node_get_ibool_val(arg2);
+	}
+
+	func = logical_node->func;
+
+	if (func == PARS_AND_TOKEN) {
+		val = val1 & val2;
+	} else if (func == PARS_OR_TOKEN) {
+		val = val1 | val2;
+	} else if (func == PARS_NOT_TOKEN) {
+		val = TRUE - val1;
+	} else {
+		ut_error;
+	}
+
+	eval_node_set_ibool_val(logical_node, val);
+}
+
+/*****************************************************************//**
+Evaluates an arithmetic operation node.
 */
+UNIV_INLINE
+void
+eval_arith(
+/*=======*/
+	func_node_t*	arith_node)	/*!< in: arithmetic operation node */
+{
+	que_node_t*	arg1;
+	que_node_t*	arg2;
+	lint		val1;
+	lint		val2 = 0; /* remove warning */
+	lint		val;
+	int		func;
+
+	ut_ad(que_node_get_type(arith_node) == QUE_NODE_FUNC);
+
+	arg1 = arith_node->args;
+	arg2 = que_node_get_next(arg1); /* arg2 is NULL if func is unary '-' */
+
+	val1 = eval_node_get_int_val(arg1);
+
+	if (arg2) {
+		val2 = eval_node_get_int_val(arg2);
+	}
+
+	func = arith_node->func;
+
+	if (func == '+') {
+		val = val1 + val2;
+	} else if ((func == '-') && arg2) {
+		val = val1 - val2;
+	} else if (func == '-') {
+		val = -val1;
+	} else if (func == '*') {
+		val = val1 * val2;
+	} else {
+		ut_ad(func == '/');
+		val = val1 / val2;
+	}
+
+	eval_node_set_int_val(arith_node, val);
+}
+
+/*****************************************************************//**
+Evaluates an aggregate operation node. */
+UNIV_INLINE
+void
+eval_aggregate(
+/*===========*/
+	func_node_t*	node)	/*!< in: aggregate operation node */
+{
+	lint		val;
+
+	ut_ad(que_node_get_type(node) == QUE_NODE_FUNC);
+
+	val = eval_node_get_int_val(node);
+
+	ut_a(node->func == PARS_COUNT_TOKEN);
+	val = val + 1;
+	eval_node_set_int_val(node, val);
+}
+
+/*****************************************************************//**
+Evaluates a notfound-function node. */
+UNIV_INLINE
+void
+eval_notfound(
+/*==========*/
+	func_node_t*	func_node)	/*!< in: function node */
+{
+	sym_node_t*	cursor;
+	sel_node_t*	sel_node;
+	ibool		ibool_val;
+
+	ut_ad(func_node->func == PARS_NOTFOUND_TOKEN);
+
+	cursor = static_cast<sym_node_t*>(func_node->args);
+
+	ut_ad(que_node_get_type(cursor) == QUE_NODE_SYMBOL);
+
+	if (cursor->token_type == SYM_LIT) {
+		ut_ad(!memcmp(dfield_get_data(que_node_get_val(cursor)),
+			      "SQL", 3));
+		sel_node = cursor->sym_table->query_graph->last_sel_node;
+	} else {
+		sel_node = cursor->alias->cursor_def;
+	}
+
+	if (sel_node->state == SEL_NODE_NO_MORE_ROWS) {
+		ibool_val = TRUE;
+	} else {
+		ibool_val = FALSE;
+	}
+
+	eval_node_set_ibool_val(func_node, ibool_val);
+}
+
+/*****************************************************************//**
+Evaluates a substr-function node. */
+UNIV_INLINE
+void
+eval_substr(
+/*========*/
+	func_node_t*	func_node)	/*!< in: function node */
+{
+	que_node_t*	arg1;
+	que_node_t*	arg2;
+	que_node_t*	arg3;
+	dfield_t*	dfield;
+	byte*		str1;
+	ulint		len1;
+	ulint		len2;
+
+	arg1 = func_node->args;
+	arg2 = que_node_get_next(arg1);
+
+	ut_ad(func_node->func == PARS_SUBSTR_TOKEN);
+
+	arg3 = que_node_get_next(arg2);
+
+	str1 = static_cast<byte*>(dfield_get_data(que_node_get_val(arg1)));
+
+	const ulint str1_len = dfield_get_len(que_node_get_val(arg1));
+
+	len1 = (ulint) eval_node_get_int_val(arg2);
+	len2 = (ulint) eval_node_get_int_val(arg3);
+
+	dfield = que_node_get_val(func_node);
+
+	if (len1 > str1_len) {
+		len2 = 0;
+	} else {
+		str1 += len1;
+		if (len2 > str1_len - len1) {
+			len2 = str1_len - len1;
+		}
+	}
+
+	dfield_set_data(dfield, str1, len2);
+}
+
+/*****************************************************************//**
+Evaluates an instr-function node.
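+The result uses the usual 1-based INSTR semantics; worked examples
+matching the loops below:
+
+	INSTR('abcabc', 'bc') = 2	-- match starts at 0-based offset 1
+	INSTR('abc', 'd') = 0		-- no match
+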
 */
+static
+void
+eval_instr(
+/*=======*/
+	func_node_t*	func_node)	/*!< in: function node */
+{
+	que_node_t*	arg1;
+	que_node_t*	arg2;
+	dfield_t*	dfield1;
+	dfield_t*	dfield2;
+	lint		int_val;
+	byte*		str1;
+	byte*		str2;
+	byte		match_char;
+	ulint		len1;
+	ulint		len2;
+	ulint		i;
+	ulint		j;
+
+	arg1 = func_node->args;
+	arg2 = que_node_get_next(arg1);
+
+	dfield1 = que_node_get_val(arg1);
+	dfield2 = que_node_get_val(arg2);
+
+	str1 = static_cast<byte*>(dfield_get_data(dfield1));
+	str2 = static_cast<byte*>(dfield_get_data(dfield2));
+
+	len1 = dfield_get_len(dfield1);
+	len2 = dfield_get_len(dfield2);
+
+	if (len2 == 0) {
+		ut_error;
+	}
+
+	match_char = str2[0];
+
+	for (i = 0; i < len1; i++) {
+		/* In this outer loop, the number of matched characters is 0 */
+
+		if (str1[i] == match_char) {
+
+			if (i + len2 > len1) {
+
+				break;
+			}
+
+			for (j = 1;; j++) {
+				/* We have already matched j characters */
+
+				if (j == len2) {
+					int_val = lint(i) + 1;
+
+					goto match_found;
+				}
+
+				if (str1[i + j] != str2[j]) {
+
+					break;
+				}
+			}
+		}
+	}
+
+	int_val = 0;
+
+match_found:
+	eval_node_set_int_val(func_node, int_val);
+}
+
+/*****************************************************************//**
+Evaluates a predefined function node. */
+static
+void
+eval_concat(
+/*========*/
+	func_node_t*	func_node)	/*!< in: function node */
+{
+	que_node_t*	arg;
+	dfield_t*	dfield;
+	byte*		data;
+	ulint		len;
+	ulint		len1;
+
+	arg = func_node->args;
+	len = 0;
+
+	while (arg) {
+		len1 = dfield_get_len(que_node_get_val(arg));
+
+		len += len1;
+
+		arg = que_node_get_next(arg);
+	}
+
+	data = eval_node_ensure_val_buf(func_node, len);
+
+	arg = func_node->args;
+	len = 0;
+
+	while (arg) {
+		dfield = que_node_get_val(arg);
+		len1 = dfield_get_len(dfield);
+
+		memcpy(data + len, dfield_get_data(dfield), len1);
+
+		len += len1;
+
+		arg = que_node_get_next(arg);
+	}
+}
+
+/*****************************************************************//**
+Evaluates a predefined function node. If the first argument is an integer,
+this function looks at the second argument which is the integer length in
+bytes, and converts the integer to a VARCHAR.
+If the first argument is of some other type, this function converts it to
+BINARY. */
+UNIV_INLINE
+void
+eval_to_binary(
+/*===========*/
+	func_node_t*	func_node)	/*!< in: function node */
+{
+	que_node_t*	arg1;
+	que_node_t*	arg2;
+	dfield_t*	dfield;
+	byte*		str1;
+	ulint		len;
+	ulint		len1;
+
+	arg1 = func_node->args;
+
+	str1 = static_cast<byte*>(dfield_get_data(que_node_get_val(arg1)));
+
+	if (dtype_get_mtype(que_node_get_data_type(arg1)) != DATA_INT) {
+
+		len = dfield_get_len(que_node_get_val(arg1));
+
+		dfield = que_node_get_val(func_node);
+
+		dfield_set_data(dfield, str1, len);
+
+		return;
+	}
+
+	arg2 = que_node_get_next(arg1);
+
+	len1 = (ulint) eval_node_get_int_val(arg2);
+
+	if (len1 > 4) {
+
+		ut_error;
+	}
+
+	dfield = que_node_get_val(func_node);
+
+	dfield_set_data(dfield, str1 + (4 - len1), len1);
+}
+
+/*****************************************************************//**
+Evaluate LENGTH(). */
+inline void eval_length(func_node_t* func_node)
+{
+	eval_node_set_int_val(func_node,
+			      dfield_get_len(que_node_get_val
+					     (func_node->args)));
+}
+
+/*****************************************************************//**
+Evaluates a function node.
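+Arguments are evaluated first; SQL NULL argument values are tolerated
+only by the comparison class and by NOTFOUND. For example (sketch):
+evaluating CONCAT(:a, :b) with :a being SQL NULL hits ut_error, while
+:a = :b merely compares the two values.
+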
*/ +void +eval_func( +/*======*/ + func_node_t* func_node) /*!< in: function node */ +{ + que_node_t* arg; + ulint fclass; + + ut_ad(que_node_get_type(func_node) == QUE_NODE_FUNC); + + fclass = func_node->fclass; + const int func = func_node->func; + + arg = func_node->args; + + /* Evaluate first the argument list */ + while (arg) { + eval_exp(arg); + + /* The functions are not defined for SQL null argument + values, except for eval_cmp and notfound */ + + if (dfield_is_null(que_node_get_val(arg)) + && (fclass != PARS_FUNC_CMP) + && (func != PARS_NOTFOUND_TOKEN)) { + ut_error; + } + + arg = que_node_get_next(arg); + } + + switch (fclass) { + case PARS_FUNC_CMP: + eval_cmp(func_node); + return; + case PARS_FUNC_ARITH: + eval_arith(func_node); + return; + case PARS_FUNC_AGGREGATE: + eval_aggregate(func_node); + return; + case PARS_FUNC_PREDEFINED: + switch (func) { + case PARS_NOTFOUND_TOKEN: + eval_notfound(func_node); + return; + case PARS_SUBSTR_TOKEN: + eval_substr(func_node); + return; + case PARS_INSTR_TOKEN: + eval_instr(func_node); + return; + case PARS_CONCAT_TOKEN: + eval_concat(func_node); + return; + case PARS_TO_BINARY_TOKEN: + eval_to_binary(func_node); + return; + case PARS_LENGTH_TOKEN: + eval_length(func_node); + return; + default: + ut_error; + } + case PARS_FUNC_LOGICAL: + eval_logical(func_node); + return; + } + + ut_error; +} diff --git a/storage/innobase/eval/eval0proc.cc b/storage/innobase/eval/eval0proc.cc new file mode 100644 index 00000000..7e39443f --- /dev/null +++ b/storage/innobase/eval/eval0proc.cc @@ -0,0 +1,286 @@ +/***************************************************************************** + +Copyright (c) 1998, 2016, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file eval/eval0proc.cc +Executes SQL stored procedures and their control structures + +Created 1/20/1998 Heikki Tuuri +*******************************************************/ + +#include "eval0proc.h" + +/**********************************************************************//** +Performs an execution step of an if-statement node. 
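+The node corresponds to the conditional construct of the InnoDB SQL
+parser, e.g. (sketch of the accepted form):
+
+	IF c1 THEN ... ELSIF c2 THEN ... ELSE ... END IF;
+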
+@return query thread to run next or NULL */
+que_thr_t*
+if_step(
+/*====*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	if_node_t*	node;
+	elsif_node_t*	elsif_node;
+
+	ut_ad(thr);
+
+	node = static_cast<if_node_t*>(thr->run_node);
+	ut_ad(que_node_get_type(node) == QUE_NODE_IF);
+
+	if (thr->prev_node == que_node_get_parent(node)) {
+
+		/* Evaluate the condition */
+
+		eval_exp(node->cond);
+
+		if (eval_node_get_ibool_val(node->cond)) {
+
+			/* The condition evaluated to TRUE: start execution
+			from the first statement in the statement list */
+
+			thr->run_node = node->stat_list;
+
+		} else if (node->else_part) {
+			thr->run_node = node->else_part;
+
+		} else if (node->elsif_list) {
+			elsif_node = node->elsif_list;
+
+			for (;;) {
+				eval_exp(elsif_node->cond);
+
+				if (eval_node_get_ibool_val(
+					    elsif_node->cond)) {
+
+					/* The condition evaluated to TRUE:
+					start execution from the first
+					statement in the statement list */
+
+					thr->run_node = elsif_node->stat_list;
+
+					break;
+				}
+
+				elsif_node = static_cast<elsif_node_t*>(
+					que_node_get_next(elsif_node));
+
+				if (elsif_node == NULL) {
+					thr->run_node = NULL;
+
+					break;
+				}
+			}
+		} else {
+			thr->run_node = NULL;
+		}
+	} else {
+		/* Move to the next statement */
+		ut_ad(que_node_get_next(thr->prev_node) == NULL);
+
+		thr->run_node = NULL;
+	}
+
+	if (thr->run_node == NULL) {
+		thr->run_node = que_node_get_parent(node);
+	}
+
+	return(thr);
+}
+
+/**********************************************************************//**
+Performs an execution step of a while-statement node.
+@return query thread to run next or NULL */
+que_thr_t*
+while_step(
+/*=======*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	while_node_t*	node;
+
+	ut_ad(thr);
+
+	node = static_cast<while_node_t*>(thr->run_node);
+	ut_ad(que_node_get_type(node) == QUE_NODE_WHILE);
+
+	ut_ad((thr->prev_node == que_node_get_parent(node))
+	      || (que_node_get_next(thr->prev_node) == NULL));
+
+	/* Evaluate the condition */
+
+	eval_exp(node->cond);
+
+	if (eval_node_get_ibool_val(node->cond)) {
+
+		/* The condition evaluated to TRUE: start execution
+		from the first statement in the statement list */
+
+		thr->run_node = node->stat_list;
+	} else {
+		thr->run_node = que_node_get_parent(node);
+	}
+
+	return(thr);
+}
+
+/**********************************************************************//**
+Performs an execution step of an assignment statement node.
+@return query thread to run next or NULL */
+que_thr_t*
+assign_step(
+/*========*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	assign_node_t*	node;
+
+	ut_ad(thr);
+
+	node = static_cast<assign_node_t*>(thr->run_node);
+	ut_ad(que_node_get_type(node) == QUE_NODE_ASSIGNMENT);
+
+	/* Evaluate the value to assign */
+
+	eval_exp(node->val);
+
+	eval_node_copy_val(node->var->alias, node->val);
+
+	thr->run_node = que_node_get_parent(node);
+
+	return(thr);
+}
+
+/**********************************************************************//**
+Performs an execution step of a for-loop node.
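+The node corresponds to the counted loop of the InnoDB SQL parser,
+e.g. (sketch of the accepted form):
+
+	FOR i IN 1 .. 10 LOOP ... END LOOP;
+
+Both limits are evaluated once, when the loop is entered.
+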
+@return query thread to run next or NULL */
+que_thr_t*
+for_step(
+/*=====*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	for_node_t*	node;
+	que_node_t*	parent;
+	lint		loop_var_value;
+
+	ut_ad(thr);
+
+	node = static_cast<for_node_t*>(thr->run_node);
+
+	ut_ad(que_node_get_type(node) == QUE_NODE_FOR);
+
+	parent = que_node_get_parent(node);
+
+	if (thr->prev_node != parent) {
+
+		/* Move to the next statement */
+		thr->run_node = que_node_get_next(thr->prev_node);
+
+		if (thr->run_node != NULL) {
+
+			return(thr);
+		}
+
+		/* Increment the value of loop_var */
+
+		loop_var_value = 1 + eval_node_get_int_val(node->loop_var);
+	} else {
+		/* Initialize the loop */
+
+		eval_exp(node->loop_start_limit);
+		eval_exp(node->loop_end_limit);
+
+		loop_var_value = eval_node_get_int_val(node->loop_start_limit);
+
+		node->loop_end_value
+			= (int) eval_node_get_int_val(node->loop_end_limit);
+	}
+
+	/* Check if we should do another loop */
+
+	if (loop_var_value > node->loop_end_value) {
+
+		/* Enough loops done */
+
+		thr->run_node = parent;
+	} else {
+		eval_node_set_int_val(node->loop_var, loop_var_value);
+
+		thr->run_node = node->stat_list;
+	}
+
+	return(thr);
+}
+
+/**********************************************************************//**
+Performs an execution step of an exit statement node.
+@return query thread to run next or NULL */
+que_thr_t*
+exit_step(
+/*======*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	exit_node_t*	node;
+	que_node_t*	loop_node;
+
+	ut_ad(thr);
+
+	node = static_cast<exit_node_t*>(thr->run_node);
+
+	ut_ad(que_node_get_type(node) == QUE_NODE_EXIT);
+
+	/* Loops exit by setting thr->run_node as the loop node's parent, so
+	find our containing loop node and get its parent. */
+
+	loop_node = que_node_get_containing_loop_node(node);
+
+	/* If someone uses an EXIT statement outside of a loop, this will
+	trigger. */
+	ut_a(loop_node);
+
+	thr->run_node = que_node_get_parent(loop_node);
+
+	return(thr);
+}
+
+/**********************************************************************//**
+Performs an execution step of a return-statement node.
+@return query thread to run next or NULL */
+que_thr_t*
+return_step(
+/*========*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	return_node_t*	node;
+	que_node_t*	parent;
+
+	ut_ad(thr);
+
+	node = static_cast<return_node_t*>(thr->run_node);
+
+	ut_ad(que_node_get_type(node) == QUE_NODE_RETURN);
+
+	parent = node;
+
+	while (que_node_get_type(parent) != QUE_NODE_PROC) {
+
+		parent = que_node_get_parent(parent);
+	}
+
+	ut_a(parent);
+
+	thr->run_node = que_node_get_parent(parent);
+
+	return(thr);
+}
diff --git a/storage/innobase/fil/fil0crypt.cc b/storage/innobase/fil/fil0crypt.cc
new file mode 100644
index 00000000..97cb3994
--- /dev/null
+++ b/storage/innobase/fil/fil0crypt.cc
@@ -0,0 +1,2425 @@
+/*****************************************************************************
+Copyright (C) 2013, 2015, Google Inc. All Rights Reserved.
+Copyright (c) 2014, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
diff --git a/storage/innobase/fil/fil0crypt.cc b/storage/innobase/fil/fil0crypt.cc
new file mode 100644
index 00000000..97cb3994
--- /dev/null
+++ b/storage/innobase/fil/fil0crypt.cc
@@ -0,0 +1,2425 @@
+/*****************************************************************************
+Copyright (C) 2013, 2015, Google Inc. All Rights Reserved.
+Copyright (c) 2014, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+/**************************************************//**
+@file fil0crypt.cc
+Innodb file space encrypt/decrypt
+
+Created            Jonas Oreland Google
+Modified           Jan Lindström jan.lindstrom@mariadb.com
+*******************************************************/
+
+#include "fil0crypt.h"
+#include "mach0data.h"
+#include "page0zip.h"
+#include "buf0checksum.h"
+#ifdef UNIV_INNOCHECKSUM
+# include "buf0buf.h"
+#else
+#include "buf0flu.h"
+#include "buf0dblwr.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "mtr0mtr.h"
+#include "mtr0log.h"
+#include "ut0ut.h"
+#include "fsp0fsp.h"
+#include "fil0pagecompress.h"
+#include <my_crypt.h>
+
+static bool fil_crypt_threads_inited = false;
+
+/** Is encryption enabled/disabled */
+ulong	srv_encrypt_tables;
+
+/** No of key rotation threads requested */
+uint	srv_n_fil_crypt_threads;
+
+/** No of key rotation threads started */
+uint	srv_n_fil_crypt_threads_started;
+
+/** At this age or older a space/page will be rotated */
+uint	srv_fil_crypt_rotate_key_age;
+
+/** Whether the encryption plugin does key rotation */
+Atomic_relaxed<bool> srv_encrypt_rotate;
+
+/** Condition variable for srv_n_fil_crypt_threads_started */
+static pthread_cond_t fil_crypt_cond;
+
+/** Condition variable to signal the key rotation threads */
+static pthread_cond_t fil_crypt_threads_cond;
+
+/** Condition variable for interrupting sleeptime_ms sleep at the end
+of fil_crypt_rotate_page() */
+static pthread_cond_t fil_crypt_throttle_sleep_cond;
+
+/** Mutex for key rotation threads. Acquired before fil_system.mutex! */
+static mysql_mutex_t fil_crypt_threads_mutex;
+
+/** Variable ensuring only 1 thread at a time does initial conversion */
+static bool fil_crypt_start_converting;
+
+/** Variables for throttling */
+uint srv_n_fil_crypt_iops;		  // 10ms per iop
+static constexpr uint srv_alloc_time = 3; // allocate iops for 3s at a time
+static uint n_fil_crypt_iops_allocated;
+
+#define DEBUG_KEYROTATION_THROTTLING 0
+
+/** Statistics variables */
+static fil_crypt_stat_t crypt_stat;
+static mysql_mutex_t crypt_stat_mutex;
+
+/** Wake up the encryption threads */
+void fil_crypt_threads_signal(bool broadcast)
+{
+  mysql_mutex_lock(&fil_crypt_threads_mutex);
+  if (broadcast)
+    pthread_cond_broadcast(&fil_crypt_threads_cond);
+  else
+    pthread_cond_signal(&fil_crypt_threads_cond);
+  mysql_mutex_unlock(&fil_crypt_threads_mutex);
+}
+
+/***********************************************************************
+Check if a key needs rotation given a key_state
+@param[in]	crypt_data		Encryption information
+@param[in]	key_version		Current key version
+@param[in]	latest_key_version	Latest key version
+@param[in]	rotate_key_age		when to rotate
+@return true if key needs rotation, false if not */
+static bool
+fil_crypt_needs_rotation(
+	const fil_space_crypt_t*	crypt_data,
+	uint				key_version,
+	uint				latest_key_version,
+	uint				rotate_key_age)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************************
+Init space crypt */
+void fil_space_crypt_init()
+{
+  pthread_cond_init(&fil_crypt_throttle_sleep_cond, nullptr);
+  mysql_mutex_init(0, &crypt_stat_mutex, nullptr);
+}
+
+/*********************************************************************
+Cleanup space crypt */
+void fil_space_crypt_cleanup()
+{
+  pthread_cond_destroy(&fil_crypt_throttle_sleep_cond);
+  mysql_mutex_destroy(&crypt_stat_mutex);
+}
+
+/**
+Get latest key version from encryption plugin.
+@return key version or ENCRYPTION_KEY_VERSION_INVALID */
+uint
+fil_space_crypt_t::key_get_latest_version(void)
+{
+	uint key_version = key_found;
+
+	if (is_key_found()) {
+		key_version = encryption_key_get_latest_version(key_id);
+		/* InnoDB does a dirty read of srv_fil_crypt_rotate_key_age.
+		It doesn't matter because srv_encrypt_rotate
+		can be set to true only once */
+		if (!srv_encrypt_rotate
+		    && key_version > srv_fil_crypt_rotate_key_age) {
+			srv_encrypt_rotate = true;
+		}
+
+		srv_stats.n_key_requests.inc();
+		key_found = key_version;
+	}
+
+	return key_version;
+}
+
+/******************************************************************
+Get the latest key version, waking the encrypt thread if needed
+@param[in,out]	crypt_data	Crypt data */
+static inline
+uint
+fil_crypt_get_latest_key_version(
+	fil_space_crypt_t* crypt_data)
+{
+	ut_ad(crypt_data != NULL);
+
+	uint key_version = crypt_data->key_get_latest_version();
+
+	if (crypt_data->is_key_found()) {
+
+		if (fil_crypt_needs_rotation(
+			    crypt_data,
+			    crypt_data->min_key_version,
+			    key_version,
+			    srv_fil_crypt_rotate_key_age)) {
+			if (fil_crypt_threads_inited) {
+				fil_crypt_threads_signal();
+			}
+		}
+	}
+
+	return key_version;
+}
+
+/******************************************************************
+Mutex helper for crypt_data->scheme */
+void
+crypt_data_scheme_locker(
+/*=====================*/
+	st_encryption_scheme*	scheme,
+	int			exit)
+{
+	fil_space_crypt_t* crypt_data =
+		static_cast<fil_space_crypt_t*>(scheme);
+
+	if (exit) {
+		mysql_mutex_unlock(&crypt_data->mutex);
+	} else {
+		mysql_mutex_lock(&crypt_data->mutex);
+	}
+}
+
+/******************************************************************
+Create a fil_space_crypt_t object
+@param[in]	type		CRYPT_SCHEME_UNENCRYPTED or
+				CRYPT_SCHEME_1
+@param[in]	encrypt_mode	FIL_ENCRYPTION_DEFAULT or
+				FIL_ENCRYPTION_ON or
+				FIL_ENCRYPTION_OFF
+@param[in]	min_key_version key_version or 0
+@param[in]	key_id		Used key id
+@return crypt object */
+static
+fil_space_crypt_t*
+fil_space_create_crypt_data(
+	uint			type,
+	fil_encryption_t	encrypt_mode,
+	uint			min_key_version,
+	uint			key_id)
+{
+	fil_space_crypt_t* crypt_data = NULL;
+	if (void* buf = ut_zalloc_nokey(sizeof(fil_space_crypt_t))) {
+		crypt_data = new(buf)
+			fil_space_crypt_t(
+				type,
+				min_key_version,
+				key_id,
+				encrypt_mode);
+	}
+
+	return crypt_data;
+}
+
+/******************************************************************
+Create a fil_space_crypt_t object
+@param[in]	encrypt_mode	FIL_ENCRYPTION_DEFAULT or
+				FIL_ENCRYPTION_ON or
+				FIL_ENCRYPTION_OFF
+
+@param[in]	key_id		Encryption key id
+@return crypt object */
+fil_space_crypt_t*
+fil_space_create_crypt_data(
+	fil_encryption_t	encrypt_mode,
+	uint			key_id)
+{
+	return (fil_space_create_crypt_data(0, encrypt_mode, 0, key_id));
+}
+
+/******************************************************************
+Merge fil_space_crypt_t object
+@param[in,out]	dst		Destination crypt data
+@param[in]	src		Source crypt data */
+static
+void
+fil_space_merge_crypt_data(
+	fil_space_crypt_t*		dst,
+	const fil_space_crypt_t*	src)
+{
+	mysql_mutex_lock(&dst->mutex);
+
+	/* validate that they are mergeable */
+	ut_a(src->type == CRYPT_SCHEME_UNENCRYPTED ||
+	     src->type == CRYPT_SCHEME_1);
+
+	ut_a(dst->type == CRYPT_SCHEME_UNENCRYPTED ||
+	     dst->type == CRYPT_SCHEME_1);
+
+	dst->encryption = src->encryption;
+	dst->type = src->type;
+	dst->min_key_version = src->min_key_version;
+	dst->keyserver_requests += src->keyserver_requests;
+
+	mysql_mutex_unlock(&dst->mutex);
+}
+
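+/* Editorial note (not upstream): the crypt metadata serialized on page 0,
+at offset FSP_HEADER_OFFSET + fsp_header_get_encryption_offset(), is laid
+out as
+
+	CRYPT_MAGIC (MAGIC_SZ bytes) | type (1) | iv_length (1)
+	| iv (iv_length) | min_key_version (4) | key_id (4)
+	| fil_encryption_t (1)
+
+for a total of 11 + MY_AES_BLOCK_SIZE bytes after the magic. This is what
+fil_space_read_crypt_data() below parses and what
+fil_space_crypt_t::write_page0() writes back. */
+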
+/** Initialize encryption parameters from a tablespace header page.
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	page		first page of the tablespace
+@return crypt data from page 0
+@retval	NULL	if not present or not valid */
+fil_space_crypt_t* fil_space_read_crypt_data(ulint zip_size, const byte* page)
+{
+	const ulint offset = FSP_HEADER_OFFSET
+		+ fsp_header_get_encryption_offset(zip_size);
+
+	if (memcmp(page + offset, CRYPT_MAGIC, MAGIC_SZ) != 0) {
+		/* Crypt data is not stored. */
+		return NULL;
+	}
+
+	uint8_t type = mach_read_from_1(page + offset + MAGIC_SZ + 0);
+	uint8_t iv_length = mach_read_from_1(page + offset + MAGIC_SZ + 1);
+	fil_space_crypt_t* crypt_data;
+
+	if (!(type == CRYPT_SCHEME_UNENCRYPTED ||
+	      type == CRYPT_SCHEME_1)
+	    || iv_length != sizeof crypt_data->iv) {
+		ib::error() << "Found non sensible crypt scheme: "
+			    << type << "," << iv_length
+			    << " for space: "
+			    << page_get_space_id(page);
+		return NULL;
+	}
+
+	uint min_key_version = mach_read_from_4
+		(page + offset + MAGIC_SZ + 2 + iv_length);
+
+	uint key_id = mach_read_from_4
+		(page + offset + MAGIC_SZ + 2 + iv_length + 4);
+
+	fil_encryption_t encryption = (fil_encryption_t)mach_read_from_1(
+		page + offset + MAGIC_SZ + 2 + iv_length + 8);
+
+	crypt_data = fil_space_create_crypt_data(encryption, key_id);
+	/* We need to overwrite these as above function will initialize
+	members */
+	crypt_data->type = type;
+	crypt_data->min_key_version = min_key_version;
+	memcpy(crypt_data->iv, page + offset + MAGIC_SZ + 2, iv_length);
+
+	return crypt_data;
+}
+
+/******************************************************************
+Free a crypt data object
+@param[in,out] crypt_data	crypt data to be freed */
+void fil_space_destroy_crypt_data(fil_space_crypt_t **crypt_data)
+{
+	if (crypt_data != NULL && (*crypt_data) != NULL) {
+		fil_space_crypt_t* c;
+		if (UNIV_LIKELY(fil_crypt_threads_inited)) {
+			mysql_mutex_lock(&fil_crypt_threads_mutex);
+			c = *crypt_data;
+			*crypt_data = NULL;
+			mysql_mutex_unlock(&fil_crypt_threads_mutex);
+		} else {
+			ut_ad(srv_read_only_mode || !srv_was_started);
+			c = *crypt_data;
+			*crypt_data = NULL;
+		}
+		if (c) {
+			c->~fil_space_crypt_t();
+			ut_free(c);
+		}
+	}
+}
+
+/** Amend encryption information from redo log.
+@param[in]	space	tablespace
+@param[in]	data	encryption metadata */
+void fil_crypt_parse(fil_space_t* space, const byte* data)
+{
+	ut_ad(data[1] == MY_AES_BLOCK_SIZE);
+	if (void* buf = ut_zalloc_nokey(sizeof(fil_space_crypt_t))) {
+		fil_space_crypt_t* crypt_data = new(buf)
+			fil_space_crypt_t(
+				data[0],
+				mach_read_from_4(&data[2 + MY_AES_BLOCK_SIZE]),
+				mach_read_from_4(&data[6 + MY_AES_BLOCK_SIZE]),
+				static_cast<fil_encryption_t>
+				(data[10 + MY_AES_BLOCK_SIZE]));
+		memcpy(crypt_data->iv, data + 2, MY_AES_BLOCK_SIZE);
+		mysql_mutex_lock(&fil_system.mutex);
+		if (space->crypt_data) {
+			fil_space_merge_crypt_data(space->crypt_data,
+						   crypt_data);
+			fil_space_destroy_crypt_data(&crypt_data);
+			crypt_data = space->crypt_data;
+		} else {
+			space->crypt_data = crypt_data;
+		}
+		mysql_mutex_unlock(&fil_system.mutex);
+	}
+}
+
+/** Write encryption metadata to the first page.
+@param[in,out]	block	first page of the tablespace
+@param[in,out]	mtr	mini-transaction */
+void fil_space_crypt_t::write_page0(buf_block_t* block, mtr_t* mtr)
+{
+	const ulint offset = FSP_HEADER_OFFSET
+		+ fsp_header_get_encryption_offset(block->zip_size());
+	byte* b = block->page.frame + offset;
+
+	mtr->memcpy<mtr_t::MAYBE_NOP>(*block, b, CRYPT_MAGIC, MAGIC_SZ);
+
+	b += MAGIC_SZ;
+	byte* const start = b;
+	*b++ = static_cast<byte>(type);
+	compile_time_assert(sizeof iv == MY_AES_BLOCK_SIZE);
+	compile_time_assert(sizeof iv == CRYPT_SCHEME_1_IV_LEN);
+	*b++ = sizeof iv;
+	memcpy(b, iv, sizeof iv);
+	b += sizeof iv;
+	mach_write_to_4(b, min_key_version);
+	b += 4;
+	mach_write_to_4(b, key_id);
+	b += 4;
+	*b++ = byte(encryption);
+	ut_ad(b - start == 11 + MY_AES_BLOCK_SIZE);
+	/* We must log also any unchanged bytes, because recovery will
+	invoke fil_crypt_parse() based on this log record. */
+	mtr->memcpy(*block, offset + MAGIC_SZ, b - start);
+}
+
+/** Encrypt a buffer for non full checksum.
+@param[in,out]		crypt_data	Crypt data
+@param[in]		space		space_id
+@param[in]		offset		Page offset
+@param[in]		lsn		Log sequence number
+@param[in]		src_frame	Page to encrypt
+@param[in]		zip_size	ROW_FORMAT=COMPRESSED
+					page size, or 0
+@param[in,out]		dst_frame	Output buffer
+@return encrypted buffer or NULL */
+static byte* fil_encrypt_buf_for_non_full_checksum(
+	fil_space_crypt_t*	crypt_data,
+	ulint			space,
+	ulint			offset,
+	lsn_t			lsn,
+	const byte*		src_frame,
+	ulint			zip_size,
+	byte*			dst_frame)
+{
+	uint size = uint(zip_size ? zip_size : srv_page_size);
+	uint key_version = fil_crypt_get_latest_key_version(crypt_data);
+	ut_a(key_version != ENCRYPTION_KEY_VERSION_INVALID);
+	ut_ad(!ut_align_offset(src_frame, 8));
+	ut_ad(!ut_align_offset(dst_frame, 8));
+
+	const bool page_compressed = fil_page_get_type(src_frame)
+		== FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED;
+	uint header_len = FIL_PAGE_DATA;
+
+	if (page_compressed) {
+		header_len += FIL_PAGE_ENCRYPT_COMP_METADATA_LEN;
+	}
+
+	/* FIL page header is not encrypted */
+	memcpy(dst_frame, src_frame, header_len);
+	mach_write_to_4(dst_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION,
+			key_version);
+
+	/* Calculate the start offset in a page */
+	uint		unencrypted_bytes = header_len + FIL_PAGE_DATA_END;
+	uint		srclen = size - unencrypted_bytes;
+	const byte*	src = src_frame + header_len;
+	byte*		dst = dst_frame + header_len;
+	uint32		dstlen = 0;
+
+	if (page_compressed) {
+		srclen = mach_read_from_2(src_frame + FIL_PAGE_DATA);
+	}
+
+	int rc = encryption_scheme_encrypt(src, srclen, dst, &dstlen,
+					   crypt_data, key_version,
+					   (uint32)space, (uint32)offset, lsn);
+	ut_a(rc == MY_AES_OK);
+	ut_a(dstlen == srclen);
+
+	/* For compressed tables we do not store the FIL header because
+	the whole page is not stored to the disk. In compressed tables only
+	the FIL header + compressed (and now encrypted) payload aligned
+	to sector boundary is written. */
+	if (!page_compressed) {
+		/* FIL page trailer is also not encrypted */
+		static_assert(FIL_PAGE_DATA_END == 8, "alignment");
+		memcpy_aligned<8>(dst_frame + size - FIL_PAGE_DATA_END,
+				  src_frame + size - FIL_PAGE_DATA_END, 8);
+	} else {
+		/* Clean up rest of buffer */
+		memset(dst_frame+header_len+srclen, 0,
+		       size - (header_len + srclen));
+	}
+
+	/* store the post-encryption checksum after the key-version */
+	mach_write_to_4(dst_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + 4,
+			zip_size
+			? 
page_zip_calc_checksum(dst_frame, zip_size, + SRV_CHECKSUM_ALGORITHM_CRC32) + : buf_calc_page_crc32(dst_frame)); + + ut_ad(fil_space_verify_crypt_checksum(dst_frame, zip_size)); + + srv_stats.pages_encrypted.inc(); + + return dst_frame; +} + +/** Encrypt a buffer for full checksum format. +@param[in,out] crypt_data Crypt data +@param[in] space space_id +@param[in] offset Page offset +@param[in] lsn Log sequence number +@param[in] src_frame Page to encrypt +@param[in,out] dst_frame Output buffer +@return encrypted buffer or NULL */ +static byte* fil_encrypt_buf_for_full_crc32( + fil_space_crypt_t* crypt_data, + ulint space, + ulint offset, + lsn_t lsn, + const byte* src_frame, + byte* dst_frame) +{ + uint key_version = fil_crypt_get_latest_key_version(crypt_data); + ut_d(bool corrupted = false); + const uint size = buf_page_full_crc32_size(src_frame, NULL, +#ifdef UNIV_DEBUG + &corrupted +#else + NULL +#endif + ); + ut_ad(!corrupted); + uint srclen = size - (FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + + FIL_PAGE_FCRC32_CHECKSUM); + const byte* src = src_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION; + byte* dst = dst_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION; + uint dstlen = 0; + + ut_a(key_version != ENCRYPTION_KEY_VERSION_INVALID); + + /* Till FIL_PAGE_LSN, page is not encrypted */ + memcpy(dst_frame, src_frame, FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION); + + /* Write key version to the page. */ + mach_write_to_4(dst_frame + FIL_PAGE_FCRC32_KEY_VERSION, key_version); + + int rc = encryption_scheme_encrypt(src, srclen, dst, &dstlen, + crypt_data, key_version, + uint(space), uint(offset), lsn); + ut_a(rc == MY_AES_OK); + ut_a(dstlen == srclen); + + const ulint payload = size - FIL_PAGE_FCRC32_CHECKSUM; + mach_write_to_4(dst_frame + payload, my_crc32c(0, dst_frame, payload)); + /* Clean the rest of the buffer. FIXME: Punch holes when writing! */ + memset(dst_frame + (payload + 4), 0, srv_page_size - (payload + 4)); + + srv_stats.pages_encrypted.inc(); + + return dst_frame; +} + +/** Encrypt a buffer. +@param[in,out] crypt_data Crypt data +@param[in] space space_id +@param[in] offset Page offset +@param[in] src_frame Page to encrypt +@param[in] zip_size ROW_FORMAT=COMPRESSED + page size, or 0 +@param[in,out] dst_frame Output buffer +@param[in] use_full_checksum full crc32 algo is used +@return encrypted buffer or NULL */ +byte* fil_encrypt_buf( + fil_space_crypt_t* crypt_data, + ulint space, + ulint offset, + const byte* src_frame, + ulint zip_size, + byte* dst_frame, + bool use_full_checksum) +{ + const lsn_t lsn = mach_read_from_8(src_frame + FIL_PAGE_LSN); + + if (use_full_checksum) { + ut_ad(!zip_size); + return fil_encrypt_buf_for_full_crc32( + crypt_data, space, offset, + lsn, src_frame, dst_frame); + } + + return fil_encrypt_buf_for_non_full_checksum( + crypt_data, space, offset, lsn, + src_frame, zip_size, dst_frame); +} + +/** Check whether these page types are allowed to encrypt. 
+@param[in] space tablespace object +@param[in] src_frame source page +@return true if it is valid page type */ +static bool fil_space_encrypt_valid_page_type( + const fil_space_t* space, + const byte* src_frame) +{ + switch (fil_page_get_type(src_frame)) { + case FIL_PAGE_RTREE: + return space->full_crc32(); + case FIL_PAGE_TYPE_FSP_HDR: + case FIL_PAGE_TYPE_XDES: + return false; + } + + return true; +} + +/****************************************************************** +Encrypt a page + +@param[in] space Tablespace +@param[in] offset Page offset +@param[in] src_frame Page to encrypt +@param[in,out] dst_frame Output buffer +@return encrypted buffer or NULL */ +byte* fil_space_encrypt( + const fil_space_t* space, + ulint offset, + byte* src_frame, + byte* dst_frame) +{ + if (!fil_space_encrypt_valid_page_type(space, src_frame)) { + return src_frame; + } + + if (!space->crypt_data || !space->crypt_data->is_encrypted()) { + return (src_frame); + } + + ut_ad(space->referenced()); + + return fil_encrypt_buf(space->crypt_data, space->id, offset, + src_frame, space->zip_size(), + dst_frame, space->full_crc32()); +} + +/** Decrypt a page for full checksum format. +@param[in] space space id +@param[in] crypt_data crypt_data +@param[in] tmp_frame Temporary buffer +@param[in,out] src_frame Page to decrypt +@return DB_SUCCESS or error */ +static dberr_t fil_space_decrypt_full_crc32( + ulint space, + fil_space_crypt_t* crypt_data, + byte* tmp_frame, + byte* src_frame) +{ + uint key_version = mach_read_from_4( + src_frame + FIL_PAGE_FCRC32_KEY_VERSION); + lsn_t lsn = mach_read_from_8(src_frame + FIL_PAGE_LSN); + uint offset = mach_read_from_4(src_frame + FIL_PAGE_OFFSET); + + ut_ad(key_version != ENCRYPTION_KEY_NOT_ENCRYPTED); + + memcpy(tmp_frame, src_frame, FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION); + + /* Calculate the offset where decryption starts */ + const byte* src = src_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION; + byte* dst = tmp_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION; + uint dstlen = 0; + bool corrupted = false; + uint size = buf_page_full_crc32_size(src_frame, NULL, &corrupted); + if (UNIV_UNLIKELY(corrupted)) { + return DB_DECRYPTION_FAILED; + } + + uint srclen = size - (FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + + FIL_PAGE_FCRC32_CHECKSUM); + + int rc = encryption_scheme_decrypt(src, srclen, dst, &dstlen, + crypt_data, key_version, + (uint) space, offset, lsn); + + if (rc != MY_AES_OK || dstlen != srclen) { + return DB_DECRYPTION_FAILED; + } + + /* Copy only checksum part in the trailer */ + memcpy(tmp_frame + srv_page_size - FIL_PAGE_FCRC32_CHECKSUM, + src_frame + srv_page_size - FIL_PAGE_FCRC32_CHECKSUM, + FIL_PAGE_FCRC32_CHECKSUM); + + srv_stats.pages_decrypted.inc(); + + return DB_SUCCESS; /* page was decrypted */ +} + +/** Decrypt a page for non full checksum format. 
+@param[in]	crypt_data		crypt_data
+@param[in]	tmp_frame		Temporary buffer
+@param[in]	physical_size		page size
+@param[in,out]	src_frame		Page to decrypt
+@return DB_SUCCESS or error */
+static dberr_t fil_space_decrypt_for_non_full_checksum(
+	fil_space_crypt_t*	crypt_data,
+	byte*			tmp_frame,
+	ulint			physical_size,
+	byte*			src_frame)
+{
+	uint key_version = mach_read_from_4(
+		src_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
+	bool page_compressed = (fil_page_get_type(src_frame)
+				== FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED);
+	uint offset = mach_read_from_4(src_frame + FIL_PAGE_OFFSET);
+	uint space = mach_read_from_4(
+		src_frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+	ib_uint64_t lsn = mach_read_from_8(src_frame + FIL_PAGE_LSN);
+
+	ut_ad(key_version != ENCRYPTION_KEY_NOT_ENCRYPTED);
+
+	/* read space & lsn */
+	uint header_len = FIL_PAGE_DATA;
+
+	if (page_compressed) {
+		header_len += FIL_PAGE_ENCRYPT_COMP_METADATA_LEN;
+	}
+
+	/* Copy FIL page header, it is not encrypted */
+	memcpy(tmp_frame, src_frame, header_len);
+
+	/* Calculate the offset where decryption starts */
+	const byte*	src = src_frame + header_len;
+	byte*		dst = tmp_frame + header_len;
+	uint32		dstlen = 0;
+	uint		srclen = uint(physical_size)
+		- header_len - FIL_PAGE_DATA_END;
+
+	if (page_compressed) {
+		srclen = mach_read_from_2(src_frame + FIL_PAGE_DATA);
+	}
+
+	int rc = encryption_scheme_decrypt(src, srclen, dst, &dstlen,
+					   crypt_data, key_version,
+					   space, offset, lsn);
+
+	if (! ((rc == MY_AES_OK) && ((ulint) dstlen == srclen))) {
+		return DB_DECRYPTION_FAILED;
+	}
+
+	/* For compressed tables we do not store the FIL header because
+	the whole page is not stored to the disk. In compressed tables only
+	the FIL header + compressed (and now encrypted) payload aligned
+	to sector boundary is written. */
+	if (!page_compressed) {
+		/* Copy FIL trailer */
+		memcpy(tmp_frame + physical_size - FIL_PAGE_DATA_END,
+		       src_frame + physical_size - FIL_PAGE_DATA_END,
+		       FIL_PAGE_DATA_END);
+	}
+
+	srv_stats.pages_decrypted.inc();
+
+	return DB_SUCCESS; /* page was decrypted */
+}
+
+/** Decrypt a page.
+@param[in]	space_id		tablespace id
+@param[in]	fsp_flags		Tablespace flags
+@param[in]	crypt_data		crypt_data
+@param[in]	tmp_frame		Temporary buffer
+@param[in]	physical_size		page size
+@param[in,out]	src_frame		Page to decrypt
+@retval DB_SUCCESS on success
+@retval DB_DECRYPTION_FAILED on error */
+dberr_t
+fil_space_decrypt(
+	uint32_t		space_id,
+	uint32_t		fsp_flags,
+	fil_space_crypt_t*	crypt_data,
+	byte*			tmp_frame,
+	ulint			physical_size,
+	byte*			src_frame)
+{
+	if (!crypt_data || !crypt_data->is_encrypted()) {
+		return DB_DECRYPTION_FAILED;
+	}
+
+	if (fil_space_t::full_crc32(fsp_flags)) {
+		return fil_space_decrypt_full_crc32(
+			space_id, crypt_data, tmp_frame, src_frame);
+	}
+
+	return fil_space_decrypt_for_non_full_checksum(crypt_data, tmp_frame,
+						       physical_size,
+						       src_frame);
+}
+
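+/* Editorial sketch (not upstream code): a typical caller dispatches
+through the dberr_t overload above, which picks the full_crc32 or
+non-full-checksum path from the tablespace flags:
+
+	if (fil_space_decrypt(space->id, space->flags, space->crypt_data,
+			      tmp, space->physical_size(), read_buf)
+	    != DB_SUCCESS) {
+		// treat the page as unreadable (DB_DECRYPTION_FAILED)
+	}
+
+where read_buf and tmp are hypothetical page-sized, aligned buffers. */
+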
+/**
+Decrypt a page.
+@param[in]	space			Tablespace
+@param[in]	tmp_frame		Temporary buffer used for decrypting
+@param[in,out]	src_frame		Page to decrypt
+@return decrypted page, or original not encrypted page if decryption is
+not needed.
+@retval nullptr on failure */
+byte*
+fil_space_decrypt(
+	const fil_space_t* space,
+	byte*		tmp_frame,
+	byte*		src_frame)
+{
+	const ulint physical_size = space->physical_size();
+
+	ut_ad(space->referenced());
+
+	if (DB_SUCCESS != fil_space_decrypt(space->id, space->flags,
+					    space->crypt_data,
+					    tmp_frame, physical_size,
+					    src_frame)) {
+		return nullptr;
+	}
+
+	/* Copy the decrypted page back to page buffer, not
+	really any other options. */
+	return static_cast<byte*>(memcpy(src_frame, tmp_frame,
+					 physical_size));
+}
+
+/***********************************************************************/
+
+/** A copy of global key state */
+struct key_state_t {
+	key_state_t() : key_id(0), key_version(0),
+			rotate_key_age(srv_fil_crypt_rotate_key_age) {}
+	bool operator==(const key_state_t& other) const {
+		return key_version == other.key_version &&
+			rotate_key_age == other.rotate_key_age;
+	}
+	uint key_id;
+	uint key_version;
+	uint rotate_key_age;
+};
+
+/***********************************************************************
+Copy global key state
+@param[in,out]	new_state	key state
+@param[in]	crypt_data	crypt data */
+static void
+fil_crypt_get_key_state(
+	key_state_t*		new_state,
+	fil_space_crypt_t*	crypt_data)
+{
+	if (srv_encrypt_tables) {
+		new_state->key_version = crypt_data->key_get_latest_version();
+		new_state->rotate_key_age = srv_fil_crypt_rotate_key_age;
+
+		ut_a(new_state->key_version != ENCRYPTION_KEY_NOT_ENCRYPTED);
+	} else {
+		new_state->key_version = 0;
+		new_state->rotate_key_age = 0;
+	}
+}
+
+/***********************************************************************
+Check if a key needs rotation given a key_state
+@param[in]	crypt_data		Encryption information
+@param[in]	key_version		Current key version
+@param[in]	latest_key_version	Latest key version
+@param[in]	rotate_key_age		when to rotate
+@return true if key needs rotation, false if not */
+static bool
+fil_crypt_needs_rotation(
+	const fil_space_crypt_t*	crypt_data,
+	uint				key_version,
+	uint				latest_key_version,
+	uint				rotate_key_age)
+{
+	if (key_version == ENCRYPTION_KEY_VERSION_INVALID) {
+		return false;
+	}
+
+	if (key_version == 0 && latest_key_version != 0) {
+		/* this is rotation unencrypted => encrypted
+		* ignore rotate_key_age */
+		return true;
+	}
+
+	if (latest_key_version == 0 && key_version != 0) {
+		if (crypt_data->encryption == FIL_ENCRYPTION_DEFAULT) {
+			/* this is rotation encrypted => unencrypted */
+			return true;
+		}
+		return false;
+	}
+
+	if (crypt_data->encryption == FIL_ENCRYPTION_DEFAULT
+	    && crypt_data->type == CRYPT_SCHEME_1
+	    && !srv_encrypt_tables) {
+		/* This is rotation encrypted => unencrypted */
+		return true;
+	}
+
+	if (rotate_key_age == 0) {
+		return false;
+	}
+
+	/* this is rotation encrypted => encrypted,
+	* only reencrypt if key is sufficiently old */
+	if (key_version + rotate_key_age < latest_key_version) {
+		return true;
+	}
+
+	return false;
+}
+
+/** Read page 0 and possible crypt data from there.
+@param[in,out]	space	Tablespace */
+static inline void fil_crypt_read_crypt_data(fil_space_t *space)
+{
+  if (space->crypt_data || space->size || !space->get_size())
+    /* The encryption metadata has already been read, or the
+    tablespace is not encrypted and the file has been opened already,
+    or the file cannot be accessed, likely due to a concurrent DROP
+    (possibly as part of TRUNCATE or ALTER TABLE).
+
+    FIXME: The file can become inaccessible any time after this check!
+    We should really remove this function and instead make crypt_data
+    an integral part of fil_space_t.
*/ + return; + + const ulint zip_size= space->zip_size(); + mtr_t mtr; + mtr.start(); + if (buf_block_t* b= buf_page_get_gen(page_id_t{space->id, 0}, zip_size, + RW_S_LATCH, nullptr, + BUF_GET_POSSIBLY_FREED, &mtr)) + { + mysql_mutex_lock(&fil_system.mutex); + if (!space->crypt_data && !space->is_stopping()) + space->crypt_data= fil_space_read_crypt_data(zip_size, b->page.frame); + mysql_mutex_unlock(&fil_system.mutex); + } + mtr.commit(); +} + +/** Start encrypting a space +@param[in,out] space Tablespace +@return true if a recheck of tablespace is needed by encryption thread. */ +static bool fil_crypt_start_encrypting_space(fil_space_t* space) +{ + mysql_mutex_lock(&fil_crypt_threads_mutex); + + fil_space_crypt_t *crypt_data = space->crypt_data; + + /* If space is not encrypted and encryption is not enabled, then + do not continue encrypting the space. */ + if (!crypt_data && !srv_encrypt_tables) { +func_exit: + mysql_mutex_unlock(&fil_crypt_threads_mutex); + return false; + } + + const bool recheck = fil_crypt_start_converting; + + if (recheck || crypt_data || space->is_stopping()) { + mysql_mutex_unlock(&fil_crypt_threads_mutex); + return recheck; + } + + /* NOTE: we need to write and flush page 0 before publishing + * the crypt data. This so that after restart there is no + * risk of finding encrypted pages without having + * crypt data in page 0 */ + + /* 1 - create crypt data */ + crypt_data = fil_space_create_crypt_data( + FIL_ENCRYPTION_DEFAULT, FIL_DEFAULT_ENCRYPTION_KEY); + + if (!crypt_data) { + goto func_exit; + } + + fil_crypt_start_converting = true; + mysql_mutex_unlock(&fil_crypt_threads_mutex); + + mtr_t mtr; + mtr.start(); + + /* 2 - get page 0 */ + if (buf_block_t* block = buf_page_get_gen( + page_id_t(space->id, 0), space->zip_size(), + RW_X_LATCH, NULL, BUF_GET_POSSIBLY_FREED, &mtr)) { + crypt_data->type = CRYPT_SCHEME_1; + crypt_data->min_key_version = 0; // all pages are unencrypted + crypt_data->rotate_state.start_time = time(0); + crypt_data->rotate_state.starting = true; + crypt_data->rotate_state.active_threads = 1; + + mysql_mutex_lock(&fil_system.mutex); + const bool stopping = space->is_stopping(); + if (!stopping) { + space->crypt_data = crypt_data; + } + mysql_mutex_unlock(&fil_system.mutex); + + if (stopping) { + goto abort; + } + + /* 3 - write crypt data to page 0 */ + mtr.set_named_space(space); + crypt_data->write_page0(block, &mtr); + + mtr.commit(); + + /* 4 - sync tablespace before publishing crypt data */ + while (buf_flush_list_space(space)); + + /* 5 - publish crypt data */ + mysql_mutex_lock(&fil_crypt_threads_mutex); + mysql_mutex_lock(&crypt_data->mutex); + crypt_data->type = CRYPT_SCHEME_1; + ut_a(crypt_data->rotate_state.active_threads == 1); + crypt_data->rotate_state.active_threads = 0; + crypt_data->rotate_state.starting = false; + + fil_crypt_start_converting = false; + mysql_mutex_unlock(&fil_crypt_threads_mutex); + mysql_mutex_unlock(&crypt_data->mutex); + + return false; + } + +abort: + mtr.commit(); + mysql_mutex_lock(&fil_crypt_threads_mutex); + fil_crypt_start_converting = false; + mysql_mutex_unlock(&fil_crypt_threads_mutex); + + crypt_data->~fil_space_crypt_t(); + ut_free(crypt_data); + return false; +} + +/** State of a rotation thread */ +struct rotate_thread_t { + explicit rotate_thread_t(uint no) : thread_no(no) {} + + uint thread_no; + bool first = true; /*!< is position before first space */ + space_list_t::iterator space + = fil_system.space_list.end();/*!< current space or .end() */ + uint32_t offset = 0; /*!< current 
page number */
+	ulint	batch = 0;		/*!< #pages to rotate */
+	uint	min_key_version_found = 0; /*!< min key version found
+					but not rotated */
+	lsn_t	end_lsn = 0;		/*!< max lsn when rotating this
+					space */
+
+	uint	estimated_max_iops = 20;/*!< estimation of max iops */
+	uint	allocated_iops = 0;	/*!< allocated iops */
+	ulint	cnt_waited = 0;		/*!< #times waited during this slot */
+	uintmax_t sum_waited_us = 0;	/*!< wait time during this slot */
+
+	fil_crypt_stat_t crypt_stat; // statistics
+
+	/** @return whether this thread should terminate */
+	bool should_shutdown() const {
+		mysql_mutex_assert_owner(&fil_crypt_threads_mutex);
+		switch (srv_shutdown_state) {
+		case SRV_SHUTDOWN_NONE:
+			return thread_no >= srv_n_fil_crypt_threads;
+		case SRV_SHUTDOWN_EXIT_THREADS:
+			/* srv_init_abort() must have been invoked */
+		case SRV_SHUTDOWN_CLEANUP:
+		case SRV_SHUTDOWN_INITIATED:
+			return true;
+		case SRV_SHUTDOWN_LAST_PHASE:
+			break;
+		}
+		ut_ad(0);
+		return true;
+	}
+};
+
+/** Check whether a tablespace is to be removed from
+default_encrypt_tables. It must be kept in the list while
+1) another active encryption thread is working on the tablespace,
+2) the tablespace is still eligible for key rotation, or
+3) the tablespace is in the flushing phase.
+@return true if the tablespace should be removed from
+default encrypt */
+static bool fil_crypt_must_remove(const fil_space_t &space)
+{
+  ut_ad(space.purpose == FIL_TYPE_TABLESPACE);
+  fil_space_crypt_t *crypt_data = space.crypt_data;
+  mysql_mutex_assert_owner(&fil_system.mutex);
+  const ulong encrypt_tables= srv_encrypt_tables;
+  if (!crypt_data)
+    return !encrypt_tables;
+  if (!crypt_data->is_key_found())
+    return true;
+
+  mysql_mutex_lock(&crypt_data->mutex);
+  const bool remove= (space.is_stopping() || crypt_data->not_encrypted()) &&
+    (!crypt_data->rotate_state.flushing &&
+     !encrypt_tables == !!crypt_data->min_key_version &&
+     !crypt_data->rotate_state.active_threads);
+  mysql_mutex_unlock(&crypt_data->mutex);
+  return remove;
+}
+
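+/* Editorial example (not upstream): fil_crypt_needs_rotation() above
+re-encrypts a page only once its key version is sufficiently old, via
+the check key_version + rotate_key_age < latest_key_version. With
+innodb_encryption_rotate_key_age=100, a page written under key
+version 1 becomes eligible for rotation once the encryption plugin
+reports version 102 or later. */
+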
+/***********************************************************************
+Check if space needs rotation given a key_state
+@param[in,out]	state		Key rotation state
+@param[in,out]	key_state	Key state
+@param[in,out]	recheck		needs recheck?
+@return true if space needs key rotation */
+static
+bool
+fil_crypt_space_needs_rotation(
+	rotate_thread_t*	state,
+	key_state_t*		key_state,
+	bool*			recheck)
+{
+	mysql_mutex_assert_not_owner(&fil_crypt_threads_mutex);
+
+	fil_space_t* space = &*state->space;
+
+	ut_ad(space->referenced());
+	ut_ad(space->purpose == FIL_TYPE_TABLESPACE);
+
+	fil_space_crypt_t *crypt_data = space->crypt_data;
+
+	if (crypt_data == NULL) {
+		/**
+		* space has no crypt data
+		* start encrypting it...
+		*/
+		*recheck = fil_crypt_start_encrypting_space(space);
+		crypt_data = space->crypt_data;
+
+		if (crypt_data == NULL) {
+			return false;
+		}
+
+		crypt_data->key_get_latest_version();
+	}
+
+	/* If the used key_id is not found from the encryption plugin
+	we can't continue to rotate the tablespace */
+	if (!crypt_data->is_key_found()) {
+		return false;
+	}
+
+	bool need_key_rotation = false;
+
+	mysql_mutex_lock(&crypt_data->mutex);
+
+	do {
+		/* prevent threads from starting to rotate space */
+		if (crypt_data->rotate_state.starting) {
+			/* recheck this space later */
+			*recheck = true;
+			break;
+		}
+
+		/* prevent threads from starting to rotate space */
+		if (space->is_stopping()) {
+			break;
+		}
+
+		if (crypt_data->rotate_state.flushing) {
+			break;
+		}
+
+		/* No need to rotate space if encryption is disabled */
+		if (crypt_data->not_encrypted()) {
+			break;
+		}
+
+		if (crypt_data->key_id != key_state->key_id) {
+			key_state->key_id= crypt_data->key_id;
+			fil_crypt_get_key_state(key_state, crypt_data);
+		}
+
+		need_key_rotation = fil_crypt_needs_rotation(
+			crypt_data,
+			crypt_data->min_key_version,
+			key_state->key_version,
+			key_state->rotate_key_age);
+	} while (0);
+
+	mysql_mutex_unlock(&crypt_data->mutex);
+	return need_key_rotation;
+}
+
+/***********************************************************************
+Update global statistics with thread statistics
+@param[in,out]	state	key rotation statistics */
+static void
+fil_crypt_update_total_stat(
+	rotate_thread_t *state)
+{
+	mysql_mutex_lock(&crypt_stat_mutex);
+	crypt_stat.pages_read_from_cache +=
+		state->crypt_stat.pages_read_from_cache;
+	crypt_stat.pages_read_from_disk +=
+		state->crypt_stat.pages_read_from_disk;
+	crypt_stat.pages_modified += state->crypt_stat.pages_modified;
+	crypt_stat.pages_flushed += state->crypt_stat.pages_flushed;
+	// remove old estimate
+	crypt_stat.estimated_iops -= state->crypt_stat.estimated_iops;
+	// add new estimate
+	crypt_stat.estimated_iops += state->estimated_max_iops;
+	mysql_mutex_unlock(&crypt_stat_mutex);
+
+	// make new estimate "current" estimate
+	state->crypt_stat.pages_read_from_cache = 0;
+	state->crypt_stat.pages_read_from_disk = 0;
+	state->crypt_stat.pages_modified = 0;
+	state->crypt_stat.pages_flushed = 0;
+	// record our old (current) estimate
+	state->crypt_stat.estimated_iops = state->estimated_max_iops;
+}
+
+/***********************************************************************
+Allocate iops to thread from global setting,
+used before starting to rotate a space.
+@param[in,out]	state	Rotation state
+@return true if allocation succeeded, false if failed */
+static bool fil_crypt_alloc_iops(rotate_thread_t *state)
+{
+	mysql_mutex_assert_owner(&fil_crypt_threads_mutex);
+	ut_ad(state->allocated_iops == 0);
+
+	/* We have not yet selected the space to rotate, thus
+	state might not contain space and we can't check
+	its status yet.
*/ + + uint max_iops = state->estimated_max_iops; + + if (n_fil_crypt_iops_allocated >= srv_n_fil_crypt_iops) { +wait: + my_cond_wait(&fil_crypt_threads_cond, + &fil_crypt_threads_mutex.m_mutex); + return false; + } + + uint alloc = srv_n_fil_crypt_iops - n_fil_crypt_iops_allocated; + + if (alloc > max_iops) { + alloc = max_iops; + } + + if (!alloc) { + goto wait; + } + + n_fil_crypt_iops_allocated += alloc; + + state->allocated_iops = alloc; + return true; +} + +/** +Reallocate iops to thread when processing a tablespace +@param[in,out] state Rotation state +@return whether the thread should continue running */ +static bool fil_crypt_realloc_iops(rotate_thread_t *state) +{ + ut_a(state->allocated_iops > 0); + + if (10 * state->cnt_waited > state->batch) { + /* if we waited more than 10% re-estimate max_iops */ + ulint avg_wait_time_us = + ulint(state->sum_waited_us / state->cnt_waited); + + if (avg_wait_time_us == 0) { + avg_wait_time_us = 1; // prevent division by zero + } + + DBUG_PRINT("ib_crypt", + ("thr_no: %u - update estimated_max_iops from %u to " + ULINTPF ".", + state->thread_no, + state->estimated_max_iops, + 1000000 / avg_wait_time_us)); + + state->estimated_max_iops = std::max( + 1U, uint(1000000 / avg_wait_time_us)); + state->cnt_waited = 0; + state->sum_waited_us = 0; + } else { + DBUG_PRINT("ib_crypt", + ("thr_no: %u only waited " ULINTPF + "%% skip re-estimate.", + state->thread_no, + (100 * state->cnt_waited) + / (state->batch ? state->batch : 1))); + } + + ut_ad(state->estimated_max_iops); + + mysql_mutex_lock(&fil_crypt_threads_mutex); + + if (state->should_shutdown()) { + mysql_mutex_unlock(&fil_crypt_threads_mutex); + return false; + } + + if (state->allocated_iops > state->estimated_max_iops) { + /* release iops */ + uint extra = state->allocated_iops - state->estimated_max_iops; + state->allocated_iops = state->estimated_max_iops; + ut_ad(n_fil_crypt_iops_allocated >= extra); + n_fil_crypt_iops_allocated -= extra; + pthread_cond_broadcast(&fil_crypt_threads_cond); + } else if (srv_n_fil_crypt_iops > n_fil_crypt_iops_allocated) { + /* there are extra iops free */ + uint add = srv_n_fil_crypt_iops - n_fil_crypt_iops_allocated; + if (state->allocated_iops + add > state->estimated_max_iops) { + /* but don't alloc more than our max */ + add= state->estimated_max_iops - state->allocated_iops; + } + n_fil_crypt_iops_allocated += add; + state->allocated_iops += add; + + DBUG_PRINT("ib_crypt", + ("thr_no: %u increased iops from %u to %u.", + state->thread_no, + state->allocated_iops - add, + state->allocated_iops)); + } + + fil_crypt_update_total_stat(state); + mysql_mutex_unlock(&fil_crypt_threads_mutex); + return true; +} + +/** Release excess allocated iops +@param state rotation state +@param wake whether to wake up other threads */ +static void fil_crypt_return_iops(rotate_thread_t *state, bool wake= true) +{ + mysql_mutex_assert_owner(&fil_crypt_threads_mutex); + + if (uint iops= state->allocated_iops) + { + ut_ad(n_fil_crypt_iops_allocated >= iops); + n_fil_crypt_iops_allocated-= iops; + state->allocated_iops= 0; + if (wake) + pthread_cond_broadcast(&fil_crypt_threads_cond); + } + + fil_crypt_update_total_stat(state); +} + +/** Acquire a tablespace reference. 
+@return whether a tablespace reference was successfully acquired */
+inline bool fil_space_t::acquire_if_not_stopped()
+{
+  mysql_mutex_assert_owner(&fil_system.mutex);
+  const uint32_t n= acquire_low();
+  if (UNIV_LIKELY(!(n & (STOPPING | CLOSING))))
+    return true;
+  if (UNIV_UNLIKELY(n & STOPPING))
+    return false;
+  return UNIV_LIKELY(!(n & CLOSING)) || prepare_acquired();
+}
+
+bool fil_crypt_must_default_encrypt()
+{
+  return !srv_fil_crypt_rotate_key_age || !srv_encrypt_rotate;
+}
+
+/** Return the next tablespace from default_encrypt_tables list.
+@param space   previous tablespace (nullptr to start from the start)
+@param recheck whether the removal condition needs to be rechecked after
+the encryption parameters were changed
+@param encrypt expected state of innodb_encrypt_tables
+@return the next tablespace to process (n_pending_ops incremented)
+@retval fil_system.temp_space if there is no work to do
+@retval nullptr upon reaching the end of the iteration */
+inline fil_space_t *fil_system_t::default_encrypt_next(fil_space_t *space,
+                                                       bool recheck,
+                                                       bool encrypt)
+{
+  mysql_mutex_assert_owner(&mutex);
+
+  auto it= space && space->is_in_default_encrypt
+    ? sized_ilist<fil_space_t, rotation_list_tag_t>::iterator(space)
+    : default_encrypt_tables.begin();
+  const auto end= default_encrypt_tables.end();
+
+  if (space)
+  {
+    const bool released= !space->release();
+
+    if (space->is_in_default_encrypt)
+    {
+      while (++it != end &&
+             (!UT_LIST_GET_LEN(it->chain) || it->is_stopping()));
+
+      /* If one of the encryption threads already started
+      the encryption of the table then don't remove the
+      unencrypted spaces from default encrypt list.
+
+      If there is a change in innodb_encrypt_tables variables
+      value then don't remove the last processed tablespace
+      from the default encrypt list. */
+      if (released && !recheck && fil_crypt_must_remove(*space))
+      {
+        ut_a(!default_encrypt_tables.empty());
+        default_encrypt_tables.remove(*space);
+        space->is_in_default_encrypt= false;
+      }
+    }
+  }
+  else while (it != end &&
+              (!UT_LIST_GET_LEN(it->chain) || it->is_stopping()))
+  {
+    /* Find the next suitable default encrypt table if
+    beginning of default_encrypt_tables list has been scheduled
+    to be deleted */
+    it++;
+  }
+
+  if (it == end)
+    return temp_space;
+
+  do
+  {
+    space= &*it;
+    if (space->acquire_if_not_stopped())
+      return space;
+    if (++it == end)
+      return nullptr;
+  }
+  while (!UT_LIST_GET_LEN(it->chain) || it->is_stopping());
+
+  return nullptr;
+}
+
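+/* Editorial note (not upstream): default_encrypt_next() uses two
+distinct sentinels: fil_system.temp_space means "no work to do", while
+nullptr means the end of the iteration was reached. fil_space_t::next()
+below converts nullptr into space_list.end() and passes the temp_space
+sentinel through, so that fil_crypt_find_space_to_rotate() can tell the
+two cases apart. */
+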
+/** Determine the next tablespace for encryption key rotation.
+@param space   current tablespace (nullptr to start from the beginning)
+@param recheck whether the removal condition needs to be rechecked after
+encryption parameters were changed
+@param encrypt expected state of innodb_encrypt_tables
+@return the next tablespace
+@retval fil_system.temp_space if there is no work to do
+@retval end() upon reaching the end of the iteration */
+space_list_t::iterator fil_space_t::next(space_list_t::iterator space,
+                                         bool recheck, bool encrypt)
+{
+  mysql_mutex_lock(&fil_system.mutex);
+
+  if (fil_crypt_must_default_encrypt())
+  {
+    fil_space_t *next_space=
+      fil_system.default_encrypt_next(space == fil_system.space_list.end()
+                                      ? nullptr : &*space, recheck, encrypt);
+    space= next_space
+      ? space_list_t::iterator(next_space)
+      : fil_system.space_list.end();
+  }
+  else
+  {
+    if (space == fil_system.space_list.end())
+      space= fil_system.space_list.begin();
+    else
+    {
+      /* Move on to the next fil_space_t */
+      space->release();
+      ++space;
+    }
+
+    for (; space != fil_system.space_list.end(); ++space)
+    {
+      if (space->purpose != FIL_TYPE_TABLESPACE)
+        continue;
+      const uint32_t n= space->acquire_low();
+      if (UNIV_LIKELY(!(n & (STOPPING | CLOSING))))
+        break;
+      if (!(n & STOPPING) && space->prepare_acquired())
+        break;
+    }
+  }
+
+  mysql_mutex_unlock(&fil_system.mutex);
+  return space;
+}
+
+/** Search for a space needing rotation
+@param[in,out]	key_state	Key state
+@param[in,out]	state		Rotation state
+@param[in,out]	recheck		whether the tablespace needs to be
+				rechecked later (an encryption thread
+				is still writing page 0)
+@return whether the thread should keep running */
+static bool fil_crypt_find_space_to_rotate(
+	key_state_t*		key_state,
+	rotate_thread_t*	state,
+	bool*			recheck)
+{
+	/* we need iops to start rotating */
+	do {
+		if (state->should_shutdown()) {
+			if (state->space != fil_system.space_list.end()) {
+				state->space->release();
+				state->space = fil_system.space_list.end();
+			}
+			return false;
+		}
+	} while (!fil_crypt_alloc_iops(state));
+
+	if (state->first) {
+		state->first = false;
+		if (state->space != fil_system.space_list.end()) {
+			state->space->release();
+		}
+		state->space = fil_system.space_list.end();
+	}
+
+	state->space = fil_space_t::next(state->space, *recheck,
+					 key_state->key_version != 0);
+
+	bool wake = true;
+	while (state->space != fil_system.space_list.end()) {
+		if (state->space
+		    == space_list_t::iterator(fil_system.temp_space)) {
+			wake = false;
+			goto done;
+		}
+
+		if (state->should_shutdown()) {
+			state->space->release();
+done:
+			state->space = fil_system.space_list.end();
+			break;
+		}
+
+		mysql_mutex_unlock(&fil_crypt_threads_mutex);
+		/* If there is no crypt data and we have not yet read
+		page 0 for this tablespace, we need to read it before
+		we can continue.
*/ + if (!state->space->crypt_data) { + fil_crypt_read_crypt_data(&*state->space); + } + + if (fil_crypt_space_needs_rotation(state, key_state, recheck)) { + ut_ad(key_state->key_id); + /* init state->min_key_version_found before + * starting on a space */ + state->min_key_version_found = key_state->key_version; + mysql_mutex_lock(&fil_crypt_threads_mutex); + return true; + } + + state->space = fil_space_t::next(state->space, *recheck, + key_state->key_version != 0); + mysql_mutex_lock(&fil_crypt_threads_mutex); + } + + /* no work to do; release our allocation of I/O capacity */ + fil_crypt_return_iops(state, wake); + return true; +} + +/*********************************************************************** +Start rotating a space +@param[in] key_state Key state +@param[in,out] state Rotation state */ +static +void +fil_crypt_start_rotate_space( + const key_state_t* key_state, + rotate_thread_t* state) +{ + fil_space_crypt_t *crypt_data = state->space->crypt_data; + + ut_ad(crypt_data); + mysql_mutex_lock(&crypt_data->mutex); + ut_ad(key_state->key_id == crypt_data->key_id); + + if (crypt_data->rotate_state.active_threads == 0) { + /* only first thread needs to init */ + crypt_data->rotate_state.next_offset = 1; // skip page 0 + /* no need to rotate beyond current max + * if space extends, it will be encrypted with newer version */ + /* FIXME: max_offset could be removed and instead + space->size consulted.*/ + crypt_data->rotate_state.max_offset = state->space->size; + crypt_data->rotate_state.end_lsn = 0; + crypt_data->rotate_state.min_key_version_found = + key_state->key_version; + + crypt_data->rotate_state.start_time = time(0); + + if (crypt_data->type == CRYPT_SCHEME_UNENCRYPTED && + crypt_data->is_encrypted() && + key_state->key_version != 0) { + /* this is rotation unencrypted => encrypted */ + crypt_data->type = CRYPT_SCHEME_1; + } + } + + /* count active threads in space */ + crypt_data->rotate_state.active_threads++; + + /* Initialize thread local state */ + state->end_lsn = crypt_data->rotate_state.end_lsn; + state->min_key_version_found = + crypt_data->rotate_state.min_key_version_found; + + mysql_mutex_unlock(&crypt_data->mutex); +} + +/*********************************************************************** +Search for batch of pages needing rotation +@param[in] key_state Key state +@param[in,out] state Rotation state +@return true if page needing key rotation found, false if not found */ +static +bool +fil_crypt_find_page_to_rotate( + const key_state_t* key_state, + rotate_thread_t* state) +{ + ulint batch = srv_alloc_time * state->allocated_iops; + + ut_ad(state->space == fil_system.space_list.end() + || state->space->referenced()); + + /* If space is marked to be dropped stop rotation. 
*/
+	if (state->space == fil_system.space_list.end()
+	    || state->space->is_stopping()) {
+		return false;
+	}
+
+	fil_space_crypt_t *crypt_data = state->space->crypt_data;
+
+	mysql_mutex_lock(&crypt_data->mutex);
+	ut_ad(key_state->key_id == crypt_data->key_id);
+
+	bool found = crypt_data->rotate_state.max_offset >=
+		crypt_data->rotate_state.next_offset;
+
+	if (found) {
+		state->offset = crypt_data->rotate_state.next_offset;
+		ulint remaining = crypt_data->rotate_state.max_offset -
+			crypt_data->rotate_state.next_offset;
+
+		if (batch <= remaining) {
+			state->batch = batch;
+		} else {
+			state->batch = remaining;
+		}
+	}
+
+	crypt_data->rotate_state.next_offset += uint32_t(batch);
+	mysql_mutex_unlock(&crypt_data->mutex);
+	return found;
+}
+
+/***********************************************************************
+Get a page and compute sleep time
+@param[in,out]	state		Rotation state
+@param[in]	offset		Page offset
+@param[in,out]	mtr		Mini-transaction
+@param[out]	sleeptime_ms	Sleep time
+@return page or NULL */
+static
+buf_block_t*
+fil_crypt_get_page_throttle(
+	rotate_thread_t*	state,
+	uint32_t		offset,
+	mtr_t*			mtr,
+	ulint*			sleeptime_ms)
+{
+	fil_space_t* space = &*state->space;
+	const ulint zip_size = space->zip_size();
+	const page_id_t page_id(space->id, offset);
+	ut_ad(space->referenced());
+
+	/* Before reading from tablespace we need to make sure that
+	the tablespace is not about to be dropped. */
+	if (space->is_stopping()) {
+		return NULL;
+	}
+
+	buf_block_t* block = buf_page_get_gen(page_id, zip_size, RW_X_LATCH,
+					      NULL,
+					      BUF_PEEK_IF_IN_POOL, mtr);
+	if (block != NULL) {
+		/* page was in buffer pool */
+		state->crypt_stat.pages_read_from_cache++;
+		return block;
+	}
+
+	if (space->is_stopping()) {
+		return NULL;
+	}
+
+	if (offset % (zip_size ? zip_size : srv_page_size)
+	    && DB_SUCCESS_LOCKED_REC
+	    != fseg_page_is_allocated(space, offset)) {
+		/* page is already freed */
+		return NULL;
+	}
+
+	state->crypt_stat.pages_read_from_disk++;
+
+	const ulonglong start = my_interval_timer();
+	block = buf_page_get_gen(page_id, zip_size,
+				 RW_X_LATCH,
+				 NULL, BUF_GET_POSSIBLY_FREED, mtr);
+	const ulonglong end = my_interval_timer();
+
+	state->cnt_waited++;
+
+	if (end > start) {
+		state->sum_waited_us += (end - start) / 1000;
+	}
+
+	/* average page load */
+	ulint add_sleeptime_ms = 0;
+	ulint avg_wait_time_us = ulint(state->sum_waited_us
+				       / state->cnt_waited);
+	ulint alloc_wait_us = 1000000 / state->allocated_iops;
+
+	if (avg_wait_time_us < alloc_wait_us) {
+		/* we are reading faster than the allocated rate */
+		add_sleeptime_ms = (alloc_wait_us - avg_wait_time_us) / 1000;
+	} else {
+		/* if page load time is longer than we want, skip sleeping */
+	}
+
+	*sleeptime_ms += add_sleeptime_ms;
+
+	return block;
+}
+
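+/* Editorial note (not upstream): the throttle above targets
+state->allocated_iops page reads per second. For example, with
+allocated_iops=100 each read has a budget of alloc_wait_us =
+1000000/100 = 10000us; whenever the running average read time stays
+below that budget, the difference is accumulated into sleeptime_ms and
+slept off at the end of fil_crypt_rotate_page() below. */
+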
+/***********************************************************************
+Rotate one page
+@param[in,out]	key_state	Key state
+@param[in,out]	state		Rotation state */
+static
+void
+fil_crypt_rotate_page(
+	const key_state_t*	key_state,
+	rotate_thread_t*	state)
+{
+	fil_space_t *space = &*state->space;
+	ulint space_id = space->id;
+	uint32_t offset = state->offset;
+	ulint sleeptime_ms = 0;
+	fil_space_crypt_t *crypt_data = space->crypt_data;
+
+	ut_ad(space->referenced());
+	ut_ad(offset > 0);
+
+	/* In fil_crypt_thread where key rotation is done we have
+	acquired space and checked that this space is not yet
+	marked to be dropped. Similarly, in fil_crypt_find_page_to_rotate().
+	Check here also to give DROP TABLE or similar a chance. */
+	if (space->is_stopping()) {
+		return;
+	}
+
+	if (space_id == TRX_SYS_SPACE && offset == TRX_SYS_PAGE_NO) {
+		/* don't encrypt this as it contains address to dblwr buffer */
+		return;
+	}
+
+	mtr_t mtr;
+	mtr.start();
+	if (buf_block_t* block = fil_crypt_get_page_throttle(state,
+							     offset, &mtr,
+							     &sleeptime_ms)) {
+		bool modified = false;
+		byte* frame = buf_block_get_frame(block);
+		const lsn_t block_lsn = mach_read_from_8(FIL_PAGE_LSN + frame);
+		uint kv = buf_page_get_key_version(frame, space->flags);
+
+		if (block->page.oldest_modification() > 1) {
+			/* Do not unnecessarily touch pages that are
+			already dirty. */
+		} else if (space->is_stopping()) {
+			/* The tablespace is closing (in DROP TABLE or
+			TRUNCATE TABLE or similar): avoid further access */
+		} else if (!kv && !*reinterpret_cast<uint16_t*>
+			   (&frame[FIL_PAGE_TYPE])) {
+			/* It looks like this page is not
+			allocated. Because key rotation is accessing
+			pages in a pattern that is unlike the normal
+			B-tree and undo log access pattern, we cannot
+			invoke fseg_page_is_allocated() here, because that
+			could result in a deadlock. If we invoked
+			fseg_page_is_allocated() and released the
+			tablespace latch before acquiring block->lock,
+			then the fseg_page_is_allocated() information
+			could be stale already. */
+
+			/* If the data file was originally created
+			before MariaDB 10.0 or MySQL 5.6, some
+			allocated data pages could carry 0 in
+			FIL_PAGE_TYPE. The FIL_PAGE_TYPE on those
+			pages will be updated in
+			buf_flush_init_for_writing() when the page
+			is modified the next time.
+
+			Also, when the doublewrite buffer pages are
+			allocated on bootstrap in a non-debug build,
+			some dummy pages will be allocated, with 0 in
+			the FIL_PAGE_TYPE. Those pages should be
+			skipped from key rotation forever. */
+		} else if (fil_crypt_needs_rotation(
+				   crypt_data,
+				   kv,
+				   key_state->key_version,
+				   key_state->rotate_key_age)) {
+
+			mtr.set_named_space(space);
+			modified = true;
+
+			/* force rotation by dummy updating page */
+			mtr.write<1,mtr_t::FORCED>(*block,
+						   &frame[FIL_PAGE_SPACE_ID],
+						   frame[FIL_PAGE_SPACE_ID]);
+
+			/* statistics */
+			state->crypt_stat.pages_modified++;
+		} else {
+			if (crypt_data->is_encrypted()) {
+				if (kv < state->min_key_version_found) {
+					state->min_key_version_found = kv;
+				}
+			}
+		}
+
+		mtr.commit();
+		lsn_t end_lsn = mtr.commit_lsn();
+
+		if (modified) {
+			/* if we modified page, we take lsn from mtr */
+			ut_a(end_lsn > state->end_lsn);
+			ut_a(end_lsn > block_lsn);
+			state->end_lsn = end_lsn;
+		} else {
+			/* if we did not modify page, check for max lsn */
+			if (block_lsn > state->end_lsn) {
+				state->end_lsn = block_lsn;
+			}
+		}
+	} else {
+		/* If block read failed mtr memo and log should be empty.
*/ + ut_ad(!mtr.has_modifications()); + ut_ad(mtr.is_empty()); + mtr.commit(); + } + + if (sleeptime_ms) { + mysql_mutex_lock(&fil_crypt_threads_mutex); + timespec abstime; + set_timespec_nsec(abstime, 1000000ULL * sleeptime_ms); + my_cond_timedwait(&fil_crypt_throttle_sleep_cond, + &fil_crypt_threads_mutex.m_mutex, &abstime); + mysql_mutex_unlock(&fil_crypt_threads_mutex); + } +} + +/*********************************************************************** +Rotate a batch of pages +@param[in,out] key_state Key state +@param[in,out] state Rotation state */ +static +void +fil_crypt_rotate_pages( + const key_state_t* key_state, + rotate_thread_t* state) +{ + const uint32_t space_id = state->space->id; + uint32_t end = std::min(state->offset + uint32_t(state->batch), + state->space->free_limit); + + ut_ad(state->space->referenced()); + + for (; state->offset < end; state->offset++) { + + /* we can't rotate pages in dblwr buffer as + * it's not possible to read those due to lots of asserts + * in buffer pool. + * + * However since these are only (short-lived) copies of + * real pages, they will be updated anyway when the + * real page is updated + */ + if (buf_dblwr.is_inside(page_id_t(space_id, state->offset))) { + continue; + } + + /* If space is marked as stopping, stop rotating + pages. */ + if (state->space->is_stopping()) { + break; + } + + fil_crypt_rotate_page(key_state, state); + } +} + +/*********************************************************************** +Flush rotated pages and then update page 0 + +@param[in,out] state rotation state */ +static +void +fil_crypt_flush_space( + rotate_thread_t* state) +{ + fil_space_t* space = &*state->space; + fil_space_crypt_t *crypt_data = space->crypt_data; + + ut_ad(space->referenced()); + + /* flush tablespace pages so that there are no pages left with old key */ + lsn_t end_lsn = crypt_data->rotate_state.end_lsn; + + if (end_lsn > 0 && !space->is_stopping()) { + ulint sum_pages = 0; + const ulonglong start = my_interval_timer(); + while (buf_flush_list_space(space, &sum_pages)); + if (sum_pages) { + const ulonglong end = my_interval_timer(); + + state->cnt_waited += sum_pages; + state->sum_waited_us += (end - start) / 1000; + + /* statistics */ + state->crypt_stat.pages_flushed += sum_pages; + } + } + + if (crypt_data->min_key_version == 0) { + crypt_data->type = CRYPT_SCHEME_UNENCRYPTED; + } + + if (space->is_stopping()) { + return; + } + + /* update page 0 */ + mtr_t mtr; + mtr.start(); + + if (buf_block_t* block = buf_page_get_gen( + page_id_t(space->id, 0), space->zip_size(), + RW_X_LATCH, NULL, BUF_GET_POSSIBLY_FREED, &mtr)) { + mtr.set_named_space(space); + crypt_data->write_page0(block, &mtr); + } + + mtr.commit(); +} + +/*********************************************************************** +Complete rotating a space +@param[in,out] state Rotation state */ +static void fil_crypt_complete_rotate_space(rotate_thread_t* state) +{ + fil_space_crypt_t *crypt_data = state->space->crypt_data; + + ut_ad(crypt_data); + ut_ad(state->space->referenced()); + + mysql_mutex_lock(&crypt_data->mutex); + + /* Space might already be dropped */ + if (!state->space->is_stopping()) { + /** + * Update crypt data state with state from thread + */ + if (state->min_key_version_found < + crypt_data->rotate_state.min_key_version_found) { + crypt_data->rotate_state.min_key_version_found = + state->min_key_version_found; + } + + if (state->end_lsn > crypt_data->rotate_state.end_lsn) { + crypt_data->rotate_state.end_lsn = state->end_lsn; + } + + 
ut_a(crypt_data->rotate_state.active_threads > 0);
+		crypt_data->rotate_state.active_threads--;
+		bool last = crypt_data->rotate_state.active_threads == 0;
+
+		/**
+		* check if space is fully done
+		* this matters when threads shut down, as we could
+		* "complete" iterating before we have scanned the
+		* full space.
+		*/
+		bool done = crypt_data->rotate_state.next_offset >=
+			crypt_data->rotate_state.max_offset;
+
+		/**
+		* we should flush space if we're last thread AND
+		* the iteration is done
+		*/
+		bool should_flush = last && done;
+
+		if (should_flush) {
+			/* we're the last active thread */
+			crypt_data->rotate_state.flushing = true;
+			crypt_data->min_key_version =
+				crypt_data->rotate_state.min_key_version_found;
+			mysql_mutex_unlock(&crypt_data->mutex);
+			fil_crypt_flush_space(state);
+
+			mysql_mutex_lock(&crypt_data->mutex);
+			crypt_data->rotate_state.flushing = false;
+		}
+	} else {
+		ut_a(crypt_data->rotate_state.active_threads > 0);
+		crypt_data->rotate_state.active_threads--;
+	}
+
+	mysql_mutex_unlock(&crypt_data->mutex);
+}
+
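+/* Editorial summary (not upstream): each encryption thread runs the
+loop below: wait on fil_crypt_threads_cond for a key-state change, then
+repeatedly pick a tablespace (fil_crypt_find_space_to_rotate), claim a
+batch of pages (fil_crypt_find_page_to_rotate), rotate them, and
+finally flush the space and update page 0 via
+fil_crypt_complete_rotate_space(). */
+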
+ */
+ if (thr.space->is_stopping()) {
+ fil_crypt_complete_rotate_space(&thr);
+ thr.space->release();
+ thr.space = fil_system.space_list.end();
+ break;
+ }
+
+ fil_crypt_rotate_pages(&new_state, &thr);
+ /* realloc iops */
+ if (!fil_crypt_realloc_iops(&thr)) {
+ break;
+ }
+ }
+
+ /* complete rotation */
+ if (thr.space != fil_system.space_list.end()) {
+ fil_crypt_complete_rotate_space(&thr);
+ }
+
+ /* force key state refresh */
+ new_state.key_id = 0;
+
+ mysql_mutex_lock(&fil_crypt_threads_mutex);
+ /* release iops */
+ fil_crypt_return_iops(&thr);
+ }
+
+ if (thr.space != fil_system.space_list.end()) {
+ thr.space->release();
+ thr.space = fil_system.space_list.end();
+ }
+ }
+
+ fil_crypt_return_iops(&thr);
+ srv_n_fil_crypt_threads_started--;
+ pthread_cond_signal(&fil_crypt_cond); /* signal that we stopped */
+ mysql_mutex_unlock(&fil_crypt_threads_mutex);
+
+#ifdef UNIV_PFS_THREAD
+ pfs_delete_thread();
+#endif
+}
+
+/*********************************************************************
+Adjust thread count for key rotation
+@param[in] new_cnt Number of threads to be used */
+void fil_crypt_set_thread_cnt(const uint new_cnt)
+{
+ if (!fil_crypt_threads_inited) {
+ if (srv_shutdown_state != SRV_SHUTDOWN_NONE)
+ return;
+ fil_crypt_threads_init();
+ }
+
+ mysql_mutex_lock(&fil_crypt_threads_mutex);
+
+ if (new_cnt > srv_n_fil_crypt_threads) {
+ uint add = new_cnt - srv_n_fil_crypt_threads;
+ srv_n_fil_crypt_threads = new_cnt;
+ for (uint i = 0; i < add; i++) {
+ std::thread thd(fil_crypt_thread);
+ ib::info() << "Creating #"
+ << i+1 << " encryption thread id "
+ << thd.get_id()
+ << " total threads " << new_cnt << ".";
+ thd.detach();
+ }
+ } else if (new_cnt < srv_n_fil_crypt_threads) {
+ srv_n_fil_crypt_threads = new_cnt;
+ }
+
+ pthread_cond_broadcast(&fil_crypt_threads_cond);
+
+ while (srv_n_fil_crypt_threads_started != srv_n_fil_crypt_threads) {
+ my_cond_wait(&fil_crypt_cond,
+ &fil_crypt_threads_mutex.m_mutex);
+ }
+
+ pthread_cond_broadcast(&fil_crypt_threads_cond);
+ mysql_mutex_unlock(&fil_crypt_threads_mutex);
+}
+
+/** Initialize the tablespace default_encrypt_tables
+if innodb_encryption_rotate_key_age=0. */
+static void fil_crypt_default_encrypt_tables_fill()
+{
+ mysql_mutex_assert_owner(&fil_system.mutex);
+
+ for (fil_space_t& space : fil_system.space_list) {
+ if (space.purpose != FIL_TYPE_TABLESPACE
+ || space.is_in_default_encrypt
+ || UT_LIST_GET_LEN(space.chain) == 0
+ || !space.acquire_if_not_stopped()) {
+ continue;
+ }
+
+ /* Ensure that crypt_data has been initialized. */
+ ut_ad(space.size);
+
+ /* Skip ENCRYPTION!=DEFAULT tablespaces. */
+ if (space.crypt_data
+ && !space.crypt_data->is_default_encryption()) {
+ goto next;
+ }
+
+ if (srv_encrypt_tables) {
+ /* Skip encrypted tablespaces if
+ innodb_encrypt_tables!=OFF */
+ if (space.crypt_data
+ && space.crypt_data->min_key_version) {
+ goto next;
+ }
+ } else {
+ /* Skip unencrypted tablespaces if
+ innodb_encrypt_tables=OFF */
+ if (!space.crypt_data
+ || !space.crypt_data->min_key_version) {
+ goto next;
+ }
+ }
+
+ fil_system.default_encrypt_tables.push_back(space);
+ space.is_in_default_encrypt = true;
+next:
+ space.release();
+ }
+}
+
+/*********************************************************************
+Adjust max key age
+@param[in] val New max key age */
+void fil_crypt_set_rotate_key_age(uint val)
+{
+ mysql_mutex_lock(&fil_crypt_threads_mutex);
+ mysql_mutex_lock(&fil_system.mutex);
+ srv_fil_crypt_rotate_key_age= val;
+ if (val == 0)
+ fil_crypt_default_encrypt_tables_fill();
+ mysql_mutex_unlock(&fil_system.mutex);
+ pthread_cond_broadcast(&fil_crypt_threads_cond);
+ mysql_mutex_unlock(&fil_crypt_threads_mutex);
+}
+
+/*********************************************************************
+Adjust rotation iops
+@param[in] val New max rotation iops */
+void fil_crypt_set_rotation_iops(uint val)
+{
+ mysql_mutex_lock(&fil_crypt_threads_mutex);
+ srv_n_fil_crypt_iops= val;
+ pthread_cond_broadcast(&fil_crypt_threads_cond);
+ mysql_mutex_unlock(&fil_crypt_threads_mutex);
+}
+
+/*********************************************************************
+Adjust encrypt tables
+@param[in] val New setting for innodb-encrypt-tables */
+void fil_crypt_set_encrypt_tables(ulong val)
+{
+ if (!fil_crypt_threads_inited)
+ return;
+
+ mysql_mutex_lock(&fil_crypt_threads_mutex);
+
+ mysql_mutex_lock(&fil_system.mutex);
+ srv_encrypt_tables= val;
+
+ if (fil_crypt_must_default_encrypt())
+ fil_crypt_default_encrypt_tables_fill();
+
+ mysql_mutex_unlock(&fil_system.mutex);
+
+ pthread_cond_broadcast(&fil_crypt_threads_cond);
+ mysql_mutex_unlock(&fil_crypt_threads_mutex);
+}
+
+/*********************************************************************
+Init threads for key rotation */
+void fil_crypt_threads_init()
+{
+ if (!fil_crypt_threads_inited) {
+ pthread_cond_init(&fil_crypt_cond, nullptr);
+ pthread_cond_init(&fil_crypt_threads_cond, nullptr);
+ mysql_mutex_init(0, &fil_crypt_threads_mutex, nullptr);
+ uint cnt = srv_n_fil_crypt_threads;
+ srv_n_fil_crypt_threads = 0;
+ fil_crypt_threads_inited = true;
+ fil_crypt_set_thread_cnt(cnt);
+ }
+}
+
+/*********************************************************************
+Clean up key rotation threads resources */
+void fil_crypt_threads_cleanup()
+{
+ if (!fil_crypt_threads_inited) {
+ return;
+ }
+ ut_a(!srv_n_fil_crypt_threads_started);
+ pthread_cond_destroy(&fil_crypt_cond);
+ pthread_cond_destroy(&fil_crypt_threads_cond);
+ mysql_mutex_destroy(&fil_crypt_threads_mutex);
+ fil_crypt_threads_inited = false;
+}
+
+/*********************************************************************
+Wait for crypt threads to stop accessing space
+@param[in] space Tablespace */
+void fil_space_crypt_close_tablespace(const fil_space_t *space)
+{
+ fil_space_crypt_t* crypt_data = space->crypt_data;
+
+ if (!crypt_data || srv_n_fil_crypt_threads == 0
+ || !fil_crypt_threads_inited) {
+ return;
+ }
+
+ time_t start = time(0);
+ time_t last = start;
+
+ mysql_mutex_lock(&crypt_data->mutex);
+
+ while (crypt_data->rotate_state.active_threads
+ || crypt_data->rotate_state.flushing) {
+ mysql_mutex_unlock(&crypt_data->mutex);
+
+ /* wake up all throttle sleepers */
+ mysql_mutex_lock(&fil_crypt_threads_mutex);
+ pthread_cond_broadcast(&fil_crypt_throttle_sleep_cond);
+ pthread_cond_broadcast(&fil_crypt_threads_cond);
+ mysql_mutex_unlock(&fil_crypt_threads_mutex);
+
+ std::this_thread::sleep_for(std::chrono::milliseconds(20));
+
+ time_t now = time(0);
+
+ if (UNIV_UNLIKELY(now >= last + 30)) {
+ ib::warn() << "Waited "
+ << now - start
+ << " seconds to drop space: "
+ << space->chain.start->name << " ("
+ << space->id << ") active threads "
+ << crypt_data->rotate_state.active_threads
+ << " flushing="
+ << crypt_data->rotate_state.flushing << ".";
+ last = now;
+ }
+
+ mysql_mutex_lock(&crypt_data->mutex);
+ }
+
+ mysql_mutex_unlock(&crypt_data->mutex);
+}
+
+/*********************************************************************
+Get crypt status for a space (used by information_schema)
+@param[in] space Tablespace
+@param[out] status Crypt status */
+void
+fil_space_crypt_get_status(
+ const fil_space_t* space,
+ struct fil_space_crypt_status_t* status)
+{
+ memset(status, 0, sizeof(*status));
+
+ ut_ad(space->referenced());
+
+ /* If there is no crypt data and we have not yet read
+ page 0 for this tablespace, we need to read it before
+ we can continue. */
+ if (!space->crypt_data) {
+ fil_crypt_read_crypt_data(const_cast<fil_space_t*>(space));
+ }
+
+ status->space = ULINT_UNDEFINED;
+
+ if (fil_space_crypt_t* crypt_data = space->crypt_data) {
+ status->space = space->id;
+ mysql_mutex_lock(&crypt_data->mutex);
+ status->scheme = crypt_data->type;
+ status->keyserver_requests = crypt_data->keyserver_requests;
+ status->min_key_version = crypt_data->min_key_version;
+ status->key_id = crypt_data->key_id;
+
+ if (crypt_data->rotate_state.active_threads > 0 ||
+ crypt_data->rotate_state.flushing) {
+ status->rotating = true;
+ status->flushing =
+ crypt_data->rotate_state.flushing;
+ status->rotate_next_page_number =
+ crypt_data->rotate_state.next_offset;
+ status->rotate_max_page_number =
+ crypt_data->rotate_state.max_offset;
+ }
+
+ mysql_mutex_unlock(&crypt_data->mutex);
+
+ if (srv_encrypt_tables || crypt_data->min_key_version) {
+ status->current_key_version =
+ fil_crypt_get_latest_key_version(crypt_data);
+ }
+ }
+}
+
+/*********************************************************************
+Return crypt statistics
+@param[out] stat Crypt statistics */
+void fil_crypt_total_stat(fil_crypt_stat_t *stat)
+{
+ mysql_mutex_lock(&crypt_stat_mutex);
+ *stat = crypt_stat;
+ mysql_mutex_unlock(&crypt_stat_mutex);
+}
+
+#endif /* UNIV_INNOCHECKSUM */
+
+/**
+Verify that the post-encryption checksum matches the calculated checksum.
+This function should be called only if the tablespace contains crypt_data
+metadata (this is a strong indication that the tablespace is encrypted).
+The function also verifies that the traditional checksum does not match
+the calculated checksum, because if it does, the page could be valid
+unencrypted, encrypted, or corrupted.
+
+@param[in,out] page page frame (checksum is temporarily modified)
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@return true if page is encrypted AND OK, false otherwise */
+bool fil_space_verify_crypt_checksum(const byte* page, ulint zip_size)
+{
+ if (ENCRYPTION_KEY_NOT_ENCRYPTED == mach_read_from_4(
+ page + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION)) {
+ return false;
+ }
+
+ /* Compressed and encrypted pages do not have a checksum. Assume not
+ corrupted. Page verification happens after decompression in
+ buf_page_t::read_complete() using buf_page_is_corrupted(). */
+ if (fil_page_get_type(page) == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) {
+ return true;
+ }
+
+ /* Read the stored post-encryption checksum. */
+ const ib_uint32_t checksum = mach_read_from_4(
+ page + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + 4);
+
+ /* If the stored checksum matches one of the calculated checksums,
+ the page is not corrupted. */
+
+#ifndef UNIV_INNOCHECKSUM
+ switch (srv_checksum_algorithm) {
+ case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
+ case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
+#endif /* !UNIV_INNOCHECKSUM */
+ if (zip_size) {
+ return checksum == page_zip_calc_checksum(
+ page, zip_size, false);
+ }
+
+ return checksum == buf_calc_page_crc32(page);
+#ifndef UNIV_INNOCHECKSUM
+ default:
+ if (checksum == BUF_NO_CHECKSUM_MAGIC) {
+ return true;
+ }
+ if (zip_size) {
+ return checksum == page_zip_calc_checksum(
+ page, zip_size, false)
+ || checksum == page_zip_calc_checksum(
+ page, zip_size, true);
+ }
+
+ return checksum == buf_calc_page_crc32(page)
+ || checksum == buf_calc_page_new_checksum(page);
+ }
+#endif /* !UNIV_INNOCHECKSUM */
+}
diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc
new file mode 100644
index 00000000..8a88f4e2
--- /dev/null
+++ b/storage/innobase/fil/fil0fil.cc
@@ -0,0 +1,3282 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2021, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2014, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file fil/fil0fil.cc
+The tablespace memory cache
+
+Created 10/25/1995 Heikki Tuuri
+*******************************************************/
+
+#include "fil0fil.h"
+#include "fil0crypt.h"
+
+#include "btr0btr.h"
+#include "buf0buf.h"
+#include "dict0boot.h"
+#include "dict0dict.h"
+#include "dict0load.h"
+#include "fsp0file.h"
+#include "fsp0fsp.h"
+#include "hash0hash.h"
+#include "log0log.h"
+#include "log0recv.h"
+#include "mach0data.h"
+#include "mtr0log.h"
+#include "os0file.h"
+#include "page0zip.h"
+#include "row0mysql.h"
+#include "srv0start.h"
+#include "trx0purge.h"
+#include "buf0lru.h"
+#include "buf0flu.h"
+#include "log.h"
+#ifdef __linux__
+# include <dirent.h>
+# include <sys/stat.h>
+# include <sys/sysmacros.h>
+#endif
+
+#include "lz4.h"
+#include "lzo/lzo1x.h"
+#include "lzma.h"
+#include "bzlib.h"
+#include "snappy-c.h"
+
+ATTRIBUTE_COLD void fil_space_t::set_corrupted() const
+{
+ if (!is_stopping() && !is_corrupted.test_and_set())
+ sql_print_error("InnoDB: File '%s' is corrupted", chain.start->name);
+}
+
+/** Try to close a file to adhere to the innodb_open_files limit.
+@param print_info whether to diagnose why a file cannot be closed +@return whether a file was closed */ +bool fil_space_t::try_to_close(bool print_info) +{ + mysql_mutex_assert_owner(&fil_system.mutex); + for (fil_space_t &space : fil_system.space_list) + { + switch (space.purpose) { + case FIL_TYPE_TEMPORARY: + continue; + case FIL_TYPE_IMPORT: + break; + case FIL_TYPE_TABLESPACE: + if (is_predefined_tablespace(space.id)) + continue; + } + + /* We are using an approximation of LRU replacement policy. In + fil_node_open_file_low(), newly opened files are moved to the end + of fil_system.space_list, so that they would be less likely to be + closed here. */ + fil_node_t *node= UT_LIST_GET_FIRST(space.chain); + if (!node) + /* fil_ibd_create() did not invoke fil_space_t::add() yet */ + continue; + ut_ad(!UT_LIST_GET_NEXT(chain, node)); + + if (!node->is_open()) + continue; + + const auto n= space.set_closing(); + if (n & STOPPING) + /* Let fil_space_t::drop() in another thread handle this. */ + continue; + if (n & (PENDING | NEEDS_FSYNC)) + { + if (!print_info) + continue; + print_info= false; + const time_t now= time(nullptr); + if (now - fil_system.n_open_exceeded_time < 5) + continue; /* We display messages at most once in 5 seconds. */ + fil_system.n_open_exceeded_time= now; + + if (n & PENDING) + sql_print_information("InnoDB: Cannot close file %s because of " + UINT32PF " pending operations%s", node->name, + n & PENDING, + (n & NEEDS_FSYNC) ? " and pending fsync" : ""); + else if (n & NEEDS_FSYNC) + sql_print_information("InnoDB: Cannot close file %s because of " + "pending fsync", node->name); + continue; + } + + node->close(); + + fil_system.move_closed_last_to_space_list(node->space); + + return true; + } + + return false; +} + +/* + IMPLEMENTATION OF THE TABLESPACE MEMORY CACHE + ============================================= + +The tablespace cache is responsible for providing fast read/write access to +tablespaces and logs of the database. File creation and deletion is done +in other modules which know more of the logic of the operation, however. + +A tablespace consists of a chain of files. The size of the files does not +have to be divisible by the database block size, because we may just leave +the last incomplete block unused. When a new file is appended to the +tablespace, the maximum size of the file is also specified. At the moment, +we think that it is best to extend the file to its maximum size already at +the creation of the file, because then we can avoid dynamically extending +the file when more space is needed for the tablespace. + +A block's position in the tablespace is specified with a 32-bit unsigned +integer. The files in the chain are thought to be catenated, and the block +corresponding to an address n is the nth block in the catenated file (where +the first block is named the 0th block, and the incomplete block fragments +at the end of files are not taken into account). A tablespace can be extended +by appending a new file at the end of the chain. + +Our tablespace concept is similar to the one of Oracle. + +To acquire more speed in disk transfers, a technique called disk striping is +sometimes used. This means that logical block addresses are divided in a +round-robin fashion across several disks. Windows NT supports disk striping, +so there we do not need to support it in the database. Disk striping is +implemented in hardware in RAID disks. We conclude that it is not necessary +to implement it in the database. 
Oracle 7 does not support disk striping,
+either.
+
+Another trick used at some database sites is replacing tablespace files by
+raw disks, that is, the whole physical disk drive, or a partition of it, is
+opened as a single file, and it is accessed through byte offsets calculated
+from the start of the disk or the partition. This is recommended in some
+books on database tuning to achieve more speed in i/o. Using a raw disk
+certainly prevents the OS from fragmenting disk space, but it is not clear
+if it really adds speed. We measured on the Pentium 100 MHz + NT + NTFS file
+system + EIDE Conner disk only a negligible difference in speed when reading
+from a file, versus reading from a raw disk.
+
+To have fast access to a tablespace or a log file, we put the data structures
+into a hash table. Each tablespace and log file is given a unique 32-bit
+identifier. */
+
+/** Reference to the server data directory. Usually it is the
+current working directory ".", but in the MariaDB Embedded Server Library
+it is an absolute path. */
+const char* fil_path_to_mysql_datadir;
+
+/** Common InnoDB file extensions */
+const char* dot_ext[] = { "", ".ibd", ".isl", ".cfg" };
+
+/** Number of pending tablespace flushes */
+Atomic_counter<ulint> fil_n_pending_tablespace_flushes;
+
+/** The tablespace memory cache. This variable is NULL before the module is
+initialized. */
+fil_system_t fil_system;
+
+/** At this age or older a space/page will be rotated */
+extern uint srv_fil_crypt_rotate_key_age;
+
+#ifdef UNIV_DEBUG
+/** Try fil_validate() every this many times */
+# define FIL_VALIDATE_SKIP 17
+
+/******************************************************************//**
+Checks the consistency of the tablespace cache some of the time.
+@return true if ok or the check was skipped */
+static
+bool
+fil_validate_skip(void)
+/*===================*/
+{
+ /** The fil_validate() call skip counter. */
+ static Atomic_counter<uint32_t> fil_validate_count;
+
+ /* We want to reduce the call frequency of the costly fil_validate()
+ check in debug builds. */
+ return (fil_validate_count++ % FIL_VALIDATE_SKIP) || fil_validate();
+}
+#endif /* UNIV_DEBUG */
+
+/** Look up a tablespace.
+@param id tablespace identifier
+@return tablespace
+@retval nullptr if not found */
+fil_space_t *fil_space_get_by_id(uint32_t id)
+{
+ fil_space_t* space;
+
+ ut_ad(fil_system.is_initialised());
+ mysql_mutex_assert_owner(&fil_system.mutex);
+
+ HASH_SEARCH(hash, &fil_system.spaces, id,
+ fil_space_t*, space,, space->id == id);
+
+ return(space);
+}
+
+/** Look up a tablespace.
+The caller should hold an InnoDB table lock or an MDL that prevents
+the tablespace from being dropped during the operation,
+or the caller should be in single-threaded crash recovery mode
+(no user connections that could drop tablespaces).
+Normally, fil_space_t::get() should be used instead.
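+Unlike fil_space_t::get(), this function does not acquire a reference
+on the tablespace; it only performs the lookup while briefly holding
+fil_system.mutex.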
+@param[in] id tablespace ID
+@return tablespace, or NULL if not found */
+fil_space_t *fil_space_get(uint32_t id)
+{
+ mysql_mutex_lock(&fil_system.mutex);
+ fil_space_t *space= fil_space_get_by_id(id);
+ mysql_mutex_unlock(&fil_system.mutex);
+ return space;
+}
+
+/** Check if the compression algorithm is loaded
+@param[in] comp_algo compression algorithm
+@return whether the compression algorithm is loaded */
+bool fil_comp_algo_loaded(ulint comp_algo)
+{
+ switch (comp_algo) {
+ case PAGE_UNCOMPRESSED:
+ case PAGE_ZLIB_ALGORITHM:
+ return true;
+
+ case PAGE_LZ4_ALGORITHM:
+ return provider_service_lz4->is_loaded;
+
+ case PAGE_LZO_ALGORITHM:
+ return provider_service_lzo->is_loaded;
+
+ case PAGE_LZMA_ALGORITHM:
+ return provider_service_lzma->is_loaded;
+
+ case PAGE_BZIP2_ALGORITHM:
+ return provider_service_bzip2->is_loaded;
+
+ case PAGE_SNAPPY_ALGORITHM:
+ return provider_service_snappy->is_loaded;
+ }
+
+ return false;
+}
+
+/** Append a file to the chain of files of a space.
+@param[in] name file name of a file that is not open
+@param[in] handle file handle, or OS_FILE_CLOSED
+@param[in] size file size in entire database pages
+@param[in] is_raw whether this is a raw device
+@param[in] atomic_write true if atomic write could be enabled
+@param[in] max_pages maximum number of pages in file,
+or UINT32_MAX for unlimited
+@return file object */
+fil_node_t* fil_space_t::add(const char* name, pfs_os_file_t handle,
+ uint32_t size, bool is_raw, bool atomic_write,
+ uint32_t max_pages)
+{
+ mysql_mutex_assert_owner(&fil_system.mutex);
+
+ fil_node_t* node;
+
+ ut_ad(name != NULL);
+ ut_ad(fil_system.is_initialised());
+
+ node = reinterpret_cast<fil_node_t*>(ut_zalloc_nokey(sizeof(*node)));
+
+ node->handle = handle;
+
+ node->name = mem_strdup(name);
+
+ ut_a(!is_raw || srv_start_raw_disk_in_use);
+
+ node->is_raw_disk = is_raw;
+
+ node->size = size;
+
+ node->init_size = size;
+ node->max_size = max_pages;
+
+ node->space = this;
+
+ node->atomic_write = atomic_write;
+
+ this->size += size;
+ UT_LIST_ADD_LAST(chain, node);
+ if (node->is_open()) {
+ clear_closing();
+ if (++fil_system.n_open >= srv_max_n_open_files) {
+ reacquire();
+ try_to_close(true);
+ release();
+ }
+ }
+
+ return node;
+}
+
+__attribute__((warn_unused_result, nonnull))
+/** Open a tablespace file.
+@param node data file
+@return whether the file was successfully opened */
+static bool fil_node_open_file_low(fil_node_t *node)
+{
+ ut_ad(!node->is_open());
+ ut_ad(node->space->is_closing());
+ mysql_mutex_assert_owner(&fil_system.mutex);
+ ulint type;
+ static_assert(((UNIV_ZIP_SIZE_MIN >> 1) << 3) == 4096, "compatibility");
+ switch (FSP_FLAGS_GET_ZIP_SSIZE(node->space->flags)) {
+ case 1:
+ case 2:
+ type= OS_DATA_FILE_NO_O_DIRECT;
+ break;
+ default:
+ type= OS_DATA_FILE;
+ }
+
+ for (;;)
+ {
+ bool success;
+ node->handle= os_file_create(innodb_data_file_key, node->name,
+ node->is_raw_disk
+ ? OS_FILE_OPEN_RAW | OS_FILE_ON_ERROR_NO_EXIT
+ : OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT,
+ OS_FILE_AIO, type,
+ srv_read_only_mode, &success);
+
+ if (success && node->is_open())
+ {
+#ifndef _WIN32
+ if (!node->space->id && !srv_read_only_mode && my_disable_locking &&
+ os_file_lock(node->handle, node->name))
+ {
+ os_file_close(node->handle);
+ node->handle= OS_FILE_CLOSED;
+ return false;
+ }
+#endif
+ break;
+ }
+
+ /* The following call prints an error message */
+ if (os_file_get_last_error(true) == EMFILE + 100 &&
+ fil_space_t::try_to_close(true))
+ continue;
+
+ ib::warn() << "Cannot open '" << node->name << "'.";
+ return false;
+ }
+
+ ulint comp_algo = node->space->get_compression_algo();
+ bool comp_algo_invalid = false;
+
+ if (node->size);
+ else if (!node->read_page0() ||
+ // validate the compression algorithm for the full_crc32 format
+ (node->space->full_crc32() &&
+ (comp_algo_invalid = !fil_comp_algo_loaded(comp_algo))))
+ {
+ if (comp_algo_invalid)
+ {
+ if (comp_algo <= PAGE_ALGORITHM_LAST)
+ ib::warn() << "'" << node->name << "' is compressed with "
+ << page_compression_algorithms[comp_algo]
+ << ", which is not currently loaded";
+ else
+ ib::warn() << "'" << node->name << "' is compressed with "
+ << "invalid algorithm: " << comp_algo;
+ }
+
+ os_file_close(node->handle);
+ node->handle= OS_FILE_CLOSED;
+ return false;
+ }
+
+ ut_ad(node->is_open());
+
+ fil_system.move_opened_last_to_space_list(node->space);
+
+ fil_system.n_open++;
+ return true;
+}
+
+/** Open a tablespace file.
+@param node data file
+@return whether the file was successfully opened */
+static bool fil_node_open_file(fil_node_t *node)
+{
+ mysql_mutex_assert_owner(&fil_system.mutex);
+ ut_ad(!node->is_open());
+ ut_ad(!is_predefined_tablespace(node->space->id) ||
+ srv_operation == SRV_OPERATION_BACKUP ||
+ srv_operation == SRV_OPERATION_RESTORE ||
+ srv_operation == SRV_OPERATION_RESTORE_DELTA);
+ ut_ad(node->space->purpose != FIL_TYPE_TEMPORARY);
+ ut_ad(node->space->referenced());
+
+ const auto old_time= fil_system.n_open_exceeded_time;
+
+ for (ulint count= 0; fil_system.n_open >= srv_max_n_open_files; count++)
+ {
+ if (fil_space_t::try_to_close(count > 1))
+ count= 0;
+ else if (count >= 2)
+ {
+ if (old_time != fil_system.n_open_exceeded_time)
+ sql_print_warning("InnoDB: innodb_open_files=" ULINTPF
+ " is exceeded (" ULINTPF " files stay open)",
+ srv_max_n_open_files, fil_system.n_open);
+ break;
+ }
+ else
+ {
+ mysql_mutex_unlock(&fil_system.mutex);
+ std::this_thread::sleep_for(std::chrono::milliseconds(20));
+ /* Flush tablespaces so that we can close modified files. */
+ fil_flush_file_spaces();
+ mysql_mutex_lock(&fil_system.mutex);
+ if (node->is_open())
+ return true;
+ }
+ }
+
+ /* The node can be opened between releasing and acquiring fil_system.mutex
+ in the above code */
+ return node->is_open() || fil_node_open_file_low(node);
+}
+
+/** Close the file handle.
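+The caller must hold fil_system.mutex; prepare_to_close_or_detach()
+asserts this and decrements fil_system.n_open before the handle is
+closed.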
+*/
+void fil_node_t::close()
+{
+ prepare_to_close_or_detach();
+
+ /* printf("Closing file %s\n", name); */
+ int ret= os_file_close(handle);
+ ut_a(ret);
+ handle= OS_FILE_CLOSED;
+}
+
+pfs_os_file_t fil_node_t::detach()
+{
+ prepare_to_close_or_detach();
+
+ pfs_os_file_t result= handle;
+ handle= OS_FILE_CLOSED;
+ return result;
+}
+
+void fil_node_t::prepare_to_close_or_detach()
+{
+ mysql_mutex_assert_owner(&fil_system.mutex);
+ ut_ad(space->is_ready_to_close() || srv_operation == SRV_OPERATION_BACKUP ||
+ srv_operation == SRV_OPERATION_RESTORE_DELTA);
+ ut_a(is_open());
+ ut_a(!being_extended);
+ ut_a(space->is_ready_to_close() || space->purpose == FIL_TYPE_TEMPORARY ||
+ srv_fast_shutdown == 2 || !srv_was_started);
+
+ ut_a(fil_system.n_open > 0);
+ fil_system.n_open--;
+}
+
+/** Flush any writes cached by the file system. */
+void fil_space_t::flush_low()
+{
+ mysql_mutex_assert_not_owner(&fil_system.mutex);
+
+ uint32_t n= 1;
+ while (!n_pending.compare_exchange_strong(n, n | NEEDS_FSYNC,
+ std::memory_order_acquire,
+ std::memory_order_relaxed))
+ {
+ ut_ad(n & PENDING);
+ if (n & STOPPING_WRITES)
+ return;
+ if (n & NEEDS_FSYNC)
+ break;
+ }
+
+ fil_n_pending_tablespace_flushes++;
+ for (fil_node_t *node= UT_LIST_GET_FIRST(chain); node;
+ node= UT_LIST_GET_NEXT(chain, node))
+ {
+ if (!node->is_open())
+ {
+ ut_ad(!is_in_unflushed_spaces);
+ continue;
+ }
+ IF_WIN(if (node->is_raw_disk) continue,);
+ os_file_flush(node->handle);
+ }
+
+ if (is_in_unflushed_spaces)
+ {
+ mysql_mutex_lock(&fil_system.mutex);
+ if (is_in_unflushed_spaces)
+ {
+ is_in_unflushed_spaces= false;
+ fil_system.unflushed_spaces.remove(*this);
+ }
+ mysql_mutex_unlock(&fil_system.mutex);
+ }
+
+ clear_flush();
+ fil_n_pending_tablespace_flushes--;
+}
+
+/** Try to extend a tablespace.
+@param[in,out] space tablespace to be extended
+@param[in,out] node last file of the tablespace
+@param[in] size desired size in number of pages
+@param[out] success whether the operation succeeded
+@return whether the operation should be retried */
+static ATTRIBUTE_COLD __attribute__((warn_unused_result, nonnull))
+bool
+fil_space_extend_must_retry(
+ fil_space_t* space,
+ fil_node_t* node,
+ uint32_t size,
+ bool* success)
+{
+ mysql_mutex_assert_owner(&fil_system.mutex);
+ ut_ad(UT_LIST_GET_LAST(space->chain) == node);
+ ut_ad(size >= FIL_IBD_FILE_INITIAL_SIZE);
+ ut_ad(node->space == space);
+ ut_ad(space->referenced() || space->is_being_truncated);
+
+ *success = space->size >= size;
+
+ if (*success) {
+ /* Space already big enough */
+ return(false);
+ }
+
+ if (node->being_extended) {
+ /* Another thread is currently extending the file. Wait
+ for it to finish. It would have been better to use an
+ event-driven mechanism, but the entire module is
+ peppered with polling code. */
+ mysql_mutex_unlock(&fil_system.mutex);
+ std::this_thread::sleep_for(std::chrono::milliseconds(100));
+ return(true);
+ }
+
+ node->being_extended = true;
+
+ /* At this point it is safe to release fil_system.mutex. No
+ other thread can rename, delete, close or extend the file because
+ we have set the node->being_extended flag. */
+ mysql_mutex_unlock(&fil_system.mutex);
+
+ ut_ad(size >= space->size);
+
+ uint32_t last_page_no = space->size;
+ const uint32_t file_start_page_no = last_page_no - node->size;
+
+ const unsigned page_size = space->physical_size();
+
+ /* Datafile::read_first_page() expects innodb_page_size bytes.
+ fil_node_t::read_page0() expects at least 4 * innodb_page_size bytes.
+ os_file_set_size() expects multiples of 4096 bytes. + For ROW_FORMAT=COMPRESSED tables using 1024-byte or 2048-byte + pages, we will preallocate up to an integer multiple of 4096 bytes, + and let normal writes append 1024, 2048, or 3072 bytes to the file. */ + os_offset_t new_size = std::max( + (os_offset_t(size - file_start_page_no) * page_size) + & ~os_offset_t(4095), + os_offset_t(FIL_IBD_FILE_INITIAL_SIZE << srv_page_size_shift)); + + *success = os_file_set_size(node->name, node->handle, new_size, + node->punch_hole == 1); + + os_has_said_disk_full = *success; + if (*success) { + os_file_flush(node->handle); + last_page_no = size; + } else { + /* Let us measure the size of the file + to determine how much we were able to + extend it */ + os_offset_t fsize = os_file_get_size(node->handle); + ut_a(fsize != os_offset_t(-1)); + + last_page_no = uint32_t(fsize / page_size) + + file_start_page_no; + } + mysql_mutex_lock(&fil_system.mutex); + + ut_a(node->being_extended); + node->being_extended = false; + ut_a(last_page_no - file_start_page_no >= node->size); + + uint32_t file_size = last_page_no - file_start_page_no; + space->size += file_size - node->size; + node->size = file_size; + const uint32_t pages_in_MiB = node->size + & ~uint32_t((1U << (20U - srv_page_size_shift)) - 1); + + /* Keep the last data file size info up to date, rounded to + full megabytes */ + + switch (space->id) { + case TRX_SYS_SPACE: + srv_sys_space.set_last_file_size(pages_in_MiB); + do_flush: + space->reacquire(); + mysql_mutex_unlock(&fil_system.mutex); + space->flush_low(); + space->release(); + mysql_mutex_lock(&fil_system.mutex); + break; + default: + ut_ad(space->purpose == FIL_TYPE_TABLESPACE + || space->purpose == FIL_TYPE_IMPORT); + if (space->purpose == FIL_TYPE_TABLESPACE + && !space->is_being_truncated) { + goto do_flush; + } + break; + case SRV_TMP_SPACE_ID: + ut_ad(space->purpose == FIL_TYPE_TEMPORARY); + srv_tmp_space.set_last_file_size(pages_in_MiB); + break; + } + + return false; +} + +/** @return whether the file is usable for io() */ +ATTRIBUTE_COLD bool fil_space_t::prepare_acquired() +{ + ut_ad(referenced()); + mysql_mutex_assert_owner(&fil_system.mutex); + fil_node_t *node= UT_LIST_GET_LAST(chain); + ut_ad(!id || purpose == FIL_TYPE_TEMPORARY || + node == UT_LIST_GET_FIRST(chain)); + + const bool is_open= node && (node->is_open() || fil_node_open_file(node)); + + if (!is_open) + release(); + else if (node->deferred); + else if (auto desired_size= recv_size) + { + bool success; + while (fil_space_extend_must_retry(this, node, desired_size, &success)) + mysql_mutex_lock(&fil_system.mutex); + + mysql_mutex_assert_owner(&fil_system.mutex); + /* Crash recovery requires the file extension to succeed. */ + ut_a(success); + /* InnoDB data files cannot shrink. */ + ut_a(size >= desired_size); + if (desired_size > committed_size) + committed_size= desired_size; + + /* There could be multiple concurrent I/O requests for this + tablespace (multiple threads trying to extend this tablespace). + + Also, fil_space_set_recv_size_and_flags() may have been invoked + again during the file extension while fil_system.mutex was not + being held by us. + + Only if recv_size matches what we read originally, reset the + field. In this way, a subsequent I/O request will handle any + pending fil_space_set_recv_size_and_flags(). 
*/ + + if (desired_size == recv_size) + { + recv_size= 0; + goto clear; + } + } + else +clear: + clear_closing(); + + return is_open; +} + +/** @return whether the file is usable for io() */ +ATTRIBUTE_COLD bool fil_space_t::acquire_and_prepare() +{ + mysql_mutex_lock(&fil_system.mutex); + const auto flags= acquire_low() & (STOPPING | CLOSING); + const bool is_open= !flags || (flags == CLOSING && prepare_acquired()); + mysql_mutex_unlock(&fil_system.mutex); + return is_open; +} + +/** Try to extend a tablespace if it is smaller than the specified size. +@param[in,out] space tablespace +@param[in] size desired size in pages +@return whether the tablespace is at least as big as requested */ +bool fil_space_extend(fil_space_t *space, uint32_t size) +{ + ut_ad(!srv_read_only_mode || space->purpose == FIL_TYPE_TEMPORARY); + bool success= false; + const bool acquired= space->acquire(); + mysql_mutex_lock(&fil_system.mutex); + if (acquired || space->is_being_truncated) + { + while (fil_space_extend_must_retry(space, UT_LIST_GET_LAST(space->chain), + size, &success)) + mysql_mutex_lock(&fil_system.mutex); + } + mysql_mutex_unlock(&fil_system.mutex); + if (acquired) + space->release(); + return success; +} + +/** Prepare to free a file from fil_system. */ +inline pfs_os_file_t fil_node_t::close_to_free(bool detach_handle) +{ + mysql_mutex_assert_owner(&fil_system.mutex); + ut_a(!being_extended); + + if (is_open() && + (space->n_pending.fetch_or(fil_space_t::CLOSING, + std::memory_order_acquire) & + fil_space_t::PENDING)) + { + mysql_mutex_unlock(&fil_system.mutex); + while (space->referenced()) + std::this_thread::sleep_for(std::chrono::microseconds(100)); + mysql_mutex_lock(&fil_system.mutex); + } + + while (is_open()) + { + if (space->is_in_unflushed_spaces) + { + ut_ad(srv_file_flush_method != SRV_O_DIRECT_NO_FSYNC); + space->is_in_unflushed_spaces= false; + fil_system.unflushed_spaces.remove(*space); + } + + ut_a(!being_extended); + if (detach_handle) + { + auto result= handle; + handle= OS_FILE_CLOSED; + return result; + } + bool ret= os_file_close(handle); + ut_a(ret); + handle= OS_FILE_CLOSED; + break; + } + + return OS_FILE_CLOSED; +} + +/** Detach a tablespace from the cache and close the files. 
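+The tablespace is removed from fil_system.spaces as well as from the
+unflushed_spaces and default_encrypt_tables lists; any open file handle
+is either closed or, when detach_handle is set, returned to the caller.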
+@param space tablespace +@param detach_handle whether to detach the handle, instead of closing +@return detached handle +@retval OS_FILE_CLOSED if no handle was detached */ +pfs_os_file_t fil_system_t::detach(fil_space_t *space, bool detach_handle) +{ + mysql_mutex_assert_owner(&fil_system.mutex); + HASH_DELETE(fil_space_t, hash, &spaces, space->id, space); + + if (space->is_in_unflushed_spaces) + { + ut_ad(srv_file_flush_method != SRV_O_DIRECT_NO_FSYNC); + space->is_in_unflushed_spaces= false; + unflushed_spaces.remove(*space); + } + + if (space->is_in_default_encrypt) + { + space->is_in_default_encrypt= false; + default_encrypt_tables.remove(*space); + } + + { + space_list_t::iterator s= space_list_t::iterator(space); + if (space_list_last_opened == space) + { + if (s == space_list.begin()) + { + ut_ad(srv_operation > SRV_OPERATION_EXPORT_RESTORED || + srv_shutdown_state > SRV_SHUTDOWN_NONE); + space_list_last_opened= nullptr; + } + else + { + space_list_t::iterator prev= s; + space_list_last_opened= &*--prev; + } + } + space_list.erase(s); + } + + if (space == sys_space) + sys_space= nullptr; + else if (space == temp_space) + temp_space= nullptr; + + for (fil_node_t* node= UT_LIST_GET_FIRST(space->chain); node; + node= UT_LIST_GET_NEXT(chain, node)) + if (node->is_open()) + { + ut_ad(n_open > 0); + n_open--; + } + + ut_ad(!detach_handle || space->id); + ut_ad(!detach_handle || UT_LIST_GET_LEN(space->chain) <= 1); + + pfs_os_file_t handle= OS_FILE_CLOSED; + + for (fil_node_t* node= UT_LIST_GET_FIRST(space->chain); node; + node= UT_LIST_GET_NEXT(chain, node)) + handle= node->close_to_free(detach_handle); + + ut_ad(!space->referenced()); + return handle; +} + +/** Free a tablespace object on which fil_system_t::detach() was invoked. +There must not be any pending i/o's or flushes on the files. +@param[in,out] space tablespace */ +static +void +fil_space_free_low( + fil_space_t* space) +{ + /* The tablespace must not be in fil_system.named_spaces. */ + ut_ad(srv_fast_shutdown == 2 || !srv_was_started + || space->max_lsn == 0); + + /* Wait for fil_space_t::release() after + fil_system_t::detach(), the tablespace cannot be found, so + fil_space_t::get() would return NULL */ + while (space->referenced()) { + std::this_thread::sleep_for(std::chrono::microseconds(100)); + } + + for (fil_node_t* node = UT_LIST_GET_FIRST(space->chain); + node != NULL; ) { + ut_d(space->size -= node->size); + ut_free(node->name); + fil_node_t* old_node = node; + node = UT_LIST_GET_NEXT(chain, node); + ut_free(old_node); + } + + ut_ad(space->size == 0); + + fil_space_destroy_crypt_data(&space->crypt_data); + + space->~fil_space_t(); + ut_free(space); +} + +/** Frees a space object from the tablespace memory cache. +Closes the files in the chain but does not delete them. +There must not be any pending i/o's or flushes on the files. 
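+The tablespace is detached from fil_system with fil_system_t::detach(),
+removed from fil_system.named_spaces if needed, and finally freed with
+fil_space_free_low().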
+@param id tablespace identifier
+@param x_latched whether the caller holds exclusive fil_space_t::latch
+@return true if success */
+bool fil_space_free(uint32_t id, bool x_latched)
+{
+ ut_ad(id != TRX_SYS_SPACE);
+
+ mysql_mutex_lock(&fil_system.mutex);
+ fil_space_t* space = fil_space_get_by_id(id);
+
+ if (space != NULL) {
+ fil_system.detach(space);
+ }
+
+ mysql_mutex_unlock(&fil_system.mutex);
+
+ if (space != NULL) {
+ if (x_latched) {
+ space->x_unlock();
+ }
+
+ if (!recv_recovery_is_on()) {
+ log_sys.latch.wr_lock(SRW_LOCK_CALL);
+
+ if (space->max_lsn) {
+ ut_d(space->max_lsn = 0);
+ fil_system.named_spaces.remove(*space);
+ }
+
+ log_sys.latch.wr_unlock();
+ } else {
+#ifndef SUX_LOCK_GENERIC
+ ut_ad(log_sys.latch.is_write_locked());
+#endif
+ if (space->max_lsn) {
+ ut_d(space->max_lsn = 0);
+ fil_system.named_spaces.remove(*space);
+ }
+ }
+
+ fil_space_free_low(space);
+ }
+
+ return(space != NULL);
+}
+
+/** Create a tablespace in fil_system.
+@param id tablespace identifier
+@param flags tablespace flags
+@param purpose tablespace purpose
+@param crypt_data encryption information
+@param mode encryption mode
+@param opened true if space files are opened
+@return pointer to created tablespace, to be filled in with add()
+@retval nullptr on failure (such as when the same tablespace exists) */
+fil_space_t *fil_space_t::create(uint32_t id, uint32_t flags,
+ fil_type_t purpose,
+ fil_space_crypt_t *crypt_data,
+ fil_encryption_t mode,
+ bool opened)
+{
+ fil_space_t* space;
+
+ mysql_mutex_assert_owner(&fil_system.mutex);
+ ut_ad(fil_system.is_initialised());
+ ut_ad(fil_space_t::is_valid_flags(flags & ~FSP_FLAGS_MEM_MASK, id));
+ ut_ad(srv_page_size == UNIV_PAGE_SIZE_ORIG || flags != 0);
+
+ DBUG_EXECUTE_IF("fil_space_create_failure", return(NULL););
+
+ /* FIXME: if calloc() is defined as an inline function that calls
+ memset() or bzero(), then GCC 6 -flifetime-dse can optimize it away */
+ space= new (ut_zalloc_nokey(sizeof(*space))) fil_space_t;
+
+ space->id = id;
+
+ UT_LIST_INIT(space->chain, &fil_node_t::chain);
+
+ space->purpose = purpose;
+ space->flags = flags;
+
+ space->crypt_data = crypt_data;
+ space->n_pending.store(CLOSING, std::memory_order_relaxed);
+
+ DBUG_LOG("tablespace", "Created metadata for " << id);
+ if (crypt_data) {
+ DBUG_LOG("crypt",
+ "Tablespace " << id
+ << " encryption " << crypt_data->encryption
+ << " key id " << crypt_data->key_id
+ << ":" << fil_crypt_get_mode(crypt_data)
+ << " " << fil_crypt_get_type(crypt_data));
+ }
+
+ space->latch.SRW_LOCK_INIT(fil_space_latch_key);
+
+ if (const fil_space_t *old_space = fil_space_get_by_id(id)) {
+ ib::error() << "Trying to add tablespace with id " << id
+ << " to the cache, but tablespace '"
+ << (old_space->chain.start
+ ? old_space->chain.start->name
+ : "")
+ << "' already exists in the cache!";
+ space->~fil_space_t();
+ ut_free(space);
+ return(NULL);
+ }
+
+ HASH_INSERT(fil_space_t, hash, &fil_system.spaces, id, space);
+
+ if (opened)
+ fil_system.add_opened_last_to_space_list(space);
+ else
+ fil_system.space_list.push_back(*space);
+
+ switch (id) {
+ case 0:
+ ut_ad(!fil_system.sys_space);
+ fil_system.sys_space = space;
+ break;
+ case SRV_TMP_SPACE_ID:
+ ut_ad(!fil_system.temp_space);
+ fil_system.temp_space = space;
+ break;
+ default:
+ ut_ad(purpose != FIL_TYPE_TEMPORARY);
+ if (UNIV_LIKELY(id <= fil_system.max_assigned_id)) {
+ break;
+ }
+ if (UNIV_UNLIKELY(srv_operation == SRV_OPERATION_BACKUP)) {
+ break;
+ }
+ if (!fil_system.space_id_reuse_warned) {
+ ib::warn() << "Allocated tablespace ID " << id
+ << ", old maximum was "
+ << fil_system.max_assigned_id;
+ }
+
+ fil_system.max_assigned_id = id;
+ }
+
+ const bool rotate = purpose == FIL_TYPE_TABLESPACE
+ && (mode == FIL_ENCRYPTION_ON || mode == FIL_ENCRYPTION_OFF
+ || srv_encrypt_tables)
+ && fil_crypt_must_default_encrypt();
+
+ if (rotate) {
+ fil_system.default_encrypt_tables.push_back(*space);
+ space->is_in_default_encrypt = true;
+
+ if (srv_n_fil_crypt_threads_started) {
+ mysql_mutex_unlock(&fil_system.mutex);
+ fil_crypt_threads_signal();
+ mysql_mutex_lock(&fil_system.mutex);
+ }
+ }
+
+ return(space);
+}
+
+/*******************************************************************//**
+Assigns a new space id for a new single-table tablespace. This works simply by
+incrementing the global counter. If 4 billion id's is not enough, we may need
+to recycle id's.
+@return true if assigned, false if not */
+bool fil_assign_new_space_id(uint32_t *space_id)
+{
+ uint32_t id = *space_id;
+ bool success;
+
+ mysql_mutex_lock(&fil_system.mutex);
+
+ if (id < fil_system.max_assigned_id) {
+ id = fil_system.max_assigned_id;
+ }
+
+ id++;
+
+ if (id > (SRV_SPACE_ID_UPPER_BOUND / 2) && (id % 1000000UL == 0)) {
+ ib::warn() << "You are running out of new single-table"
+ " tablespace id's. Current counter is " << id
+ << " and it must not exceed " << SRV_SPACE_ID_UPPER_BOUND
+ << "! To reset the counter to zero you have to dump"
+ " all your tables and recreate the whole InnoDB"
+ " installation.";
+ }
+
+ success = (id < SRV_SPACE_ID_UPPER_BOUND);
+
+ if (success) {
+ *space_id = fil_system.max_assigned_id = id;
+ } else {
+ ib::warn() << "You have run out of single-table tablespace"
+ " id's! Current counter is " << id
+ << ". To reset the counter to zero"
+ " you have to dump all your tables and"
+ " recreate the whole InnoDB installation.";
+ }
+
+ mysql_mutex_unlock(&fil_system.mutex);
+
+ return(success);
+}
+
+/** Read the first page of a data file.
+@return whether the page was found valid */
+bool fil_space_t::read_page0()
+{
+ ut_ad(fil_system.is_initialised());
+ mysql_mutex_assert_owner(&fil_system.mutex);
+ if (size)
+ return true;
+
+ fil_node_t *node= UT_LIST_GET_FIRST(chain);
+ if (!node)
+ return false;
+ ut_ad(!UT_LIST_GET_NEXT(chain, node));
+
+ if (UNIV_UNLIKELY(acquire_low() & STOPPING))
+ {
+ ut_ad("this should not happen" == 0);
+ return false;
+ }
+ const bool ok= node->is_open() || fil_node_open_file(node);
+ release();
+ return ok;
+}
+
+/** Look up a tablespace and ensure that its first page has been validated. */
+static fil_space_t *fil_space_get_space(uint32_t id)
+{
+ if (fil_space_t *space= fil_space_get_by_id(id))
+ if (space->read_page0())
+ return space;
+ return nullptr;
+}
+
+void fil_space_set_recv_size_and_flags(uint32_t id, uint32_t size,
+ uint32_t flags)
+{
+ ut_ad(id < SRV_SPACE_ID_UPPER_BOUND);
+ mysql_mutex_lock(&fil_system.mutex);
+ if (fil_space_t *space= fil_space_get_space(id))
+ {
+ if (size)
+ space->recv_size= size;
+ if (flags != FSP_FLAGS_FCRC32_MASK_MARKER)
+ space->flags= flags;
+ }
+ mysql_mutex_unlock(&fil_system.mutex);
+}
+
+/** Open each file. Never invoked on .ibd files.
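+Unless create_new_db holds, the first page of the first file is read via
+read_page0() to validate the tablespace size and flags; if that read
+fails, the file is closed again and the function fails.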
+@param create_new_db whether to skip the call to fil_node_t::read_page0()
+@return whether all files were opened */
+bool fil_space_t::open(bool create_new_db)
+{
+ ut_ad(fil_system.is_initialised());
+ ut_ad(!id || create_new_db);
+
+ bool success= true;
+ bool skip_read= create_new_db;
+
+ mysql_mutex_lock(&fil_system.mutex);
+
+ for (fil_node_t *node= UT_LIST_GET_FIRST(chain); node;
+ node= UT_LIST_GET_NEXT(chain, node))
+ {
+ if (!node->is_open() && !fil_node_open_file_low(node))
+ {
+err_exit:
+ success= false;
+ break;
+ }
+
+ if (create_new_db)
+ {
+ node->find_metadata(node->handle);
+ continue;
+ }
+ if (skip_read)
+ {
+ size+= node->size;
+ continue;
+ }
+
+ if (!node->read_page0())
+ {
+ fil_system.n_open--;
+ os_file_close(node->handle);
+ node->handle= OS_FILE_CLOSED;
+ goto err_exit;
+ }
+
+ skip_read= true;
+ }
+
+ if (!create_new_db)
+ committed_size= size;
+ mysql_mutex_unlock(&fil_system.mutex);
+ return success;
+}
+
+/** Close each file. Only invoked on fil_system.temp_space. */
+void fil_space_t::close()
+{
+ if (!fil_system.is_initialised()) {
+ return;
+ }
+
+ mysql_mutex_lock(&fil_system.mutex);
+ ut_ad(this == fil_system.temp_space
+ || srv_operation == SRV_OPERATION_BACKUP
+ || srv_operation == SRV_OPERATION_RESTORE
+ || srv_operation == SRV_OPERATION_RESTORE_DELTA);
+
+ for (fil_node_t* node = UT_LIST_GET_FIRST(chain);
+ node != NULL;
+ node = UT_LIST_GET_NEXT(chain, node)) {
+ if (node->is_open()) {
+ node->close();
+ }
+ }
+
+ mysql_mutex_unlock(&fil_system.mutex);
+}
+
+void fil_system_t::create(ulint hash_size)
+{
+ ut_ad(this == &fil_system);
+ ut_ad(!is_initialised());
+ ut_ad(!(srv_page_size % FSP_EXTENT_SIZE));
+ ut_ad(srv_page_size);
+ ut_ad(!spaces.array);
+
+ m_initialised = true;
+
+ compile_time_assert(!(UNIV_PAGE_SIZE_MAX % FSP_EXTENT_SIZE_MAX));
+ compile_time_assert(!(UNIV_PAGE_SIZE_MIN % FSP_EXTENT_SIZE_MIN));
+
+ ut_ad(hash_size > 0);
+
+ mysql_mutex_init(fil_system_mutex_key, &mutex, nullptr);
+
+ spaces.create(hash_size);
+
+ fil_space_crypt_init();
+#ifdef __linux__
+ ssd.clear();
+ char fn[sizeof(dirent::d_name)
+ + sizeof "/sys/block/" "/queue/rotational"];
+ const size_t sizeof_fnp = (sizeof fn) - sizeof "/sys/block";
+ memcpy(fn, "/sys/block/", sizeof "/sys/block");
+ char* fnp = &fn[sizeof "/sys/block"];
+
+ std::set<std::string> ssd_devices;
+ if (DIR* d = opendir("/sys/block")) {
+ while (struct dirent* e = readdir(d)) {
+ if (e->d_name[0] == '.') {
+ continue;
+ }
+ snprintf(fnp, sizeof_fnp, "%s/queue/rotational",
+ e->d_name);
+ int f = open(fn, O_RDONLY);
+ if (f == -1) {
+ continue;
+ }
+ char b[sizeof "4294967295:4294967295\n"];
+ ssize_t l = read(f, b, sizeof b);
+ ::close(f);
+ if (l != 2 || memcmp("0\n", b, 2)) {
+ continue;
+ }
+ snprintf(fnp, sizeof_fnp, "%s/dev", e->d_name);
+ f = open(fn, O_RDONLY);
+ if (f == -1) {
+ continue;
+ }
+ l = read(f, b, sizeof b);
+ ::close(f);
+ if (l <= 0 || b[l - 1] != '\n') {
+ continue;
+ }
+ b[l - 1] = '\0';
+ char* end = b;
+ unsigned long dev_major = strtoul(b, &end, 10);
+ if (b == end || *end != ':'
+ || dev_major != unsigned(dev_major)) {
+ continue;
+ }
+ char* c = end + 1;
+ unsigned long dev_minor = strtoul(c, &end, 10);
+ if (c == end || *end
+ || dev_minor != unsigned(dev_minor)) {
+ continue;
+ }
+ ssd.push_back(makedev(unsigned(dev_major),
+ unsigned(dev_minor)));
+ }
+ closedir(d);
+ }
+ /* fil_system_t::is_ssd() assumes the following */
+ ut_ad(makedev(0, 8) == 8);
+ ut_ad(makedev(0, 4) == 4);
+ ut_ad(makedev(0, 2) == 2);
+ ut_ad(makedev(0, 1) == 1);
+#endif
+}
+
+void
fil_system_t::close() +{ + ut_ad(this == &fil_system); + ut_a(unflushed_spaces.empty()); + ut_a(space_list.empty()); + ut_ad(!sys_space); + ut_ad(!temp_space); + + if (is_initialised()) + { + m_initialised= false; + spaces.free(); + mysql_mutex_destroy(&mutex); + fil_space_crypt_cleanup(); + } + + ut_ad(!spaces.array); + +#ifdef __linux__ + ssd.clear(); + ssd.shrink_to_fit(); +#endif /* __linux__ */ +} + +void fil_system_t::add_opened_last_to_space_list(fil_space_t *space) +{ + if (UNIV_LIKELY(space_list_last_opened != nullptr)) + space_list.insert(++space_list_t::iterator(space_list_last_opened), *space); + else + space_list.push_front(*space); + space_list_last_opened= space; +} + +/** Extend all open data files to the recovered size */ +ATTRIBUTE_COLD void fil_system_t::extend_to_recv_size() +{ + ut_ad(is_initialised()); + mysql_mutex_lock(&mutex); + for (fil_space_t &space : fil_system.space_list) + { + const uint32_t size= space.recv_size; + + if (size > space.size) + { + if (space.is_closing()) + continue; + space.reacquire(); + bool success; + while (fil_space_extend_must_retry(&space, UT_LIST_GET_LAST(space.chain), + size, &success)) + mysql_mutex_lock(&mutex); + /* Crash recovery requires the file extension to succeed. */ + ut_a(success); + space.release(); + } + } + mysql_mutex_unlock(&mutex); +} + +/** Close all tablespace files at shutdown */ +void fil_space_t::close_all() +{ + if (!fil_system.is_initialised()) + return; + + /* At shutdown, we should not have any files in this list. */ + ut_ad(srv_fast_shutdown == 2 || !srv_was_started || + fil_system.named_spaces.empty()); + fil_flush_file_spaces(); + + mysql_mutex_lock(&fil_system.mutex); + + while (!fil_system.space_list.empty()) + { + fil_space_t &space= fil_system.space_list.front(); + + for (fil_node_t *node= UT_LIST_GET_FIRST(space.chain); node != NULL; + node= UT_LIST_GET_NEXT(chain, node)) + { + + if (!node->is_open()) + { + next: + continue; + } + + for (ulint count= 10000; count--;) + { + const auto n= space.set_closing(); + if (n & STOPPING) + goto next; + if (!(n & (PENDING | NEEDS_FSYNC))) + { + node->close(); + goto next; + } + mysql_mutex_unlock(&fil_system.mutex); + std::this_thread::sleep_for(std::chrono::microseconds(100)); + mysql_mutex_lock(&fil_system.mutex); + if (!node->is_open()) + goto next; + } + + ib::error() << "File '" << node->name << "' has " << space.referenced() + << " operations"; + } + + fil_system.detach(&space); + mysql_mutex_unlock(&fil_system.mutex); + fil_space_free_low(&space); + mysql_mutex_lock(&fil_system.mutex); + } + + mysql_mutex_unlock(&fil_system.mutex); + + ut_ad(srv_fast_shutdown == 2 || !srv_was_started || + fil_system.named_spaces.empty()); +} + +/*******************************************************************//** +Sets the max tablespace id counter if the given number is bigger than the +previous value. */ +void fil_set_max_space_id_if_bigger(uint32_t max_id) +{ + ut_a(max_id < SRV_SPACE_ID_UPPER_BOUND); + + mysql_mutex_lock(&fil_system.mutex); + + if (fil_system.max_assigned_id < max_id) { + + fil_system.max_assigned_id = max_id; + } + + mysql_mutex_unlock(&fil_system.mutex); +} + +/** Acquire a tablespace reference. +@param id tablespace identifier +@return tablespace +@retval nullptr if the tablespace is missing or inaccessible */ +fil_space_t *fil_space_t::get(uint32_t id) +{ + mysql_mutex_lock(&fil_system.mutex); + fil_space_t *space= fil_space_get_by_id(id); + const uint32_t n= space ? 
space->acquire_low() : 0;
+
+ if (n & STOPPING)
+ space= nullptr;
+ else if ((n & CLOSING) && !space->prepare_acquired())
+ space= nullptr;
+
+ mysql_mutex_unlock(&fil_system.mutex);
+ return space;
+}
+
+/** Write a log record about a file operation.
+@param type file operation
+@param space_id tablespace identifier
+@param path file path
+@param new_path new file path for type=FILE_RENAME */
+inline void mtr_t::log_file_op(mfile_type_t type, uint32_t space_id,
+ const char *path, const char *new_path)
+{
+ ut_ad((new_path != nullptr) == (type == FILE_RENAME));
+ ut_ad(!(byte(type) & 15));
+
+ /* fil_name_parse() requires that there be at least one path
+ separator and that the file path end with ".ibd". */
+ ut_ad(strchr(path, '/'));
+ ut_ad(!strcmp(&path[strlen(path) - strlen(DOT_IBD)], DOT_IBD));
+
+ m_modifications= true;
+ if (!is_logged())
+ return;
+ m_last= nullptr;
+
+ const size_t len= strlen(path);
+ const size_t new_len= type == FILE_RENAME ? 1 + strlen(new_path) : 0;
+ ut_ad(len > 0);
+ byte *const log_ptr= m_log.open(1 + 3/*length*/ + 5/*space_id*/
+ + 1/*page_no=0*/);
+ byte *end= log_ptr + 1;
+ end= mlog_encode_varint(end, space_id);
+ *end++= 0;
+ if (UNIV_LIKELY(end + len + new_len >= &log_ptr[16]))
+ {
+ *log_ptr= type;
+ size_t total_len= len + new_len + end - log_ptr - 15;
+ if (total_len >= MIN_3BYTE)
+ total_len+= 2;
+ else if (total_len >= MIN_2BYTE)
+ total_len++;
+ end= mlog_encode_varint(log_ptr + 1, total_len);
+ end= mlog_encode_varint(end, space_id);
+ *end++= 0;
+ }
+ else
+ {
+ *log_ptr= static_cast<byte>(type | (end + len + new_len - &log_ptr[1]));
+ ut_ad(*log_ptr & 15);
+ }
+
+ m_log.close(end);
+
+ if (type == FILE_RENAME)
+ {
+ ut_ad(strchr(new_path, '/'));
+ m_log.push(reinterpret_cast<const byte*>(path), uint32_t(len + 1));
+ m_log.push(reinterpret_cast<const byte*>(new_path), uint32_t(new_len - 1));
+ }
+ else
+ m_log.push(reinterpret_cast<const byte*>(path), uint32_t(len));
+}
+
+/** Write FILE_MODIFY for a file.
+@param[in] space_id tablespace id
+@param[in] name tablespace file name
+@param[in,out] mtr mini-transaction */
+static void fil_name_write(uint32_t space_id, const char *name,
+ mtr_t *mtr)
+{
+ ut_ad(!is_predefined_tablespace(space_id));
+ mtr->log_file_op(FILE_MODIFY, space_id, name);
+}
+
+fil_space_t *fil_space_t::drop(uint32_t id, pfs_os_file_t *detached_handle)
+{
+ ut_a(!is_system_tablespace(id));
+ mysql_mutex_lock(&fil_system.mutex);
+ fil_space_t *space= fil_space_get_by_id(id);
+
+ if (!space)
+ {
+ mysql_mutex_unlock(&fil_system.mutex);
+ return nullptr;
+ }
+
+ if (space->pending() & STOPPING)
+ {
+ /* A thread executing DDL and another thread executing purge may
+ be executing fil_delete_tablespace() concurrently for the same
+ tablespace. Wait for the other thread to complete the operation. */
+ for (ulint count= 0;; count++)
+ {
+ space= fil_space_get_by_id(id);
+ ut_ad(!space || space->is_stopping());
+ mysql_mutex_unlock(&fil_system.mutex);
+ if (!space)
+ return nullptr;
+ /* Issue a warning every 10.24 seconds, starting after 2.56 seconds */
+ if ((count & 511) == 128)
+ sql_print_warning("InnoDB: Waiting for tablespace " UINT32PF
+ " to be deleted", id);
+ std::this_thread::sleep_for(std::chrono::milliseconds(20));
+ mysql_mutex_lock(&fil_system.mutex);
+ }
+ }
+
+ /* We must be the first one to set either STOPPING flag on the .ibd file,
+ because the flags are only being set here, within a critical section of
+ fil_system.mutex.
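+ The fetch_add() of STOPPING_READS + 1 below both sets the first
+ stopping flag and acquires a reference, so that the tablespace
+ object cannot be freed while its files are being deleted.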
*/ + unsigned pending; + ut_d(pending=) + space->n_pending.fetch_add(STOPPING_READS + 1, std::memory_order_relaxed); + ut_ad(!(pending & STOPPING)); + mysql_mutex_unlock(&fil_system.mutex); + + if (space->crypt_data) + fil_space_crypt_close_tablespace(space); + + if (space->purpose == FIL_TYPE_TABLESPACE) + { + if (id >= srv_undo_space_id_start && + id < srv_undo_space_id_start + srv_undo_tablespaces_open) + { + os_file_delete(innodb_data_file_key, space->chain.start->name); + goto deleted; + } + + /* Before deleting the file, persistently write a log record. */ + mtr_t mtr; + mtr.start(); + mtr.log_file_op(FILE_DELETE, id, space->chain.start->name); + mtr.commit_file(*space, nullptr); + + if (FSP_FLAGS_HAS_DATA_DIR(space->flags)) + RemoteDatafile::delete_link_file(space->name()); + + os_file_delete(innodb_data_file_key, space->chain.start->name); + } + else + ut_ad(space->purpose == FIL_TYPE_IMPORT); + + if (char *cfg_name= fil_make_filepath(space->chain.start->name, + fil_space_t::name_type{}, CFG, false)) + { + os_file_delete_if_exists(innodb_data_file_key, cfg_name, nullptr); + ut_free(cfg_name); + } + + deleted: + mysql_mutex_lock(&fil_system.mutex); + ut_ad(space == fil_space_get_by_id(id)); + pending= + space->n_pending.fetch_add(STOPPING_WRITES - 1, std::memory_order_relaxed); + ut_ad((pending & STOPPING) == STOPPING_READS); + ut_ad(pending & PENDING); + pending&= PENDING; + if (--pending) + { + for (ulint count= 0;; count++) + { + ut_ad(space == fil_space_get_by_id(id)); + pending= space->n_pending.load(std::memory_order_relaxed) & PENDING; + if (!pending) + break; + mysql_mutex_unlock(&fil_system.mutex); + /* Issue a warning every 10.24 seconds, starting after 2.56 seconds */ + if ((count & 511) == 128) + sql_print_warning("InnoDB: Trying to delete tablespace '%s' " + "but there are %u pending operations", + space->chain.start->name, pending); + std::this_thread::sleep_for(std::chrono::milliseconds(20)); + mysql_mutex_lock(&fil_system.mutex); + } + } + + pfs_os_file_t handle= fil_system.detach(space, true); + mysql_mutex_unlock(&fil_system.mutex); + if (detached_handle) + *detached_handle = handle; + else + os_file_close(handle); + return space; +} + +/** Close a single-table tablespace on failed IMPORT TABLESPACE. +The tablespace must be cached in the memory cache. +Free all pages used by the tablespace. */ +void fil_close_tablespace(uint32_t id) +{ + ut_ad(!is_system_tablespace(id)); + fil_space_t* space = fil_space_t::drop(id, nullptr); + if (!space) { + return; + } + + space->x_lock(); + ut_ad(space->is_stopping()); + + /* Invalidate in the buffer pool all pages belonging to the + tablespace. Since space->is_stopping() holds, readahead + can no longer read more pages of this tablespace to buf_pool. + Thus we can clean the tablespace out of buf_pool + completely and permanently. */ + while (buf_flush_list_space(space)); + + space->x_unlock(); + log_sys.latch.wr_lock(SRW_LOCK_CALL); + if (space->max_lsn != 0) { + ut_d(space->max_lsn = 0); + fil_system.named_spaces.remove(*space); + } + log_sys.latch.wr_unlock(); + fil_space_free_low(space); +} + +/** Delete a tablespace and associated .ibd file. 
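+This is a thin wrapper around fil_space_t::drop() that also frees the
+detached fil_space_t object.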
+@param id tablespace identifier
+@return detached file handle (to be closed by the caller)
+@retval OS_FILE_CLOSED if no file existed */
+pfs_os_file_t fil_delete_tablespace(uint32_t id)
+{
+ ut_ad(!is_system_tablespace(id));
+ pfs_os_file_t handle= OS_FILE_CLOSED;
+ if (fil_space_t *space= fil_space_t::drop(id, &handle))
+ fil_space_free_low(space);
+ return handle;
+}
+
+/*******************************************************************//**
+Allocates and builds a file name from a path, a table or tablespace name
+and a suffix. The string must be freed by the caller with ut_free().
+@param[in] path NULL or the directory path or the full path and filename.
+@param[in] name {} if path is full, or Table/Tablespace name
+@param[in] ext the file extension to use
+@param[in] trim_name true if the last name on the path should be trimmed.
+@return own: file name */
+char* fil_make_filepath(const char *path, const fil_space_t::name_type &name,
+ ib_extention ext, bool trim_name)
+{
+ /* The path may contain the basename of the file, if so we do not
+ need the name. If the path is NULL, we can use the default path,
+ but there needs to be a name. */
+ ut_ad(path || name.data());
+
+ /* If we are going to strip a name off the path, there had better be a
+ path and a new name to put back on. */
+ ut_ad(!trim_name || (path && name.data()));
+
+ if (path == NULL) {
+ path = fil_path_to_mysql_datadir;
+ }
+
+ ulint len = 0; /* current length */
+ ulint path_len = strlen(path);
+ const char* suffix = dot_ext[ext];
+ ulint suffix_len = strlen(suffix);
+ ulint full_len = path_len + 1 + name.size() + suffix_len + 1;
+
+ char* full_name = static_cast<char*>(ut_malloc_nokey(full_len));
+ if (full_name == NULL) {
+ return NULL;
+ }
+
+ /* If the name is a relative or absolute path, do not prepend "./". */
+ if (path[0] == '.'
+ && (path[1] == '\0' || path[1] == '/' IF_WIN(|| path[1] == '\\',))
+ && name.size() && (name.data()[0] == '.'
+ || is_absolute_path(name.data()))) {
+ path = NULL;
+ path_len = 0;
+ }
+
+ if (path != NULL) {
+ memcpy(full_name, path, path_len);
+ len = path_len;
+ }
+
+ full_name[len] = '\0';
+
+ if (trim_name) {
+ /* Find the offset of the last DIR separator and set it to
+ null in order to strip off the old basename from this path. */
+ char* last_dir_sep = strrchr(full_name, '/');
+#ifdef _WIN32
+ if (char *last = strrchr(full_name, '\\')) {
+ if (last > last_dir_sep) {
+ last_dir_sep = last;
+ }
+ }
+#endif
+ if (last_dir_sep) {
+ last_dir_sep[0] = '\0';
+ len = strlen(full_name);
+ }
+ }
+
+ if (name.size()) {
+ if (len && full_name[len - 1] != '/') {
+ /* Add a DIR separator */
+ full_name[len] = '/';
+ full_name[++len] = '\0';
+ }
+
+ char* ptr = &full_name[len];
+ memcpy(ptr, name.data(), name.size());
+ len += name.size();
+ full_name[len] = '\0';
+ }
+
+ /* Make sure that the specified suffix is at the end of the filepath
+ string provided. This assumes that the suffix starts with '.'.
+ If the first char of the suffix is found in the filepath at the same
+ length as the suffix from the end, then we will assume that there is
+ a previous suffix that needs to be replaced. */
+ if (suffix != NULL) {
+ /* Need room for the trailing null byte. */
+ ut_ad(len < full_len);
+
+ if ((len > suffix_len)
+ && (full_name[len - suffix_len] == suffix[0])) {
+ /* Another suffix exists, make it the one requested. */
+ memcpy(&full_name[len - suffix_len], suffix, suffix_len);
+
+ } else {
+ /* No previous suffix, add it.
*/ + ut_ad(len + suffix_len < full_len); + memcpy(&full_name[len], suffix, suffix_len); + full_name[len + suffix_len] = '\0'; + } + } + + return(full_name); +} + +char *fil_make_filepath(const char* path, const table_name_t name, + ib_extention suffix, bool strip_name) +{ + return fil_make_filepath(path, {name.m_name, strlen(name.m_name)}, + suffix, strip_name); +} + +dberr_t fil_space_t::rename(const char *path, bool log, bool replace) +{ + ut_ad(UT_LIST_GET_LEN(chain) == 1); + ut_ad(!is_predefined_tablespace(id)); + + const char *old_path= chain.start->name; + + ut_ad(strchr(old_path, '/')); + ut_ad(strchr(path, '/')); + + if (!strcmp(path, old_path)) + return DB_SUCCESS; + + if (!log) + { + if (!os_file_rename(innodb_data_file_key, old_path, path)) + return DB_ERROR; + mysql_mutex_lock(&fil_system.mutex); + ut_free(chain.start->name); + chain.start->name= mem_strdup(path); + mysql_mutex_unlock(&fil_system.mutex); + return DB_SUCCESS; + } + + bool exists= false; + os_file_type_t ftype; + + /* Check upfront if the rename operation might succeed, because we + must durably write redo log before actually attempting to execute + the rename in the file system. */ + if (os_file_status(old_path, &exists, &ftype) && !exists) + { + sql_print_error("InnoDB: Cannot rename '%s' to '%s'" + " because the source file does not exist.", + old_path, path); + return DB_TABLESPACE_NOT_FOUND; + } + + exists= false; + if (replace); + else if (!os_file_status(path, &exists, &ftype) || exists) + { + sql_print_error("InnoDB: Cannot rename '%s' to '%s'" + " because the target file exists.", + old_path, path); + return DB_TABLESPACE_EXISTS; + } + + mtr_t mtr; + mtr.start(); + mtr.log_file_op(FILE_RENAME, id, old_path, path); + return mtr.commit_file(*this, path) ? DB_SUCCESS : DB_ERROR; +} + +/** Create a tablespace file. +@param[in] space_id Tablespace ID +@param[in] name Tablespace name in dbname/tablename format. +@param[in] path Path and filename of the datafile to create. +@param[in] flags Tablespace flags +@param[in] size Initial size of the tablespace file in pages, +must be >= FIL_IBD_FILE_INITIAL_SIZE +@param[in] mode MariaDB encryption mode +@param[in] key_id MariaDB encryption key_id +@param[out] err DB_SUCCESS or error code +@return the created tablespace +@retval NULL on error */ +fil_space_t* +fil_ibd_create( + uint32_t space_id, + const table_name_t name, + const char* path, + uint32_t flags, + uint32_t size, + fil_encryption_t mode, + uint32_t key_id, + dberr_t* err) +{ + pfs_os_file_t file; + bool success; + mtr_t mtr; + bool has_data_dir = FSP_FLAGS_HAS_DATA_DIR(flags) != 0; + + ut_ad(!is_system_tablespace(space_id)); + ut_ad(!srv_read_only_mode); + ut_a(space_id < SRV_SPACE_ID_UPPER_BOUND); + ut_a(size >= FIL_IBD_FILE_INITIAL_SIZE); + ut_a(fil_space_t::is_valid_flags(flags & ~FSP_FLAGS_MEM_MASK, space_id)); + + /* Create the subdirectories in the path, if they are + not there already. 
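+ For illustration, a typical call is sketched below (simplified; the
+ real callers live in the dictionary and DDL code, and the variable
+ names here are only placeholders):
+   dberr_t err;
+   fil_space_t *space = fil_ibd_create(space_id, name, path, flags,
+                                       FIL_IBD_FILE_INITIAL_SIZE,
+                                       FIL_ENCRYPTION_DEFAULT, key_id,
+                                       &err);
+ where path would normally come from fil_make_filepath().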
*/ + *err = os_file_create_subdirs_if_needed(path); + if (*err != DB_SUCCESS) { + return NULL; + } + + mtr.start(); + mtr.log_file_op(FILE_CREATE, space_id, path); + log_sys.latch.wr_lock(SRW_LOCK_CALL); + auto lsn= mtr.commit_files(); + log_sys.latch.wr_unlock(); + mtr.flag_wr_unlock(); + log_write_up_to(lsn, true); + + ulint type; + static_assert(((UNIV_ZIP_SIZE_MIN >> 1) << 3) == 4096, + "compatibility"); + switch (FSP_FLAGS_GET_ZIP_SSIZE(flags)) { + case 1: + case 2: + type = OS_DATA_FILE_NO_O_DIRECT; + break; + default: + type = OS_DATA_FILE; + } + + file = os_file_create( + innodb_data_file_key, path, + OS_FILE_CREATE | OS_FILE_ON_ERROR_NO_EXIT, + OS_FILE_AIO, type, srv_read_only_mode, &success); + + if (!success) { + /* The following call will print an error message */ + switch (os_file_get_last_error(true)) { + case OS_FILE_ALREADY_EXISTS: + ib::info() << "The file '" << path << "'" + " already exists though the" + " corresponding table did not exist" + " in the InnoDB data dictionary." + " You can resolve the problem by removing" + " the file."; + *err = DB_TABLESPACE_EXISTS; + break; + case OS_FILE_DISK_FULL: + *err = DB_OUT_OF_FILE_SPACE; + break; + default: + *err = DB_ERROR; + } + ib::error() << "Cannot create file '" << path << "'"; + return NULL; + } + + const bool is_compressed = fil_space_t::is_compressed(flags); +#ifdef _WIN32 + const bool is_sparse = is_compressed; + if (is_compressed) { + os_file_set_sparse_win32(file); + } +#else + const bool is_sparse = is_compressed + && DB_SUCCESS == os_file_punch_hole(file, 0, 4096) + && !my_test_if_thinly_provisioned(file); +#endif + + if (fil_space_t::full_crc32(flags)) { + flags |= FSP_FLAGS_FCRC32_PAGE_SSIZE(); + } else { + flags |= FSP_FLAGS_PAGE_SSIZE(); + } + + /* Create crypt data if the tablespace is either encrypted or user has + requested it to remain unencrypted. */ + fil_space_crypt_t* crypt_data = (mode != FIL_ENCRYPTION_DEFAULT + || srv_encrypt_tables) + ? fil_space_create_crypt_data(mode, key_id) + : nullptr; + + if (!os_file_set_size(path, file, + os_offset_t(size) << srv_page_size_shift, + is_sparse)) { + *err = DB_OUT_OF_FILE_SPACE; +err_exit: + os_file_close(file); + os_file_delete(innodb_data_file_key, path); + free(crypt_data); + return nullptr; + } + + fil_space_t::name_type space_name; + + if (has_data_dir) { + /* Make the ISL file if the IBD file is not + in the default location. */ + space_name = {name.m_name, strlen(name.m_name)}; + *err = RemoteDatafile::create_link_file(space_name, path); + if (*err != DB_SUCCESS) { + goto err_exit; + } + } + + DBUG_EXECUTE_IF("checkpoint_after_file_create", + log_make_checkpoint();); + + mysql_mutex_lock(&fil_system.mutex); + if (fil_space_t* space = fil_space_t::create(space_id, flags, + FIL_TYPE_TABLESPACE, + crypt_data, mode, true)) { + fil_node_t* node = space->add(path, file, size, false, true); + IF_WIN(node->find_metadata(), node->find_metadata(file, true)); + mysql_mutex_unlock(&fil_system.mutex); + mtr.start(); + mtr.set_named_space(space); + ut_a(fsp_header_init(space, size, &mtr) == DB_SUCCESS); + mtr.commit(); + return space; + } else { + mysql_mutex_unlock(&fil_system.mutex); + } + + if (space_name.data()) { + RemoteDatafile::delete_link_file(space_name); + } + + *err = DB_ERROR; + goto err_exit; +} + +/** Try to open a single-table tablespace and optionally check that the +space id in it is correct. If this does not succeed, print an error message +to the .err log. 
This function is used to open a tablespace when we start +mysqld after the dictionary has been booted, and also in IMPORT TABLESPACE. + +NOTE that we assume this operation is used either at the database startup +or under the protection of dict_sys.latch, so that two users cannot +race here. This operation does not leave the file associated with the +tablespace open, but closes it after we have looked at the space id in it. + +If the validate boolean is set, we read the first page of the file and +check that the space id in the file is what we expect. We assume that +this function runs much faster if no check is made, since accessing the +file inode probably is much faster (the OS caches them) than accessing +the first page of the file. This boolean may be initially false, but if +a remote tablespace is found it will be changed to true. + +@param[in] validate 0=maybe missing, 1=do not validate, 2=validate +@param[in] purpose FIL_TYPE_TABLESPACE or FIL_TYPE_TEMPORARY +@param[in] id tablespace ID +@param[in] flags expected FSP_SPACE_FLAGS +@param[in] name table name +If file-per-table, it is the table name in the databasename/tablename format +@param[in] path_in expected filepath, usually read from dictionary +@param[out] err DB_SUCCESS or error code +@return tablespace +@retval NULL if the tablespace could not be opened */ +fil_space_t* +fil_ibd_open( + unsigned validate, + fil_type_t purpose, + uint32_t id, + uint32_t flags, + fil_space_t::name_type name, + const char* path_in, + dberr_t* err) +{ + mysql_mutex_lock(&fil_system.mutex); + fil_space_t* space = fil_space_get_by_id(id); + mysql_mutex_unlock(&fil_system.mutex); + if (space) { + if (validate > 1 && !srv_read_only_mode) { + fsp_flags_try_adjust(space, + flags & ~FSP_FLAGS_MEM_MASK); + } + return space; + } + + dberr_t local_err = DB_SUCCESS; + + /* Table flags can be ULINT_UNDEFINED if + dict_tf_to_fsp_flags_failure is set. */ + if (flags == UINT32_MAX) { +corrupted: + local_err = DB_CORRUPTION; +func_exit: + if (err) *err = local_err; + return space; + } + + ut_ad(fil_space_t::is_valid_flags(flags & ~FSP_FLAGS_MEM_MASK, id)); + + Datafile df_default; /* default location */ + RemoteDatafile df_remote; /* remote location */ + ulint tablespaces_found = 0; + ulint valid_tablespaces_found = 0; + + df_default.init(flags); + df_remote.init(flags); + + /* Discover the correct file by looking in the possible locations + while avoiding unnecessary effort. */ + + /* We will always look for an ibd in the default location. */ + df_default.make_filepath(nullptr, name, IBD); + + /* Look for a filepath embedded in an ISL where the default file + would be. */ + bool must_validate = df_remote.open_link_file(name); + + if (must_validate) { + if (df_remote.open_read_only(true) == DB_SUCCESS) { + ut_ad(df_remote.is_open()); + ++tablespaces_found; + } else { + /* The following call prints an error message */ + os_file_get_last_error(true); + ib::error() << "A link file was found named '" + << df_remote.link_filepath() + << "' but the linked tablespace '" + << df_remote.filepath() + << "' could not be opened read-only."; + } + } else if (path_in && !df_default.same_filepath_as(path_in)) { + /* Dict path is not the default path. Always validate + remote files. If default is opened, it was moved.
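+ In effect, the discovery logic reduces to
+   must_validate = ISL link file exists
+                   || (path_in && path_in != default path)
+                   || validate > 1;
+ and validation is skipped only when none of these hold and exactly
+ one candidate file is found.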
*/ + must_validate = true; + } else if (validate > 1) { + must_validate = true; + } + + const bool operation_not_for_export = + srv_operation != SRV_OPERATION_RESTORE_EXPORT + && srv_operation != SRV_OPERATION_EXPORT_RESTORED; + + /* Always look for a file at the default location. But don't log + an error if the tablespace is already open in remote or dict. */ + ut_a(df_default.filepath()); + + /* Mariabackup will not copy files whose names start with + #sql-. We will suppress messages about such files missing on + the first server startup. The tables ought to be dropped by + drop_garbage_tables_after_restore() a little later. */ + + const bool strict = validate && !tablespaces_found + && operation_not_for_export + && !(srv_operation == SRV_OPERATION_NORMAL + && srv_start_after_restore + && srv_force_recovery < SRV_FORCE_NO_BACKGROUND + && dict_table_t::is_temporary_name( + df_default.filepath())); + + if (df_default.open_read_only(strict) == DB_SUCCESS) { + ut_ad(df_default.is_open()); + ++tablespaces_found; + } + + /* Check if multiple locations point to the same file. */ + if (tablespaces_found > 1 && df_default.same_as(df_remote)) { + /* A link file was found with the default path in it. + Use the default path and delete the link file. */ + --tablespaces_found; + df_remote.delete_link_file(); + df_remote.close(); + } + + /* We have now checked all possible tablespace locations and + have a count of how many unique files we found. If things are + normal, we only found 1. */ + /* For encrypted tablespace, we need to check the + encryption in header of first page. */ + if (!must_validate && tablespaces_found == 1) { + goto skip_validate; + } + + /* Read and validate the first page of these tablespace + locations, if found. */ + valid_tablespaces_found += + (df_remote.validate_to_dd(id, flags) == DB_SUCCESS); + + valid_tablespaces_found += + (df_default.validate_to_dd(id, flags) == DB_SUCCESS); + + /* Make sense of these possible locations. + First, bail out if no tablespace files were found. */ + if (valid_tablespaces_found == 0) { + if (!strict + && IF_WIN(GetLastError() == ERROR_FILE_NOT_FOUND + || GetLastError() == ERROR_PATH_NOT_FOUND, + errno == ENOENT)) { + /* Suppress a message about a missing file. */ + goto corrupted; + } + + os_file_get_last_error(operation_not_for_export, + !operation_not_for_export); + if (!operation_not_for_export) { + goto corrupted; + } + sql_print_error("InnoDB: Could not find a valid tablespace" + " file for %.*s. %s", + static_cast<int>(name.size()), name.data(), + TROUBLESHOOT_DATADICT_MSG); + goto corrupted; + } + if (!must_validate) { + goto skip_validate; + } + + /* Do not open any tablespaces if more than one tablespace with + the correct space ID and flags were found. */ + if (df_default.is_open() && df_remote.is_open()) { + ib::error() + << "A tablespace has been found in multiple places: " + << df_default.filepath() + << "(Space ID=" << df_default.space_id() + << ", Flags=" << df_default.flags() + << ") and " + << df_remote.filepath() + << "(Space ID=" << df_remote.space_id() + << ", Flags=" << df_remote.flags() + << (valid_tablespaces_found > 1 || srv_force_recovery + ? "); will not open" + : ")"); + + /* Force-recovery will allow some tablespaces to be + skipped by REDO if there was more than one file found. + Unlike during the REDO phase of recovery, we now know + if the tablespace is valid according to the dictionary, + which was not available then.
So if we did not force + recovery and there is only one good tablespace, ignore + any bad tablespaces. */ + if (valid_tablespaces_found > 1 || srv_force_recovery > 0) { + /* If the file is not open it cannot be valid. */ + ut_ad(df_default.is_open() || !df_default.is_valid()); + ut_ad(df_remote.is_open() || !df_remote.is_valid()); + + /* Having established that, this is an easy way to + look for corrupted data files. */ + if (df_default.is_open() != df_default.is_valid() + || df_remote.is_open() != df_remote.is_valid()) { + goto corrupted; + } +error: + local_err = DB_ERROR; + goto func_exit; + } + + /* There is only one valid tablespace found and we did + not use srv_force_recovery during REDO. Use this one + tablespace and clean up invalid tablespace pointers */ + if (df_default.is_open() && !df_default.is_valid()) { + df_default.close(); + tablespaces_found--; + } + + if (df_remote.is_open() && !df_remote.is_valid()) { + df_remote.close(); + tablespaces_found--; + } + } + + /* At this point, there should be only one filepath. */ + ut_a(tablespaces_found == 1); + ut_a(valid_tablespaces_found == 1); + +skip_validate: + const byte* first_page = + df_default.is_open() ? df_default.get_first_page() : + df_remote.get_first_page(); + + fil_space_crypt_t* crypt_data = first_page + ? fil_space_read_crypt_data(fil_space_t::zip_size(flags), + first_page) + : NULL; + + mysql_mutex_lock(&fil_system.mutex); + space = fil_space_t::create(id, flags, purpose, crypt_data); + if (!space) { + mysql_mutex_unlock(&fil_system.mutex); + goto error; + } + + /* We do not measure the size of the file, that is why + we pass the 0 below */ + + space->add( + df_remote.is_open() ? df_remote.filepath() : + df_default.filepath(), OS_FILE_CLOSED, 0, false, true); + mysql_mutex_unlock(&fil_system.mutex); + + if (must_validate && !srv_read_only_mode) { + df_remote.close(); + df_default.close(); + if (space->acquire()) { + if (purpose != FIL_TYPE_IMPORT) { + fsp_flags_try_adjust(space, flags + & ~FSP_FLAGS_MEM_MASK); + } + space->release(); + } + } + + goto func_exit; +} + +/** Discover the correct IBD file to open given a remote or missing +filepath from the REDO log. Administrators can move a crashed +database to another location on the same machine and try to recover it. +Remote IBD files might be moved as well to the new location. + The problem with this is that the REDO log contains the old location +which may still be accessible. During recovery, if files are found in +both locations, we can choose one based on these priorities: +1. Default location +2. ISL location +3. REDO location +@param[in] space_id tablespace ID +@param[in] df Datafile object with path from redo +@return true if a valid datafile was found, false if not */ +static +bool +fil_ibd_discover( + ulint space_id, + Datafile& df) +{ + Datafile df_def_per; /* default file-per-table datafile */ + RemoteDatafile df_rem_per; /* remote file-per-table datafile */ + + /* Look for the datafile in the default location. */ + const char* filename = df.filepath(); + const char* basename = base_name(filename); + + /* If this datafile is file-per-table it will have a schema dir.
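+ For example, with filename "/var/lib/mysql/db1/t1.ibd" (an
+ illustrative path) the backward scan below stops after seeing two
+ separators, leaving db pointing at "db1/t1.ibd", which is then probed
+ as a file-per-table candidate relative to the current directory.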
*/ + ulint sep_found = 0; + const char* db = basename; + for (; db > filename && sep_found < 2; db--) { + switch (db[0]) { +#ifdef _WIN32 + case '\\': +#endif + case '/': + sep_found++; + } + } + if (sep_found == 2) { + db += 2; + df_def_per.init(0); + df_def_per.set_filepath(db); + if (df_def_per.open_read_only(false) == DB_SUCCESS + && df_def_per.validate_for_recovery() == DB_SUCCESS + && df_def_per.space_id() == space_id) { + df.set_filepath(df_def_per.filepath()); + df.open_read_only(false); + return(true); + } + + /* Look for a remote file-per-table tablespace. */ + + switch (srv_operation) { + case SRV_OPERATION_BACKUP: + case SRV_OPERATION_RESTORE_DELTA: + case SRV_OPERATION_BACKUP_NO_DEFER: + ut_ad(0); + break; + case SRV_OPERATION_RESTORE_EXPORT: + case SRV_OPERATION_RESTORE: + break; + case SRV_OPERATION_NORMAL: + case SRV_OPERATION_EXPORT_RESTORED: + size_t len= strlen(db); + if (len <= 4 || strcmp(db + len - 4, dot_ext[IBD])) { + break; + } + df_rem_per.open_link_file({db, len - 4}); + + if (!df_rem_per.filepath()) { + break; + } + + /* An ISL file was found with contents. */ + if (df_rem_per.open_read_only(false) != DB_SUCCESS + || df_rem_per.validate_for_recovery() + != DB_SUCCESS) { + + /* Assume that this ISL file is intended to + be used. Do not continue looking for another + if this file cannot be opened or is not + a valid IBD file. */ + ib::error() << "ISL file '" + << df_rem_per.link_filepath() + << "' was found but the linked file '" + << df_rem_per.filepath() + << "' could not be opened or is" + " not correct."; + return(false); + } + + /* Use this file if it has the space_id from the + FILE_ record. */ + if (df_rem_per.space_id() == space_id) { + df.set_filepath(df_rem_per.filepath()); + df.open_read_only(false); + return(true); + } + + /* Since old MLOG records can use the same basename + in multiple CREATE/DROP TABLE sequences, this ISL + file could be pointing to a later version of this + basename.ibd file which has a different space_id. + Keep looking. */ + } + } + + /* No ISL files were found in the default location. Use the location + given in the redo log. */ + if (df.open_read_only(false) == DB_SUCCESS + && df.validate_for_recovery() == DB_SUCCESS + && df.space_id() == space_id) { + return(true); + } + + /* A datafile was not discovered for the filename given. */ + return(false); +} + +bool fil_crypt_check(fil_space_crypt_t *crypt_data, const char *f_name) +{ + if (crypt_data->is_key_found()) + return true; + sql_print_error("InnoDB: Encryption key is not found for %s", f_name); + crypt_data->~fil_space_crypt_t(); + ut_free(crypt_data); + return false; +} + +/** Open an ibd tablespace and add it to the InnoDB data structures. +This is similar to fil_ibd_open() except that it is used while processing +the REDO log, so the data dictionary is not available and very little +validation is done. The tablespace name is extracted from the +dbname/tablename.ibd portion of the filename, which assumes that the file +is a file-per-table tablespace. Any name will do for now. General +tablespace names will be read from the dictionary after it has been +recovered. The tablespace flags are read at this time from the first page +of the file in validate_for_recovery().
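+For example, a FILE_CREATE or FILE_MODIFY record naming "./db1/t1.ibd"
+is loaded here under the provisional name "db1/t1"; as noted above, any
+general tablespace name is corrected later from the recovered dictionary.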
+@param[in] space_id tablespace ID +@param[in] filename path/to/databasename/tablename.ibd +@param[out] space the tablespace, or NULL on error +@return status of the operation */ +enum fil_load_status +fil_ibd_load(uint32_t space_id, const char *filename, fil_space_t *&space) +{ + /* If a space is already in the file system cache with this + space ID, then there is nothing to do. */ + mysql_mutex_lock(&fil_system.mutex); + space = fil_space_get_by_id(space_id); + mysql_mutex_unlock(&fil_system.mutex); + + if (space) { + /* Compare the filename we are trying to open with the + filename from the first node of the tablespace we opened + previously. Fail if it is different. */ + fil_node_t* node = UT_LIST_GET_FIRST(space->chain); + if (0 != strcmp(innobase_basename(filename), + innobase_basename(node->name))) { + ib::info() + << "Ignoring data file '" << filename + << "' with space ID " << space->id + << ". Another data file called " << node->name + << " exists with the same space ID."; + space = NULL; + return(FIL_LOAD_ID_CHANGED); + } + return(FIL_LOAD_OK); + } + + if (srv_operation == SRV_OPERATION_RESTORE) { + /* Replace absolute DATA DIRECTORY file paths with + short names relative to the backup directory. */ + const char* name = strrchr(filename, '/'); +#ifdef _WIN32 + if (const char *last = strrchr(filename, '\\')) { + if (last > name) { + name = last; + } + } +#endif + if (name) { + while (--name > filename +#ifdef _WIN32 + && *name != '\\' +#endif + && *name != '/'); + if (name > filename) { + filename = name + 1; + } + } + } + + Datafile file; + file.set_filepath(filename); + file.open_read_only(false); + + if (!file.is_open()) { + /* The file has been moved or it is a remote datafile. */ + if (!fil_ibd_discover(space_id, file) + || !file.is_open()) { + return(FIL_LOAD_NOT_FOUND); + } + } + + os_offset_t size; + bool deferred_space = false; + + /* Read and validate the first page of the tablespace. + Assign a tablespace name based on the tablespace type. */ + switch (file.validate_for_recovery()) { + os_offset_t minimum_size; + case DB_SUCCESS: + deferred_space = file.m_defer; + + if (deferred_space) { + goto tablespace_check; + } + + if (file.space_id() != space_id) { + return(FIL_LOAD_ID_CHANGED); + } +tablespace_check: + /* Get and test the file size. */ + size = os_file_get_size(file.handle()); + + /* Every .ibd file is created >= 4 pages in size. + Smaller files cannot be OK. */ + minimum_size = os_offset_t(FIL_IBD_FILE_INITIAL_SIZE) + << srv_page_size_shift; + + if (size == static_cast<os_offset_t>(-1)) { + /* The following call prints an error message */ + os_file_get_last_error(true); + + ib::error() << "Could not measure the size of" + " single-table tablespace file '" + << file.filepath() << "'"; + } else if (deferred_space) { + return FIL_LOAD_DEFER; + } else if (size < minimum_size) { + ib::error() << "The size of tablespace file '" + << file.filepath() << "' is only " << size + << ", should be at least " << minimum_size + << "!"; + } else { + /* Everything is fine so far. */ + break; + } + + /* fall through */ + + case DB_TABLESPACE_EXISTS: + return(FIL_LOAD_INVALID); + + default: + return(FIL_LOAD_NOT_FOUND); + } + + ut_ad(space == NULL); + + /* Adjust the memory-based flags that would normally be set by + dict_tf_to_fsp_flags(). In recovery, we have no data dictionary.
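+(As a worked example of the size check earlier in this function: with
+the default 16KiB page size, srv_page_size_shift = 14, the minimum is
+4 << 14 = 65536 bytes, so any .ibd file shorter than 64KiB is reported
+and treated as invalid.)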
*/ + uint32_t flags = file.flags(); + if (fil_space_t::is_compressed(flags)) { + flags |= page_zip_level + << FSP_FLAGS_MEM_COMPRESSION_LEVEL; + } + + const byte* first_page = file.get_first_page(); + fil_space_crypt_t* crypt_data = first_page + ? fil_space_read_crypt_data(fil_space_t::zip_size(flags), + first_page) + : NULL; + + if (crypt_data && !fil_crypt_check(crypt_data, filename)) { + return FIL_LOAD_INVALID; + } + + mysql_mutex_lock(&fil_system.mutex); + + space = fil_space_t::create( + space_id, flags, FIL_TYPE_TABLESPACE, crypt_data); + + if (space == NULL) { + mysql_mutex_unlock(&fil_system.mutex); + return(FIL_LOAD_INVALID); + } + + ut_ad(space->id == file.space_id()); + ut_ad(space->id == space_id); + + /* We do not use the size information we have about the file, because + the rounding formula for extents and pages is somewhat complex; we + let fil_node_open() do that task. */ + + space->add(file.filepath(), OS_FILE_CLOSED, 0, false, false); + mysql_mutex_unlock(&fil_system.mutex); + + return(FIL_LOAD_OK); +} + +/** Try to adjust FSP_SPACE_FLAGS if they differ from the expectations. +(Typically when upgrading from MariaDB 10.1.0..10.1.20.) +@param[in,out] space tablespace +@param[in] flags desired tablespace flags */ +void fsp_flags_try_adjust(fil_space_t *space, uint32_t flags) +{ + ut_ad(!srv_read_only_mode); + ut_ad(fil_space_t::is_valid_flags(flags, space->id)); + if (space->full_crc32() || fil_space_t::full_crc32(flags)) { + return; + } + if (!space->size && (space->purpose != FIL_TYPE_TABLESPACE + || !space->get_size())) { + return; + } + /* This code is executed during server startup while no + connections are allowed. We do not need to protect against + DROP TABLE by fil_space_acquire(). */ + mtr_t mtr; + mtr.start(); + if (buf_block_t* b = buf_page_get( + page_id_t(space->id, 0), space->zip_size(), + RW_X_LATCH, &mtr)) { + uint32_t f = fsp_header_get_flags(b->page.frame); + if (fil_space_t::full_crc32(f)) { + goto func_exit; + } + if (fil_space_t::is_flags_equal(f, flags)) { + goto func_exit; + } + /* Suppress the message if only the DATA_DIR flag differs. */ + if ((f ^ flags) & ~(1U << FSP_FLAGS_POS_RESERVED)) { + ib::warn() + << "adjusting FSP_SPACE_FLAGS of file '" + << UT_LIST_GET_FIRST(space->chain)->name + << "' from " << ib::hex(f) + << " to " << ib::hex(flags); + } + mtr.set_named_space(space); + mtr.write<4,mtr_t::FORCED>(*b, + FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + + b->page.frame, flags); + } +func_exit: + mtr.commit(); +} + +/** Determine if a matching tablespace exists in the InnoDB tablespace +memory cache. Note that if we have not done a crash recovery at the database +startup, there may be many tablespaces which are not yet in the memory cache. +@param[in] id Tablespace ID +@param[in] table_flags table flags +@return the tablespace +@retval NULL if no matching tablespace exists in the memory cache */ +fil_space_t *fil_space_for_table_exists_in_mem(uint32_t id, + uint32_t table_flags) +{ + const uint32_t expected_flags = dict_tf_to_fsp_flags(table_flags); + + mysql_mutex_lock(&fil_system.mutex); + if (fil_space_t* space = fil_space_get_by_id(id)) { + uint32_t tf = expected_flags & ~FSP_FLAGS_MEM_MASK; + uint32_t sf = space->flags & ~FSP_FLAGS_MEM_MASK; + + if (!fil_space_t::is_flags_equal(tf, sf) + && !fil_space_t::is_flags_equal(sf, tf)) { + goto func_exit; + } + + /* Adjust the flags that are in FSP_FLAGS_MEM_MASK. + FSP_SPACE_FLAGS will not be written back here.
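+ In fsp_flags_try_adjust() above, the warning is suppressed when only
+ the reserved DATA_DIR bit differs, i.e. when
+ ((f ^ flags) & ~(1U << FSP_FLAGS_POS_RESERVED)) == 0, while the forced
+ 4-byte write of the corrected FSP_SPACE_FLAGS happens either way;
+ here, by contrast, only the in-memory FSP_FLAGS_MEM_MASK bits are
+ patched: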
*/ + space->flags = (space->flags & ~FSP_FLAGS_MEM_MASK) + | (expected_flags & FSP_FLAGS_MEM_MASK); + mysql_mutex_unlock(&fil_system.mutex); + if (!srv_read_only_mode) { + fsp_flags_try_adjust(space, expected_flags + & ~FSP_FLAGS_MEM_MASK); + } + return space; + } + +func_exit: + mysql_mutex_unlock(&fil_system.mutex); + return NULL; +} + +/*============================ FILE I/O ================================*/ + +/** Report information about an invalid page access. */ +ATTRIBUTE_COLD +static void fil_invalid_page_access_msg(const char *name, + os_offset_t offset, ulint len, + bool is_read) +{ + sql_print_error("%s %zu bytes at " UINT64PF + " outside the bounds of the file: %s", + is_read + ? "InnoDB: Trying to read" + : "[FATAL] InnoDB: Trying to write", len, offset, name); + if (!is_read) + abort(); +} + +/** Update the data structures on write completion */ +inline void fil_node_t::complete_write() +{ + mysql_mutex_assert_not_owner(&fil_system.mutex); + + if (space->purpose != FIL_TYPE_TEMPORARY && + srv_file_flush_method != SRV_O_DIRECT_NO_FSYNC && + space->set_needs_flush()) + { + mysql_mutex_lock(&fil_system.mutex); + if (!space->is_in_unflushed_spaces) + { + space->is_in_unflushed_spaces= true; + fil_system.unflushed_spaces.push_front(*space); + } + mysql_mutex_unlock(&fil_system.mutex); + } +} + +/** Read or write data. +@param type I/O context +@param offset offset in bytes +@param len number of bytes +@param buf the data to be read or written +@param bpage buffer block (for type.is_async() completion callback) +@return status and file descriptor */ +fil_io_t fil_space_t::io(const IORequest &type, os_offset_t offset, size_t len, + void *buf, buf_page_t *bpage) +{ + ut_ad(referenced()); + ut_ad(offset % UNIV_ZIP_SIZE_MIN == 0); + ut_ad(len % 512 == 0); /* page_compressed */ + ut_ad(fil_validate_skip()); + ut_ad(type.is_read() || type.is_write()); + ut_ad(type.type != IORequest::DBLWR_BATCH); + + if (type.is_read()) { + srv_stats.data_read.add(len); + } else { + ut_ad(!srv_read_only_mode || this == fil_system.temp_space); + srv_stats.data_written.add(len); + } + + fil_node_t* node= UT_LIST_GET_FIRST(chain); + ut_ad(node); + ulint p = static_cast<ulint>(offset >> srv_page_size_shift); + dberr_t err; + + if (type.type == IORequest::READ_ASYNC && is_stopping()) { + err = DB_TABLESPACE_DELETED; + node = nullptr; + goto release; + } + + DBUG_EXECUTE_IF("intermittent_recovery_failure", + if (type.is_read() && !(~get_rnd_value() & 0x3ff0)) + goto io_error;); + + DBUG_EXECUTE_IF("intermittent_read_failure", + if (srv_was_started && type.is_read() && + !(~get_rnd_value() & 0x3ff0)) goto io_error;); + + if (UNIV_LIKELY_NULL(UT_LIST_GET_NEXT(chain, node))) { + ut_ad(this == fil_system.sys_space + || this == fil_system.temp_space); + ut_ad(!(offset & ((1 << srv_page_size_shift) - 1))); + + while (node->size <= p) { + p -= node->size; + node = UT_LIST_GET_NEXT(chain, node); + if (!node) { +fail: + if (type.type != IORequest::READ_ASYNC) { + fil_invalid_page_access_msg( + node->name, + offset, len, + type.is_read()); + } +#ifndef DBUG_OFF +io_error: +#endif + set_corrupted(); + err = DB_CORRUPTION; + node = nullptr; + goto release; + } + } + + offset = os_offset_t{p} << srv_page_size_shift; + } + + if (UNIV_UNLIKELY(node->size <= p)) { + goto fail; + } + + if (type.type == IORequest::PUNCH_RANGE) { + err = os_file_punch_hole(node->handle, offset, len); + /* If punch hole is not supported, mark the file so that + we will not attempt it again */ + if (UNIV_UNLIKELY(err == DB_IO_NO_PUNCH_HOLE)) { + node->punch_hole =
false; + err = DB_SUCCESS; + } + goto release_sync_write; + } else { + /* Queue the aio request */ + err = os_aio(IORequest{bpage, type.slot, node, type.type}, + buf, offset, len); + } + + if (!type.is_async()) { + if (type.is_write()) { +release_sync_write: + node->complete_write(); +release: + release(); + goto func_exit; + } + ut_ad(fil_validate_skip()); + } + if (err != DB_SUCCESS) { + goto release; + } +func_exit: + return {err, node}; +} + +#include <tpool.h> + +void IORequest::write_complete(int io_error) const +{ + ut_ad(fil_validate_skip()); + ut_ad(node); + ut_ad(is_write()); + node->complete_write(); + + if (!bpage) + { + ut_ad(!srv_read_only_mode); + if (type == IORequest::DBLWR_BATCH) + buf_dblwr.flush_buffered_writes_completed(*this); + else + ut_ad(type == IORequest::WRITE_ASYNC); + } + else + buf_page_write_complete(*this, io_error); + + node->space->release(); +} + +void IORequest::read_complete(int io_error) const +{ + ut_ad(fil_validate_skip()); + ut_ad(node); + ut_ad(is_read()); + ut_ad(bpage); + + /* IMPORTANT: since i/o handling for reads will read also the insert + buffer in fil_system.sys_space, we have to be very careful not to + introduce deadlocks. We never close fil_system.sys_space data files + and never issue asynchronous reads of change buffer pages. */ + const page_id_t id(bpage->id()); + + if (UNIV_UNLIKELY(io_error != 0)) + { + sql_print_error("InnoDB: Read error %d of page " UINT32PF " in file %s", + io_error, id.page_no(), node->name); + buf_pool.corrupted_evict(bpage, buf_page_t::READ_FIX); + corrupted: + if (recv_recovery_is_on() && !srv_force_recovery) + { + mysql_mutex_lock(&recv_sys.mutex); + recv_sys.set_corrupt_fs(); + mysql_mutex_unlock(&recv_sys.mutex); + } + } + else if (dberr_t err= bpage->read_complete(*node)) + { + if (err != DB_FAIL) + ib::error() << "Failed to read page " << id.page_no() + << " from file '" << node->name << "': " << err; + goto corrupted; + } + + node->space->release(); +} + +/** Flush to disk the writes in file spaces of the given type +possibly cached by the OS. */ +void fil_flush_file_spaces() +{ + if (srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC) + { + ut_d(mysql_mutex_lock(&fil_system.mutex)); + ut_ad(fil_system.unflushed_spaces.empty()); + ut_d(mysql_mutex_unlock(&fil_system.mutex)); + return; + } + +rescan: + mysql_mutex_lock(&fil_system.mutex); + + for (fil_space_t &space : fil_system.unflushed_spaces) + { + if (space.needs_flush_not_stopping()) + { + space.reacquire(); + mysql_mutex_unlock(&fil_system.mutex); + space.flush_low(); + space.release(); + goto rescan; + } + } + + mysql_mutex_unlock(&fil_system.mutex); +} + +/** Functor to validate the file node list of a tablespace. */ +struct Check { + /** Total size of file nodes visited so far */ + ulint size; + /** Total number of open files visited so far */ + ulint n_open; + + /** Constructor */ + Check() : size(0), n_open(0) {} + + /** Visit a file node + @param[in] elem file node to visit */ + void operator()(const fil_node_t* elem) + { + n_open += elem->is_open(); + size += elem->size; + } + + /** Validate a tablespace.
+ @param[in] space tablespace to validate + @return number of open file nodes */ + static ulint validate(const fil_space_t* space) + { + mysql_mutex_assert_owner(&fil_system.mutex); + Check check; + ut_list_validate(space->chain, check); + ut_a(space->size == check.size); + + switch (space->id) { + case TRX_SYS_SPACE: + ut_ad(fil_system.sys_space == NULL + || fil_system.sys_space == space); + break; + case SRV_TMP_SPACE_ID: + ut_ad(fil_system.temp_space == NULL + || fil_system.temp_space == space); + break; + default: + break; + } + + return(check.n_open); + } +}; + +/******************************************************************//** +Checks the consistency of the tablespace cache. +@return true if ok */ +bool fil_validate() +{ + ulint n_open = 0; + + mysql_mutex_lock(&fil_system.mutex); + + for (fil_space_t &space : fil_system.space_list) { + n_open += Check::validate(&space); + } + + ut_a(fil_system.n_open == n_open); + + mysql_mutex_unlock(&fil_system.mutex); + + return(true); +} + +/*********************************************************************//** +Sets the file page type. */ +void +fil_page_set_type( +/*==============*/ + byte* page, /*!< in/out: file page */ + ulint type) /*!< in: type */ +{ + ut_ad(page); + + mach_write_to_2(page + FIL_PAGE_TYPE, type); +} + +/********************************************************************//** +Delete the tablespace file and any related files like .cfg. +This should not be called for temporary tables. +@param[in] ibd_filepath File path of the IBD tablespace */ +void fil_delete_file(const char *ibd_filepath) +{ + ib::info() << "Deleting " << ibd_filepath; + os_file_delete_if_exists(innodb_data_file_key, ibd_filepath, nullptr); + + if (char *cfg_filepath= fil_make_filepath(ibd_filepath, + fil_space_t::name_type{}, CFG, + false)) + { + os_file_delete_if_exists(innodb_data_file_key, cfg_filepath, nullptr); + ut_free(cfg_filepath); + } +} + +#ifdef UNIV_DEBUG +/** Check that a tablespace is valid for mtr_commit(). +@param[in] space persistent tablespace that has been changed */ +static +void +fil_space_validate_for_mtr_commit( + const fil_space_t* space) +{ + mysql_mutex_assert_not_owner(&fil_system.mutex); + ut_ad(space != NULL); + ut_ad(space->purpose == FIL_TYPE_TABLESPACE); + ut_ad(!is_predefined_tablespace(space->id)); + + /* We are serving mtr_commit(). While there is an active + mini-transaction, we should have !space->stop_new_ops. This is + guaranteed by meta-data locks or transactional locks. */ + ut_ad(!space->is_stopping() + || space->is_being_truncated /* fil_truncate_prepare() */ + || space->referenced()); +} +#endif /* UNIV_DEBUG */ + +/** Note that a non-predefined persistent tablespace has been modified +by redo log. +@param[in,out] space tablespace */ +void +fil_names_dirty( + fil_space_t* space) +{ +#ifndef SUX_LOCK_GENERIC + ut_ad(log_sys.latch.is_write_locked()); +#endif + ut_ad(recv_recovery_is_on()); + ut_ad(log_sys.get_lsn() != 0); + ut_ad(space->max_lsn == 0); + ut_d(fil_space_validate_for_mtr_commit(space)); + + fil_system.named_spaces.push_back(*space); + space->max_lsn = log_sys.get_lsn(); +} + +/** Write a FILE_MODIFY record when a non-predefined persistent +tablespace was modified for the first time since fil_names_clear(). 
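+In outline (an editorial sketch of the protocol): the first commit that
+modifies a tablespace after a checkpoint stamps space->max_lsn and links
+the space into fil_system.named_spaces via fil_names_dirty() above; at
+the next checkpoint, fil_names_clear() below re-emits FILE_MODIFY for
+spaces that are still dirty and unlinks those whose max_lsn precedes the
+checkpoint LSN.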
*/ +ATTRIBUTE_NOINLINE ATTRIBUTE_COLD void mtr_t::name_write() +{ +#ifndef SUX_LOCK_GENERIC + ut_ad(log_sys.latch.is_write_locked()); +#endif + ut_d(fil_space_validate_for_mtr_commit(m_user_space)); + ut_ad(!m_user_space->max_lsn); + m_user_space->max_lsn= log_sys.get_lsn(); + + fil_system.named_spaces.push_back(*m_user_space); + ut_ad(UT_LIST_GET_LEN(m_user_space->chain) == 1); + + mtr_t mtr; + mtr.start(); + fil_name_write(m_user_space->id, + UT_LIST_GET_FIRST(m_user_space->chain)->name, + &mtr); + mtr.commit_files(); +} + +/** On a log checkpoint, reset fil_names_dirty_and_write() flags +and write out FILE_MODIFY if needed, and write FILE_CHECKPOINT. +@param lsn checkpoint LSN +@return current LSN */ +lsn_t fil_names_clear(lsn_t lsn) +{ + mtr_t mtr; + +#ifndef SUX_LOCK_GENERIC + ut_ad(log_sys.latch.is_write_locked()); +#endif + ut_ad(lsn); + ut_ad(log_sys.is_latest()); + + mtr.start(); + + for (auto it = fil_system.named_spaces.begin(); + it != fil_system.named_spaces.end(); ) { + if (mtr.get_log_size() + strlen(it->chain.start->name) + >= recv_sys.MTR_SIZE_MAX - (3 + 5)) { + /* Prevent log parse buffer overflow */ + mtr.commit_files(); + mtr.start(); + } + + auto next = std::next(it); + + ut_ad(it->max_lsn > 0); + if (it->max_lsn < lsn) { + /* The tablespace was last dirtied before the + checkpoint LSN. Remove it from the list, so + that if the tablespace is not going to be + modified any more, subsequent checkpoints will + avoid calling fil_names_write() on it. */ + it->max_lsn = 0; + fil_system.named_spaces.erase(it); + } + + /* max_lsn is the last LSN where fil_names_dirty_and_write() + was called. If we kept track of "min_lsn" (the first LSN + where max_lsn turned nonzero), we could avoid the + fil_names_write() call if min_lsn > lsn. */ + ut_ad(UT_LIST_GET_LEN((*it).chain) == 1); + fil_name_write((*it).id, UT_LIST_GET_FIRST((*it).chain)->name, + &mtr); + it = next; + } + + return mtr.commit_files(lsn); +} + +/* Unit Tests */ +#ifdef UNIV_ENABLE_UNIT_TEST_MAKE_FILEPATH +#define MF fil_make_filepath +#define DISPLAY ib::info() << path +void +test_make_filepath() +{ + char* path; + const char* long_path = + "this/is/a/very/long/path/including/a/very/" + "looooooooooooooooooooooooooooooooooooooooooooooooo" + "oooooooooooooooooooooooooooooooooooooooooooooooooo" + "oooooooooooooooooooooooooooooooooooooooooooooooooo" + "oooooooooooooooooooooooooooooooooooooooooooooooooo" + "oooooooooooooooooooooooooooooooooooooooooooooooooo" + "oooooooooooooooooooooooooooooooooooooooooooooooooo" + "oooooooooooooooooooooooooooooooooooooooooooooooooo" + "oooooooooooooooooooooooooooooooooooooooooooooooooo" + "oooooooooooooooooooooooooooooooooooooooooooooooooo" + "oooooooooooooooooooooooooooooooooooooooooooooooong" + "/folder/name"; + path = MF("/this/is/a/path/with/a/filename", NULL, IBD, false); DISPLAY; + path = MF("/this/is/a/path/with/a/filename", NULL, ISL, false); DISPLAY; + path = MF("/this/is/a/path/with/a/filename", NULL, CFG, false); DISPLAY; + path = MF("/this/is/a/path/with/a/filename.ibd", NULL, IBD, false); DISPLAY; + path = MF("/this/is/a/path/with/a/filename.ibd", NULL, IBD, false); DISPLAY; + path = MF("/this/is/a/path/with/a/filename.dat", NULL, IBD, false); DISPLAY; + path = MF(NULL, "tablespacename", NO_EXT, false); DISPLAY; + path = MF(NULL, "tablespacename", IBD, false); DISPLAY; + path = MF(NULL, "dbname/tablespacename", NO_EXT, false); DISPLAY; + path = MF(NULL, "dbname/tablespacename", IBD, false); DISPLAY; + path = MF(NULL, "dbname/tablespacename", ISL, false); DISPLAY; + path = 
MF(NULL, "dbname/tablespacename", CFG, false); DISPLAY; + path = MF(NULL, "dbname\\tablespacename", NO_EXT, false); DISPLAY; + path = MF(NULL, "dbname\\tablespacename", IBD, false); DISPLAY; + path = MF("/this/is/a/path", "dbname/tablespacename", IBD, false); DISPLAY; + path = MF("/this/is/a/path", "dbname/tablespacename", IBD, true); DISPLAY; + path = MF("./this/is/a/path", "dbname/tablespacename.ibd", IBD, true); DISPLAY; + path = MF("this\\is\\a\\path", "dbname/tablespacename", IBD, true); DISPLAY; + path = MF("/this/is/a/path", "dbname\\tablespacename", IBD, true); DISPLAY; + path = MF(long_path, NULL, IBD, false); DISPLAY; + path = MF(long_path, "tablespacename", IBD, false); DISPLAY; + path = MF(long_path, "tablespacename", IBD, true); DISPLAY; +} +#endif /* UNIV_ENABLE_UNIT_TEST_MAKE_FILEPATH */ +/* @} */ + +/** Determine the block size of the data file. +@param[in] space tablespace +@param[in] offset page number +@return block size */ +ulint fil_space_get_block_size(const fil_space_t *space, unsigned offset) +{ + ulint block_size = 512; + + for (fil_node_t* node = UT_LIST_GET_FIRST(space->chain); + node != NULL; + node = UT_LIST_GET_NEXT(chain, node)) { + block_size = node->block_size; + if (node->size > offset) { + ut_ad(node->size <= 0xFFFFFFFFU); + break; + } + offset -= static_cast(node->size); + } + + /* Currently supporting block size up to 4K, + fall back to default if bigger requested. */ + if (block_size > 4096) { + block_size = 512; + } + + return block_size; +} + +/** @return the tablespace name (databasename/tablename) */ +fil_space_t::name_type fil_space_t::name() const +{ + switch (id) { + case 0: + return name_type{"innodb_system", 13}; + case SRV_TMP_SPACE_ID: + return name_type{"innodb_temporary", 16}; + } + + if (!UT_LIST_GET_FIRST(chain) || srv_is_undo_tablespace(id)) + return name_type{}; + + ut_ad(purpose != FIL_TYPE_TEMPORARY); + ut_ad(UT_LIST_GET_LEN(chain) == 1); + + const char *path= UT_LIST_GET_FIRST(chain)->name; + const char *sep= strchr(path, '/'); + ut_ad(sep); + + while (const char *next_sep= strchr(sep + 1, '/')) + path= sep + 1, sep= next_sep; + +#ifdef _WIN32 + if (const char *last_sep= strchr(path, '\\')) + if (last_sep < sep) + path= last_sep; +#endif + + size_t len= strlen(path); + ut_ad(len > 4); + len-= 4; + ut_ad(!strcmp(&path[len], DOT_IBD)); + + return name_type{path, len}; +} + +#ifdef UNIV_DEBUG + +fil_space_t *fil_space_t::next_in_space_list() +{ + space_list_t::iterator it(this); + auto end= fil_system.space_list.end(); + if (it == end) + return nullptr; + ++it; + return it == end ? nullptr : &*it; +} + +fil_space_t *fil_space_t::prev_in_space_list() +{ + space_list_t::iterator it(this); + if (it == fil_system.space_list.begin()) + return nullptr; + --it; + return &*it; +} + +fil_space_t *fil_space_t::next_in_unflushed_spaces() +{ + sized_ilist::iterator it(this); + auto end= fil_system.unflushed_spaces.end(); + if (it == end) + return nullptr; + ++it; + return it == end ? nullptr : &*it; +} + +fil_space_t *fil_space_t::prev_in_unflushed_spaces() +{ + sized_ilist::iterator it(this); + if (it == fil_system.unflushed_spaces.begin()) + return nullptr; + --it; + return &*it; +} + +#endif diff --git a/storage/innobase/fil/fil0pagecompress.cc b/storage/innobase/fil/fil0pagecompress.cc new file mode 100644 index 00000000..16aea2a7 --- /dev/null +++ b/storage/innobase/fil/fil0pagecompress.cc @@ -0,0 +1,584 @@ +/***************************************************************************** + +Copyright (C) 2013, 2021, MariaDB Corporation. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file fil/fil0pagecompress.cc +Implementation for page compressed file spaces. + +Created 11/12/2013 Jan Lindström jan.lindstrom@mariadb.com +Updated 14/02/2015 +***********************************************************************/ + +#include "fil0fil.h" +#include "fil0pagecompress.h" + +#include + +#include "mem0mem.h" +#include "hash0hash.h" +#include "os0file.h" +#include "mach0data.h" +#include "buf0buf.h" +#include "buf0flu.h" +#include "log0recv.h" +#include "fsp0fsp.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "mtr0mtr.h" +#include "mtr0log.h" +#include "dict0dict.h" +#include "page0page.h" +#include "page0zip.h" +#include "trx0sys.h" +#include "row0mysql.h" +#include "buf0lru.h" +#include "ibuf0ibuf.h" +#include "zlib.h" +#ifdef __linux__ +#include <linux/fs.h> +#include <sys/ioctl.h> +#include <fcntl.h> +#endif +#include "row0mysql.h" +#include "lz4.h" +#include "lzo/lzo1x.h" +#include "lzma.h" +#include "bzlib.h" +#include "snappy-c.h" + +/** Compress a page for the given compression algorithm.
+@param[in] buf page to be compressed +@param[out] out_buf compressed page +@param[in] header_len header length of the page +@param[in] comp_algo compression algorithm +@param[in] comp_level compression level +@return actual length of compressed page data +@retval 0 if the page was not compressed */ +static ulint fil_page_compress_low( + const byte* buf, + byte* out_buf, + ulint header_len, + ulint comp_algo, + unsigned comp_level) +{ + ulint write_size = srv_page_size - header_len; + + switch (comp_algo) { + default: + ut_ad("unknown compression method" == 0); + /* fall through */ + case PAGE_UNCOMPRESSED: + return 0; + + case PAGE_ZLIB_ALGORITHM: + { + ulong len = uLong(write_size); + if (Z_OK == compress2( + out_buf + header_len, &len, buf, + uLong(srv_page_size), int(comp_level))) { + return len; + } + } + break; + + case PAGE_LZ4_ALGORITHM: + write_size = LZ4_compress_default( + reinterpret_cast<const char*>(buf), + reinterpret_cast<char*>(out_buf) + header_len, + int(srv_page_size), int(write_size)); + + return write_size; + + case PAGE_LZO_ALGORITHM: { + lzo_uint len = write_size; + + if (LZO_E_OK == lzo1x_1_15_compress( + buf, srv_page_size, + out_buf + header_len, &len, + out_buf + srv_page_size) + && len <= write_size) { + return len; + } + break; + } + + case PAGE_LZMA_ALGORITHM: { + size_t out_pos = 0; + + if (LZMA_OK == lzma_easy_buffer_encode( + comp_level, LZMA_CHECK_NONE, NULL, + buf, srv_page_size, out_buf + header_len, + &out_pos, write_size) + && out_pos <= write_size) { + return out_pos; + } + break; + } + + case PAGE_BZIP2_ALGORITHM: { + unsigned len = unsigned(write_size); + if (BZ_OK == BZ2_bzBuffToBuffCompress( + reinterpret_cast<char*>(out_buf + header_len), + &len, + const_cast<char*>( + reinterpret_cast<const char*>(buf)), + unsigned(srv_page_size), 1, 0, 0) + && len <= write_size) { + return len; + } + break; + } + + case PAGE_SNAPPY_ALGORITHM: { + size_t len = snappy_max_compressed_length(srv_page_size); + + if (SNAPPY_OK == snappy_compress( + reinterpret_cast<const char*>(buf), + srv_page_size, + reinterpret_cast<char*>(out_buf) + header_len, + &len) + && len <= write_size) { + return len; + } + break; + } + } + + return 0; +} + +/** Compress a page_compressed page for full crc32 format. +@param[in] buf page to be compressed +@param[out] out_buf compressed page +@param[in] flags tablespace flags +@param[in] block_size file system block size +@param[in] encrypted whether the page will be subsequently encrypted +@return actual length of compressed page +@retval 0 if the page was not compressed */ +static ulint fil_page_compress_for_full_crc32( + const byte* buf, + byte* out_buf, + uint32_t flags, + ulint block_size, + bool encrypted) +{ + ulint comp_level = FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags); + ulint comp_algo = fil_space_t::get_compression_algo(flags); + + if (comp_level == 0) { + comp_level = page_zip_level; + } + + const ulint header_len = FIL_PAGE_COMP_ALGO; + + ulint write_size = fil_page_compress_low( + buf, out_buf, header_len, + comp_algo, + static_cast<unsigned>(comp_level)); + + if (write_size == 0) { +fail: + if (comp_algo != PAGE_UNCOMPRESSED) + srv_stats.pages_page_compression_error.inc(); + return 0; + } + + write_size += header_len; + const ulint actual_size = write_size; + /* Write the actual length of the data & page type + for full crc32 format. */ + const bool lsb = fil_space_t::full_crc32_page_compressed_len(flags); + /* In the MSB, store the rounded-up page size.
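+For example, with lsb = 0 and a compressed payload of actual_size = 5000
+bytes, the expression below gives (5000 + 259) & ~255 = 5120: payload
+plus 4-byte checksum rounded up to a multiple of 256, whose high byte
+5120 >> 8 = 20 is what gets stored in FIL_PAGE_TYPE + 1.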
*/ + write_size = (write_size + lsb + (4 + 255)) & ~255; + if (write_size >= srv_page_size) { + goto fail; + } + + /* Set up the page header */ + memcpy(out_buf, buf, header_len); + out_buf[FIL_PAGE_TYPE] = 1U << (FIL_PAGE_COMPRESS_FCRC32_MARKER - 8); + out_buf[FIL_PAGE_TYPE + 1] = byte(write_size >> 8); + /* Clean up the buffer for the remaining write_size (except checksum) */ + memset(out_buf + actual_size, 0, write_size - actual_size - 4); + if (lsb) { + /* Store the LSB */ + out_buf[write_size - 5] = byte(actual_size + (1 + 4)); + } + + if (!block_size) { + block_size = 512; + } + + ut_ad(write_size); + if (write_size & (block_size - 1)) { + size_t tmp = write_size; + write_size = (write_size + (block_size - 1)) + & ~(block_size - 1); + memset(out_buf + tmp, 0, write_size - tmp); + } + + srv_stats.page_compression_saved.add(srv_page_size - write_size); + srv_stats.pages_page_compressed.inc(); + + return write_size; +} + +/** Compress a page_compressed page for non full crc32 format. +@param[in] buf page to be compressed +@param[out] out_buf compressed page +@param[in] flags tablespace flags +@param[in] block_size file system block size +@param[in] encrypted whether the page will be subsequently encrypted +@return actual length of compressed page +@retval 0 if the page was not compressed */ +static ulint fil_page_compress_for_non_full_crc32( + const byte* buf, + byte* out_buf, + ulint flags, + ulint block_size, + bool encrypted) +{ + uint comp_level = static_cast<uint>( + FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags)); + ulint header_len = FIL_PAGE_DATA + FIL_PAGE_COMP_METADATA_LEN; + /* Cache to avoid change during function execution */ + ulint comp_algo = innodb_compression_algorithm; + + if (encrypted) { + header_len += FIL_PAGE_ENCRYPT_COMP_ALGO; + } + + /* If no compression level was provided to this table, use system + default level */ + if (comp_level == 0) { + comp_level = page_zip_level; + } + + ulint write_size = fil_page_compress_low( + buf, out_buf, + header_len, comp_algo, comp_level); + + if (write_size == 0) { + if (comp_algo != PAGE_UNCOMPRESSED) + srv_stats.pages_page_compression_error.inc(); + return 0; + } + + /* Set up the page header */ + memcpy(out_buf, buf, FIL_PAGE_DATA); + /* Set up the checksum */ + mach_write_to_4(out_buf + FIL_PAGE_SPACE_OR_CHKSUM, BUF_NO_CHECKSUM_MAGIC); + + /* Set up the compression algorithm */ + mach_write_to_8(out_buf + FIL_PAGE_COMP_ALGO, comp_algo); + + if (encrypted) { + /* Set up the correct page type */ + mach_write_to_2(out_buf + FIL_PAGE_TYPE, + FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED); + + mach_write_to_2(out_buf + FIL_PAGE_DATA + + FIL_PAGE_ENCRYPT_COMP_ALGO, comp_algo); + } else { + /* Set up the correct page type */ + mach_write_to_2(out_buf + FIL_PAGE_TYPE, FIL_PAGE_PAGE_COMPRESSED); + } + + /* Set up the actual payload length */ + mach_write_to_2(out_buf + FIL_PAGE_DATA + FIL_PAGE_COMP_SIZE, + write_size); + + ut_ad(mach_read_from_4(out_buf + FIL_PAGE_SPACE_OR_CHKSUM) + == BUF_NO_CHECKSUM_MAGIC); + + ut_ad(mach_read_from_2(out_buf + FIL_PAGE_DATA + FIL_PAGE_COMP_SIZE) + == write_size); + +#ifdef UNIV_DEBUG + bool is_compressed = (mach_read_from_8(out_buf + FIL_PAGE_COMP_ALGO) + == (ulint) comp_algo); + + bool is_encrypted_compressed = + (mach_read_from_2(out_buf + FIL_PAGE_DATA + + FIL_PAGE_ENCRYPT_COMP_ALGO) + == (ulint) comp_algo); +#endif /* UNIV_DEBUG */ + + ut_ad(is_compressed || is_encrypted_compressed); + + write_size+=header_len; + + if (block_size <= 0) { + block_size = 512; + } + + ut_ad(write_size > 0 && block_size > 0); + +
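+	/* Worked example (illustrative numbers): write_size = 5321 on a
+	4096-byte filesystem block is padded by the code below to 8192,
+	with the padding zero-filled so the tail of the block is
+	deterministic on disk. */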
/* Actual write needs to be aligned on block size */ + if (write_size % block_size) { + size_t tmp = write_size; + write_size = (size_t)ut_uint64_align_up( + (ib_uint64_t)write_size, block_size); + /* Clean up the end of buffer */ + memset(out_buf+tmp, 0, write_size - tmp); +#ifdef UNIV_DEBUG + ut_a(write_size > 0 && ((write_size % block_size) == 0)); + ut_a(write_size >= tmp); +#endif + } + + srv_stats.page_compression_saved.add(srv_page_size - write_size); + srv_stats.pages_page_compressed.inc(); + + return write_size; +} + +/** Compress a page_compressed page before writing to a data file. +@param[in] buf page to be compressed +@param[out] out_buf compressed page +@param[in] flags tablespace flags +@param[in] block_size file system block size +@param[in] encrypted whether the page will be subsequently encrypted +@return actual length of compressed page +@retval 0 if the page was not compressed */ +ulint fil_page_compress( + const byte* buf, + byte* out_buf, + uint32_t flags, + ulint block_size, + bool encrypted) +{ + /* The full_crc32 page_compressed format assumes this. */ + ut_ad(!(block_size & 255)); + ut_ad(ut_is_2pow(block_size)); + + /* Let's not compress file space header or + extent descriptor */ + switch (fil_page_get_type(buf)) { + case 0: + case FIL_PAGE_TYPE_FSP_HDR: + case FIL_PAGE_TYPE_XDES: + case FIL_PAGE_PAGE_COMPRESSED: + return 0; + } + + if (fil_space_t::full_crc32(flags)) { + return fil_page_compress_for_full_crc32( + buf, out_buf, flags, block_size, encrypted); + } + + return fil_page_compress_for_non_full_crc32( + buf, out_buf, flags, block_size, encrypted); +} + +/** Decompress a page that may be subject to page_compressed compression. +@param[in,out] tmp_buf temporary buffer (of innodb_page_size) +@param[in,out] buf possibly compressed page buffer +@param[in] comp_algo compression algorithm +@param[in] header_len header length of the page +@param[in] actual_size actual size of the page +@return true if the page was decompressed, false on failure */ +static bool fil_page_decompress_low( + byte* tmp_buf, + byte* buf, + ulint comp_algo, + ulint header_len, + ulint actual_size) +{ + switch (comp_algo) { + default: + ib::error() << "Unknown compression algorithm " + << comp_algo; + return false; + case PAGE_ZLIB_ALGORITHM: + { + uLong len = srv_page_size; + return (Z_OK == uncompress(tmp_buf, &len, + buf + header_len, + uLong(actual_size)) + && len == srv_page_size); + } + + case PAGE_LZ4_ALGORITHM: + return LZ4_decompress_safe( + reinterpret_cast<const char*>(buf) + header_len, + reinterpret_cast<char*>(tmp_buf), + static_cast<int>(actual_size), + static_cast<int>(srv_page_size)) == + static_cast<int>(srv_page_size); + + case PAGE_LZO_ALGORITHM: + { + lzo_uint len_lzo = srv_page_size; + return (LZO_E_OK == lzo1x_decompress_safe( + buf + header_len, + actual_size, tmp_buf, &len_lzo, NULL) + && len_lzo == srv_page_size); + } + + case PAGE_LZMA_ALGORITHM: + { + size_t src_pos = 0; + size_t dst_pos = 0; + uint64_t memlimit = UINT64_MAX; + + return LZMA_OK == lzma_stream_buffer_decode( + &memlimit, 0, NULL, buf + header_len, + &src_pos, actual_size, tmp_buf, &dst_pos, + srv_page_size) + && dst_pos == srv_page_size; + } + + case PAGE_BZIP2_ALGORITHM: + { + uint dst_pos = static_cast<uint>(srv_page_size); + return BZ_OK == BZ2_bzBuffToBuffDecompress( + reinterpret_cast<char*>(tmp_buf), + &dst_pos, + reinterpret_cast<char*>(buf) + header_len, + static_cast<uint>(actual_size), 1, 0) + && dst_pos == srv_page_size; + } + + case PAGE_SNAPPY_ALGORITHM: + { + size_t olen = srv_page_size; + + return SNAPPY_OK == snappy_uncompress(
reinterpret_cast<const char*>(buf) + + header_len, + actual_size, + reinterpret_cast<char*>(tmp_buf), &olen) + && olen == srv_page_size; + } + } + + return false; +} + +/** Decompress a page for full crc32 format. +@param[in,out] tmp_buf temporary buffer (of innodb_page_size) +@param[in,out] buf possibly compressed page buffer +@param[in] flags tablespace flags +@return size of the compressed data +@retval 0 if decompression failed +@retval srv_page_size if the page was not compressed */ +static size_t fil_page_decompress_for_full_crc32(byte *tmp_buf, byte *buf, + uint32_t flags) +{ + ut_ad(fil_space_t::full_crc32(flags)); + bool compressed = false; + size_t size = buf_page_full_crc32_size(buf, &compressed, NULL); + if (!compressed) { + ut_ad(size == srv_page_size); + return size; + } + + if (!fil_space_t::is_compressed(flags)) { + return 0; + } + + if (size >= srv_page_size) { + return 0; + } + + if (fil_space_t::full_crc32_page_compressed_len(flags)) { + compile_time_assert(FIL_PAGE_FCRC32_CHECKSUM == 4); + if (size_t lsb = buf[size - 5]) { + size += lsb - 0x100; + } + size -= 5; + } + + const size_t header_len = FIL_PAGE_COMP_ALGO; + + if (!fil_page_decompress_low(tmp_buf, buf, + fil_space_t::get_compression_algo(flags), + header_len, size - header_len)) { + return 0; + } + + srv_stats.pages_page_decompressed.inc(); + memcpy(buf, tmp_buf, srv_page_size); + return size; +} + +/** Decompress a page for non full crc32 format. +@param[in,out] tmp_buf temporary buffer (of innodb_page_size) +@param[in,out] buf possibly compressed page buffer +@return size of the compressed data +@retval 0 if decompression failed +@retval srv_page_size if the page was not compressed */ +static size_t fil_page_decompress_for_non_full_crc32(byte *tmp_buf, byte *buf) +{ + ulint header_len; + uint comp_algo; + switch (fil_page_get_type(buf)) { + case FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED: + header_len= FIL_PAGE_DATA + FIL_PAGE_ENCRYPT_COMP_METADATA_LEN; + comp_algo = mach_read_from_2( + FIL_PAGE_DATA + FIL_PAGE_ENCRYPT_COMP_ALGO + buf); + break; + case FIL_PAGE_PAGE_COMPRESSED: + header_len = FIL_PAGE_DATA + FIL_PAGE_COMP_METADATA_LEN; + if (mach_read_from_6(FIL_PAGE_COMP_ALGO + buf)) { + return 0; + } + comp_algo = mach_read_from_2(FIL_PAGE_COMP_ALGO + 6 + buf); + break; + default: + return srv_page_size; + } + + if (mach_read_from_4(buf + FIL_PAGE_SPACE_OR_CHKSUM) + != BUF_NO_CHECKSUM_MAGIC) { + return 0; + } + + ulint actual_size = mach_read_from_2(buf + FIL_PAGE_DATA + + FIL_PAGE_COMP_SIZE); + + /* Check if payload size is corrupted */ + if (actual_size == 0 || actual_size > srv_page_size - header_len) { + return 0; + } + + if (!fil_page_decompress_low(tmp_buf, buf, comp_algo, header_len, + actual_size)) { + return 0; + } + + srv_stats.pages_page_decompressed.inc(); + memcpy(buf, tmp_buf, srv_page_size); + return actual_size; +} + +/** Decompress a page that may be subject to page_compressed compression.
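+In summary (editorial note): the full_crc32 format derives the
+compressed length from the page frame itself via
+buf_page_full_crc32_size(), while the older format reads it from
+FIL_PAGE_DATA + FIL_PAGE_COMP_SIZE and additionally requires the
+BUF_NO_CHECKSUM_MAGIC marker; both paths decompress into tmp_buf and
+copy the result back over buf on success.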
+@param[in,out] tmp_buf temporary buffer (of innodb_page_size) +@param[in,out] buf possibly compressed page buffer +@param[in] flags tablespace flags +@return size of the compressed data +@retval 0 if decompression failed +@retval srv_page_size if the page was not compressed */ +ulint fil_page_decompress(byte *tmp_buf, byte *buf, uint32_t flags) +{ + if (fil_space_t::full_crc32(flags)) { + return fil_page_decompress_for_full_crc32(tmp_buf, buf, flags); + } + + return fil_page_decompress_for_non_full_crc32(tmp_buf, buf); +} diff --git a/storage/innobase/fsp/fsp0file.cc b/storage/innobase/fsp/fsp0file.cc new file mode 100644 index 00000000..cafff419 --- /dev/null +++ b/storage/innobase/fsp/fsp0file.cc @@ -0,0 +1,936 @@ +/***************************************************************************** + +Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file fsp/fsp0file.cc +Tablespace data file implementation + +Created 2013-7-26 by Kevin Lewis +*******************************************************/ + +#include "fil0fil.h" +#include "fsp0types.h" +#include "os0file.h" +#include "page0page.h" +#include "srv0start.h" +#include "log.h" + +/** Release the resources. */ +void +Datafile::shutdown() +{ + close(); + + free_filepath(); + free_first_page(); +} + +/** Create/open a data file. +@param[in] read_only_mode if true, then readonly mode checks are enforced. +@return DB_SUCCESS or error code */ +dberr_t +Datafile::open_or_create(bool read_only_mode) +{ + bool success; + ut_a(m_filepath != NULL); + ut_ad(m_handle == OS_FILE_CLOSED); + + m_handle = os_file_create( + innodb_data_file_key, m_filepath, m_open_flags, + OS_FILE_NORMAL, OS_DATA_FILE, read_only_mode, &success); + + if (!success) { + m_last_os_error = os_file_get_last_error(true); + ib::error() << "Cannot open datafile '" << m_filepath << "'"; + return(DB_CANNOT_OPEN_FILE); + } + + return(DB_SUCCESS); +} + +/** Open a data file in read-only mode to check if it exists so that it +can be validated. 
+@param[in]	strict	whether to issue error messages
+@return DB_SUCCESS or error code */
+dberr_t
+Datafile::open_read_only(bool strict)
+{
+	bool	success = false;
+	ut_ad(m_handle == OS_FILE_CLOSED);
+
+	/* This function can be called for file objects that do not need
+	to be opened, which is the case when the m_filepath is NULL */
+	if (m_filepath == NULL) {
+		return(DB_ERROR);
+	}
+
+	set_open_flags(OS_FILE_OPEN);
+	m_handle = os_file_create_simple_no_error_handling(
+		innodb_data_file_key, m_filepath, m_open_flags,
+		OS_FILE_READ_ONLY, true, &success);
+
+	if (success) {
+		m_exists = true;
+		init_file_info();
+
+		return(DB_SUCCESS);
+	}
+
+	if (strict) {
+		m_last_os_error = os_file_get_last_error(true);
+		ib::error() << "Cannot open datafile for read-only: '"
+			<< m_filepath << "' OS error: " << m_last_os_error;
+	}
+
+	return(DB_CANNOT_OPEN_FILE);
+}
+
+/** Open a data file in read-write mode during start-up so that
+doublewrite pages can be restored and then it can be validated.
+@return DB_SUCCESS or error code */
+inline dberr_t Datafile::open_read_write()
+{
+	bool	success = false;
+	ut_ad(m_handle == OS_FILE_CLOSED);
+	ut_ad(!srv_read_only_mode);
+
+	/* This function can be called for file objects that do not need
+	to be opened, which is the case when the m_filepath is NULL */
+	if (m_filepath == NULL) {
+		return(DB_ERROR);
+	}
+
+	set_open_flags(OS_FILE_OPEN);
+	m_handle = os_file_create_simple_no_error_handling(
+		innodb_data_file_key, m_filepath, m_open_flags,
+		OS_FILE_READ_WRITE, false, &success);
+
+	if (!success) {
+		m_last_os_error = os_file_get_last_error(true);
+		ib::error() << "Cannot open datafile for read-write: '"
+			<< m_filepath << "'";
+		return(DB_CANNOT_OPEN_FILE);
+	}
+
+	m_exists = true;
+
+	init_file_info();
+
+	return(DB_SUCCESS);
+}
+
+/** Initialize OS specific file info. */
+void
+Datafile::init_file_info()
+{
+#ifdef _WIN32
+	GetFileInformationByHandle((os_file_t)m_handle, &m_file_info);
+#else
+	fstat(m_handle, &m_file_info);
+#endif	/* _WIN32 */
+}
+
+/** Close a data file.
+@return DB_SUCCESS or error code */
+dberr_t
+Datafile::close()
+{
+	if (m_handle != OS_FILE_CLOSED) {
+		ibool	success = os_file_close(m_handle);
+		ut_a(success);
+
+		m_handle = OS_FILE_CLOSED;
+	}
+
+	return(DB_SUCCESS);
+}
+
+/** Make a full filepath from a directory path and a filename.
+Prepend the dirpath to filename using the extension given.
+If dirpath is NULL, prepend the default datadir to filepath.
+Store the result in m_filepath.
+@param dirpath	directory path
+@param name	tablespace (table) name
+@param ext	filename extension */
+void Datafile::make_filepath(const char *dirpath, fil_space_t::name_type name,
+			     ib_extention ext)
+{
+	ut_ad(dirpath || name.size());
+	free_filepath();
+	m_filepath= fil_make_filepath(dirpath, name, ext, false);
+	ut_ad(m_filepath);
+	set_filename();
+}
+
+/** Set the filepath by duplicating the filepath sent in. This is the
+name of the file with its extension and absolute or relative path.
+@param[in]	filepath	filepath to set */
+void
+Datafile::set_filepath(const char* filepath)
+{
+	free_filepath();
+	m_filepath = static_cast<char*>(ut_malloc_nokey(strlen(filepath) + 1));
+	::strcpy(m_filepath, filepath);
+	set_filename();
+}
+
+/** Free the filepath buffer. */
+void
+Datafile::free_filepath()
+{
+	if (m_filepath != NULL) {
+		ut_free(m_filepath);
+		m_filepath = NULL;
+		m_filename = NULL;
+	}
+}
+
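+/* Illustration, not part of the upstream source: on POSIX systems two
+paths name the same file exactly when stat() reports the same device
+and inode pair, which is the comparison Datafile::same_as() below
+performs on the cached m_file_info. A standalone equivalent: */
+#if 0
+#include <sys/stat.h>
+
+static bool same_file_posix(const char *path1, const char *path2)
+{
+	struct stat s1, s2;
+	if (stat(path1, &s1) || stat(path2, &s2)) {
+		return false;	/* at least one path cannot be stat()ed */
+	}
+	/* same device, same inode: one underlying file */
+	return s1.st_dev == s2.st_dev && s1.st_ino == s2.st_ino;
+}
+#endif
+
+/** Do a quick test if the filepath provided looks the same as this filepath
+byte by byte.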
If they are two different looking paths to the same file,
+same_as() will be used to show that after the files are opened.
+@param[in]	other	filepath to compare with
+@retval true if it is the same filename by byte comparison
+@retval false if it looks different */
+bool
+Datafile::same_filepath_as(
+	const char* other) const
+{
+	return(0 == strcmp(m_filepath, other));
+}
+
+/** Test if another opened datafile is the same file as this object.
+@param[in]	other	Datafile to compare with
+@return true if it is the same file, else false */
+bool
+Datafile::same_as(
+	const Datafile&	other) const
+{
+#ifdef _WIN32
+	return(m_file_info.dwVolumeSerialNumber
+	       == other.m_file_info.dwVolumeSerialNumber
+	       && m_file_info.nFileIndexHigh
+	       == other.m_file_info.nFileIndexHigh
+	       && m_file_info.nFileIndexLow
+	       == other.m_file_info.nFileIndexLow);
+#else
+	return(m_file_info.st_ino == other.m_file_info.st_ino
+	       && m_file_info.st_dev == other.m_file_info.st_dev);
+#endif	/* _WIN32 */
+}
+
+/** Reads a few significant fields from the first page of the first
+datafile.  The Datafile must already be open.
+@param[in]	read_only_mode	If true, then readonly mode checks are enforced.
+@return DB_SUCCESS or DB_IO_ERROR if page cannot be read */
+dberr_t
+Datafile::read_first_page(bool read_only_mode)
+{
+	if (m_handle == OS_FILE_CLOSED) {
+
+		dberr_t err = open_or_create(read_only_mode);
+
+		if (err != DB_SUCCESS) {
+			return(err);
+		}
+	}
+
+	/* Align the memory for a possible read from a raw device */
+
+	m_first_page = static_cast<byte*>(
+		aligned_malloc(UNIV_PAGE_SIZE_MAX, srv_page_size));
+
+	dberr_t err = DB_ERROR;
+	size_t page_size = UNIV_PAGE_SIZE_MAX;
+
+	/* Don't want unnecessary complaints about partial reads. */
+
+	while (page_size >= UNIV_PAGE_SIZE_MIN) {
+
+		ulint n_read = 0;
+
+		err = os_file_read(
+			IORequestReadPartial, m_handle, m_first_page, 0,
+			page_size, &n_read);
+
+		if (err == DB_SUCCESS) {
+			break;
+		}
+
+		if (err == DB_IO_ERROR && n_read == 0) {
+			break;
+		}
+		if (err == DB_IO_ERROR && n_read >= UNIV_PAGE_SIZE_MIN) {
+			page_size >>= 1;
+		} else if (srv_operation == SRV_OPERATION_BACKUP) {
+			break;
+		} else {
+			ib::info() << "Cannot read first page of '"
+				<< m_filepath << "': " << err;
+			break;
+		}
+	}
+
+	if (err != DB_SUCCESS) {
+		return(err);
+	}
+
+	if (m_order == 0) {
+		if (memcmp_aligned<2>(FIL_PAGE_SPACE_ID + m_first_page,
+				      FSP_HEADER_OFFSET + FSP_SPACE_ID
+				      + m_first_page, 4)) {
+			ib::error()
+				<< "Inconsistent tablespace ID in "
+				<< m_filepath;
+			return DB_CORRUPTION;
+		}
+
+		m_space_id = mach_read_from_4(FIL_PAGE_SPACE_ID
+					      + m_first_page);
+		m_flags = fsp_header_get_flags(m_first_page);
+		if (!fil_space_t::is_valid_flags(m_flags, m_space_id)) {
+			uint32_t cflags = fsp_flags_convert_from_101(m_flags);
+			if (cflags == UINT32_MAX) {
+				switch (fsp_flags_is_incompatible_mysql(m_flags)) {
+				case 0:
+					sql_print_error("InnoDB: Invalid flags 0x%" PRIx32 " in %s",
+							m_flags, m_filepath);
+					return DB_CORRUPTION;
+				case 3:
+				case 2:
+					sql_print_error("InnoDB: MySQL-8.0 tablespace in %s",
+							m_filepath);
+					break;
+				case 1:
+					sql_print_error("InnoDB: MySQL Encrypted tablespace in %s",
+							m_filepath);
+					break;
+				}
+				sql_print_error("InnoDB: Restart in MySQL for migration/recovery.");
+				return DB_UNSUPPORTED;
+			} else {
+				m_flags = cflags;
+			}
+		}
+	}
+
+	const size_t physical_size = fil_space_t::physical_size(m_flags);
+
+	if (physical_size > page_size) {
+		ib::error() << "File " << m_filepath
+			<< " should be longer than "
+			<< page_size << " bytes";
+		return(DB_CORRUPTION);
+	}
+
+	return(err);
+}
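+
+/* Illustration, not part of the upstream source: read_first_page()
+above cannot know the page size before page 0 has been read, so it
+requests the maximum supported size and halves the request after a
+short read. A standalone sketch of the same probing loop over a plain
+POSIX file descriptor (PAGE_MIN/PAGE_MAX stand in for
+UNIV_PAGE_SIZE_MIN/UNIV_PAGE_SIZE_MAX): */
+#if 0
+#include <unistd.h>
+
+static const size_t PAGE_MIN = 4096, PAGE_MAX = 65536;
+
+static ssize_t probe_first_page(int fd, unsigned char *out)
+{
+	for (size_t size = PAGE_MAX; size >= PAGE_MIN; size >>= 1) {
+		ssize_t n = pread(fd, out, size, 0);
+		if (n == ssize_t(size)) {
+			return n;	/* complete read at this size */
+		}
+		if (n < ssize_t(PAGE_MIN)) {
+			break;	/* shorter than any valid first page */
+		}
+		/* partial read: retry with half the size */
+	}
+	return -1;
+}
+#endif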
+
+/** Free the first page from memory when it is no longer needed. */
+void Datafile::free_first_page()
+{
+	aligned_free(m_first_page);
+	m_first_page= nullptr;
+}
+
+/** Validates the datafile and checks that it conforms with the expected
+space ID and flags.  The file should exist and be successfully opened
+in order for this function to validate it.
+@param[in]	space_id	The expected tablespace ID.
+@param[in]	flags		The expected tablespace flags.
+@retval DB_SUCCESS if tablespace is valid, DB_ERROR if not.
+m_is_valid is also set true on success, else false. */
+dberr_t Datafile::validate_to_dd(uint32_t space_id, uint32_t flags)
+{
+	dberr_t err;
+
+	if (!is_open()) {
+		return DB_ERROR;
+	}
+
+	/* Validate this single-table-tablespace with the data dictionary,
+	but do not compare the DATA_DIR flag, in case the tablespace was
+	remotely located. */
+	err = validate_first_page();
+	if (err != DB_SUCCESS) {
+		return(err);
+	}
+
+	flags &= ~FSP_FLAGS_MEM_MASK;
+
+	/* Make sure the datafile we found matched the space ID.
+	If the datafile is a file-per-table tablespace then also match
+	the row format and zip page size. */
+	if (m_space_id == space_id
+	    && (fil_space_t::is_flags_equal(flags, m_flags)
+		|| fil_space_t::is_flags_equal(m_flags, flags))) {
+		/* Datafile matches the tablespace expected. */
+		return(DB_SUCCESS);
+	}
+
+	/* else do not use this tablespace. */
+	m_is_valid = false;
+
+	ib::error() << "Refusing to load '" << m_filepath << "' (id="
+		<< m_space_id << ", flags=" << ib::hex(m_flags)
+		<< "); dictionary contains id="
+		<< space_id << ", flags=" << ib::hex(flags);
+
+	return(DB_ERROR);
+}
+
+/** Validates this datafile for the purpose of recovery.  The file should
+exist and be successfully opened. We initially open it in read-only mode
+because we just want to read the SpaceID.  However, if the first page is
+corrupt and needs to be restored from the doublewrite buffer, we will
+reopen it in write mode and try to restore that page.
+@retval DB_SUCCESS if tablespace is valid, DB_ERROR if not.
+m_is_valid is also set true on success, else false. */
+dberr_t
+Datafile::validate_for_recovery()
+{
+	dberr_t err;
+
+	ut_ad(is_open());
+	ut_ad(!srv_read_only_mode);
+
+	err = validate_first_page();
+
+	switch (err) {
+	case DB_TABLESPACE_EXISTS:
+		break;
+	case DB_SUCCESS:
+		if (!m_defer || !m_space_id) {
+			break;
+		}
+		/* InnoDB should check whether the deferred
+		tablespace page0 can be recovered from
+		double write buffer. InnoDB should try
+		to recover only if m_space_id exists because
+		dblwr pages can be searched via {space_id, 0}.
+		m_space_id is set in read_first_page(). */
+		/* fall through */
+	default:
+		/* Re-open the file in read-write mode. Attempt to restore
+		page 0 from doublewrite and read the space ID from a survey
+		of the first few pages. */
+		close();
+		err = open_read_write();
+		if (err != DB_SUCCESS) {
+			return(err);
+		}
+
+		if (!m_defer) {
+			err = find_space_id();
+			if (err != DB_SUCCESS || m_space_id == 0) {
+				ib::error() << "Datafile '" << m_filepath
+					<< "' is corrupted. Cannot determine "
+					"the space ID from the first 64 pages.";
+				return(err);
+			}
+		}
+
+		if (m_space_id == UINT32_MAX) {
+			return DB_SUCCESS; /* empty file */
+		}
+
+		if (recv_sys.dblwr.restore_first_page(
+			    m_space_id, m_filepath, m_handle)) {
+			return m_defer ? err : DB_CORRUPTION;
+		}
+
+		/* Free the previously read first page and then re-validate.
*/
+		free_first_page();
+		m_defer = false;
+		err = validate_first_page();
+	}
+
+	return(err);
+}
+
+/** Check the consistency of the first page of a datafile when the
+tablespace is opened.  This occurs before the fil_space_t is created
+so the Space ID found here must not already be open.
+m_is_valid is set true on success, else false.
+@retval DB_SUCCESS if the datafile is valid
+@retval DB_CORRUPTION if the datafile is not readable
+@retval DB_TABLESPACE_EXISTS if there is a duplicate space_id */
+dberr_t Datafile::validate_first_page()
+{
+	const char*	error_txt = NULL;
+
+	m_is_valid = true;
+
+	if (m_first_page == NULL
+	    && read_first_page(srv_read_only_mode) != DB_SUCCESS) {
+
+		error_txt = "Cannot read first page";
+	}
+
+	if (error_txt != NULL) {
+err_exit:
+		free_first_page();
+
+		if (recv_recovery_is_on()
+		    || srv_operation == SRV_OPERATION_BACKUP) {
+			m_defer= true;
+			return DB_SUCCESS;
+		}
+
+		ib::info() << error_txt << " in datafile: " << m_filepath
+			<< ", Space ID:" << m_space_id << ", Flags: "
+			<< m_flags;
+		m_is_valid = false;
+		return(DB_CORRUPTION);
+	}
+
+	/* Check if the whole page is blank. */
+	if (!m_space_id && !m_flags) {
+		const byte*	b		= m_first_page;
+		ulint		nonzero_bytes	= srv_page_size;
+
+		while (*b == '\0' && --nonzero_bytes != 0) {
+
+			b++;
+		}
+
+		if (nonzero_bytes == 0) {
+			error_txt = "Header page consists of zero bytes";
+			goto err_exit;
+		}
+	}
+
+	if (!fil_space_t::is_valid_flags(m_flags, m_space_id)) {
+		/* Tablespace flags must be valid. */
+		error_txt = "Tablespace flags are invalid";
+		goto err_exit;
+	}
+
+	ulint logical_size = fil_space_t::logical_size(m_flags);
+
+	if (srv_page_size != logical_size) {
+		free_first_page();
+		if (recv_recovery_is_on()
+		    || srv_operation == SRV_OPERATION_BACKUP) {
+			m_defer= true;
+			return DB_SUCCESS;
+		}
+		/* Logical size must be innodb_page_size. */
+		ib::error()
+			<< "Data file '" << m_filepath << "' uses page size "
+			<< logical_size << ", but the innodb_page_size"
+			" start-up parameter is "
+			<< srv_page_size;
+		return(DB_ERROR);
+	}
+
+	if (page_get_page_no(m_first_page) != 0) {
+		/* First page must be number 0 */
+		error_txt = "Header page contains inconsistent data";
+		goto err_exit;
+	}
+
+	if (m_space_id >= SRV_SPACE_ID_UPPER_BOUND) {
+		error_txt = "A bad Space ID was found";
+		goto err_exit;
+	}
+
+	if (buf_page_is_corrupted(false, m_first_page, m_flags)) {
+		/* Look for checksum and other corruptions. */
+		error_txt = "Checksum mismatch";
+		goto err_exit;
+	}
+
+	mysql_mutex_lock(&fil_system.mutex);
+
+	fil_space_t*	space = fil_space_get_by_id(m_space_id);
+
+	if (space) {
+		fil_node_t*	node = UT_LIST_GET_FIRST(space->chain);
+
+		if (node && !strcmp(m_filepath, node->name)) {
+ok_exit:
+			mysql_mutex_unlock(&fil_system.mutex);
+			return DB_SUCCESS;
+		}
+
+		if (!m_space_id
+		    && (recv_recovery_is_on()
+			|| srv_operation == SRV_OPERATION_BACKUP)) {
+			m_defer= true;
+			goto ok_exit;
+		}
+
+		/* Make sure the space_id has not already been opened. */
+		ib::error() << "Attempted to open a previously opened"
+			" tablespace. Previous tablespace: "
+			<< (node ? node->name : "(unknown)")
+			<< " uses space ID: " << m_space_id
+			<< ". Cannot open filepath: " << m_filepath
+			<< " which uses the same space ID.";
+	}
+
+	mysql_mutex_unlock(&fil_system.mutex);
+
+	if (space) {
+		m_is_valid = false;
+
+		free_first_page();
+
+		return(is_predefined_tablespace(m_space_id)
+		       ? DB_CORRUPTION
+		       : DB_TABLESPACE_EXISTS);
+	}
+
+	return(DB_SUCCESS);
+}
+
+/** Determine the space id of the given file descriptor by reading a few
+pages from the beginning of the .ibd file.
+@return DB_SUCCESS if space id was successfully identified,
+else DB_CORRUPTION. */
+dberr_t
+Datafile::find_space_id()
+{
+	os_offset_t	file_size;
+
+	ut_ad(m_handle != OS_FILE_CLOSED);
+
+	file_size = os_file_get_size(m_handle);
+
+	if (!file_size) {
+		return DB_SUCCESS;
+	}
+
+	if (file_size == (os_offset_t) -1) {
+		ib::error() << "Could not get file size of datafile '"
+			<< m_filepath << "'";
+		return(DB_CORRUPTION);
+	}
+
+	/* Assuming a page size, read the space_id from each page and store it
+	in a map.  Find out which space_id is agreed on by majority of the
+	pages.  Choose that space_id. */
+	for (ulint page_size = UNIV_ZIP_SIZE_MIN;
+	     page_size <= UNIV_PAGE_SIZE_MAX;
+	     page_size <<= 1) {
+		/* map[space_id] = count of pages */
+		typedef std::map<
+			uint32_t,
+			uint32_t,
+			std::less<uint32_t>,
+			ut_allocator<std::pair<const uint32_t, uint32_t> > >
+			Pages;
+
+		Pages	verify;
+		uint32_t page_count = 64;
+		uint32_t valid_pages = 0;
+
+		/* Adjust the number of pages to analyze based on file size */
+		while ((page_count * page_size) > file_size) {
+			--page_count;
+		}
+
+		ib::info()
+			<< "Page size:" << page_size
+			<< ". Pages to analyze:" << page_count;
+
+		byte*	page = static_cast<byte*>(
+			aligned_malloc(page_size, page_size));
+
+		uint32_t fsp_flags;
+		/* provide dummy value if the first os_file_read() fails */
+		switch (srv_checksum_algorithm) {
+		case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
+		case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
+			fsp_flags = 1U << FSP_FLAGS_FCRC32_POS_MARKER
+				| FSP_FLAGS_FCRC32_PAGE_SSIZE()
+				| uint(innodb_compression_algorithm)
+				<< FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO;
+			break;
+		default:
+			fsp_flags = 0;
+		}
+
+		for (ulint j = 0; j < page_count; ++j) {
+			if (os_file_read(IORequestRead, m_handle, page,
+					 j * page_size, page_size, nullptr)) {
+				ib::info()
+					<< "READ FAIL: page_no:" << j;
+				continue;
+			}
+
+			if (j == 0) {
+				fsp_flags = mach_read_from_4(
+					page + FSP_HEADER_OFFSET
+					+ FSP_SPACE_FLAGS);
+			}
+
+			bool	noncompressed_ok = false;
+
+			/* For noncompressed pages, the page size must be
+			equal to srv_page_size. */
+			if (page_size == srv_page_size
+			    && !fil_space_t::zip_size(fsp_flags)) {
+				noncompressed_ok = !buf_page_is_corrupted(
+					false, page, fsp_flags);
+			}
+
+			bool	compressed_ok = false;
+
+			if (srv_page_size <= UNIV_PAGE_SIZE_DEF
+			    && page_size == fil_space_t::zip_size(fsp_flags)) {
+				compressed_ok = !buf_page_is_corrupted(
+					false, page, fsp_flags);
+			}
+
+			if (noncompressed_ok || compressed_ok) {
+
+				uint32_t space_id = mach_read_from_4(page
+					+ FIL_PAGE_SPACE_ID);
+
+				if (space_id > 0) {
+
+					ib::info()
+						<< "VALID: space:"
+						<< space_id << " page_no:" << j
+						<< " page_size:" << page_size;
+
+					++valid_pages;
+
+					++verify[space_id];
+				}
+			}
+		}
+
+		aligned_free(page);
+
+		ib::info()
+			<< "Page size: " << page_size
+			<< ". Possible space_id count:" << verify.size();
+
+		const ulint pages_corrupted = 3;
+
+		for (ulint missed = 0; missed <= pages_corrupted; ++missed) {
+
+			for (Pages::const_iterator it = verify.begin();
+			     it != verify.end();
+			     ++it) {
+
+				ib::info() << "space_id:" << it->first
+					<< ", Number of pages matched: "
+					<< it->second << "/" << valid_pages
+					<< " (" << page_size << ")";
+
+				if (it->second == (valid_pages - missed)) {
+					ib::info() << "Chosen space:"
+						<< it->first;
+
+					m_space_id = it->first;
+					return(DB_SUCCESS);
+				}
+			}
+
+		}
+	}
+
+	return(DB_CORRUPTION);
+}
+
+/** Read an InnoDB Symbolic Link (ISL) file by name.
+@param link_filepath	filepath of the ISL file
+@return data file name (must be freed by the caller)
+@retval nullptr	on error */
+static char *read_link_file(const char *link_filepath)
+{
+  if (FILE* file= fopen(link_filepath, "r+b" STR_O_CLOEXEC))
+  {
+    char *filepath= static_cast<char*>(ut_malloc_nokey(OS_FILE_MAX_PATH));
+
+    os_file_read_string(file, filepath, OS_FILE_MAX_PATH);
+    fclose(file);
+
+    if (size_t len= strlen(filepath))
+    {
+      /* Trim whitespace from end of filepath */
+      len--;
+      while (static_cast<unsigned char>(filepath[len]) <= 0x20)
+      {
+        if (!len)
+          return nullptr;
+        filepath[len--]= 0;
+      }
+      /* Ensure that the last 2 path separators are forward slashes,
+      because elsewhere we are assuming that tablespace file names end
+      in "/databasename/tablename.ibd". */
+      unsigned trailing_slashes= 0;
+      for (; len; len--)
+      {
+        switch (filepath[len]) {
+#ifdef _WIN32
+        case '\\':
+          filepath[len]= '/';
+          /* fall through */
+#endif
+        case '/':
+          if (++trailing_slashes >= 2)
+            return filepath;
+        }
+      }
+    }
+  }
+
+  return nullptr;
+}
+
+/** Create a link filename,
+open that file, and read the contents into m_filepath.
+@param name	table name
+@return filepath()
+@retval nullptr	if the .isl file does not exist or cannot be read */
+const char *RemoteDatafile::open_link_file(const fil_space_t::name_type name)
+{
+  if (!m_link_filepath)
+    m_link_filepath= fil_make_filepath(nullptr, name, ISL, false);
+  m_filepath= read_link_file(m_link_filepath);
+  return m_filepath;
+}
+
+/** Release the resources. */
+void
+RemoteDatafile::shutdown()
+{
+	Datafile::shutdown();
+
+	if (m_link_filepath != 0) {
+		ut_free(m_link_filepath);
+		m_link_filepath = 0;
+	}
+}
+
+/** Create InnoDB Symbolic Link (ISL) file.
+@param name	tablespace name
+@param filepath	full file name
+@return DB_SUCCESS or error code */
+dberr_t RemoteDatafile::create_link_file(fil_space_t::name_type name,
+					 const char *filepath)
+{
+	bool		success;
+	dberr_t		err = DB_SUCCESS;
+	char*		link_filepath = NULL;
+	char*		prev_filepath = NULL;
+
+	ut_ad(!srv_read_only_mode);
+
+	link_filepath = fil_make_filepath(NULL, name, ISL, false);
+
+	if (link_filepath == NULL) {
+		return(DB_ERROR);
+	}
+
+	prev_filepath = read_link_file(link_filepath);
+	if (prev_filepath) {
+		/* Truncate (starting with MySQL 5.6, probably no
+		longer since MariaDB Server 10.2.19) used to call this
+		with an existing link file which contains the same filepath. */
+		bool same = !strncmp(prev_filepath, name.data(), name.size())
+			&& !strcmp(prev_filepath + name.size(), DOT_IBD);
+		ut_free(prev_filepath);
+		if (same) {
+			ut_free(link_filepath);
+			return(DB_SUCCESS);
+		}
+	}
+
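+	/* An ISL file is a plain text file that holds nothing but the
+	full path of the remote data file; a stale or conflicting link
+	must not be silently overwritten, hence the existence check
+	below. */
+	/** Check if the file already exists.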
*/ + FILE* file = NULL; + bool exists; + os_file_type_t ftype; + + success = os_file_status(link_filepath, &exists, &ftype); + ulint error = 0; + + if (success && !exists) { + + file = fopen(link_filepath, "w"); + if (file == NULL) { + /* This call will print its own error message */ + error = os_file_get_last_error(true); + } + } else { + error = OS_FILE_ALREADY_EXISTS; + } + + if (error != 0) { + + ib::error() << "Cannot create file " << link_filepath << "."; + + if (error == OS_FILE_ALREADY_EXISTS) { + ib::error() << "The link file: " << link_filepath + << " already exists."; + err = DB_TABLESPACE_EXISTS; + + } else if (error == OS_FILE_DISK_FULL) { + err = DB_OUT_OF_FILE_SPACE; + + } else { + err = DB_ERROR; + } + + /* file is not open, no need to close it. */ + ut_free(link_filepath); + return(err); + } + + const size_t len = strlen(filepath); + if (fwrite(filepath, 1, len, file) != len) { + error = os_file_get_last_error(true); + ib::error() << + "Cannot write link file: " + << link_filepath << " filepath: " << filepath; + err = DB_ERROR; + } + + /* Close the file, we only need it at startup */ + fclose(file); + + ut_free(link_filepath); + + return(err); +} + +/** Delete an InnoDB Symbolic Link (ISL) file. */ +void +RemoteDatafile::delete_link_file(void) +{ + ut_ad(m_link_filepath != NULL); + + if (m_link_filepath != NULL) { + os_file_delete_if_exists(innodb_data_file_key, + m_link_filepath, NULL); + } +} + +/** Delete an InnoDB Symbolic Link (ISL) file by name. +@param name tablespace name */ +void RemoteDatafile::delete_link_file(fil_space_t::name_type name) +{ + if (char *link_filepath= fil_make_filepath(NULL, name, ISL, false)) + { + os_file_delete_if_exists(innodb_data_file_key, link_filepath, nullptr); + ut_free(link_filepath); + } +} diff --git a/storage/innobase/fsp/fsp0fsp.cc b/storage/innobase/fsp/fsp0fsp.cc new file mode 100644 index 00000000..6c5c354e --- /dev/null +++ b/storage/innobase/fsp/fsp0fsp.cc @@ -0,0 +1,3070 @@ +/***************************************************************************** + +Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file fsp/fsp0fsp.cc +File space management + +Created 11/29/1995 Heikki Tuuri +***********************************************************************/ + +#include "fsp0fsp.h" +#include "buf0buf.h" +#include "fil0fil.h" +#include "fil0crypt.h" +#include "mtr0log.h" +#include "ut0byte.h" +#include "page0page.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "ibuf0ibuf.h" +#include "btr0btr.h" +#include "btr0sea.h" +#include "dict0boot.h" +#include "log0log.h" +#include "dict0mem.h" +#include "fsp0types.h" +#include "log.h" + +typedef uint32_t page_no_t; + +/** Returns the first extent descriptor for a segment. +We think of the extent lists of the segment catenated in the order +FSEG_FULL -> FSEG_NOT_FULL -> FSEG_FREE. +@param[in] inode segment inode +@param[in] space tablespace +@param[in,out] mtr mini-transaction +@param[out] err error code +@return the first extent descriptor, or NULL if none */ +MY_ATTRIBUTE((nonnull, warn_unused_result)) +static +xdes_t* +fseg_get_first_extent( + fseg_inode_t* inode, + const fil_space_t* space, + mtr_t* mtr, + dberr_t* err); + +ATTRIBUTE_COLD MY_ATTRIBUTE((nonnull, warn_unused_result)) +/** Put new extents to the free list if there are free extents above the free +limit. If an extent happens to contain an extent descriptor page, the extent +is put to the FSP_FREE_FRAG list with the page marked as used. +@param[in] init_space true if this is a single-table tablespace +and we are only initializing the first extent and the first bitmap pages; +then we will not allocate more extents +@param[in,out] space tablespace +@param[in,out] header tablespace header +@param[in,out] mtr mini-transaction */ +static +dberr_t +fsp_fill_free_list( + bool init_space, + fil_space_t* space, + buf_block_t* header, + mtr_t* mtr); + +/** Allocates a single free page from a segment. +This function implements the intelligent allocation strategy which tries to +minimize file space fragmentation. +@param[in,out] space tablespace +@param[in,out] seg_inode segment inode +@param[in,out] iblock segment inode page +@param[in] hint hint of which page would be desirable +@param[in] direction if the new page is needed because of +an index page split, and records are inserted there in order, into which +direction they go alphabetically: FSP_DOWN, FSP_UP, FSP_NO_DIR +@param[in,out] mtr mini-transaction +@param[in,out] init_mtr mtr or another mini-transaction in +which the page should be initialized. 
+@param[out]	err	error code
+@return the allocated page
+@retval nullptr	if no page could be allocated */
+static
+buf_block_t*
+fseg_alloc_free_page_low(
+	fil_space_t*		space,
+	fseg_inode_t*		seg_inode,
+	buf_block_t*		iblock,
+	uint32_t		hint,
+	byte			direction,
+#ifdef UNIV_DEBUG
+	bool			has_done_reservation,
+	/*!< whether the space has already been reserved */
+#endif /* UNIV_DEBUG */
+	mtr_t*			mtr,
+	mtr_t*			init_mtr,
+	dberr_t*		err)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+/** Get the tablespace header block, SX-latched
+@param[in]	space	tablespace
+@param[in,out]	mtr	mini-transaction
+@param[out]	err	error code
+@return pointer to the space header, page x-locked
+@retval nullptr	if the page cannot be retrieved or is corrupted */
+static buf_block_t *fsp_get_header(const fil_space_t *space, mtr_t *mtr,
+                                   dberr_t *err)
+{
+  const page_id_t id{space->id, 0};
+  buf_block_t *block= mtr->get_already_latched(id, MTR_MEMO_PAGE_SX_FIX);
+  if (block)
+    *err= DB_SUCCESS;
+  else
+  {
+    block= buf_page_get_gen(id, space->zip_size(), RW_SX_LATCH,
+                            nullptr, BUF_GET_POSSIBLY_FREED,
+                            mtr, err);
+    if (block &&
+        space->id != mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_ID +
+                                      block->page.frame))
+    {
+      *err= DB_CORRUPTION;
+      block= nullptr;
+    }
+  }
+  return block;
+}
+
+/** Set the XDES_FREE_BIT of a page.
+@tparam	free	desired value of XDES_FREE_BIT
+@param[in]	block	extent descriptor block
+@param[in,out]	descr	extent descriptor
+@param[in]	offset	page offset within the extent
+@param[in,out]	mtr	mini-transaction */
+template<bool free>
+inline void xdes_set_free(const buf_block_t &block, xdes_t *descr,
+                          ulint offset, mtr_t *mtr)
+{
+  ut_ad(mtr->memo_contains_flagged(&block, MTR_MEMO_PAGE_SX_FIX |
+                                   MTR_MEMO_PAGE_X_FIX));
+  ut_ad(offset < FSP_EXTENT_SIZE);
+  ut_ad(page_align(descr) == block.page.frame);
+  compile_time_assert(XDES_BITS_PER_PAGE == 2);
+  compile_time_assert(XDES_FREE_BIT == 0);
+  compile_time_assert(XDES_CLEAN_BIT == 1);
+
+  ulint index= XDES_BITS_PER_PAGE * offset;
+  byte *b= &descr[XDES_BITMAP + (index >> 3)];
+  /* xdes_init() should have set all XDES_CLEAN_BIT. */
+  ut_ad(!(~*b & 0xaa));
+  /* Clear or set XDES_FREE_BIT. */
+  byte val= free
+    ? static_cast<byte>(*b | 1 << (index & 7))
+    : static_cast<byte>(*b & ~(1 << (index & 7)));
+  mtr->write<1>(block, b, val);
+}
+
+/**
+Find a free page.
+@param descr	extent descriptor
+@param hint	page offset to start searching from (towards larger pages)
+@return free page offset
+@retval FIL_NULL if no page is free */
+inline uint32_t xdes_find_free(const xdes_t *descr, uint32_t hint= 0)
+{
+  const uint32_t extent_size= FSP_EXTENT_SIZE;
+  ut_ad(hint < extent_size);
+  for (uint32_t i= hint; i < extent_size; i++)
+    if (xdes_is_free(descr, i))
+      return i;
+  for (uint32_t i= 0; i < hint; i++)
+    if (xdes_is_free(descr, i))
+      return i;
+  return FIL_NULL;
+}
+
+/**
+Determine the number of used pages in a descriptor.
+@param descr	file descriptor
+@return number of pages used */
+inline uint32_t xdes_get_n_used(const xdes_t *descr)
+{
+  uint32_t count= 0;
+
+  for (uint32_t i= FSP_EXTENT_SIZE; i--; )
+    if (!xdes_is_free(descr, i))
+      count++;
+
+  return count;
+}
+
+/**
+Determine whether a file extent is full.
+@param descr	file descriptor
+@return whether all pages have been allocated */
+inline bool xdes_is_full(const xdes_t *descr)
+{
+  return FSP_EXTENT_SIZE == xdes_get_n_used(descr);
+}
+
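+/* Illustration, not part of the upstream source: an extent descriptor
+keeps XDES_BITS_PER_PAGE = 2 bits for each of the FSP_EXTENT_SIZE
+pages, so the free bit of the page at `offset` is the overall bit
+2 * offset of the bitmap (XDES_FREE_BIT == 0). A standalone model of
+the byte/bit arithmetic used by xdes_set_free() above: */
+#if 0
+#include <cstdint>
+
+static const unsigned BITS_PER_PAGE = 2;	/* XDES_BITS_PER_PAGE */
+
+static bool extent_page_is_free(const uint8_t *bitmap, unsigned offset)
+{
+	unsigned index = BITS_PER_PAGE * offset;
+	return bitmap[index >> 3] >> (index & 7) & 1;
+}
+
+static void extent_set_page_free(uint8_t *bitmap, unsigned offset, bool free)
+{
+	unsigned index = BITS_PER_PAGE * offset;
+	uint8_t &b = bitmap[index >> 3];
+	b = free ? uint8_t(b | 1 << (index & 7))
+		 : uint8_t(b & ~(1 << (index & 7)));
+}
+#endif
+
+/** Set the state of an extent descriptor.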
+@param[in]	block	extent descriptor block
+@param[in,out]	descr	extent descriptor
+@param[in]	state	the state
+@param[in,out]	mtr	mini-transaction */
+inline void xdes_set_state(const buf_block_t &block, xdes_t *descr,
+                           byte state, mtr_t *mtr)
+{
+  ut_ad(descr && mtr);
+  ut_ad(state >= XDES_FREE);
+  ut_ad(state <= XDES_FSEG);
+  ut_ad(mtr->memo_contains_flagged(&block, MTR_MEMO_PAGE_SX_FIX |
+                                   MTR_MEMO_PAGE_X_FIX));
+  ut_ad(page_align(descr) == block.page.frame);
+  ut_ad(mach_read_from_4(descr + XDES_STATE) <= XDES_FSEG);
+  mtr->write<1>(block, XDES_STATE + 3 + descr, state);
+}
+
+/**********************************************************************//**
+Gets the state of an xdes.
+@return state */
+UNIV_INLINE
+ulint
+xdes_get_state(
+/*===========*/
+	const xdes_t*	descr)	/*!< in: descriptor */
+{
+	ulint	state;
+
+	ut_ad(descr);
+	state = mach_read_from_4(descr + XDES_STATE);
+	ut_ad(state - 1 < XDES_FSEG);
+	return(state);
+}
+
+/**********************************************************************//**
+Inits an extent descriptor to the free and clean state. */
+inline void xdes_init(const buf_block_t &block, xdes_t *descr, mtr_t *mtr)
+{
+  ut_ad(mtr->memo_contains_flagged(&block, MTR_MEMO_PAGE_SX_FIX |
+                                   MTR_MEMO_PAGE_X_FIX));
+  mtr->memset(&block, uint16_t(descr - block.page.frame) + XDES_BITMAP,
+              XDES_SIZE - XDES_BITMAP, 0xff);
+  xdes_set_state(block, descr, XDES_FREE, mtr);
+}
+
+/** Mark a page used in an extent descriptor.
+@param[in,out]	seg_inode	segment inode
+@param[in,out]	iblock		segment inode page
+@param[in]	page		page number
+@param[in,out]	descr		extent descriptor
+@param[in,out]	xdes		extent descriptor page
+@param[in,out]	mtr		mini-transaction
+@return error code */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+fseg_mark_page_used(fseg_inode_t *seg_inode, buf_block_t *iblock,
+                    ulint page, xdes_t *descr, buf_block_t *xdes, mtr_t *mtr)
+{
+  ut_ad(fil_page_get_type(iblock->page.frame) == FIL_PAGE_INODE);
+  ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
+  ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + seg_inode, 4));
+  ut_ad(!memcmp(seg_inode + FSEG_ID, descr + XDES_ID, 4));
+
+  const uint16_t xoffset= uint16_t(descr - xdes->page.frame + XDES_FLST_NODE);
+  const uint16_t ioffset= uint16_t(seg_inode - iblock->page.frame);
+
+  if (!xdes_get_n_used(descr))
+  {
+    /* We move the extent from the free list to the NOT_FULL list */
+    if (dberr_t err= flst_remove(iblock, uint16_t(FSEG_FREE + ioffset),
+                                 xdes, xoffset, mtr))
+      return err;
+    if (dberr_t err= flst_add_last(iblock, uint16_t(FSEG_NOT_FULL + ioffset),
+                                   xdes, xoffset, mtr))
+      return err;
+  }
+
+  if (UNIV_UNLIKELY(!xdes_is_free(descr, page % FSP_EXTENT_SIZE)))
+    return DB_CORRUPTION;
+
+  /* We mark the page as used */
+  xdes_set_free<false>(*xdes, descr, page % FSP_EXTENT_SIZE, mtr);
+
+  byte* p_not_full= seg_inode + FSEG_NOT_FULL_N_USED;
+  const uint32_t not_full_n_used= mach_read_from_4(p_not_full) + 1;
+  mtr->write<4>(*iblock, p_not_full, not_full_n_used);
+  if (xdes_is_full(descr))
+  {
+    /* We move the extent from the NOT_FULL list to the FULL list */
+    if (dberr_t err= flst_remove(iblock, uint16_t(FSEG_NOT_FULL + ioffset),
+                                 xdes, xoffset, mtr))
+      return err;
+    if (dberr_t err= flst_add_last(iblock, uint16_t(FSEG_FULL + ioffset),
+                                   xdes, xoffset, mtr))
+      return err;
+    mtr->write<4>(*iblock, seg_inode + FSEG_NOT_FULL_N_USED,
+                  not_full_n_used - FSP_EXTENT_SIZE);
+  }
+
+  return DB_SUCCESS;
+}
+
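+/* Illustration, not part of the upstream source: with 16KiB pages,
+every 16384th page of a tablespace is an extent descriptor (XDES)
+page (page 0 doubles as the FSP_HDR page), and each XDES page holds
+one XDES_SIZE entry per FSP_EXTENT_SIZE = 64 pages that it covers.
+Locating the descriptor of an arbitrary page P then reduces to the
+arithmetic below, a simplified model of xdes_calc_descriptor_page()
+and xdes_calc_descriptor_index() for that page size: */
+#if 0
+#include <cstdint>
+
+static const uint32_t PAGES_PER_XDES_PAGE = 16384;
+static const uint32_t EXTENT_SIZE = 64;	/* FSP_EXTENT_SIZE */
+
+/* page number of the XDES page describing page_no */
+static uint32_t descriptor_page_no(uint32_t page_no)
+{
+	return page_no - page_no % PAGES_PER_XDES_PAGE;
+}
+
+/* index of the descriptor entry within that XDES page */
+static uint32_t descriptor_index(uint32_t page_no)
+{
+	return page_no % PAGES_PER_XDES_PAGE / EXTENT_SIZE;
+}
+#endif
+
+/** Get a pointer to the extent descriptor of a page.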
+@param[in,out] sp_header tablespace header page, x-latched +@param[in] space tablespace +@param[in] offset page offset +@param[in,out] mtr mini-transaction +@param[out] err error code +@param[out] desc_block descriptor block +@param[in] init_space whether the tablespace is being initialized +@return pointer to the extent descriptor, NULL if the page does not +exist in the space or if the offset exceeds free limit */ +UNIV_INLINE MY_ATTRIBUTE((warn_unused_result)) +xdes_t* +xdes_get_descriptor_with_space_hdr( + buf_block_t* header, + const fil_space_t* space, + page_no_t offset, + mtr_t* mtr, + dberr_t* err = nullptr, + buf_block_t** desc_block = nullptr, + bool init_space = false) +{ + ut_ad(space->is_owner()); + ut_ad(mtr->memo_contains_flagged(header, MTR_MEMO_PAGE_SX_FIX + | MTR_MEMO_PAGE_X_FIX)); + /* Read free limit and space size */ + uint32_t limit = mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT + + header->page.frame); + uint32_t size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE + + header->page.frame); + ut_ad(limit == space->free_limit + || (space->free_limit == 0 + && (init_space + || space->purpose == FIL_TYPE_TEMPORARY + || (srv_startup_is_before_trx_rollback_phase + && (space->id == TRX_SYS_SPACE + || srv_is_undo_tablespace(space->id)))))); + ut_ad(size == space->size_in_header); + + if (offset >= size || offset >= limit) { + return nullptr; + } + + const unsigned zip_size = space->zip_size(); + + uint32_t descr_page_no = xdes_calc_descriptor_page(zip_size, offset); + + buf_block_t* block = header; + + if (descr_page_no) { + block = buf_page_get_gen(page_id_t(space->id, descr_page_no), + zip_size, RW_SX_LATCH, nullptr, + BUF_GET_POSSIBLY_FREED, mtr, err); + } + + if (desc_block) { + *desc_block = block; + } + + return block + ? XDES_ARR_OFFSET + XDES_SIZE + * xdes_calc_descriptor_index(zip_size, offset) + + block->page.frame + : nullptr; +} + +MY_ATTRIBUTE((nonnull(1,3), warn_unused_result)) +/** Get the extent descriptor of a page. +The page where the extent descriptor resides is x-locked. If the page +offset is equal to the free limit of the space, we will add new +extents from above the free limit to the space free list, if not free +limit == space size. This adding is necessary to make the descriptor +defined, as they are uninitialized above the free limit. +@param[in] space tablespace +@param[in] offset page offset; if equal to the free limit, we +try to add new extents to the space free list +@param[in,out] mtr mini-transaction +@param[out] err error code +@param[out] xdes extent descriptor page +@return the extent descriptor */ +static xdes_t *xdes_get_descriptor(const fil_space_t *space, page_no_t offset, + mtr_t *mtr, dberr_t *err= nullptr, + buf_block_t **xdes= nullptr) +{ + if (buf_block_t *block= + buf_page_get_gen(page_id_t(space->id, 0), space->zip_size(), RW_SX_LATCH, + nullptr, BUF_GET_POSSIBLY_FREED, mtr, err)) + return xdes_get_descriptor_with_space_hdr(block, space, offset, mtr, + err, xdes); + return nullptr; +} + +MY_ATTRIBUTE((nonnull(3), warn_unused_result)) +/** Get a pointer to the extent descriptor. The page where the +extent descriptor resides is x-locked. 
+@param space	tablespace
+@param lst_node	file address of the list node contained in the descriptor
+@param mtr	mini-transaction
+@param err	error code
+@param block	extent descriptor block
+@return pointer to the extent descriptor */
+static inline
+xdes_t *xdes_lst_get_descriptor(const fil_space_t &space, fil_addr_t lst_node,
+                                mtr_t *mtr, buf_block_t **block= nullptr,
+                                dberr_t *err= nullptr)
+{
+  ut_ad(mtr->memo_contains(space));
+  ut_ad(lst_node.boffset < space.physical_size());
+  buf_block_t *b;
+  if (!block)
+    block= &b;
+  *block= buf_page_get_gen(page_id_t{space.id, lst_node.page},
+                           space.zip_size(), RW_SX_LATCH,
+                           nullptr, BUF_GET_POSSIBLY_FREED, mtr, err);
+  if (*block)
+    return (*block)->page.frame + lst_node.boffset - XDES_FLST_NODE;
+
+  space.set_corrupted();
+  return nullptr;
+}
+
+/********************************************************************//**
+Returns page offset of the first page in extent described by a descriptor.
+@return offset of the first page in extent */
+static uint32_t xdes_get_offset(const xdes_t *descr)
+{
+  ut_ad(descr);
+  return page_get_page_no(page_align(descr)) +
+    uint32_t(((page_offset(descr) - XDES_ARR_OFFSET) / XDES_SIZE) *
+             FSP_EXTENT_SIZE);
+}
+
+/** Initialize a file page whose prior contents should be ignored.
+@param[in,out]	block	buffer pool block */
+void fsp_apply_init_file_page(buf_block_t *block)
+{
+  memset_aligned<UNIV_PAGE_SIZE_MIN>(block->page.frame, 0, srv_page_size);
+  const page_id_t id(block->page.id());
+
+  mach_write_to_4(block->page.frame + FIL_PAGE_OFFSET, id.page_no());
+  memset_aligned<8>(block->page.frame + FIL_PAGE_PREV, 0xff, 8);
+  mach_write_to_4(block->page.frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+                  id.space());
+  if (page_zip_des_t* page_zip= buf_block_get_page_zip(block))
+  {
+    memset_aligned<UNIV_ZIP_SIZE_MIN>(page_zip->data, 0,
+                                      page_zip_get_size(page_zip));
+    static_assert(FIL_PAGE_OFFSET == 4, "compatibility");
+    memcpy_aligned<4>(page_zip->data + FIL_PAGE_OFFSET,
+                      block->page.frame + FIL_PAGE_OFFSET, 4);
+    memset_aligned<8>(page_zip->data + FIL_PAGE_PREV, 0xff, 8);
+    static_assert(FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID % 4 == 2,
+                  "not perfect alignment");
+    memcpy_aligned<2>(page_zip->data + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+                      block->page.frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 4);
+  }
+}
+
+#ifdef UNIV_DEBUG
+/** Assert that the mini-transaction is compatible with
+updating an allocation bitmap page.
+@param[in]	mtr	mini-transaction */
+void fil_space_t::modify_check(const mtr_t& mtr) const
+{
+  switch (mtr.get_log_mode()) {
+  case MTR_LOG_NONE:
+    /* These modes are only allowed within a non-bitmap page
+    when there is a higher-level redo log record written. */
+    ut_ad(purpose == FIL_TYPE_TABLESPACE || purpose == FIL_TYPE_TEMPORARY);
+    break;
+  case MTR_LOG_NO_REDO:
+    ut_ad(purpose == FIL_TYPE_TEMPORARY || purpose == FIL_TYPE_IMPORT);
+    break;
+  default:
+    /* We may only write redo log for a persistent tablespace. */
+    ut_ad(purpose == FIL_TYPE_TABLESPACE);
+    ut_ad(mtr.is_named_space(id));
+  }
+}
+#endif
+
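+/* Illustration, not part of the upstream source: after
+fsp_apply_init_file_page() the only nonzero bytes in the frame are the
+big-endian page number at FIL_PAGE_OFFSET (byte 4), the 0xff bytes of
+FIL_PAGE_PREV/FIL_PAGE_NEXT (bytes 8..15, i.e. FIL_NULL), and the
+space id at byte 34. A standalone model of that header layout: */
+#if 0
+#include <cstdint>
+#include <cstring>
+
+static void write_be32(uint8_t *p, uint32_t v)
+{
+	p[0] = uint8_t(v >> 24); p[1] = uint8_t(v >> 16);
+	p[2] = uint8_t(v >> 8); p[3] = uint8_t(v);
+}
+
+static void init_page_header(uint8_t *frame, size_t page_size,
+			     uint32_t page_no, uint32_t space_id)
+{
+	memset(frame, 0, page_size);
+	write_be32(frame + 4, page_no);	/* FIL_PAGE_OFFSET */
+	memset(frame + 8, 0xff, 8);	/* FIL_PAGE_PREV, FIL_PAGE_NEXT */
+	write_be32(frame + 34, space_id);	/* ..._SPACE_ID */
+}
+#endif
+
+/** Initialize a tablespace header.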
+@param[in,out] space tablespace +@param[in] size current size in blocks +@param[in,out] mtr mini-transaction +@return error code */ +dberr_t fsp_header_init(fil_space_t *space, uint32_t size, mtr_t *mtr) +{ + const page_id_t page_id(space->id, 0); + const ulint zip_size = space->zip_size(); + + buf_block_t *free_block = buf_LRU_get_free_block(false); + + mtr->x_lock_space(space); + + buf_block_t* block = buf_page_create(space, 0, zip_size, mtr, + free_block); + if (UNIV_UNLIKELY(block != free_block)) { + buf_pool.free_block(free_block); + } + + space->size_in_header = size; + space->free_len = 0; + space->free_limit = 0; + + /* The prior contents of the file page should be ignored */ + + fsp_init_file_page(space, block, mtr); + + mtr->write<2>(*block, block->page.frame + FIL_PAGE_TYPE, + FIL_PAGE_TYPE_FSP_HDR); + + mtr->write<4,mtr_t::MAYBE_NOP>(*block, FSP_HEADER_OFFSET + FSP_SPACE_ID + + block->page.frame, space->id); + ut_ad(0 == mach_read_from_4(FSP_HEADER_OFFSET + FSP_NOT_USED + + block->page.frame)); + /* recv_sys_t::parse() expects to find a WRITE record that + covers all 4 bytes. Therefore, we must specify mtr_t::FORCED + in order to avoid optimizing away any unchanged most + significant bytes of FSP_SIZE. */ + mtr->write<4,mtr_t::FORCED>(*block, FSP_HEADER_OFFSET + FSP_SIZE + + block->page.frame, size); + ut_ad(0 == mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT + + block->page.frame)); + if (auto f = space->flags & ~FSP_FLAGS_MEM_MASK) { + mtr->write<4,mtr_t::FORCED>(*block, + FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + + block->page.frame, f); + } + ut_ad(0 == mach_read_from_4(FSP_HEADER_OFFSET + FSP_FRAG_N_USED + + block->page.frame)); + + flst_init(block, FSP_HEADER_OFFSET + FSP_FREE, mtr); + flst_init(block, FSP_HEADER_OFFSET + FSP_FREE_FRAG, mtr); + flst_init(block, FSP_HEADER_OFFSET + FSP_FULL_FRAG, mtr); + flst_init(block, FSP_HEADER_OFFSET + FSP_SEG_INODES_FULL, mtr); + flst_init(block, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE, mtr); + + mtr->write<8>(*block, FSP_HEADER_OFFSET + FSP_SEG_ID + + block->page.frame, + 1U); + + if (dberr_t err = fsp_fill_free_list(!is_system_tablespace(space->id), + space, block, mtr)) { + return err; + } + + /* Write encryption metadata to page 0 if tablespace is + encrypted or encryption is disabled by table option. */ + if (space->crypt_data && + (space->crypt_data->should_encrypt() || + space->crypt_data->not_encrypted())) { + space->crypt_data->write_page0(block, mtr); + } + + return DB_SUCCESS; +} + +/** Try to extend a single-table tablespace so that a page would fit in the +data file. +@param[in,out] space tablespace +@param[in] page_no page number +@param[in,out] header tablespace header +@param[in,out] mtr mini-transaction +@return true if success */ +static ATTRIBUTE_COLD __attribute__((warn_unused_result)) +bool +fsp_try_extend_data_file_with_pages( + fil_space_t* space, + uint32_t page_no, + buf_block_t* header, + mtr_t* mtr) +{ + bool success; + ulint size; + + ut_ad(!is_system_tablespace(space->id)); + ut_d(space->modify_check(*mtr)); + + size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE + + header->page.frame); + ut_ad(size == space->size_in_header); + + ut_a(page_no >= size); + + success = fil_space_extend(space, page_no + 1); + /* The size may be less than we wanted if we ran out of disk space. */ + /* recv_sys_t::parse() expects to find a WRITE record that + covers all 4 bytes. Therefore, we must specify mtr_t::FORCED + in order to avoid optimizing away any unchanged most + significant bytes of FSP_SIZE. 
*/ + mtr->write<4,mtr_t::FORCED>(*header, FSP_HEADER_OFFSET + FSP_SIZE + + header->page.frame, space->size); + space->size_in_header = space->size; + + return(success); +} + +/** Calculate the number of physical pages in an extent for this file. +@param[in] physical_size page_size of the datafile +@return number of pages in an extent for this file */ +inline uint32_t fsp_get_extent_size_in_pages(ulint physical_size) +{ + return uint32_t((FSP_EXTENT_SIZE << srv_page_size_shift) / physical_size); +} + + +/** Calculate the number of pages to extend a datafile. +We extend single-table tablespaces first one extent at a time, +but 4 at a time for bigger tablespaces. It is not enough to extend always +by one extent, because we need to add at least one extent to FSP_FREE. +A single extent descriptor page will track many extents. And the extent +that uses its extent descriptor page is put onto the FSP_FREE_FRAG list. +Extents that do not use their extent descriptor page are added to FSP_FREE. +The physical page size is used to determine how many extents are tracked +on one extent descriptor page. See xdes_calc_descriptor_page(). +@param[in] physical_size page size in data file +@param[in] size current number of pages in the datafile +@return number of pages to extend the file. */ +static uint32_t fsp_get_pages_to_extend_ibd(unsigned physical_size, + uint32_t size) +{ + uint32_t extent_size = fsp_get_extent_size_in_pages(physical_size); + /* The threshold is set at 32MiB except when the physical page + size is small enough that it must be done sooner. */ + uint32_t threshold = std::min(32 * extent_size, physical_size); + + if (size >= threshold) { + /* Below in fsp_fill_free_list() we assume + that we add at most FSP_FREE_ADD extents at + a time */ + extent_size *= FSP_FREE_ADD; + } + + return extent_size; +} + +/** Try to extend the last data file of a tablespace if it is auto-extending. +@param[in,out] space tablespace +@param[in,out] header tablespace header +@param[in,out] mtr mini-transaction +@return number of pages added +@retval 0 if the tablespace was not extended */ +ATTRIBUTE_COLD __attribute__((nonnull)) +static +ulint +fsp_try_extend_data_file(fil_space_t *space, buf_block_t *header, mtr_t *mtr) +{ + const char* OUT_OF_SPACE_MSG = + "ran out of space. Please add another file or use" + " 'autoextend' for the last file in setting"; + + ut_d(space->modify_check(*mtr)); + + if (space->id == TRX_SYS_SPACE + && !srv_sys_space.can_auto_extend_last_file()) { + + /* We print the error message only once to avoid + spamming the error log. Note that we don't need + to reset the flag to false as dealing with this + error requires server restart. */ + if (!srv_sys_space.get_tablespace_full_status()) { + sql_print_error("InnoDB: The InnoDB system tablespace " + "%s" " innodb_data_file_path.", + OUT_OF_SPACE_MSG); + srv_sys_space.set_tablespace_full_status(true); + } + return(0); + } else if (space->id == SRV_TMP_SPACE_ID + && !srv_tmp_space.can_auto_extend_last_file()) { + + /* We print the error message only once to avoid + spamming the error log. Note that we don't need + to reset the flag to false as dealing with this + error requires server restart. 
*/ + if (!srv_tmp_space.get_tablespace_full_status()) { + sql_print_error("InnoDB: The InnoDB temporary" + " tablespace %s" + " innodb_temp_data_file_path.", + OUT_OF_SPACE_MSG); + srv_tmp_space.set_tablespace_full_status(true); + } + return(0); + } + + uint32_t size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE + + header->page.frame); + ut_ad(size == space->size_in_header); + uint32_t size_increase; + + const unsigned ps = space->physical_size(); + + switch (space->id) { + case TRX_SYS_SPACE: + size_increase = srv_sys_space.get_increment(); + break; + case SRV_TMP_SPACE_ID: + size_increase = srv_tmp_space.get_increment(); + break; + default: + uint32_t extent_pages = fsp_get_extent_size_in_pages(ps); + if (size < extent_pages) { + /* Let us first extend the file to extent_size */ + if (!fsp_try_extend_data_file_with_pages( + space, extent_pages - 1, header, mtr)) { + return(0); + } + + size = extent_pages; + } + + size_increase = fsp_get_pages_to_extend_ibd(ps, size); + } + + if (size_increase == 0) { + return(0); + } + + if (!fil_space_extend(space, size + size_increase)) { + return(0); + } + + /* For the system tablespace, we ignore any fragments of a + full megabyte when storing the size to the space header */ + + space->size_in_header = space->id + ? space->size + : ut_2pow_round(space->size, (1024 * 1024) / ps); + + /* recv_sys_t::parse() expects to find a WRITE record that + covers all 4 bytes. Therefore, we must specify mtr_t::FORCED + in order to avoid optimizing away any unchanged most + significant bytes of FSP_SIZE. */ + mtr->write<4,mtr_t::FORCED>(*header, FSP_HEADER_OFFSET + FSP_SIZE + + header->page.frame, + space->size_in_header); + + return(size_increase); +} + +/** Reset the page type. +Data files created before MySQL 5.1.48 may contain garbage in FIL_PAGE_TYPE. +In MySQL 3.23.53, only undo log pages and index pages were tagged. +Any other pages were written with uninitialized bytes in FIL_PAGE_TYPE. +@param[in] block block with invalid FIL_PAGE_TYPE +@param[in] type expected page type +@param[in,out] mtr mini-transaction */ +ATTRIBUTE_COLD +void fil_block_reset_type(const buf_block_t& block, ulint type, mtr_t* mtr) +{ + ib::info() << "Resetting invalid page " << block.page.id() << " type " + << fil_page_get_type(block.page.frame) << " to " << type << "."; + mtr->write<2>(block, block.page.frame + FIL_PAGE_TYPE, type); +} + +/** Put new extents to the free list if there are free extents above the free +limit. If an extent happens to contain an extent descriptor page, the extent +is put to the FSP_FREE_FRAG list with the page marked as used. 
+@param[in] init_space true if this is a single-table tablespace +and we are only initializing the first extent and the first bitmap pages; +then we will not allocate more extents +@param[in,out] space tablespace +@param[in,out] header tablespace header +@param[in,out] mtr mini-transaction +@return error code */ +static +dberr_t +fsp_fill_free_list( + bool init_space, + fil_space_t* space, + buf_block_t* header, + mtr_t* mtr) +{ + ut_d(space->modify_check(*mtr)); + + /* Check if we can fill free list from above the free list limit */ + uint32_t size= + mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE + header->page.frame); + uint32_t limit= + mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT + header->page.frame); + + ut_ad(size == space->size_in_header); + ut_ad(limit == space->free_limit); + + const auto zip_size= space->zip_size(); + + if (size < limit + FSP_EXTENT_SIZE * FSP_FREE_ADD) + { + bool skip_resize= init_space; + switch (space->id) { + case TRX_SYS_SPACE: + skip_resize= !srv_sys_space.can_auto_extend_last_file(); + break; + case SRV_TMP_SPACE_ID: + skip_resize= !srv_tmp_space.can_auto_extend_last_file(); + break; + } + + if (!skip_resize) + { + fsp_try_extend_data_file(space, header, mtr); + size= space->size_in_header; + } + } + + uint32_t count= 0; + for (uint32_t i= limit, extent_size= FSP_EXTENT_SIZE, + physical_size= space->physical_size(); + (init_space && i < 1) || + (i + extent_size <= size && count < FSP_FREE_ADD); + i += extent_size) + { + const bool init_xdes= !ut_2pow_remainder(i, physical_size); + space->free_limit= i + extent_size; + mtr->write<4>(*header, FSP_HEADER_OFFSET + FSP_FREE_LIMIT + + header->page.frame, i + extent_size); + + if (init_xdes) + { + /* We are going to initialize a new descriptor page + and a new ibuf bitmap page: the prior contents of the + pages should be ignored. 
*/
+
+      if (i)
+      {
+        buf_block_t *f= buf_LRU_get_free_block(false);
+        buf_block_t *block= buf_page_create(space, static_cast<uint32_t>(i),
+                                            zip_size, mtr, f);
+        if (UNIV_UNLIKELY(block != f))
+          buf_pool.free_block(f);
+        fsp_init_file_page(space, block, mtr);
+        mtr->write<2>(*block, FIL_PAGE_TYPE + block->page.frame,
+                      FIL_PAGE_TYPE_XDES);
+      }
+
+      if (space->purpose != FIL_TYPE_TEMPORARY)
+      {
+        buf_block_t *f= buf_LRU_get_free_block(false);
+        buf_block_t *block=
+          buf_page_create(space,
+                          static_cast<uint32_t>(i + FSP_IBUF_BITMAP_OFFSET),
+                          zip_size, mtr, f);
+        if (UNIV_UNLIKELY(block != f))
+          buf_pool.free_block(f);
+        fsp_init_file_page(space, block, mtr);
+        mtr->write<2>(*block, FIL_PAGE_TYPE + block->page.frame,
+                      FIL_PAGE_IBUF_BITMAP);
+      }
+    }
+
+    buf_block_t *xdes= nullptr;
+    xdes_t *descr;
+    {
+      dberr_t err= DB_SUCCESS;
+      descr= xdes_get_descriptor_with_space_hdr(header, space, i, mtr,
+                                                &err, &xdes, init_space);
+      if (!descr)
+        return err;
+    }
+
+    if (xdes != header && !space->full_crc32())
+      fil_block_check_type(*xdes, FIL_PAGE_TYPE_XDES, mtr);
+    xdes_init(*xdes, descr, mtr);
+    const uint16_t xoffset=
+      static_cast<uint16_t>(descr - xdes->page.frame + XDES_FLST_NODE);
+    if (UNIV_UNLIKELY(init_xdes))
+    {
+      /* The first page in the extent is a descriptor page and the
+      second is an ibuf bitmap page: mark them used */
+      xdes_set_free<false>(*xdes, descr, 0, mtr);
+      xdes_set_free<false>(*xdes, descr, FSP_IBUF_BITMAP_OFFSET, mtr);
+      xdes_set_state(*xdes, descr, XDES_FREE_FRAG, mtr);
+      if (dberr_t err= flst_add_last(header, FSP_HEADER_OFFSET + FSP_FREE_FRAG,
+                                     xdes, xoffset, mtr))
+        return err;
+      byte *n_used= FSP_HEADER_OFFSET + FSP_FRAG_N_USED + header->page.frame;
+      mtr->write<4>(*header, n_used, 2U + mach_read_from_4(n_used));
+    }
+    else
+    {
+      if (dberr_t err=
+          flst_add_last(header, FSP_HEADER_OFFSET + FSP_FREE,
+                        xdes, xoffset, mtr))
+        return err;
+      count++;
+    }
+  }
+
+  space->free_len+= count;
+  return DB_SUCCESS;
+}
+
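+/* Illustration, not part of the upstream source: fsp_fill_free_list()
+above adds at most FSP_FREE_ADD (4) extents per round. An extent whose
+first page number is a multiple of the physical page size carries the
+XDES page and (for persistent tablespaces) the change buffer bitmap
+page, so it enters FSP_FREE_FRAG with two pages already in use, while
+every other extent enters FSP_FREE entirely free. For 16KiB pages: */
+#if 0
+#include <cstdint>
+
+static const uint32_t PAGES_PER_XDES_PAGE = 16384;
+
+/* pages already marked used in a freshly added extent that starts
+at first_page_no */
+static unsigned initial_used_pages(uint32_t first_page_no)
+{
+	return first_page_no % PAGES_PER_XDES_PAGE == 0 ? 2 : 0;
+}
+#endif
+
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+/** Allocates a new free extent.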
+@param[in,out]	space	tablespace
+@param[in]	hint	hint of which extent would be desirable: any
+page offset in the extent goes; the hint must not be > FSP_FREE_LIMIT
+@param[out]	xdes	extent descriptor page
+@param[in,out]	mtr	mini-transaction
+@return extent descriptor
+@retval nullptr	if cannot be allocated */
+static xdes_t *fsp_alloc_free_extent(fil_space_t *space, uint32_t hint,
+                                     buf_block_t **xdes, mtr_t *mtr,
+                                     dberr_t *err)
+{
+	fil_addr_t	first;
+	xdes_t*		descr;
+	buf_block_t*	desc_block;
+
+	buf_block_t*	header = fsp_get_header(space, mtr, err);
+	if (!header) {
+corrupted:
+		space->set_corrupted();
+		return nullptr;
+	}
+
+	descr = xdes_get_descriptor_with_space_hdr(
+		header, space, hint, mtr, err, &desc_block);
+	if (!descr) {
+		goto corrupted;
+	}
+
+	if (desc_block != header && !space->full_crc32()) {
+		fil_block_check_type(*desc_block, FIL_PAGE_TYPE_XDES, mtr);
+	}
+
+	if (xdes_get_state(descr) == XDES_FREE) {
+		/* Ok, we can take this extent */
+	} else {
+		/* Take the first extent in the free list */
+		first = flst_get_first(FSP_HEADER_OFFSET + FSP_FREE
+				       + header->page.frame);
+
+		if (first.page == FIL_NULL) {
+			*err = fsp_fill_free_list(false, space, header, mtr);
+			if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
+				goto corrupted;
+			}
+
+			first = flst_get_first(FSP_HEADER_OFFSET + FSP_FREE
+					       + header->page.frame);
+			if (first.page == FIL_NULL) {
+				return nullptr;	/* No free extents left */
+			}
+		}
+
+		descr = xdes_lst_get_descriptor(*space, first, mtr,
+						&desc_block, err);
+		if (!descr) {
+			return descr;
+		}
+	}
+
+	*err = flst_remove(header, FSP_HEADER_OFFSET + FSP_FREE, desc_block,
+			   static_cast<uint16_t>(descr
+						 - desc_block->page.frame
+						 + XDES_FLST_NODE),
+			   mtr);
+	if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
+		return nullptr;
+	}
+
+	space->free_len--;
+	*xdes = desc_block;
+
+	return(descr);
+}
+
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+/** Allocate a single free page.
+@param[in,out]	header	tablespace header
+@param[in,out]	xdes	extent descriptor page
+@param[in,out]	descr	extent descriptor
+@param[in]	bit	slot to allocate in the extent
+@param[in,out]	mtr	mini-transaction
+@return error code */
+static dberr_t
+fsp_alloc_from_free_frag(buf_block_t *header, buf_block_t *xdes, xdes_t *descr,
+                         ulint bit, mtr_t *mtr)
+{
+  if (UNIV_UNLIKELY(xdes_get_state(descr) != XDES_FREE_FRAG ||
+                    !xdes_is_free(descr, bit)))
+    return DB_CORRUPTION;
+  xdes_set_free<false>(*xdes, descr, bit, mtr);
+
+  /* Update the FRAG_N_USED field */
+  byte *n_used_p= FSP_HEADER_OFFSET + FSP_FRAG_N_USED + header->page.frame;
+  uint32_t n_used = mach_read_from_4(n_used_p) + 1;
+
+  if (xdes_is_full(descr))
+  {
+    /* The fragment is full: move it to another list */
+    const uint16_t xoffset=
+      static_cast<uint16_t>(descr - xdes->page.frame + XDES_FLST_NODE);
+    if (dberr_t err= flst_remove(header, FSP_HEADER_OFFSET + FSP_FREE_FRAG,
+                                 xdes, xoffset, mtr))
+      return err;
+    if (dberr_t err= flst_add_last(header, FSP_HEADER_OFFSET + FSP_FULL_FRAG,
+                                   xdes, xoffset, mtr))
+      return err;
+    xdes_set_state(*xdes, descr, XDES_FULL_FRAG, mtr);
+    n_used-= FSP_EXTENT_SIZE;
+  }
+
+  mtr->write<4>(*header, n_used_p, n_used);
+  return DB_SUCCESS;
+}
+
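+/* Illustration, not part of the upstream source: FSP_FRAG_N_USED only
+counts pages used in the extents on the FSP_FREE_FRAG list. When an
+allocation fills an extent, fsp_alloc_from_free_frag() above moves it
+to FSP_FULL_FRAG, so its FSP_EXTENT_SIZE pages leave the counter: */
+#if 0
+#include <cstdint>
+
+static const uint32_t EXTENT_SIZE = 64;	/* FSP_EXTENT_SIZE */
+
+static uint32_t frag_n_used_after_alloc(uint32_t frag_n_used,
+					bool extent_now_full)
+{
+	++frag_n_used;	/* the newly allocated page */
+	if (extent_now_full)
+		frag_n_used -= EXTENT_SIZE;	/* extent left FREE_FRAG */
+	return frag_n_used;
+}
+#endif
+
+/** Gets a buffer block for an allocated page.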
+@param[in,out] space tablespace
+@param[in] offset page number of the allocated page
+@param[in,out] mtr mini-transaction
+@return block, initialized */
+static
+buf_block_t*
+fsp_page_create(fil_space_t *space, page_no_t offset, mtr_t *mtr)
+{
+  buf_block_t *block, *free_block;
+
+  if (UNIV_UNLIKELY(space->is_being_truncated))
+  {
+    const page_id_t page_id{space->id, offset};
+    buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(page_id.fold());
+    mysql_mutex_lock(&buf_pool.mutex);
+    block= reinterpret_cast<buf_block_t*>
+      (buf_pool.page_hash.get(page_id, chain));
+    if (block && block->page.oldest_modification() <= 1)
+      block= nullptr;
+    mysql_mutex_unlock(&buf_pool.mutex);
+
+    if (block)
+    {
+      ut_ad(block->page.buf_fix_count() >= 1);
+      ut_ad(block->page.lock.x_lock_count() == 1);
+      ut_ad(mtr->have_x_latch(*block));
+      free_block= block;
+      goto got_free_block;
+    }
+  }
+
+  free_block= buf_LRU_get_free_block(false);
+got_free_block:
+  block= buf_page_create(space, static_cast<uint32_t>(offset),
+                         space->zip_size(), mtr, free_block);
+  if (UNIV_UNLIKELY(block != free_block))
+    buf_pool.free_block(free_block);
+
+  fsp_init_file_page(space, block, mtr);
+  return block;
+}
+
+/** Allocates a single free page from a space.
+The page is marked as used.
+@param[in,out] space tablespace
+@param[in] hint hint of which page would be desirable
+@param[in,out] mtr mini-transaction
+@param[in,out] init_mtr mini-transaction in which the page should be
+initialized (may be the same as mtr)
+@param[out] err error code
+@return allocated block
+@retval nullptr if no page could be allocated */
+static MY_ATTRIBUTE((warn_unused_result, nonnull))
+buf_block_t *fsp_alloc_free_page(fil_space_t *space, uint32_t hint,
+                                 mtr_t *mtr, mtr_t *init_mtr, dberr_t *err)
+{
+  ut_d(space->modify_check(*mtr));
+  buf_block_t *block= fsp_get_header(space, mtr, err);
+  if (!block)
+    return block;
+
+  buf_block_t *xdes;
+  /* Get the hinted descriptor */
+  xdes_t *descr= xdes_get_descriptor_with_space_hdr(block, space, hint, mtr,
+                                                    err, &xdes);
+  if (descr && xdes_get_state(descr) == XDES_FREE_FRAG)
+    /* Ok, we can take this extent */;
+  else if (*err != DB_SUCCESS)
+  {
+  err_exit:
+    space->set_corrupted();
+    return nullptr;
+  }
+  else
+  {
+    /* Else take the first extent in free_frag list */
+    fil_addr_t first = flst_get_first(FSP_HEADER_OFFSET + FSP_FREE_FRAG +
+                                      block->page.frame);
+    if (first.page == FIL_NULL)
+    {
+      /* There are no partially full fragments: allocate a free extent
+      and add it to the FREE_FRAG list. NOTE that the allocation may
+      have as a side-effect that an extent containing a descriptor
+      page is added to the FREE_FRAG list. But we will allocate our
+      page from the free extent anyway. */
+      descr= fsp_alloc_free_extent(space, hint, &xdes, mtr, err);
+      if (!descr)
+        return nullptr;
+      *err= flst_add_last(block, FSP_HEADER_OFFSET + FSP_FREE_FRAG, xdes,
+                          static_cast<uint16_t>(descr - xdes->page.frame +
+                                                XDES_FLST_NODE), mtr);
+      if (UNIV_UNLIKELY(*err != DB_SUCCESS))
+        return nullptr;
+      xdes_set_state(*xdes, descr, XDES_FREE_FRAG, mtr);
+    }
+    else
+    {
+      descr= xdes_lst_get_descriptor(*space, first, mtr, &xdes, err);
+      if (!descr)
+        return nullptr;
+      /* Reset the hint */
+      hint= 0;
+    }
+  }
+
+  /* Now we have in descr an extent with at least one free page. Look
+  for a free page in the extent.
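+
+  As a sketch of what the bitmap scan amounts to (assuming two state
+  bits per page in the descriptor with the "free" bit first, and a
+  64-page extent; the helper below is illustrative, not from this
+  file):
+
+    int find_free_slot(const unsigned char *bitmap, unsigned hint)
+    {
+      for (unsigned i= 0; i < 64; i++)
+      {
+        unsigned slot= (hint + i) % 64;  // start at the hint, wrap around
+        unsigned bit= 2 * slot;          // 2 state bits per page
+        if (bitmap[bit / 8] & (1u << (bit % 8)))
+          return int(slot);              // this page is free
+      }
+      return -1;                         // the extent is completely used
+    }
+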
*/
+  uint32_t free= xdes_find_free(descr, hint % FSP_EXTENT_SIZE);
+  if (free == FIL_NULL)
+  {
+  corrupted:
+    *err= DB_CORRUPTION;
+    goto err_exit;
+  }
+
+  uint32_t page_no= xdes_get_offset(descr) + free;
+  uint32_t space_size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE +
+                                         block->page.frame);
+  ut_ad(space_size == space->size_in_header ||
+        (space->id == TRX_SYS_SPACE &&
+         srv_startup_is_before_trx_rollback_phase));
+
+  if (space_size <= page_no)
+  {
+    /* It must be that we are extending a single-table tablespace
+    whose size is still < 64 pages */
+    ut_ad(!is_system_tablespace(space->id));
+    if (page_no >= FSP_EXTENT_SIZE)
+    {
+      sql_print_error("InnoDB: Trying to extend %s"
+                      " by single page(s) though the size is " UINT32PF "."
+                      " Page no " UINT32PF ".",
+                      space->chain.start->name, space_size, page_no);
+      goto corrupted;
+    }
+
+    if (!fsp_try_extend_data_file_with_pages(space, page_no, block, mtr))
+    {
+      *err= DB_OUT_OF_FILE_SPACE;
+      return nullptr;
+    }
+  }
+
+  *err= fsp_alloc_from_free_frag(block, xdes, descr, free, mtr);
+  if (UNIV_UNLIKELY(*err != DB_SUCCESS))
+    goto corrupted;
+  return fsp_page_create(space, page_no, init_mtr);
+}
+
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+/** Return an extent to the free list of a space.
+@param[in,out] space tablespace
+@param[in] offset page number in the extent
+@param[in,out] mtr mini-transaction
+@return error code */
+static dberr_t fsp_free_extent(fil_space_t* space, page_no_t offset,
+                               mtr_t* mtr)
+{
+  ut_ad(space->is_owner());
+  dberr_t err;
+  buf_block_t *block= fsp_get_header(space, mtr, &err);
+  if (!block)
+    return err;
+  buf_block_t *xdes;
+  xdes_t *descr= xdes_get_descriptor_with_space_hdr(block, space, offset, mtr,
+                                                    &err, &xdes);
+  if (!descr)
+  {
+    ut_ad(err || space->is_stopping());
+    return err;
+  }
+
+  if (UNIV_UNLIKELY(xdes_get_state(descr) == XDES_FREE))
+  {
+    space->set_corrupted();
+    return DB_CORRUPTION;
+  }
+
+  xdes_init(*xdes, descr, mtr);
+  space->free_len++;
+  return flst_add_last(block, FSP_HEADER_OFFSET + FSP_FREE,
+                       xdes, static_cast<uint16_t>(descr - xdes->page.frame +
+                                                   XDES_FLST_NODE), mtr);
+}
+
+MY_ATTRIBUTE((nonnull))
+/** Frees a single page of a space.
+The page is marked as free and clean.
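+
+The FSP_FRAG_N_USED bookkeeping below is easiest to read with a small
+worked example (assuming 64-page extents): the counter only tracks
+pages used in XDES_FREE_FRAG extents. Freeing a page of a full
+fragment extent moves that extent back to FREE_FRAG with 63 pages
+still in use, so the counter grows by 64 - 1; freeing a page of a
+FREE_FRAG extent simply decrements it by 1.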
+@param[in,out] space tablespace
+@param[in] offset page number
+@param[in,out] mtr mini-transaction
+@return error code */
+static dberr_t fsp_free_page(fil_space_t *space, page_no_t offset, mtr_t *mtr)
+{
+  xdes_t* descr;
+  ulint frag_n_used;
+
+  ut_ad(mtr);
+  ut_d(space->modify_check(*mtr));
+
+  /* fprintf(stderr, "Freeing page %lu in space %lu\n", page, space); */
+
+  dberr_t err;
+  buf_block_t* header = fsp_get_header(space, mtr, &err);
+  if (!header) {
+    ut_ad(space->is_stopping());
+    return err;
+  }
+  buf_block_t* xdes;
+
+  descr = xdes_get_descriptor_with_space_hdr(header, space, offset, mtr,
+                                             &err, &xdes);
+  if (!descr) {
+    ut_ad(err || space->is_stopping());
+    return err;
+  }
+
+  const auto state = xdes_get_state(descr);
+
+  switch (state) {
+  case XDES_FREE_FRAG:
+  case XDES_FULL_FRAG:
+    if (!xdes_is_free(descr, offset % FSP_EXTENT_SIZE)) {
+      break;
+    }
+    /* fall through */
+  default:
+    space->set_corrupted();
+    return DB_CORRUPTION;
+  }
+
+  frag_n_used = mach_read_from_4(FSP_HEADER_OFFSET + FSP_FRAG_N_USED
+                                 + header->page.frame);
+
+  const uint16_t xoffset= static_cast<uint16_t>(descr - xdes->page.frame
+                                                + XDES_FLST_NODE);
+
+  if (state == XDES_FULL_FRAG) {
+    /* The fragment was full: move it to another list */
+    err = flst_remove(header, FSP_HEADER_OFFSET + FSP_FULL_FRAG,
+                      xdes, xoffset, mtr);
+    if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+      return err;
+    }
+    err = flst_add_last(header, FSP_HEADER_OFFSET + FSP_FREE_FRAG,
+                        xdes, xoffset, mtr);
+    if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+      return err;
+    }
+    xdes_set_state(*xdes, descr, XDES_FREE_FRAG, mtr);
+    mtr->write<4>(*header, FSP_HEADER_OFFSET + FSP_FRAG_N_USED
+                  + header->page.frame,
+                  frag_n_used + FSP_EXTENT_SIZE - 1);
+  } else if (UNIV_UNLIKELY(!frag_n_used)) {
+    return DB_CORRUPTION;
+  } else {
+    mtr->write<4>(*header, FSP_HEADER_OFFSET + FSP_FRAG_N_USED
+                  + header->page.frame, frag_n_used - 1);
+  }
+
+  mtr->free(*space, static_cast<uint32_t>(offset));
+  xdes_set_free<true>(*xdes, descr, offset % FSP_EXTENT_SIZE, mtr);
+  ut_ad(err == DB_SUCCESS);
+
+  if (!xdes_get_n_used(descr)) {
+    /* The extent has become free: move it to another list */
+    err = flst_remove(header, FSP_HEADER_OFFSET + FSP_FREE_FRAG,
+                      xdes, xoffset, mtr);
+    if (err == DB_SUCCESS) {
+      err = fsp_free_extent(space, offset, mtr);
+    }
+  }
+
+  return err;
+}
+
+/** @return Number of segment inodes which fit on a single page */
+inline ulint FSP_SEG_INODES_PER_PAGE(ulint physical_size)
+{
+  return (physical_size - FSEG_ARR_OFFSET - 10) / FSEG_INODE_SIZE;
+}
+
+/** Returns the nth inode slot on an inode page.
+@param[in] page segment inode page
+@param[in] i inode index on page
+@return segment inode */
+#define fsp_seg_inode_page_get_nth_inode(page, i) \
+  FSEG_ARR_OFFSET + FSEG_INODE_SIZE * i + page
+
+/** Looks for a used segment inode on a segment inode page.
+@param page segment inode page
+@param physical_size page size
+@return segment inode index
+@retval ULINT_UNDEFINED if not found */
+static
+ulint
+fsp_seg_inode_page_find_used(const page_t *page, ulint physical_size)
+{
+  for (ulint i= 0; i < FSP_SEG_INODES_PER_PAGE(physical_size); i++)
+  {
+    const byte *inode= fsp_seg_inode_page_get_nth_inode(page, i);
+    if (mach_read_from_8(FSEG_ID + inode))
+    {
+      ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4));
+      return i;
+    }
+  }
+
+  return ULINT_UNDEFINED;
+}
+
+/** Looks for an unused segment inode on a segment inode page.
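+
+For scale, a worked instance of FSP_SEG_INODES_PER_PAGE() above
+(assuming the usual FSEG_ARR_OFFSET = 50 and FSEG_INODE_SIZE = 192;
+the authoritative values are in the headers): a 16KiB page holds
+(16384 - 50 - 10) / 192 = 85 inodes, so this scan and the one above
+are bounded by 85 iterations per inode page.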
+@param[in] page segment inode page +@param[in] i search forward starting from this index +@param[in] physical_size page size +@return segment inode index +@retval ULINT_UNDEFINED if not found */ +static +ulint +fsp_seg_inode_page_find_free(const page_t *page, ulint i, ulint physical_size) +{ + for (; i < FSP_SEG_INODES_PER_PAGE(physical_size); i++) + { + const byte *inode= fsp_seg_inode_page_get_nth_inode(page, i); + if (mach_read_from_8(FSEG_ID + inode)) + ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4)); + else + /* This is unused */ + return i; + } + return ULINT_UNDEFINED; +} + +MY_ATTRIBUTE((nonnull, warn_unused_result)) +/** Allocate a file segment inode page. +@param[in,out] space tablespace +@param[in,out] header tablespace header +@param[in,out] mtr mini-transaction +@return error code */ +static dberr_t fsp_alloc_seg_inode_page(fil_space_t *space, + buf_block_t *header, mtr_t *mtr) +{ + ut_ad(header->page.id().space() == space->id); + dberr_t err; + buf_block_t *block= fsp_alloc_free_page(space, 0, mtr, mtr, &err); + + if (!block) + return err; + + ut_ad(block->page.lock.not_recursive()); + + mtr->write<2>(*block, block->page.frame + FIL_PAGE_TYPE, FIL_PAGE_INODE); + +#ifdef UNIV_DEBUG + const byte *inode= FSEG_ID + FSEG_ARR_OFFSET + block->page.frame; + for (ulint i= FSP_SEG_INODES_PER_PAGE(space->physical_size()); i--; + inode += FSEG_INODE_SIZE) + ut_ad(!mach_read_from_8(inode)); +#endif + + return flst_add_last(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE, + block, FSEG_INODE_PAGE_NODE, mtr); +} + +MY_ATTRIBUTE((nonnull, warn_unused_result)) +/** Allocate a file segment inode. +@param[in,out] space tablespace +@param[in,out] header tablespace header +@param[out] iblock segment inode page +@param[in,out] mtr mini-transaction +@param[out] err error code +@return segment inode +@retval nullptr on failure */ +static fseg_inode_t* +fsp_alloc_seg_inode(fil_space_t *space, buf_block_t *header, + buf_block_t **iblock, mtr_t *mtr, dberr_t *err) +{ + /* Allocate a new segment inode page if needed. 
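+
+  The tablespace header keeps inode pages on two lists: pages with at
+  least one unused slot on FSP_SEG_INODES_FREE, and completely used
+  pages on FSP_SEG_INODES_FULL. Allocation always takes the first page
+  of the FREE list, creating a fresh inode page when that list is empty.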
*/ + if (!flst_get_len(FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE + + header->page.frame)) + { + *err= fsp_alloc_seg_inode_page(space, header, mtr); + if (*err != DB_SUCCESS) + return nullptr; + } + + const page_id_t page_id + { + space->id, + mach_read_from_4(FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE + FLST_FIRST + + FIL_ADDR_PAGE + header->page.frame) + }; + + buf_block_t *block= + buf_page_get_gen(page_id, space->zip_size(), RW_SX_LATCH, + nullptr, BUF_GET_POSSIBLY_FREED, mtr, err); + if (!block) + return nullptr; + + if (!space->full_crc32()) + fil_block_check_type(*block, FIL_PAGE_INODE, mtr); + + const ulint physical_size= space->physical_size(); + ulint n= fsp_seg_inode_page_find_free(block->page.frame, 0, physical_size); + + if (UNIV_UNLIKELY(n >= FSP_SEG_INODES_PER_PAGE(physical_size))) + { + *err= DB_CORRUPTION; + return nullptr; + } + fseg_inode_t *inode= fsp_seg_inode_page_get_nth_inode(block->page.frame, n); + + if (ULINT_UNDEFINED == fsp_seg_inode_page_find_free(block->page.frame, n + 1, + physical_size)) + { + /* There are no other unused headers left on the page: move it + to another list */ + *err= flst_remove(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE, + block, FSEG_INODE_PAGE_NODE, mtr); + if (UNIV_UNLIKELY(*err != DB_SUCCESS)) + return nullptr; + *err= flst_add_last(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FULL, + block, FSEG_INODE_PAGE_NODE, mtr); + if (UNIV_UNLIKELY(*err != DB_SUCCESS)) + return nullptr; + } + + ut_ad(!mach_read_from_8(inode + FSEG_ID) || + !memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4)); + *iblock= block; + return inode; +} + +MY_ATTRIBUTE((nonnull)) +/** Frees a file segment inode. +@param[in,out] space tablespace +@param[in,out] inode segment inode +@param[in,out] iblock segment inode page +@param[in,out] mtr mini-transaction */ +static void fsp_free_seg_inode(fil_space_t *space, fseg_inode_t *inode, + buf_block_t *iblock, mtr_t *mtr) +{ + ut_d(space->modify_check(*mtr)); + + dberr_t err; + buf_block_t *header= fsp_get_header(space, mtr, &err); + if (!header) + return; + if (UNIV_UNLIKELY(memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4))) + { + space->set_corrupted(); + return; + } + + const ulint physical_size= space->physical_size(); + + if (ULINT_UNDEFINED == fsp_seg_inode_page_find_free(iblock->page.frame, 0, + physical_size)) + { + /* Move the page to another list */ + if (flst_remove(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FULL, + iblock, FSEG_INODE_PAGE_NODE, mtr) != DB_SUCCESS) + return; + if (flst_add_last(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE, + iblock, FSEG_INODE_PAGE_NODE, mtr) != DB_SUCCESS) + return; + } + + mtr->memset(iblock, page_offset(inode) + FSEG_ID, FSEG_INODE_SIZE, 0); + + if (ULINT_UNDEFINED != fsp_seg_inode_page_find_used(iblock->page.frame, + physical_size)) + return; + + /* There are no other used headers left on the page: free it */ + if (flst_remove(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE, + iblock, FSEG_INODE_PAGE_NODE, mtr) == DB_SUCCESS) + fsp_free_page(space, iblock->page.id().page_no(), mtr); +} + +MY_ATTRIBUTE((nonnull(1,4,5), warn_unused_result)) +/** Returns the file segment inode, page x-latched. 
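+
+A segment header is a 10-byte reference: 4 bytes of space id, 4 bytes
+of inode page number and 2 bytes of byte offset within that page, all
+big-endian. A minimal decoding sketch (illustrative only; the struct
+and helper are not part of this file and assume <cstdint> types):
+
+  struct seg_ref { uint32_t space, page_no; uint16_t offset; };
+  seg_ref decode_fseg_header(const unsigned char *h)
+  {
+    auto be32= [](const unsigned char *p) {
+      return uint32_t(p[0]) << 24 | uint32_t(p[1]) << 16 |
+             uint32_t(p[2]) << 8 | uint32_t(p[3]);
+    };
+    return {be32(h), be32(h + 4),
+            uint16_t(uint16_t(h[8]) << 8 | h[9])};
+  }
+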
+@param[in] header segment header
+@param[in] space space id
+@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0
+@param[in,out] mtr mini-transaction
+@param[out] block inode block
+@param[out] err error code
+@return segment inode, page x-latched
+@retval nullptr if the inode is free or corruption was noticed */
+static
+fseg_inode_t*
+fseg_inode_try_get(
+  const fseg_header_t* header,
+  uint32_t space,
+  ulint zip_size,
+  mtr_t* mtr,
+  buf_block_t** block,
+  dberr_t* err = nullptr)
+{
+  if (UNIV_UNLIKELY(space != mach_read_from_4(header + FSEG_HDR_SPACE)))
+  {
+  corrupted:
+    if (err)
+      *err= DB_CORRUPTION;
+    return nullptr;
+  }
+
+  *block=
+    buf_page_get_gen(page_id_t(space,
+                               mach_read_from_4(header + FSEG_HDR_PAGE_NO)),
+                     zip_size, RW_SX_LATCH, nullptr, BUF_GET_POSSIBLY_FREED,
+                     mtr, err);
+  if (!*block)
+    return nullptr;
+
+  const uint16_t offset= mach_read_from_2(header + FSEG_HDR_OFFSET);
+  if (UNIV_UNLIKELY(offset >= (*block)->physical_size()))
+    goto corrupted;
+
+  fseg_inode_t *inode= (*block)->page.frame + offset;
+  if (UNIV_UNLIKELY(!mach_read_from_8(inode + FSEG_ID) ||
+                    memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4)))
+    goto corrupted;
+
+  return inode;
+}
+
+/** Get the page number from the nth fragment page slot.
+@param inode file segment inode
+@param n slot index
+@return page number
+@retval FIL_NULL if not in use */
+static uint32_t fseg_get_nth_frag_page_no(const fseg_inode_t *inode, ulint n)
+{
+  ut_ad(inode);
+  ut_ad(n < FSEG_FRAG_ARR_N_SLOTS);
+  ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4));
+  return(mach_read_from_4(inode + FSEG_FRAG_ARR
+                          + n * FSEG_FRAG_SLOT_SIZE));
+}
+
+/** Set the page number in the nth fragment page slot.
+@param[in,out] inode segment inode
+@param[in,out] iblock segment inode page
+@param[in] n slot index
+@param[in] page_no page number to set
+@param[in,out] mtr mini-transaction */
+inline void fseg_set_nth_frag_page_no(fseg_inode_t *inode, buf_block_t *iblock,
+                                      ulint n, ulint page_no, mtr_t *mtr)
+{
+  ut_ad(n < FSEG_FRAG_ARR_N_SLOTS);
+  ut_ad(mtr->memo_contains_flagged(iblock, MTR_MEMO_PAGE_SX_FIX));
+  ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4));
+
+  mtr->write<4>(*iblock, inode + FSEG_FRAG_ARR + n * FSEG_FRAG_SLOT_SIZE,
+                page_no);
+}
+
+/**********************************************************************//**
+Finds a fragment page slot which is free.
+@return slot index; ULINT_UNDEFINED if none found */
+static
+ulint
+fseg_find_free_frag_page_slot(
+/*==========================*/
+  fseg_inode_t* inode) /*!< in: segment inode */
+{
+  ulint i;
+  ulint page_no;
+
+  for (i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) {
+    page_no = fseg_get_nth_frag_page_no(inode, i);
+
+    if (page_no == FIL_NULL) {
+
+      return(i);
+    }
+  }
+
+  return(ULINT_UNDEFINED);
+}
+
+/**********************************************************************//**
+Finds a fragment page slot which is used and last in the array.
+@return slot index; ULINT_UNDEFINED if none found */
+static
+ulint
+fseg_find_last_used_frag_page_slot(
+/*===============================*/
+  fseg_inode_t* inode) /*!< in: segment inode */
+{
+  ulint i;
+  ulint page_no;
+
+  for (i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) {
+    page_no = fseg_get_nth_frag_page_no(
+      inode, FSEG_FRAG_ARR_N_SLOTS - i - 1);
+
+    if (page_no != FIL_NULL) {
+
+      return(FSEG_FRAG_ARR_N_SLOTS - i - 1);
+    }
+  }
+
+  return(ULINT_UNDEFINED);
+}
+
+/** Calculate reserved fragment page slots.
+@param inode file segment index +@return number of fragment pages */ +static ulint fseg_get_n_frag_pages(const fseg_inode_t *inode) +{ + ulint i; + ulint count = 0; + + for (i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) { + if (FIL_NULL != fseg_get_nth_frag_page_no(inode, i)) { + count++; + } + } + + return(count); +} + +/** Create a new segment. +@param space tablespace +@param byte_offset byte offset of the created segment header +@param mtr mini-transaction +@param err error code +@param has_done_reservation whether fsp_reserve_free_extents() was invoked +@param block block where segment header is placed, + or NULL to allocate an additional page for that +@return the block where the segment header is placed, x-latched +@retval nullptr if could not create segment */ +buf_block_t* +fseg_create(fil_space_t *space, ulint byte_offset, mtr_t *mtr, dberr_t *err, + bool has_done_reservation, buf_block_t *block) +{ + fseg_inode_t* inode; + ib_id_t seg_id; + uint32_t n_reserved; + bool reserved_extent = false; + + DBUG_ENTER("fseg_create"); + + ut_ad(mtr); + ut_ad(byte_offset >= FIL_PAGE_DATA); + ut_ad(byte_offset + FSEG_HEADER_SIZE + <= srv_page_size - FIL_PAGE_DATA_END); + + mtr->x_lock_space(space); + ut_d(space->modify_check(*mtr)); + + ut_ad(!block || block->page.id().space() == space->id); + + buf_block_t* header = fsp_get_header(space, mtr, err); + if (!header) { + block = nullptr; + goto funct_exit; + } + + buf_block_t* iblock; + +inode_alloc: + inode = fsp_alloc_seg_inode(space, header, &iblock, mtr, err); + + if (!inode) { + block = nullptr; +reserve_extent: + if (!has_done_reservation && !reserved_extent) { + *err = fsp_reserve_free_extents(&n_reserved, space, 2, + FSP_NORMAL, mtr); + if (UNIV_UNLIKELY(*err != DB_SUCCESS)) { + DBUG_RETURN(nullptr); + } + + /* Extents reserved successfully. 
So + try allocating the page or inode */ + reserved_extent = true; + if (inode) { + goto page_alloc; + } + + goto inode_alloc; + } + + if (inode) { + fsp_free_seg_inode(space, inode, iblock, mtr); + } + goto funct_exit; + } + + /* Read the next segment id from space header and increment the + value in space header */ + + seg_id = mach_read_from_8(FSP_HEADER_OFFSET + FSP_SEG_ID + + header->page.frame); + + mtr->write<8>(*header, + FSP_HEADER_OFFSET + FSP_SEG_ID + header->page.frame, + seg_id + 1); + mtr->write<8>(*iblock, inode + FSEG_ID, seg_id); + ut_ad(!mach_read_from_4(inode + FSEG_NOT_FULL_N_USED)); + + flst_init(*iblock, inode + FSEG_FREE, mtr); + flst_init(*iblock, inode + FSEG_NOT_FULL, mtr); + flst_init(*iblock, inode + FSEG_FULL, mtr); + + mtr->memcpy(*iblock, inode + FSEG_MAGIC_N, FSEG_MAGIC_N_BYTES, 4); + compile_time_assert(FSEG_FRAG_SLOT_SIZE == 4); + compile_time_assert(FIL_NULL == 0xffffffff); + mtr->memset(iblock, + uint16_t(inode - iblock->page.frame) + FSEG_FRAG_ARR, + FSEG_FRAG_SLOT_SIZE * FSEG_FRAG_ARR_N_SLOTS, 0xff); + + if (!block) { +page_alloc: + block = fseg_alloc_free_page_low(space, + inode, iblock, 0, FSP_UP, +#ifdef UNIV_DEBUG + has_done_reservation, +#endif /* UNIV_DEBUG */ + mtr, mtr, err); + + if (!block) { + ut_ad(!has_done_reservation); + goto reserve_extent; + } + + ut_d(const auto x = block->page.lock.x_lock_count()); + ut_ad(x || block->page.lock.not_recursive()); + ut_ad(x == 1 || space->is_being_truncated); + ut_ad(x <= 2); + ut_ad(!fil_page_get_type(block->page.frame)); + mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->page.frame, + FIL_PAGE_TYPE_SYS); + } + + mtr->write<2>(*block, byte_offset + FSEG_HDR_OFFSET + + block->page.frame, page_offset(inode)); + + mtr->write<4>(*block, byte_offset + FSEG_HDR_PAGE_NO + + block->page.frame, iblock->page.id().page_no()); + + mtr->write<4,mtr_t::MAYBE_NOP>(*block, byte_offset + FSEG_HDR_SPACE + + block->page.frame, space->id); + +funct_exit: + if (!has_done_reservation && reserved_extent) { + space->release_free_extents(n_reserved); + } + + DBUG_RETURN(block); +} + +/**********************************************************************//** +Calculates the number of pages reserved by a segment, and how many pages are +currently used. +@return number of reserved pages */ +static +ulint +fseg_n_reserved_pages_low( +/*======================*/ + const fseg_inode_t* inode, /*!< in: segment inode */ + ulint* used) /*!< out: number of pages used (not + more than reserved) */ +{ + *used = mach_read_from_4(inode + FSEG_NOT_FULL_N_USED) + + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_FULL) + + fseg_get_n_frag_pages(inode); + + return fseg_get_n_frag_pages(inode) + + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_FREE) + + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_NOT_FULL) + + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_FULL); +} + +/** Calculate the number of pages reserved by a segment, +and how many pages are currently used. 
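+
+A worked instance of the formula above (assuming 64-page extents): a
+segment with 3 FULL extents, 2 NOT_FULL extents holding 70 used pages
+between them, 1 FREE extent and 5 fragment pages reports
+used = 70 + 64 * 3 + 5 = 267 and returns 5 + 64 * (1 + 2 + 3) = 389
+reserved pages.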
+@param[in] block buffer block containing the file segment header
+@param[in] header file segment header
+@param[out] used number of pages that are used (not more than reserved)
+@param[in,out] mtr mini-transaction
+@return number of reserved pages */
+ulint fseg_n_reserved_pages(const buf_block_t &block,
+                            const fseg_header_t *header, ulint *used,
+                            mtr_t *mtr)
+{
+  ut_ad(page_align(header) == block.page.frame);
+  buf_block_t *iblock;
+  if (fseg_inode_t *inode=
+      fseg_inode_try_get(header, block.page.id().space(), block.zip_size(),
+                         mtr, &iblock))
+    return fseg_n_reserved_pages_low(inode, used);
+  return *used= 0;
+}
+
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+/** Tries to fill the free list of a segment with consecutive free extents.
+This happens if the segment is big enough to allow extents in the free list,
+the free list is empty, and the extents can be allocated consecutively from
+the hint onward.
+@param[in] inode segment inode
+@param[in,out] iblock segment inode page
+@param[in] space tablespace
+@param[in] hint hint which extent would be good as the first extent
+@param[in,out] mtr mini-transaction
+@return error code */
+static dberr_t fseg_fill_free_list(const fseg_inode_t *inode,
+                                   buf_block_t *iblock, fil_space_t *space,
+                                   uint32_t hint, mtr_t *mtr)
+{
+  ulint used;
+
+  ut_ad(!((page_offset(inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
+  ut_d(space->modify_check(*mtr));
+
+  if (fseg_n_reserved_pages_low(inode, &used) <
+      FSEG_FREE_LIST_LIMIT * FSP_EXTENT_SIZE)
+    /* The segment is too small to allow extents in free list */
+    return DB_SUCCESS;
+
+  if (UNIV_UNLIKELY(memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4)))
+  {
+    space->set_corrupted();
+    return DB_CORRUPTION;
+  }
+
+  if (flst_get_len(inode + FSEG_FREE) > 0)
+    /* Free list is not empty */
+    return DB_SUCCESS;
+
+  for (ulint i= 0; i < FSEG_FREE_LIST_MAX_LEN; i++, hint += FSP_EXTENT_SIZE)
+  {
+    buf_block_t *xdes;
+    dberr_t err;
+    xdes_t *descr= xdes_get_descriptor(space, hint, mtr, &err, &xdes);
+    if (!descr || XDES_FREE != xdes_get_state(descr))
+      /* We cannot allocate the desired extent: stop */
+      return err;
+
+    descr= fsp_alloc_free_extent(space, hint, &xdes, mtr, &err);
+    if (UNIV_UNLIKELY(!descr))
+      return err;
+
+    if (dberr_t err=
+        flst_add_last(iblock,
+                      static_cast<uint16_t>(inode - iblock->page.frame +
+                                            FSEG_FREE), xdes,
+                      static_cast<uint16_t>(descr - xdes->page.frame +
+                                            XDES_FLST_NODE), mtr))
+      return err;
+    xdes_set_state(*xdes, descr, XDES_FSEG, mtr);
+    mtr->memcpy(*xdes, descr + XDES_ID, inode + FSEG_ID, 8);
+  }
+
+  return DB_SUCCESS;
+}
+
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+/** Allocates a free extent for the segment: looks first in the free list of
+the segment, then tries to allocate from the space free list.
+NOTE that the extent returned still resides in the segment free list, it is
+not yet taken off it!
+@param[in] inode segment inode
+@param[in,out] iblock segment inode page
+@param[out] xdes extent descriptor page
+@param[in,out] space tablespace
+@param[in,out] mtr mini-transaction
+@param[out] err error code
+@retval nullptr if no extent could be allocated */
+static
+xdes_t*
+fseg_alloc_free_extent(
+  const fseg_inode_t* inode,
+  buf_block_t* iblock,
+  buf_block_t** xdes,
+  fil_space_t* space,
+  mtr_t* mtr,
+  dberr_t* err)
+{
+  ut_ad(!((page_offset(inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
+  ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4));
+  ut_d(space->modify_check(*mtr));
+
+  if (flst_get_len(inode + FSEG_FREE))
+  {
+    /* Segment free list is not empty, allocate from it */
+    return xdes_lst_get_descriptor(*space, flst_get_first(inode + FSEG_FREE),
+                                   mtr, xdes, err);
+  }
+
+  xdes_t* descr= fsp_alloc_free_extent(space, 0, xdes, mtr, err);
+  if (UNIV_UNLIKELY(!descr))
+    return descr;
+  xdes_set_state(**xdes, descr, XDES_FSEG, mtr);
+  mtr->memcpy(**xdes, descr + XDES_ID, inode + FSEG_ID, 8);
+  *err= flst_add_last(iblock,
+                      static_cast<uint16_t>(inode - iblock->page.frame +
+                                            FSEG_FREE), *xdes,
+                      static_cast<uint16_t>(descr - (*xdes)->page.frame +
+                                            XDES_FLST_NODE), mtr);
+  if (UNIV_UNLIKELY(*err != DB_SUCCESS))
+    return nullptr;
+  /* Try to fill the segment free list */
+  *err= fseg_fill_free_list(inode, iblock, space,
+                            xdes_get_offset(descr) + FSP_EXTENT_SIZE, mtr);
+  if (UNIV_UNLIKELY(*err != DB_SUCCESS))
+    return nullptr;
+
+  return descr;
+}
+
+/** Allocates a single free page from a segment.
+This function implements the intelligent allocation strategy which tries to
+minimize file space fragmentation.
+@param[in,out] space tablespace
+@param[in,out] seg_inode segment inode
+@param[in,out] iblock segment inode page
+@param[in] hint hint of which page would be desirable
+@param[in] direction if the new page is needed because of
+an index page split, and records are inserted there in order, into which
+direction they go alphabetically: FSP_DOWN, FSP_UP, FSP_NO_DIR
+@param[in,out] mtr mini-transaction
+@param[in,out] init_mtr mtr or another mini-transaction in
+which the page should be initialized.
+@param[out] err error code +@return the allocated page +@retval nullptr if no page could be allocated */ +static +buf_block_t* +fseg_alloc_free_page_low( + fil_space_t* space, + fseg_inode_t* seg_inode, + buf_block_t* iblock, + uint32_t hint, + byte direction, +#ifdef UNIV_DEBUG + bool has_done_reservation, + /*!< whether the space has already been reserved */ +#endif /* UNIV_DEBUG */ + mtr_t* mtr, + mtr_t* init_mtr, + dberr_t* err) +{ + ib_id_t seg_id; + ulint used; + ulint reserved; + xdes_t* descr; /*!< extent of the hinted page */ + uint32_t ret_page; /*!< the allocated page offset, FIL_NULL + if could not be allocated */ + xdes_t* ret_descr; /*!< the extent of the allocated page */ + buf_block_t* xdes; + ulint n; + + ut_ad((direction >= FSP_UP) && (direction <= FSP_NO_DIR)); + ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + seg_inode, 4)); + ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE)); + seg_id = mach_read_from_8(seg_inode + FSEG_ID); + + ut_ad(seg_id); + ut_d(space->modify_check(*mtr)); + ut_ad(fil_page_get_type(page_align(seg_inode)) == FIL_PAGE_INODE); + + reserved = fseg_n_reserved_pages_low(seg_inode, &used); + + buf_block_t* header = fsp_get_header(space, mtr, err); + if (!header) { + return header; + } + + descr = xdes_get_descriptor_with_space_hdr(header, space, hint, mtr, + err, &xdes); + if (!descr) { + if (*err != DB_SUCCESS) { + return nullptr; + } + /* Hint outside space or too high above free limit: reset + hint */ + /* The file space header page is always allocated. */ + hint = 0; + descr = xdes_get_descriptor(space, hint, mtr, err, &xdes); + if (!descr) { + return nullptr; + } + } + + /* In the big if-else below we look for ret_page and ret_descr */ + /*-------------------------------------------------------------*/ + if ((xdes_get_state(descr) == XDES_FSEG) + && mach_read_from_8(descr + XDES_ID) == seg_id + && xdes_is_free(descr, hint % FSP_EXTENT_SIZE)) { +take_hinted_page: + /* 1. We can take the hinted page + =================================*/ + ret_descr = descr; + ret_page = hint; + /* Skip the check for extending the tablespace. If the + page hint were not within the size of the tablespace, + we would have got (descr == NULL) above and reset the hint. */ + goto got_hinted_page; + /*-----------------------------------------------------------*/ + } else if (xdes_get_state(descr) == XDES_FREE + && reserved - used < reserved / FSEG_FILLFACTOR + && used >= FSEG_FRAG_LIMIT) { + + /* 2. 
We allocate the free extent from space and can take
+	=========================================================
+	the hinted page
+	===============*/
+		ret_descr = fsp_alloc_free_extent(space, hint, &xdes,
+						  mtr, err);
+
+		if (UNIV_UNLIKELY(ret_descr != descr)) {
+			if (*err == DB_SUCCESS) {
+				*err = DB_CORRUPTION;
+			}
+			return nullptr;
+		}
+
+		xdes_set_state(*xdes, ret_descr, XDES_FSEG, mtr);
+		mtr->write<8,mtr_t::MAYBE_NOP>(*xdes, ret_descr + XDES_ID,
+					       seg_id);
+		*err = flst_add_last(
+			iblock,
+			static_cast<uint16_t>(seg_inode - iblock->page.frame
+					      + FSEG_FREE), xdes,
+			static_cast<uint16_t>(ret_descr
+					      - xdes->page.frame
+					      + XDES_FLST_NODE), mtr);
+		if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
+			return nullptr;
+		}
+
+		/* Try to fill the segment free list */
+		*err = fseg_fill_free_list(seg_inode, iblock, space,
+					   hint + FSP_EXTENT_SIZE, mtr);
+		if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
+			return nullptr;
+		}
+		goto take_hinted_page;
+		/*-----------------------------------------------------------*/
+	} else if ((direction != FSP_NO_DIR)
+		   && ((reserved - used) < reserved / FSEG_FILLFACTOR)
+		   && (used >= FSEG_FRAG_LIMIT)
+		   && (ret_descr = fseg_alloc_free_extent(seg_inode, iblock,
+							  &xdes, space,
+							  mtr, err))) {
+		/* 3. We take any free extent (which was already assigned above
+		===============================================================
+		in the if-condition to ret_descr) and take the lowest or
+		========================================================
+		highest page in it, depending on the direction
+		==============================================*/
+		ret_page = xdes_get_offset(ret_descr);
+
+		if (direction == FSP_DOWN) {
+			ret_page += FSP_EXTENT_SIZE - 1;
+		}
+		ut_ad(!has_done_reservation || ret_page != FIL_NULL);
+		/*-----------------------------------------------------------*/
+	} else if (UNIV_UNLIKELY(*err != DB_SUCCESS)) {
+		return nullptr;
+	} else if ((xdes_get_state(descr) == XDES_FSEG)
+		   && mach_read_from_8(descr + XDES_ID) == seg_id
+		   && (!xdes_is_full(descr))) {
+
+		/* 4. We can take the page from the same extent as the
+		======================================================
+		hinted page (and the extent already belongs to the
+		==================================================
+		segment)
+		========*/
+		ret_descr = descr;
+		ret_page = xdes_find_free(ret_descr, hint % FSP_EXTENT_SIZE);
+		if (ret_page == FIL_NULL) {
+			ut_ad(!has_done_reservation);
+		} else {
+			ret_page += xdes_get_offset(ret_descr);
+		}
+		/*-----------------------------------------------------------*/
+	} else if (reserved - used > 0) {
+		/* 5. We take any unused page from the segment
+		==============================================*/
+		fil_addr_t	first;
+
+		if (flst_get_len(seg_inode + FSEG_NOT_FULL) > 0) {
+			first = flst_get_first(seg_inode + FSEG_NOT_FULL);
+		} else if (flst_get_len(seg_inode + FSEG_FREE) > 0) {
+			first = flst_get_first(seg_inode + FSEG_FREE);
+		} else {
+			ut_ad(!has_done_reservation);
+			return(NULL);
+		}
+
+		ret_descr = xdes_lst_get_descriptor(*space, first, mtr, &xdes);
+		if (!ret_descr) {
+			return nullptr;
+		}
+
+		ret_page = xdes_find_free(ret_descr);
+		if (ret_page == FIL_NULL) {
+			ut_ad(!has_done_reservation);
+		} else {
+			ret_page += xdes_get_offset(ret_descr);
+		}
+		/*-----------------------------------------------------------*/
+	} else if (used < FSEG_FRAG_LIMIT) {
+		/* 6.
We allocate an individual page from the space + ===================================================*/ + buf_block_t* block = fsp_alloc_free_page( + space, hint, mtr, init_mtr, err); + + ut_ad(block || !has_done_reservation || *err); + + if (block) { + /* Put the page in the fragment page array of the + segment */ + n = fseg_find_free_frag_page_slot(seg_inode); + if (UNIV_UNLIKELY(n == ULINT_UNDEFINED)) { + *err = DB_CORRUPTION; + return nullptr; + } + + fseg_set_nth_frag_page_no( + seg_inode, iblock, n, + block->page.id().page_no(), mtr); + } + + /* fsp_alloc_free_page() invoked fsp_init_file_page() + already. */ + return(block); + /*-----------------------------------------------------------*/ + } else { + /* 7. We allocate a new extent and take its first page + ======================================================*/ + ret_descr = fseg_alloc_free_extent(seg_inode, iblock, &xdes, + space, mtr, err); + + if (!ret_descr) { + ut_ad(!has_done_reservation || *err); + return nullptr; + } else { + ret_page = xdes_get_offset(ret_descr); + } + } + + if (ret_page == FIL_NULL) { + /* Page could not be allocated */ + + ut_ad(!has_done_reservation); + return(NULL); + } + + if (space->size <= ret_page && !is_predefined_tablespace(space->id)) { + /* It must be that we are extending a single-table + tablespace whose size is still < 64 pages */ + + if (ret_page >= FSP_EXTENT_SIZE) { + sql_print_error("InnoDB: Trying to extend '%s'" + " by single page(s) though the" + " space size " UINT32PF "." + " Page no " UINT32PF ".", + space->chain.start->name, space->size, + ret_page); + ut_ad(!has_done_reservation); + return(NULL); + } + + if (!fsp_try_extend_data_file_with_pages( + space, ret_page, header, mtr)) { + /* No disk space left */ + ut_ad(!has_done_reservation); + return(NULL); + } + } + +got_hinted_page: + /* ret_descr == NULL if the block was allocated from free_frag + (XDES_FREE_FRAG) */ + if (ret_descr != NULL) { + /* At this point we know the extent and the page offset. + The extent is still in the appropriate list (FSEG_NOT_FULL + or FSEG_FREE), and the page is not yet marked as used. */ + + ut_d(buf_block_t* xxdes); + ut_ad(xdes_get_descriptor(space, ret_page, mtr, err, &xxdes) + == ret_descr); + ut_ad(xdes == xxdes); + ut_ad(xdes_is_free(ret_descr, ret_page % FSP_EXTENT_SIZE)); + + *err = fseg_mark_page_used(seg_inode, iblock, ret_page, + ret_descr, xdes, mtr); + if (UNIV_UNLIKELY(*err != DB_SUCCESS)) { + return nullptr; + } + } + + return fsp_page_create(space, ret_page, init_mtr); +} + +/**********************************************************************//** +Allocates a single free page from a segment. This function implements +the intelligent allocation strategy which tries to minimize file space +fragmentation. 
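+
+When the caller has not reserved space beforehand, the allocation is
+wrapped in the reserve/release pattern sketched here (a simplified
+outline of the body below; the debug-only reservation argument and
+error handling are omitted):
+
+  uint32_t n_reserved;
+  if (fsp_reserve_free_extents(&n_reserved, space, 2, FSP_NORMAL, mtr)
+      == DB_SUCCESS)
+  {
+    block= fseg_alloc_free_page_low(space, inode, iblock, hint, direction,
+                                    mtr, init_mtr, err);
+    space->release_free_extents(n_reserved);
+  }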
+@retval NULL if no page could be allocated */ +buf_block_t* +fseg_alloc_free_page_general( +/*=========================*/ + fseg_header_t* seg_header,/*!< in/out: segment header */ + uint32_t hint, /*!< in: hint of which page would be + desirable */ + byte direction,/*!< in: if the new page is needed because + of an index page split, and records are + inserted there in order, into which + direction they go alphabetically: FSP_DOWN, + FSP_UP, FSP_NO_DIR */ + bool has_done_reservation, /*!< in: true if the caller has + already done the reservation for the page + with fsp_reserve_free_extents, then there + is no need to do the check for this individual + page */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + mtr_t* init_mtr,/*!< in/out: mtr or another mini-transaction + in which the page should be initialized. */ + dberr_t* err) /*!< out: error code */ +{ + fseg_inode_t* inode; + fil_space_t* space; + buf_block_t* iblock; + buf_block_t* block; + uint32_t n_reserved; + + const uint32_t space_id = page_get_space_id(page_align(seg_header)); + space = mtr->x_lock_space(space_id); + inode = fseg_inode_try_get(seg_header, space_id, space->zip_size(), + mtr, &iblock, err); + if (!inode) { + return nullptr; + } + if (!space->full_crc32()) { + fil_block_check_type(*iblock, FIL_PAGE_INODE, mtr); + } + + if (!has_done_reservation) { + *err = fsp_reserve_free_extents(&n_reserved, space, 2, + FSP_NORMAL, mtr); + if (*err != DB_SUCCESS) { + return nullptr; + } + } + + block = fseg_alloc_free_page_low(space, + inode, iblock, hint, direction, +#ifdef UNIV_DEBUG + has_done_reservation, +#endif /* UNIV_DEBUG */ + mtr, init_mtr, err); + + /* The allocation cannot fail if we have already reserved a + space for the page. */ + ut_ad(block || !has_done_reservation || *err); + + if (!has_done_reservation) { + space->release_free_extents(n_reserved); + } + + return(block); +} + +MY_ATTRIBUTE((nonnull, warn_unused_result)) +/** Check that we have at least n_pages frag pages free in the first extent +of a single-table tablespace, and they are also physically initialized to +the data file. That is we have already extended the data file so that those +pages are inside the data file. If not, this function extends the tablespace +with pages. +@param[in,out] space tablespace +@param[in,out] header tablespace header, x-latched +@param[in] size tablespace size in pages, less than FSP_EXTENT_SIZE +@param[in,out] mtr mini-transaction +@param[in] n_pages number of pages to reserve +@return error code */ +static +dberr_t +fsp_reserve_free_pages( + fil_space_t* space, + buf_block_t* header, + ulint size, + mtr_t* mtr, + uint32_t n_pages) +{ + ut_ad(space != fil_system.sys_space && space != fil_system.temp_space); + ut_ad(size < FSP_EXTENT_SIZE); + + dberr_t err= DB_OUT_OF_FILE_SPACE; + const xdes_t *descr= + xdes_get_descriptor_with_space_hdr(header, space, 0, mtr, &err); + if (!descr) + return err; + const uint32_t n_used= xdes_get_n_used(descr); + if (size >= n_used + n_pages) + return DB_SUCCESS; + if (n_used > size) + return DB_CORRUPTION; + return fsp_try_extend_data_file_with_pages(space, n_used + n_pages - 1, + header, mtr) + ? DB_SUCCESS + : DB_OUT_OF_FILE_SPACE; +} + +/** Reserves free pages from a tablespace. All mini-transactions which may +use several pages from the tablespace should call this function beforehand +and reserve enough free extents so that they certainly will be able +to do their operation, like a B-tree page split, fully. 
Reservations +must be released with function fil_space_t::release_free_extents()! + +The alloc_type below has the following meaning: FSP_NORMAL means an +operation which will probably result in more space usage, like an +insert in a B-tree; FSP_UNDO means allocation to undo logs: if we are +deleting rows, then this allocation will in the long run result in +less space usage (after a purge); FSP_CLEANING means allocation done +in a physical record delete (like in a purge) or other cleaning operation +which will result in less space usage in the long run. We prefer the latter +two types of allocation: when space is scarce, FSP_NORMAL allocations +will not succeed, but the latter two allocations will succeed, if possible. +The purpose is to avoid dead end where the database is full but the +user cannot free any space because these freeing operations temporarily +reserve some space. + +Single-table tablespaces whose size is < FSP_EXTENT_SIZE pages are a special +case. In this function we would liberally reserve several extents for +every page split or merge in a B-tree. But we do not want to waste disk space +if the table only occupies < FSP_EXTENT_SIZE pages. That is why we apply +different rules in that special case, just ensuring that there are n_pages +free pages available. + +@param[out] n_reserved number of extents actually reserved; if we + return true and the tablespace size is < + FSP_EXTENT_SIZE pages, then this can be 0, + otherwise it is n_ext +@param[in,out] space tablespace +@param[in] n_ext number of extents to reserve +@param[in] alloc_type page reservation type (FSP_BLOB, etc) +@param[in,out] mtr the mini transaction +@param[in] n_pages for small tablespaces (tablespace size is + less than FSP_EXTENT_SIZE), number of free + pages to reserve. 
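+
+A worked instance of the FSP_NORMAL rule implemented below
+(reserve = 2 + (size / extent_size) * 2 / 200, i.e. two extents plus
+roughly 1%): with the default 16KiB pages, a 10GiB tablespace has
+655360 pages = 10240 extents, so reserve = 2 + 102 = 104 extents, and
+a request for n_ext extents falls back to extending the data file once
+n_free <= 104 + n_ext.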
+@return error code +@retval DB_SUCCESS if we were able to make the reservation */ +dberr_t +fsp_reserve_free_extents( + uint32_t* n_reserved, + fil_space_t* space, + uint32_t n_ext, + fsp_reserve_t alloc_type, + mtr_t* mtr, + uint32_t n_pages) +{ + ulint reserve; + + ut_ad(mtr); + *n_reserved = n_ext; + + const uint32_t extent_size = FSP_EXTENT_SIZE; + + mtr->x_lock_space(space); + const unsigned physical_size = space->physical_size(); + + dberr_t err; + buf_block_t* header = fsp_get_header(space, mtr, &err); + if (!header) { + return err; + } +try_again: + uint32_t size = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE + + header->page.frame); + ut_ad(size == space->size_in_header); + + if (size < extent_size && n_pages < extent_size / 2) { + /* Use different rules for small single-table tablespaces */ + *n_reserved = 0; + return fsp_reserve_free_pages(space, header, size, + mtr, n_pages); + } + + uint32_t n_free_list_ext = flst_get_len(FSP_HEADER_OFFSET + FSP_FREE + + header->page.frame); + ut_ad(space->free_len == n_free_list_ext); + + uint32_t free_limit = mach_read_from_4(FSP_HEADER_OFFSET + + FSP_FREE_LIMIT + + header->page.frame); + ut_ad(space->free_limit == free_limit); + + /* Below we play safe when counting free extents above the free limit: + some of them will contain extent descriptor pages, and therefore + will not be free extents */ + + uint32_t n_free_up; + + if (size >= free_limit) { + n_free_up = (size - free_limit) / extent_size; + if (n_free_up) { + n_free_up--; + n_free_up -= n_free_up / (physical_size / extent_size); + } + } else { + ut_ad(alloc_type == FSP_BLOB); + n_free_up = 0; + } + + uint32_t n_free = n_free_list_ext + n_free_up; + + switch (alloc_type) { + case FSP_NORMAL: + /* We reserve 1 extent + 0.5 % of the space size to undo logs + and 1 extent + 0.5 % to cleaning operations; NOTE: this source + code is duplicated in the function below! */ + + reserve = 2 + ((size / extent_size) * 2) / 200; + + if (n_free <= reserve + n_ext) { + + goto try_to_extend; + } + break; + case FSP_UNDO: + /* We reserve 0.5 % of the space size to cleaning operations */ + + reserve = 1 + ((size / extent_size) * 1) / 200; + + if (n_free <= reserve + n_ext) { + + goto try_to_extend; + } + break; + case FSP_CLEANING: + case FSP_BLOB: + reserve = 0; + break; + default: + ut_error; + } + + if (space->reserve_free_extents(n_free, n_ext)) { + return DB_SUCCESS; + } +try_to_extend: + if (fsp_try_extend_data_file(space, header, mtr)) { + goto try_again; + } + + return DB_OUT_OF_FILE_SPACE; +} + +MY_ATTRIBUTE((nonnull, warn_unused_result)) +/** Frees a single page of a segment. 
+@param[in] seg_inode segment inode
+@param[in,out] iblock segment inode page
+@param[in,out] space tablespace
+@param[in] offset page number
+@param[in,out] mtr mini-transaction
+@param[in] ahi Drop adaptive hash index
+@return error code */
+static
+dberr_t
+fseg_free_page_low(
+  fseg_inode_t* seg_inode,
+  buf_block_t* iblock,
+  fil_space_t* space,
+  page_no_t offset,
+  mtr_t* mtr
+#ifdef BTR_CUR_HASH_ADAPT
+  ,bool ahi=false
+#endif /* BTR_CUR_HASH_ADAPT */
+  )
+{
+  ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + seg_inode, 4));
+  ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
+  ut_ad(iblock->page.frame == page_align(seg_inode));
+  ut_d(space->modify_check(*mtr));
+
+#ifdef BTR_CUR_HASH_ADAPT
+  if (ahi) {
+    btr_search_drop_page_hash_when_freed(
+      page_id_t(space->id, offset));
+  }
+#endif /* BTR_CUR_HASH_ADAPT */
+
+  const uint32_t extent_size = FSP_EXTENT_SIZE;
+  ut_ad(ut_is_2pow(extent_size));
+  buf_block_t* xdes;
+  dberr_t err;
+  xdes_t* descr = xdes_get_descriptor(space, offset, mtr, &err, &xdes);
+
+  if (!descr) {
+    return err;
+  }
+  if (UNIV_UNLIKELY(xdes_is_free(descr, offset & (extent_size - 1)))) {
+corrupted:
+    space->set_corrupted();
+    return DB_CORRUPTION;
+  }
+
+  if (xdes_get_state(descr) != XDES_FSEG) {
+    /* The page is in the fragment pages of the segment */
+    for (ulint i = 0;; i++) {
+      if (fseg_get_nth_frag_page_no(seg_inode, i)
+          != offset) {
+        continue;
+      }
+
+      compile_time_assert(FIL_NULL == 0xffffffff);
+      mtr->memset(iblock, uint16_t(seg_inode
+                                   - iblock->page.frame)
+                  + FSEG_FRAG_ARR
+                  + i * FSEG_FRAG_SLOT_SIZE, 4, 0xff);
+      break;
+    }
+
+    return fsp_free_page(space, offset, mtr);
+  }
+
+  /* If we get here, the page is in some extent of the segment */
+
+  if (UNIV_UNLIKELY(memcmp(descr + XDES_ID, seg_inode + FSEG_ID, 8))) {
+    goto corrupted;
+  }
+
+  byte* p_not_full = seg_inode + FSEG_NOT_FULL_N_USED;
+  uint32_t not_full_n_used = mach_read_from_4(p_not_full);
+  const uint16_t xoffset= uint16_t(descr - xdes->page.frame
+                                   + XDES_FLST_NODE);
+  const uint16_t ioffset= uint16_t(seg_inode - iblock->page.frame);
+
+  if (xdes_is_full(descr)) {
+    /* The fragment is full: move it to another list */
+    err = flst_remove(iblock,
+                      static_cast<uint16_t>(FSEG_FULL + ioffset),
+                      xdes, xoffset, mtr);
+    if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+      return err;
+    }
+    err = flst_add_last(iblock,
+                        static_cast<uint16_t>(FSEG_NOT_FULL + ioffset),
+                        xdes, xoffset, mtr);
+    if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+      return err;
+    }
+    not_full_n_used += extent_size - 1;
+  } else {
+    if (!not_full_n_used) {
+      goto corrupted;
+    }
+    not_full_n_used--;
+  }
+
+  mtr->write<4>(*iblock, p_not_full, not_full_n_used);
+  xdes_set_free<true>(*xdes, descr, offset & (extent_size - 1), mtr);
+
+  if (!xdes_get_n_used(descr)) {
+    err = flst_remove(iblock,
+                      static_cast<uint16_t>(FSEG_NOT_FULL + ioffset),
+                      xdes, xoffset, mtr);
+    if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+      return err;
+    }
+    err = fsp_free_extent(space, offset, mtr);
+    if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+      return err;
+    }
+  }
+
+  mtr->free(*space, static_cast<uint32_t>(offset));
+  return DB_SUCCESS;
+}
+
+/** Free a page in a file segment.
+@param[in,out] seg_header file segment header +@param[in,out] space tablespace +@param[in] offset page number +@param[in,out] mtr mini-transaction +@param[in] have_latch whether space->x_lock() was already called +@return error code */ +dberr_t fseg_free_page(fseg_header_t *seg_header, fil_space_t *space, + uint32_t offset, mtr_t *mtr, bool have_latch) +{ + buf_block_t *iblock; + if (have_latch) + ut_ad(space->is_owner()); + else + mtr->x_lock_space(space); + + DBUG_PRINT("fseg_free_page", + ("space_id: " ULINTPF ", page_no: %u", space->id, offset)); + + dberr_t err; + if (fseg_inode_t *seg_inode= fseg_inode_try_get(seg_header, + space->id, space->zip_size(), + mtr, &iblock, &err)) + { + if (!space->full_crc32()) + fil_block_check_type(*iblock, FIL_PAGE_INODE, mtr); + return fseg_free_page_low(seg_inode, iblock, space, offset, mtr); + } + + return err; +} + +/** Determine whether a page is allocated. +@param space tablespace +@param page page number +@return error code +@retval DB_SUCCESS if the page is marked as free +@retval DB_SUCCESS_LOCKED_REC if the page is marked as allocated */ +dberr_t fseg_page_is_allocated(fil_space_t *space, unsigned page) +{ + mtr_t mtr; + uint32_t dpage= xdes_calc_descriptor_page(space->zip_size(), page); + const unsigned zip_size= space->zip_size(); + dberr_t err= DB_SUCCESS; + + mtr.start(); + if (!space->is_owner()) + mtr.x_lock_space(space); + + if (page >= space->free_limit || page >= space->size_in_header); + else if (const buf_block_t *b= + buf_page_get_gen(page_id_t(space->id, dpage), space->zip_size(), + RW_S_LATCH, nullptr, BUF_GET_POSSIBLY_FREED, + &mtr, &err)) + { + if (!dpage && + (space->free_limit != + mach_read_from_4(FSP_FREE_LIMIT + FSP_HEADER_OFFSET + + b->page.frame) || + space->size_in_header != + mach_read_from_4(FSP_SIZE + FSP_HEADER_OFFSET + b->page.frame))) + err= DB_CORRUPTION; + else + err= xdes_is_free(b->page.frame + XDES_ARR_OFFSET + XDES_SIZE + * xdes_calc_descriptor_index(zip_size, page), + page & (FSP_EXTENT_SIZE - 1)) + ? DB_SUCCESS + : DB_SUCCESS_LOCKED_REC; + } + + mtr.commit(); + return err; +} + +MY_ATTRIBUTE((nonnull, warn_unused_result)) +/** Free an extent of a segment to the space free list. 
+@param[in,out] seg_inode segment inode
+@param[in,out] iblock segment inode page
+@param[in,out] space tablespace
+@param[in] page page number in the extent
+@param[in,out] mtr mini-transaction
+@return error code */
+static
+dberr_t
+fseg_free_extent(
+  fseg_inode_t* seg_inode,
+  buf_block_t* iblock,
+  fil_space_t* space,
+  uint32_t page,
+  mtr_t* mtr
+#ifdef BTR_CUR_HASH_ADAPT
+  ,bool ahi=false
+#endif /* BTR_CUR_HASH_ADAPT */
+  )
+{
+  buf_block_t* xdes;
+  dberr_t err;
+  xdes_t* descr = xdes_get_descriptor(space, page, mtr, &err, &xdes);
+
+  if (!descr) {
+    return err;
+  }
+
+  if (UNIV_UNLIKELY(xdes_get_state(descr) != XDES_FSEG
+                    || memcmp(descr + XDES_ID, seg_inode + FSEG_ID, 8)
+                    || memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N
+                              + seg_inode, 4))) {
+    return DB_CORRUPTION;
+  }
+  ut_d(space->modify_check(*mtr));
+  const uint32_t first_page_in_extent = page - (page % FSP_EXTENT_SIZE);
+
+  const uint16_t xoffset= uint16_t(descr - xdes->page.frame
+                                   + XDES_FLST_NODE);
+  const uint16_t ioffset= uint16_t(seg_inode - iblock->page.frame);
+
+#ifdef BTR_CUR_HASH_ADAPT
+  if (ahi) {
+    for (uint32_t i = 0; i < FSP_EXTENT_SIZE; i++) {
+      if (!xdes_is_free(descr, i)) {
+        /* Drop search system page hash index
+        if the page is found in the pool and
+        is hashed */
+        btr_search_drop_page_hash_when_freed(
+          page_id_t(space->id,
+                    first_page_in_extent + i));
+      }
+    }
+  }
+#endif /* BTR_CUR_HASH_ADAPT */
+
+  uint16_t lst;
+
+  if (xdes_is_full(descr)) {
+    lst = static_cast<uint16_t>(FSEG_FULL + ioffset);
+remove:
+    err = flst_remove(iblock, lst, xdes, xoffset, mtr);
+    if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+      return err;
+    }
+  } else if (!xdes_get_n_used(descr)) {
+    lst = static_cast<uint16_t>(FSEG_FREE + ioffset);
+    goto remove;
+  } else {
+    err = flst_remove(
+      iblock, static_cast<uint16_t>(FSEG_NOT_FULL + ioffset),
+      xdes, xoffset, mtr);
+    if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+      return err;
+    }
+    uint32_t not_full_n_used = mach_read_from_4(
+      FSEG_NOT_FULL_N_USED + seg_inode);
+    uint32_t descr_n_used = xdes_get_n_used(descr);
+    if (not_full_n_used < descr_n_used) {
+      return DB_CORRUPTION;
+    }
+    mtr->write<4>(*iblock, seg_inode + FSEG_NOT_FULL_N_USED,
+                  not_full_n_used - descr_n_used);
+  }
+
+  std::vector<uint8_t> going_to_free;
+  static_assert(FSP_EXTENT_SIZE_MIN == 256, "compatibility");
+  static_assert(FSP_EXTENT_SIZE_MAX == 64, "compatibility");
+
+  for (uint32_t i = 0; i < FSP_EXTENT_SIZE; i++) {
+    if (!xdes_is_free(descr, i)) {
+      going_to_free.emplace_back(uint8_t(i));
+    }
+  }
+
+  if (dberr_t err = fsp_free_extent(space, page, mtr)) {
+    return err;
+  }
+
+  for (uint32_t i : going_to_free) {
+    mtr->free(*space, first_page_in_extent + i);
+    buf_page_free(space, first_page_in_extent + i, mtr);
+  }
+
+  return DB_SUCCESS;
+}
+
+/** Frees part of a segment. This function can be used to free
+a segment by repeatedly calling this function in different
+mini-transactions. Doing the freeing in a single mini-transaction
+might result in too big a mini-transaction.
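+
+The expected call pattern is therefore a loop with one mini-transaction
+per step, roughly as sketched below (caller-side latching and error
+handling elided; illustrative only):
+
+  for (bool finished= false; !finished;)
+  {
+    mtr_t mtr;
+    mtr.start();
+    finished= fseg_free_step(header, &mtr);
+    mtr.commit();
+  }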
+@param header segment header; NOTE: if the header resides on first + page of the frag list of the segment, this pointer + becomes obsolete after the last freeing step +@param mtr mini-transaction +@param ahi Drop the adaptive hash index +@return whether the freeing was completed */ +bool +fseg_free_step( + fseg_header_t* header, + mtr_t* mtr +#ifdef BTR_CUR_HASH_ADAPT + ,bool ahi +#endif /* BTR_CUR_HASH_ADAPT */ + ) +{ + ulint n; + fseg_inode_t* inode; + + const uint32_t space_id = page_get_space_id(page_align(header)); + const uint32_t header_page = page_get_page_no(page_align(header)); + + fil_space_t* space = mtr->x_lock_space(space_id); + xdes_t* descr = xdes_get_descriptor(space, header_page, mtr); + + if (!descr) { + return true; + } + + /* Check that the header resides on a page which has not been + freed yet */ + + if (UNIV_UNLIKELY(xdes_is_free(descr, + header_page & (FSP_EXTENT_SIZE - 1)))) { + /* Some corruption was detected: stop the freeing + in order to prevent a crash. */ + return true; + } + buf_block_t* iblock; + const ulint zip_size = space->zip_size(); + inode = fseg_inode_try_get(header, space_id, zip_size, mtr, &iblock); + if (!inode || space->is_stopping()) { + return true; + } + + if (!space->full_crc32()) { + fil_block_check_type(*iblock, FIL_PAGE_INODE, mtr); + } + + dberr_t err; + descr = fseg_get_first_extent(inode, space, mtr, &err); + + if (descr) { + /* Free the extent held by the segment */ + return fseg_free_extent(inode, iblock, space, + xdes_get_offset(descr), mtr +#ifdef BTR_CUR_HASH_ADAPT + , ahi +#endif /* BTR_CUR_HASH_ADAPT */ + ) != DB_SUCCESS; + } + + if (err != DB_SUCCESS || space->is_stopping()) { + return true; + } + + /* Free a frag page */ + n = fseg_find_last_used_frag_page_slot(inode); + + if (n == ULINT_UNDEFINED) { + /* Freeing completed: free the segment inode */ + fsp_free_seg_inode(space, inode, iblock, mtr); + return true; + } + + page_no_t page_no = fseg_get_nth_frag_page_no(inode, n); + + if (fseg_free_page_low(inode, iblock, space, page_no, mtr +#ifdef BTR_CUR_HASH_ADAPT + , ahi +#endif /* BTR_CUR_HASH_ADAPT */ + ) != DB_SUCCESS) { + return true; + } + + buf_page_free(space, page_no, mtr); + + n = fseg_find_last_used_frag_page_slot(inode); + + if (n == ULINT_UNDEFINED) { + /* Freeing completed: free the segment inode */ + fsp_free_seg_inode(space, inode, iblock, mtr); + + return true; + } + + return false; +} + +bool +fseg_free_step_not_header( + fseg_header_t* header, + mtr_t* mtr +#ifdef BTR_CUR_HASH_ADAPT + ,bool ahi +#endif /* BTR_CUR_HASH_ADAPT */ + ) +{ + fseg_inode_t* inode; + + const uint32_t space_id = page_get_space_id(page_align(header)); + ut_ad(mtr->is_named_space(space_id)); + + fil_space_t* space = mtr->x_lock_space(space_id); + buf_block_t* iblock; + + inode = fseg_inode_try_get(header, space_id, space->zip_size(), + mtr, &iblock); + if (space->is_stopping()) { + return true; + } + + if (!inode) { + ib::warn() << "Double free of " + << page_id_t(space_id, + page_get_page_no(page_align(header))); + return true; + } + + if (!space->full_crc32()) { + fil_block_check_type(*iblock, FIL_PAGE_INODE, mtr); + } + + dberr_t err; + if (xdes_t* descr = fseg_get_first_extent(inode, space, mtr, &err)) { + /* Free the extent held by the segment */ + return fseg_free_extent(inode, iblock, space, + xdes_get_offset(descr), + mtr +#ifdef BTR_CUR_HASH_ADAPT + , ahi +#endif /* BTR_CUR_HASH_ADAPT */ + ) != DB_SUCCESS; + } else if (err != DB_SUCCESS) { + return true; + } + + /* Free a frag page */ + + ulint n = 
fseg_find_last_used_frag_page_slot(inode); + + if (UNIV_UNLIKELY(n == ULINT_UNDEFINED)) { + return true; + } + + uint32_t page_no = fseg_get_nth_frag_page_no(inode, n); + + if (page_no == page_get_page_no(page_align(header))) { + return true; + } + + if (fseg_free_page_low(inode, iblock, space, page_no, mtr +#ifdef BTR_CUR_HASH_ADAPT + , ahi +#endif /* BTR_CUR_HASH_ADAPT */ + ) != DB_SUCCESS) { + return true; + } + buf_page_free(space, page_no, mtr); + return false; +} + +/** Returns the first extent descriptor for a segment. +We think of the extent lists of the segment catenated in the order +FSEG_FULL -> FSEG_NOT_FULL -> FSEG_FREE. +@param[in] inode segment inode +@param[in] space tablespace +@param[in,out] mtr mini-transaction +@return the first extent descriptor +@retval nullptr if none, or on corruption */ +MY_ATTRIBUTE((nonnull, warn_unused_result)) +static +xdes_t* +fseg_get_first_extent( + fseg_inode_t* inode, + const fil_space_t* space, + mtr_t* mtr, + dberr_t* err) +{ + if (UNIV_UNLIKELY(space->id != page_get_space_id(page_align(inode)) || + memcmp(inode + FSEG_MAGIC_N, FSEG_MAGIC_N_BYTES, 4))) + { + corrupted: + *err= DB_CORRUPTION; + return nullptr; + } + + fil_addr_t first; + + if (flst_get_len(inode + FSEG_FULL)) + first= flst_get_first(inode + FSEG_FULL); + else if (flst_get_len(inode + FSEG_NOT_FULL)) + first= flst_get_first(inode + FSEG_NOT_FULL); + else if (flst_get_len(inode + FSEG_FREE)) + first= flst_get_first(inode + FSEG_FREE); + else + { + *err= DB_SUCCESS; + return nullptr; + } + + if (first.page == FIL_NULL) + goto corrupted; + + return xdes_lst_get_descriptor(*space, first, mtr, nullptr, err); +} + +#ifdef UNIV_BTR_PRINT +/*******************************************************************//** +Writes info of a segment. */ +static void fseg_print_low(const fseg_inode_t *inode) +{ + ulint space; + ulint n_used; + ulint n_frag; + ulint n_free; + ulint n_not_full; + ulint n_full; + ulint reserved; + ulint used; + ulint page_no; + ib_id_t seg_id; + + space = page_get_space_id(page_align(inode)); + page_no = page_get_page_no(page_align(inode)); + + reserved = fseg_n_reserved_pages_low(inode, &used); + + seg_id = mach_read_from_8(inode + FSEG_ID); + n_used = mach_read_from_4(inode + FSEG_NOT_FULL_N_USED); + n_frag = fseg_get_n_frag_pages(inode); + n_free = flst_get_len(inode + FSEG_FREE); + n_not_full = flst_get_len(inode + FSEG_NOT_FULL); + n_full = flst_get_len(inode + FSEG_FULL); + + ib::info() << "SEGMENT id " << seg_id + << " space " << space << ";" + << " page " << page_no << ";" + << " res " << reserved << " used " << used << ";" + << " full ext " << n_full << ";" + << " fragm pages " << n_frag << ";" + << " free extents " << n_free << ";" + << " not full extents " << n_not_full << ": pages " << n_used; + + ut_ad(!memcmp(FSEG_MAGIC_N_BYTES, FSEG_MAGIC_N + inode, 4)); +} + +/*******************************************************************//** +Writes info of a segment. 
*/ +void +fseg_print( +/*=======*/ + fseg_header_t* header, /*!< in: segment header */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + const fil_space_t *space= + mtr->x_lock_space(page_get_space_id(page_align(header))); + buf_block_t *block; + if (fseg_inode_t *inode= + fseg_inode_try_get(header, space->id, space->zip_size(), mtr, &block)) + fseg_print_low(inode); +} +#endif /* UNIV_BTR_PRINT */ + +#ifdef UNIV_DEBUG +std::ostream &fseg_header::to_stream(std::ostream &out) const +{ + out << "[fseg_header_t: space=" + << mach_read_from_4(m_header + FSEG_HDR_SPACE) + << ", page=" << mach_read_from_4(m_header + FSEG_HDR_PAGE_NO) + << ", offset=" << mach_read_from_2(m_header + FSEG_HDR_OFFSET) << "]"; + return out; +} +#endif /* UNIV_DEBUG */ diff --git a/storage/innobase/fsp/fsp0space.cc b/storage/innobase/fsp/fsp0space.cc new file mode 100644 index 00000000..c2152b08 --- /dev/null +++ b/storage/innobase/fsp/fsp0space.cc @@ -0,0 +1,224 @@ +/***************************************************************************** + +Copyright (c) 2013, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file fsp/fsp0space.cc +Shared tablespace implementation. + +Created 2012-11-16 by Sunny Bains as srv/srv0space.cc +*******************************************************/ + +#include "fsp0sysspace.h" +#include "fsp0fsp.h" +#include "os0file.h" +#include "my_sys.h" + +/** Check if two tablespaces have common data file names. +@param other_space Tablespace to check against this. +@return true if they have the same data filenames and paths */ +bool +Tablespace::intersection( + const Tablespace* other_space) +{ + for (files_t::const_iterator it(other_space->begin()), + end(other_space->end()); it != end; ++it) { + + if (find(it->m_filename)) { + + return(true); + } + } + + return(false); +} + +/** Frees the memory allocated by the SysTablespace object. */ +void +Tablespace::shutdown() +{ + for (iterator it = begin(); it != end(); ++it) { + it->shutdown(); + } + + m_files.clear(); + ut_free(m_path); + m_path = NULL; + m_space_id = UINT32_MAX; +} + +/** Note that the data file was found. +@param[in,out] file Data file object to set */ +void +Tablespace::file_found(Datafile& file) +{ + /* Note that the file exists and can be opened + in the appropriate mode. */ + file.m_exists = true; + + file.set_open_flags( + &file == &m_files.front() + ? OS_FILE_OPEN_RETRY : OS_FILE_OPEN); +} + +/** Open or Create the data files if they do not exist. 
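+Each data file is first opened or created with a temporary handle to
+validate it, then closed again and registered with fil_system, so that
+it can be re-opened lazily through the tablespace cache.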
+@param[in] is_temp whether this is a temporary tablespace +@return DB_SUCCESS or error code */ +dberr_t +Tablespace::open_or_create(bool is_temp) +{ + fil_space_t* space = NULL; + dberr_t err = DB_SUCCESS; + + ut_ad(!m_files.empty()); + + for (iterator it = begin(); it != end(); ++it) { + if (it->m_exists) { + err = it->open_or_create( + m_ignore_read_only + ? false : srv_read_only_mode); + if (err != DB_SUCCESS) { + return err; + } + } else { + err = it->open_or_create( + m_ignore_read_only + ? false : srv_read_only_mode); + + if (err != DB_SUCCESS) { + return err; + } + + /* Set the correct open flags now that we have + successfully created the file. */ + file_found(*it); + } + + /* We can close the handle now and open the tablespace + the proper way. */ + it->close(); + + if (it == begin()) { + /* First data file. */ + + /* Create the tablespace entry for the multi-file + tablespace in the tablespace manager. */ + uint32_t fsp_flags; + + switch (srv_checksum_algorithm) { + case SRV_CHECKSUM_ALGORITHM_FULL_CRC32: + case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32: + fsp_flags = (FSP_FLAGS_FCRC32_MASK_MARKER + | FSP_FLAGS_FCRC32_PAGE_SSIZE()); + break; + default: + fsp_flags = FSP_FLAGS_PAGE_SSIZE(); + } + + mysql_mutex_lock(&fil_system.mutex); + space = fil_space_t::create( + m_space_id, fsp_flags, + is_temp + ? FIL_TYPE_TEMPORARY : FIL_TYPE_TABLESPACE, + NULL); + if (!space) { + mysql_mutex_unlock(&fil_system.mutex); + return DB_ERROR; + } + } else { + mysql_mutex_lock(&fil_system.mutex); + } + space->add(it->m_filepath, OS_FILE_CLOSED, it->m_size, + false, true); + mysql_mutex_unlock(&fil_system.mutex); + } + + return(err); +} + +/** Find a filename in the list of Datafiles for a tablespace +@return true if the filename exists in the data files */ +bool +Tablespace::find(const char* filename) const +{ + for (const_iterator it = begin(); it != end(); ++it) { + + if (innobase_strcasecmp(filename, it->m_filename) == 0) { + return(true); + } + } + + return(false); +} + +/** Delete all the data files. */ +void +Tablespace::delete_files() +{ + for (iterator it = begin(); it != end(); ++it) { + + it->close(); + + bool file_pre_exists; + bool success = os_file_delete_if_exists( + innodb_data_file_key, it->m_filepath, &file_pre_exists); + + if (success && file_pre_exists) { + ib::info() << "Removed temporary tablespace data" + " file: \"" << it->m_filepath << "\""; + } + } +} + +/** Use the ADD DATAFILE path to create a Datafile object and add it to the +front of m_files. +Parse the datafile path into a path and a filename with extension 'ibd'. +This datafile_path provided may or may not be an absolute path, but it +must end with the extension .ibd and have a basename of at least 1 byte. + +Set tablespace m_path member and add a Datafile with the filename. +@param[in] datafile_path full path of the tablespace file. */ +dberr_t Tablespace::add_datafile(const char *filepath) +{ + /* The path provided ends in ".ibd". This was assured by + validate_create_tablespace_info() */ + ut_d(const char* dot = strrchr(filepath, '.')); + ut_ad(dot != NULL && 0 == strcmp(dot, DOT_IBD)); + + /* If the path is an absolute path, separate it onto m_path and a + basename. For relative paths, make the whole thing a basename so that + it can be appended to the datadir. */ + bool is_abs_path = is_absolute_path(filepath); + size_t dirlen = (is_abs_path ? 
dirname_length(filepath) : 0);
+	const char*	basename = filepath + dirlen;
+
+	/* If the pathname contains a directory separator, fill the
+	m_path member which is the default directory for files in this
+	tablespace. Leave it null otherwise. */
+	if (dirlen > 0) {
+		set_path(filepath, dirlen);
+	}
+
+	/* Now add a new Datafile and set the filepath
+	using the m_path created above. */
+	m_files.push_back(Datafile(m_flags, FIL_IBD_FILE_INITIAL_SIZE, 0));
+	m_files.back().make_filepath(m_path, {basename, strlen(basename) - 4},
+				     IBD);
+
+	return(DB_SUCCESS);
+}
diff --git a/storage/innobase/fsp/fsp0sysspace.cc b/storage/innobase/fsp/fsp0sysspace.cc
new file mode 100644
index 00000000..e4a43e48
--- /dev/null
+++ b/storage/innobase/fsp/fsp0sysspace.cc
@@ -0,0 +1,1019 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file fsp/fsp0sysspace.cc
+Multi file, shared, system tablespace implementation.
+
+Created 2012-11-16 by Sunny Bains as srv/srv0space.cc
+Refactored 2013-7-26 by Kevin Lewis
+*******************************************************/
+
+#include "fsp0sysspace.h"
+#include "srv0start.h"
+#include "trx0sys.h"
+#include "dict0load.h"
+#include "mem0mem.h"
+#include "os0file.h"
+#include "row0mysql.h"
+#include "buf0dblwr.h"
+
+/** The server header file is included to access the opt_initialize global
+variable. If the server passes the create/open DB option down to the storage
+engine, we should remove this direct reference to a server header and a
+global variable. */
+#include "mysqld.h"
+
+/** The control info of the system tablespace. */
+SysTablespace srv_sys_space;
+
+/** The control info of a temporary table shared tablespace. */
+SysTablespace srv_tmp_space;
+
+/** If the last data file is auto-extended, we add this many pages to it
+at a time. We have to make this public because it is a config variable. */
+uint sys_tablespace_auto_extend_increment;
+
+/** Convert a numeric string that optionally ends in G, M or K
+to a number of megabytes.
+@param[in]	ptr	string with a quantity in bytes
+@param[out]	megs	the number in megabytes
+@return next character in string */
+char*
+SysTablespace::parse_units(
+	char*	ptr,
+	ulint*	megs)
+{
+	char*	endp;
+
+	*megs = strtoul(ptr, &endp, 10);
+
+	ptr = endp;
+
+	switch (*ptr) {
+	case 'G': case 'g':
+		*megs *= 1024;
+		/* fall through */
+	case 'M': case 'm':
+		++ptr;
+		break;
+	case 'K': case 'k':
+		*megs /= 1024;
+		++ptr;
+		break;
+	default:
+		*megs /= 1024 * 1024;
+		break;
+	}
+
+	return(ptr);
+}
+
+/** Parse the input params and populate member variables.
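+
+As a sketch of the accepted syntax (the file names and sizes below are
+hypothetical), a specification such as
+@code
+ibdata1:1G;ibdata2:50M:autoextend:max:2G
+@endcode
+describes two data files: ibdata1 with a fixed size of 1 GiB, and
+ibdata2 starting at 50 MiB, auto-extending up to a maximum of 2 GiB.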
+@param[in] filepath path to data files +@param[in] supports_raw true if the tablespace supports raw devices +@return true on success parse */ +bool +SysTablespace::parse_params( + const char* filepath_spec, + bool supports_raw) +{ + char* filepath; + ulint size; + char* input_str; + ulint n_files = 0; + + ut_ad(m_last_file_size_max == 0); + ut_ad(!m_auto_extend_last_file); + + char* new_str = mem_strdup(filepath_spec); + char* str = new_str; + + input_str = str; + + /*---------------------- PASS 1 ---------------------------*/ + /* First calculate the number of data files and check syntax: + filepath:size[K |M | G];filepath:size[K |M | G]... . + Note that a Windows path may contain a drive name and a ':'. */ + while (*str != '\0') { + filepath = str; + + while ((*str != ':' && *str != '\0') + || (*str == ':' + && (*(str + 1) == '\\' || *(str + 1) == '/' + || *(str + 1) == ':'))) { + str++; + } + + if (*str == '\0') { + ut_free(new_str); + + ib::error() + << "syntax error in file path or size" + " specified is less than 1 megabyte"; + return(false); + } + + str++; + + str = parse_units(str, &size); + + if (0 == strncmp(str, ":autoextend", + (sizeof ":autoextend") - 1)) { + + str += (sizeof ":autoextend") - 1; + + if (0 == strncmp(str, ":max:", + (sizeof ":max:") - 1)) { + + str += (sizeof ":max:") - 1; + + str = parse_units(str, &size); + } + + if (*str != '\0') { + ut_free(new_str); + ib::error() + << "syntax error in file path or" + << " size specified is less than" + << " 1 megabyte"; + return(false); + } + } + + if (::strlen(str) >= 6 + && *str == 'n' + && *(str + 1) == 'e' + && *(str + 2) == 'w') { + + if (!supports_raw) { + ib::error() + << "Tablespace doesn't support raw" + " devices"; + ut_free(new_str); + return(false); + } + + str += 3; + } + + if (*str == 'r' && *(str + 1) == 'a' && *(str + 2) == 'w') { + str += 3; + + if (!supports_raw) { + ib::error() + << "Tablespace doesn't support raw" + " devices"; + ut_free(new_str); + return(false); + } + } + + if (size == 0) { + + ut_free(new_str); + + ib::error() + << "syntax error in file path or size" + " specified is less than 1 megabyte"; + + return(false); + } + + ++n_files; + + if (*str == ';') { + str++; + } else if (*str != '\0') { + ut_free(new_str); + + ib::error() + << "syntax error in file path or size" + " specified is less than 1 megabyte"; + return(false); + } + } + + if (n_files == 0) { + + /* filepath_spec must contain at least one data file + definition */ + + ut_free(new_str); + + ib::error() + << "syntax error in file path or size specified" + " is less than 1 megabyte"; + + return(false); + } + + /*---------------------- PASS 2 ---------------------------*/ + /* Then store the actual values to our arrays */ + str = input_str; + ulint order = 0; + + while (*str != '\0') { + filepath = str; + + /* Note that we must step over the ':' in a Windows filepath; + a Windows path normally looks like C:\ibdata\ibdata1:1G, but + a Windows raw partition may have a specification like + \\.\C::1Gnewraw or \\.\PHYSICALDRIVE2:1Gnewraw */ + + while ((*str != ':' && *str != '\0') + || (*str == ':' + && (*(str + 1) == '\\' || *(str + 1) == '/' + || *(str + 1) == ':'))) { + str++; + } + + if (*str == ':') { + /* Make filepath a null-terminated string */ + *str = '\0'; + str++; + } + + str = parse_units(str, &size); + + if (0 == strncmp(str, ":autoextend", + (sizeof ":autoextend") - 1)) { + + m_auto_extend_last_file = true; + + str += (sizeof ":autoextend") - 1; + + if (0 == strncmp(str, ":max:", + (sizeof ":max:") - 1)) { + + str += 
(sizeof ":max:") - 1; + + str = parse_units(str, &m_last_file_size_max); + } + + if (*str != '\0') { + ut_free(new_str); + ib::error() << "syntax error in file path or" + " size specified is less than 1" + " megabyte"; + return(false); + } + } + + m_files.push_back(Datafile(flags(), uint32_t(size), order)); + m_files.back().make_filepath(path(), + {filepath, strlen(filepath)}, + NO_EXT); + + if (::strlen(str) >= 6 + && *str == 'n' + && *(str + 1) == 'e' + && *(str + 2) == 'w') { + + ut_a(supports_raw); + + str += 3; + + /* Initialize new raw device only during initialize */ + /* JAN: TODO: MySQL 5.7 used opt_initialize */ + m_files.back().m_type = + opt_bootstrap ? SRV_NEW_RAW : SRV_OLD_RAW; + } + + if (*str == 'r' && *(str + 1) == 'a' && *(str + 2) == 'w') { + + ut_a(supports_raw); + + str += 3; + + /* Initialize new raw device only during initialize */ + if (m_files.back().m_type == SRV_NOT_RAW) { + /* JAN: TODO: MySQL 5.7 used opt_initialize */ + m_files.back().m_type = + opt_bootstrap ? SRV_NEW_RAW : SRV_OLD_RAW; + } + } + + if (*str == ';') { + ++str; + } + order++; + } + + ut_ad(n_files == ulint(m_files.size())); + + ut_free(new_str); + + return(true); +} + +/** Frees the memory allocated by the parse method. */ +void +SysTablespace::shutdown() +{ + Tablespace::shutdown(); + + m_auto_extend_last_file = 0; + m_last_file_size_max = 0; + m_created_new_raw = 0; + m_is_tablespace_full = false; + m_sanity_checks_done = false; +} + +/** Verify the size of the physical file. +@param[in] file data file object +@return DB_SUCCESS if OK else error code. */ +dberr_t +SysTablespace::check_size( + Datafile& file) +{ + os_offset_t size = os_file_get_size(file.m_handle); + ut_a(size != (os_offset_t) -1); + + /* Under some error conditions like disk full scenarios + or file size reaching filesystem limit the data file + could contain an incomplete extent at the end. When we + extend a data file and if some failure happens, then + also the data file could contain an incomplete extent. + So we need to round the size downward to a megabyte.*/ + + const uint32_t rounded_size_pages = static_cast( + size >> srv_page_size_shift); + + /* If last file */ + if (&file == &m_files.back() && m_auto_extend_last_file) { + + if (file.m_size > rounded_size_pages + || (m_last_file_size_max > 0 + && m_last_file_size_max < rounded_size_pages)) { + ib::error() << "The Auto-extending data file '" + << file.filepath() + << "' is of a different size " + << rounded_size_pages + << " pages than specified" + " by innodb_data_file_path"; + return(DB_ERROR); + } + + file.m_size = rounded_size_pages; + } + + if (rounded_size_pages != file.m_size) { + ib::error() << "The data file '" + << file.filepath() << "' is of a different size " + << rounded_size_pages << " pages" + " than the " << file.m_size << " pages specified by" + " innodb_data_file_path"; + return(DB_ERROR); + } + + return(DB_SUCCESS); +} + +/** Set the size of the file. +@param[in] file data file object +@return DB_SUCCESS or error code */ +dberr_t +SysTablespace::set_size( + Datafile& file) +{ + ut_ad(!srv_read_only_mode || m_ignore_read_only); + const ib::bytes_iec b{uint64_t{file.m_size} << srv_page_size_shift}; + + /* We created the data file and now write it full of zeros */ + ib::info() << "Setting file '" << file.filepath() << "' size to " << b + << ". 
Physically writing the file full; Please wait ..."; + + bool success = os_file_set_size( + file.m_filepath, file.m_handle, + static_cast(file.m_size) << srv_page_size_shift); + + if (success) { + ib::info() << "File '" << file.filepath() << "' size is now " + << b + << "."; + } else { + ib::error() << "Could not set the file size of '" + << file.filepath() << "'. Probably out of disk space"; + + return(DB_ERROR); + } + + return(DB_SUCCESS); +} + +/** Create a data file. +@param[in] file data file object +@return DB_SUCCESS or error code */ +dberr_t +SysTablespace::create_file( + Datafile& file) +{ + dberr_t err = DB_SUCCESS; + + ut_a(!file.m_exists); + ut_ad(!srv_read_only_mode || m_ignore_read_only); + + switch (file.m_type) { + case SRV_NEW_RAW: + + /* The partition is opened, not created; then it is + written over */ + m_created_new_raw = true; + + /* Fall through. */ + + case SRV_OLD_RAW: + + srv_start_raw_disk_in_use = TRUE; + + /* Fall through. */ + + case SRV_NOT_RAW: + err = file.open_or_create( + !m_ignore_read_only && srv_read_only_mode); + break; + } + + if (err != DB_SUCCESS) { + return err; + } + + switch (file.m_type) { + case SRV_OLD_RAW: + break; + case SRV_NOT_RAW: +#ifndef _WIN32 + if (!space_id() && my_disable_locking + && os_file_lock(file.m_handle, file.m_filepath)) { + err = DB_ERROR; + break; + } +#endif + /* fall through */ + case SRV_NEW_RAW: + err = set_size(file); + } + + return(err); +} + +/** Open a data file. +@param[in] file data file object +@return DB_SUCCESS or error code */ +dberr_t +SysTablespace::open_file( + Datafile& file) +{ + dberr_t err = DB_SUCCESS; + + ut_a(file.m_exists); + + switch (file.m_type) { + case SRV_NEW_RAW: + /* The partition is opened, not created; then it is + written over */ + m_created_new_raw = true; + + /* Fall through */ + + case SRV_OLD_RAW: + srv_start_raw_disk_in_use = TRUE; + + if (srv_read_only_mode && !m_ignore_read_only) { + ib::error() << "Can't open a raw device '" + << file.m_filepath << "' when" + " --innodb-read-only is set"; + + return(DB_ERROR); + } + + /* Fall through */ + + case SRV_NOT_RAW: + err = file.open_or_create( + !m_ignore_read_only && srv_read_only_mode); + + if (err != DB_SUCCESS) { + return(err); + } + break; + } + + switch (file.m_type) { + case SRV_NEW_RAW: + /* Set file size for new raw device. */ + err = set_size(file); + break; + + case SRV_NOT_RAW: +#ifndef _WIN32 + if (!space_id() && (m_ignore_read_only || !srv_read_only_mode) + && my_disable_locking + && os_file_lock(file.m_handle, file.m_filepath)) { + err = DB_ERROR; + break; + } +#endif + /* Check file size for existing file. */ + err = check_size(file); + break; + + case SRV_OLD_RAW: + err = DB_SUCCESS; + break; + + } + + if (err != DB_SUCCESS) { + file.close(); + } + + return(err); +} + +/** Check the tablespace header for this tablespace. +@return DB_SUCCESS or error code */ +inline dberr_t SysTablespace::read_lsn_and_check_flags() +{ + dberr_t err; + + files_t::iterator it = m_files.begin(); + + ut_a(it->m_exists); + + if (it->m_handle == OS_FILE_CLOSED) { + + err = it->open_or_create( + m_ignore_read_only ? false : srv_read_only_mode); + + if (err != DB_SUCCESS) { + return(err); + } + } + + err = it->read_first_page( + m_ignore_read_only ? false : srv_read_only_mode); + + if (err != DB_SUCCESS) { + return(err); + } + + ut_a(it->order() == 0); + + if (srv_operation <= SRV_OPERATION_EXPORT_RESTORED) { + buf_dblwr.init_or_load_pages(it->handle(), it->filepath()); + } + + /* Check the contents of the first page of the + first datafile. 
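+If the first page fails validation, we attempt to restore it from the
+doublewrite buffer and validate it once more before giving up.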
*/ + for (int retry = 0; retry < 2; ++retry) { + + err = it->validate_first_page(); + + if (err != DB_SUCCESS + && (retry == 1 + || recv_sys.dblwr.restore_first_page( + it->m_space_id, it->m_filepath, + it->handle()))) { + + it->close(); + + return(err); + } + } + + /* Make sure the tablespace space ID matches the + space ID on the first page of the first datafile. */ + if (space_id() != it->m_space_id) { + + ib::error() + << "The data file '" << it->filepath() + << "' has the wrong space ID. It should be " + << space_id() << ", but " << it->m_space_id + << " was found"; + + it->close(); + + return(err); + } + + if (srv_operation == SRV_OPERATION_NORMAL) { + /* Prepare for possible upgrade from 0-sized ib_logfile0. */ + ut_ad(!log_sys.next_checkpoint_lsn); + log_sys.next_checkpoint_lsn = mach_read_from_8( + it->m_first_page + 26/*FIL_PAGE_FILE_FLUSH_LSN*/); + } + + it->close(); + + return(DB_SUCCESS); +} + +/** Check if a file can be opened in the correct mode. +@param[in] file data file object +@param[out] reason exact reason if file_status check failed. +@return DB_SUCCESS or error code. */ +dberr_t +SysTablespace::check_file_status( + const Datafile& file, + file_status_t& reason) +{ + os_file_stat_t stat; + + memset(&stat, 0x0, sizeof(stat)); + + dberr_t err = os_file_get_status( + file.m_filepath, &stat, true, + m_ignore_read_only ? false : srv_read_only_mode); + + reason = FILE_STATUS_VOID; + /* File exists but we can't read the rw-permission settings. */ + switch (err) { + case DB_FAIL: + ib::error() << "os_file_get_status() failed on '" + << file.filepath() + << "'. Can't determine file permissions"; + err = DB_ERROR; + reason = FILE_STATUS_RW_PERMISSION_ERROR; + break; + + case DB_SUCCESS: + /* Note: stat.rw_perm is only valid for "regular" files */ + + if (stat.type == OS_FILE_TYPE_FILE) { + if (!stat.rw_perm) { + ib::error() << "The data file" + << " '" << file.filepath() + << ((!srv_read_only_mode + || m_ignore_read_only) + ? "' must be writable" + : "' must be readable"); + + err = DB_ERROR; + reason = FILE_STATUS_READ_WRITE_ERROR; + } + + } else { + /* Not a regular file, bail out. */ + ib::error() << "The data file '" << file.filepath() + << "' is not a regular file."; + + err = DB_ERROR; + reason = FILE_STATUS_NOT_REGULAR_FILE_ERROR; + } + break; + + case DB_NOT_FOUND: + break; + + default: + ut_ad(0); + } + + return(err); +} + +/** Note that the data file was not found. +@param[in] file data file object +@param[out] create_new_db true if a new instance to be created +@return DB_SUCESS or error code */ +dberr_t +SysTablespace::file_not_found( + Datafile& file, + bool* create_new_db) +{ + file.m_exists = false; + + if (m_ignore_read_only) { + } else if (srv_read_only_mode) { + ib::error() << "Can't create file '" << file.filepath() + << "' when --innodb-read-only is set"; + return(DB_ERROR); + } else if (srv_force_recovery && space_id() == TRX_SYS_SPACE) { + ib::error() << "Can't create file '" << file.filepath() + << "' when --innodb-force-recovery is set"; + return DB_ERROR; + } + + if (&file == &m_files.front()) { + + /* First data file. */ + ut_a(!*create_new_db); + *create_new_db = TRUE; + + if (space_id() == TRX_SYS_SPACE) { + ib::info() << "The first data file '" + << file.filepath() << "' did not exist." + " A new tablespace will be created!"; + } + + } else { + ib::info() << "Need to create a new data file '" + << file.filepath() << "'."; + } + + /* Set the file create mode. 
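+Regular files will be created from scratch (OS_FILE_CREATE), whereas
+raw devices are assumed to exist already and are only opened
+(OS_FILE_OPEN_RAW).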
*/ + switch (file.m_type) { + case SRV_NOT_RAW: + file.set_open_flags(OS_FILE_CREATE); + break; + + case SRV_NEW_RAW: + case SRV_OLD_RAW: + file.set_open_flags(OS_FILE_OPEN_RAW); + break; + } + + return(DB_SUCCESS); +} + +/** Note that the data file was found. +@param[in,out] file data file object +@return true if a new instance to be created */ +bool +SysTablespace::file_found( + Datafile& file) +{ + /* Note that the file exists and can be opened + in the appropriate mode. */ + file.m_exists = true; + + /* Set the file open mode */ + switch (file.m_type) { + case SRV_NOT_RAW: + file.set_open_flags( + &file == &m_files.front() + ? OS_FILE_OPEN_RETRY : OS_FILE_OPEN); + break; + + case SRV_NEW_RAW: + case SRV_OLD_RAW: + file.set_open_flags(OS_FILE_OPEN_RAW); + break; + } + + /* Need to create the system tablespace for new raw device. */ + return(file.m_type == SRV_NEW_RAW); +} + +/** Check the data file specification. +@param[out] create_new_db true if a new database is to be created +@param[in] min_expected_size Minimum expected tablespace size in bytes +@return DB_SUCCESS if all OK else error code */ +dberr_t +SysTablespace::check_file_spec( + bool* create_new_db, + ulint min_expected_size) +{ + *create_new_db = FALSE; + + if (m_files.size() >= 1000) { + ib::error() << "There must be < 1000 data files " + " but " << m_files.size() << " have been" + " defined."; + + return(DB_ERROR); + } + + if (!m_auto_extend_last_file + && get_sum_of_sizes() + < (min_expected_size >> srv_page_size_shift)) { + ib::error() << "Tablespace size must be at least " + << (min_expected_size >> 20) << " MB"; + return(DB_ERROR); + } + + dberr_t err = DB_SUCCESS; + + ut_a(!m_files.empty()); + + /* If there is more than one data file and the last data file + doesn't exist, that is OK. We allow adding of new data files. */ + + files_t::iterator begin = m_files.begin(); + files_t::iterator end = m_files.end(); + + for (files_t::iterator it = begin; it != end; ++it) { + + file_status_t reason_if_failed; + err = check_file_status(*it, reason_if_failed); + + if (err == DB_NOT_FOUND) { + + err = file_not_found(*it, create_new_db); + + if (err != DB_SUCCESS) { + break; + } + + } else if (err != DB_SUCCESS) { + if (reason_if_failed == FILE_STATUS_READ_WRITE_ERROR) { + ib::error() << "The data file '" + << it->filepath() + << ((!srv_read_only_mode + || m_ignore_read_only) + ? "' must be writable" + : "' must be readable"); + } + + ut_a(err != DB_FAIL); + break; + + } else if (*create_new_db) { + ib::error() << "The data file '" + << begin->filepath() + << "' was not found but" + " one of the other data files '" + << it->filepath() << "' exists."; + + err = DB_ERROR; + break; + + } else { + *create_new_db = file_found(*it); + } + } + + return(err); +} + +/** Open or create the data files +@param[in] is_temp whether this is a temporary tablespace +@param[in] create_new_db whether we are creating a new database +@param[out] sum_new_sizes sum of sizes of the new files added +@return DB_SUCCESS or error code */ +dberr_t +SysTablespace::open_or_create( + bool is_temp, + bool create_new_db, + ulint* sum_new_sizes) +{ + dberr_t err = DB_SUCCESS; + fil_space_t* space = NULL; + + ut_ad(!m_files.empty()); + + if (sum_new_sizes) { + *sum_new_sizes = 0; + } + + files_t::iterator begin = m_files.begin(); + files_t::iterator end = m_files.end(); + + ut_ad(begin->order() == 0); + + for (files_t::iterator it = begin; it != end; ++it) { + + if (it->m_exists) { + err = open_file(*it); + + /* For new raw device increment new size. 
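+A new raw device counts towards the new size even though the device
+node itself already existed on disk.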
*/ + if (sum_new_sizes && it->m_type == SRV_NEW_RAW) { + + *sum_new_sizes += it->m_size; + } + + } else { + err = create_file(*it); + + if (sum_new_sizes) { + *sum_new_sizes += it->m_size; + } + + /* Set the correct open flags now that we have + successfully created the file. */ + if (err == DB_SUCCESS) { + /* We ignore new_db OUT parameter here + as the information is known at this stage */ + file_found(*it); + } + } + + if (err != DB_SUCCESS) { + return(err); + } + + } + + if (!create_new_db && space_id() == TRX_SYS_SPACE) { + /* Validate the header page in the first datafile. */ + err = read_lsn_and_check_flags(); + if (err != DB_SUCCESS) { + return(err); + } + } + + /* Close the curent handles, add space and file info to the + fil_system cache and the Data Dictionary, and re-open them + in file_system cache so that they stay open until shutdown. */ + mysql_mutex_lock(&fil_system.mutex); + ulint node_counter = 0; + for (files_t::iterator it = begin; it != end; ++it) { + it->close(); + it->m_exists = true; + + if (it != begin) { + } else if (is_temp) { + ut_ad(space_id() == SRV_TMP_SPACE_ID); + space = fil_space_t::create( + SRV_TMP_SPACE_ID, flags(), + FIL_TYPE_TEMPORARY, NULL); + ut_ad(space == fil_system.temp_space); + if (!space) { + err = DB_ERROR; + break; + } + ut_ad(!space->is_compressed()); + ut_ad(space->full_crc32()); + } else { + ut_ad(space_id() == TRX_SYS_SPACE); + space = fil_space_t::create( + TRX_SYS_SPACE, it->flags(), + FIL_TYPE_TABLESPACE, NULL); + ut_ad(space == fil_system.sys_space); + if (!space) { + err = DB_ERROR; + break; + } + } + + uint32_t max_size = (++node_counter == m_files.size() + ? (m_last_file_size_max == 0 + ? UINT32_MAX + : uint32_t(m_last_file_size_max)) + : it->m_size); + + space->add(it->m_filepath, OS_FILE_CLOSED, it->m_size, + it->m_type != SRV_NOT_RAW, true, max_size); + } + + mysql_mutex_unlock(&fil_system.mutex); + return(err); +} + +/** Normalize the file size, convert from megabytes to number of pages. 
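+For example, with the default 16 KiB page size (srv_page_size_shift ==
+14), a configured size of 100 (megabytes) becomes
+100 << (20 - 14) == 6400 pages.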
*/
+void
+SysTablespace::normalize_size()
+{
+	files_t::iterator	end = m_files.end();
+
+	for (files_t::iterator it = m_files.begin(); it != end; ++it) {
+
+		it->m_size <<= (20U - srv_page_size_shift);
+	}
+
+	m_last_file_size_max <<= (20U - srv_page_size_shift);
+}
+
+
+/**
+@return next increment size */
+uint32_t SysTablespace::get_increment() const
+{
+  if (m_last_file_size_max == 0)
+    return get_autoextend_increment();
+
+  if (!is_valid_size())
+  {
+    ib::error() << "The last data file has a size of " << last_file_size()
+                << " but the max size allowed is "
+                << m_last_file_size_max;
+  }
+
+  return std::min(uint32_t(m_last_file_size_max) - last_file_size(),
+                  get_autoextend_increment());
+}
+
+
+/**
+@return true if configured to use raw devices */
+bool
+SysTablespace::has_raw_device()
+{
+	files_t::iterator	end = m_files.end();
+
+	for (files_t::iterator it = m_files.begin(); it != end; ++it) {
+
+		if (it->is_raw_device()) {
+			return(true);
+		}
+	}
+
+	return(false);
+}
diff --git a/storage/innobase/fts/Makefile.query b/storage/innobase/fts/Makefile.query
new file mode 100644
index 00000000..d91b1b92
--- /dev/null
+++ b/storage/innobase/fts/Makefile.query
@@ -0,0 +1,18 @@
+LEX=flex
+YACC=bison
+PREFIX=fts
+
+all: fts0pars.cc fts0blex.cc fts0tlex.cc
+
+fts0pars.cc: fts0pars.y
+fts0blex.cc: fts0blex.l
+fts0tlex.cc: fts0tlex.l
+
+.l.cc:
+	echo '#include "univ.i"' > $*.cc
+	$(LEX) --stdout -P$(subst lex,,$*) -o $*.cc \
+		--header-file=../include/$*.h $< >> $*.cc
+
+.y.cc:
+	$(YACC) -p $(PREFIX) -o $*.cc -d $<
+	mv $*.h ../include
diff --git a/storage/innobase/fts/fts0ast.cc b/storage/innobase/fts/fts0ast.cc
new file mode 100644
index 00000000..74d02d63
--- /dev/null
+++ b/storage/innobase/fts/fts0ast.cc
@@ -0,0 +1,816 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2020, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file fts/fts0ast.cc
+Full Text Search parser helper file.
+
+Created 2007/3/16 Sunny Bains.
+***********************************************************************/
+
+#include "row0sel.h"
+#include "fts0ast.h"
+#include "fts0pars.h"
+#include "fts0fts.h"
+#include "trx0trx.h"
+
+/* The FTS ast visit pass. */
+enum fts_ast_visit_pass_t {
+	FTS_PASS_FIRST,		/*!< First visit pass,
+				process operators excluding
+				FTS_EXIST and FTS_IGNORE */
+	FTS_PASS_EXIST,		/*!< Exist visit pass,
+				process operator FTS_EXIST */
+	FTS_PASS_IGNORE		/*!< Ignore visit pass,
+				process operator FTS_IGNORE */
+};
+
+/******************************************************************//**
+Create an empty fts_ast_node_t.
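+The node is zero-initialized; the caller sets its type and registers it
+with the parser state so that it can be reclaimed on error.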
+@return a new, empty node */
+static
+fts_ast_node_t*
+fts_ast_node_create(void)
+/*=====================*/
+{
+	fts_ast_node_t*	node;
+
+	node = (fts_ast_node_t*) ut_zalloc_nokey(sizeof(*node));
+
+	return(node);
+}
+
+/** Track node allocations, in case there is an error during parsing. */
+static
+void
+fts_ast_state_add_node(
+	fts_ast_state_t*state,		/*!< in: ast instance */
+	fts_ast_node_t*	node)		/*!< in: node to add to ast */
+{
+	if (!state->list.head) {
+		ut_a(!state->list.tail);
+
+		state->list.head = state->list.tail = node;
+	} else {
+		state->list.tail->next_alloc = node;
+		state->list.tail = node;
+	}
+}
+
+/******************************************************************//**
+Create an operator fts_ast_node_t.
+@return new node */
+fts_ast_node_t*
+fts_ast_create_node_oper(
+/*=====================*/
+	void*		arg,		/*!< in: ast state instance */
+	fts_ast_oper_t	oper)		/*!< in: ast operator */
+{
+	fts_ast_node_t*	node = fts_ast_node_create();
+
+	node->type = FTS_AST_OPER;
+	node->oper = oper;
+
+	fts_ast_state_add_node((fts_ast_state_t*) arg, node);
+
+	return(node);
+}
+
+/******************************************************************//**
+This function takes ownership of the ptr and is responsible
+for freeing it.
+@return new node or a node list with tokenized words */
+fts_ast_node_t*
+fts_ast_create_node_term(
+/*=====================*/
+	void*			arg,	/*!< in: ast state instance */
+	const fts_ast_string_t*	ptr)	/*!< in: ast term string */
+{
+	fts_ast_state_t*	state = static_cast<fts_ast_state_t*>(arg);
+	ulint			len = ptr->len;
+	ulint			cur_pos = 0;
+	fts_ast_node_t*		node = NULL;
+	fts_ast_node_t*		node_list = NULL;
+	fts_ast_node_t*		first_node = NULL;
+
+	/* Scan the incoming string and filter out any "non-word" characters */
+	while (cur_pos < len) {
+		fts_string_t	str;
+		ulint		cur_len;
+
+		cur_len = innobase_mysql_fts_get_token(
+			state->charset,
+			reinterpret_cast<const byte*>(ptr->str) + cur_pos,
+			reinterpret_cast<const byte*>(ptr->str) + len, &str);
+
+		if (cur_len == 0) {
+			break;
+		}
+
+		cur_pos += cur_len;
+
+		if (str.f_n_char > 0) {
+			/* If the subsequent term (after the first one)'s size
+			is less than fts_min_token_size or the term is greater
+			than fts_max_token_size, we shall ignore it, to stay
+			consistent with MyISAM behavior */
+			if ((first_node && (str.f_n_char < fts_min_token_size))
+			    || str.f_n_char > fts_max_token_size) {
+				continue;
+			}
+
+			node = fts_ast_node_create();
+
+			node->type = FTS_AST_TERM;
+
+			node->term.ptr = fts_ast_string_create(
+				str.f_str, str.f_len);
+
+			fts_ast_state_add_node(
+				static_cast<fts_ast_state_t*>(arg), node);
+
+			if (first_node) {
+				/* There is more than one word, create
+				a list to organize them */
+				if (!node_list) {
+					node_list = fts_ast_create_node_list(
+						static_cast<fts_ast_state_t*>(
+							arg),
+						first_node);
+				}
+
+				fts_ast_add_node(node_list, node);
+			} else {
+				first_node = node;
+			}
+		}
+	}
+
+	return((node_list != NULL) ?
node_list : first_node); +} + +/******************************************************************//** +Create an AST term node, makes a copy of ptr for plugin parser +@return node */ +fts_ast_node_t* +fts_ast_create_node_term_for_parser( +/*================================*/ + void* arg, /*!< in: ast state */ + const char* ptr, /*!< in: term string */ + const ulint len) /*!< in: term string length */ +{ + fts_ast_node_t* node = NULL; + + /* '%' as first char is forbidden for LIKE in internal SQL parser; + '%' as last char is reserved for wildcard search;*/ + if (len == 0 || len > FTS_MAX_WORD_LEN + || ptr[0] == '%' || ptr[len - 1] == '%') { + return(NULL); + } + + node = fts_ast_node_create(); + + node->type = FTS_AST_TERM; + + node->term.ptr = fts_ast_string_create( + reinterpret_cast(ptr), len); + + fts_ast_state_add_node(static_cast(arg), node); + + return(node); +} + +/******************************************************************//** +This function takes ownership of the ptr and is responsible +for free'ing it. +@return new node */ +fts_ast_node_t* +fts_ast_create_node_text( +/*=====================*/ + void* arg, /*!< in: ast state instance */ + const fts_ast_string_t* ptr) /*!< in: ast text string */ +{ + ulint len = ptr->len; + fts_ast_node_t* node = NULL; + + /* Once we come here, the string must have at least 2 quotes "" + around the query string, which could be empty. Also the query + string may contain 0x00 in it, we don't treat it as null-terminated. */ + ut_ad(len >= 2); + ut_ad(ptr->str[0] == '\"' && ptr->str[len - 1] == '\"'); + + if (len == 2) { + /* If the query string contains nothing except quotes, + it's obviously an invalid query. */ + return(NULL); + } + + node = fts_ast_node_create(); + + /*!< We ignore the actual quotes "" */ + len -= 2; + + node->type = FTS_AST_TEXT; + /*!< Skip copying the first quote */ + node->text.ptr = fts_ast_string_create( + reinterpret_cast(ptr->str + 1), len); + node->text.distance = ULINT_UNDEFINED; + + fts_ast_state_add_node((fts_ast_state_t*) arg, node); + + return(node); +} + +/******************************************************************//** +Create an AST phrase list node for plugin parser +@return node */ +fts_ast_node_t* +fts_ast_create_node_phrase_list( +/*============================*/ + void* arg) /*!< in: ast state */ +{ + fts_ast_node_t* node = fts_ast_node_create(); + + node->type = FTS_AST_PARSER_PHRASE_LIST; + + node->text.distance = ULINT_UNDEFINED; + node->list.head = node->list.tail = NULL; + + fts_ast_state_add_node(static_cast(arg), node); + + return(node); +} + +/******************************************************************//** +This function takes ownership of the expr and is responsible +for free'ing it. +@return new node */ +fts_ast_node_t* +fts_ast_create_node_list( +/*=====================*/ + void* arg, /*!< in: ast state instance */ + fts_ast_node_t* expr) /*!< in: ast expr instance */ +{ + fts_ast_node_t* node = fts_ast_node_create(); + + node->type = FTS_AST_LIST; + node->list.head = node->list.tail = expr; + + fts_ast_state_add_node((fts_ast_state_t*) arg, node); + + return(node); +} + +/******************************************************************//** +Create a sub-expression list node. This function takes ownership of +expr and is responsible for deleting it. 
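+A sub-expression list typically corresponds to a parenthesized group in
+a boolean query, for example the "(pie tart)" part of '+apple (pie tart)'.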
+@return new node */ +fts_ast_node_t* +fts_ast_create_node_subexp_list( +/*============================*/ + void* arg, /*!< in: ast state instance */ + fts_ast_node_t* expr) /*!< in: ast expr instance */ +{ + fts_ast_node_t* node = fts_ast_node_create(); + + node->type = FTS_AST_SUBEXP_LIST; + node->list.head = node->list.tail = expr; + + fts_ast_state_add_node((fts_ast_state_t*) arg, node); + + return(node); +} + +/******************************************************************//** +Free an expr list node elements. */ +static +void +fts_ast_free_list( +/*==============*/ + fts_ast_node_t* node) /*!< in: ast node to free */ +{ + ut_a(node->type == FTS_AST_LIST + || node->type == FTS_AST_SUBEXP_LIST + || node->type == FTS_AST_PARSER_PHRASE_LIST); + + for (node = node->list.head; + node != NULL; + node = fts_ast_free_node(node)) { + + /*!< No op */ + } +} + +/********************************************************************//** +Free a fts_ast_node_t instance. +@return next node to free */ +fts_ast_node_t* +fts_ast_free_node( +/*==============*/ + fts_ast_node_t* node) /*!< in: the node to free */ +{ + fts_ast_node_t* next_node; + + switch (node->type) { + case FTS_AST_TEXT: + if (node->text.ptr) { + fts_ast_string_free(node->text.ptr); + node->text.ptr = NULL; + } + break; + + case FTS_AST_TERM: + if (node->term.ptr) { + fts_ast_string_free(node->term.ptr); + node->term.ptr = NULL; + } + break; + + case FTS_AST_LIST: + case FTS_AST_SUBEXP_LIST: + case FTS_AST_PARSER_PHRASE_LIST: + fts_ast_free_list(node); + node->list.head = node->list.tail = NULL; + break; + + case FTS_AST_OPER: + break; + + default: + ut_error; + } + + /*!< Get next node before freeing the node itself */ + next_node = node->next; + + ut_free(node); + + return(next_node); +} + +/******************************************************************//** +This AST takes ownership of the expr and is responsible +for free'ing it. +@return in param "list" */ +fts_ast_node_t* +fts_ast_add_node( +/*=============*/ + fts_ast_node_t* node, /*!< in: list instance */ + fts_ast_node_t* elem) /*!< in: node to add to list */ +{ + if (!elem) { + return(NULL); + } + + ut_a(!elem->next); + ut_a(node->type == FTS_AST_LIST + || node->type == FTS_AST_SUBEXP_LIST + || node->type == FTS_AST_PARSER_PHRASE_LIST); + + if (!node->list.head) { + ut_a(!node->list.tail); + + node->list.head = node->list.tail = elem; + } else { + ut_a(node->list.tail); + + node->list.tail->next = elem; + node->list.tail = elem; + } + + return(node); +} + +/******************************************************************//** +Set the wildcard attribute of a term. */ +void +fts_ast_term_set_wildcard( +/*======================*/ + fts_ast_node_t* node) /*!< in/out: set attribute of + a term node */ +{ + if (!node) { + return; + } + + /* If it's a node list, the wildcard should be set to the tail node*/ + if (node->type == FTS_AST_LIST) { + ut_ad(node->list.tail != NULL); + node = node->list.tail; + } + + ut_a(node->type == FTS_AST_TERM); + ut_a(!node->term.wildcard); + + node->term.wildcard = TRUE; +} + +/******************************************************************//** +Set the proximity attribute of a text node. 
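+The distance originates from a boolean-mode proximity query such as
+'"word1 word2" @ 4', which requires the quoted words to occur within
+the given distance of each other.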
*/ +void +fts_ast_text_set_distance( +/*======================*/ + fts_ast_node_t* node, /*!< in/out: text node */ + ulint distance) /*!< in: the text proximity + distance */ +{ + if (node == NULL) { + return; + } + + ut_a(node->type == FTS_AST_TEXT); + ut_a(node->text.distance == ULINT_UNDEFINED); + + node->text.distance = distance; +} + +/******************************************************************//** +Free node and expr allocations. */ +void +fts_ast_state_free( +/*===============*/ + fts_ast_state_t*state) /*!< in: ast state to free */ +{ + fts_ast_node_t* node = state->list.head; + + /* Free the nodes that were allocated during parsing. */ + while (node) { + fts_ast_node_t* next = node->next_alloc; + + if (node->type == FTS_AST_TEXT && node->text.ptr) { + fts_ast_string_free(node->text.ptr); + node->text.ptr = NULL; + } else if (node->type == FTS_AST_TERM && node->term.ptr) { + fts_ast_string_free(node->term.ptr); + node->term.ptr = NULL; + } + + ut_free(node); + node = next; + } + + state->root = state->list.head = state->list.tail = NULL; +} + +/** Print the ast string +@param[in] str string to print */ +static +void +fts_ast_string_print( + const fts_ast_string_t* ast_str) +{ + for (ulint i = 0; i < ast_str->len; ++i) { + printf("%c", ast_str->str[i]); + } + + printf("\n"); +} + +/******************************************************************//** +Print an ast node recursively. */ +static +void +fts_ast_node_print_recursive( +/*=========================*/ + fts_ast_node_t* node, /*!< in: ast node to print */ + ulint level) /*!< in: recursive level */ +{ + /* Print alignment blank */ + for (ulint i = 0; i < level; i++) { + printf(" "); + } + + switch (node->type) { + case FTS_AST_TEXT: + printf("TEXT: "); + fts_ast_string_print(node->text.ptr); + break; + + case FTS_AST_TERM: + printf("TERM: "); + fts_ast_string_print(node->term.ptr); + break; + + case FTS_AST_LIST: + printf("LIST: \n"); + + for (node = node->list.head; node; node = node->next) { + fts_ast_node_print_recursive(node, level + 1); + } + break; + + case FTS_AST_SUBEXP_LIST: + printf("SUBEXP_LIST: \n"); + + for (node = node->list.head; node; node = node->next) { + fts_ast_node_print_recursive(node, level + 1); + } + break; + + case FTS_AST_OPER: + printf("OPER: %d\n", node->oper); + break; + + case FTS_AST_PARSER_PHRASE_LIST: + printf("PARSER_PHRASE_LIST: \n"); + + for (node = node->list.head; node; node = node->next) { + fts_ast_node_print_recursive(node, level + 1); + } + break; + + default: + ut_error; + } +} + +/******************************************************************//** +Print an ast node */ +void +fts_ast_node_print( +/*===============*/ + fts_ast_node_t* node) /*!< in: ast node to print */ +{ + fts_ast_node_print_recursive(node, 0); +} + +/** Check only union operation involved in the node +@param[in] node ast node to check +@return true if the node contains only union else false. */ +bool +fts_ast_node_check_union( + fts_ast_node_t* node) +{ + if (node->type == FTS_AST_LIST + || node->type == FTS_AST_SUBEXP_LIST) { + + for (node = node->list.head; node; node = node->next) { + if (!fts_ast_node_check_union(node)) { + return(false); + } + } + + } else if (node->type == FTS_AST_PARSER_PHRASE_LIST) { + /* Phrase search for plugin parser */ + return(false); + } else if (node->type == FTS_AST_OPER + && (node->oper == FTS_IGNORE + || node->oper == FTS_EXIST)) { + + return(false); + } else if (node->type == FTS_AST_TEXT) { + /* Distance or phrase search query. 
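+Phrase and proximity searches cannot be evaluated as a plain union of
+individual tokens, so the node does not count as union-only.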
*/ + return(false); + } + + return(true); +} + +/******************************************************************//** +Traverse the AST - in-order traversal, except for the FTX_EXIST and FTS_IGNORE +nodes, which will be ignored in the first pass of each level, and visited in a +second and third pass after all other nodes in the same level are visited. +@return DB_SUCCESS if all went well */ +dberr_t +fts_ast_visit( +/*==========*/ + fts_ast_oper_t oper, /*!< in: current operator */ + fts_ast_node_t* node, /*!< in: current root node */ + fts_ast_callback visitor, /*!< in: callback function */ + void* arg, /*!< in: arg for callback */ + bool* has_ignore) /*!< out: true, if the operator + was ignored during processing, + currently we ignore FTS_EXIST + and FTS_IGNORE operators */ +{ + dberr_t error = DB_SUCCESS; + fts_ast_node_t* oper_node = NULL; + fts_ast_node_t* start_node; + bool revisit = false; + bool will_be_ignored = false; + fts_ast_visit_pass_t visit_pass = FTS_PASS_FIRST; + const trx_t* trx = node->trx; + + start_node = node->list.head; + + ut_a(node->type == FTS_AST_LIST + || node->type == FTS_AST_SUBEXP_LIST); + + if (oper == FTS_EXIST_SKIP) { + visit_pass = FTS_PASS_EXIST; + } else if (oper == FTS_IGNORE_SKIP) { + visit_pass = FTS_PASS_IGNORE; + } + + /* In the first pass of the tree, at the leaf level of the + tree, FTS_EXIST and FTS_IGNORE operation will be ignored. + It will be repeated at the level above the leaf level. + + The basic idea here is that when we encounter FTS_EXIST or + FTS_IGNORE, we will change the operator node into FTS_EXIST_SKIP + or FTS_IGNORE_SKIP, and term node & text node with the operators + is ignored in the first pass. We have two passes during the revisit: + We process nodes with FTS_EXIST_SKIP in the exist pass, and then + process nodes with FTS_IGNORE_SKIP in the ignore pass. + + The order should be restrictly followed, or we will get wrong results. + For example, we have a query 'a +b -c d +e -f'. + first pass: process 'a' and 'd' by union; + exist pass: process '+b' and '+e' by intersection; + ignore pass: process '-c' and '-f' by difference. */ + + for (node = node->list.head; + node && (error == DB_SUCCESS); + node = node->next) { + + switch (node->type) { + case FTS_AST_LIST: + if (visit_pass != FTS_PASS_FIRST) { + break; + } + + error = fts_ast_visit(oper, node, visitor, + arg, &will_be_ignored); + + /* If will_be_ignored is set to true, then + we encountered and ignored a FTS_EXIST or FTS_IGNORE + operator. 
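+The skipped sub-list will be processed again by the FTS_PASS_EXIST and
+FTS_PASS_IGNORE passes below.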
*/ + if (will_be_ignored) { + revisit = true; + /* Remember oper for list in case '-abc&def', + ignored oper is from previous node of list.*/ + node->oper = oper; + } + + break; + + case FTS_AST_OPER: + oper = node->oper; + oper_node = node; + + /* Change the operator for revisit */ + if (oper == FTS_EXIST) { + oper_node->oper = FTS_EXIST_SKIP; + } else if (oper == FTS_IGNORE) { + oper_node->oper = FTS_IGNORE_SKIP; + } + + break; + + default: + if (node->visited) { + continue; + } + + ut_a(oper == FTS_NONE || !oper_node + || oper_node->oper == oper + || oper_node->oper == FTS_EXIST_SKIP + || oper_node->oper == FTS_IGNORE_SKIP); + + if (oper== FTS_EXIST || oper == FTS_IGNORE) { + *has_ignore = true; + continue; + } + + /* Process leaf node accroding to its pass.*/ + if (oper == FTS_EXIST_SKIP + && visit_pass == FTS_PASS_EXIST) { + error = visitor(FTS_EXIST, node, arg); + node->visited = true; + } else if (oper == FTS_IGNORE_SKIP + && visit_pass == FTS_PASS_IGNORE) { + error = visitor(FTS_IGNORE, node, arg); + node->visited = true; + } else if (visit_pass == FTS_PASS_FIRST) { + error = visitor(oper, node, arg); + node->visited = true; + } + } + } + + if (trx_is_interrupted(trx)) { + return DB_INTERRUPTED; + } + + if (revisit) { + /* Exist pass processes the skipped FTS_EXIST operation. */ + for (node = start_node; + node && error == DB_SUCCESS; + node = node->next) { + + if (node->type == FTS_AST_LIST + && node->oper != FTS_IGNORE) { + error = fts_ast_visit(FTS_EXIST_SKIP, node, + visitor, arg, &will_be_ignored); + } + } + + /* Ignore pass processes the skipped FTS_IGNORE operation. */ + for (node = start_node; + node && error == DB_SUCCESS; + node = node->next) { + + if (node->type == FTS_AST_LIST) { + error = fts_ast_visit(FTS_IGNORE_SKIP, node, + visitor, arg, &will_be_ignored); + } + } + } + + return(error); +} + +/** +Create an ast string object, with NUL-terminator, so the string +has one more byte than len +@param[in] str pointer to string +@param[in] len length of the string +@return ast string with NUL-terminator */ +fts_ast_string_t* +fts_ast_string_create( + const byte* str, + ulint len) +{ + fts_ast_string_t* ast_str; + + ut_ad(len > 0); + + ast_str = static_cast( + ut_malloc_nokey(sizeof(fts_ast_string_t))); + + ast_str->str = static_cast(ut_malloc_nokey(len + 1)); + + ast_str->len = len; + memcpy(ast_str->str, str, len); + ast_str->str[len] = '\0'; + + return(ast_str); +} + +/** +Free an ast string instance +@param[in,out] ast_str string to free */ +void +fts_ast_string_free( + fts_ast_string_t* ast_str) +{ + if (ast_str != NULL) { + ut_free(ast_str->str); + ut_free(ast_str); + } +} + +/** +Translate ast string of type FTS_AST_NUMB to unsigned long by strtoul +@param[in] str string to translate +@param[in] base the base +@return translated number */ +ulint +fts_ast_string_to_ul( + const fts_ast_string_t* ast_str, + int base) +{ + return(strtoul(reinterpret_cast(ast_str->str), + NULL, base)); +} + +#ifdef UNIV_DEBUG +const char* +fts_ast_node_type_get(fts_ast_type_t type) +{ + switch (type) { + case FTS_AST_OPER: + return("FTS_AST_OPER"); + case FTS_AST_NUMB: + return("FTS_AST_NUMB"); + case FTS_AST_TERM: + return("FTS_AST_TERM"); + case FTS_AST_TEXT: + return("FTS_AST_TEXT"); + case FTS_AST_LIST: + return("FTS_AST_LIST"); + case FTS_AST_SUBEXP_LIST: + return("FTS_AST_SUBEXP_LIST"); + case FTS_AST_PARSER_PHRASE_LIST: + return("FTS_AST_PARSER_PHRASE_LIST"); + } + ut_ad(0); + return("FTS_UNKNOWN"); +} +#endif /* UNIV_DEBUG */ diff --git a/storage/innobase/fts/fts0blex.cc 
b/storage/innobase/fts/fts0blex.cc new file mode 100644 index 00000000..6a2b4202 --- /dev/null +++ b/storage/innobase/fts/fts0blex.cc @@ -0,0 +1,2177 @@ +#include "univ.i" +#line 2 "fts0blex.cc" + +#line 4 "fts0blex.cc" + +#define YY_INT_ALIGNED short int + +/* A lexical scanner generated by flex */ + +#define FLEX_SCANNER +#define YY_FLEX_MAJOR_VERSION 2 +#define YY_FLEX_MINOR_VERSION 6 +#define YY_FLEX_SUBMINOR_VERSION 4 +#if YY_FLEX_SUBMINOR_VERSION > 0 +#define FLEX_BETA +#endif + +#ifdef yy_create_buffer +#define fts0b_create_buffer_ALREADY_DEFINED +#else +#define yy_create_buffer fts0b_create_buffer +#endif + +#ifdef yy_delete_buffer +#define fts0b_delete_buffer_ALREADY_DEFINED +#else +#define yy_delete_buffer fts0b_delete_buffer +#endif + +#ifdef yy_scan_buffer +#define fts0b_scan_buffer_ALREADY_DEFINED +#else +#define yy_scan_buffer fts0b_scan_buffer +#endif + +#ifdef yy_scan_string +#define fts0b_scan_string_ALREADY_DEFINED +#else +#define yy_scan_string fts0b_scan_string +#endif + +#ifdef yy_scan_bytes +#define fts0b_scan_bytes_ALREADY_DEFINED +#else +#define yy_scan_bytes fts0b_scan_bytes +#endif + +#ifdef yy_init_buffer +#define fts0b_init_buffer_ALREADY_DEFINED +#else +#define yy_init_buffer fts0b_init_buffer +#endif + +#ifdef yy_flush_buffer +#define fts0b_flush_buffer_ALREADY_DEFINED +#else +#define yy_flush_buffer fts0b_flush_buffer +#endif + +#ifdef yy_load_buffer_state +#define fts0b_load_buffer_state_ALREADY_DEFINED +#else +#define yy_load_buffer_state fts0b_load_buffer_state +#endif + +#ifdef yy_switch_to_buffer +#define fts0b_switch_to_buffer_ALREADY_DEFINED +#else +#define yy_switch_to_buffer fts0b_switch_to_buffer +#endif + +#ifdef yypush_buffer_state +#define fts0bpush_buffer_state_ALREADY_DEFINED +#else +#define yypush_buffer_state fts0bpush_buffer_state +#endif + +#ifdef yypop_buffer_state +#define fts0bpop_buffer_state_ALREADY_DEFINED +#else +#define yypop_buffer_state fts0bpop_buffer_state +#endif + +#ifdef yyensure_buffer_stack +#define fts0bensure_buffer_stack_ALREADY_DEFINED +#else +#define yyensure_buffer_stack fts0bensure_buffer_stack +#endif + +#ifdef yylex +#define fts0blex_ALREADY_DEFINED +#else +#define yylex fts0blex +#endif + +#ifdef yyrestart +#define fts0brestart_ALREADY_DEFINED +#else +#define yyrestart fts0brestart +#endif + +#ifdef yylex_init +#define fts0blex_init_ALREADY_DEFINED +#else +#define yylex_init fts0blex_init +#endif + +#ifdef yylex_init_extra +#define fts0blex_init_extra_ALREADY_DEFINED +#else +#define yylex_init_extra fts0blex_init_extra +#endif + +#ifdef yylex_destroy +#define fts0blex_destroy_ALREADY_DEFINED +#else +#define yylex_destroy fts0blex_destroy +#endif + +#ifdef yyget_debug +#define fts0bget_debug_ALREADY_DEFINED +#else +#define yyget_debug fts0bget_debug +#endif + +#ifdef yyset_debug +#define fts0bset_debug_ALREADY_DEFINED +#else +#define yyset_debug fts0bset_debug +#endif + +#ifdef yyget_extra +#define fts0bget_extra_ALREADY_DEFINED +#else +#define yyget_extra fts0bget_extra +#endif + +#ifdef yyset_extra +#define fts0bset_extra_ALREADY_DEFINED +#else +#define yyset_extra fts0bset_extra +#endif + +#ifdef yyget_in +#define fts0bget_in_ALREADY_DEFINED +#else +#define yyget_in fts0bget_in +#endif + +#ifdef yyset_in +#define fts0bset_in_ALREADY_DEFINED +#else +#define yyset_in fts0bset_in +#endif + +#ifdef yyget_out +#define fts0bget_out_ALREADY_DEFINED +#else +#define yyget_out fts0bget_out +#endif + +#ifdef yyset_out +#define fts0bset_out_ALREADY_DEFINED +#else +#define yyset_out fts0bset_out +#endif + +#ifdef yyget_leng 
+#define fts0bget_leng_ALREADY_DEFINED +#else +#define yyget_leng fts0bget_leng +#endif + +#ifdef yyget_text +#define fts0bget_text_ALREADY_DEFINED +#else +#define yyget_text fts0bget_text +#endif + +#ifdef yyget_lineno +#define fts0bget_lineno_ALREADY_DEFINED +#else +#define yyget_lineno fts0bget_lineno +#endif + +#ifdef yyset_lineno +#define fts0bset_lineno_ALREADY_DEFINED +#else +#define yyset_lineno fts0bset_lineno +#endif + +#ifdef yyget_column +#define fts0bget_column_ALREADY_DEFINED +#else +#define yyget_column fts0bget_column +#endif + +#ifdef yyset_column +#define fts0bset_column_ALREADY_DEFINED +#else +#define yyset_column fts0bset_column +#endif + +#ifdef yywrap +#define fts0bwrap_ALREADY_DEFINED +#else +#define yywrap fts0bwrap +#endif + +#ifdef yyalloc +#define fts0balloc_ALREADY_DEFINED +#else +#define yyalloc fts0balloc +#endif + +#ifdef yyrealloc +#define fts0brealloc_ALREADY_DEFINED +#else +#define yyrealloc fts0brealloc +#endif + +#ifdef yyfree +#define fts0bfree_ALREADY_DEFINED +#else +#define yyfree fts0bfree +#endif + +/* First, we deal with platform-specific or compiler-specific issues. */ + +/* begin standard C headers. */ +#include +#include +#include +#include + +/* end standard C headers. */ + +/* flex integer type definitions */ + +#ifndef FLEXINT_H +#define FLEXINT_H + +/* C99 systems have . Non-C99 systems may or may not. */ + +#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + +/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h, + * if you want the limit (max/min) macros for int types. + */ +#ifndef __STDC_LIMIT_MACROS +#define __STDC_LIMIT_MACROS 1 +#endif + +#include +typedef int8_t flex_int8_t; +typedef uint8_t flex_uint8_t; +typedef int16_t flex_int16_t; +typedef uint16_t flex_uint16_t; +typedef int32_t flex_int32_t; +typedef uint32_t flex_uint32_t; +#else +typedef signed char flex_int8_t; +typedef short int flex_int16_t; +typedef int flex_int32_t; +typedef unsigned char flex_uint8_t; +typedef unsigned short int flex_uint16_t; +typedef unsigned int flex_uint32_t; + +/* Limits of integral types. */ +#ifndef INT8_MIN +#define INT8_MIN (-128) +#endif +#ifndef INT16_MIN +#define INT16_MIN (-32767-1) +#endif +#ifndef INT32_MIN +#define INT32_MIN (-2147483647-1) +#endif +#ifndef INT8_MAX +#define INT8_MAX (127) +#endif +#ifndef INT16_MAX +#define INT16_MAX (32767) +#endif +#ifndef INT32_MAX +#define INT32_MAX (2147483647) +#endif +#ifndef UINT8_MAX +#define UINT8_MAX (255U) +#endif +#ifndef UINT16_MAX +#define UINT16_MAX (65535U) +#endif +#ifndef UINT32_MAX +#define UINT32_MAX (4294967295U) +#endif + +#ifndef SIZE_MAX +#define SIZE_MAX (~(size_t)0) +#endif + +#endif /* ! C99 */ + +#endif /* ! FLEXINT_H */ + +/* begin standard C++ headers. */ + +/* TODO: this is always defined, so inline it */ +#define yyconst const + +#if defined(__GNUC__) && __GNUC__ >= 3 +#define yynoreturn __attribute__((__noreturn__)) +#else +#define yynoreturn +#endif + +/* Returned upon end-of-file. */ +#define YY_NULL 0 + +/* Promotes a possibly negative, possibly signed char to an + * integer in range [0..255] for use as an array index. + */ +#define YY_SC_TO_UI(c) ((YY_CHAR) (c)) + +/* An opaque pointer. */ +#ifndef YY_TYPEDEF_YY_SCANNER_T +#define YY_TYPEDEF_YY_SCANNER_T +typedef void* yyscan_t; +#endif + +/* For convenience, these vars (plus the bison vars far below) + are macros in the reentrant scanner. 
*/ +#define yyin yyg->yyin_r +#define yyout yyg->yyout_r +#define yyextra yyg->yyextra_r +#define yyleng yyg->yyleng_r +#define yytext yyg->yytext_r +#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno) +#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column) +#define yy_flex_debug yyg->yy_flex_debug_r + +/* Enter a start condition. This macro really ought to take a parameter, + * but we do it the disgusting crufty way forced on us by the ()-less + * definition of BEGIN. + */ +#define BEGIN yyg->yy_start = 1 + 2 * +/* Translate the current start state into a value that can be later handed + * to BEGIN to return to the state. The YYSTATE alias is for lex + * compatibility. + */ +#define YY_START ((yyg->yy_start - 1) / 2) +#define YYSTATE YY_START +/* Action number for EOF rule of a given start state. */ +#define YY_STATE_EOF(state) (YY_END_OF_BUFFER + state + 1) +/* Special action meaning "start processing a new file". */ +#define YY_NEW_FILE yyrestart( yyin , yyscanner ) +#define YY_END_OF_BUFFER_CHAR 0 + +/* Size of default input buffer. */ +#ifndef YY_BUF_SIZE +#ifdef __ia64__ +/* On IA-64, the buffer size is 16k, not 8k. + * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case. + * Ditto for the __ia64__ case accordingly. + */ +#define YY_BUF_SIZE 32768 +#else +#define YY_BUF_SIZE 16384 +#endif /* __ia64__ */ +#endif + +/* The state buf must be large enough to hold one state per character in the main buffer. + */ +#define YY_STATE_BUF_SIZE ((YY_BUF_SIZE + 2) * sizeof(yy_state_type)) + +#ifndef YY_TYPEDEF_YY_BUFFER_STATE +#define YY_TYPEDEF_YY_BUFFER_STATE +typedef struct yy_buffer_state *YY_BUFFER_STATE; +#endif + +#ifndef YY_TYPEDEF_YY_SIZE_T +#define YY_TYPEDEF_YY_SIZE_T +typedef size_t yy_size_t; +#endif + +#define EOB_ACT_CONTINUE_SCAN 0 +#define EOB_ACT_END_OF_FILE 1 +#define EOB_ACT_LAST_MATCH 2 + + #define YY_LESS_LINENO(n) + #define YY_LINENO_REWIND_TO(ptr) + +/* Return all but the first "n" matched characters back to the input stream. */ +#define yyless(n) \ + do \ + { \ + /* Undo effects of setting up yytext. */ \ + int yyless_macro_arg = (n); \ + YY_LESS_LINENO(yyless_macro_arg);\ + *yy_cp = yyg->yy_hold_char; \ + YY_RESTORE_YY_MORE_OFFSET \ + yyg->yy_c_buf_p = yy_cp = yy_bp + yyless_macro_arg - YY_MORE_ADJ; \ + YY_DO_BEFORE_ACTION; /* set up yytext again */ \ + } \ + while ( 0 ) +#define unput(c) yyunput( c, yyg->yytext_ptr , yyscanner ) + +#ifndef YY_STRUCT_YY_BUFFER_STATE +#define YY_STRUCT_YY_BUFFER_STATE +struct yy_buffer_state + { + FILE *yy_input_file; + + char *yy_ch_buf; /* input buffer */ + char *yy_buf_pos; /* current position in input buffer */ + + /* Size of input buffer in bytes, not including room for EOB + * characters. + */ + int yy_buf_size; + + /* Number of characters read into yy_ch_buf, not including EOB + * characters. + */ + int yy_n_chars; + + /* Whether we "own" the buffer - i.e., we know we created it, + * and can realloc() it to grow it, and should free() it to + * delete it. + */ + int yy_is_our_buffer; + + /* Whether this is an "interactive" input source; if so, and + * if we're using stdio for input, then we want to use getc() + * instead of fread(), to make sure we stop fetching input after + * each newline. + */ + int yy_is_interactive; + + /* Whether we're considered to be at the beginning of a line. + * If so, '^' rules will be active on the next match, otherwise + * not. + */ + int yy_at_bol; + + int yy_bs_lineno; /**< The line count. */ + int yy_bs_column; /**< The column count. 
*/ + + /* Whether to try to fill the input buffer when we reach the + * end of it. + */ + int yy_fill_buffer; + + int yy_buffer_status; + +#define YY_BUFFER_NEW 0 +#define YY_BUFFER_NORMAL 1 + /* When an EOF's been seen but there's still some text to process + * then we mark the buffer as YY_EOF_PENDING, to indicate that we + * shouldn't try reading from the input source any more. We might + * still have a bunch of tokens to match, though, because of + * possible backing-up. + * + * When we actually see the EOF, we change the status to "new" + * (via yyrestart()), so that the user can continue scanning by + * just pointing yyin at a new input file. + */ +#define YY_BUFFER_EOF_PENDING 2 + + }; +#endif /* !YY_STRUCT_YY_BUFFER_STATE */ + +/* We provide macros for accessing buffer states in case in the + * future we want to put the buffer states in a more general + * "scanner state". + * + * Returns the top of the stack, or NULL. + */ +#define YY_CURRENT_BUFFER ( yyg->yy_buffer_stack \ + ? yyg->yy_buffer_stack[yyg->yy_buffer_stack_top] \ + : 0) +/* Same as previous macro, but useful when we know that the buffer stack is not + * NULL or when we need an lvalue. For internal use only. + */ +#define YY_CURRENT_BUFFER_LVALUE yyg->yy_buffer_stack[yyg->yy_buffer_stack_top] + +void yyrestart ( FILE *input_file , yyscan_t yyscanner ); +void yy_switch_to_buffer ( YY_BUFFER_STATE new_buffer , yyscan_t yyscanner ); +YY_BUFFER_STATE yy_create_buffer ( FILE *file, int size , yyscan_t yyscanner ); +void yy_delete_buffer ( YY_BUFFER_STATE b , yyscan_t yyscanner ); +void yy_flush_buffer ( YY_BUFFER_STATE b , yyscan_t yyscanner ); +void yypush_buffer_state ( YY_BUFFER_STATE new_buffer , yyscan_t yyscanner ); +void yypop_buffer_state ( yyscan_t yyscanner ); + +static void yyensure_buffer_stack ( yyscan_t yyscanner ); +static void yy_load_buffer_state ( yyscan_t yyscanner ); +static void yy_init_buffer ( YY_BUFFER_STATE b, FILE *file , yyscan_t yyscanner ); +#define YY_FLUSH_BUFFER yy_flush_buffer( YY_CURRENT_BUFFER , yyscanner) + +YY_BUFFER_STATE yy_scan_buffer ( char *base, yy_size_t size , yyscan_t yyscanner ); +YY_BUFFER_STATE yy_scan_string ( const char *yy_str , yyscan_t yyscanner ); +YY_BUFFER_STATE yy_scan_bytes ( const char *bytes, int len , yyscan_t yyscanner ); + +void *yyalloc ( yy_size_t , yyscan_t yyscanner ); +void *yyrealloc ( void *, yy_size_t , yyscan_t yyscanner ); +void yyfree ( void * , yyscan_t yyscanner ); + +#define yy_new_buffer yy_create_buffer +#define yy_set_interactive(is_interactive) \ + { \ + if ( ! YY_CURRENT_BUFFER ){ \ + yyensure_buffer_stack (yyscanner); \ + YY_CURRENT_BUFFER_LVALUE = \ + yy_create_buffer( yyin, YY_BUF_SIZE , yyscanner); \ + } \ + YY_CURRENT_BUFFER_LVALUE->yy_is_interactive = is_interactive; \ + } +#define yy_set_bol(at_bol) \ + { \ + if ( ! 
YY_CURRENT_BUFFER ){\ + yyensure_buffer_stack (yyscanner); \ + YY_CURRENT_BUFFER_LVALUE = \ + yy_create_buffer( yyin, YY_BUF_SIZE , yyscanner); \ + } \ + YY_CURRENT_BUFFER_LVALUE->yy_at_bol = at_bol; \ + } +#define YY_AT_BOL() (YY_CURRENT_BUFFER_LVALUE->yy_at_bol) + +/* Begin user sect3 */ + +#define fts0bwrap(yyscanner) (/*CONSTCOND*/1) +#define YY_SKIP_YYWRAP +typedef flex_uint8_t YY_CHAR; + +typedef int yy_state_type; + +#define yytext_ptr yytext_r + +static yy_state_type yy_get_previous_state ( yyscan_t yyscanner ); +static yy_state_type yy_try_NUL_trans ( yy_state_type current_state , yyscan_t yyscanner); +static int yy_get_next_buffer ( yyscan_t yyscanner ); +static void yynoreturn yy_fatal_error ( const char* msg , yyscan_t yyscanner ); + +/* Done after the current pattern has been matched and before the + * corresponding action - sets up yytext. + */ +#define YY_DO_BEFORE_ACTION \ + yyg->yytext_ptr = yy_bp; \ + yyleng = (int) (yy_cp - yy_bp); \ + yyg->yy_hold_char = *yy_cp; \ + *yy_cp = '\0'; \ + yyg->yy_c_buf_p = yy_cp; +#define YY_NUM_RULES 7 +#define YY_END_OF_BUFFER 8 +/* This struct is not used in this scanner, + but its presence is necessary. */ +struct yy_trans_info + { + flex_int32_t yy_verify; + flex_int32_t yy_nxt; + }; +static const flex_int16_t yy_accept[19] = + { 0, + 4, 4, 8, 4, 1, 6, 1, 7, 7, 2, + 3, 4, 1, 1, 0, 5, 3, 0 + } ; + +static const YY_CHAR yy_ec[256] = + { 0, + 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 4, 1, 5, 1, 1, 6, 1, 1, 7, + 7, 7, 7, 1, 7, 1, 1, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 1, 1, 7, + 1, 7, 1, 7, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 7, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1 + } ; + +static const YY_CHAR yy_meta[9] = + { 0, + 1, 2, 3, 4, 5, 5, 5, 1 + } ; + +static const flex_int16_t yy_base[22] = + { 0, + 0, 0, 22, 0, 7, 23, 0, 14, 23, 23, + 7, 0, 0, 0, 5, 23, 0, 23, 11, 12, + 16 + } ; + +static const flex_int16_t yy_def[22] = + { 0, + 18, 1, 18, 19, 19, 18, 20, 21, 18, 18, + 19, 19, 5, 20, 21, 18, 11, 0, 18, 18, + 18 + } ; + +static const flex_int16_t yy_nxt[32] = + { 0, + 4, 5, 6, 7, 8, 9, 10, 11, 13, 16, + 14, 12, 12, 14, 17, 14, 15, 15, 16, 15, + 15, 18, 3, 18, 18, 18, 18, 18, 18, 18, + 18 + } ; + +static const flex_int16_t yy_chk[32] = + { 0, + 1, 1, 1, 1, 1, 1, 1, 1, 5, 15, + 5, 19, 19, 20, 11, 20, 21, 21, 8, 21, + 21, 3, 18, 18, 18, 18, 18, 18, 18, 18, + 18 + } ; + +/* The intent behind this definition is that it'll catch + * any uses of REJECT which flex missed. + */ +#define REJECT reject_used_but_not_detected +#define yymore() yymore_used_but_not_detected +#define YY_MORE_ADJ 0 +#define YY_RESTORE_YY_MORE_OFFSET +#line 1 "fts0blex.l" +/***************************************************************************** + +Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ +/** + * @file fts/fts0blex.l + * FTS parser lexical analyzer + * + * Created 2007/5/9 Sunny Bains + */ +#line 27 "fts0blex.l" + +#include "fts0ast.h" +#include "fts0pars.h" + +/* Required for reentrant parser */ +#define YY_DECL int fts_blexer(YYSTYPE* val, yyscan_t yyscanner) +#define exit(A) ut_error + +#line 675 "fts0blex.cc" +#define YY_NO_INPUT 1 +#line 677 "fts0blex.cc" + +#define INITIAL 0 + +#ifndef YY_NO_UNISTD_H +/* Special case for "unistd.h", since it is non-ANSI. We include it way + * down here because we want the user's section 1 to have been scanned first. + * The user has a chance to override it with an option. + */ +#include <unistd.h> +#endif + +#ifndef YY_EXTRA_TYPE +#define YY_EXTRA_TYPE void * +#endif + +/* Holds the entire state of the reentrant scanner. */ +struct yyguts_t + { + + /* User-defined. Not touched by flex. */ + YY_EXTRA_TYPE yyextra_r; + + /* The rest are the same as the globals declared in the non-reentrant scanner. */ + FILE *yyin_r, *yyout_r; + size_t yy_buffer_stack_top; /**< index of top of stack. */ + size_t yy_buffer_stack_max; /**< capacity of stack. */ + YY_BUFFER_STATE * yy_buffer_stack; /**< Stack as an array. */ + char yy_hold_char; + int yy_n_chars; + int yyleng_r; + char *yy_c_buf_p; + int yy_init; + int yy_start; + int yy_did_buffer_switch_on_eof; + int yy_start_stack_ptr; + int yy_start_stack_depth; + int *yy_start_stack; + yy_state_type yy_last_accepting_state; + char* yy_last_accepting_cpos; + + int yylineno_r; + int yy_flex_debug_r; + + char *yytext_r; + int yy_more_flag; + int yy_more_len; + + }; /* end struct yyguts_t */ + +static int yy_init_globals ( yyscan_t yyscanner ); + +int yylex_init (yyscan_t* scanner); + +int yylex_init_extra ( YY_EXTRA_TYPE user_defined, yyscan_t* scanner); + +/* Accessor methods to globals. + These are made visible to non-reentrant scanners for convenience. */ + +int yylex_destroy ( yyscan_t yyscanner ); + +int yyget_debug ( yyscan_t yyscanner ); + +void yyset_debug ( int debug_flag , yyscan_t yyscanner ); + +YY_EXTRA_TYPE yyget_extra ( yyscan_t yyscanner ); + +void yyset_extra ( YY_EXTRA_TYPE user_defined , yyscan_t yyscanner ); + +FILE *yyget_in ( yyscan_t yyscanner ); + +void yyset_in ( FILE * _in_str , yyscan_t yyscanner ); + +FILE *yyget_out ( yyscan_t yyscanner ); + +void yyset_out ( FILE * _out_str , yyscan_t yyscanner ); + + int yyget_leng ( yyscan_t yyscanner ); + +char *yyget_text ( yyscan_t yyscanner ); + +int yyget_lineno ( yyscan_t yyscanner ); + +void yyset_lineno ( int _line_number , yyscan_t yyscanner ); + +int yyget_column ( yyscan_t yyscanner ); + +void yyset_column ( int _column_no , yyscan_t yyscanner ); + +/* Macros after this point can all be overridden by user definitions in + * section 1.
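+ * + * (Editorial aside, not part of the generated file: because YY_DECL is overridden in the user section above, the entry point compiled here is fts_blexer(), not fts0blex(). A minimal, hypothetical driver for the reentrant API declared in this file would look roughly like this -- the real caller is wired up through the FTS query grammar in fts0pars.y: + * + * yyscan_t scanner; + * YYSTYPE val; + * int token; + * + * fts0blex_init(&scanner); + * fts0b_scan_string("apple +banana", scanner); + * while ((token = fts_blexer(&val, scanner)) != 0) { + * // operators arrive as their own character code in val.oper; + * // FTS_TERM/FTS_NUMB/FTS_TEXT carry an fts_ast_string_t in + * // val.token -- see the rule actions further down. + * } + * fts0blex_destroy(scanner); + * + * The fts0b-prefixed names follow the #define mapping at the top of this file.)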
+ */ + +#ifndef YY_SKIP_YYWRAP +#ifdef __cplusplus +extern "C" int yywrap ( yyscan_t yyscanner ); +#else +extern int yywrap ( yyscan_t yyscanner ); +#endif +#endif + +#ifndef YY_NO_UNPUT + +#endif + +#ifndef yytext_ptr +static void yy_flex_strncpy ( char *, const char *, int , yyscan_t yyscanner); +#endif + +#ifdef YY_NEED_STRLEN +static int yy_flex_strlen ( const char * , yyscan_t yyscanner); +#endif + +#ifndef YY_NO_INPUT +#ifdef __cplusplus +static int yyinput ( yyscan_t yyscanner ); +#else +static int input ( yyscan_t yyscanner ); +#endif + +#endif + +/* Amount of stuff to slurp up with each read. */ +#ifndef YY_READ_BUF_SIZE +#ifdef __ia64__ +/* On IA-64, the buffer size is 16k, not 8k */ +#define YY_READ_BUF_SIZE 16384 +#else +#define YY_READ_BUF_SIZE 8192 +#endif /* __ia64__ */ +#endif + +/* Copy whatever the last rule matched to the standard output. */ +#ifndef ECHO +/* This used to be an fputs(), but since the string might contain NUL's, + * we now use fwrite(). + */ +#define ECHO do { if (fwrite( yytext, (size_t) yyleng, 1, yyout )) {} } while (0) +#endif + +/* Gets input and stuffs it into "buf". number of characters read, or YY_NULL, + * is returned in "result". + */ +#ifndef YY_INPUT +#define YY_INPUT(buf,result,max_size) \ + if ( YY_CURRENT_BUFFER_LVALUE->yy_is_interactive ) \ + { \ + int c = '*'; \ + int n; \ + for ( n = 0; n < max_size && \ + (c = getc( yyin )) != EOF && c != '\n'; ++n ) \ + buf[n] = (char) c; \ + if ( c == '\n' ) \ + buf[n++] = (char) c; \ + if ( c == EOF && ferror( yyin ) ) \ + YY_FATAL_ERROR( "input in flex scanner failed" ); \ + result = n; \ + } \ + else \ + { \ + errno=0; \ + while ( (result = (int) fread(buf, 1, (yy_size_t) max_size, yyin)) == 0 && ferror(yyin)) \ + { \ + if( errno != EINTR) \ + { \ + YY_FATAL_ERROR( "input in flex scanner failed" ); \ + break; \ + } \ + errno=0; \ + clearerr(yyin); \ + } \ + }\ +\ + +#endif + +/* No semi-colon after return; correct usage is to write "yyterminate();" - + * we don't want an extra ';' after the "return" because that will cause + * some compilers to complain about unreachable statements. + */ +#ifndef yyterminate +#define yyterminate() return YY_NULL +#endif + +/* Number of entries by which start-condition stack grows. */ +#ifndef YY_START_STACK_INCR +#define YY_START_STACK_INCR 25 +#endif + +/* Report a fatal error. */ +#ifndef YY_FATAL_ERROR +#define YY_FATAL_ERROR(msg) yy_fatal_error( msg , yyscanner) +#endif + +/* end tables serialization structures and prototypes */ + +/* Default declaration of generated scanner - a define so the user can + * easily add parameters. + */ +#ifndef YY_DECL +#define YY_DECL_IS_OURS 1 + +extern int yylex (yyscan_t yyscanner); + +#define YY_DECL int yylex (yyscan_t yyscanner) +#endif /* !YY_DECL */ + +/* Code executed at the beginning of each rule, after yytext and yyleng + * have been set up. + */ +#ifndef YY_USER_ACTION +#define YY_USER_ACTION +#endif + +/* Code executed at the end of each rule. */ +#ifndef YY_BREAK +#define YY_BREAK /*LINTED*/break; +#endif + +#define YY_RULE_SETUP \ + YY_USER_ACTION + +/** The main scanner function which does all the work. + */ +YY_DECL +{ + yy_state_type yy_current_state; + char *yy_cp, *yy_bp; + int yy_act; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + if ( !yyg->yy_init ) + { + yyg->yy_init = 1; + +#ifdef YY_USER_INIT + YY_USER_INIT; +#endif + + if ( ! yyg->yy_start ) + yyg->yy_start = 1; /* first start state */ + + if ( ! yyin ) + yyin = stdin; + + if ( ! yyout ) + yyout = stdout; + + if ( ! 
YY_CURRENT_BUFFER ) { + yyensure_buffer_stack (yyscanner); + YY_CURRENT_BUFFER_LVALUE = + yy_create_buffer( yyin, YY_BUF_SIZE , yyscanner); + } + + yy_load_buffer_state( yyscanner ); + } + + { +#line 44 "fts0blex.l" + + +#line 938 "fts0blex.cc" + + while ( /*CONSTCOND*/1 ) /* loops until end-of-file is reached */ + { + yy_cp = yyg->yy_c_buf_p; + + /* Support of yytext. */ + *yy_cp = yyg->yy_hold_char; + + /* yy_bp points to the position in yy_ch_buf of the start of + * the current run. + */ + yy_bp = yy_cp; + + yy_current_state = yyg->yy_start; +yy_match: + do + { + YY_CHAR yy_c = yy_ec[YY_SC_TO_UI(*yy_cp)] ; + if ( yy_accept[yy_current_state] ) + { + yyg->yy_last_accepting_state = yy_current_state; + yyg->yy_last_accepting_cpos = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 19 ) + yy_c = yy_meta[yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + yy_c]; + ++yy_cp; + } + while ( yy_current_state != 18 ); + yy_cp = yyg->yy_last_accepting_cpos; + yy_current_state = yyg->yy_last_accepting_state; + +yy_find_action: + yy_act = yy_accept[yy_current_state]; + + YY_DO_BEFORE_ACTION; + +do_action: /* This label is used only to access EOF actions. */ + + switch ( yy_act ) + { /* beginning of action switch */ + case 0: /* must back up */ + /* undo the effects of YY_DO_BEFORE_ACTION */ + *yy_cp = yyg->yy_hold_char; + yy_cp = yyg->yy_last_accepting_cpos; + yy_current_state = yyg->yy_last_accepting_state; + goto yy_find_action; + +case 1: +YY_RULE_SETUP +#line 46 "fts0blex.l" +/* Ignore whitespace */ ; + YY_BREAK +case 2: +YY_RULE_SETUP +#line 48 "fts0blex.l" +{ + val->oper = fts0bget_text(yyscanner)[0]; + + return(val->oper); +} + YY_BREAK +case 3: +YY_RULE_SETUP +#line 54 "fts0blex.l" +{ + val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0bget_text(yyscanner)), fts0bget_leng(yyscanner)); + + return(FTS_NUMB); +} + YY_BREAK +case 4: +YY_RULE_SETUP +#line 60 "fts0blex.l" +{ + val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0bget_text(yyscanner)), fts0bget_leng(yyscanner)); + + return(FTS_TERM); +} + YY_BREAK +case 5: +YY_RULE_SETUP +#line 66 "fts0blex.l" +{ + val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0bget_text(yyscanner)), fts0bget_leng(yyscanner)); + + return(FTS_TEXT); +} + YY_BREAK +case 6: +/* rule 6 can match eol */ +YY_RULE_SETUP +#line 72 "fts0blex.l" + + YY_BREAK +case 7: +YY_RULE_SETUP +#line 74 "fts0blex.l" +ECHO; + YY_BREAK +#line 1043 "fts0blex.cc" +case YY_STATE_EOF(INITIAL): + yyterminate(); + + case YY_END_OF_BUFFER: + { + /* Amount of text matched not including the EOB char. */ + int yy_amount_of_matched_text = (int) (yy_cp - yyg->yytext_ptr) - 1; + + /* Undo the effects of YY_DO_BEFORE_ACTION. */ + *yy_cp = yyg->yy_hold_char; + YY_RESTORE_YY_MORE_OFFSET + + if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_NEW ) + { + /* We're scanning a new file or input source. It's + * possible that this happened because the user + * just pointed yyin at a new source and called + * yylex(). If so, then we have to assure + * consistency between YY_CURRENT_BUFFER and our + * globals. Here is the right place to do so, because + * this is the first action (other than possibly a + * back-up) that will match for the new input source.
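+ * + * (Editorial aside, illustration only: tracing the rule actions above, a query string such as + * + * apple +banana -"some phrase" + * + * comes back from fts_blexer() as FTS_TERM ("apple"), '+' via case 2 in val->oper, FTS_TERM ("banana"), '-', and FTS_TEXT for the quoted phrase (quotes included) via case 5, with the intervening whitespace consumed by case 1.)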
+ */ + yyg->yy_n_chars = YY_CURRENT_BUFFER_LVALUE->yy_n_chars; + YY_CURRENT_BUFFER_LVALUE->yy_input_file = yyin; + YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = YY_BUFFER_NORMAL; + } + + /* Note that here we test for yy_c_buf_p "<=" to the position + * of the first EOB in the buffer, since yy_c_buf_p will + * already have been incremented past the NUL character + * (since all states make transitions on EOB to the + * end-of-buffer state). Contrast this with the test + * in input(). + */ + if ( yyg->yy_c_buf_p <= &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] ) + { /* This was really a NUL. */ + yy_state_type yy_next_state; + + yyg->yy_c_buf_p = yyg->yytext_ptr + yy_amount_of_matched_text; + + yy_current_state = yy_get_previous_state( yyscanner ); + + /* Okay, we're now positioned to make the NUL + * transition. We couldn't have + * yy_get_previous_state() go ahead and do it + * for us because it doesn't know how to deal + * with the possibility of jamming (and we don't + * want to build jamming into it because then it + * will run more slowly). + */ + + yy_next_state = yy_try_NUL_trans( yy_current_state , yyscanner); + + yy_bp = yyg->yytext_ptr + YY_MORE_ADJ; + + if ( yy_next_state ) + { + /* Consume the NUL. */ + yy_cp = ++yyg->yy_c_buf_p; + yy_current_state = yy_next_state; + goto yy_match; + } + + else + { + yy_cp = yyg->yy_last_accepting_cpos; + yy_current_state = yyg->yy_last_accepting_state; + goto yy_find_action; + } + } + + else switch ( yy_get_next_buffer( yyscanner ) ) + { + case EOB_ACT_END_OF_FILE: + { + yyg->yy_did_buffer_switch_on_eof = 0; + + if ( yywrap( yyscanner ) ) + { + /* Note: because we've taken care in + * yy_get_next_buffer() to have set up + * yytext, we can now set up + * yy_c_buf_p so that if some total + * hoser (like flex itself) wants to + * call the scanner after we return the + * YY_NULL, it'll still work - another + * YY_NULL will get returned. + */ + yyg->yy_c_buf_p = yyg->yytext_ptr + YY_MORE_ADJ; + + yy_act = YY_STATE_EOF(YY_START); + goto do_action; + } + + else + { + if ( ! 
yyg->yy_did_buffer_switch_on_eof ) + YY_NEW_FILE; + } + break; + } + + case EOB_ACT_CONTINUE_SCAN: + yyg->yy_c_buf_p = + yyg->yytext_ptr + yy_amount_of_matched_text; + + yy_current_state = yy_get_previous_state( yyscanner ); + + yy_cp = yyg->yy_c_buf_p; + yy_bp = yyg->yytext_ptr + YY_MORE_ADJ; + goto yy_match; + + case EOB_ACT_LAST_MATCH: + yyg->yy_c_buf_p = + &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars]; + + yy_current_state = yy_get_previous_state( yyscanner ); + + yy_cp = yyg->yy_c_buf_p; + yy_bp = yyg->yytext_ptr + YY_MORE_ADJ; + goto yy_find_action; + } + break; + } + + default: + YY_FATAL_ERROR( + "fatal flex scanner internal error--no action found" ); + } /* end of action switch */ + } /* end of scanning one token */ + } /* end of user's declarations */ +} /* end of yylex */ + +/* yy_get_next_buffer - try to read in a new buffer + * + * Returns a code representing an action: + * EOB_ACT_LAST_MATCH - + * EOB_ACT_CONTINUE_SCAN - continue scanning from current position + * EOB_ACT_END_OF_FILE - end of file + */ +static int yy_get_next_buffer (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + char *dest = YY_CURRENT_BUFFER_LVALUE->yy_ch_buf; + char *source = yyg->yytext_ptr; + int number_to_move, i; + int ret_val; + + if ( yyg->yy_c_buf_p > &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars + 1] ) + YY_FATAL_ERROR( + "fatal flex scanner internal error--end of buffer missed" ); + + if ( YY_CURRENT_BUFFER_LVALUE->yy_fill_buffer == 0 ) + { /* Don't try to fill the buffer, so this is an EOF. */ + if ( yyg->yy_c_buf_p - yyg->yytext_ptr - YY_MORE_ADJ == 1 ) + { + /* We matched a single character, the EOB, so + * treat this as a final EOF. + */ + return EOB_ACT_END_OF_FILE; + } + + else + { + /* We matched some text prior to the EOB, first + * process it. + */ + return EOB_ACT_LAST_MATCH; + } + } + + /* Try to read more data. */ + + /* First move last chars to start of buffer. */ + number_to_move = (int) (yyg->yy_c_buf_p - yyg->yytext_ptr - 1); + + for ( i = 0; i < number_to_move; ++i ) + *(dest++) = *(source++); + + if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_EOF_PENDING ) + /* don't do the read, it's not guaranteed to return an EOF, + * just force an EOF + */ + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars = 0; + + else + { + int num_to_read = + YY_CURRENT_BUFFER_LVALUE->yy_buf_size - number_to_move - 1; + + while ( num_to_read <= 0 ) + { /* Not enough room in the buffer - grow it. */ + + /* just a shorter name for the current buffer */ + YY_BUFFER_STATE b = YY_CURRENT_BUFFER_LVALUE; + + int yy_c_buf_p_offset = + (int) (yyg->yy_c_buf_p - b->yy_ch_buf); + + if ( b->yy_is_our_buffer ) + { + int new_size = b->yy_buf_size * 2; + + if ( new_size <= 0 ) + b->yy_buf_size += b->yy_buf_size / 8; + else + b->yy_buf_size *= 2; + + b->yy_ch_buf = (char *) + /* Include room in for 2 EOB chars. */ + yyrealloc( (void *) b->yy_ch_buf, + (yy_size_t) (b->yy_buf_size + 2) , yyscanner ); + } + else + /* Can't grow it, we don't own it. */ + b->yy_ch_buf = NULL; + + if ( ! b->yy_ch_buf ) + YY_FATAL_ERROR( + "fatal error - scanner input buffer overflow" ); + + yyg->yy_c_buf_p = &b->yy_ch_buf[yy_c_buf_p_offset]; + + num_to_read = YY_CURRENT_BUFFER_LVALUE->yy_buf_size - + number_to_move - 1; + + } + + if ( num_to_read > YY_READ_BUF_SIZE ) + num_to_read = YY_READ_BUF_SIZE; + + /* Read in more data. 
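+ * (Editorial aside: YY_INPUT, defined near the top of this file, performs the actual read; in the non-interactive case it is essentially + * + * result = fread(buf, 1, max_size, yyin); + * + * retried on EINTR, so the call below appends up to num_to_read raw bytes to yy_ch_buf after the tail that was just moved down.)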
*/ + YY_INPUT( (&YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move]), + yyg->yy_n_chars, num_to_read ); + + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars; + } + + if ( yyg->yy_n_chars == 0 ) + { + if ( number_to_move == YY_MORE_ADJ ) + { + ret_val = EOB_ACT_END_OF_FILE; + yyrestart( yyin , yyscanner); + } + + else + { + ret_val = EOB_ACT_LAST_MATCH; + YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = + YY_BUFFER_EOF_PENDING; + } + } + + else + ret_val = EOB_ACT_CONTINUE_SCAN; + + if ((yyg->yy_n_chars + number_to_move) > YY_CURRENT_BUFFER_LVALUE->yy_buf_size) { + /* Extend the array by 50%, plus the number we really need. */ + int new_size = yyg->yy_n_chars + number_to_move + (yyg->yy_n_chars >> 1); + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf = (char *) yyrealloc( + (void *) YY_CURRENT_BUFFER_LVALUE->yy_ch_buf, (yy_size_t) new_size , yyscanner ); + if ( ! YY_CURRENT_BUFFER_LVALUE->yy_ch_buf ) + YY_FATAL_ERROR( "out of dynamic memory in yy_get_next_buffer()" ); + /* "- 2" to take care of EOB's */ + YY_CURRENT_BUFFER_LVALUE->yy_buf_size = (int) (new_size - 2); + } + + yyg->yy_n_chars += number_to_move; + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] = YY_END_OF_BUFFER_CHAR; + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars + 1] = YY_END_OF_BUFFER_CHAR; + + yyg->yytext_ptr = &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[0]; + + return ret_val; +} + +/* yy_get_previous_state - get the state just before the EOB char was reached */ + + static yy_state_type yy_get_previous_state (yyscan_t yyscanner) +{ + yy_state_type yy_current_state; + char *yy_cp; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + yy_current_state = yyg->yy_start; + + for ( yy_cp = yyg->yytext_ptr + YY_MORE_ADJ; yy_cp < yyg->yy_c_buf_p; ++yy_cp ) + { + YY_CHAR yy_c = (*yy_cp ? yy_ec[YY_SC_TO_UI(*yy_cp)] : 1); + if ( yy_accept[yy_current_state] ) + { + yyg->yy_last_accepting_state = yy_current_state; + yyg->yy_last_accepting_cpos = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 19 ) + yy_c = yy_meta[yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + yy_c]; + } + + return yy_current_state; +} + +/* yy_try_NUL_trans - try to make a transition on the NUL character + * + * synopsis + * next_state = yy_try_NUL_trans( current_state ); + */ + static yy_state_type yy_try_NUL_trans (yy_state_type yy_current_state , yyscan_t yyscanner) +{ + int yy_is_jam; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; /* This var may be unused depending upon options. */ + char *yy_cp = yyg->yy_c_buf_p; + + YY_CHAR yy_c = 1; + if ( yy_accept[yy_current_state] ) + { + yyg->yy_last_accepting_state = yy_current_state; + yyg->yy_last_accepting_cpos = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 19 ) + yy_c = yy_meta[yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + yy_c]; + yy_is_jam = (yy_current_state == 18); + + (void)yyg; + return yy_is_jam ? 0 : yy_current_state; +} + +#ifndef YY_NO_UNPUT + +#endif + +#ifndef YY_NO_INPUT +#ifdef __cplusplus + static int yyinput (yyscan_t yyscanner) +#else + static int input (yyscan_t yyscanner) +#endif + +{ + int c; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + *yyg->yy_c_buf_p = yyg->yy_hold_char; + + if ( *yyg->yy_c_buf_p == YY_END_OF_BUFFER_CHAR ) + { + /* yy_c_buf_p now points to the character we want to return. 
+ * If this occurs *before* the EOB characters, then it's a + * valid NUL; if not, then we've hit the end of the buffer. + */ + if ( yyg->yy_c_buf_p < &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] ) + /* This was really a NUL. */ + *yyg->yy_c_buf_p = '\0'; + + else + { /* need more input */ + int offset = (int) (yyg->yy_c_buf_p - yyg->yytext_ptr); + ++yyg->yy_c_buf_p; + + switch ( yy_get_next_buffer( yyscanner ) ) + { + case EOB_ACT_LAST_MATCH: + /* This happens because yy_g_n_b() + * sees that we've accumulated a + * token and flags that we need to + * try matching the token before + * proceeding. But for input(), + * there's no matching to consider. + * So convert the EOB_ACT_LAST_MATCH + * to EOB_ACT_END_OF_FILE. + */ + + /* Reset buffer status. */ + yyrestart( yyin , yyscanner); + + /*FALLTHROUGH*/ + + case EOB_ACT_END_OF_FILE: + { + if ( yywrap( yyscanner ) ) + return 0; + + if ( ! yyg->yy_did_buffer_switch_on_eof ) + YY_NEW_FILE; +#ifdef __cplusplus + return yyinput(yyscanner); +#else + return input(yyscanner); +#endif + } + + case EOB_ACT_CONTINUE_SCAN: + yyg->yy_c_buf_p = yyg->yytext_ptr + offset; + break; + } + } + } + + c = *(unsigned char *) yyg->yy_c_buf_p; /* cast for 8-bit char's */ + *yyg->yy_c_buf_p = '\0'; /* preserve yytext */ + yyg->yy_hold_char = *++yyg->yy_c_buf_p; + + return c; +} +#endif /* ifndef YY_NO_INPUT */ + +/** Immediately switch to a different input stream. + * @param input_file A readable stream. + * @param yyscanner The scanner object. + * @note This function does not reset the start condition to @c INITIAL . + */ + void yyrestart (FILE * input_file , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + if ( ! YY_CURRENT_BUFFER ){ + yyensure_buffer_stack (yyscanner); + YY_CURRENT_BUFFER_LVALUE = + yy_create_buffer( yyin, YY_BUF_SIZE , yyscanner); + } + + yy_init_buffer( YY_CURRENT_BUFFER, input_file , yyscanner); + yy_load_buffer_state( yyscanner ); +} + +/** Switch to a different input buffer. + * @param new_buffer The new input buffer. + * @param yyscanner The scanner object. + */ + void yy_switch_to_buffer (YY_BUFFER_STATE new_buffer , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + /* TODO. We should be able to replace this entire function body + * with + * yypop_buffer_state(); + * yypush_buffer_state(new_buffer); + */ + yyensure_buffer_stack (yyscanner); + if ( YY_CURRENT_BUFFER == new_buffer ) + return; + + if ( YY_CURRENT_BUFFER ) + { + /* Flush out information for old buffer. */ + *yyg->yy_c_buf_p = yyg->yy_hold_char; + YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = yyg->yy_c_buf_p; + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars; + } + + YY_CURRENT_BUFFER_LVALUE = new_buffer; + yy_load_buffer_state( yyscanner ); + + /* We don't actually know whether we did this switch during + * EOF (yywrap()) processing, but the only time this flag + * is looked at is after yywrap() is called, so it's safe + * to go ahead and always set it. + */ + yyg->yy_did_buffer_switch_on_eof = 1; +} + +static void yy_load_buffer_state (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + yyg->yy_n_chars = YY_CURRENT_BUFFER_LVALUE->yy_n_chars; + yyg->yytext_ptr = yyg->yy_c_buf_p = YY_CURRENT_BUFFER_LVALUE->yy_buf_pos; + yyin = YY_CURRENT_BUFFER_LVALUE->yy_input_file; + yyg->yy_hold_char = *yyg->yy_c_buf_p; +} + +/** Allocate and initialize an input buffer state. + * @param file A readable stream. + * @param size The character buffer size in bytes. 
When in doubt, use @c YY_BUF_SIZE. + * @param yyscanner The scanner object. + * @return the allocated buffer state. + */ + YY_BUFFER_STATE yy_create_buffer (FILE * file, int size , yyscan_t yyscanner) +{ + YY_BUFFER_STATE b; + + b = (YY_BUFFER_STATE) yyalloc( sizeof( struct yy_buffer_state ) , yyscanner ); + if ( ! b ) + YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" ); + + b->yy_buf_size = size; + + /* yy_ch_buf has to be 2 characters longer than the size given because + * we need to put in 2 end-of-buffer characters. + */ + b->yy_ch_buf = (char *) yyalloc( (yy_size_t) (b->yy_buf_size + 2) , yyscanner ); + if ( ! b->yy_ch_buf ) + YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" ); + + b->yy_is_our_buffer = 1; + + yy_init_buffer( b, file , yyscanner); + + return b; +} + +/** Destroy the buffer. + * @param b a buffer created with yy_create_buffer() + * @param yyscanner The scanner object. + */ + void yy_delete_buffer (YY_BUFFER_STATE b , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + if ( ! b ) + return; + + if ( b == YY_CURRENT_BUFFER ) /* Not sure if we should pop here. */ + YY_CURRENT_BUFFER_LVALUE = (YY_BUFFER_STATE) 0; + + if ( b->yy_is_our_buffer ) + yyfree( (void *) b->yy_ch_buf , yyscanner ); + + yyfree( (void *) b , yyscanner ); +} + +/* Initializes or reinitializes a buffer. + * This function is sometimes called more than once on the same buffer, + * such as during a yyrestart() or at EOF. + */ + static void yy_init_buffer (YY_BUFFER_STATE b, FILE * file , yyscan_t yyscanner) + +{ + int oerrno = errno; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + yy_flush_buffer( b , yyscanner); + + b->yy_input_file = file; + b->yy_fill_buffer = 1; + + /* If b is the current buffer, then yy_init_buffer was _probably_ + * called from yyrestart() or through yy_get_next_buffer. + * In that case, we don't want to reset the lineno or column. + */ + if (b != YY_CURRENT_BUFFER){ + b->yy_bs_lineno = 1; + b->yy_bs_column = 0; + } + + b->yy_is_interactive = 0; + + errno = oerrno; +} + +/** Discard all buffered characters. On the next scan, YY_INPUT will be called. + * @param b the buffer state to be flushed, usually @c YY_CURRENT_BUFFER. + * @param yyscanner The scanner object. + */ + void yy_flush_buffer (YY_BUFFER_STATE b , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + if ( ! b ) + return; + + b->yy_n_chars = 0; + + /* We always need two end-of-buffer characters. The first causes + * a transition to the end-of-buffer state. The second causes + * a jam in that state. + */ + b->yy_ch_buf[0] = YY_END_OF_BUFFER_CHAR; + b->yy_ch_buf[1] = YY_END_OF_BUFFER_CHAR; + + b->yy_buf_pos = &b->yy_ch_buf[0]; + + b->yy_at_bol = 1; + b->yy_buffer_status = YY_BUFFER_NEW; + + if ( b == YY_CURRENT_BUFFER ) + yy_load_buffer_state( yyscanner ); +} + +/** Pushes the new state onto the stack. The new state becomes + * the current state. This function will allocate the stack + * if necessary. + * @param new_buffer The new state. + * @param yyscanner The scanner object. + */ +void yypush_buffer_state (YY_BUFFER_STATE new_buffer , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + if (new_buffer == NULL) + return; + + yyensure_buffer_stack(yyscanner); + + /* This block is copied from yy_switch_to_buffer. */ + if ( YY_CURRENT_BUFFER ) + { + /* Flush out information for old buffer. 
*/ + *yyg->yy_c_buf_p = yyg->yy_hold_char; + YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = yyg->yy_c_buf_p; + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars; + } + + /* Only push if top exists. Otherwise, replace top. */ + if (YY_CURRENT_BUFFER) + yyg->yy_buffer_stack_top++; + YY_CURRENT_BUFFER_LVALUE = new_buffer; + + /* copied from yy_switch_to_buffer. */ + yy_load_buffer_state( yyscanner ); + yyg->yy_did_buffer_switch_on_eof = 1; +} + +/** Removes and deletes the top of the stack, if present. + * The next element becomes the new top. + * @param yyscanner The scanner object. + */ +void yypop_buffer_state (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + if (!YY_CURRENT_BUFFER) + return; + + yy_delete_buffer(YY_CURRENT_BUFFER , yyscanner); + YY_CURRENT_BUFFER_LVALUE = NULL; + if (yyg->yy_buffer_stack_top > 0) + --yyg->yy_buffer_stack_top; + + if (YY_CURRENT_BUFFER) { + yy_load_buffer_state( yyscanner ); + yyg->yy_did_buffer_switch_on_eof = 1; + } +} + +/* Allocates the stack if it does not exist. + * Guarantees space for at least one push. + */ +static void yyensure_buffer_stack (yyscan_t yyscanner) +{ + yy_size_t num_to_alloc; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + if (!yyg->yy_buffer_stack) { + + /* First allocation is just for 2 elements, since we don't know if this + * scanner will even need a stack. We use 2 instead of 1 to avoid an + * immediate realloc on the next call. + */ + num_to_alloc = 1; /* After all that talk, this was set to 1 anyways... */ + yyg->yy_buffer_stack = (struct yy_buffer_state**)yyalloc + (num_to_alloc * sizeof(struct yy_buffer_state*) + , yyscanner); + if ( ! yyg->yy_buffer_stack ) + YY_FATAL_ERROR( "out of dynamic memory in yyensure_buffer_stack()" ); + + memset(yyg->yy_buffer_stack, 0, num_to_alloc * sizeof(struct yy_buffer_state*)); + + yyg->yy_buffer_stack_max = num_to_alloc; + yyg->yy_buffer_stack_top = 0; + return; + } + + if (yyg->yy_buffer_stack_top >= (yyg->yy_buffer_stack_max) - 1){ + + /* Increase the buffer to prepare for a possible push. */ + yy_size_t grow_size = 8 /* arbitrary grow size */; + + num_to_alloc = yyg->yy_buffer_stack_max + grow_size; + yyg->yy_buffer_stack = (struct yy_buffer_state**)yyrealloc + (yyg->yy_buffer_stack, + num_to_alloc * sizeof(struct yy_buffer_state*) + , yyscanner); + if ( ! yyg->yy_buffer_stack ) + YY_FATAL_ERROR( "out of dynamic memory in yyensure_buffer_stack()" ); + + /* zero only the new slots.*/ + memset(yyg->yy_buffer_stack + yyg->yy_buffer_stack_max, 0, grow_size * sizeof(struct yy_buffer_state*)); + yyg->yy_buffer_stack_max = num_to_alloc; + } +} + +/** Setup the input buffer state to scan directly from a user-specified character buffer. + * @param base the character buffer + * @param size the size in bytes of the character buffer + * @param yyscanner The scanner object. + * @return the newly allocated buffer state object. + */ +YY_BUFFER_STATE yy_scan_buffer (char * base, yy_size_t size , yyscan_t yyscanner) +{ + YY_BUFFER_STATE b; + + if ( size < 2 || + base[size-2] != YY_END_OF_BUFFER_CHAR || + base[size-1] != YY_END_OF_BUFFER_CHAR ) + /* They forgot to leave room for the EOB's. */ + return NULL; + + b = (YY_BUFFER_STATE) yyalloc( sizeof( struct yy_buffer_state ) , yyscanner ); + if ( ! 
b ) + YY_FATAL_ERROR( "out of dynamic memory in yy_scan_buffer()" ); + + b->yy_buf_size = (int) (size - 2); /* "- 2" to take care of EOB's */ + b->yy_buf_pos = b->yy_ch_buf = base; + b->yy_is_our_buffer = 0; + b->yy_input_file = NULL; + b->yy_n_chars = b->yy_buf_size; + b->yy_is_interactive = 0; + b->yy_at_bol = 1; + b->yy_fill_buffer = 0; + b->yy_buffer_status = YY_BUFFER_NEW; + + yy_switch_to_buffer( b , yyscanner ); + + return b; +} + +/** Setup the input buffer state to scan a string. The next call to yylex() will + * scan from a @e copy of @a str. + * @param yystr a NUL-terminated string to scan + * @param yyscanner The scanner object. + * @return the newly allocated buffer state object. + * @note If you want to scan bytes that may contain NUL values, then use + * yy_scan_bytes() instead. + */ +YY_BUFFER_STATE yy_scan_string (const char * yystr , yyscan_t yyscanner) +{ + + return yy_scan_bytes( yystr, (int) strlen(yystr) , yyscanner); +} + +/** Setup the input buffer state to scan the given bytes. The next call to yylex() will + * scan from a @e copy of @a bytes. + * @param yybytes the byte buffer to scan + * @param _yybytes_len the number of bytes in the buffer pointed to by @a bytes. + * @param yyscanner The scanner object. + * @return the newly allocated buffer state object. + */ +YY_BUFFER_STATE yy_scan_bytes (const char * yybytes, int _yybytes_len , yyscan_t yyscanner) +{ + YY_BUFFER_STATE b; + char *buf; + yy_size_t n; + int i; + + /* Get memory for full buffer, including space for trailing EOB's. */ + n = (yy_size_t) (_yybytes_len + 2); + buf = (char *) yyalloc( n , yyscanner ); + if ( ! buf ) + YY_FATAL_ERROR( "out of dynamic memory in yy_scan_bytes()" ); + + for ( i = 0; i < _yybytes_len; ++i ) + buf[i] = yybytes[i]; + + buf[_yybytes_len] = buf[_yybytes_len+1] = YY_END_OF_BUFFER_CHAR; + + b = yy_scan_buffer( buf, n , yyscanner); + if ( ! b ) + YY_FATAL_ERROR( "bad buffer in yy_scan_bytes()" ); + + /* It's okay to grow etc. this buffer, and we should throw it + * away when we're done. + */ + b->yy_is_our_buffer = 1; + + return b; +} + +#ifndef YY_EXIT_FAILURE +#define YY_EXIT_FAILURE 2 +#endif + +static void yynoreturn yy_fatal_error (const char* msg , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + (void)yyg; + fprintf( stderr, "%s\n", msg ); + exit( YY_EXIT_FAILURE ); +} + +/* Redefine yyless() so it works in section 3 code. */ + +#undef yyless +#define yyless(n) \ + do \ + { \ + /* Undo effects of setting up yytext. */ \ + int yyless_macro_arg = (n); \ + YY_LESS_LINENO(yyless_macro_arg);\ + yytext[yyleng] = yyg->yy_hold_char; \ + yyg->yy_c_buf_p = yytext + yyless_macro_arg; \ + yyg->yy_hold_char = *yyg->yy_c_buf_p; \ + *yyg->yy_c_buf_p = '\0'; \ + yyleng = yyless_macro_arg; \ + } \ + while ( 0 ) + +/* Accessor methods (get/set functions) to struct members. */ + +/** Get the user-defined data for this scanner. + * @param yyscanner The scanner object. + */ +YY_EXTRA_TYPE yyget_extra (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + return yyextra; +} + +/** Get the current line number. + * @param yyscanner The scanner object. + */ +int yyget_lineno (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + if (! YY_CURRENT_BUFFER) + return 0; + + return yylineno; +} + +/** Get the current column number. + * @param yyscanner The scanner object. + */ +int yyget_column (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + if (! 
YY_CURRENT_BUFFER) + return 0; + + return yycolumn; +} + +/** Get the input stream. + * @param yyscanner The scanner object. + */ +FILE *yyget_in (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + return yyin; +} + +/** Get the output stream. + * @param yyscanner The scanner object. + */ +FILE *yyget_out (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + return yyout; +} + +/** Get the length of the current token. + * @param yyscanner The scanner object. + */ +int yyget_leng (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + return yyleng; +} + +/** Get the current token. + * @param yyscanner The scanner object. + */ + +char *yyget_text (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + return yytext; +} + +/** Set the user-defined data. This data is never touched by the scanner. + * @param user_defined The data to be associated with this scanner. + * @param yyscanner The scanner object. + */ +void yyset_extra (YY_EXTRA_TYPE user_defined , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + yyextra = user_defined ; +} + +/** Set the current line number. + * @param _line_number line number + * @param yyscanner The scanner object. + */ +void yyset_lineno (int _line_number , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + /* lineno is only valid if an input buffer exists. */ + if (! YY_CURRENT_BUFFER ) + YY_FATAL_ERROR( "yyset_lineno called with no buffer" ); + + yylineno = _line_number; +} + +/** Set the current column. + * @param _column_no column number + * @param yyscanner The scanner object. + */ +void yyset_column (int _column_no , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + /* column is only valid if an input buffer exists. */ + if (! YY_CURRENT_BUFFER ) + YY_FATAL_ERROR( "yyset_column called with no buffer" ); + + yycolumn = _column_no; +} + +/** Set the input stream. This does not discard the current + * input buffer. + * @param _in_str A readable stream. + * @param yyscanner The scanner object. + * @see yy_switch_to_buffer + */ +void yyset_in (FILE * _in_str , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + yyin = _in_str ; +} + +void yyset_out (FILE * _out_str , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + yyout = _out_str ; +} + +int yyget_debug (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + return yy_flex_debug; +} + +void yyset_debug (int _bdebug , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + yy_flex_debug = _bdebug ; +} + +/* Accessor methods for yylval and yylloc */ + +/* User-visible API */ + +/* yylex_init is special because it creates the scanner itself, so it is + * the ONLY reentrant function that doesn't take the scanner as the last argument. + * That's why we explicitly handle the declaration, instead of using our macros. + */ +int yylex_init(yyscan_t* ptr_yy_globals) +{ + if (ptr_yy_globals == NULL){ + errno = EINVAL; + return 1; + } + + *ptr_yy_globals = (yyscan_t) yyalloc ( sizeof( struct yyguts_t ), NULL ); + + if (*ptr_yy_globals == NULL){ + errno = ENOMEM; + return 1; + } + + /* By setting to 0xAA, we expose bugs in yy_init_globals. Leave at 0x00 for releases. 
*/ + memset(*ptr_yy_globals,0x00,sizeof(struct yyguts_t)); + + return yy_init_globals ( *ptr_yy_globals ); +} + +/* yylex_init_extra has the same functionality as yylex_init, but follows the + * convention of taking the scanner as the last argument. Note however, that + * this is a *pointer* to a scanner, as it will be allocated by this call (and + * is the reason, too, why this function also must handle its own declaration). + * The user defined value in the first argument will be available to yyalloc in + * the yyextra field. + */ +int yylex_init_extra( YY_EXTRA_TYPE yy_user_defined, yyscan_t* ptr_yy_globals ) +{ + struct yyguts_t dummy_yyguts; + + yyset_extra (yy_user_defined, &dummy_yyguts); + + if (ptr_yy_globals == NULL){ + errno = EINVAL; + return 1; + } + + *ptr_yy_globals = (yyscan_t) yyalloc ( sizeof( struct yyguts_t ), &dummy_yyguts ); + + if (*ptr_yy_globals == NULL){ + errno = ENOMEM; + return 1; + } + + /* By setting to 0xAA, we expose bugs in + yy_init_globals. Leave at 0x00 for releases. */ + memset(*ptr_yy_globals,0x00,sizeof(struct yyguts_t)); + + yyset_extra (yy_user_defined, *ptr_yy_globals); + + return yy_init_globals ( *ptr_yy_globals ); +} + +static int yy_init_globals (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + /* Initialization is the same as for the non-reentrant scanner. + * This function is called from yylex_destroy(), so don't allocate here. + */ + + yyg->yy_buffer_stack = NULL; + yyg->yy_buffer_stack_top = 0; + yyg->yy_buffer_stack_max = 0; + yyg->yy_c_buf_p = NULL; + yyg->yy_init = 0; + yyg->yy_start = 0; + + yyg->yy_start_stack_ptr = 0; + yyg->yy_start_stack_depth = 0; + yyg->yy_start_stack = NULL; + +/* Defined in main.c */ +#ifdef YY_STDINIT + yyin = stdin; + yyout = stdout; +#else + yyin = NULL; + yyout = NULL; +#endif + + /* For future reference: Set errno on error, since we are called by + * yylex_init() + */ + return 0; +} + +/* yylex_destroy is for both reentrant and non-reentrant scanners. */ +int yylex_destroy (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + /* Pop the buffer stack, destroying each element. */ + while(YY_CURRENT_BUFFER){ + yy_delete_buffer( YY_CURRENT_BUFFER , yyscanner ); + YY_CURRENT_BUFFER_LVALUE = NULL; + yypop_buffer_state(yyscanner); + } + + /* Destroy the stack itself. */ + yyfree(yyg->yy_buffer_stack , yyscanner); + yyg->yy_buffer_stack = NULL; + + /* Destroy the start condition stack. */ + yyfree( yyg->yy_start_stack , yyscanner ); + yyg->yy_start_stack = NULL; + + /* Reset the globals. This is important in a non-reentrant scanner so the next time + * yylex() is called, initialization will occur. */ + yy_init_globals( yyscanner); + + /* Destroy the main struct (reentrant only). */ + yyfree ( yyscanner , yyscanner ); + yyscanner = NULL; + return 0; +} + +/* + * Internal utility routines. 
+ */ + +#ifndef yytext_ptr +static void yy_flex_strncpy (char* s1, const char * s2, int n , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + (void)yyg; + + int i; + for ( i = 0; i < n; ++i ) + s1[i] = s2[i]; +} +#endif + +#ifdef YY_NEED_STRLEN +static int yy_flex_strlen (const char * s , yyscan_t yyscanner) +{ + int n; + for ( n = 0; s[n]; ++n ) + ; + + return n; +} +#endif + +void *yyalloc (yy_size_t size , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + (void)yyg; + return malloc(size); +} + +void *yyrealloc (void * ptr, yy_size_t size , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + (void)yyg; + + /* The cast to (char *) in the following accommodates both + * implementations that use char* generic pointers, and those + * that use void* generic pointers. It works with the latter + * because both ANSI C and C++ allow castless assignment from + * any pointer type to void*, and deal with argument conversions + * as though doing an assignment. + */ + return realloc(ptr, size); +} + +void yyfree (void * ptr , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + (void)yyg; + free( (char *) ptr ); /* see yyrealloc() for (char *) cast */ +} + +#define YYTABLES_NAME "yytables" + +#line 74 "fts0blex.l" + + diff --git a/storage/innobase/fts/fts0blex.l b/storage/innobase/fts/fts0blex.l new file mode 100644 index 00000000..cf19cd0f --- /dev/null +++ b/storage/innobase/fts/fts0blex.l @@ -0,0 +1,74 @@ +/***************************************************************************** + +Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/** + * @file fts/fts0blex.l + * FTS parser lexical analyzer + * + * Created 2007/5/9 Sunny Bains + */ + +%{ + +#include "fts0ast.h" +#include "fts0pars.h" + +/* Required for reentrant parser */ +#define YY_DECL int fts_blexer(YYSTYPE* val, yyscan_t yyscanner) +#define exit(A) ut_error + +%} + +%option noinput +%option nounput +%option noyywrap +%option nostdinit +%option reentrant +%option never-interactive + +%% + +[\t ]+ /* Ignore whitespace */ ; + +[*()+\-<>~@] { + val->oper = fts0bget_text(yyscanner)[0]; + + return(val->oper); +} + +[0-9]+ { + val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0bget_text(yyscanner)), fts0bget_leng(yyscanner)); + + return(FTS_NUMB); +} + +[^" \n*()+\-<>~@%]* { + val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0bget_text(yyscanner)), fts0bget_leng(yyscanner)); + + return(FTS_TERM); +} + +\"[^\"\n]*\" { + val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0bget_text(yyscanner)), fts0bget_leng(yyscanner)); + + return(FTS_TEXT); +} + +\n + +%% diff --git a/storage/innobase/fts/fts0config.cc b/storage/innobase/fts/fts0config.cc new file mode 100644 index 00000000..4566224e --- /dev/null +++ b/storage/innobase/fts/fts0config.cc @@ -0,0 +1,428 @@ +/***************************************************************************** + +Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file fts/fts0config.cc +Full Text Search configuration table. + +Created 2007/5/9 Sunny Bains +***********************************************************************/ + +#include "trx0roll.h" +#include "row0sel.h" + +#include "fts0priv.h" + +/******************************************************************//** +Callback function for fetching the config value.
+@return always returns TRUE */ +static +ibool +fts_config_fetch_value( +/*===================*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: pointer to + ib_vector_t */ +{ + sel_node_t* node = static_cast<sel_node_t*>(row); + fts_string_t* value = static_cast<fts_string_t*>(user_arg); + + dfield_t* dfield = que_node_get_val(node->select_list); + dtype_t* type = dfield_get_type(dfield); + ulint len = dfield_get_len(dfield); + void* data = dfield_get_data(dfield); + + ut_a(dtype_get_mtype(type) == DATA_VARCHAR); + + if (len != UNIV_SQL_NULL) { + ulint max_len = ut_min(value->f_len - 1, len); + + memcpy(value->f_str, data, max_len); + value->f_len = max_len; + value->f_str[value->f_len] = '\0'; + } + + return(TRUE); +} + +/******************************************************************//** +Get value from the config table. The caller must ensure that enough +space is allocated for value to hold the column contents. +@return DB_SUCCESS or error code */ +dberr_t +fts_config_get_value( +/*=================*/ + trx_t* trx, /*!< transaction */ + fts_table_t* fts_table, /*!< in: the indexed + FTS table */ + const char* name, /*!< in: get config value for + this parameter name */ + fts_string_t* value) /*!< out: value read from + config table */ +{ + pars_info_t* info; + que_t* graph; + dberr_t error; + ulint name_len = strlen(name); + char table_name[MAX_FULL_NAME_LEN]; + + info = pars_info_create(); + + *value->f_str = '\0'; + ut_a(value->f_len > 0); + + pars_info_bind_function(info, "my_func", fts_config_fetch_value, + value); + + /* The len field of value must be set to the max bytes that + it can hold. On a successful read, the len field will be set + to the actual number of bytes copied to value. */ + pars_info_bind_varchar_literal(info, "name", (byte*) name, name_len); + + fts_table->suffix = "CONFIG"; + fts_get_table_name(fts_table, table_name); + pars_info_bind_id(info, "table_name", table_name); + + graph = fts_parse_sql( + fts_table, + info, + "DECLARE FUNCTION my_func;\n" + "DECLARE CURSOR c IS SELECT value FROM $table_name" + " WHERE key = :name;\n" + "BEGIN\n" + "" + "OPEN c;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH c INTO my_func();\n" + " IF c % NOTFOUND THEN\n" + " EXIT;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE c;"); + + trx->op_info = "getting FTS config value"; + + error = fts_eval_sql(trx, graph); + que_graph_free(graph); + return(error); +} + +/*********************************************************************//** +Create the config table name for retrieving index specific value. +@return index config parameter name */ +char* +fts_config_create_index_param_name( +/*===============================*/ + const char* param, /*!< in: base name of param */ + const dict_index_t* index) /*!< in: index for config */ +{ + ulint len; + char* name; + + /* The format of the config name is: name_<index_id>. */ + len = strlen(param); + + /* Caller is responsible for deleting name. */ + name = static_cast<char*>(ut_malloc_nokey( + len + FTS_AUX_MIN_TABLE_ID_LENGTH + 2)); + ::strcpy(name, param); + name[len] = '_'; + + fts_write_object_id(index->id, name + len + 1); + + return(name); +} + +/******************************************************************//** +Get value specific to an FTS index from the config table. The caller +must ensure that enough space is allocated for value to hold the +column contents.
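+ +(Editorial aside, illustration only: a typical caller preallocates the buffer and sets f_len to its capacity before the call, the same pattern the fts_config_get_*_ulint() helpers further down this file use: + + fts_string_t value; + + value.f_len = FTS_MAX_CONFIG_VALUE_LEN; + value.f_str = static_cast<byte*>(ut_malloc_nokey(value.f_len + 1)); + + error = fts_config_get_index_value(trx, index, param, &value); + + ut_free(value.f_str); + +On return, value.f_len holds the number of bytes actually copied and the fetch callback has NUL-terminated the string.)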
+@return DB_SUCCESS or error code */ +dberr_t +fts_config_get_index_value( +/*=======================*/ + trx_t* trx, /*!< transaction */ + dict_index_t* index, /*!< in: index */ + const char* param, /*!< in: get config value for + this parameter name */ + fts_string_t* value) /*!< out: value read from + config table */ +{ + char* name; + dberr_t error; + fts_table_t fts_table; + + FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE, + index->table); + + /* We are responsible for free'ing name. */ + name = fts_config_create_index_param_name(param, index); + + error = fts_config_get_value(trx, &fts_table, name, value); + + ut_free(name); + + return(error); +} + +/******************************************************************//** +Set the value in the config table for name. +@return DB_SUCCESS or error code */ +dberr_t +fts_config_set_value( +/*=================*/ + trx_t* trx, /*!< transaction */ + fts_table_t* fts_table, /*!< in: the indexed + FTS table */ + const char* name, /*!< in: get config value for + this parameter name */ + const fts_string_t* + value) /*!< in: value to update */ +{ + pars_info_t* info; + que_t* graph; + dberr_t error; + undo_no_t undo_no; + undo_no_t n_rows_updated; + ulint name_len = strlen(name); + char table_name[MAX_FULL_NAME_LEN]; + + info = pars_info_create(); + + pars_info_bind_varchar_literal(info, "name", (byte*) name, name_len); + pars_info_bind_varchar_literal(info, "value", + value->f_str, value->f_len); + + const bool dict_locked = fts_table->table->fts->dict_locked; + + fts_table->suffix = "CONFIG"; + fts_get_table_name(fts_table, table_name, dict_locked); + pars_info_bind_id(info, "table_name", table_name); + + graph = fts_parse_sql( + fts_table, info, + "BEGIN UPDATE $table_name SET value = :value" + " WHERE key = :name;"); + + trx->op_info = "setting FTS config value"; + + undo_no = trx->undo_no; + + error = fts_eval_sql(trx, graph); + + que_graph_free(graph); + + n_rows_updated = trx->undo_no - undo_no; + + /* Check if we need to do an insert. */ + if (n_rows_updated == 0) { + info = pars_info_create(); + + pars_info_bind_varchar_literal( + info, "name", (byte*) name, name_len); + + pars_info_bind_varchar_literal( + info, "value", value->f_str, value->f_len); + + fts_get_table_name(fts_table, table_name, dict_locked); + pars_info_bind_id(info, "table_name", table_name); + + graph = fts_parse_sql( + fts_table, info, + "BEGIN\n" + "INSERT INTO $table_name VALUES(:name, :value);"); + + trx->op_info = "inserting FTS config value"; + + error = fts_eval_sql(trx, graph); + + que_graph_free(graph); + } + + return(error); +} + +/******************************************************************//** +Set the value specific to an FTS index in the config table. +@return DB_SUCCESS or error code */ +dberr_t +fts_config_set_index_value( +/*=======================*/ + trx_t* trx, /*!< transaction */ + dict_index_t* index, /*!< in: index */ + const char* param, /*!< in: get config value for + this parameter name */ + fts_string_t* value) /*!< out: value read from + config table */ +{ + char* name; + dberr_t error; + fts_table_t fts_table; + + FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE, + index->table); + + /* We are responsible for free'ing name. 
 */
+	name = fts_config_create_index_param_name(param, index);
+
+	error = fts_config_set_value(trx, &fts_table, name, value);
+
+	ut_free(name);
+
+	return(error);
+}
+
+#ifdef FTS_OPTIMIZE_DEBUG
+/******************************************************************//**
+Get an ulint value from the config table.
+@return DB_SUCCESS if all OK else error code */
+dberr_t
+fts_config_get_index_ulint(
+/*=======================*/
+	trx_t*		trx,		/*!< in: transaction */
+	dict_index_t*	index,		/*!< in: FTS index */
+	const char*	name,		/*!< in: param name */
+	ulint*		int_value)	/*!< out: value */
+{
+	dberr_t		error;
+	fts_string_t	value;
+
+	/* We set the length of value to the max bytes it can hold. This
+	information is used by the callback that reads the value.*/
+	value.f_len = FTS_MAX_CONFIG_VALUE_LEN;
+	value.f_str = static_cast<byte*>(ut_malloc_nokey(value.f_len + 1));
+
+	error = fts_config_get_index_value(trx, index, name, &value);
+
+	if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+		ib::error() << "(" << error << ") reading `" << name << "'";
+	} else {
+		*int_value = strtoul((char*) value.f_str, NULL, 10);
+	}
+
+	ut_free(value.f_str);
+
+	return(error);
+}
+
+/******************************************************************//**
+Set an ulint value in the config table.
+@return DB_SUCCESS if all OK else error code */
+dberr_t
+fts_config_set_index_ulint(
+/*=======================*/
+	trx_t*		trx,		/*!< in: transaction */
+	dict_index_t*	index,		/*!< in: FTS index */
+	const char*	name,		/*!< in: param name */
+	ulint		int_value)	/*!< in: value */
+{
+	dberr_t		error;
+	fts_string_t	value;
+
+	/* We set the length of value to the max bytes it can hold. This
+	information is used by the callback that reads the value.*/
+	value.f_len = FTS_MAX_CONFIG_VALUE_LEN;
+	value.f_str = static_cast<byte*>(ut_malloc_nokey(value.f_len + 1));
+
+	// FIXME: Get rid of snprintf
+	ut_a(FTS_MAX_INT_LEN < FTS_MAX_CONFIG_VALUE_LEN);
+
+	value.f_len = snprintf(
+		(char*) value.f_str, FTS_MAX_INT_LEN, ULINTPF, int_value);
+
+	error = fts_config_set_index_value(trx, index, name, &value);
+
+	if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+		ib::error() << "(" << error << ") writing `" << name << "'";
+	}
+
+	ut_free(value.f_str);
+
+	return(error);
+}
+#endif /* FTS_OPTIMIZE_DEBUG */
+
+/******************************************************************//**
+Get an ulint value from the config table.
+@return DB_SUCCESS if all OK else error code */
+dberr_t
+fts_config_get_ulint(
+/*=================*/
+	trx_t*		trx,		/*!< in: transaction */
+	fts_table_t*	fts_table,	/*!< in: the indexed
+					FTS table */
+	const char*	name,		/*!< in: param name */
+	ulint*		int_value)	/*!< out: value */
+{
+	dberr_t		error;
+	fts_string_t	value;
+
+	/* We set the length of value to the max bytes it can hold. This
+	information is used by the callback that reads the value.*/
+	value.f_len = FTS_MAX_CONFIG_VALUE_LEN;
+	value.f_str = static_cast<byte*>(ut_malloc_nokey(value.f_len + 1));
+
+	error = fts_config_get_value(trx, fts_table, name, &value);
+
+	if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+		ib::error() << "(" << error << ") reading `" << name << "'";
+	} else {
+		*int_value = strtoul((char*) value.f_str, NULL, 10);
+	}
+
+	ut_free(value.f_str);
+
+	return(error);
+}
+
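The ulint getters and setters above store integers as decimal strings in the VARCHAR value column, converting with snprintf() on the way in and strtoul() on the way out. A round-trip sketch under assumed buffer limits (the source only requires FTS_MAX_INT_LEN < FTS_MAX_CONFIG_VALUE_LEN, which the ut_a() calls assert):

```cpp
#include <cassert>
#include <cstdio>
#include <cstdlib>

// Stand-ins for FTS_MAX_INT_LEN / FTS_MAX_CONFIG_VALUE_LEN; the real
// values are build-time constants, these numbers are illustrative.
const size_t MAX_INT_LEN = 21;        // 64-bit decimal digits + NUL
const size_t MAX_VALUE_LEN = 64;

// Store: the config row keeps the number as its decimal string.
size_t encode_ulint(char* buf, unsigned long v)
{
    return (size_t) std::snprintf(buf, MAX_INT_LEN, "%lu", v);
}

// Load: fts_config_get_*_ulint() parse the string back with strtoul().
unsigned long decode_ulint(const char* buf)
{
    return std::strtoul(buf, nullptr, 10);
}

int main()
{
    static_assert(MAX_INT_LEN < MAX_VALUE_LEN, "int must fit the value column");
    char value[MAX_VALUE_LEN];
    size_t len = encode_ulint(value, 1843);
    assert(len == 4 && decode_ulint(value) == 1843);
}
```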
+/******************************************************************//**
+Set an ulint value in the config table.
+@return DB_SUCCESS if all OK else error code */
+dberr_t
+fts_config_set_ulint(
+/*=================*/
+	trx_t*		trx,		/*!< in: transaction */
+	fts_table_t*	fts_table,	/*!< in: the indexed
+					FTS table */
+	const char*	name,		/*!< in: param name */
+	ulint		int_value)	/*!< in: value */
+{
+	dberr_t		error;
+	fts_string_t	value;
+
+	/* We set the length of value to the max bytes it can hold. This
+	information is used by the callback that reads the value.*/
+	value.f_len = FTS_MAX_CONFIG_VALUE_LEN;
+	value.f_str = static_cast<byte*>(ut_malloc_nokey(value.f_len + 1));
+
+	ut_a(FTS_MAX_INT_LEN < FTS_MAX_CONFIG_VALUE_LEN);
+
+	value.f_len = (ulint) snprintf(
+		(char*) value.f_str, FTS_MAX_INT_LEN, ULINTPF, int_value);
+
+	error = fts_config_set_value(trx, fts_table, name, &value);
+
+	if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+		ib::error() << "(" << error << ") writing `" << name << "'";
+	}
+
+	ut_free(value.f_str);
+
+	return(error);
+}
diff --git a/storage/innobase/fts/fts0fts.cc b/storage/innobase/fts/fts0fts.cc
new file mode 100644
index 00000000..0775d939
--- /dev/null
+++ b/storage/innobase/fts/fts0fts.cc
@@ -0,0 +1,6182 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2021, Oracle and/or its affiliates.
+Copyright (c) 2016, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file fts/fts0fts.cc
+Full Text Search interface
+***********************************************************************/
+
+#include "trx0roll.h"
+#include "trx0purge.h"
+#include "row0mysql.h"
+#include "row0upd.h"
+#include "dict0types.h"
+#include "dict0stats_bg.h"
+#include "row0sel.h"
+#include "fts0fts.h"
+#include "fts0priv.h"
+#include "fts0types.h"
+#include "fts0types.inl"
+#include "fts0vlc.h"
+#include "fts0plugin.h"
+#include "dict0stats.h"
+#include "btr0pcur.h"
+
+static const ulint FTS_MAX_ID_LEN = 32;
+
+/** Column name from the FTS config table */
+#define FTS_MAX_CACHE_SIZE_IN_MB	"cache_size_in_mb"
+
+/** Verify if a aux table name is a obsolete table
+by looking up the key word in the obsolete table names */
+#define FTS_IS_OBSOLETE_AUX_TABLE(table_name)			\
+	(strstr((table_name), "DOC_ID") != NULL			\
+	 || strstr((table_name), "ADDED") != NULL		\
+	 || strstr((table_name), "STOPWORDS") != NULL)
+
+/** This is maximum FTS cache for each table and would be
+a configurable variable */
+Atomic_relaxed<size_t> fts_max_cache_size;
+
+/** Whether the total memory used for FTS cache is exhausted, and we will
+need a sync to free some memory */
+bool fts_need_sync = false;
+
+/** Variable specifying the total memory allocated for FTS cache */
+Atomic_relaxed<size_t> fts_max_total_cache_size;
+
+/** This is FTS result cache limit for each query and would be
+a configurable variable */
+size_t fts_result_cache_limit;
+
+/**
Variable specifying the maximum FTS max token size */ +ulong fts_max_token_size; + +/** Variable specifying the minimum FTS max token size */ +ulong fts_min_token_size; + + +// FIXME: testing +static time_t elapsed_time; +static ulint n_nodes; + +#ifdef FTS_CACHE_SIZE_DEBUG +/** The cache size permissible lower limit (1K) */ +static const ulint FTS_CACHE_SIZE_LOWER_LIMIT_IN_MB = 1; + +/** The cache size permissible upper limit (1G) */ +static const ulint FTS_CACHE_SIZE_UPPER_LIMIT_IN_MB = 1024; +#endif + +/** Time to sleep after DEADLOCK error before retrying operation. */ +static const std::chrono::milliseconds FTS_DEADLOCK_RETRY_WAIT(100); + +/** InnoDB default stopword list: +There are different versions of stopwords, the stop words listed +below comes from "Google Stopword" list. Reference: +http://meta.wikimedia.org/wiki/Stop_word_list/google_stop_word_list. +The final version of InnoDB default stopword list is still pending +for decision */ +const char *fts_default_stopword[] = +{ + "a", + "about", + "an", + "are", + "as", + "at", + "be", + "by", + "com", + "de", + "en", + "for", + "from", + "how", + "i", + "in", + "is", + "it", + "la", + "of", + "on", + "or", + "that", + "the", + "this", + "to", + "was", + "what", + "when", + "where", + "who", + "will", + "with", + "und", + "the", + "www", + NULL +}; + +/** FTS auxiliary table suffixes that are common to all FT indexes. */ +const char* fts_common_tables[] = { + "BEING_DELETED", + "BEING_DELETED_CACHE", + "CONFIG", + "DELETED", + "DELETED_CACHE", + NULL +}; + +/** FTS auxiliary INDEX split intervals. */ +const fts_index_selector_t fts_index_selector[] = { + { 9, "INDEX_1" }, + { 65, "INDEX_2" }, + { 70, "INDEX_3" }, + { 75, "INDEX_4" }, + { 80, "INDEX_5" }, + { 85, "INDEX_6" }, + { 0 , NULL } +}; + +/** Default config values for FTS indexes on a table. */ +static const char* fts_config_table_insert_values_sql = + "PROCEDURE P() IS\n" + "BEGIN\n" + "\n" + "INSERT INTO $config_table VALUES('" + FTS_MAX_CACHE_SIZE_IN_MB "', '256');\n" + "" + "INSERT INTO $config_table VALUES('" + FTS_OPTIMIZE_LIMIT_IN_SECS "', '180');\n" + "" + "INSERT INTO $config_table VALUES ('" + FTS_SYNCED_DOC_ID "', '0');\n" + "" + "INSERT INTO $config_table VALUES ('" + FTS_TOTAL_DELETED_COUNT "', '0');\n" + "" /* Note: 0 == FTS_TABLE_STATE_RUNNING */ + "INSERT INTO $config_table VALUES ('" + FTS_TABLE_STATE "', '0');\n" + "END;\n"; + +/** FTS tokenize parmameter for plugin parser */ +struct fts_tokenize_param_t { + fts_doc_t* result_doc; /*!< Result doc for tokens */ + ulint add_pos; /*!< Added position for tokens */ +}; + +/** Run SYNC on the table, i.e., write out data from the cache to the +FTS auxiliary INDEX table and clear the cache at the end. +@param[in,out] sync sync state +@param[in] unlock_cache whether unlock cache lock when write node +@param[in] wait whether wait when a sync is in progress +@return DB_SUCCESS if all OK */ +static +dberr_t +fts_sync( + fts_sync_t* sync, + bool unlock_cache, + bool wait); + +/****************************************************************//** +Release all resources help by the words rb tree e.g., the node ilist. */ +static +void +fts_words_free( +/*===========*/ + ib_rbt_t* words) /*!< in: rb tree of words */ + MY_ATTRIBUTE((nonnull)); +#ifdef FTS_CACHE_SIZE_DEBUG +/****************************************************************//** +Read the max cache size parameter from the config table. 
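The fts_index_selector[] table above splits one logical inverted index across six auxiliary INDEX_1..INDEX_6 tables, bucketed by a word's leading character. The exact comparison lives elsewhere (the lookup goes through the index charset), so the sketch below rests on an assumption: each entry's value is read as an exclusive upper bound on the leading byte, which matches the ASCII-looking boundaries (65 = 'A', 70 = 'F', and so on) but is not confirmed by this file.

```cpp
#include <cstdio>

// Mirror of the fts_index_selector[] table above.
struct Selector { unsigned value; const char* suffix; };
const Selector selector[] = {
    { 9, "INDEX_1" }, { 65, "INDEX_2" }, { 70, "INDEX_3" },
    { 75, "INDEX_4" }, { 80, "INDEX_5" }, { 85, "INDEX_6" }, { 0, nullptr }
};

// Hypothetical bucket lookup. Assumption: each value acts as an exclusive
// upper bound on the word's leading byte (the real lookup compares
// collation weights through the index charset), and anything above the
// last bound falls into the last bucket.
const char* select_suffix(unsigned char first_byte)
{
    for (const Selector* s = selector; s->value; ++s)
        if (first_byte < s->value)
            return s->suffix;
    return "INDEX_6";                    // leading bytes >= 85 ('U')
}

int main()
{
    std::printf("%s %s %s\n",
                select_suffix('0'),      // 48 < 65  -> INDEX_2
                select_suffix('G'),      // 71 < 75  -> INDEX_4
                select_suffix('z'));     // above all bounds -> INDEX_6
}
```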
*/ +static +void +fts_update_max_cache_size( +/*======================*/ + fts_sync_t* sync); /*!< in: sync state */ +#endif + +/*********************************************************************//** +This function fetches the document just inserted right before +we commit the transaction, and tokenize the inserted text data +and insert into FTS auxiliary table and its cache. */ +static +void +fts_add_doc_by_id( +/*==============*/ + fts_trx_table_t*ftt, /*!< in: FTS trx table */ + doc_id_t doc_id); /*!< in: doc id */ + +/** Tokenize a document. +@param[in,out] doc document to tokenize +@param[out] result tokenization result +@param[in] parser pluggable parser */ +static +void +fts_tokenize_document( + fts_doc_t* doc, + fts_doc_t* result, + st_mysql_ftparser* parser); + +/** Continue to tokenize a document. +@param[in,out] doc document to tokenize +@param[in] add_pos add this position to all tokens from this tokenization +@param[out] result tokenization result +@param[in] parser pluggable parser */ +static +void +fts_tokenize_document_next( + fts_doc_t* doc, + ulint add_pos, + fts_doc_t* result, + st_mysql_ftparser* parser); + +/** Create the vector of fts_get_doc_t instances. +@param[in,out] cache fts cache +@return vector of fts_get_doc_t instances */ +static +ib_vector_t* +fts_get_docs_create( + fts_cache_t* cache); + +/** Free the FTS cache. +@param[in,out] cache to be freed */ +static +void +fts_cache_destroy(fts_cache_t* cache) +{ + mysql_mutex_destroy(&cache->lock); + mysql_mutex_destroy(&cache->init_lock); + mysql_mutex_destroy(&cache->deleted_lock); + mysql_mutex_destroy(&cache->doc_id_lock); + pthread_cond_destroy(&cache->sync->cond); + + if (cache->stopword_info.cached_stopword) { + rbt_free(cache->stopword_info.cached_stopword); + } + + if (cache->sync_heap->arg) { + mem_heap_free(static_cast(cache->sync_heap->arg)); + } + + mem_heap_free(cache->cache_heap); +} + +/** Get a character set based on precise type. +@param prtype precise type +@return the corresponding character set */ +UNIV_INLINE +CHARSET_INFO* +fts_get_charset(ulint prtype) +{ +#ifdef UNIV_DEBUG + switch (prtype & DATA_MYSQL_TYPE_MASK) { + case MYSQL_TYPE_BIT: + case MYSQL_TYPE_STRING: + case MYSQL_TYPE_VAR_STRING: + case MYSQL_TYPE_TINY_BLOB: + case MYSQL_TYPE_MEDIUM_BLOB: + case MYSQL_TYPE_BLOB: + case MYSQL_TYPE_LONG_BLOB: + case MYSQL_TYPE_VARCHAR: + break; + default: + ut_error; + } +#endif /* UNIV_DEBUG */ + + uint cs_num = (uint) dtype_get_charset_coll(prtype); + + if (CHARSET_INFO* cs = get_charset(cs_num, MYF(MY_WME))) { + return(cs); + } + + ib::fatal() << "Unable to find charset-collation " << cs_num; + return(NULL); +} + +/****************************************************************//** +This function loads the default InnoDB stopword list */ +static +void +fts_load_default_stopword( +/*======================*/ + fts_stopword_t* stopword_info) /*!< in: stopword info */ +{ + fts_string_t str; + mem_heap_t* heap; + ib_alloc_t* allocator; + ib_rbt_t* stop_words; + + allocator = stopword_info->heap; + heap = static_cast(allocator->arg); + + if (!stopword_info->cached_stopword) { + stopword_info->cached_stopword = rbt_create_arg_cmp( + sizeof(fts_tokenizer_word_t), innobase_fts_text_cmp, + &my_charset_latin1); + } + + stop_words = stopword_info->cached_stopword; + + str.f_n_char = 0; + + for (ulint i = 0; fts_default_stopword[i]; ++i) { + char* word; + fts_tokenizer_word_t new_word; + + /* We are going to duplicate the value below. 
*/ + word = const_cast(fts_default_stopword[i]); + + new_word.nodes = ib_vector_create( + allocator, sizeof(fts_node_t), 4); + + str.f_len = strlen(word); + str.f_str = reinterpret_cast(word); + + fts_string_dup(&new_word.text, &str, heap); + + rbt_insert(stop_words, &new_word, &new_word); + } + + stopword_info->status = STOPWORD_FROM_DEFAULT; +} + +/****************************************************************//** +Callback function to read a single stopword value. +@return Always return TRUE */ +static +ibool +fts_read_stopword( +/*==============*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: pointer to ib_vector_t */ +{ + ib_alloc_t* allocator; + fts_stopword_t* stopword_info; + sel_node_t* sel_node; + que_node_t* exp; + ib_rbt_t* stop_words; + dfield_t* dfield; + fts_string_t str; + mem_heap_t* heap; + ib_rbt_bound_t parent; + dict_table_t* table; + + sel_node = static_cast(row); + table = sel_node->table_list->table; + stopword_info = static_cast(user_arg); + + stop_words = stopword_info->cached_stopword; + allocator = static_cast(stopword_info->heap); + heap = static_cast(allocator->arg); + + exp = sel_node->select_list; + + /* We only need to read the first column */ + dfield = que_node_get_val(exp); + + str.f_n_char = 0; + str.f_str = static_cast(dfield_get_data(dfield)); + str.f_len = dfield_get_len(dfield); + exp = que_node_get_next(exp); + ut_ad(exp); + + if (table->versioned()) { + dfield = que_node_get_val(exp); + ut_ad(dfield_get_type(dfield)->vers_sys_end()); + void* data = dfield_get_data(dfield); + ulint len = dfield_get_len(dfield); + if (table->versioned_by_id()) { + ut_ad(len == sizeof trx_id_max_bytes); + if (0 != memcmp(data, trx_id_max_bytes, len)) { + return true; + } + } else { + ut_ad(len == sizeof timestamp_max_bytes); + if (0 != memcmp(data, timestamp_max_bytes, len)) { + return true; + } + } + } + ut_ad(!que_node_get_next(exp)); + + /* Only create new node if it is a value not already existed */ + if (str.f_len != UNIV_SQL_NULL + && rbt_search(stop_words, &parent, &str) != 0) { + + fts_tokenizer_word_t new_word; + + new_word.nodes = ib_vector_create( + allocator, sizeof(fts_node_t), 4); + + new_word.text.f_str = static_cast( + mem_heap_alloc(heap, str.f_len + 1)); + + memcpy(new_word.text.f_str, str.f_str, str.f_len); + + new_word.text.f_n_char = 0; + new_word.text.f_len = str.f_len; + new_word.text.f_str[str.f_len] = 0; + + rbt_insert(stop_words, &new_word, &new_word); + } + + return(TRUE); +} + +/******************************************************************//** +Load user defined stopword from designated user table +@return whether the operation is successful */ +static +bool +fts_load_user_stopword( +/*===================*/ + fts_t* fts, /*!< in: FTS struct */ + const char* stopword_table_name, /*!< in: Stopword table + name */ + fts_stopword_t* stopword_info) /*!< in: Stopword info */ +{ + if (!fts->dict_locked) { + dict_sys.lock(SRW_LOCK_CALL); + } + + /* Validate the user table existence in the right format */ + bool ret= false; + const char* row_end; + stopword_info->charset = fts_valid_stopword_table(stopword_table_name, + &row_end); + if (!stopword_info->charset) { +cleanup: + if (!fts->dict_locked) { + dict_sys.unlock(); + } + + return ret; + } + + trx_t* trx = trx_create(); + trx->op_info = "Load user stopword table into FTS cache"; + + if (!stopword_info->cached_stopword) { + /* Create the stopword RB tree with the stopword column + charset. 
All comparison will use this charset */ + stopword_info->cached_stopword = rbt_create_arg_cmp( + sizeof(fts_tokenizer_word_t), innobase_fts_text_cmp, + (void*)stopword_info->charset); + + } + + pars_info_t* info = pars_info_create(); + + pars_info_bind_id(info, "table_stopword", stopword_table_name); + pars_info_bind_id(info, "row_end", row_end); + + pars_info_bind_function(info, "my_func", fts_read_stopword, + stopword_info); + + que_t* graph = pars_sql( + info, + "PROCEDURE P() IS\n" + "DECLARE FUNCTION my_func;\n" + "DECLARE CURSOR c IS" + " SELECT value, $row_end" + " FROM $table_stopword;\n" + "BEGIN\n" + "\n" + "OPEN c;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH c INTO my_func();\n" + " IF c % NOTFOUND THEN\n" + " EXIT;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE c;" + "END;\n"); + + for (;;) { + dberr_t error = fts_eval_sql(trx, graph); + + if (UNIV_LIKELY(error == DB_SUCCESS)) { + fts_sql_commit(trx); + stopword_info->status = STOPWORD_USER_TABLE; + break; + } else { + fts_sql_rollback(trx); + + if (error == DB_LOCK_WAIT_TIMEOUT) { + ib::warn() << "Lock wait timeout reading user" + " stopword table. Retrying!"; + + trx->error_state = DB_SUCCESS; + } else { + ib::error() << "Error '" << error + << "' while reading user stopword" + " table."; + ret = FALSE; + break; + } + } + } + + que_graph_free(graph); + trx->free(); + ret = true; + goto cleanup; +} + +/******************************************************************//** +Initialize the index cache. */ +static +void +fts_index_cache_init( +/*=================*/ + ib_alloc_t* allocator, /*!< in: the allocator to use */ + fts_index_cache_t* index_cache) /*!< in: index cache */ +{ + ulint i; + + ut_a(index_cache->words == NULL); + + index_cache->words = rbt_create_arg_cmp( + sizeof(fts_tokenizer_word_t), innobase_fts_text_cmp, + (void*) index_cache->charset); + + ut_a(index_cache->doc_stats == NULL); + + index_cache->doc_stats = ib_vector_create( + allocator, sizeof(fts_doc_stats_t), 4); + + for (i = 0; i < FTS_NUM_AUX_INDEX; ++i) { + ut_a(index_cache->ins_graph[i] == NULL); + ut_a(index_cache->sel_graph[i] == NULL); + } +} + +/*********************************************************************//** +Initialize FTS cache. */ +void +fts_cache_init( +/*===========*/ + fts_cache_t* cache) /*!< in: cache to initialize */ +{ + ulint i; + + /* Just to make sure */ + ut_a(cache->sync_heap->arg == NULL); + + cache->sync_heap->arg = mem_heap_create(1024); + + cache->total_size = 0; + cache->total_size_at_sync = 0; + + mysql_mutex_lock(&cache->deleted_lock); + cache->deleted_doc_ids = ib_vector_create( + cache->sync_heap, sizeof(doc_id_t), 4); + mysql_mutex_unlock(&cache->deleted_lock); + + /* Reset the cache data for all the FTS indexes. */ + for (i = 0; i < ib_vector_size(cache->indexes); ++i) { + fts_index_cache_t* index_cache; + + index_cache = static_cast( + ib_vector_get(cache->indexes, i)); + + fts_index_cache_init(cache->sync_heap, index_cache); + } +} + +/****************************************************************//** +Create a FTS cache. 
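fts_load_user_stopword() above shows the standard retry shape for the internal SQL engine: commit on success, roll back and retry indefinitely on DB_LOCK_WAIT_TIMEOUT (after resetting trx->error_state to DB_SUCCESS), and roll back and give up on any other error. A generic sketch of that loop, with hypothetical hooks standing in for fts_eval_sql / fts_sql_commit / fts_sql_rollback:

```cpp
#include <functional>
#include <iostream>

enum Status { SUCCESS, LOCK_WAIT_TIMEOUT, HARD_ERROR };

// Generic shape of the loop in fts_load_user_stopword() above: commit on
// success, roll back and retry forever on a lock-wait timeout, roll back
// and give up on anything else.
bool run_with_retry(const std::function<Status()>& exec,
                    const std::function<void()>& commit,
                    const std::function<void()>& rollback)
{
    for (;;) {
        Status s = exec();
        if (s == SUCCESS) { commit(); return true; }
        rollback();                        // always undo the failed attempt
        if (s != LOCK_WAIT_TIMEOUT) return false;
        std::cerr << "lock wait timeout, retrying\n";
    }
}

int main()
{
    int attempts = 0;
    bool ok = run_with_retry(
        [&] { return ++attempts < 3 ? LOCK_WAIT_TIMEOUT : SUCCESS; },
        [] {}, [] {});
    std::cout << ok << " after " << attempts << " attempts\n"; // 1 after 3
}
```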
*/ +fts_cache_t* +fts_cache_create( +/*=============*/ + dict_table_t* table) /*!< in: table owns the FTS cache */ +{ + mem_heap_t* heap; + fts_cache_t* cache; + + heap = static_cast(mem_heap_create(512)); + + cache = static_cast( + mem_heap_zalloc(heap, sizeof(*cache))); + + cache->cache_heap = heap; + + mysql_mutex_init(fts_cache_mutex_key, &cache->lock, nullptr); + mysql_mutex_init(fts_cache_init_mutex_key, &cache->init_lock, nullptr); + mysql_mutex_init(fts_delete_mutex_key, &cache->deleted_lock, nullptr); + mysql_mutex_init(fts_doc_id_mutex_key, &cache->doc_id_lock, nullptr); + + /* This is the heap used to create the cache itself. */ + cache->self_heap = ib_heap_allocator_create(heap); + + /* This is a transient heap, used for storing sync data. */ + cache->sync_heap = ib_heap_allocator_create(heap); + cache->sync_heap->arg = NULL; + + cache->sync = static_cast( + mem_heap_zalloc(heap, sizeof(fts_sync_t))); + + cache->sync->table = table; + pthread_cond_init(&cache->sync->cond, nullptr); + + /* Create the index cache vector that will hold the inverted indexes. */ + cache->indexes = ib_vector_create( + cache->self_heap, sizeof(fts_index_cache_t), 2); + + fts_cache_init(cache); + + cache->stopword_info.cached_stopword = NULL; + cache->stopword_info.charset = NULL; + + cache->stopword_info.heap = cache->self_heap; + + cache->stopword_info.status = STOPWORD_NOT_INIT; + + return(cache); +} + +/*******************************************************************//** +Add a newly create index into FTS cache */ +void +fts_add_index( +/*==========*/ + dict_index_t* index, /*!< FTS index to be added */ + dict_table_t* table) /*!< table */ +{ + fts_t* fts = table->fts; + fts_cache_t* cache; + fts_index_cache_t* index_cache; + + ut_ad(fts); + cache = table->fts->cache; + + mysql_mutex_lock(&cache->init_lock); + + ib_vector_push(fts->indexes, &index); + + index_cache = fts_find_index_cache(cache, index); + + if (!index_cache) { + /* Add new index cache structure */ + index_cache = fts_cache_index_cache_create(table, index); + } + + mysql_mutex_unlock(&cache->init_lock); +} + +/*******************************************************************//** +recalibrate get_doc structure after index_cache in cache->indexes changed */ +static +void +fts_reset_get_doc( +/*==============*/ + fts_cache_t* cache) /*!< in: FTS index cache */ +{ + fts_get_doc_t* get_doc; + ulint i; + + mysql_mutex_assert_owner(&cache->init_lock); + + ib_vector_reset(cache->get_docs); + + for (i = 0; i < ib_vector_size(cache->indexes); i++) { + fts_index_cache_t* ind_cache; + + ind_cache = static_cast( + ib_vector_get(cache->indexes, i)); + + get_doc = static_cast( + ib_vector_push(cache->get_docs, NULL)); + + memset(get_doc, 0x0, sizeof(*get_doc)); + + get_doc->index_cache = ind_cache; + get_doc->cache = cache; + } + + ut_ad(ib_vector_size(cache->get_docs) + == ib_vector_size(cache->indexes)); +} + +/*******************************************************************//** +Check an index is in the table->indexes list +@return TRUE if it exists */ +static +ibool +fts_in_dict_index( +/*==============*/ + dict_table_t* table, /*!< in: Table */ + dict_index_t* index_check) /*!< in: index to be checked */ +{ + dict_index_t* index; + + for (index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + + if (index == index_check) { + return(TRUE); + } + } + + return(FALSE); +} + +/*******************************************************************//** +Check an index is in the 
fts->cache->indexes list +@return TRUE if it exists */ +static +ibool +fts_in_index_cache( +/*===============*/ + dict_table_t* table, /*!< in: Table */ + dict_index_t* index) /*!< in: index to be checked */ +{ + ulint i; + + for (i = 0; i < ib_vector_size(table->fts->cache->indexes); i++) { + fts_index_cache_t* index_cache; + + index_cache = static_cast( + ib_vector_get(table->fts->cache->indexes, i)); + + if (index_cache->index == index) { + return(TRUE); + } + } + + return(FALSE); +} + +/*******************************************************************//** +Check indexes in the fts->indexes is also present in index cache and +table->indexes list +@return TRUE if all indexes match */ +ibool +fts_check_cached_index( +/*===================*/ + dict_table_t* table) /*!< in: Table where indexes are dropped */ +{ + ulint i; + + if (!table->fts || !table->fts->cache) { + return(TRUE); + } + + ut_a(ib_vector_size(table->fts->indexes) + == ib_vector_size(table->fts->cache->indexes)); + + for (i = 0; i < ib_vector_size(table->fts->indexes); i++) { + dict_index_t* index; + + index = static_cast( + ib_vector_getp(table->fts->indexes, i)); + + if (!fts_in_index_cache(table, index)) { + return(FALSE); + } + + if (!fts_in_dict_index(table, index)) { + return(FALSE); + } + } + + return(TRUE); +} + +/** Clear all fts resources when there is no internal DOC_ID +and there are no new fts index to add. +@param[in,out] table table where fts is to be freed */ +void fts_clear_all(dict_table_t *table) +{ + if (DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID) || + !table->fts || + !ib_vector_is_empty(table->fts->indexes)) + return; + + for (const dict_index_t *index= dict_table_get_first_index(table); + index; index= dict_table_get_next_index(index)) + if (index->type & DICT_FTS) + return; + + fts_optimize_remove_table(table); + + table->fts->~fts_t(); + table->fts= nullptr; + DICT_TF2_FLAG_UNSET(table, DICT_TF2_FTS); +} + +/*******************************************************************//** +Drop auxiliary tables related to an FTS index +@return DB_SUCCESS or error number */ +dberr_t +fts_drop_index( +/*===========*/ + dict_table_t* table, /*!< in: Table where indexes are dropped */ + dict_index_t* index, /*!< in: Index to be dropped */ + trx_t* trx) /*!< in: Transaction for the drop */ +{ + ib_vector_t* indexes = table->fts->indexes; + dberr_t err = DB_SUCCESS; + + ut_a(indexes); + + if ((ib_vector_size(indexes) == 1 + && (index == static_cast( + ib_vector_getp(table->fts->indexes, 0))) + && DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) + || ib_vector_is_empty(indexes)) { + doc_id_t current_doc_id; + doc_id_t first_doc_id; + + DICT_TF2_FLAG_UNSET(table, DICT_TF2_FTS); + + current_doc_id = table->fts->cache->next_doc_id; + first_doc_id = table->fts->cache->first_doc_id; + fts_cache_clear(table->fts->cache); + fts_cache_destroy(table->fts->cache); + table->fts->cache = fts_cache_create(table); + table->fts->cache->next_doc_id = current_doc_id; + table->fts->cache->first_doc_id = first_doc_id; + } else { + fts_cache_t* cache = table->fts->cache; + fts_index_cache_t* index_cache; + + mysql_mutex_lock(&cache->init_lock); + + index_cache = fts_find_index_cache(cache, index); + + if (index_cache != NULL) { + if (index_cache->words) { + fts_words_free(index_cache->words); + rbt_free(index_cache->words); + } + + ib_vector_remove(cache->indexes, *(void**) index_cache); + } + + if (cache->get_docs) { + fts_reset_get_doc(cache); + } + + mysql_mutex_unlock(&cache->init_lock); + } + + err = 
fts_drop_index_tables(trx, *index); + + ib_vector_remove(indexes, (const void*) index); + + return(err); +} + +/****************************************************************//** +Create an FTS index cache. */ +CHARSET_INFO* +fts_index_get_charset( +/*==================*/ + dict_index_t* index) /*!< in: FTS index */ +{ + CHARSET_INFO* charset = NULL; + dict_field_t* field; + ulint prtype; + + field = dict_index_get_nth_field(index, 0); + prtype = field->col->prtype; + + charset = fts_get_charset(prtype); + +#ifdef FTS_DEBUG + /* Set up charset info for this index. Please note all + field of the FTS index should have the same charset */ + for (i = 1; i < index->n_fields; i++) { + CHARSET_INFO* fld_charset; + + field = dict_index_get_nth_field(index, i); + prtype = field->col->prtype; + + fld_charset = fts_get_charset(prtype); + + /* All FTS columns should have the same charset */ + if (charset) { + ut_a(charset == fld_charset); + } else { + charset = fld_charset; + } + } +#endif + + return(charset); + +} +/****************************************************************//** +Create an FTS index cache. +@return Index Cache */ +fts_index_cache_t* +fts_cache_index_cache_create( +/*=========================*/ + dict_table_t* table, /*!< in: table with FTS index */ + dict_index_t* index) /*!< in: FTS index */ +{ + ulint n_bytes; + fts_index_cache_t* index_cache; + fts_cache_t* cache = table->fts->cache; + + ut_a(cache != NULL); + + mysql_mutex_assert_owner(&cache->init_lock); + + /* Must not already exist in the cache vector. */ + ut_a(fts_find_index_cache(cache, index) == NULL); + + index_cache = static_cast( + ib_vector_push(cache->indexes, NULL)); + + memset(index_cache, 0x0, sizeof(*index_cache)); + + index_cache->index = index; + + index_cache->charset = fts_index_get_charset(index); + + n_bytes = sizeof(que_t*) * FTS_NUM_AUX_INDEX; + + index_cache->ins_graph = static_cast( + mem_heap_zalloc(static_cast( + cache->self_heap->arg), n_bytes)); + + index_cache->sel_graph = static_cast( + mem_heap_zalloc(static_cast( + cache->self_heap->arg), n_bytes)); + + fts_index_cache_init(cache->sync_heap, index_cache); + + if (cache->get_docs) { + fts_reset_get_doc(cache); + } + + return(index_cache); +} + +/****************************************************************//** +Release all resources help by the words rb tree e.g., the node ilist. */ +static +void +fts_words_free( +/*===========*/ + ib_rbt_t* words) /*!< in: rb tree of words */ +{ + const ib_rbt_node_t* rbt_node; + + /* Free the resources held by a word. */ + for (rbt_node = rbt_first(words); + rbt_node != NULL; + rbt_node = rbt_first(words)) { + + ulint i; + fts_tokenizer_word_t* word; + + word = rbt_value(fts_tokenizer_word_t, rbt_node); + + /* Free the ilists of this word. */ + for (i = 0; i < ib_vector_size(word->nodes); ++i) { + + fts_node_t* fts_node = static_cast( + ib_vector_get(word->nodes, i)); + + ut_free(fts_node->ilist); + fts_node->ilist = NULL; + } + + /* NOTE: We are responsible for free'ing the node */ + ut_free(rbt_remove_node(words, rbt_node)); + } +} + +/** Clear cache. 
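fts_words_free() above drains the words tree by repeatedly taking rbt_first(), freeing each word's separately allocated ilists, and only then removing the node; freeing the tree alone would leak those buffers. The same pattern, with a std::map standing in for the RB tree:

```cpp
#include <cstdlib>
#include <map>
#include <string>
#include <vector>

// Stand-in for fts_node_t: owns a separately allocated ilist buffer, so
// destroying the tree without freeing it first would leak.
struct Node { unsigned char* ilist; };

using WordTree = std::map<std::string, std::vector<Node>>;

// Same drain pattern as fts_words_free() above: keep taking the first
// entry, release every buffer it owns, then remove the entry itself.
void words_free(WordTree& words)
{
    while (!words.empty()) {
        auto first = words.begin();
        for (Node& n : first->second) {  // free the ilists of this word
            std::free(n.ilist);
            n.ilist = nullptr;
        }
        words.erase(first);              // then drop the tree node
    }
}

int main()
{
    WordTree w;
    w["apple"].push_back({static_cast<unsigned char*>(std::malloc(16))});
    w["pear"].push_back({static_cast<unsigned char*>(std::malloc(8))});
    words_free(w);                       // tree empty, nothing leaked
    return w.empty() ? 0 : 1;
}
```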
+@param[in,out] cache fts cache */ +void +fts_cache_clear( + fts_cache_t* cache) +{ + ulint i; + + for (i = 0; i < ib_vector_size(cache->indexes); ++i) { + ulint j; + fts_index_cache_t* index_cache; + + index_cache = static_cast( + ib_vector_get(cache->indexes, i)); + + fts_words_free(index_cache->words); + + rbt_free(index_cache->words); + + index_cache->words = NULL; + + for (j = 0; j < FTS_NUM_AUX_INDEX; ++j) { + + if (index_cache->ins_graph[j] != NULL) { + + que_graph_free(index_cache->ins_graph[j]); + + index_cache->ins_graph[j] = NULL; + } + + if (index_cache->sel_graph[j] != NULL) { + + que_graph_free(index_cache->sel_graph[j]); + + index_cache->sel_graph[j] = NULL; + } + } + + index_cache->doc_stats = NULL; + } + + fts_need_sync = false; + + cache->total_size = 0; + + mysql_mutex_lock(&cache->deleted_lock); + cache->deleted_doc_ids = NULL; + mysql_mutex_unlock(&cache->deleted_lock); + + mem_heap_free(static_cast(cache->sync_heap->arg)); + cache->sync_heap->arg = NULL; +} + +/*********************************************************************//** +Search the index specific cache for a particular FTS index. +@return the index cache else NULL */ +UNIV_INLINE +fts_index_cache_t* +fts_get_index_cache( +/*================*/ + fts_cache_t* cache, /*!< in: cache to search */ + const dict_index_t* index) /*!< in: index to search for */ +{ +#ifdef SAFE_MUTEX + ut_ad(mysql_mutex_is_owner(&cache->lock) + || mysql_mutex_is_owner(&cache->init_lock)); +#endif /* SAFE_MUTEX */ + + for (ulint i = 0; i < ib_vector_size(cache->indexes); ++i) { + fts_index_cache_t* index_cache; + + index_cache = static_cast( + ib_vector_get(cache->indexes, i)); + + if (index_cache->index == index) { + + return(index_cache); + } + } + + return(NULL); +} + +#ifdef FTS_DEBUG +/*********************************************************************//** +Search the index cache for a get_doc structure. +@return the fts_get_doc_t item else NULL */ +static +fts_get_doc_t* +fts_get_index_get_doc( +/*==================*/ + fts_cache_t* cache, /*!< in: cache to search */ + const dict_index_t* index) /*!< in: index to search for */ +{ + ulint i; + + mysql_mutex_assert_owner(&cache->init_lock); + + for (i = 0; i < ib_vector_size(cache->get_docs); ++i) { + fts_get_doc_t* get_doc; + + get_doc = static_cast( + ib_vector_get(cache->get_docs, i)); + + if (get_doc->index_cache->index == index) { + + return(get_doc); + } + } + + return(NULL); +} +#endif + +/**********************************************************************//** +Find an existing word, or if not found, create one and return it. +@return specified word token */ +static +fts_tokenizer_word_t* +fts_tokenizer_word_get( +/*===================*/ + fts_cache_t* cache, /*!< in: cache */ + fts_index_cache_t* + index_cache, /*!< in: index cache */ + fts_string_t* text) /*!< in: node text */ +{ + fts_tokenizer_word_t* word; + ib_rbt_bound_t parent; + + mysql_mutex_assert_owner(&cache->lock); + + /* If it is a stopword, do not index it */ + if (!fts_check_token(text, + cache->stopword_info.cached_stopword, + index_cache->charset)) { + + return(NULL); + } + + /* Check if we found a match, if not then add word to tree. 
*/ + if (rbt_search(index_cache->words, &parent, text) != 0) { + mem_heap_t* heap; + fts_tokenizer_word_t new_word; + + heap = static_cast(cache->sync_heap->arg); + + new_word.nodes = ib_vector_create( + cache->sync_heap, sizeof(fts_node_t), 4); + + fts_string_dup(&new_word.text, text, heap); + + parent.last = rbt_add_node( + index_cache->words, &parent, &new_word); + + /* Take into account the RB tree memory use and the vector. */ + cache->total_size += sizeof(new_word) + + sizeof(ib_rbt_node_t) + + text->f_len + + (sizeof(fts_node_t) * 4) + + sizeof(*new_word.nodes); + + ut_ad(rbt_validate(index_cache->words)); + } + + word = rbt_value(fts_tokenizer_word_t, parent.last); + + return(word); +} + +/**********************************************************************//** +Add the given doc_id/word positions to the given node's ilist. */ +void +fts_cache_node_add_positions( +/*=========================*/ + fts_cache_t* cache, /*!< in: cache */ + fts_node_t* node, /*!< in: word node */ + doc_id_t doc_id, /*!< in: doc id */ + ib_vector_t* positions) /*!< in: fts_token_t::positions */ +{ + ulint i; + byte* ptr; + byte* ilist; + ulint enc_len; + ulint last_pos; + byte* ptr_start; + doc_id_t doc_id_delta; + +#ifdef SAFE_MUTEX + if (cache) { + mysql_mutex_assert_owner(&cache->lock); + } +#endif /* SAFE_MUTEX */ + + ut_ad(doc_id >= node->last_doc_id); + + /* Calculate the space required to store the ilist. */ + doc_id_delta = doc_id - node->last_doc_id; + enc_len = fts_get_encoded_len(doc_id_delta); + + last_pos = 0; + for (i = 0; i < ib_vector_size(positions); i++) { + ulint pos = *(static_cast( + ib_vector_get(positions, i))); + + ut_ad(last_pos == 0 || pos > last_pos); + + enc_len += fts_get_encoded_len(pos - last_pos); + last_pos = pos; + } + + /* The 0x00 byte at the end of the token positions list. */ + enc_len++; + + if ((node->ilist_size_alloc - node->ilist_size) >= enc_len) { + /* No need to allocate more space, we can fit in the new + data at the end of the old one. */ + ilist = NULL; + ptr = node->ilist + node->ilist_size; + } else { + ulint new_size = node->ilist_size + enc_len; + + /* Over-reserve space by a fixed size for small lengths and + by 20% for lengths >= 48 bytes. */ + if (new_size < 16) { + new_size = 16; + } else if (new_size < 32) { + new_size = 32; + } else if (new_size < 48) { + new_size = 48; + } else { + new_size = new_size * 6 / 5; + } + + ilist = static_cast(ut_malloc_nokey(new_size)); + ptr = ilist + node->ilist_size; + + node->ilist_size_alloc = new_size; + if (cache) { + cache->total_size += new_size; + } + } + + ptr_start = ptr; + + /* Encode the new fragment. */ + ptr = fts_encode_int(doc_id_delta, ptr); + + last_pos = 0; + for (i = 0; i < ib_vector_size(positions); i++) { + ulint pos = *(static_cast( + ib_vector_get(positions, i))); + + ptr = fts_encode_int(pos - last_pos, ptr); + last_pos = pos; + } + + *ptr++ = 0; + + ut_a(enc_len == (ulint)(ptr - ptr_start)); + + if (ilist) { + /* Copy old ilist to the start of the new one and switch the + new one into place in the node. */ + if (node->ilist_size > 0) { + memcpy(ilist, node->ilist, node->ilist_size); + ut_free(node->ilist); + if (cache) { + cache->total_size -= node->ilist_size; + } + } + + node->ilist = ilist; + } + + node->ilist_size += enc_len; + + if (node->first_doc_id == FTS_NULL_DOC_ID) { + node->first_doc_id = doc_id; + } + + node->last_doc_id = doc_id; + ++node->doc_count; +} + +/**********************************************************************//** +Add document to the cache. 
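fts_cache_node_add_positions() above appends one fragment per document to a node's ilist: the doc id as a delta from the node's last doc id, then each position as a delta from the previous position, all variable-length encoded and closed with a 0x00 byte. The sketch below uses a common LEB128-style varint purely for illustration; the actual byte layout of fts_encode_int() lives in fts0vlc.h and differs, but the delta-plus-terminator framing is as above.

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// LEB128-style varint: 7 payload bits per byte, high bit = "more follows".
// This encoding is an assumption for illustration; the real fts_encode_int()
// byte layout differs.
void encode_varint(std::vector<uint8_t>& out, uint64_t v)
{
    while (v >= 0x80) { out.push_back(uint8_t(v) | 0x80); v >>= 7; }
    out.push_back(uint8_t(v));
}

// One ilist fragment, framed as in the function above: delta-encoded doc id,
// delta-encoded ascending positions, then a 0x00 terminator byte.
void append_fragment(std::vector<uint8_t>& ilist,
                     uint64_t& last_doc_id, uint64_t doc_id,
                     const std::vector<uint64_t>& positions)
{
    assert(doc_id >= last_doc_id);               // mirrors the ut_ad() above
    encode_varint(ilist, doc_id - last_doc_id);
    uint64_t last_pos = 0;
    for (uint64_t pos : positions) {
        assert(last_pos == 0 || pos > last_pos); // positions ascend
        encode_varint(ilist, pos - last_pos);    // small gaps -> few bytes
        last_pos = pos;
    }
    ilist.push_back(0x00);                       // end of this doc's entry
    last_doc_id = doc_id;
}

int main()
{
    std::vector<uint8_t> ilist;
    uint64_t last_doc = 0;
    append_fragment(ilist, last_doc, 1000, {3, 17, 90});
    append_fragment(ilist, last_doc, 1002, {5}); // doc delta 2: one byte
    assert(last_doc == 1002);
}
```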
*/ +static +void +fts_cache_add_doc( +/*==============*/ + fts_cache_t* cache, /*!< in: cache */ + fts_index_cache_t* + index_cache, /*!< in: index cache */ + doc_id_t doc_id, /*!< in: doc id to add */ + ib_rbt_t* tokens) /*!< in: document tokens */ +{ + const ib_rbt_node_t* node; + ulint n_words; + fts_doc_stats_t* doc_stats; + + if (!tokens) { + return; + } + + mysql_mutex_assert_owner(&cache->lock); + + n_words = rbt_size(tokens); + + for (node = rbt_first(tokens); node; node = rbt_first(tokens)) { + + fts_tokenizer_word_t* word; + fts_node_t* fts_node = NULL; + fts_token_t* token = rbt_value(fts_token_t, node); + + /* Find and/or add token to the cache. */ + word = fts_tokenizer_word_get( + cache, index_cache, &token->text); + + if (!word) { + ut_free(rbt_remove_node(tokens, node)); + continue; + } + + if (ib_vector_size(word->nodes) > 0) { + fts_node = static_cast( + ib_vector_last(word->nodes)); + } + + if (fts_node == NULL || fts_node->synced + || fts_node->ilist_size > FTS_ILIST_MAX_SIZE + || doc_id < fts_node->last_doc_id) { + + fts_node = static_cast( + ib_vector_push(word->nodes, NULL)); + + memset(fts_node, 0x0, sizeof(*fts_node)); + + cache->total_size += sizeof(*fts_node); + } + + fts_cache_node_add_positions( + cache, fts_node, doc_id, token->positions); + + ut_free(rbt_remove_node(tokens, node)); + } + + ut_a(rbt_empty(tokens)); + + /* Add to doc ids processed so far. */ + doc_stats = static_cast( + ib_vector_push(index_cache->doc_stats, NULL)); + + doc_stats->doc_id = doc_id; + doc_stats->word_count = n_words; + + /* Add the doc stats memory usage too. */ + cache->total_size += sizeof(*doc_stats); + + if (doc_id > cache->sync->max_doc_id) { + cache->sync->max_doc_id = doc_id; + } +} + +/** Drop a table. +@param trx transaction +@param table_name FTS_ table name +@param rename whether to rename before dropping +@return error code +@retval DB_SUCCESS if the table was dropped +@retval DB_FAIL if the table did not exist */ +static dberr_t fts_drop_table(trx_t *trx, const char *table_name, bool rename) +{ + if (dict_table_t *table= dict_table_open_on_name(table_name, true, + DICT_ERR_IGNORE_TABLESPACE)) + { + table->release(); + if (rename) + { + mem_heap_t *heap= mem_heap_create(FN_REFLEN); + char *tmp= dict_mem_create_temporary_tablename(heap, table->name.m_name, + table->id); + dberr_t err= row_rename_table_for_mysql(table->name.m_name, tmp, trx, + false); + mem_heap_free(heap); + if (err != DB_SUCCESS) + { + ib::error() << "Unable to rename table " << table_name << ": " << err; + return err; + } + } + if (dberr_t err= trx->drop_table(*table)) + { + ib::error() << "Unable to drop table " << table->name << ": " << err; + return err; + } + +#ifdef UNIV_DEBUG + for (auto &p : trx->mod_tables) + { + if (p.first == table) + p.second.set_aux_table(); + } +#endif /* UNIV_DEBUG */ + return DB_SUCCESS; + } + + return DB_FAIL; +} + +/****************************************************************//** +Rename a single auxiliary table due to database name change. 
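fts_cache_add_doc() above appends a token's postings to the word's last node unless something forces a fresh node: the node was already synced to disk, its ilist outgrew FTS_ILIST_MAX_SIZE, or the incoming doc id would make the delta negative. The predicate, isolated (the size cap here is an illustrative number, not the real constant):

```cpp
#include <cassert>
#include <cstdint>

// Illustrative cap standing in for FTS_ILIST_MAX_SIZE (a build-time
// constant in the real code; this number is made up).
const uint64_t ILIST_MAX = 7500;

struct Node {
    bool     synced;        // already written out by a previous SYNC
    uint64_t ilist_size;
    uint64_t last_doc_id;
};

// The reuse rule from fts_cache_add_doc() above: append to the word's
// last node unless one of these conditions forces a fresh node.
bool need_new_node(const Node* last, uint64_t doc_id)
{
    return last == nullptr
        || last->synced                  // frozen by a previous SYNC
        || last->ilist_size > ILIST_MAX  // fragment grew too large
        || doc_id < last->last_doc_id;   // deltas must stay non-negative
}

int main()
{
    Node n{false, 100, 42};
    assert(!need_new_node(&n, 43));      // in-order doc id: keep appending
    assert(need_new_node(&n, 41));       // out-of-order: new node
    n.synced = true;
    assert(need_new_node(&n, 43));       // synced node is never reused
}
```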
+@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_rename_one_aux_table( +/*=====================*/ + const char* new_name, /*!< in: new parent tbl name */ + const char* fts_table_old_name, /*!< in: old aux tbl name */ + trx_t* trx) /*!< in: transaction */ +{ + char fts_table_new_name[MAX_TABLE_NAME_LEN]; + ulint new_db_name_len = dict_get_db_name_len(new_name); + ulint old_db_name_len = dict_get_db_name_len(fts_table_old_name); + ulint table_new_name_len = strlen(fts_table_old_name) + + new_db_name_len - old_db_name_len; + + /* Check if the new and old database names are the same, if so, + nothing to do */ + ut_ad((new_db_name_len != old_db_name_len) + || strncmp(new_name, fts_table_old_name, old_db_name_len) != 0); + + /* Get the database name from "new_name", and table name + from the fts_table_old_name */ + strncpy(fts_table_new_name, new_name, new_db_name_len); + strncpy(fts_table_new_name + new_db_name_len, + strchr(fts_table_old_name, '/'), + table_new_name_len - new_db_name_len); + fts_table_new_name[table_new_name_len] = 0; + + return row_rename_table_for_mysql( + fts_table_old_name, fts_table_new_name, trx, false); +} + +/****************************************************************//** +Rename auxiliary tables for all fts index for a table. This(rename) +is due to database name change +@return DB_SUCCESS or error code */ +dberr_t +fts_rename_aux_tables( +/*==================*/ + dict_table_t* table, /*!< in: user Table */ + const char* new_name, /*!< in: new table name */ + trx_t* trx) /*!< in: transaction */ +{ + ulint i; + fts_table_t fts_table; + + FTS_INIT_FTS_TABLE(&fts_table, NULL, FTS_COMMON_TABLE, table); + + dberr_t err = DB_SUCCESS; + char old_table_name[MAX_FULL_NAME_LEN]; + + /* Rename common auxiliary tables */ + for (i = 0; fts_common_tables[i] != NULL; ++i) { + fts_table.suffix = fts_common_tables[i]; + fts_get_table_name(&fts_table, old_table_name, true); + + err = fts_rename_one_aux_table(new_name, old_table_name, trx); + + if (err != DB_SUCCESS) { + return(err); + } + } + + fts_t* fts = table->fts; + + /* Rename index specific auxiliary tables */ + for (i = 0; fts->indexes != 0 && i < ib_vector_size(fts->indexes); + ++i) { + dict_index_t* index; + + index = static_cast( + ib_vector_getp(fts->indexes, i)); + + FTS_INIT_INDEX_TABLE(&fts_table, NULL, FTS_INDEX_TABLE, index); + + for (ulint j = 0; j < FTS_NUM_AUX_INDEX; ++j) { + fts_table.suffix = fts_get_suffix(j); + fts_get_table_name(&fts_table, old_table_name, true); + + err = fts_rename_one_aux_table( + new_name, old_table_name, trx); + + DBUG_EXECUTE_IF("fts_rename_failure", + err = DB_DEADLOCK; + fts_sql_rollback(trx);); + + if (err != DB_SUCCESS) { + return(err); + } + } + } + + return(DB_SUCCESS); +} + +/** Lock an internal FTS_ table, before fts_drop_table() */ +static dberr_t fts_lock_table(trx_t *trx, const char *table_name) +{ + ut_ad(purge_sys.must_wait_FTS()); + + if (dict_table_t *table= dict_table_open_on_name(table_name, false, + DICT_ERR_IGNORE_TABLESPACE)) + { + dberr_t err= lock_table_for_trx(table, trx, LOCK_X); + /* Wait for purge threads to stop using the table. */ + for (uint n= 15; table->get_ref_count() > 1; ) + { + if (!--n) + { + err= DB_LOCK_WAIT_TIMEOUT; + goto fail; + } + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + } +fail: + table->release(); + return err; + } + return DB_SUCCESS; +} + +/** Lock the internal FTS_ tables for an index, before fts_drop_index_tables(). 
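fts_lock_table() above, after taking the X lock, waits for purge threads to release the table by polling the reference count: up to 15 polls, 50ms apart, then DB_LOCK_WAIT_TIMEOUT. The same bounded-wait shape, with get_ref_count() as a hypothetical stand-in:

```cpp
#include <chrono>
#include <thread>

enum Err { OK, LOCK_WAIT_TIMEOUT };

// Shape of the wait in fts_lock_table() above: poll the reference count
// up to 15 times with 50ms sleeps so purge threads can drop their
// references, then give up (~750ms budget).
template <typename Table>
Err wait_for_sole_reference(const Table& table)
{
    for (unsigned n = 15; table.get_ref_count() > 1; ) {
        if (!--n)
            return LOCK_WAIT_TIMEOUT;
        std::this_thread::sleep_for(std::chrono::milliseconds(50));
    }
    return OK;
}

// Toy table whose extra references drain a little on every poll.
struct FakeTable {
    mutable int refs = 3;
    int get_ref_count() const { return refs > 1 ? refs-- : refs; }
};

int main()
{
    FakeTable t;
    return wait_for_sole_reference(t) == OK ? 0 : 1;
}
```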
+@param trx transaction +@param index fulltext index */ +dberr_t fts_lock_index_tables(trx_t *trx, const dict_index_t &index) +{ + ut_ad(index.type & DICT_FTS); + fts_table_t fts_table; + char table_name[MAX_FULL_NAME_LEN]; + FTS_INIT_INDEX_TABLE(&fts_table, nullptr, FTS_INDEX_TABLE, (&index)); + for (const fts_index_selector_t *s= fts_index_selector; s->suffix; s++) + { + fts_table.suffix= s->suffix; + fts_get_table_name(&fts_table, table_name, false); + if (dberr_t err= fts_lock_table(trx, table_name)) + return err; + } + return DB_SUCCESS; +} + +/** Lock the internal common FTS_ tables, before fts_drop_common_tables(). +@param trx transaction +@param table table containing FULLTEXT INDEX +@return DB_SUCCESS or error code */ +dberr_t fts_lock_common_tables(trx_t *trx, const dict_table_t &table) +{ + fts_table_t fts_table; + char table_name[MAX_FULL_NAME_LEN]; + + FTS_INIT_FTS_TABLE(&fts_table, nullptr, FTS_COMMON_TABLE, (&table)); + + for (const char **suffix= fts_common_tables; *suffix; suffix++) + { + fts_table.suffix= *suffix; + fts_get_table_name(&fts_table, table_name, false); + if (dberr_t err= fts_lock_table(trx, table_name)) + return err; + } + return DB_SUCCESS; +} + +/** This function make sure that table doesn't +have any other reference count. +@param table_name table name */ +static void fts_table_no_ref_count(const char *table_name) +{ + dict_table_t *table= dict_table_open_on_name( + table_name, true, DICT_ERR_IGNORE_TABLESPACE); + if (!table) + return; + + while (table->get_ref_count() > 1) + { + dict_sys.unlock(); + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + dict_sys.lock(SRW_LOCK_CALL); + } + + table->release(); +} + +/** Stop the purge thread and check n_ref_count of all auxiliary +and common table associated with the fts table. +@param table parent FTS table +@param already_stopped True indicates purge threads were + already stopped*/ +void purge_sys_t::stop_FTS(const dict_table_t &table, bool already_stopped) +{ + if (!already_stopped) + purge_sys.stop_FTS(); + + dict_sys.lock(SRW_LOCK_CALL); + + fts_table_t fts_table; + char table_name[MAX_FULL_NAME_LEN]; + + FTS_INIT_FTS_TABLE(&fts_table, nullptr, FTS_COMMON_TABLE, (&table)); + + for (const char **suffix= fts_common_tables; *suffix; suffix++) + { + fts_table.suffix= *suffix; + fts_get_table_name(&fts_table, table_name, true); + fts_table_no_ref_count(table_name); + } + + if (table.fts) + { + if (auto indexes= table.fts->indexes) + { + for (ulint i= 0;i < ib_vector_size(indexes); ++i) + { + const dict_index_t *index= static_cast( + ib_vector_getp(indexes, i)); + FTS_INIT_INDEX_TABLE(&fts_table, nullptr, FTS_INDEX_TABLE, index); + for (const fts_index_selector_t *s= fts_index_selector; + s->suffix; s++) + { + fts_table.suffix= s->suffix; + fts_get_table_name(&fts_table, table_name, true); + fts_table_no_ref_count(table_name); + } + } + } + } + + dict_sys.unlock(); +} + +/** Lock the internal FTS_ tables for table, before fts_drop_tables(). 
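The lock, drop and stop_FTS paths above all enumerate the same aux-table universe: five common suffixes per table plus six INDEX_* suffixes per FULLTEXT index. A sketch of that double loop, with a callback standing in for fts_get_table_name() plus the per-table work:

```cpp
#include <cstddef>
#include <functional>
#include <iostream>

// The two suffix sets from earlier in this file.
const char* common_suffixes[] = {
    "BEING_DELETED", "BEING_DELETED_CACHE", "CONFIG",
    "DELETED", "DELETED_CACHE", nullptr
};
const char* index_suffixes[] = {
    "INDEX_1", "INDEX_2", "INDEX_3", "INDEX_4", "INDEX_5", "INDEX_6", nullptr
};

// Walk every aux-table suffix for a table with n_fts_indexes FULLTEXT
// indexes: the same double loop the lock/drop/stop_FTS paths above run.
void for_each_aux_suffix(std::size_t n_fts_indexes,
                         const std::function<void(const char*)>& fn)
{
    for (const char** s = common_suffixes; *s; ++s)
        fn(*s);                          // one common set per table
    for (std::size_t i = 0; i < n_fts_indexes; ++i)
        for (const char** s = index_suffixes; *s; ++s)
            fn(*s);                      // one set per FULLTEXT index
}

int main()
{
    std::size_t count = 0;
    for_each_aux_suffix(2, [&](const char*) { ++count; });
    std::cout << count << '\n';          // 5 + 2 * 6 = 17 aux tables
}
```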
+@param trx transaction +@param table table containing FULLTEXT INDEX +@return DB_SUCCESS or error code */ +dberr_t fts_lock_tables(trx_t *trx, const dict_table_t &table) +{ + if (dberr_t err= fts_lock_common_tables(trx, table)) + return err; + + if (!table.fts) + return DB_SUCCESS; + + auto indexes= table.fts->indexes; + if (!indexes) + return DB_SUCCESS; + + for (ulint i= 0; i < ib_vector_size(indexes); ++i) + if (dberr_t err= + fts_lock_index_tables(trx, *static_cast + (ib_vector_getp(indexes, i)))) + return err; + return DB_SUCCESS; +} + +/** Drops the common ancillary tables needed for supporting an FTS index +on the given table. +@param trx transaction to drop fts common table +@param fts_table table with an FTS index +@param rename whether to rename before dropping +@return DB_SUCCESS or error code */ +static dberr_t fts_drop_common_tables(trx_t *trx, fts_table_t *fts_table, + bool rename) +{ + dberr_t error= DB_SUCCESS; + + for (ulint i= 0; fts_common_tables[i]; ++i) + { + char table_name[MAX_FULL_NAME_LEN]; + + fts_table->suffix= fts_common_tables[i]; + fts_get_table_name(fts_table, table_name, true); + + if (dberr_t err= fts_drop_table(trx, table_name, rename)) + { + if (trx->state != TRX_STATE_ACTIVE) + return err; + /* We only return the status of the last error. */ + if (err != DB_FAIL) + error= err; + } + } + + return error; +} + +/****************************************************************//** +Drops FTS auxiliary tables for an FTS index +@return DB_SUCCESS or error code */ +dberr_t fts_drop_index_tables(trx_t *trx, const dict_index_t &index) +{ + ulint i; + fts_table_t fts_table; + dberr_t error = DB_SUCCESS; + + FTS_INIT_INDEX_TABLE(&fts_table, nullptr, FTS_INDEX_TABLE, (&index)); + + for (i = 0; i < FTS_NUM_AUX_INDEX; ++i) { + dberr_t err; + char table_name[MAX_FULL_NAME_LEN]; + + fts_table.suffix = fts_get_suffix(i); + fts_get_table_name(&fts_table, table_name, true); + + err = fts_drop_table(trx, table_name, false); + + /* We only return the status of the last error. */ + if (err != DB_SUCCESS && err != DB_FAIL) { + error = err; + } + } + + return(error); +} + +/****************************************************************//** +Drops FTS ancillary tables needed for supporting an FTS index +on the given table. +@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_drop_all_index_tables( +/*======================*/ + trx_t* trx, /*!< in: transaction */ + const fts_t* fts) /*!< in: fts instance */ +{ + dberr_t error= DB_SUCCESS; + auto indexes= fts->indexes; + if (!indexes) + return DB_SUCCESS; + + for (ulint i= 0; i < ib_vector_size(indexes); ++i) + if (dberr_t err= fts_drop_index_tables(trx, + *static_cast + (ib_vector_getp(indexes, i)))) + error= err; + return error; +} + +/** Drop the internal FTS_ tables for table. +@param trx transaction +@param table table containing FULLTEXT INDEX +@return DB_SUCCESS or error code */ +dberr_t fts_drop_tables(trx_t *trx, const dict_table_t &table) +{ + dberr_t error; + fts_table_t fts_table; + + FTS_INIT_FTS_TABLE(&fts_table, NULL, FTS_COMMON_TABLE, (&table)); + + error = fts_drop_common_tables(trx, &fts_table, false); + + if (error == DB_SUCCESS && table.fts) { + error = fts_drop_all_index_tables(trx, table.fts); + } + + return(error); +} + +/** Create dict_table_t object for FTS Aux tables. 
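Note the error folding in fts_drop_common_tables() and fts_drop_index_tables() above: a missing table (DB_FAIL) is deliberately ignored so a partially dropped set can be cleaned up again, and only the last genuine error is reported. The rule, isolated with local stand-ins for dberr_t:

```cpp
#include <cassert>
#include <vector>

enum Err { DB_SUCCESS, DB_FAIL, DB_ERROR };   // local stand-ins for dberr_t

// Folding rule from the drop loops above: a missing table (DB_FAIL) is
// ignorable so the cleanup stays idempotent; only the last real failure
// is reported to the caller.
Err fold_drop_results(const std::vector<Err>& per_table)
{
    Err error = DB_SUCCESS;
    for (Err err : per_table)
        if (err != DB_SUCCESS && err != DB_FAIL)
            error = err;                      // remember last genuine error
    return error;
}

int main()
{
    assert(fold_drop_results({DB_SUCCESS, DB_FAIL, DB_SUCCESS}) == DB_SUCCESS);
    assert(fold_drop_results({DB_FAIL, DB_ERROR, DB_FAIL}) == DB_ERROR);
}
```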
+@param[in] aux_table_name FTS Aux table name +@param[in] table table object of FTS Index +@param[in] n_cols number of columns for FTS Aux table +@return table object for FTS Aux table */ +static +dict_table_t* +fts_create_in_mem_aux_table( + const char* aux_table_name, + const dict_table_t* table, + ulint n_cols) +{ + dict_table_t* new_table = dict_table_t::create( + {aux_table_name,strlen(aux_table_name)}, + nullptr, n_cols, 0, table->flags, + table->space_id == TRX_SYS_SPACE + ? 0 : table->space_id == SRV_TMP_SPACE_ID + ? DICT_TF2_TEMPORARY : DICT_TF2_USE_FILE_PER_TABLE); + + if (DICT_TF_HAS_DATA_DIR(table->flags)) { + ut_ad(table->data_dir_path != NULL); + new_table->data_dir_path = mem_heap_strdup( + new_table->heap, table->data_dir_path); + } + + return(new_table); +} + +/** Function to create on FTS common table. +@param[in,out] trx InnoDB transaction +@param[in] table Table that has FTS Index +@param[in] fts_table_name FTS AUX table name +@param[in] fts_suffix FTS AUX table suffix +@param[in,out] heap temporary memory heap +@return table object if created, else NULL */ +static +dict_table_t* +fts_create_one_common_table( + trx_t* trx, + const dict_table_t* table, + const char* fts_table_name, + const char* fts_suffix, + mem_heap_t* heap) +{ + dict_table_t* new_table; + dberr_t error; + bool is_config = strcmp(fts_suffix, "CONFIG") == 0; + + if (!is_config) { + + new_table = fts_create_in_mem_aux_table( + fts_table_name, table, FTS_DELETED_TABLE_NUM_COLS); + + dict_mem_table_add_col( + new_table, heap, "doc_id", DATA_INT, DATA_UNSIGNED, + FTS_DELETED_TABLE_COL_LEN); + } else { + /* Config table has different schema. */ + new_table = fts_create_in_mem_aux_table( + fts_table_name, table, FTS_CONFIG_TABLE_NUM_COLS); + + dict_mem_table_add_col( + new_table, heap, "key", DATA_VARCHAR, 0, + FTS_CONFIG_TABLE_KEY_COL_LEN); + + dict_mem_table_add_col( + new_table, heap, "value", DATA_VARCHAR, DATA_NOT_NULL, + FTS_CONFIG_TABLE_VALUE_COL_LEN); + } + + dict_table_add_system_columns(new_table, heap); + error = row_create_table_for_mysql(new_table, trx); + + if (error == DB_SUCCESS) { + + dict_index_t* index = dict_mem_index_create( + new_table, "FTS_COMMON_TABLE_IND", + DICT_UNIQUE|DICT_CLUSTERED, 1); + + if (!is_config) { + dict_mem_index_add_field(index, "doc_id", 0); + } else { + dict_mem_index_add_field(index, "key", 0); + } + + error = row_create_index_for_mysql(index, trx, NULL, + FIL_ENCRYPTION_DEFAULT, + FIL_DEFAULT_ENCRYPTION_KEY); + if (error == DB_SUCCESS) { + return new_table; + } + } + + ib::warn() << "Failed to create FTS common table " << fts_table_name; + trx->error_state = error; + return NULL; +} + +/** Creates the common auxiliary tables needed for supporting an FTS index +on the given table. +The following tables are created. 
+CREATE TABLE $FTS_PREFIX_DELETED + (doc_id BIGINT UNSIGNED, UNIQUE CLUSTERED INDEX on doc_id) +CREATE TABLE $FTS_PREFIX_DELETED_CACHE + (doc_id BIGINT UNSIGNED, UNIQUE CLUSTERED INDEX on doc_id) +CREATE TABLE $FTS_PREFIX_BEING_DELETED + (doc_id BIGINT UNSIGNED, UNIQUE CLUSTERED INDEX on doc_id) +CREATE TABLE $FTS_PREFIX_BEING_DELETED_CACHE + (doc_id BIGINT UNSIGNED, UNIQUE CLUSTERED INDEX on doc_id) +CREATE TABLE $FTS_PREFIX_CONFIG + (key CHAR(50), value CHAR(200), UNIQUE CLUSTERED INDEX on key) +@param[in,out] trx transaction +@param[in,out] table table with FTS index +@param[in] skip_doc_id_index Skip index on doc id +@return DB_SUCCESS if succeed */ +dberr_t +fts_create_common_tables( + trx_t* trx, + dict_table_t* table, + bool skip_doc_id_index) +{ + dberr_t error; + que_t* graph; + fts_table_t fts_table; + mem_heap_t* heap = mem_heap_create(1024); + pars_info_t* info; + char fts_name[MAX_FULL_NAME_LEN]; + char full_name[sizeof(fts_common_tables) / sizeof(char*)] + [MAX_FULL_NAME_LEN]; + + dict_index_t* index = NULL; + + FTS_INIT_FTS_TABLE(&fts_table, NULL, FTS_COMMON_TABLE, table); + + error = fts_drop_common_tables(trx, &fts_table, true); + + if (error != DB_SUCCESS) { + + goto func_exit; + } + + /* Create the FTS tables that are common to an FTS index. */ + for (ulint i = 0; fts_common_tables[i] != NULL; ++i) { + + fts_table.suffix = fts_common_tables[i]; + fts_get_table_name(&fts_table, full_name[i], true); + dict_table_t* common_table = fts_create_one_common_table( + trx, table, full_name[i], fts_table.suffix, heap); + + if (!common_table) { + trx->error_state = DB_SUCCESS; + error = DB_ERROR; + goto func_exit; + } + + mem_heap_empty(heap); + } + + /* Write the default settings to the config table. */ + info = pars_info_create(); + + fts_table.suffix = "CONFIG"; + fts_get_table_name(&fts_table, fts_name, true); + pars_info_bind_id(info, "config_table", fts_name); + + graph = pars_sql( + info, fts_config_table_insert_values_sql); + + error = fts_eval_sql(trx, graph); + + que_graph_free(graph); + + if (error != DB_SUCCESS || skip_doc_id_index) { + + goto func_exit; + } + + if (table->versioned()) { + index = dict_mem_index_create(table, FTS_DOC_ID_INDEX_NAME, + DICT_UNIQUE, 2); + dict_mem_index_add_field(index, FTS_DOC_ID_COL_NAME, 0); + dict_mem_index_add_field(index, table->cols[table->vers_end].name(*table), 0); + } else { + index = dict_mem_index_create(table, FTS_DOC_ID_INDEX_NAME, + DICT_UNIQUE, 1); + dict_mem_index_add_field(index, FTS_DOC_ID_COL_NAME, 0); + } + + error = row_create_index_for_mysql(index, trx, NULL, + FIL_ENCRYPTION_DEFAULT, + FIL_DEFAULT_ENCRYPTION_KEY); + +func_exit: + mem_heap_free(heap); + + return(error); +} + +/** Create one FTS auxiliary index table for an FTS index. 
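One detail of fts_create_common_tables() above: on system-versioned tables, FTS_DOC_ID_INDEX is built over two fields, FTS_DOC_ID plus the row-end column, presumably so history rows can carry the same doc id as the current row without violating uniqueness; plain tables index FTS_DOC_ID alone. The field-list choice, isolated:

```cpp
#include <cassert>
#include <string>
#include <vector>

// Field list for FTS_DOC_ID_INDEX as chosen in fts_create_common_tables()
// above. The "so history rows can reuse a doc id" rationale is an
// inference, not stated in the source.
std::vector<std::string> doc_id_index_fields(bool versioned,
                                             const std::string& row_end_col)
{
    std::vector<std::string> fields{"FTS_DOC_ID"};
    if (versioned)
        fields.push_back(row_end_col);   // e.g. the hidden row_end column
    return fields;
}

int main()
{
    assert(doc_id_index_fields(false, "row_end").size() == 1);
    assert(doc_id_index_fields(true, "row_end").size() == 2);
}
```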
+@param[in,out] trx transaction
+@param[in] index the index instance
+@param[in] fts_table fts_table structure
+@param[in,out] heap temporary memory heap
+@see row_merge_create_fts_sort_index()
+@return table object if created, else NULL */
+static
+dict_table_t*
+fts_create_one_index_table(
+ trx_t* trx,
+ const dict_index_t* index,
+ const fts_table_t* fts_table,
+ mem_heap_t* heap)
+{
+ dict_field_t* field;
+ dict_table_t* new_table;
+ char table_name[MAX_FULL_NAME_LEN];
+ dberr_t error;
+ CHARSET_INFO* charset;
+
+ ut_ad(index->type & DICT_FTS);
+
+ fts_get_table_name(fts_table, table_name, true);
+
+ new_table = fts_create_in_mem_aux_table(
+ table_name, fts_table->table,
+ FTS_AUX_INDEX_TABLE_NUM_COLS);
+
+ field = dict_index_get_nth_field(index, 0);
+ charset = fts_get_charset(field->col->prtype);
+
+ dict_mem_table_add_col(new_table, heap, "word",
+ charset == &my_charset_latin1
+ ? DATA_VARCHAR : DATA_VARMYSQL,
+ field->col->prtype,
+ FTS_MAX_WORD_LEN_IN_CHAR
+ * unsigned(field->col->mbmaxlen));
+
+ dict_mem_table_add_col(new_table, heap, "first_doc_id", DATA_INT,
+ DATA_NOT_NULL | DATA_UNSIGNED,
+ FTS_INDEX_FIRST_DOC_ID_LEN);
+
+ dict_mem_table_add_col(new_table, heap, "last_doc_id", DATA_INT,
+ DATA_NOT_NULL | DATA_UNSIGNED,
+ FTS_INDEX_LAST_DOC_ID_LEN);
+
+ dict_mem_table_add_col(new_table, heap, "doc_count", DATA_INT,
+ DATA_NOT_NULL | DATA_UNSIGNED,
+ FTS_INDEX_DOC_COUNT_LEN);
+
+ /* The precise type calculation is as follows:
+ least significant byte: MySQL type code (not applicable for sys cols)
+ second least : DATA_NOT_NULL | DATA_BINARY_TYPE
+ third least : the MySQL charset-collation code (DATA_MTYPE_MAX) */
+
+ dict_mem_table_add_col(
+ new_table, heap, "ilist", DATA_BLOB,
+ (DATA_MTYPE_MAX << 16) | DATA_UNSIGNED | DATA_NOT_NULL,
+ FTS_INDEX_ILIST_LEN);
+
+ dict_table_add_system_columns(new_table, heap);
+ error = row_create_table_for_mysql(new_table, trx);
+
+ if (error == DB_SUCCESS) {
+ dict_index_t* index = dict_mem_index_create(
+ new_table, "FTS_INDEX_TABLE_IND",
+ DICT_UNIQUE|DICT_CLUSTERED, 2);
+ dict_mem_index_add_field(index, "word", 0);
+ dict_mem_index_add_field(index, "first_doc_id", 0);
+
+ error = row_create_index_for_mysql(index, trx, NULL,
+ FIL_ENCRYPTION_DEFAULT,
+ FIL_DEFAULT_ENCRYPTION_KEY);
+
+ if (error == DB_SUCCESS) {
+ return new_table;
+ }
+ }
+
+ ib::warn() << "Failed to create FTS index table " << table_name;
+ trx->error_state = error;
+ return NULL;
+}
+
+/** Creates the column specific ancillary tables needed for supporting an
+FTS index on the given table.
+
+All FTS AUX Index tables have the following schema.
+CREATE TABLE $FTS_PREFIX_INDEX_[1-6](
+ word VARCHAR(FTS_MAX_WORD_LEN),
+ first_doc_id BIGINT UNSIGNED NOT NULL,
+ last_doc_id BIGINT UNSIGNED NOT NULL,
+ doc_count INT UNSIGNED NOT NULL,
+ ilist VARBINARY NOT NULL,
+ UNIQUE CLUSTERED INDEX ON (word, first_doc_id))
+@param[in,out] trx dictionary transaction
+@param[in] index fulltext index
+@param[in] id table id
+@return DB_SUCCESS or error code */
+dberr_t
+fts_create_index_tables(trx_t* trx, const dict_index_t* index, table_id_t id)
+{
+ ulint i;
+ fts_table_t fts_table;
+ dberr_t error = DB_SUCCESS;
+ mem_heap_t* heap = mem_heap_create(1024);
+
+ fts_table.type = FTS_INDEX_TABLE;
+ fts_table.index_id = index->id;
+ fts_table.table_id = id;
+ fts_table.table = index->table;
+
+ for (i = 0; i < FTS_NUM_AUX_INDEX && error == DB_SUCCESS; ++i) {
+ dict_table_t* new_table;
+
+ /* Create the FTS auxiliary tables that are specific
+ to an FTS index.
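+One table is created per suffix returned by fts_get_suffix(i), i.e.
+the INDEX_1 .. INDEX_6 partitions between which fts_select_index()
+later distributes words.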
*/
+ fts_table.suffix = fts_get_suffix(i);
+
+ new_table = fts_create_one_index_table(
+ trx, index, &fts_table, heap);
+
+ if (new_table == NULL) {
+ error = DB_FAIL;
+ break;
+ }
+
+ mem_heap_empty(heap);
+ }
+
+ mem_heap_free(heap);
+
+ return(error);
+}
+
+/******************************************************************//**
+Calculate the new state of a row given the existing state and a new event.
+@return new state of row */
+static
+fts_row_state
+fts_trx_row_get_new_state(
+/*======================*/
+ fts_row_state old_state, /*!< in: existing state of row */
+ fts_row_state event) /*!< in: new event */
+{
+ /* The rules for transforming states:
+
+ I = inserted
+ M = modified
+ D = deleted
+ N = nothing
+
+ M+D -> D:
+
+ If the row existed before the transaction started and it is modified
+ during the transaction, followed by a deletion of the row, only the
+ deletion will be signaled.
+
+ M+ -> M:
+
+ If the row existed before the transaction started and it is modified
+ more than once during the transaction, only the last modification
+ will be signaled.
+
+ IM*D -> N:
+
+ If a new row is added during the transaction (and possibly modified
+ after its initial insertion) but it is deleted before the end of the
+ transaction, nothing will be signaled.
+
+ IM* -> I:
+
+ If a new row is added during the transaction and modified after its
+ initial insertion, only the addition will be signaled.
+
+ M*DI -> M:
+
+ If the row existed before the transaction started and it is deleted,
+ then re-inserted, only a modification will be signaled. Note that
+ this case is only possible if the table is using the row's primary
+ key for FTS row ids, since those can be re-inserted by the user,
+ which is not true for InnoDB generated row ids.
+
+ It is easily seen that the above rules decompose such that we do not
+ need to store the row's entire history of events. Instead, we can
+ store just one state for the row and update that when new events
+ arrive. Then we can implement the above rules as a two-dimensional
+ look-up table, and get checking of invalid combinations "for free"
+ in the process. */
+
+ /* The lookup table for transforming states. old_state is the
+ Y-axis, event is the X-axis. */
+ static const fts_row_state table[4][4] = {
+ /* I M D N */
+ /* I */ { FTS_INVALID, FTS_INSERT, FTS_NOTHING, FTS_INVALID },
+ /* M */ { FTS_INVALID, FTS_MODIFY, FTS_DELETE, FTS_INVALID },
+ /* D */ { FTS_MODIFY, FTS_INVALID, FTS_INVALID, FTS_INVALID },
+ /* N */ { FTS_INVALID, FTS_INVALID, FTS_INVALID, FTS_INVALID }
+ };
+
+ fts_row_state result;
+
+ ut_a(old_state < FTS_INVALID);
+ ut_a(event < FTS_INVALID);
+
+ result = table[(int) old_state][(int) event];
+ ut_a(result != FTS_INVALID);
+
+ return(result);
+}
+
+/******************************************************************//**
+Create a savepoint instance.
+@return savepoint instance */
+static
+fts_savepoint_t*
+fts_savepoint_create(
+/*=================*/
+ ib_vector_t* savepoints, /*!< in/out: vector of savepoints */
+ const char* name, /*!< in: savepoint name */
+ mem_heap_t* heap) /*!< in: heap */
+{
+ fts_savepoint_t* savepoint;
+
+ savepoint = static_cast<fts_savepoint_t*>(
+ ib_vector_push(savepoints, NULL));
+
+ memset(savepoint, 0x0, sizeof(*savepoint));
+
+ if (name) {
+ savepoint->name = mem_heap_strdup(heap, name);
+ }
+
+ savepoint->tables = rbt_create(
+ sizeof(fts_trx_table_t*), fts_trx_table_cmp);
+
+ return(savepoint);
+}
+
+/******************************************************************//**
+Create an FTS trx.
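+As a brief structural sketch (see the code below): the returned
+fts_trx_t holds two vectors of savepoints, ftt->savepoints for the
+whole transaction and ftt->last_stmt for the current statement, and
+each savepoint owns an rb-tree of fts_trx_table_t entries.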
+@return FTS trx */
+fts_trx_t*
+fts_trx_create(
+/*===========*/
+ trx_t* trx) /*!< in/out: InnoDB
+ transaction */
+{
+ fts_trx_t* ftt;
+ ib_alloc_t* heap_alloc;
+ mem_heap_t* heap = mem_heap_create(1024);
+ trx_named_savept_t* savep;
+
+ ut_a(trx->fts_trx == NULL);
+
+ ftt = static_cast<fts_trx_t*>(mem_heap_alloc(heap, sizeof(fts_trx_t)));
+ ftt->trx = trx;
+ ftt->heap = heap;
+
+ heap_alloc = ib_heap_allocator_create(heap);
+
+ ftt->savepoints = static_cast<ib_vector_t*>(ib_vector_create(
+ heap_alloc, sizeof(fts_savepoint_t), 4));
+
+ ftt->last_stmt = static_cast<ib_vector_t*>(ib_vector_create(
+ heap_alloc, sizeof(fts_savepoint_t), 4));
+
+ /* The default instance has no name and no heap. */
+ fts_savepoint_create(ftt->savepoints, NULL, NULL);
+ fts_savepoint_create(ftt->last_stmt, NULL, NULL);
+
+ /* Copy the savepoints that were already set before. */
+ for (savep = UT_LIST_GET_FIRST(trx->trx_savepoints);
+ savep != NULL;
+ savep = UT_LIST_GET_NEXT(trx_savepoints, savep)) {
+
+ fts_savepoint_take(ftt, savep->name);
+ }
+
+ return(ftt);
+}
+
+/******************************************************************//**
+Create an FTS trx table.
+@return FTS trx table */
+static
+fts_trx_table_t*
+fts_trx_table_create(
+/*=================*/
+ fts_trx_t* fts_trx, /*!< in: FTS trx */
+ dict_table_t* table) /*!< in: table */
+{
+ fts_trx_table_t* ftt;
+
+ ftt = static_cast<fts_trx_table_t*>(
+ mem_heap_zalloc(fts_trx->heap, sizeof *ftt));
+
+ ftt->table = table;
+ ftt->fts_trx = fts_trx;
+
+ ftt->rows = rbt_create(sizeof(fts_trx_row_t), fts_trx_row_doc_id_cmp);
+
+ return(ftt);
+}
+
+/******************************************************************//**
+Clone an FTS trx table.
+@return FTS trx table */
+static
+fts_trx_table_t*
+fts_trx_table_clone(
+/*=================*/
+ const fts_trx_table_t* ftt_src) /*!< in: FTS trx */
+{
+ fts_trx_table_t* ftt;
+
+ ftt = static_cast<fts_trx_table_t*>(
+ mem_heap_alloc(ftt_src->fts_trx->heap, sizeof(*ftt)));
+
+ memset(ftt, 0x0, sizeof(*ftt));
+
+ ftt->table = ftt_src->table;
+ ftt->fts_trx = ftt_src->fts_trx;
+
+ ftt->rows = rbt_create(sizeof(fts_trx_row_t), fts_trx_row_doc_id_cmp);
+
+ /* Copy the rb tree values to the new savepoint. */
+ rbt_merge_uniq(ftt->rows, ftt_src->rows);
+
+ /* These are only added on commit. At this stage we only have
+ the updated row state. */
+ ut_a(ftt_src->added_doc_ids == NULL);
+
+ return(ftt);
+}
+
+/******************************************************************//**
+Initialize the FTS trx instance.
+@return FTS trx instance */
+static
+fts_trx_table_t*
+fts_trx_init(
+/*=========*/
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* table, /*!< in: FTS table instance */
+ ib_vector_t* savepoints) /*!< in: Savepoints */
+{
+ fts_trx_table_t* ftt;
+ ib_rbt_bound_t parent;
+ ib_rbt_t* tables;
+ fts_savepoint_t* savepoint;
+
+ savepoint = static_cast<fts_savepoint_t*>(ib_vector_last(savepoints));
+
+ tables = savepoint->tables;
+ rbt_search_cmp(tables, &parent, &table->id, fts_trx_table_id_cmp, NULL);
+
+ if (parent.result == 0) {
+ fts_trx_table_t** fttp;
+
+ fttp = rbt_value(fts_trx_table_t*, parent.last);
+ ftt = *fttp;
+ } else {
+ ftt = fts_trx_table_create(trx->fts_trx, table);
+ rbt_add_node(tables, &parent, &ftt);
+ }
+
+ ut_a(ftt->table == table);
+
+ return(ftt);
+}
+
+/******************************************************************//**
+Notify the FTS system about an operation on an FTS-indexed table.
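+For example, per the transition table in fts_trx_row_get_new_state():
+a row already recorded as FTS_MODIFY that receives an FTS_DELETE event
+collapses to FTS_DELETE, while FTS_INSERT followed by FTS_DELETE
+collapses to FTS_NOTHING and the row is dropped from the tree.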
*/ +static +void +fts_trx_table_add_op( +/*=================*/ + fts_trx_table_t*ftt, /*!< in: FTS trx table */ + doc_id_t doc_id, /*!< in: doc id */ + fts_row_state state, /*!< in: state of the row */ + ib_vector_t* fts_indexes) /*!< in: FTS indexes affected */ +{ + ib_rbt_t* rows; + ib_rbt_bound_t parent; + + rows = ftt->rows; + rbt_search(rows, &parent, &doc_id); + + /* Row id found, update state, and if new state is FTS_NOTHING, + we delete the row from our tree. */ + if (parent.result == 0) { + fts_trx_row_t* row = rbt_value(fts_trx_row_t, parent.last); + + row->state = fts_trx_row_get_new_state(row->state, state); + + if (row->state == FTS_NOTHING) { + if (row->fts_indexes) { + ib_vector_free(row->fts_indexes); + } + + ut_free(rbt_remove_node(rows, parent.last)); + row = NULL; + } else if (row->fts_indexes != NULL) { + ib_vector_free(row->fts_indexes); + row->fts_indexes = fts_indexes; + } + + } else { /* Row-id not found, create a new one. */ + fts_trx_row_t row; + + row.doc_id = doc_id; + row.state = state; + row.fts_indexes = fts_indexes; + + rbt_add_node(rows, &parent, &row); + } +} + +/******************************************************************//** +Notify the FTS system about an operation on an FTS-indexed table. */ +void +fts_trx_add_op( +/*===========*/ + trx_t* trx, /*!< in: InnoDB transaction */ + dict_table_t* table, /*!< in: table */ + doc_id_t doc_id, /*!< in: new doc id */ + fts_row_state state, /*!< in: state of the row */ + ib_vector_t* fts_indexes) /*!< in: FTS indexes affected + (NULL=all) */ +{ + fts_trx_table_t* tran_ftt; + fts_trx_table_t* stmt_ftt; + + if (!trx->fts_trx) { + trx->fts_trx = fts_trx_create(trx); + } + + tran_ftt = fts_trx_init(trx, table, trx->fts_trx->savepoints); + stmt_ftt = fts_trx_init(trx, table, trx->fts_trx->last_stmt); + + fts_trx_table_add_op(tran_ftt, doc_id, state, fts_indexes); + fts_trx_table_add_op(stmt_ftt, doc_id, state, fts_indexes); +} + +/******************************************************************//** +Fetch callback that converts a textual document id to a binary value and +stores it in the given place. +@return always returns NULL */ +static +ibool +fts_fetch_store_doc_id( +/*===================*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: doc_id_t* to store + doc_id in */ +{ + int n_parsed; + sel_node_t* node = static_cast(row); + doc_id_t* doc_id = static_cast(user_arg); + dfield_t* dfield = que_node_get_val(node->select_list); + dtype_t* type = dfield_get_type(dfield); + ulint len = dfield_get_len(dfield); + + char buf[32]; + + ut_a(dtype_get_mtype(type) == DATA_VARCHAR); + ut_a(len > 0 && len < sizeof(buf)); + + memcpy(buf, dfield_get_data(dfield), len); + buf[len] = '\0'; + + n_parsed = sscanf(buf, FTS_DOC_ID_FORMAT, doc_id); + ut_a(n_parsed == 1); + + return(FALSE); +} + +#ifdef FTS_CACHE_SIZE_DEBUG +/******************************************************************//** +Get the max cache size in bytes. If there is an error reading the +value we simply print an error message here and return the default +value to the caller. +@return max cache size in bytes */ +static +ulint +fts_get_max_cache_size( +/*===================*/ + trx_t* trx, /*!< in: transaction */ + fts_table_t* fts_table) /*!< in: table instance */ +{ + dberr_t error; + fts_string_t value; + ulong cache_size_in_mb; + + /* Set to the default value. */ + cache_size_in_mb = FTS_CACHE_SIZE_LOWER_LIMIT_IN_MB; + + /* We set the length of value to the max bytes it can hold. 
This
+information is used by the callback that reads the value. */
+ value.f_n_char = 0;
+ value.f_len = FTS_MAX_CONFIG_VALUE_LEN;
+ value.f_str = ut_malloc_nokey(value.f_len + 1);
+
+ error = fts_config_get_value(
+ trx, fts_table, FTS_MAX_CACHE_SIZE_IN_MB, &value);
+
+ if (UNIV_LIKELY(error == DB_SUCCESS)) {
+ value.f_str[value.f_len] = 0;
+ cache_size_in_mb = strtoul((char*) value.f_str, NULL, 10);
+
+ if (cache_size_in_mb > FTS_CACHE_SIZE_UPPER_LIMIT_IN_MB) {
+
+ ib::warn() << "FTS max cache size ("
+ << cache_size_in_mb << ") out of range."
+ " Minimum value is "
+ << FTS_CACHE_SIZE_LOWER_LIMIT_IN_MB
+ << "MB and the maximum value is "
+ << FTS_CACHE_SIZE_UPPER_LIMIT_IN_MB
+ << "MB, setting cache size to upper limit";
+
+ cache_size_in_mb = FTS_CACHE_SIZE_UPPER_LIMIT_IN_MB;
+
+ } else if (cache_size_in_mb
+ < FTS_CACHE_SIZE_LOWER_LIMIT_IN_MB) {
+
+ ib::warn() << "FTS max cache size ("
+ << cache_size_in_mb << ") out of range."
+ " Minimum value is "
+ << FTS_CACHE_SIZE_LOWER_LIMIT_IN_MB
+ << "MB and the maximum value is "
+ << FTS_CACHE_SIZE_UPPER_LIMIT_IN_MB
+ << "MB, setting cache size to lower limit";
+
+ cache_size_in_mb = FTS_CACHE_SIZE_LOWER_LIMIT_IN_MB;
+ }
+ } else {
+ ib::error() << "(" << error << ") reading max"
+ " cache config value from config table "
+ << fts_table->table->name;
+ }
+
+ ut_free(value.f_str);
+
+ return(cache_size_in_mb * 1024 * 1024);
+}
+#endif
+
+/*********************************************************************//**
+Get the next available document id.
+@return DB_SUCCESS if OK */
+dberr_t
+fts_get_next_doc_id(
+/*================*/
+ const dict_table_t* table, /*!< in: table */
+ doc_id_t* doc_id) /*!< out: new document id */
+{
+ fts_cache_t* cache = table->fts->cache;
+
+ /* If the Doc ID system has not yet been initialized, we
+ will consult the CONFIG table and user table to re-establish
+ the initial value of the Doc ID */
+ if (cache->first_doc_id == FTS_NULL_DOC_ID) {
+ fts_init_doc_id(table);
+ }
+
+ if (!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) {
+ *doc_id = FTS_NULL_DOC_ID;
+ return(DB_SUCCESS);
+ }
+
+ DEBUG_SYNC_C("get_next_FTS_DOC_ID");
+ mysql_mutex_lock(&cache->doc_id_lock);
+ *doc_id = cache->next_doc_id++;
+ mysql_mutex_unlock(&cache->doc_id_lock);
+
+ return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+This function fetches the Doc ID from the CONFIG table and compares it
+with the supplied Doc ID; the larger of the two is stored back in the
+CONFIG table.
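+For example (illustrative numbers): if the CONFIG row holds 101, i.e.
+synced_doc_id 100, since the stored value is the actual value + 1, and
+cmp_doc_id is 150, then cache->synced_doc_id becomes 150 and 151 is
+written back via fts_update_sync_doc_id().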
+@return DB_SUCCESS if OK */ +static MY_ATTRIBUTE((nonnull)) +dberr_t +fts_cmp_set_sync_doc_id( +/*====================*/ + const dict_table_t* table, /*!< in: table */ + doc_id_t cmp_doc_id, /*!< in: Doc ID to compare */ + ibool read_only, /*!< in: TRUE if read the + synced_doc_id only */ + doc_id_t* doc_id) /*!< out: larger document id + after comparing "cmp_doc_id" + to the one stored in CONFIG + table */ +{ + if (srv_read_only_mode) { + return DB_READ_ONLY; + } + + trx_t* trx; + pars_info_t* info; + dberr_t error; + fts_table_t fts_table; + que_t* graph = NULL; + fts_cache_t* cache = table->fts->cache; + char table_name[MAX_FULL_NAME_LEN]; + ut_a(table->fts->doc_col != ULINT_UNDEFINED); + + fts_table.suffix = "CONFIG"; + fts_table.table_id = table->id; + fts_table.type = FTS_COMMON_TABLE; + fts_table.table = table; + + trx= trx_create(); +retry: + trx_start_internal(trx); + + trx->op_info = "update the next FTS document id"; + + info = pars_info_create(); + + pars_info_bind_function( + info, "my_func", fts_fetch_store_doc_id, doc_id); + + fts_get_table_name(&fts_table, table_name); + pars_info_bind_id(info, "config_table", table_name); + + graph = fts_parse_sql( + &fts_table, info, + "DECLARE FUNCTION my_func;\n" + "DECLARE CURSOR c IS SELECT value FROM $config_table" + " WHERE key = 'synced_doc_id' FOR UPDATE;\n" + "BEGIN\n" + "" + "OPEN c;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH c INTO my_func();\n" + " IF c % NOTFOUND THEN\n" + " EXIT;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE c;"); + + *doc_id = 0; + + error = fts_eval_sql(trx, graph); + + que_graph_free(graph); + + // FIXME: We need to retry deadlock errors + if (error != DB_SUCCESS) { + goto func_exit; + } + + if (read_only) { + /* InnoDB stores actual synced_doc_id value + 1 in + FTS_CONFIG table. Reduce the value by 1 while reading + after startup. */ + if (*doc_id) *doc_id -= 1; + goto func_exit; + } + + if (cmp_doc_id == 0 && *doc_id) { + cache->synced_doc_id = *doc_id - 1; + } else { + cache->synced_doc_id = ut_max(cmp_doc_id, *doc_id); + } + + mysql_mutex_lock(&cache->doc_id_lock); + /* For each sync operation, we will add next_doc_id by 1, + so to mark a sync operation */ + if (cache->next_doc_id < cache->synced_doc_id + 1) { + cache->next_doc_id = cache->synced_doc_id + 1; + } + mysql_mutex_unlock(&cache->doc_id_lock); + + if (cmp_doc_id && cmp_doc_id >= *doc_id) { + error = fts_update_sync_doc_id( + table, cache->synced_doc_id, trx); + } + + *doc_id = cache->next_doc_id; + +func_exit: + + if (UNIV_LIKELY(error == DB_SUCCESS)) { + fts_sql_commit(trx); + } else { + *doc_id = 0; + + ib::error() << "(" << error << ") while getting next doc id " + "for table " << table->name; + fts_sql_rollback(trx); + + if (error == DB_DEADLOCK || error == DB_LOCK_WAIT_TIMEOUT) { + DEBUG_SYNC_C("fts_cmp_set_sync_doc_id_retry"); + std::this_thread::sleep_for(FTS_DEADLOCK_RETRY_WAIT); + goto retry; + } + } + + trx->free(); + + return(error); +} + +/** Update the last document id. This function could create a new +transaction to update the last document id. 
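+The update itself is the single internal SQL statement built below,
+with :doc_id bound to doc_id + 1:
+UPDATE $table_name SET value = :doc_id WHERE key = 'synced_doc_id';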
+@param table table to be updated +@param doc_id last document id +@param trx update trx or null +@retval DB_SUCCESS if OK */ +dberr_t +fts_update_sync_doc_id( + const dict_table_t* table, + doc_id_t doc_id, + trx_t* trx) +{ + byte id[FTS_MAX_ID_LEN]; + pars_info_t* info; + fts_table_t fts_table; + ulint id_len; + que_t* graph = NULL; + dberr_t error; + ibool local_trx = FALSE; + fts_cache_t* cache = table->fts->cache; + char fts_name[MAX_FULL_NAME_LEN]; + + if (srv_read_only_mode) { + return DB_READ_ONLY; + } + + fts_table.suffix = "CONFIG"; + fts_table.table_id = table->id; + fts_table.type = FTS_COMMON_TABLE; + fts_table.table = table; + + if (!trx) { + trx = trx_create(); + trx_start_internal(trx); + + trx->op_info = "setting last FTS document id"; + local_trx = TRUE; + } + + info = pars_info_create(); + + id_len = (ulint) snprintf( + (char*) id, sizeof(id), FTS_DOC_ID_FORMAT, doc_id + 1); + + pars_info_bind_varchar_literal(info, "doc_id", id, id_len); + + fts_get_table_name(&fts_table, fts_name, + table->fts->dict_locked); + pars_info_bind_id(info, "table_name", fts_name); + + graph = fts_parse_sql( + &fts_table, info, + "BEGIN" + " UPDATE $table_name SET value = :doc_id" + " WHERE key = 'synced_doc_id';"); + + error = fts_eval_sql(trx, graph); + + que_graph_free(graph); + + if (local_trx) { + if (UNIV_LIKELY(error == DB_SUCCESS)) { + fts_sql_commit(trx); + cache->synced_doc_id = doc_id; + } else { + ib::error() << "(" << error << ") while" + " updating last doc id for table" + << table->name; + + fts_sql_rollback(trx); + } + trx->free(); + } + + return(error); +} + +/*********************************************************************//** +Create a new fts_doc_ids_t. +@return new fts_doc_ids_t */ +fts_doc_ids_t* +fts_doc_ids_create(void) +/*====================*/ +{ + fts_doc_ids_t* fts_doc_ids; + mem_heap_t* heap = mem_heap_create(512); + + fts_doc_ids = static_cast( + mem_heap_alloc(heap, sizeof(*fts_doc_ids))); + + fts_doc_ids->self_heap = ib_heap_allocator_create(heap); + + fts_doc_ids->doc_ids = static_cast(ib_vector_create( + fts_doc_ids->self_heap, sizeof(doc_id_t), 32)); + + return(fts_doc_ids); +} + +/*********************************************************************//** +Do commit-phase steps necessary for the insertion of a new row. */ +void +fts_add( +/*====*/ + fts_trx_table_t*ftt, /*!< in: FTS trx table */ + fts_trx_row_t* row) /*!< in: row */ +{ + dict_table_t* table = ftt->table; + doc_id_t doc_id = row->doc_id; + + ut_a(row->state == FTS_INSERT || row->state == FTS_MODIFY); + + fts_add_doc_by_id(ftt, doc_id); + + mysql_mutex_lock(&table->fts->cache->deleted_lock); + ++table->fts->cache->added; + mysql_mutex_unlock(&table->fts->cache->deleted_lock); + + if (!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID) + && doc_id >= table->fts->cache->next_doc_id) { + table->fts->cache->next_doc_id = doc_id + 1; + } +} + +/*********************************************************************//** +Do commit-phase steps necessary for the deletion of a row. 
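+In essence the doc id is recorded for a later purge by OPTIMIZE via:
+INSERT INTO $deleted VALUES (:doc_id);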
+@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_delete( +/*=======*/ + fts_trx_table_t*ftt, /*!< in: FTS trx table */ + fts_trx_row_t* row) /*!< in: row */ +{ + que_t* graph; + fts_table_t fts_table; + doc_id_t write_doc_id; + dict_table_t* table = ftt->table; + doc_id_t doc_id = row->doc_id; + trx_t* trx = ftt->fts_trx->trx; + pars_info_t* info = pars_info_create(); + fts_cache_t* cache = table->fts->cache; + + /* we do not index Documents whose Doc ID value is 0 */ + if (doc_id == FTS_NULL_DOC_ID) { + ut_ad(!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)); + return DB_SUCCESS; + } + + ut_a(row->state == FTS_DELETE || row->state == FTS_MODIFY); + + FTS_INIT_FTS_TABLE(&fts_table, "DELETED", FTS_COMMON_TABLE, table); + + /* Convert to "storage" byte order. */ + fts_write_doc_id((byte*) &write_doc_id, doc_id); + fts_bind_doc_id(info, "doc_id", &write_doc_id); + + /* It is possible we update a record that has not yet been sync-ed + into cache from last crash (delete Doc will not initialize the + sync). Avoid any added counter accounting until the FTS cache + is re-established and sync-ed */ + if (table->fts->added_synced + && doc_id > cache->synced_doc_id) { + mysql_mutex_lock(&table->fts->cache->deleted_lock); + + /* The Doc ID could belong to those left in + ADDED table from last crash. So need to check + if it is less than first_doc_id when we initialize + the Doc ID system after reboot */ + if (doc_id >= table->fts->cache->first_doc_id + && table->fts->cache->added > 0) { + --table->fts->cache->added; + } + + mysql_mutex_unlock(&table->fts->cache->deleted_lock); + + /* Only if the row was really deleted. */ + ut_a(row->state == FTS_DELETE || row->state == FTS_MODIFY); + } + + /* Note the deleted document for OPTIMIZE to purge. */ + char table_name[MAX_FULL_NAME_LEN]; + + trx->op_info = "adding doc id to FTS DELETED"; + + fts_table.suffix = "DELETED"; + + fts_get_table_name(&fts_table, table_name); + pars_info_bind_id(info, "deleted", table_name); + + graph = fts_parse_sql(&fts_table, info, + "BEGIN INSERT INTO $deleted VALUES (:doc_id);"); + + dberr_t error = fts_eval_sql(trx, graph); + que_graph_free(graph); + + /* Increment the total deleted count, this is used to calculate the + number of documents indexed. */ + if (error == DB_SUCCESS) { + mysql_mutex_lock(&table->fts->cache->deleted_lock); + + ++table->fts->cache->deleted; + + mysql_mutex_unlock(&table->fts->cache->deleted_lock); + } + + return(error); +} + +/*********************************************************************//** +Do commit-phase steps necessary for the modification of a row. +@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_modify( +/*=======*/ + fts_trx_table_t* ftt, /*!< in: FTS trx table */ + fts_trx_row_t* row) /*!< in: row */ +{ + dberr_t error; + + ut_a(row->state == FTS_MODIFY); + + error = fts_delete(ftt, row); + + if (error == DB_SUCCESS) { + fts_add(ftt, row); + } + + return(error); +} + +/*********************************************************************//** +The given transaction is about to be committed; do whatever is necessary +from the FTS system's POV. 
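+Each row recorded in the rb-tree is dispatched on its final state:
+FTS_INSERT -> fts_add(), FTS_MODIFY -> fts_modify(),
+FTS_DELETE -> fts_delete().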
+@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_commit_table( +/*=============*/ + fts_trx_table_t* ftt) /*!< in: FTS table to commit*/ +{ + if (srv_read_only_mode) { + return DB_READ_ONLY; + } + + const ib_rbt_node_t* node; + ib_rbt_t* rows; + dberr_t error = DB_SUCCESS; + fts_cache_t* cache = ftt->table->fts->cache; + trx_t* trx = trx_create(); + + trx_start_internal(trx); + + rows = ftt->rows; + + ftt->fts_trx->trx = trx; + + if (cache->get_docs == NULL) { + mysql_mutex_lock(&cache->init_lock); + if (cache->get_docs == NULL) { + cache->get_docs = fts_get_docs_create(cache); + } + mysql_mutex_unlock(&cache->init_lock); + } + + for (node = rbt_first(rows); + node != NULL && error == DB_SUCCESS; + node = rbt_next(rows, node)) { + + fts_trx_row_t* row = rbt_value(fts_trx_row_t, node); + + switch (row->state) { + case FTS_INSERT: + fts_add(ftt, row); + break; + + case FTS_MODIFY: + error = fts_modify(ftt, row); + break; + + case FTS_DELETE: + error = fts_delete(ftt, row); + break; + + default: + ut_error; + } + } + + fts_sql_commit(trx); + + trx->free(); + + return(error); +} + +/*********************************************************************//** +The given transaction is about to be committed; do whatever is necessary +from the FTS system's POV. +@return DB_SUCCESS or error code */ +dberr_t +fts_commit( +/*=======*/ + trx_t* trx) /*!< in: transaction */ +{ + const ib_rbt_node_t* node; + dberr_t error; + ib_rbt_t* tables; + fts_savepoint_t* savepoint; + + savepoint = static_cast( + ib_vector_last(trx->fts_trx->savepoints)); + tables = savepoint->tables; + + for (node = rbt_first(tables), error = DB_SUCCESS; + node != NULL && error == DB_SUCCESS; + node = rbt_next(tables, node)) { + + fts_trx_table_t** ftt; + + ftt = rbt_value(fts_trx_table_t*, node); + + error = fts_commit_table(*ftt); + } + + return(error); +} + +/*********************************************************************//** +Initialize a document. */ +void +fts_doc_init( +/*=========*/ + fts_doc_t* doc) /*!< in: doc to initialize */ +{ + mem_heap_t* heap = mem_heap_create(32); + + memset(doc, 0, sizeof(*doc)); + + doc->self_heap = ib_heap_allocator_create(heap); +} + +/*********************************************************************//** +Free document. */ +void +fts_doc_free( +/*=========*/ + fts_doc_t* doc) /*!< in: document */ +{ + mem_heap_t* heap = static_cast(doc->self_heap->arg); + + if (doc->tokens) { + rbt_free(doc->tokens); + } + + ut_d(memset(doc, 0, sizeof(*doc))); + + mem_heap_free(heap); +} + +/*********************************************************************//** +Callback function for fetch that stores the text of an FTS document, +converting each column to UTF-16. 
+@return always FALSE */ +ibool +fts_query_expansion_fetch_doc( +/*==========================*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: fts_doc_t* */ +{ + que_node_t* exp; + sel_node_t* node = static_cast(row); + fts_doc_t* result_doc = static_cast(user_arg); + dfield_t* dfield; + ulint len; + ulint doc_len; + fts_doc_t doc; + CHARSET_INFO* doc_charset = NULL; + ulint field_no = 0; + + len = 0; + + fts_doc_init(&doc); + doc.found = TRUE; + + exp = node->select_list; + doc_len = 0; + + doc_charset = result_doc->charset; + + /* Copy each indexed column content into doc->text.f_str */ + while (exp) { + dfield = que_node_get_val(exp); + len = dfield_get_len(dfield); + + /* NULL column */ + if (len == UNIV_SQL_NULL) { + exp = que_node_get_next(exp); + continue; + } + + if (!doc_charset) { + doc_charset = fts_get_charset(dfield->type.prtype); + } + + doc.charset = doc_charset; + + if (dfield_is_ext(dfield)) { + /* We ignore columns that are stored externally, this + could result in too many words to search */ + exp = que_node_get_next(exp); + continue; + } else { + doc.text.f_n_char = 0; + + doc.text.f_str = static_cast( + dfield_get_data(dfield)); + + doc.text.f_len = len; + } + + if (field_no == 0) { + fts_tokenize_document(&doc, result_doc, + result_doc->parser); + } else { + fts_tokenize_document_next(&doc, doc_len, result_doc, + result_doc->parser); + } + + exp = que_node_get_next(exp); + + doc_len += (exp) ? len + 1 : len; + + field_no++; + } + + ut_ad(doc_charset); + + if (!result_doc->charset) { + result_doc->charset = doc_charset; + } + + fts_doc_free(&doc); + + return(FALSE); +} + +/*********************************************************************//** +fetch and tokenize the document. */ +static +void +fts_fetch_doc_from_rec( +/*===================*/ + fts_get_doc_t* get_doc, /*!< in: FTS index's get_doc struct */ + dict_index_t* clust_index, /*!< in: cluster index */ + btr_pcur_t* pcur, /*!< in: cursor whose position + has been stored */ + rec_offs* offsets, /*!< in: offsets */ + fts_doc_t* doc) /*!< out: fts doc to hold parsed + documents */ +{ + dict_index_t* index; + const rec_t* clust_rec; + const dict_field_t* ifield; + ulint clust_pos; + ulint doc_len = 0; + st_mysql_ftparser* parser; + + if (!get_doc) { + return; + } + + index = get_doc->index_cache->index; + parser = get_doc->index_cache->index->parser; + + clust_rec = btr_pcur_get_rec(pcur); + ut_ad(!page_rec_is_comp(clust_rec) + || rec_get_status(clust_rec) == REC_STATUS_ORDINARY); + + for (ulint i = 0; i < index->n_fields; i++) { + ifield = dict_index_get_nth_field(index, i); + clust_pos = dict_col_get_clust_pos(ifield->col, clust_index); + + if (!get_doc->index_cache->charset) { + get_doc->index_cache->charset = fts_get_charset( + ifield->col->prtype); + } + + if (rec_offs_nth_extern(offsets, clust_pos)) { + doc->text.f_str = + btr_rec_copy_externally_stored_field( + clust_rec, offsets, + btr_pcur_get_block(pcur)->zip_size(), + clust_pos, &doc->text.f_len, + static_cast( + doc->self_heap->arg)); + } else { + doc->text.f_str = (byte*) rec_get_nth_field( + clust_rec, offsets, clust_pos, + &doc->text.f_len); + } + + doc->found = TRUE; + doc->charset = get_doc->index_cache->charset; + + /* Null Field */ + if (doc->text.f_len == UNIV_SQL_NULL || doc->text.f_len == 0) { + continue; + } + + if (!doc_len) { + fts_tokenize_document(doc, NULL, parser); + } else { + fts_tokenize_document_next(doc, doc_len, NULL, parser); + } + + doc_len += doc->text.f_len + 1; + } +} + +/** Fetch the data from tuple and 
tokenize the document.
+@param[in] get_doc FTS index's get_doc struct
+@param[in] tuple tuple should be arranged in table schema order
+@param[out] doc fts doc to hold parsed documents. */
+static
+void
+fts_fetch_doc_from_tuple(
+ fts_get_doc_t* get_doc,
+ const dtuple_t* tuple,
+ fts_doc_t* doc)
+{
+ dict_index_t* index;
+ st_mysql_ftparser* parser;
+ ulint doc_len = 0;
+ ulint processed_doc = 0;
+ ulint num_field;
+
+ if (get_doc == NULL) {
+ return;
+ }
+
+ index = get_doc->index_cache->index;
+ parser = get_doc->index_cache->index->parser;
+ num_field = dict_index_get_n_fields(index);
+
+ for (ulint i = 0; i < num_field; i++) {
+ const dict_field_t* ifield;
+ const dict_col_t* col;
+ ulint pos;
+
+ ifield = dict_index_get_nth_field(index, i);
+ col = dict_field_get_col(ifield);
+ pos = dict_col_get_no(col);
+ const dfield_t* field = dtuple_get_nth_field(tuple, pos);
+
+ if (!get_doc->index_cache->charset) {
+ get_doc->index_cache->charset = fts_get_charset(
+ ifield->col->prtype);
+ }
+
+ ut_ad(!dfield_is_ext(field));
+
+ doc->text.f_str = (byte*) dfield_get_data(field);
+ doc->text.f_len = dfield_get_len(field);
+ doc->found = TRUE;
+ doc->charset = get_doc->index_cache->charset;
+
+ /* The field data is NULL. */
+ if (doc->text.f_len == UNIV_SQL_NULL || doc->text.f_len == 0) {
+ continue;
+ }
+
+ if (processed_doc == 0) {
+ fts_tokenize_document(doc, NULL, parser);
+ } else {
+ fts_tokenize_document_next(doc, doc_len, NULL, parser);
+ }
+
+ processed_doc++;
+ doc_len += doc->text.f_len + 1;
+ }
+}
+
+/** Fetch the document from a tuple, tokenize the text data and
+insert the text data into the FTS auxiliary table and its cache.
+Note that the tuple fields do not carry any information about
+externally stored fields; the tuple contains data converted
+directly from MySQL.
+@param[in] ftt FTS transaction table
+@param[in] doc_id doc id
+@param[in] tuple tuple from which data can be retrieved;
+ it must be arranged in table schema order. */
+void
+fts_add_doc_from_tuple(
+ fts_trx_table_t*ftt,
+ doc_id_t doc_id,
+ const dtuple_t* tuple)
+{
+ mtr_t mtr;
+ fts_cache_t* cache = ftt->table->fts->cache;
+
+ ut_ad(cache->get_docs);
+
+ if (!ftt->table->fts->added_synced) {
+ fts_init_index(ftt->table, FALSE);
+ }
+
+ mtr_start(&mtr);
+
+ ulint num_idx = ib_vector_size(cache->get_docs);
+
+ for (ulint i = 0; i < num_idx; ++i) {
+ fts_doc_t doc;
+ dict_table_t* table;
+ fts_get_doc_t* get_doc;
+
+ get_doc = static_cast<fts_get_doc_t*>(
+ ib_vector_get(cache->get_docs, i));
+ table = get_doc->index_cache->index->table;
+
+ fts_doc_init(&doc);
+ fts_fetch_doc_from_tuple(
+ get_doc, tuple, &doc);
+
+ if (doc.found) {
+ mtr_commit(&mtr);
+ mysql_mutex_lock(&table->fts->cache->lock);
+
+ if (table->fts->cache->stopword_info.status
+ & STOPWORD_NOT_INIT) {
+ fts_load_stopword(table, NULL, NULL,
+ true, true);
+ }
+
+ fts_cache_add_doc(
+ table->fts->cache,
+ get_doc->index_cache,
+ doc_id, doc.tokens);
+
+ mysql_mutex_unlock(&table->fts->cache->lock);
+
+ if (cache->total_size > fts_max_cache_size / 5
+ || fts_need_sync) {
+ fts_sync(cache->sync, true, false);
+ }
+
+ mtr_start(&mtr);
+
+ }
+
+ fts_doc_free(&doc);
+ }
+
+ mtr_commit(&mtr);
+}
+
+/*********************************************************************//**
+This function fetches the document inserted during the committing
+transaction, tokenizes the inserted text data and inserts it into the
+FTS auxiliary table and its cache.
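+As a rough note on the sync triggers seen below: a sync is requested
+once the cache has grown by more than fts_max_cache_size / 10 since
+the last sync, or when fts_need_sync is set.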
*/ +static +void +fts_add_doc_by_id( +/*==============*/ + fts_trx_table_t*ftt, /*!< in: FTS trx table */ + doc_id_t doc_id) /*!< in: doc id */ +{ + mtr_t mtr; + mem_heap_t* heap; + btr_pcur_t pcur; + dict_table_t* table; + dtuple_t* tuple; + dfield_t* dfield; + fts_get_doc_t* get_doc; + doc_id_t temp_doc_id; + dict_index_t* clust_index; + dict_index_t* fts_id_index; + ibool is_id_cluster; + fts_cache_t* cache = ftt->table->fts->cache; + + ut_ad(cache->get_docs); + + /* If Doc ID has been supplied by the user, then the table + might not yet be sync-ed */ + + if (!ftt->table->fts->added_synced) { + fts_init_index(ftt->table, FALSE); + } + + /* Get the first FTS index's get_doc */ + get_doc = static_cast( + ib_vector_get(cache->get_docs, 0)); + ut_ad(get_doc); + + table = get_doc->index_cache->index->table; + + heap = mem_heap_create(512); + + clust_index = dict_table_get_first_index(table); + fts_id_index = table->fts_doc_id_index; + + /* Check whether the index on FTS_DOC_ID is cluster index */ + is_id_cluster = (clust_index == fts_id_index); + + mtr_start(&mtr); + + /* Search based on Doc ID. Here, we'll need to consider the case + when there is no primary index on Doc ID */ + const ulint n_uniq = table->fts_n_uniq(); + tuple = dtuple_create(heap, n_uniq); + dfield = dtuple_get_nth_field(tuple, 0); + dfield->type.mtype = DATA_INT; + dfield->type.prtype = DATA_NOT_NULL | DATA_UNSIGNED | DATA_BINARY_TYPE; + + mach_write_to_8((byte*) &temp_doc_id, doc_id); + dfield_set_data(dfield, &temp_doc_id, sizeof(temp_doc_id)); + pcur.btr_cur.page_cur.index = fts_id_index; + + if (n_uniq == 2) { + ut_ad(table->versioned()); + ut_ad(fts_id_index->fields[1].col->vers_sys_end()); + dfield = dtuple_get_nth_field(tuple, 1); + dfield->type.mtype = fts_id_index->fields[1].col->mtype; + dfield->type.prtype = fts_id_index->fields[1].col->prtype; + if (table->versioned_by_id()) { + dfield_set_data(dfield, trx_id_max_bytes, + sizeof(trx_id_max_bytes)); + } else { + dfield_set_data(dfield, timestamp_max_bytes, + sizeof(timestamp_max_bytes)); + } + } + + /* If we have a match, add the data to doc structure */ + if (btr_pcur_open_with_no_init(tuple, PAGE_CUR_LE, + BTR_SEARCH_LEAF, &pcur, &mtr) + == DB_SUCCESS + && btr_pcur_get_low_match(&pcur) == n_uniq) { + const rec_t* rec; + btr_pcur_t* doc_pcur; + const rec_t* clust_rec; + btr_pcur_t clust_pcur; + rec_offs* offsets = NULL; + ulint num_idx = ib_vector_size(cache->get_docs); + + rec = btr_pcur_get_rec(&pcur); + + /* Doc could be deleted */ + if (page_rec_is_infimum(rec) + || rec_get_deleted_flag(rec, dict_table_is_comp(table))) { + + goto func_exit; + } + + if (is_id_cluster) { + clust_rec = rec; + doc_pcur = &pcur; + } else { + dtuple_t* clust_ref; + ulint n_fields; + + n_fields = dict_index_get_n_unique(clust_index); + + clust_ref = dtuple_create(heap, n_fields); + dict_index_copy_types(clust_ref, clust_index, n_fields); + + row_build_row_ref_in_tuple( + clust_ref, rec, fts_id_index, NULL); + clust_pcur.btr_cur.page_cur.index = clust_index; + + if (btr_pcur_open_with_no_init(clust_ref, + PAGE_CUR_LE, + BTR_SEARCH_LEAF, + &clust_pcur, &mtr) + != DB_SUCCESS) { + goto func_exit; + } + + doc_pcur = &clust_pcur; + clust_rec = btr_pcur_get_rec(&clust_pcur); + } + + offsets = rec_get_offsets(clust_rec, clust_index, NULL, + clust_index->n_core_fields, + ULINT_UNDEFINED, &heap); + + for (ulint i = 0; i < num_idx; ++i) { + fts_doc_t doc; + dict_table_t* table; + fts_get_doc_t* get_doc; + + get_doc = static_cast( + ib_vector_get(cache->get_docs, i)); + + table = 
get_doc->index_cache->index->table; + + fts_doc_init(&doc); + + fts_fetch_doc_from_rec( + get_doc, clust_index, doc_pcur, offsets, &doc); + + if (doc.found) { + + btr_pcur_store_position(doc_pcur, &mtr); + mtr_commit(&mtr); + + mysql_mutex_lock(&table->fts->cache->lock); + + if (table->fts->cache->stopword_info.status + & STOPWORD_NOT_INIT) { + fts_load_stopword(table, NULL, + NULL, true, true); + } + + fts_cache_add_doc( + table->fts->cache, + get_doc->index_cache, + doc_id, doc.tokens); + + bool need_sync = !cache->sync->in_progress + && (fts_need_sync + || (cache->total_size + - cache->total_size_at_sync) + > fts_max_cache_size / 10); + if (need_sync) { + cache->total_size_at_sync = + cache->total_size; + } + + mysql_mutex_unlock(&table->fts->cache->lock); + + DBUG_EXECUTE_IF( + "fts_instrument_sync", + fts_optimize_request_sync_table(table); + mysql_mutex_lock(&cache->lock); + if (cache->sync->in_progress) + my_cond_wait( + &cache->sync->cond, + &cache->lock.m_mutex); + mysql_mutex_unlock(&cache->lock); + ); + + DBUG_EXECUTE_IF( + "fts_instrument_sync_debug", + fts_sync(cache->sync, true, true); + ); + + DEBUG_SYNC_C("fts_instrument_sync_request"); + DBUG_EXECUTE_IF( + "fts_instrument_sync_request", + fts_optimize_request_sync_table(table); + ); + + if (need_sync) { + fts_optimize_request_sync_table(table); + } + + mtr_start(&mtr); + + if (i < num_idx - 1) { + if (doc_pcur->restore_position( + BTR_SEARCH_LEAF, &mtr) + != btr_pcur_t::SAME_ALL) { + ut_ad("invalid state" == 0); + i = num_idx - 1; + } + } + } + + fts_doc_free(&doc); + } + + if (!is_id_cluster) { + ut_free(doc_pcur->old_rec_buf); + } + } +func_exit: + mtr_commit(&mtr); + + ut_free(pcur.old_rec_buf); + + mem_heap_free(heap); +} + + +/*********************************************************************//** +Callback function to read a single ulint column. +return always returns TRUE */ +static +ibool +fts_read_ulint( +/*===========*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: pointer to ulint */ +{ + sel_node_t* sel_node = static_cast(row); + ulint* value = static_cast(user_arg); + que_node_t* exp = sel_node->select_list; + dfield_t* dfield = que_node_get_val(exp); + void* data = dfield_get_data(dfield); + + *value = mach_read_from_4(static_cast(data)); + + return(TRUE); +} + +/*********************************************************************//** +Get maximum Doc ID in a table if index "FTS_DOC_ID_INDEX" exists +@return max Doc ID or 0 if index "FTS_DOC_ID_INDEX" does not exist */ +doc_id_t +fts_get_max_doc_id( +/*===============*/ + dict_table_t* table) /*!< in: user table */ +{ + dict_index_t* index; + dict_field_t* dfield MY_ATTRIBUTE((unused)) = NULL; + doc_id_t doc_id = 0; + mtr_t mtr; + btr_pcur_t pcur; + + index = table->fts_doc_id_index; + + if (!index) { + return(0); + } + + ut_ad(!index->is_instant()); + + dfield = dict_index_get_nth_field(index, 0); + +#if 0 /* This can fail when renaming a column to FTS_DOC_ID_COL_NAME. 
*/
+ ut_ad(innobase_strcasecmp(FTS_DOC_ID_COL_NAME, dfield->name) == 0);
+#endif
+
+ mtr.start();
+
+ /* fetch the largest value in the index */
+ if (pcur.open_leaf(false, index, BTR_SEARCH_LEAF, &mtr) == DB_SUCCESS
+ && !page_is_empty(btr_pcur_get_page(&pcur))) {
+ const rec_t* rec = NULL;
+ constexpr ulint doc_id_len= 8;
+
+ do {
+ rec = btr_pcur_get_rec(&pcur);
+
+ if (!page_rec_is_user_rec(rec)) {
+ continue;
+ }
+
+ if (index->n_uniq == 1) {
+ break;
+ }
+
+ ut_ad(table->versioned());
+ ut_ad(index->n_uniq == 2);
+
+ const byte *data = rec + doc_id_len;
+ if (table->versioned_by_id()) {
+ if (0 == memcmp(data, trx_id_max_bytes,
+ sizeof trx_id_max_bytes)) {
+ break;
+ }
+ } else {
+ if (0 == memcmp(data, timestamp_max_bytes,
+ sizeof timestamp_max_bytes)) {
+ break;
+ }
+ }
+ } while (btr_pcur_move_to_prev(&pcur, &mtr));
+
+ if (!rec || rec_is_metadata(rec, *index)) {
+ goto func_exit;
+ }
+
+ doc_id = fts_read_doc_id(rec);
+ }
+
+func_exit:
+ mtr.commit();
+ return(doc_id);
+}
+
+/*********************************************************************//**
+Fetch document with the given document id.
+@return DB_SUCCESS if OK else error */
+dberr_t
+fts_doc_fetch_by_doc_id(
+/*====================*/
+ fts_get_doc_t* get_doc, /*!< in: state */
+ doc_id_t doc_id, /*!< in: id of document to
+ fetch */
+ dict_index_t* index_to_use, /*!< in: caller supplied FTS index,
+ or NULL */
+ ulint option, /*!< in: search option: equal to
+ doc_id, or greater than it */
+ fts_sql_callback
+ callback, /*!< in: callback to read */
+ void* arg) /*!< in: callback arg */
+{
+ pars_info_t* info;
+ dberr_t error;
+ const char* select_str;
+ doc_id_t write_doc_id;
+ dict_index_t* index;
+ trx_t* trx = trx_create();
+ que_t* graph;
+
+ trx->op_info = "fetching indexed FTS document";
+
+ /* The FTS index can be supplied by the caller directly with
+ "index_to_use", otherwise, get it from "get_doc" */
+ index = (index_to_use) ? index_to_use : get_doc->index_cache->index;
+
+ if (get_doc && get_doc->get_document_graph) {
+ info = get_doc->get_document_graph->info;
+ } else {
+ info = pars_info_create();
+ }
+
+ /* Convert to "storage" byte order. */
+ fts_write_doc_id((byte*) &write_doc_id, doc_id);
+ fts_bind_doc_id(info, "doc_id", &write_doc_id);
+ pars_info_bind_function(info, "my_func", callback, arg);
+
+ select_str = fts_get_select_columns_str(index, info, info->heap);
+ pars_info_bind_id(info, "table_name", index->table->name.m_name);
+
+ if (!get_doc || !get_doc->get_document_graph) {
+ if (option == FTS_FETCH_DOC_BY_ID_EQUAL) {
+ graph = fts_parse_sql(
+ NULL,
+ info,
+ mem_heap_printf(info->heap,
+ "DECLARE FUNCTION my_func;\n"
+ "DECLARE CURSOR c IS"
+ " SELECT %s FROM $table_name"
+ " WHERE %s = :doc_id;\n"
+ "BEGIN\n"
+ ""
+ "OPEN c;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH c INTO my_func();\n"
+ " IF c %% NOTFOUND THEN\n"
+ " EXIT;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE c;",
+ select_str, FTS_DOC_ID_COL_NAME));
+ } else {
+ ut_ad(option == FTS_FETCH_DOC_BY_ID_LARGE);
+
+ /* This is used for crash recovery of a table with
+ a hidden DOC ID or FTS indexes. We will scan the table
+ to re-process user table rows whose DOC ID or
+ FTS indexed documents have not been sync-ed to disk
+ during the recent crash.
+ In the case that all fulltext indexes are dropped
+ for a table, we will keep the "hidden" FTS_DOC_ID
+ column, and this scan is to retrieve the largest
+ DOC ID being used in the table to determine the
+ appropriate next DOC ID.
+ Where fulltext indexes exist, this
+ operation will re-tokenize any docs that have not
+ been sync-ed to disk, and re-prime the FTS
+ cache */
+ graph = fts_parse_sql(
+ NULL,
+ info,
+ mem_heap_printf(info->heap,
+ "DECLARE FUNCTION my_func;\n"
+ "DECLARE CURSOR c IS"
+ " SELECT %s, %s FROM $table_name"
+ " WHERE %s > :doc_id;\n"
+ "BEGIN\n"
+ ""
+ "OPEN c;\n"
+ "WHILE 1 = 1 LOOP\n"
+ " FETCH c INTO my_func();\n"
+ " IF c %% NOTFOUND THEN\n"
+ " EXIT;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE c;",
+ FTS_DOC_ID_COL_NAME,
+ select_str, FTS_DOC_ID_COL_NAME));
+ }
+ if (get_doc) {
+ get_doc->get_document_graph = graph;
+ }
+ } else {
+ graph = get_doc->get_document_graph;
+ }
+
+ error = fts_eval_sql(trx, graph);
+ fts_sql_commit(trx);
+ trx->free();
+
+ if (!get_doc) {
+ que_graph_free(graph);
+ }
+
+ return(error);
+}
+
+/*********************************************************************//**
+Write out a single word's data as new entry/entries in the INDEX table.
+@return DB_SUCCESS if all OK. */
+dberr_t
+fts_write_node(
+/*===========*/
+ trx_t* trx, /*!< in: transaction */
+ que_t** graph, /*!< in: query graph */
+ fts_table_t* fts_table, /*!< in: aux table */
+ fts_string_t* word, /*!< in: word in UTF-8 */
+ fts_node_t* node) /*!< in: node columns */
+{
+ pars_info_t* info;
+ dberr_t error;
+ ib_uint32_t doc_count;
+ time_t start_time;
+ doc_id_t last_doc_id;
+ doc_id_t first_doc_id;
+ char table_name[MAX_FULL_NAME_LEN];
+
+ ut_a(node->ilist != NULL);
+
+ if (*graph) {
+ info = (*graph)->info;
+ } else {
+ info = pars_info_create();
+
+ fts_get_table_name(fts_table, table_name);
+ pars_info_bind_id(info, "index_table_name", table_name);
+ }
+
+ pars_info_bind_varchar_literal(info, "token", word->f_str, word->f_len);
+
+ /* Convert to "storage" byte order. */
+ fts_write_doc_id((byte*) &first_doc_id, node->first_doc_id);
+ fts_bind_doc_id(info, "first_doc_id", &first_doc_id);
+
+ /* Convert to "storage" byte order. */
+ fts_write_doc_id((byte*) &last_doc_id, node->last_doc_id);
+ fts_bind_doc_id(info, "last_doc_id", &last_doc_id);
+
+ ut_a(node->last_doc_id >= node->first_doc_id);
+
+ /* Convert to "storage" byte order. */
+ mach_write_to_4((byte*) &doc_count, node->doc_count);
+ pars_info_bind_int4_literal(
+ info, "doc_count", (const ib_uint32_t*) &doc_count);
+
+ /* Set copy_name to FALSE since it's a static. */
+ pars_info_bind_literal(
+ info, "ilist", node->ilist, node->ilist_size,
+ DATA_BLOB, DATA_BINARY_TYPE);
+
+ if (!*graph) {
+
+ *graph = fts_parse_sql(
+ fts_table,
+ info,
+ "BEGIN\n"
+ "INSERT INTO $index_table_name VALUES"
+ " (:token, :first_doc_id,"
+ " :last_doc_id, :doc_count, :ilist);");
+ }
+
+ start_time = time(NULL);
+ error = fts_eval_sql(trx, *graph);
+ elapsed_time += time(NULL) - start_time;
+ ++n_nodes;
+
+ return(error);
+}
+
+/*********************************************************************//**
+Add rows to the DELETED_CACHE table.
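+The doc ids are sorted first and then inserted one at a time with the
+statement built below:
+INSERT INTO $table_name VALUES (:doc_id);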
+@return DB_SUCCESS if all went well else error code*/ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_sync_add_deleted_cache( +/*=======================*/ + fts_sync_t* sync, /*!< in: sync state */ + ib_vector_t* doc_ids) /*!< in: doc ids to add */ +{ + ulint i; + pars_info_t* info; + que_t* graph; + fts_table_t fts_table; + char table_name[MAX_FULL_NAME_LEN]; + doc_id_t dummy = 0; + dberr_t error = DB_SUCCESS; + ulint n_elems = ib_vector_size(doc_ids); + + ut_a(ib_vector_size(doc_ids) > 0); + + ib_vector_sort(doc_ids, fts_doc_id_cmp); + + info = pars_info_create(); + + fts_bind_doc_id(info, "doc_id", &dummy); + + FTS_INIT_FTS_TABLE( + &fts_table, "DELETED_CACHE", FTS_COMMON_TABLE, sync->table); + + fts_get_table_name(&fts_table, table_name); + pars_info_bind_id(info, "table_name", table_name); + + graph = fts_parse_sql( + &fts_table, + info, + "BEGIN INSERT INTO $table_name VALUES (:doc_id);"); + + for (i = 0; i < n_elems && error == DB_SUCCESS; ++i) { + doc_id_t* update; + doc_id_t write_doc_id; + + update = static_cast(ib_vector_get(doc_ids, i)); + + /* Convert to "storage" byte order. */ + fts_write_doc_id((byte*) &write_doc_id, *update); + fts_bind_doc_id(info, "doc_id", &write_doc_id); + + error = fts_eval_sql(sync->trx, graph); + } + + que_graph_free(graph); + + return(error); +} + +/** Write the words and ilist to disk. +@param[in,out] trx transaction +@param[in] index_cache index cache +@param[in] unlock_cache whether unlock cache when write node +@return DB_SUCCESS if all went well else error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_sync_write_words( + trx_t* trx, + fts_index_cache_t* index_cache, + bool unlock_cache) +{ + fts_table_t fts_table; + ulint n_nodes = 0; + ulint n_words = 0; + const ib_rbt_node_t* rbt_node; + dberr_t error = DB_SUCCESS; + ibool print_error = FALSE; + dict_table_t* table = index_cache->index->table; + + FTS_INIT_INDEX_TABLE( + &fts_table, NULL, FTS_INDEX_TABLE, index_cache->index); + + n_words = rbt_size(index_cache->words); + + /* We iterate over the entire tree, even if there is an error, + since we want to free the memory used during caching. */ + for (rbt_node = rbt_first(index_cache->words); + rbt_node; + rbt_node = rbt_next(index_cache->words, rbt_node)) { + + ulint i; + ulint selected; + fts_tokenizer_word_t* word; + + word = rbt_value(fts_tokenizer_word_t, rbt_node); + + DBUG_EXECUTE_IF( + "fts_instrument_write_words_before_select_index", + std::this_thread::sleep_for( + std::chrono::milliseconds(300));); + + selected = fts_select_index( + index_cache->charset, word->text.f_str, + word->text.f_len); + + fts_table.suffix = fts_get_suffix(selected); + + /* We iterate over all the nodes even if there was an error */ + for (i = 0; i < ib_vector_size(word->nodes); ++i) { + + fts_node_t* fts_node = static_cast( + ib_vector_get(word->nodes, i)); + + if (fts_node->synced) { + continue; + } else { + fts_node->synced = true; + } + + /*FIXME: we need to handle the error properly. 
*/ + if (error == DB_SUCCESS) { + if (unlock_cache) { + mysql_mutex_unlock( + &table->fts->cache->lock); + } + + error = fts_write_node( + trx, + &index_cache->ins_graph[selected], + &fts_table, &word->text, fts_node); + + DEBUG_SYNC_C("fts_write_node"); + DBUG_EXECUTE_IF("fts_write_node_crash", + DBUG_SUICIDE();); + + DBUG_EXECUTE_IF( + "fts_instrument_sync_sleep", + std::this_thread::sleep_for( + std::chrono::seconds(1));); + + if (unlock_cache) { + mysql_mutex_lock( + &table->fts->cache->lock); + } + } + } + + n_nodes += ib_vector_size(word->nodes); + + if (UNIV_UNLIKELY(error != DB_SUCCESS) && !print_error) { + ib::error() << "(" << error << ") writing" + " word node to FTS auxiliary index table " + << table->name; + print_error = TRUE; + } + } + + if (UNIV_UNLIKELY(fts_enable_diag_print)) { + printf("Avg number of nodes: %lf\n", + (double) n_nodes / (double) (n_words > 1 ? n_words : 1)); + } + + return(error); +} + +/*********************************************************************//** +Begin Sync, create transaction, acquire locks, etc. */ +static +void +fts_sync_begin( +/*===========*/ + fts_sync_t* sync) /*!< in: sync state */ +{ + fts_cache_t* cache = sync->table->fts->cache; + + n_nodes = 0; + elapsed_time = 0; + + sync->start_time = time(NULL); + + sync->trx = trx_create(); + trx_start_internal(sync->trx); + + if (UNIV_UNLIKELY(fts_enable_diag_print)) { + ib::info() << "FTS SYNC for table " << sync->table->name + << ", deleted count: " + << ib_vector_size(cache->deleted_doc_ids) + << " size: " << ib::bytes_iec{cache->total_size}; + } +} + +/*********************************************************************//** +Run SYNC on the table, i.e., write out data from the index specific +cache to the FTS aux INDEX table and FTS aux doc id stats table. +@return DB_SUCCESS if all OK */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_sync_index( +/*===========*/ + fts_sync_t* sync, /*!< in: sync state */ + fts_index_cache_t* index_cache) /*!< in: index cache */ +{ + trx_t* trx = sync->trx; + + trx->op_info = "doing SYNC index"; + + if (UNIV_UNLIKELY(fts_enable_diag_print)) { + ib::info() << "SYNC words: " << rbt_size(index_cache->words); + } + + ut_ad(rbt_validate(index_cache->words)); + + return(fts_sync_write_words(trx, index_cache, sync->unlock_cache)); +} + +/** Check if index cache has been synced completely +@param[in,out] index_cache index cache +@return true if index is synced, otherwise false. */ +static +bool +fts_sync_index_check( + fts_index_cache_t* index_cache) +{ + const ib_rbt_node_t* rbt_node; + + for (rbt_node = rbt_first(index_cache->words); + rbt_node != NULL; + rbt_node = rbt_next(index_cache->words, rbt_node)) { + + fts_tokenizer_word_t* word; + word = rbt_value(fts_tokenizer_word_t, rbt_node); + + fts_node_t* fts_node; + fts_node = static_cast(ib_vector_last(word->nodes)); + + if (!fts_node->synced) { + return(false); + } + } + + return(true); +} + +/** Reset synced flag in index cache when rollback +@param[in,out] index_cache index cache */ +static +void +fts_sync_index_reset( + fts_index_cache_t* index_cache) +{ + const ib_rbt_node_t* rbt_node; + + for (rbt_node = rbt_first(index_cache->words); + rbt_node != NULL; + rbt_node = rbt_next(index_cache->words, rbt_node)) { + + fts_tokenizer_word_t* word; + word = rbt_value(fts_tokenizer_word_t, rbt_node); + + fts_node_t* fts_node; + fts_node = static_cast(ib_vector_last(word->nodes)); + + fts_node->synced = false; + } +} + +/** Commit the SYNC, change state of processed doc ids etc. 
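+In outline: persist the max synced doc id to the CONFIG table, move
+the pending deleted doc ids into DELETED_CACHE, then clear and
+re-initialize the in-memory cache.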
+@param[in,out] sync sync state +@return DB_SUCCESS if all OK */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_sync_commit( + fts_sync_t* sync) +{ + dberr_t error; + trx_t* trx = sync->trx; + fts_cache_t* cache = sync->table->fts->cache; + doc_id_t last_doc_id; + + trx->op_info = "doing SYNC commit"; + + /* After each Sync, update the CONFIG table about the max doc id + we just sync-ed to index table */ + error = fts_cmp_set_sync_doc_id(sync->table, sync->max_doc_id, FALSE, + &last_doc_id); + + /* Get the list of deleted documents that are either in the + cache or were headed there but were deleted before the add + thread got to them. */ + + if (error == DB_SUCCESS && ib_vector_size(cache->deleted_doc_ids) > 0) { + + error = fts_sync_add_deleted_cache( + sync, cache->deleted_doc_ids); + } + + /* We need to do this within the deleted lock since fts_delete() can + attempt to add a deleted doc id to the cache deleted id array. */ + fts_cache_clear(cache); + DEBUG_SYNC_C("fts_deleted_doc_ids_clear"); + fts_cache_init(cache); + mysql_mutex_unlock(&cache->lock); + + if (UNIV_LIKELY(error == DB_SUCCESS)) { + fts_sql_commit(trx); + } else { + fts_sql_rollback(trx); + ib::error() << "(" << error << ") during SYNC of " + "table " << sync->table->name; + } + + if (UNIV_UNLIKELY(fts_enable_diag_print) && elapsed_time) { + ib::info() << "SYNC for table " << sync->table->name + << ": SYNC time: " + << (time(NULL) - sync->start_time) + << " secs: elapsed " + << static_cast(n_nodes) + / static_cast(elapsed_time) + << " ins/sec"; + } + + /* Avoid assertion in trx_t::free(). */ + trx->dict_operation_lock_mode = false; + trx->free(); + + return(error); +} + +/** Rollback a sync operation +@param[in,out] sync sync state */ +static +void +fts_sync_rollback( + fts_sync_t* sync) +{ + trx_t* trx = sync->trx; + fts_cache_t* cache = sync->table->fts->cache; + + for (ulint i = 0; i < ib_vector_size(cache->indexes); ++i) { + ulint j; + fts_index_cache_t* index_cache; + + index_cache = static_cast( + ib_vector_get(cache->indexes, i)); + + /* Reset synced flag so nodes will not be skipped + in the next sync, see fts_sync_write_words(). */ + fts_sync_index_reset(index_cache); + + for (j = 0; fts_index_selector[j].value; ++j) { + + if (index_cache->ins_graph[j] != NULL) { + + que_graph_free(index_cache->ins_graph[j]); + + index_cache->ins_graph[j] = NULL; + } + + if (index_cache->sel_graph[j] != NULL) { + + que_graph_free(index_cache->sel_graph[j]); + + index_cache->sel_graph[j] = NULL; + } + } + } + + mysql_mutex_unlock(&cache->lock); + + fts_sql_rollback(trx); + + /* Avoid assertion in trx_t::free(). */ + trx->dict_operation_lock_mode = false; + trx->free(); +} + +/** Run SYNC on the table, i.e., write out data from the cache to the +FTS auxiliary INDEX table and clear the cache at the end. +@param[in,out] sync sync state +@param[in] unlock_cache whether unlock cache lock when write node +@param[in] wait whether wait when a sync is in progress +@return DB_SUCCESS if all OK */ +static +dberr_t +fts_sync( + fts_sync_t* sync, + bool unlock_cache, + bool wait) +{ + if (srv_read_only_mode) { + return DB_READ_ONLY; + } + + ulint i; + dberr_t error = DB_SUCCESS; + fts_cache_t* cache = sync->table->fts->cache; + + mysql_mutex_lock(&cache->lock); + + if (cache->total_size == 0) { + mysql_mutex_unlock(&cache->lock); + return DB_SUCCESS; + } + + /* Check if cache is being synced. + Note: we release cache lock in fts_sync_write_words() to + avoid long wait for the lock by other threads. 
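+If another sync is already in progress the caller either returns
+immediately (wait == false) or blocks on sync->cond until that sync
+finishes.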
+	*/
+	if (sync->in_progress) {
+		if (!wait) {
+			mysql_mutex_unlock(&cache->lock);
+			return(DB_SUCCESS);
+		}
+		do {
+			my_cond_wait(&sync->cond, &cache->lock.m_mutex);
+		} while (sync->in_progress);
+	}
+
+	sync->unlock_cache = unlock_cache;
+	sync->in_progress = true;
+
+	DEBUG_SYNC_C("fts_sync_begin");
+	fts_sync_begin(sync);
+
+begin_sync:
+	const size_t fts_cache_size= fts_max_cache_size;
+	if (cache->total_size > fts_cache_size) {
+		/* Avoid the case where the sync never finishes
+		when insert/update keeps coming. */
+		ut_ad(sync->unlock_cache);
+		sync->unlock_cache = false;
+		ib::warn() << "Total InnoDB FTS size "
+			<< cache->total_size << " for the table "
+			<< cache->sync->table->name
+			<< " exceeds the innodb_ft_cache_size "
+			<< fts_cache_size;
+	}
+
+	for (i = 0; i < ib_vector_size(cache->indexes); ++i) {
+		fts_index_cache_t*	index_cache;
+
+		index_cache = static_cast<fts_index_cache_t*>(
+			ib_vector_get(cache->indexes, i));
+
+		if (index_cache->index->to_be_dropped) {
+			continue;
+		}
+
+		DBUG_EXECUTE_IF("fts_instrument_sync_before_syncing",
+				std::this_thread::sleep_for(
+					std::chrono::milliseconds(300)););
+		error = fts_sync_index(sync, index_cache);
+
+		if (error != DB_SUCCESS) {
+			goto end_sync;
+		}
+
+		if (!sync->unlock_cache
+		    && cache->total_size < fts_max_cache_size) {
+			/* Re-enable unlocking of the cache once its
+			total size drops below innodb_ft_cache_size */
+			sync->unlock_cache = true;
+		}
+	}
+
+	DBUG_EXECUTE_IF("fts_instrument_sync_interrupted",
+			sync->interrupted = true;
+			error = DB_INTERRUPTED;
+			goto end_sync;
+	);
+
+	/* Make sure all the caches are synced. */
+	for (i = 0; i < ib_vector_size(cache->indexes); ++i) {
+		fts_index_cache_t*	index_cache;
+
+		index_cache = static_cast<fts_index_cache_t*>(
+			ib_vector_get(cache->indexes, i));
+
+		if (index_cache->index->to_be_dropped
+		    || fts_sync_index_check(index_cache)) {
+			continue;
+		}
+
+		goto begin_sync;
+	}
+
+end_sync:
+	if (error == DB_SUCCESS && !sync->interrupted) {
+		error = fts_sync_commit(sync);
+	} else {
+		fts_sync_rollback(sync);
+	}
+
+	mysql_mutex_lock(&cache->lock);
+	ut_ad(sync->in_progress);
+	sync->interrupted = false;
+	sync->in_progress = false;
+	pthread_cond_broadcast(&sync->cond);
+	mysql_mutex_unlock(&cache->lock);
+
+	/* We need to check whether an optimize is required, for that
+	we make copies of the two variables that control the trigger. These
+	variables can change behind our back and we don't want to hold the
+	lock for longer than is needed. */
+	mysql_mutex_lock(&cache->deleted_lock);
+
+	cache->added = 0;
+	cache->deleted = 0;
+
+	mysql_mutex_unlock(&cache->deleted_lock);
+
+	return(error);
+}
+
+/** Run SYNC on the table, i.e., write out data from the cache to the
+FTS auxiliary INDEX table and clear the cache at the end.
+@param[in,out]	table	fts table
+@param[in]	wait	whether to wait for an existing sync to finish
+@return DB_SUCCESS on success, error code on failure. */
+dberr_t fts_sync_table(dict_table_t* table, bool wait)
+{
+	ut_ad(table->fts);
+
+	return table->space && !table->corrupted && table->fts->cache
+		? fts_sync(table->fts->cache->sync, !wait, wait)
+		: DB_SUCCESS;
+}
+
+/** Check if an FTS token is a stopword, shorter than fts_min_token_size,
+or longer than fts_max_token_size.
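+A usage sketch (hypothetical arguments; fts_add_token() below calls this
+with stopwords == NULL):
+@code
+	if (fts_check_token(&str, cache->stopword_info.cached_stopword,
+			    doc->charset)) {
+		// token is within the size limits and not a stopword
+	}
+@endcode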
+@param[in]	token		token string
+@param[in]	stopwords	stopwords rb tree
+@param[in]	cs		token charset
+@retval	true	if it is not a stopword and its length is in range
+@retval	false	if it is a stopword or its length is out of range */
+bool
+fts_check_token(
+	const fts_string_t*	token,
+	const ib_rbt_t*		stopwords,
+	const CHARSET_INFO*	cs)
+{
+	ut_ad(cs != NULL || stopwords == NULL);
+
+	ib_rbt_bound_t	parent;
+
+	return(token->f_n_char >= fts_min_token_size
+	       && token->f_n_char <= fts_max_token_size
+	       && (stopwords == NULL
+		   || rbt_search(stopwords, &parent, token) != 0));
+}
+
+/** Add the token and its start position to the token's list of positions.
+@param[in,out]	result_doc	result doc rb tree
+@param[in]	str		token string
+@param[in]	position	token position */
+static
+void
+fts_add_token(
+	fts_doc_t*	result_doc,
+	fts_string_t	str,
+	ulint		position)
+{
+	/* Ignore strings whose character count is less than
+	"fts_min_token_size" or greater than "fts_max_token_size" */
+
+	if (fts_check_token(&str, NULL, result_doc->charset)) {
+
+		mem_heap_t*	heap;
+		fts_string_t	t_str;
+		fts_token_t*	token;
+		ib_rbt_bound_t	parent;
+		ulint		newlen;
+
+		heap = static_cast<mem_heap_t*>(result_doc->self_heap->arg);
+
+		t_str.f_n_char = str.f_n_char;
+
+		t_str.f_len = str.f_len * result_doc->charset->casedn_multiply() + 1;
+
+		t_str.f_str = static_cast<byte*>(
+			mem_heap_alloc(heap, t_str.f_len));
+
+		/* For binary collations, a case sensitive search is
+		performed. Hence don't convert to lower case. */
+		if (my_binary_compare(result_doc->charset)) {
+			memcpy(t_str.f_str, str.f_str, str.f_len);
+			t_str.f_str[str.f_len]= 0;
+			newlen= str.f_len;
+		} else {
+			newlen = innobase_fts_casedn_str(
+				result_doc->charset, (char*) str.f_str, str.f_len,
+				(char*) t_str.f_str, t_str.f_len);
+		}
+
+		t_str.f_len = newlen;
+		t_str.f_str[newlen] = 0;
+
+		/* Add the word to the document statistics. If the word
+		hasn't been seen before we create a new entry for it. */
+		if (rbt_search(result_doc->tokens, &parent, &t_str) != 0) {
+			fts_token_t	new_token;
+
+			new_token.text.f_len = newlen;
+			new_token.text.f_str = t_str.f_str;
+			new_token.text.f_n_char = t_str.f_n_char;
+
+			new_token.positions = ib_vector_create(
+				result_doc->self_heap, sizeof(ulint), 32);
+
+			parent.last = rbt_add_node(
+				result_doc->tokens, &parent, &new_token);
+
+			ut_ad(rbt_validate(result_doc->tokens));
+		}
+
+		token = rbt_value(fts_token_t, parent.last);
+		ib_vector_push(token->positions, &position);
+	}
+}
+
+/********************************************************************
+Process next token from document starting at the given position, i.e., add
+the token's start position to the token's list of positions.
+@return number of characters handled in this call */
+static
+ulint
+fts_process_token(
+/*==============*/
+	fts_doc_t*	doc,		/* in/out: document to
+					tokenize */
+	fts_doc_t*	result,		/* out: if provided, save
+					result here */
+	ulint		start_pos,	/*!< in: start position in text */
+	ulint		add_pos)	/*!< in: add this position to all
+					tokens from this tokenization */
+{
+	ulint		ret;
+	fts_string_t	str;
+	ulint		position;
+	fts_doc_t*	result_doc;
+	byte		buf[FTS_MAX_WORD_LEN + 1];
+
+	str.f_str = buf;
+
+	/* Determine where to save the result. */
+	result_doc = (result != NULL) ? result : doc;
+
+	/* The length of a string in characters is set here only.
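+	innobase_mysql_fts_get_token() fills in str.f_n_char; str.f_len is
+	the byte length of the token that was found.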
+	*/
+
+	ret = innobase_mysql_fts_get_token(
+		doc->charset, doc->text.f_str + start_pos,
+		doc->text.f_str + doc->text.f_len, &str);
+
+	position = start_pos + ret - str.f_len + add_pos;
+
+	fts_add_token(result_doc, str, position);
+
+	return(ret);
+}
+
+/*************************************************************//**
+Get token char size by charset
+@return token size */
+ulint
+fts_get_token_size(
+/*===============*/
+	const CHARSET_INFO*	cs,	/*!< in: Character set */
+	const char*		token,	/*!< in: token */
+	ulint			len)	/*!< in: token length */
+{
+	char*	start;
+	char*	end;
+	ulint	size = 0;
+
+	/* const_cast is for reinterpret_cast below, or it will fail. */
+	start = const_cast<char*>(token);
+	end = start + len;
+	while (start < end) {
+		int	ctype;
+		int	mbl;
+
+		mbl = cs->ctype(
+			&ctype,
+			reinterpret_cast<uchar*>(start),
+			reinterpret_cast<uchar*>(end));
+
+		size++;
+
+		start += mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1);
+	}
+
+	return(size);
+}
+
+/*************************************************************//**
+FTS plugin parser 'mysql_parser' callback function for document tokenize.
+Refer to 'st_mysql_ftparser_param' for more detail.
+@return always returns 0 */
+int
+fts_tokenize_document_internal(
+/*===========================*/
+	MYSQL_FTPARSER_PARAM*	param,	/*!< in: parser parameter */
+	const char*		doc,	/*!< in/out: document */
+	int			len)	/*!< in: document length */
+{
+	fts_string_t	str;
+	byte		buf[FTS_MAX_WORD_LEN + 1];
+	/* JAN: TODO: MySQL 5.7
+	MYSQL_FTPARSER_BOOLEAN_INFO bool_info =
+		{ FT_TOKEN_WORD, 0, 0, 0, 0, 0, ' ', 0 };
+	*/
+	MYSQL_FTPARSER_BOOLEAN_INFO	bool_info =
+		{ FT_TOKEN_WORD, 0, 0, 0, 0, ' ', 0};
+
+	ut_ad(len >= 0);
+
+	str.f_str = buf;
+
+	for (ulint i = 0, inc = 0; i < static_cast<ulint>(len); i += inc) {
+		inc = innobase_mysql_fts_get_token(
+			const_cast<CHARSET_INFO*>(param->cs),
+			(uchar*)(doc) + i,
+			(uchar*)(doc) + len,
+			&str);
+
+		if (str.f_len > 0) {
+			/* JAN: TODO: MySQL 5.7
+			bool_info.position =
+				static_cast<int>(i + inc - str.f_len);
+			ut_ad(bool_info.position >= 0);
+			*/
+
+			/* Stop when add word fails */
+			if (param->mysql_add_word(
+				    param,
+				    reinterpret_cast<char*>(str.f_str),
+				    static_cast<int>(str.f_len),
+				    &bool_info)) {
+				break;
+			}
+		}
+	}
+
+	return(0);
+}
+
+/******************************************************************//**
+FTS plugin parser 'mysql_add_word' callback function for document tokenize.
+Refer to 'st_mysql_ftparser_param' for more detail.
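+It is installed as the mysql_add_word hook, as in fts_tokenize_by_parser()
+below:
+@code
+	param.mysql_add_word = fts_tokenize_add_word_for_parser;
+@endcode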
+@return always returns 0 */
+static
+int
+fts_tokenize_add_word_for_parser(
+/*=============================*/
+	MYSQL_FTPARSER_PARAM*	param,		/* in: parser parameter */
+	const char*		word,		/* in: token word */
+	int			word_len,	/* in: word len */
+	MYSQL_FTPARSER_BOOLEAN_INFO*)
+{
+	fts_string_t		str;
+	fts_tokenize_param_t*	fts_param;
+	fts_doc_t*		result_doc;
+	ulint			position;
+
+	fts_param = static_cast<fts_tokenize_param_t*>(param->mysql_ftparam);
+	result_doc = fts_param->result_doc;
+	ut_ad(result_doc != NULL);
+
+	str.f_str = (byte*)(word);
+	str.f_len = ulint(word_len);
+	str.f_n_char = fts_get_token_size(
+		const_cast<CHARSET_INFO*>(param->cs), word, str.f_len);
+
+	/* JAN: TODO: MySQL 5.7 FTS
+	ut_ad(boolean_info->position >= 0);
+	position = boolean_info->position + fts_param->add_pos;
+	*/
+	position = fts_param->add_pos++;
+
+	fts_add_token(result_doc, str, position);
+
+	return(0);
+}
+
+/******************************************************************//**
+Parse a document using an external / user supplied parser */
+static
+void
+fts_tokenize_by_parser(
+/*===================*/
+	fts_doc_t*		doc,		/* in/out: document to tokenize */
+	st_mysql_ftparser*	parser,		/* in: plugin fts parser */
+	fts_tokenize_param_t*	fts_param)	/* in: fts tokenize param */
+{
+	MYSQL_FTPARSER_PARAM	param;
+
+	ut_a(parser);
+
+	/* Set the parameters for param */
+	param.mysql_parse = fts_tokenize_document_internal;
+	param.mysql_add_word = fts_tokenize_add_word_for_parser;
+	param.mysql_ftparam = fts_param;
+	param.cs = doc->charset;
+	param.doc = reinterpret_cast<char*>(doc->text.f_str);
+	param.length = static_cast<int>(doc->text.f_len);
+	param.mode= MYSQL_FTPARSER_SIMPLE_MODE;
+
+	PARSER_INIT(parser, &param);
+	parser->parse(&param);
+	PARSER_DEINIT(parser, &param);
+}
+
+/** Tokenize a document.
+@param[in,out]	doc	document to tokenize
+@param[out]	result	tokenization result
+@param[in]	parser	pluggable parser */
+static
+void
+fts_tokenize_document(
+	fts_doc_t*		doc,
+	fts_doc_t*		result,
+	st_mysql_ftparser*	parser)
+{
+	ut_a(!doc->tokens);
+	ut_a(doc->charset);
+
+	doc->tokens = rbt_create_arg_cmp(sizeof(fts_token_t),
+					 innobase_fts_text_cmp,
+					 (void*) doc->charset);
+
+	if (parser != NULL) {
+		fts_tokenize_param_t	fts_param;
+		fts_param.result_doc = (result != NULL) ? result : doc;
+		fts_param.add_pos = 0;
+
+		fts_tokenize_by_parser(doc, parser, &fts_param);
+	} else {
+		ulint	inc;
+
+		for (ulint i = 0; i < doc->text.f_len; i += inc) {
+			inc = fts_process_token(doc, result, i, 0);
+			ut_a(inc > 0);
+		}
+	}
+}
+
+/** Continue to tokenize a document.
+@param[in,out]	doc	document to tokenize
+@param[in]	add_pos	add this position to all tokens from this tokenization
+@param[out]	result	tokenization result
+@param[in]	parser	pluggable parser */
+static
+void
+fts_tokenize_document_next(
+	fts_doc_t*		doc,
+	ulint			add_pos,
+	fts_doc_t*		result,
+	st_mysql_ftparser*	parser)
+{
+	ut_a(doc->tokens);
+
+	if (parser) {
+		fts_tokenize_param_t	fts_param;
+
+		fts_param.result_doc = (result != NULL) ? result : doc;
+		fts_param.add_pos = add_pos;
+
+		fts_tokenize_by_parser(doc, parser, &fts_param);
+	} else {
+		ulint	inc;
+
+		for (ulint i = 0; i < doc->text.f_len; i += inc) {
+			inc = fts_process_token(doc, result, i, add_pos);
+			ut_a(inc > 0);
+		}
+	}
+}
+
+/** Create the vector of fts_get_doc_t instances.
+@param[in,out]	cache	fts cache
+@return vector of fts_get_doc_t instances */
+static
+ib_vector_t*
+fts_get_docs_create(
+	fts_cache_t*	cache)
+{
+	ib_vector_t*	get_docs;
+
+	mysql_mutex_assert_owner(&cache->init_lock);
+
+	/* We need one instance of fts_get_doc_t per index.
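+	The vector lives in cache->self_heap and is created with room for
+	four entries, which covers the common case.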
+	*/
+	get_docs = ib_vector_create(cache->self_heap, sizeof(fts_get_doc_t), 4);
+
+	/* Create the get_doc instance, we need one of these
+	per FTS index. */
+	for (ulint i = 0; i < ib_vector_size(cache->indexes); ++i) {
+
+		dict_index_t**	index;
+		fts_get_doc_t*	get_doc;
+
+		index = static_cast<dict_index_t**>(
+			ib_vector_get(cache->indexes, i));
+
+		get_doc = static_cast<fts_get_doc_t*>(
+			ib_vector_push(get_docs, NULL));
+
+		memset(get_doc, 0x0, sizeof(*get_doc));
+
+		get_doc->index_cache = fts_get_index_cache(cache, *index);
+		get_doc->cache = cache;
+
+		/* Must find the index cache. */
+		ut_a(get_doc->index_cache != NULL);
+	}
+
+	return(get_docs);
+}
+
+/********************************************************************
+Release any resources held by the fts_get_doc_t instances. */
+static
+void
+fts_get_docs_clear(
+/*===============*/
+	ib_vector_t*	get_docs)	/*!< in: Doc retrieval vector */
+{
+	ulint	i;
+
+	/* Release the get doc graphs if any. */
+	for (i = 0; i < ib_vector_size(get_docs); ++i) {
+
+		fts_get_doc_t*	get_doc = static_cast<fts_get_doc_t*>(
+			ib_vector_get(get_docs, i));
+
+		if (get_doc->get_document_graph != NULL) {
+
+			ut_a(get_doc->index_cache);
+
+			que_graph_free(get_doc->get_document_graph);
+			get_doc->get_document_graph = NULL;
+		}
+	}
+}
+
+/*********************************************************************//**
+Get the initial Doc ID by consulting the CONFIG table
+@return initial Doc ID */
+doc_id_t
+fts_init_doc_id(
+/*============*/
+	const dict_table_t*	table)	/*!< in: table */
+{
+	doc_id_t	max_doc_id = 0;
+
+	mysql_mutex_lock(&table->fts->cache->lock);
+
+	/* Return if the table is already initialized for DOC ID */
+	if (table->fts->cache->first_doc_id != FTS_NULL_DOC_ID) {
+		mysql_mutex_unlock(&table->fts->cache->lock);
+		return(0);
+	}
+
+	DEBUG_SYNC_C("fts_initialize_doc_id");
+
+	/* Then compare this value with the ID value stored in the CONFIG
+	table. The larger one will be our new initial Doc ID */
+	fts_cmp_set_sync_doc_id(table, 0, FALSE, &max_doc_id);
+
+	/* If DICT_TF2_FTS_ADD_DOC_ID is set, we are in the process of
+	creating the index (and adding the doc id column); there is no
+	need to recover documents */
+	if (!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_ADD_DOC_ID)) {
+		fts_init_index((dict_table_t*) table, TRUE);
+	}
+
+	table->fts->added_synced = true;
+
+	table->fts->cache->first_doc_id = max_doc_id;
+
+	mysql_mutex_unlock(&table->fts->cache->lock);
+
+	ut_ad(max_doc_id > 0);
+
+	return(max_doc_id);
+}
+
+#ifdef FTS_MULT_INDEX
+/*********************************************************************//**
+Check if the index is in the affected set.
+@return TRUE if index is updated */
+static
+ibool
+fts_is_index_updated(
+/*=================*/
+	const ib_vector_t*	fts_indexes,	/*!< in: affected FTS indexes */
+	const fts_get_doc_t*	get_doc)	/*!< in: info for reading
+						document */
+{
+	ulint		i;
+	dict_index_t*	index = get_doc->index_cache->index;
+
+	for (i = 0; i < ib_vector_size(fts_indexes); ++i) {
+		const dict_index_t*	updated_fts_index;
+
+		updated_fts_index = static_cast<const dict_index_t*>(
+			ib_vector_getp_const(fts_indexes, i));
+
+		ut_a(updated_fts_index != NULL);
+
+		if (updated_fts_index == index) {
+			return(TRUE);
+		}
+	}
+
+	return(FALSE);
+}
+#endif
+
+/*********************************************************************//**
+Fetch COUNT(*) from the specified table.
+@return the number of rows in the table */ +ulint +fts_get_rows_count( +/*===============*/ + fts_table_t* fts_table) /*!< in: fts table to read */ +{ + trx_t* trx; + pars_info_t* info; + que_t* graph; + dberr_t error; + ulint count = 0; + char table_name[MAX_FULL_NAME_LEN]; + + trx = trx_create(); + trx->op_info = "fetching FT table rows count"; + + info = pars_info_create(); + + pars_info_bind_function(info, "my_func", fts_read_ulint, &count); + + fts_get_table_name(fts_table, table_name); + pars_info_bind_id(info, "table_name", table_name); + + graph = fts_parse_sql( + fts_table, + info, + "DECLARE FUNCTION my_func;\n" + "DECLARE CURSOR c IS" + " SELECT COUNT(*)" + " FROM $table_name;\n" + "BEGIN\n" + "\n" + "OPEN c;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH c INTO my_func();\n" + " IF c % NOTFOUND THEN\n" + " EXIT;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE c;"); + + for (;;) { + error = fts_eval_sql(trx, graph); + + if (UNIV_LIKELY(error == DB_SUCCESS)) { + fts_sql_commit(trx); + + break; /* Exit the loop. */ + } else { + fts_sql_rollback(trx); + + if (error == DB_LOCK_WAIT_TIMEOUT) { + ib::warn() << "lock wait timeout reading" + " FTS table. Retrying!"; + + trx->error_state = DB_SUCCESS; + } else { + ib::error() << "(" << error + << ") while reading FTS table " + << table_name; + + break; /* Exit the loop. */ + } + } + } + + que_graph_free(graph); + + trx->free(); + + return(count); +} + +#ifdef FTS_CACHE_SIZE_DEBUG +/*********************************************************************//** +Read the max cache size parameter from the config table. */ +static +void +fts_update_max_cache_size( +/*======================*/ + fts_sync_t* sync) /*!< in: sync state */ +{ + trx_t* trx; + fts_table_t fts_table; + + trx = trx_create(); + + FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE, sync->table); + + /* The size returned is in bytes. */ + sync->max_cache_size = fts_get_max_cache_size(trx, &fts_table); + + fts_sql_commit(trx); + + trx->free(); +} +#endif /* FTS_CACHE_SIZE_DEBUG */ + +/*********************************************************************//** +Free the modified rows of a table. */ +UNIV_INLINE +void +fts_trx_table_rows_free( +/*====================*/ + ib_rbt_t* rows) /*!< in: rbt of rows to free */ +{ + const ib_rbt_node_t* node; + + for (node = rbt_first(rows); node; node = rbt_first(rows)) { + fts_trx_row_t* row; + + row = rbt_value(fts_trx_row_t, node); + + if (row->fts_indexes != NULL) { + /* This vector shouldn't be using the + heap allocator. */ + ut_a(row->fts_indexes->allocator->arg == NULL); + + ib_vector_free(row->fts_indexes); + row->fts_indexes = NULL; + } + + ut_free(rbt_remove_node(rows, node)); + } + + ut_a(rbt_empty(rows)); + rbt_free(rows); +} + +/*********************************************************************//** +Free an FTS savepoint instance. */ +UNIV_INLINE +void +fts_savepoint_free( +/*===============*/ + fts_savepoint_t* savepoint) /*!< in: savepoint instance */ +{ + const ib_rbt_node_t* node; + ib_rbt_t* tables = savepoint->tables; + + /* Nothing to free! */ + if (tables == NULL) { + return; + } + + for (node = rbt_first(tables); node; node = rbt_first(tables)) { + fts_trx_table_t* ftt; + fts_trx_table_t** fttp; + + fttp = rbt_value(fts_trx_table_t*, node); + ftt = *fttp; + + /* This can be NULL if a savepoint was released. */ + if (ftt->rows != NULL) { + fts_trx_table_rows_free(ftt->rows); + ftt->rows = NULL; + } + + /* This can be NULL if a savepoint was released. 
*/
+		if (ftt->added_doc_ids != NULL) {
+			fts_doc_ids_free(ftt->added_doc_ids);
+			ftt->added_doc_ids = NULL;
+		}
+
+		/* Free the docs-added statement graph, if one was created. */
+		if (ftt->docs_added_graph) {
+			que_graph_free(ftt->docs_added_graph);
+		}
+
+		/* NOTE: We are responsible for freeing the node */
+		ut_free(rbt_remove_node(tables, node));
+	}
+
+	ut_a(rbt_empty(tables));
+	rbt_free(tables);
+	savepoint->tables = NULL;
+}
+
+/*********************************************************************//**
+Free an FTS trx. */
+void
+fts_trx_free(
+/*=========*/
+	fts_trx_t*	fts_trx)	/* in, own: FTS trx */
+{
+	ulint	i;
+
+	for (i = 0; i < ib_vector_size(fts_trx->savepoints); ++i) {
+		fts_savepoint_t*	savepoint;
+
+		savepoint = static_cast<fts_savepoint_t*>(
+			ib_vector_get(fts_trx->savepoints, i));
+
+		/* The default savepoint name must be NULL. */
+		if (i == 0) {
+			ut_a(savepoint->name == NULL);
+		}
+
+		fts_savepoint_free(savepoint);
+	}
+
+	for (i = 0; i < ib_vector_size(fts_trx->last_stmt); ++i) {
+		fts_savepoint_t*	savepoint;
+
+		savepoint = static_cast<fts_savepoint_t*>(
+			ib_vector_get(fts_trx->last_stmt, i));
+
+		/* The default savepoint name must be NULL. */
+		if (i == 0) {
+			ut_a(savepoint->name == NULL);
+		}
+
+		fts_savepoint_free(savepoint);
+	}
+
+	if (fts_trx->heap) {
+		mem_heap_free(fts_trx->heap);
+	}
+}
+
+/*********************************************************************//**
+Extract the doc id from the FTS hidden column.
+@return doc id that was extracted from rec */
+doc_id_t
+fts_get_doc_id_from_row(
+/*====================*/
+	dict_table_t*	table,	/*!< in: table */
+	dtuple_t*	row)	/*!< in: row whose FTS doc id we
+				want to extract.*/
+{
+	dfield_t*	field;
+	doc_id_t	doc_id = 0;
+
+	ut_a(table->fts->doc_col != ULINT_UNDEFINED);
+
+	field = dtuple_get_nth_field(row, table->fts->doc_col);
+
+	ut_a(dfield_get_len(field) == sizeof(doc_id));
+	ut_a(dfield_get_type(field)->mtype == DATA_INT);
+
+	doc_id = fts_read_doc_id(
+		static_cast<const byte*>(dfield_get_data(field)));
+
+	return(doc_id);
+}
+
+/** Extract the doc id from the record that belongs to index.
+@param[in]	rec	record containing FTS_DOC_ID
+@param[in]	index	index of rec
+@param[in]	offsets	rec_get_offsets(rec,index)
+@return doc id that was extracted from rec */
+doc_id_t
+fts_get_doc_id_from_rec(
+	const rec_t*		rec,
+	const dict_index_t*	index,
+	const rec_offs*		offsets)
+{
+	ulint f = dict_col_get_index_pos(
+		&index->table->cols[index->table->fts->doc_col], index);
+	ulint len;
+	doc_id_t doc_id = mach_read_from_8(
+		rec_get_nth_field(rec, offsets, f, &len));
+	ut_ad(len == 8);
+	return doc_id;
+}
+
+/*********************************************************************//**
+Search the index specific cache for a particular FTS index.
+@return the index specific cache else NULL */
+fts_index_cache_t*
+fts_find_index_cache(
+/*=================*/
+	const fts_cache_t*	cache,	/*!< in: cache to search */
+	const dict_index_t*	index)	/*!< in: index to search for */
+{
+	/* We cast away the const because our internal function takes a
+	non-const cache arg and returns a non-const pointer. */
+	return(static_cast<fts_index_cache_t*>(
+		fts_get_index_cache((fts_cache_t*) cache, index)));
+}
+
+/*********************************************************************//**
+Search cache for word.
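+The caller must hold the cache lock. A lookup sketch (hypothetical):
+@code
+	const ib_vector_t* nodes = fts_cache_find_word(index_cache, &text);
+	if (nodes) {
+		// the word is in the cache
+	}
+@endcode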
+@return the word node vector if found else NULL */
+const ib_vector_t*
+fts_cache_find_word(
+/*================*/
+	const fts_index_cache_t*index_cache,	/*!< in: cache to search */
+	const fts_string_t*	text)		/*!< in: word to search for */
+{
+	ib_rbt_bound_t		parent;
+	const ib_vector_t*	nodes = NULL;
+
+	mysql_mutex_assert_owner(&index_cache->index->table->fts->cache->lock);
+
+	/* Lookup the word in the rb tree */
+	if (rbt_search(index_cache->words, &parent, text) == 0) {
+		const fts_tokenizer_word_t*	word;
+
+		word = rbt_value(fts_tokenizer_word_t, parent.last);
+
+		nodes = word->nodes;
+	}
+
+	return(nodes);
+}
+
+/*********************************************************************//**
+Append deleted doc ids to vector. */
+void
+fts_cache_append_deleted_doc_ids(
+/*=============================*/
+	fts_cache_t*	cache,	/*!< in: cache to use */
+	ib_vector_t*	vector)	/*!< in: append to this vector */
+{
+  mysql_mutex_lock(&cache->deleted_lock);
+
+  if (cache->deleted_doc_ids)
+    for (ulint i= 0; i < ib_vector_size(cache->deleted_doc_ids); ++i)
+    {
+      doc_id_t *update= static_cast<doc_id_t*>(
+        ib_vector_get(cache->deleted_doc_ids, i));
+      ib_vector_push(vector, &update);
+    }
+
+  mysql_mutex_unlock(&cache->deleted_lock);
+}
+
+/*********************************************************************//**
+Add the FTS document id hidden column. */
+void
+fts_add_doc_id_column(
+/*==================*/
+	dict_table_t*	table,	/*!< in/out: Table with FTS index */
+	mem_heap_t*	heap)	/*!< in: temporary memory heap, or NULL */
+{
+	dict_mem_table_add_col(
+		table, heap,
+		FTS_DOC_ID_COL_NAME,
+		DATA_INT,
+		dtype_form_prtype(
+			DATA_NOT_NULL | DATA_UNSIGNED
+			| DATA_BINARY_TYPE | DATA_FTS_DOC_ID, 0),
+		sizeof(doc_id_t));
+	DICT_TF2_FLAG_SET(table, DICT_TF2_FTS_HAS_DOC_ID);
+}
+
+/** Add new fts doc id to the update vector.
+@param[in]	table		the table that contains the FTS index.
+@param[in,out]	ufield		the fts doc id field in the update vector.
+				No new memory is allocated for this in this
+				function.
+@param[in,out]	next_doc_id	the fts doc id that has been added to the
+				update vector. If 0, a new fts doc id is
+				automatically generated. The memory provided
+				for this argument will be used by the update
+				vector. Ensure that the life time of this
+				memory matches that of the update vector.
+@return the fts doc id used in the update vector */
+doc_id_t
+fts_update_doc_id(
+	dict_table_t*	table,
+	upd_field_t*	ufield,
+	doc_id_t*	next_doc_id)
+{
+	doc_id_t	doc_id;
+	dberr_t		error = DB_SUCCESS;
+
+	if (*next_doc_id) {
+		doc_id = *next_doc_id;
+	} else {
+		/* Get the new document id that will be added. */
+		error = fts_get_next_doc_id(table, &doc_id);
+	}
+
+	if (error == DB_SUCCESS) {
+		dict_index_t*	clust_index;
+		dict_col_t*	col = dict_table_get_nth_col(
+			table, table->fts->doc_col);
+
+		ufield->exp = NULL;
+
+		ufield->new_val.len = sizeof(doc_id);
+
+		clust_index = dict_table_get_first_index(table);
+
+		ufield->field_no = static_cast<uint16_t>(
+			dict_col_get_clust_pos(col, clust_index))
+			& dict_index_t::MAX_N_FIELDS;
+		dict_col_copy_type(col, dfield_get_type(&ufield->new_val));
+
+		/* It is possible that we are updating a record that has
+		not yet been sync-ed since the last crash. */
+
+		/* Convert to storage byte order. */
+		ut_a(doc_id != FTS_NULL_DOC_ID);
+		fts_write_doc_id((byte*) next_doc_id, doc_id);
+
+		ufield->new_val.data = next_doc_id;
+		ufield->new_val.ext = 0;
+	}
+
+	return(doc_id);
+}
+
+/** fts_t constructor.
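+Instances are placement-new'ed into a heap, as done by fts_create() below:
+@code
+	fts_t* fts = static_cast<fts_t*>(mem_heap_alloc(heap, sizeof(*fts)));
+	new(fts) fts_t(table, heap);
+@endcode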
+@param[in]	table	table with FTS indexes
+@param[in,out]	heap	memory heap where 'this' is stored */
+fts_t::fts_t(
+	const dict_table_t*	table,
+	mem_heap_t*		heap)
+	:
+	added_synced(0), dict_locked(0),
+	add_wq(NULL),
+	cache(NULL),
+	doc_col(ULINT_UNDEFINED), in_queue(false), sync_message(false),
+	fts_heap(heap)
+{
+	ut_a(table->fts == NULL);
+
+	ib_alloc_t*	heap_alloc = ib_heap_allocator_create(fts_heap);
+
+	indexes = ib_vector_create(heap_alloc, sizeof(dict_index_t*), 4);
+
+	dict_table_get_all_fts_indexes(table, indexes);
+}
+
+/** fts_t destructor. */
+fts_t::~fts_t()
+{
+	ut_ad(add_wq == NULL);
+
+	if (cache) {
+		fts_cache_clear(cache);
+		fts_cache_destroy(cache);
+	}
+
+	/* There is no need to call ib_vector_free() on this->indexes
+	because it is stored in this->fts_heap. */
+	mem_heap_free(fts_heap);
+}
+
+/*********************************************************************//**
+Create an instance of fts_t.
+@return instance of fts_t */
+fts_t*
+fts_create(
+/*=======*/
+	dict_table_t*	table)	/*!< in/out: table with FTS indexes */
+{
+	fts_t*		fts;
+	mem_heap_t*	heap;
+
+	heap = mem_heap_create(512);
+
+	fts = static_cast<fts_t*>(mem_heap_alloc(heap, sizeof(*fts)));
+
+	new(fts) fts_t(table, heap);
+
+	return(fts);
+}
+
+/*********************************************************************//**
+Copy an FTS savepoint. */
+UNIV_INLINE
+void
+fts_savepoint_copy(
+/*===============*/
+	const fts_savepoint_t*	src,	/*!< in: source savepoint */
+	fts_savepoint_t*	dst)	/*!< out: destination savepoint */
+{
+	const ib_rbt_node_t*	node;
+	const ib_rbt_t*		tables;
+
+	tables = src->tables;
+
+	for (node = rbt_first(tables); node; node = rbt_next(tables, node)) {
+
+		fts_trx_table_t*	ftt_dst;
+		const fts_trx_table_t**	ftt_src;
+
+		ftt_src = rbt_value(const fts_trx_table_t*, node);
+
+		ftt_dst = fts_trx_table_clone(*ftt_src);
+
+		rbt_insert(dst->tables, &ftt_dst, &ftt_dst);
+	}
+}
+
+/*********************************************************************//**
+Take an FTS savepoint. */
+void
+fts_savepoint_take(
+/*===============*/
+	fts_trx_t*	fts_trx,	/*!< in: fts transaction */
+	const char*	name)		/*!< in: savepoint name */
+{
+	mem_heap_t*		heap;
+	fts_savepoint_t*	savepoint;
+	fts_savepoint_t*	last_savepoint;
+
+	ut_a(name != NULL);
+
+	heap = fts_trx->heap;
+
+	/* The implied savepoint must exist. */
+	ut_a(ib_vector_size(fts_trx->savepoints) > 0);
+
+	last_savepoint = static_cast<fts_savepoint_t*>(
+		ib_vector_last(fts_trx->savepoints));
+	savepoint = fts_savepoint_create(fts_trx->savepoints, name, heap);
+
+	if (last_savepoint->tables != NULL) {
+		fts_savepoint_copy(last_savepoint, savepoint);
+	}
+}
+
+/*********************************************************************//**
+Lookup a savepoint instance by name.
+@return ULINT_UNDEFINED if not found */
+UNIV_INLINE
+ulint
+fts_savepoint_lookup(
+/*==================*/
+	ib_vector_t*	savepoints,	/*!< in: savepoints */
+	const char*	name)		/*!< in: savepoint name */
+{
+	ulint	i;
+
+	ut_a(ib_vector_size(savepoints) > 0);
+
+	for (i = 1; i < ib_vector_size(savepoints); ++i) {
+		fts_savepoint_t*	savepoint;
+
+		savepoint = static_cast<fts_savepoint_t*>(
+			ib_vector_get(savepoints, i));
+
+		if (strcmp(name, savepoint->name) == 0) {
+			return(i);
+		}
+	}
+
+	return(ULINT_UNDEFINED);
+}
+
+/*********************************************************************//**
+Release the savepoint data identified by name. All savepoints created
+after the named savepoint are kept.
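+A caller sketch (hypothetical savepoint name; reached when the server
+releases a savepoint in a transaction with FTS changes):
+@code
+	fts_savepoint_release(trx, "sp1");
+@endcode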
+*/
+void
+fts_savepoint_release(
+/*==================*/
+	trx_t*		trx,	/*!< in: transaction */
+	const char*	name)	/*!< in: savepoint name */
+{
+	ut_a(name != NULL);
+
+	ib_vector_t*	savepoints = trx->fts_trx->savepoints;
+
+	ut_a(ib_vector_size(savepoints) > 0);
+
+	ulint i = fts_savepoint_lookup(savepoints, name);
+	if (i != ULINT_UNDEFINED) {
+		ut_a(i >= 1);
+
+		fts_savepoint_t*	savepoint;
+		savepoint = static_cast<fts_savepoint_t*>(
+			ib_vector_get(savepoints, i));
+
+		if (i == ib_vector_size(savepoints) - 1) {
+			/* If the savepoint is the last, we save its
+			tables to the previous savepoint. */
+			fts_savepoint_t*	prev_savepoint;
+			prev_savepoint = static_cast<fts_savepoint_t*>(
+				ib_vector_get(savepoints, i - 1));
+
+			ib_rbt_t*	tables = savepoint->tables;
+			savepoint->tables = prev_savepoint->tables;
+			prev_savepoint->tables = tables;
+		}
+
+		fts_savepoint_free(savepoint);
+		ib_vector_remove(savepoints, *(void**)savepoint);
+
+		/* Make sure we don't delete the implied savepoint. */
+		ut_a(ib_vector_size(savepoints) > 0);
+	}
+}
+
+/**********************************************************************//**
+Refresh last statement savepoint. */
+void
+fts_savepoint_laststmt_refresh(
+/*===========================*/
+	trx_t*		trx)	/*!< in: transaction */
+{
+
+	fts_trx_t*		fts_trx;
+	fts_savepoint_t*	savepoint;
+
+	fts_trx = trx->fts_trx;
+
+	savepoint = static_cast<fts_savepoint_t*>(
+		ib_vector_pop(fts_trx->last_stmt));
+	fts_savepoint_free(savepoint);
+
+	ut_ad(ib_vector_is_empty(fts_trx->last_stmt));
+	savepoint = fts_savepoint_create(fts_trx->last_stmt, NULL, NULL);
+}
+
+/********************************************************************
+Undo the Doc ID add/delete operations of the last statement */
+static
+void
+fts_undo_last_stmt(
+/*===============*/
+	fts_trx_table_t*	s_ftt,	/*!< in: Transaction FTS table */
+	fts_trx_table_t*	l_ftt)	/*!< in: last stmt FTS table */
+{
+	ib_rbt_t*		s_rows;
+	ib_rbt_t*		l_rows;
+	const ib_rbt_node_t*	node;
+
+	l_rows = l_ftt->rows;
+	s_rows = s_ftt->rows;
+
+	for (node = rbt_first(l_rows);
+	     node;
+	     node = rbt_next(l_rows, node)) {
+		fts_trx_row_t*	l_row = rbt_value(fts_trx_row_t, node);
+		ib_rbt_bound_t	parent;
+
+		rbt_search(s_rows, &parent, &(l_row->doc_id));
+
+		if (parent.result == 0) {
+			fts_trx_row_t*	s_row = rbt_value(
+				fts_trx_row_t, parent.last);
+
+			switch (l_row->state) {
+			case FTS_INSERT:
+				ut_free(rbt_remove_node(s_rows, parent.last));
+				break;
+
+			case FTS_DELETE:
+				if (s_row->state == FTS_NOTHING) {
+					s_row->state = FTS_INSERT;
+				} else if (s_row->state == FTS_DELETE) {
+					ut_free(rbt_remove_node(
+							s_rows, parent.last));
+				}
+				break;
+
+			/* FIXME: Check if FTS_MODIFY needs to be addressed */
+			case FTS_MODIFY:
+			case FTS_NOTHING:
+				break;
+			default:
+				ut_error;
+			}
+		}
+	}
+}
+
+/**********************************************************************//**
+Roll back the FTS changes of the last statement. */
+void
+fts_savepoint_rollback_last_stmt(
+/*=============================*/
+	trx_t*	trx)	/*!< in: transaction */
+{
+	ib_vector_t*		savepoints;
+	fts_savepoint_t*	savepoint;
+	fts_savepoint_t*	last_stmt;
+	fts_trx_t*		fts_trx;
+	ib_rbt_bound_t		parent;
+	const ib_rbt_node_t*	node;
+	ib_rbt_t*		l_tables;
+	ib_rbt_t*		s_tables;
+
+	fts_trx = trx->fts_trx;
+	savepoints = fts_trx->savepoints;
+
+	savepoint = static_cast<fts_savepoint_t*>(ib_vector_last(savepoints));
+	last_stmt = static_cast<fts_savepoint_t*>(
+		ib_vector_last(fts_trx->last_stmt));
+
+	l_tables = last_stmt->tables;
+	s_tables = savepoint->tables;
+
+	for (node = rbt_first(l_tables);
+	     node;
+	     node = rbt_next(l_tables, node)) {
+
+		fts_trx_table_t**	l_ftt;
+
+		l_ftt = rbt_value(fts_trx_table_t*, node);
+
+		rbt_search_cmp(
+			s_tables, &parent, &(*l_ftt)->table->id,
+			fts_trx_table_id_cmp, NULL);
+
+		if (parent.result == 0) {
+			fts_trx_table_t**	s_ftt;
+
+			s_ftt = rbt_value(fts_trx_table_t*, parent.last);
+
+			fts_undo_last_stmt(*s_ftt, *l_ftt);
+		}
+	}
+}
+
+/**********************************************************************//**
+Rollback to the savepoint identified by name. */
+void
+fts_savepoint_rollback(
+/*===================*/
+	trx_t*		trx,	/*!< in: transaction */
+	const char*	name)	/*!< in: savepoint name */
+{
+	ulint		i;
+	ib_vector_t*	savepoints;
+
+	ut_a(name != NULL);
+
+	savepoints = trx->fts_trx->savepoints;
+
+	/* We pop all savepoints from the top of the stack up to
+	and including the instance that was found. */
+	i = fts_savepoint_lookup(savepoints, name);
+
+	if (i != ULINT_UNDEFINED) {
+		fts_savepoint_t*	savepoint;
+
+		ut_a(i > 0);
+
+		while (ib_vector_size(savepoints) > i) {
+			fts_savepoint_t*	savepoint;
+
+			savepoint = static_cast<fts_savepoint_t*>(
+				ib_vector_pop(savepoints));
+
+			if (savepoint->name != NULL) {
+				/* Since name was allocated on the heap, the
+				memory will be released when the transaction
+				completes. */
+				savepoint->name = NULL;
+
+				fts_savepoint_free(savepoint);
+			}
+		}
+
+		/* Pop all elements from the top of the stack that may
+		have been released. We have to be careful that we don't
+		delete the implied savepoint. */
+
+		for (savepoint = static_cast<fts_savepoint_t*>(
+				ib_vector_last(savepoints));
+		     ib_vector_size(savepoints) > 1
+		     && savepoint->name == NULL;
+		     savepoint = static_cast<fts_savepoint_t*>(
+				ib_vector_last(savepoints))) {
+
+			ib_vector_pop(savepoints);
+		}
+
+		/* Make sure we don't delete the implied savepoint. */
+		ut_a(ib_vector_size(savepoints) > 0);
+
+		/* Restore the savepoint. */
+		fts_savepoint_take(trx->fts_trx, name);
+	}
+}
+
+bool fts_check_aux_table(const char *name,
+                         table_id_t *table_id,
+                         index_id_t *index_id)
+{
+  ulint len= strlen(name);
+  const char* ptr;
+  const char* end= name + len;
+
+  ut_ad(len <= MAX_FULL_NAME_LEN);
+  ptr= static_cast<const char*>(memchr(name, '/', len));
+  IF_WIN(if (!ptr) ptr= static_cast<const char*>(memchr(name, '\\', len)), );
+
+  if (!ptr)
+    return false;
+
+  /* We will start the match after the '/' */
+  ++ptr;
+  len= end - ptr;
+
+  /* All auxiliary tables are prefixed with "FTS_" and the name
+  length will be at the very least greater than 20 bytes. */
+  if (len > 24 && !memcmp(ptr, "FTS_", 4))
+  {
+    /* Skip the prefix. */
+    ptr+= 4;
+    len-= 4;
+
+    const char *table_id_ptr= ptr;
+    /* Skip the table id. */
+    ptr= static_cast<const char*>(memchr(ptr, '_', len));
+
+    if (!ptr)
+      return false;
+
+    /* Skip the underscore. */
+    ++ptr;
+    ut_ad(end > ptr);
+    len= end - ptr;
+
+    sscanf(table_id_ptr, UINT64PFx, table_id);
+    /* First search the common table suffix array.
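+    fts_common_tables[] is NULL-terminated, so the loop below stops at
+    the end of the array.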
+    */
+    for (ulint i = 0; fts_common_tables[i]; ++i)
+    {
+      if (!strncmp(ptr, fts_common_tables[i], len))
+        return true;
+    }
+
+    /* Could be obsolete common tables. */
+    if ((len == 5 && !memcmp(ptr, "ADDED", len)) ||
+        (len == 9 && !memcmp(ptr, "STOPWORDS", len)))
+      return true;
+
+    const char* index_id_ptr= ptr;
+    /* Skip the index id. */
+    ptr= static_cast<const char*>(memchr(ptr, '_', len));
+    if (!ptr)
+      return false;
+
+    sscanf(index_id_ptr, UINT64PFx, index_id);
+
+    /* Skip the underscore. */
+    ++ptr;
+    ut_a(end > ptr);
+    len= end - ptr;
+
+    if (len <= 4)
+      return false;
+
+    len-= 4; /* .ibd suffix */
+
+    if (len > 7)
+      return false;
+
+    /* Search the FT index specific array. */
+    for (ulint i = 0; i < FTS_NUM_AUX_INDEX; ++i)
+    {
+      if (!memcmp(ptr, "INDEX_", len - 1))
+        return true;
+    }
+
+    /* Other FT index specific table(s). */
+    if (len == 6 && !memcmp(ptr, "DOC_ID", len))
+      return true;
+  }
+
+  return false;
+}
+
+/**********************************************************************//**
+Check whether a user supplied stopword table is of the right format.
+The caller is responsible for holding dictionary locks.
+@param stopword_table_name	table name
+@param row_end	name of the system-versioning end column, or "value"
+@return the stopword column charset
+@retval NULL if the table does not exist or qualify */
+CHARSET_INFO*
+fts_valid_stopword_table(
+/*=====================*/
+	const char*	stopword_table_name,	/*!< in: Stopword table
+						name */
+	const char**	row_end)	/* row_end value of system-versioned table */
+{
+	dict_table_t*	table;
+	dict_col_t*	col = NULL;
+
+	if (!stopword_table_name) {
+		return(NULL);
+	}
+
+	table = dict_sys.load_table(
+		{stopword_table_name, strlen(stopword_table_name)});
+
+	if (!table) {
+		ib::error() << "User stopword table " << stopword_table_name
+			<< " does not exist.";
+
+		return(NULL);
+	} else {
+		if (strcmp(dict_table_get_col_name(table, 0), "value")) {
+			ib::error() << "Invalid column name for stopword"
+				" table " << stopword_table_name << ". Its"
+				" first column must be named 'value'.";
+
+			return(NULL);
+		}
+
+		col = dict_table_get_nth_col(table, 0);
+
+		if (col->mtype != DATA_VARCHAR
+		    && col->mtype != DATA_VARMYSQL) {
+			ib::error() << "Invalid column type for stopword"
+				" table " << stopword_table_name << ". Its"
+				" first column must be of varchar type";
+
+			return(NULL);
+		}
+	}
+
+	ut_ad(col);
+	ut_ad(!table->versioned() || col->ind != table->vers_end);
+
+	if (row_end) {
+		*row_end = table->versioned()
+			? dict_table_get_col_name(table, table->vers_end)
+			: "value"; /* for fts_load_user_stopword() */
+	}
+
+	return(fts_get_charset(col->prtype));
+}
+
+/**********************************************************************//**
+This function loads the stopwords into the FTS cache. It also
+records/fetches the stopword configuration to/from the FTS CONFIG
+table, depending on whether we are creating or reloading the
+FTS.
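+A reload sketch (mirroring the call in fts_init_index() below):
+@code
+	fts_load_stopword(table, NULL, NULL, true, true);
+@endcode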
+@return true if load operation is successful */
+bool
+fts_load_stopword(
+/*==============*/
+	const dict_table_t*
+			table,			/*!< in: Table with FTS */
+	trx_t*		trx,			/*!< in: Transactions */
+	const char*	session_stopword_table,	/*!< in: Session stopword table
+						name */
+	bool		stopword_is_on,		/*!< in: Whether stopword
+						option is turned on/off */
+	bool		reload)			/*!< in: Whether it is
+						for reloading FTS table */
+{
+	fts_table_t	fts_table;
+	fts_string_t	str;
+	dberr_t		error = DB_SUCCESS;
+	ulint		use_stopword;
+	fts_cache_t*	cache;
+	const char*	stopword_to_use = NULL;
+	ibool		new_trx = FALSE;
+	byte		str_buffer[MAX_FULL_NAME_LEN + 1];
+
+	FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE, table);
+
+	cache = table->fts->cache;
+
+	if (!reload && !(cache->stopword_info.status & STOPWORD_NOT_INIT)) {
+		return true;
+	}
+
+	if (!trx) {
+		trx = trx_create();
+#ifdef UNIV_DEBUG
+		trx->start_line = __LINE__;
+		trx->start_file = __FILE__;
+#endif
+		trx_start_internal_low(trx, !high_level_read_only);
+		trx->op_info = "upload FTS stopword";
+		new_trx = TRUE;
+	}
+
+	/* First check whether stopword filtering is turned off */
+	if (reload) {
+		error = fts_config_get_ulint(
+			trx, &fts_table, FTS_USE_STOPWORD, &use_stopword);
+	} else {
+		use_stopword = (ulint) stopword_is_on;
+
+		error = fts_config_set_ulint(
+			trx, &fts_table, FTS_USE_STOPWORD, use_stopword);
+	}
+
+	if (error != DB_SUCCESS) {
+		goto cleanup;
+	}
+
+	/* If stopword filtering is turned off, there is no need to load
+	the stopwords into the cache, but we still need to initialize */
+	if (!use_stopword) {
+		cache->stopword_info.status = STOPWORD_OFF;
+		goto cleanup;
+	}
+
+	if (reload) {
+		/* Fetch the stopword table name from the FTS CONFIG
+		table */
+		str.f_n_char = 0;
+		str.f_str = str_buffer;
+		str.f_len = sizeof(str_buffer) - 1;
+
+		error = fts_config_get_value(
+			trx, &fts_table, FTS_STOPWORD_TABLE_NAME, &str);
+
+		if (error != DB_SUCCESS) {
+			goto cleanup;
+		}
+
+		if (*str.f_str) {
+			stopword_to_use = (const char*) str.f_str;
+		}
+	} else {
+		stopword_to_use = session_stopword_table;
+	}
+
+	if (stopword_to_use
+	    && fts_load_user_stopword(table->fts, stopword_to_use,
+				      &cache->stopword_info)) {
+		/* Save the stopword table name to the CONFIG
+		table */
+		if (!reload) {
+			str.f_n_char = 0;
+			str.f_str = (byte*) stopword_to_use;
+			str.f_len = strlen(stopword_to_use);
+
+			error = fts_config_set_value(
+				trx, &fts_table, FTS_STOPWORD_TABLE_NAME, &str);
+		}
+	} else {
+		/* Load the system default stopword list */
+		fts_load_default_stopword(&cache->stopword_info);
+	}
+
+cleanup:
+	if (new_trx) {
+		if (error == DB_SUCCESS) {
+			fts_sql_commit(trx);
+		} else {
+			fts_sql_rollback(trx);
+		}
+
+		trx->free();
+	}
+
+	if (!cache->stopword_info.cached_stopword) {
+		cache->stopword_info.cached_stopword = rbt_create_arg_cmp(
+			sizeof(fts_tokenizer_word_t), innobase_fts_text_cmp,
+			&my_charset_latin1);
+	}
+
+	return error == DB_SUCCESS;
+}
+
+/**********************************************************************//**
+Callback function used when we initialize the FTS at startup
+time. It recovers the maximum Doc ID present in the current table.
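+It is passed as the read callback to fts_doc_fetch_by_doc_id(); see
+fts_init_index() below.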
+Tested by innodb_fts.crash_recovery
+@return always returns TRUE */
+static
+ibool
+fts_init_get_doc_id(
+/*================*/
+	void*	row,		/*!< in: sel_node_t* */
+	void*	user_arg)	/*!< in: table with fts */
+{
+	doc_id_t	doc_id = FTS_NULL_DOC_ID;
+	sel_node_t*	node = static_cast<sel_node_t*>(row);
+	que_node_t*	exp = node->select_list;
+	dict_table_t*	table = static_cast<dict_table_t*>(user_arg);
+	fts_cache_t*	cache = table->fts->cache;
+
+	ut_ad(ib_vector_is_empty(cache->get_docs));
+
+	/* Copy each indexed column content into doc->text.f_str */
+	if (exp) {
+		dfield_t*	dfield = que_node_get_val(exp);
+		dtype_t*	type = dfield_get_type(dfield);
+		void*		data = dfield_get_data(dfield);
+
+		ut_a(dtype_get_mtype(type) == DATA_INT);
+
+		doc_id = static_cast<doc_id_t>(mach_read_from_8(
+			static_cast<byte*>(data)));
+
+		exp = que_node_get_next(que_node_get_next(exp));
+		if (exp) {
+			ut_ad(table->versioned());
+			dfield = que_node_get_val(exp);
+			type = dfield_get_type(dfield);
+			ut_ad(type->vers_sys_end());
+			data = dfield_get_data(dfield);
+			ulint len = dfield_get_len(dfield);
+			if (table->versioned_by_id()) {
+				ut_ad(len == sizeof trx_id_max_bytes);
+				if (0 != memcmp(data, trx_id_max_bytes, len)) {
+					return true;
+				}
+			} else {
+				ut_ad(len == sizeof timestamp_max_bytes);
+				if (0 != memcmp(data, timestamp_max_bytes, len)) {
+					return true;
+				}
+			}
+			ut_ad(!(exp = que_node_get_next(exp)));
+		}
+		ut_ad(!exp);
+
+		if (doc_id >= cache->next_doc_id) {
+			cache->next_doc_id = doc_id + 1;
+		}
+	}
+
+	return(TRUE);
+}
+
+/**********************************************************************//**
+Callback function used when we initialize the FTS at startup
+time. It recovers Doc IDs that have not been sync-ed to the auxiliary
+tables and need to be brought back into the FTS index.
+@return always returns TRUE */
+static
+ibool
+fts_init_recover_doc(
+/*=================*/
+	void*	row,		/*!< in: sel_node_t* */
+	void*	user_arg)	/*!< in: fts cache */
+{
+
+	fts_doc_t	doc;
+	ulint		doc_len = 0;
+	ulint		field_no = 0;
+	fts_get_doc_t*	get_doc = static_cast<fts_get_doc_t*>(user_arg);
+	doc_id_t	doc_id = FTS_NULL_DOC_ID;
+	sel_node_t*	node = static_cast<sel_node_t*>(row);
+	que_node_t*	exp = node->select_list;
+	fts_cache_t*	cache = get_doc->cache;
+	st_mysql_ftparser*	parser = get_doc->index_cache->index->parser;
+
+	fts_doc_init(&doc);
+	doc.found = TRUE;
+
+	ut_ad(cache);
+
+	/* Copy each indexed column content into doc->text.f_str */
+	while (exp) {
+		dfield_t*	dfield = que_node_get_val(exp);
+		ulint		len = dfield_get_len(dfield);
+
+		if (field_no == 0) {
+			dtype_t*	type = dfield_get_type(dfield);
+			void*		data = dfield_get_data(dfield);
+
+			ut_a(dtype_get_mtype(type) == DATA_INT);
+
+			doc_id = static_cast<doc_id_t>(mach_read_from_8(
+				static_cast<byte*>(data)));
+
+			field_no++;
+			exp = que_node_get_next(exp);
+			continue;
+		}
+
+		if (len == UNIV_SQL_NULL) {
+			exp = que_node_get_next(exp);
+			continue;
+		}
+
+		ut_ad(get_doc);
+
+		if (!get_doc->index_cache->charset) {
+			get_doc->index_cache->charset = fts_get_charset(
+				dfield->type.prtype);
+		}
+
+		doc.charset = get_doc->index_cache->charset;
+
+		if (dfield_is_ext(dfield)) {
+			dict_table_t*	table = cache->sync->table;
+
+			doc.text.f_str = btr_copy_externally_stored_field(
+				&doc.text.f_len,
+				static_cast<byte*>(dfield_get_data(dfield)),
+				table->space->zip_size(), len,
+				static_cast<mem_heap_t*>(doc.self_heap->arg));
+		} else {
+			doc.text.f_str = static_cast<byte*>(
+				dfield_get_data(dfield));
+
+			doc.text.f_len = len;
+		}
+
+		if (field_no == 1) {
+			fts_tokenize_document(&doc, NULL, parser);
+		} else {
+			fts_tokenize_document_next(&doc, doc_len, NULL, parser);
+		}
+
+		exp = que_node_get_next(exp);
+
+		doc_len += (exp) ? len + 1 : len;
+
+		field_no++;
+	}
+
+	fts_cache_add_doc(cache, get_doc->index_cache, doc_id, doc.tokens);
+
+	fts_doc_free(&doc);
+
+	cache->added++;
+
+	if (doc_id >= cache->next_doc_id) {
+		cache->next_doc_id = doc_id + 1;
+	}
+
+	return(TRUE);
+}
+
+/**********************************************************************//**
+This function brings the FTS index in sync when the FTS index is first
+used. There may be documents that were not sync-ed to the auxiliary
+tables before the last abnormal server shutdown; we need to bring such
+documents back into the FTS cache before any further operations */
+void
+fts_init_index(
+/*===========*/
+	dict_table_t*	table,		/*!< in: Table with FTS */
+	bool		has_cache_lock)	/*!< in: Whether we already have
+					cache lock */
+{
+	dict_index_t*	index;
+	doc_id_t	start_doc;
+	fts_get_doc_t*	get_doc = NULL;
+	fts_cache_t*	cache = table->fts->cache;
+	bool		need_init = false;
+
+	/* First check that cache->get_docs is initialized */
+	if (!has_cache_lock) {
+		mysql_mutex_lock(&cache->lock);
+	}
+
+	mysql_mutex_lock(&cache->init_lock);
+	if (cache->get_docs == NULL) {
+		cache->get_docs = fts_get_docs_create(cache);
+	}
+	mysql_mutex_unlock(&cache->init_lock);
+
+	if (table->fts->added_synced) {
+		goto func_exit;
+	}
+
+	need_init = true;
+
+	start_doc = cache->synced_doc_id;
+
+	if (!start_doc) {
+		fts_cmp_set_sync_doc_id(table, 0, TRUE, &start_doc);
+		cache->synced_doc_id = start_doc;
+	}
+
+	/* No FTS index: this is the case when the previous FTS index was
+	dropped, and we re-initialize the Doc ID system for subsequent
+	insertions */
+	if (ib_vector_is_empty(cache->get_docs)) {
+		index = table->fts_doc_id_index;
+
+		ut_a(index);
+
+		fts_doc_fetch_by_doc_id(NULL, start_doc, index,
+					FTS_FETCH_DOC_BY_ID_LARGE,
+					fts_init_get_doc_id, table);
+	} else {
+		if (table->fts->cache->stopword_info.status
+		    & STOPWORD_NOT_INIT) {
+			fts_load_stopword(table, NULL, NULL, true, true);
+		}
+
+		for (ulint i = 0; i < ib_vector_size(cache->get_docs); ++i) {
+			get_doc = static_cast<fts_get_doc_t*>(
+				ib_vector_get(cache->get_docs, i));
+
+			index = get_doc->index_cache->index;
+
+			fts_doc_fetch_by_doc_id(NULL, start_doc, index,
+						FTS_FETCH_DOC_BY_ID_LARGE,
+						fts_init_recover_doc, get_doc);
+		}
+	}
+
+	table->fts->added_synced = true;
+
+	fts_get_docs_clear(cache->get_docs);
+
+func_exit:
+	if (!has_cache_lock) {
+		mysql_mutex_unlock(&cache->lock);
+	}
+
+	if (need_init) {
+		dict_sys.lock(SRW_LOCK_CALL);
+		/* Register the table with the optimize thread. */
+		fts_optimize_add_table(table);
+		dict_sys.unlock();
+	}
+}
diff --git a/storage/innobase/fts/fts0opt.cc b/storage/innobase/fts/fts0opt.cc
new file mode 100644
index 00000000..fe31767d
--- /dev/null
+++ b/storage/innobase/fts/fts0opt.cc
@@ -0,0 +1,3054 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2018, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file fts/fts0opt.cc
+Full Text Search optimize thread
+
+Created 2007/03/27 Sunny Bains
+Completed 2011/7/10 Sunny and Jimmy Yang
+
+***********************************************************************/
+
+#include "fts0fts.h"
+#include "row0sel.h"
+#include "que0types.h"
+#include "fts0priv.h"
+#include "fts0types.h"
+#include "ut0wqueue.h"
+#include "srv0start.h"
+#include "ut0list.h"
+#include "zlib.h"
+#include "fts0opt.h"
+#include "fts0vlc.h"
+#include "wsrep.h"
+
+#ifdef WITH_WSREP
+extern Atomic_relaxed<bool> wsrep_sst_disable_writes;
+#else
+constexpr bool wsrep_sst_disable_writes= false;
+#endif
+
+/** The FTS optimize thread's work queue. */
+ib_wqueue_t* fts_optimize_wq;
+static void fts_optimize_callback(void *);
+static void timer_callback(void*);
+static tpool::timer* timer;
+
+static tpool::task_group task_group(1);
+static tpool::task task(fts_optimize_callback, 0, &task_group);
+
+/** FTS optimize thread, for MDL acquisition */
+static THD *fts_opt_thd;
+
+/** The FTS vector to store fts_slot_t */
+static ib_vector_t* fts_slots;
+
+/** Default optimize interval in secs. */
+static const ulint FTS_OPTIMIZE_INTERVAL_IN_SECS = 300;
+
+/** Set when the server is shutting down and the optimize thread
+should exit */
+static bool fts_opt_start_shutdown = false;
+
+/** Condition variable for shutting down the optimize thread.
+Protected by fts_optimize_wq->mutex. */
+static pthread_cond_t fts_opt_shutdown_cond;
+
+/** Initial size of nodes in fts_word_t. */
+static const ulint FTS_WORD_NODES_INIT_SIZE = 64;
+
+/** Last time we checked whether the system needs a sync */
+static time_t	last_check_sync_time;
+
+/** FTS optimize thread message types. */
+enum fts_msg_type_t {
+	FTS_MSG_STOP,		/*!< Stop optimizing and exit thread */
+
+	FTS_MSG_ADD_TABLE,	/*!< Add table to the optimize thread's
+				work queue */
+
+	FTS_MSG_DEL_TABLE,	/*!< Remove a table from the optimize
+				threads work queue */
+	FTS_MSG_SYNC_TABLE	/*!< Sync fts cache of a table */
+};
+
+/** Compressed list of words that have been read from the FTS INDEX
+and that need to be optimized. */
+struct fts_zip_t {
+	lint		status;		/*!< Status of (un)/zip operation */
+
+	ulint		n_words;	/*!< Number of words compressed */
+
+	ulint		block_sz;	/*!< Size of a block in bytes */
+
+	ib_vector_t*	blocks;		/*!< Vector of compressed blocks */
+
+	ib_alloc_t*	heap_alloc;	/*!< Heap to use for allocations */
+
+	ulint		pos;		/*!< Offset into blocks */
+
+	ulint		last_big_block;	/*!< Offset of last block in the
+					blocks array that is of size
+					block_sz. Blocks beyond this offset
+					are of size FTS_MAX_WORD_LEN */
+
+	z_streamp	zp;		/*!< ZLib state */
+
+					/*!< The value of the last word read
+					from the FTS INDEX table. This is
+					used to discard duplicates */
+
+	fts_string_t	word;		/*!< UTF-8 string */
+
+	ulint		max_words;	/*!< maximum number of words to read
+					in one pass */
+};
+
+/** Prepared statements used during optimize */
+struct fts_optimize_graph_t {
+	/*!< Delete a word from FTS INDEX */
+	que_t*		delete_nodes_graph;
+	/*!< Insert a word into FTS INDEX */
+	que_t*		write_nodes_graph;
+	/*!< COMMIT a transaction */
+	que_t*		commit_graph;
+	/*!< Read the nodes from FTS_INDEX */
+	que_t*		read_nodes_graph;
+};
+
+/** Used by fts_optimize() to store state. */
+struct fts_optimize_t {
+	trx_t*		trx;		/*!< The transaction used for all SQL */
+
+	ib_alloc_t*	self_heap;	/*!< Heap to use for allocations */
+
+	char*		name_prefix;	/*!< FTS table name prefix */
+
+	fts_table_t	fts_index_table;/*!< Common table definition */
+
+	/*!< Common table definition */
+	fts_table_t	fts_common_table;
+
+	dict_table_t*	table;		/*!< Table that has to be queried */
+
+	dict_index_t*	index;		/*!< The FTS index to be optimized */
+
+	fts_doc_ids_t*	to_delete;	/*!< doc ids to delete, we check against
+					this vector and purge the matching
+					entries during the optimizing
+					process. The vector entries are
+					sorted on doc id */
+
+	ulint		del_pos;	/*!< Offset within to_delete vector,
+					this is used to keep track of where
+					we are up to in the vector */
+
+	ibool		done;		/*!< TRUE when optimize finishes */
+
+	ib_vector_t*	words;		/*!< Word + Nodes read from FTS_INDEX,
+					it contains instances of fts_word_t */
+
+	fts_zip_t*	zip;		/*!< Words read from the FTS_INDEX */
+
+	fts_optimize_graph_t
+			graph;		/*!< Prepared statements used
+					during optimize */
+
+	ulint		n_completed;	/*!< Number of FTS indexes that have
+					been optimized */
+	ibool		del_list_regenerated;
+					/*!< BEING_DELETED list regenerated */
+};
+
+/** Used by the optimize, to keep state during compacting nodes. */
+struct fts_encode_t {
+	doc_id_t	src_last_doc_id;/*!< Last doc id read from src node */
+	byte*		src_ilist_ptr;	/*!< Current ptr within src ilist */
+};
+
+/** We use this information to determine when to start the optimize
+cycle for a table. */
+struct fts_slot_t {
+	/** table, or NULL if the slot is unused */
+	dict_table_t*	table;
+
+	/** whether this slot is being processed */
+	bool		running;
+
+	ulint		added;		/*!< Number of doc ids added since the
+					last time this table was optimized */
+
+	ulint		deleted;	/*!< Number of doc ids deleted since the
+					last time this table was optimized */
+
+	/** time(NULL) of completing fts_optimize_table_bk() */
+	time_t		last_run;
+
+	/** time(NULL) of latest successful fts_optimize_table() */
+	time_t		completed;
+};
+
+/** A table remove message for the FTS optimize thread. */
+struct fts_msg_del_t
+{
+  /** the table to remove */
+  dict_table_t *table;
+  /** condition variable to signal message consumption */
+  pthread_cond_t *cond;
+};
+
+/** The FTS optimize message work queue message type. */
+struct fts_msg_t {
+	fts_msg_type_t	type;		/*!< Message type */
+
+	void*		ptr;		/*!< The message contents */
+
+	mem_heap_t*	heap;		/*!< The heap used to allocate this
+					message, the message consumer will
+					free the heap. */
+};
+
+/** The number of words to read and optimize in a single pass. */
+ulong	fts_num_word_optimize;
+
+/** Whether to enable additional FTS diagnostic printout. */
+char	fts_enable_diag_print;
+
+/** ZLib compressed block size.*/
+static ulint FTS_ZIP_BLOCK_SIZE	= 1024;
+
+/** The amount of time optimizing in a single pass, in seconds. */
+static ulint	fts_optimize_time_limit;
+
+/** Defined in fts0fts.cc */
+extern const char* fts_common_tables[];
+
+/** SQL Statement for changing state of rows to be deleted from FTS Index. */
+static const char* fts_init_delete_sql =
+	"BEGIN\n"
+	"\n"
+	"INSERT INTO $BEING_DELETED\n"
+	"SELECT doc_id FROM $DELETED;\n"
+	"\n"
+	"INSERT INTO $BEING_DELETED_CACHE\n"
+	"SELECT doc_id FROM $DELETED_CACHE;\n";
+
+static const char* fts_delete_doc_ids_sql =
+	"BEGIN\n"
+	"\n"
+	"DELETE FROM $DELETED WHERE doc_id = :doc_id1;\n"
+	"DELETE FROM $DELETED_CACHE WHERE doc_id = :doc_id2;\n";
+
+static const char* fts_end_delete_sql =
+	"BEGIN\n"
+	"\n"
+	"DELETE FROM $BEING_DELETED;\n"
+	"DELETE FROM $BEING_DELETED_CACHE;\n";
+
+/**********************************************************************//**
+Initialize fts_zip_t. */
+static
+void
+fts_zip_initialize(
+/*===============*/
+	fts_zip_t*	zip)	/*!< out: zip instance to initialize */
+{
+	zip->pos = 0;
+	zip->n_words = 0;
+
+	zip->status = Z_OK;
+
+	zip->last_big_block = 0;
+
+	zip->word.f_len = 0;
+	*zip->word.f_str = 0;
+
+	ib_vector_reset(zip->blocks);
+
+	memset(zip->zp, 0, sizeof(*zip->zp));
+}
+
+/**********************************************************************//**
+Create an instance of fts_zip_t.
+@return a new instance of fts_zip_t */
+static
+fts_zip_t*
+fts_zip_create(
+/*===========*/
+	mem_heap_t*	heap,		/*!< in: heap */
+	ulint		block_sz,	/*!< in: size of a zip block.*/
+	ulint		max_words)	/*!< in: max words to read */
+{
+	fts_zip_t*	zip;
+
+	zip = static_cast<fts_zip_t*>(mem_heap_zalloc(heap, sizeof(*zip)));
+
+	zip->word.f_str = static_cast<byte*>(
+		mem_heap_zalloc(heap, FTS_MAX_WORD_LEN + 1));
+
+	zip->block_sz = block_sz;
+
+	zip->heap_alloc = ib_heap_allocator_create(heap);
+
+	zip->blocks = ib_vector_create(zip->heap_alloc, sizeof(void*), 128);
+
+	zip->max_words = max_words;
+
+	zip->zp = static_cast<z_stream*>(
+		mem_heap_zalloc(heap, sizeof(*zip->zp)));
+
+	return(zip);
+}
+
+/**********************************************************************//**
+Initialize an instance of fts_zip_t. */
+static
+void
+fts_zip_init(
+/*=========*/
+
+	fts_zip_t*	zip)	/*!< in: zip instance to init */
+{
+	memset(zip->zp, 0, sizeof(*zip->zp));
+
+	zip->word.f_len = 0;
+	*zip->word.f_str = '\0';
+}
+
+/**********************************************************************//**
+Initialize a fts_word_t instance.
+@return the initialized instance */
+static
+fts_word_t*
+fts_word_init(
+/*==========*/
+	fts_word_t*	word,	/*!< in: word to initialize */
+	byte*		utf8,	/*!< in: UTF-8 string */
+	ulint		len)	/*!< in: length of string in bytes */
+{
+	mem_heap_t*	heap = mem_heap_create(sizeof(fts_node_t));
+
+	memset(word, 0, sizeof(*word));
+
+	word->text.f_len = len;
+	word->text.f_str = static_cast<byte*>(mem_heap_alloc(heap, len + 1));
+
+	/* Need to copy the NUL character too. */
+	memcpy(word->text.f_str, utf8, word->text.f_len);
+	word->text.f_str[word->text.f_len] = 0;
+
+	word->heap_alloc = ib_heap_allocator_create(heap);
+
+	word->nodes = ib_vector_create(
+		word->heap_alloc, sizeof(fts_node_t), FTS_WORD_NODES_INIT_SIZE);
+
+	return(word);
+}
+
+/**********************************************************************
+Read the FTS INDEX row.
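+The caller has already consumed the leading word column; the remaining
+columns are DOC_COUNT, FIRST_DOC_ID, LAST_DOC_ID and ILIST, in that
+order (see the switch statement below).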
+
+/**********************************************************************//**
+Read the FTS INDEX row.
+@return fts_node_t instance */
+static
+fts_node_t*
+fts_optimize_read_node(
+/*===================*/
+	fts_word_t*	word,		/*!< in: word whose nodes are read */
+	que_node_t*	exp)		/*!< in: select list of the query */
+{
+	int		i;
+	fts_node_t*	node = static_cast<fts_node_t*>(
+		ib_vector_push(word->nodes, NULL));
+
+	/* Start from 1 since the first node has been read by the caller */
+	for (i = 1; exp; exp = que_node_get_next(exp), ++i) {
+
+		dfield_t*	dfield = que_node_get_val(exp);
+		byte*		data = static_cast<byte*>(
+			dfield_get_data(dfield));
+		ulint		len = dfield_get_len(dfield);
+
+		ut_a(len != UNIV_SQL_NULL);
+
+		/* Note: The column numbers below must match the SELECT */
+		switch (i) {
+		case 1: /* DOC_COUNT */
+			node->doc_count = mach_read_from_4(data);
+			break;
+
+		case 2: /* FIRST_DOC_ID */
+			node->first_doc_id = fts_read_doc_id(data);
+			break;
+
+		case 3: /* LAST_DOC_ID */
+			node->last_doc_id = fts_read_doc_id(data);
+			break;
+
+		case 4: /* ILIST */
+			node->ilist_size_alloc = node->ilist_size = len;
+			node->ilist = static_cast<byte*>(ut_malloc_nokey(len));
+			memcpy(node->ilist, data, len);
+			break;
+
+		default:
+			ut_error;
+		}
+	}
+
+	/* Make sure all columns were read. */
+	ut_a(i == 5);
+
+	return(node);
+}
+
+/**********************************************************************//**
+Callback function to fetch the rows in an FTS INDEX record.
+@return TRUE to continue, FALSE if the result cache limit was reached */
+ibool
+fts_optimize_index_fetch_node(
+/*==========================*/
+	void*		row,		/*!< in: sel_node_t* */
+	void*		user_arg)	/*!< in: pointer to fts_fetch_t */
+{
+	fts_word_t*	word;
+	sel_node_t*	sel_node = static_cast<sel_node_t*>(row);
+	fts_fetch_t*	fetch = static_cast<fts_fetch_t*>(user_arg);
+	ib_vector_t*	words = static_cast<ib_vector_t*>(fetch->read_arg);
+	que_node_t*	exp = sel_node->select_list;
+	dfield_t*	dfield = que_node_get_val(exp);
+	void*		data = dfield_get_data(dfield);
+	ulint		dfield_len = dfield_get_len(dfield);
+	fts_node_t*	node;
+	bool		is_word_init = false;
+
+	ut_a(dfield_len <= FTS_MAX_WORD_LEN);
+
+	if (ib_vector_size(words) == 0) {
+
+		word = static_cast<fts_word_t*>(ib_vector_push(words, NULL));
+		fts_word_init(word, (byte*) data, dfield_len);
+		is_word_init = true;
+	}
+
+	word = static_cast<fts_word_t*>(ib_vector_last(words));
+
+	if (dfield_len != word->text.f_len
+	    || memcmp(word->text.f_str, data, dfield_len)) {
+
+		word = static_cast<fts_word_t*>(ib_vector_push(words, NULL));
+		fts_word_init(word, (byte*) data, dfield_len);
+		is_word_init = true;
+	}
+
+	node = fts_optimize_read_node(word, que_node_get_next(exp));
+
+	fetch->total_memory += node->ilist_size;
+	if (is_word_init) {
+		fetch->total_memory += sizeof(fts_word_t)
+			+ sizeof(ib_alloc_t) + sizeof(ib_vector_t) + dfield_len
+			+ sizeof(fts_node_t) * FTS_WORD_NODES_INIT_SIZE;
+	} else if (ib_vector_size(words) > FTS_WORD_NODES_INIT_SIZE) {
+		fetch->total_memory += sizeof(fts_node_t);
+	}
+
+	if (fetch->total_memory >= fts_result_cache_limit) {
+		return(FALSE);
+	}
+
+	return(TRUE);
+}
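+
+/* Illustration only, not used by this file: the callback above starts
+a new fts_word_t exactly when the incoming word differs from the last
+one, relying on rows for the same word arriving consecutively. A
+minimal sketch of that grouping idiom over a sorted stream; group_t
+and add_row() are invented names. */
+#if 0
+# include <string>
+# include <vector>
+
+struct group_t {
+	std::string	key;
+	int		rows;
+};
+
+/* Append one row to the last group, opening a new group on the first
+row or whenever the key differs, like fts_word_init() above. */
+static void add_row(std::vector<group_t>& groups, const std::string& key)
+{
+	if (groups.empty() || groups.back().key != key) {
+		groups.push_back(group_t{key, 0});
+	}
+
+	++groups.back().rows;
+}
+#endif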
+
+/**********************************************************************//**
+Read the rows from the FTS index.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_index_fetch_nodes(
+/*==================*/
+	trx_t*		trx,		/*!< in: transaction */
+	que_t**		graph,		/*!< in: prepared statement */
+	fts_table_t*	fts_table,	/*!< in: table of the FTS INDEX */
+	const fts_string_t*
+			word,		/*!< in: the word to fetch */
+	fts_fetch_t*	fetch)		/*!< in: fetch callback.*/
+{
+	pars_info_t*	info;
+	dberr_t		error;
+	char		table_name[MAX_FULL_NAME_LEN];
+
+	trx->op_info = "fetching FTS index nodes";
+
+	if (*graph) {
+		info = (*graph)->info;
+	} else {
+		ulint	selected;
+
+		info = pars_info_create();
+
+		ut_a(fts_table->type == FTS_INDEX_TABLE);
+
+		selected = fts_select_index(fts_table->charset,
+					    word->f_str, word->f_len);
+
+		fts_table->suffix = fts_get_suffix(selected);
+
+		fts_get_table_name(fts_table, table_name);
+
+		pars_info_bind_id(info, "table_name", table_name);
+	}
+
+	pars_info_bind_function(info, "my_func", fetch->read_record, fetch);
+	pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len);
+
+	if (!*graph) {
+
+		*graph = fts_parse_sql(
+			fts_table,
+			info,
+			"DECLARE FUNCTION my_func;\n"
+			"DECLARE CURSOR c IS"
+			" SELECT word, doc_count, first_doc_id, last_doc_id,"
+			" ilist\n"
+			" FROM $table_name\n"
+			" WHERE word LIKE :word\n"
+			" ORDER BY first_doc_id;\n"
+			"BEGIN\n"
+			"\n"
+			"OPEN c;\n"
+			"WHILE 1 = 1 LOOP\n"
+			"  FETCH c INTO my_func();\n"
+			"  IF c % NOTFOUND THEN\n"
+			"    EXIT;\n"
+			"  END IF;\n"
+			"END LOOP;\n"
+			"CLOSE c;");
+	}
+
+	for (;;) {
+		error = fts_eval_sql(trx, *graph);
+
+		if (UNIV_LIKELY(error == DB_SUCCESS)) {
+			fts_sql_commit(trx);
+
+			break;			/* Exit the loop. */
+		} else {
+			fts_sql_rollback(trx);
+
+			if (error == DB_LOCK_WAIT_TIMEOUT) {
+				ib::warn() << "lock wait timeout reading"
+					" FTS index. Retrying!";
+
+				trx->error_state = DB_SUCCESS;
+			} else {
+				ib::error() << "(" << error
+					<< ") while reading FTS index.";
+
+				break;		/* Exit the loop. */
+			}
+		}
+	}
+
+	return(error);
+}
+
+/**********************************************************************//**
+Read a word from the zip buffer.
+@return pointer to the uncompressed word, or NULL at end of stream */
+static
+byte*
+fts_zip_read_word(
+/*==============*/
+	fts_zip_t*	zip,		/*!< in: Zip state + data */
+	fts_string_t*	word)		/*!< out: uncompressed word */
+{
+	short		len = 0;
+	void*		null = NULL;
+	byte*		ptr = word->f_str;
+	int		flush = Z_NO_FLUSH;
+
+	/* Either there was an error or we are at the Z_STREAM_END. */
+	if (zip->status != Z_OK) {
+		return(NULL);
+	}
+
+	zip->zp->next_out = reinterpret_cast<byte*>(&len);
+	zip->zp->avail_out = sizeof(len);
+
+	while (zip->status == Z_OK && zip->zp->avail_out > 0) {
+
+		/* Finished decompressing block. */
+		if (zip->zp->avail_in == 0) {
+
+			/* Free the block that's been decompressed. */
+			if (zip->pos > 0) {
+				ulint	prev = zip->pos - 1;
+
+				ut_a(zip->pos < ib_vector_size(zip->blocks));
+
+				ut_free(ib_vector_getp(zip->blocks, prev));
+				ib_vector_set(zip->blocks, prev, &null);
+			}
+
+			/* Any more blocks to decompress.
*/ + if (zip->pos < ib_vector_size(zip->blocks)) { + + zip->zp->next_in = static_cast( + ib_vector_getp( + zip->blocks, zip->pos)); + + if (zip->pos > zip->last_big_block) { + zip->zp->avail_in = + FTS_MAX_WORD_LEN; + } else { + zip->zp->avail_in = + static_cast(zip->block_sz); + } + + ++zip->pos; + } else { + flush = Z_FINISH; + } + } + + switch (zip->status = inflate(zip->zp, flush)) { + case Z_OK: + if (zip->zp->avail_out == 0 && len > 0) { + + ut_a(len <= FTS_MAX_WORD_LEN); + ptr[len] = 0; + + zip->zp->next_out = ptr; + zip->zp->avail_out = uInt(len); + + word->f_len = ulint(len); + len = 0; + } + break; + + case Z_BUF_ERROR: /* No progress possible. */ + case Z_STREAM_END: + inflateEnd(zip->zp); + break; + + case Z_STREAM_ERROR: + default: + ut_error; + } + } + + /* All blocks must be freed at end of inflate. */ + if (zip->status != Z_OK) { + for (ulint i = 0; i < ib_vector_size(zip->blocks); ++i) { + if (ib_vector_getp(zip->blocks, i)) { + ut_free(ib_vector_getp(zip->blocks, i)); + ib_vector_set(zip->blocks, i, &null); + } + } + } + + if (ptr != NULL) { + ut_ad(word->f_len == strlen((char*) ptr)); + } + + return(zip->status == Z_OK || zip->status == Z_STREAM_END ? ptr : NULL); +} + +/**********************************************************************//** +Callback function to fetch and compress the word in an FTS +INDEX record. +@return FALSE on EOF */ +static +ibool +fts_fetch_index_words( +/*==================*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: pointer to ib_vector_t */ +{ + sel_node_t* sel_node = static_cast(row); + fts_zip_t* zip = static_cast(user_arg); + que_node_t* exp = sel_node->select_list; + dfield_t* dfield = que_node_get_val(exp); + + ut_a(dfield_get_len(dfield) <= FTS_MAX_WORD_LEN); + + uint16 len = uint16(dfield_get_len(dfield)); + void* data = dfield_get_data(dfield); + + /* Skip the duplicate words. */ + if (zip->word.f_len == len && !memcmp(zip->word.f_str, data, len)) { + return(TRUE); + } + + memcpy(zip->word.f_str, data, len); + zip->word.f_len = len; + + ut_a(zip->zp->avail_in == 0); + ut_a(zip->zp->next_in == NULL); + + /* The string is prefixed by len. */ + /* FIXME: This is not byte order agnostic (InnoDB data files + with FULLTEXT INDEX are not portable between little-endian and + big-endian systems!) */ + zip->zp->next_in = reinterpret_cast(&len); + zip->zp->avail_in = sizeof(len); + + /* Compress the word, create output blocks as necessary. */ + while (zip->zp->avail_in > 0) { + + /* No space left in output buffer, create a new one. */ + if (zip->zp->avail_out == 0) { + byte* block; + + block = static_cast( + ut_malloc_nokey(zip->block_sz)); + + ib_vector_push(zip->blocks, &block); + + zip->zp->next_out = block; + zip->zp->avail_out = static_cast(zip->block_sz); + } + + switch (zip->status = deflate(zip->zp, Z_NO_FLUSH)) { + case Z_OK: + if (zip->zp->avail_in == 0) { + zip->zp->next_in = static_cast(data); + zip->zp->avail_in = uInt(len); + ut_a(len <= FTS_MAX_WORD_LEN); + len = 0; + } + continue; + + case Z_STREAM_END: + case Z_BUF_ERROR: + case Z_STREAM_ERROR: + default: + ut_error; + } + } + + /* All data should have been compressed. */ + ut_a(zip->zp->avail_in == 0); + zip->zp->next_in = NULL; + + ++zip->n_words; + + return(zip->n_words >= zip->max_words ? FALSE : TRUE); +} + +/**********************************************************************//** +Finish Zip deflate. 
*/ +static +void +fts_zip_deflate_end( +/*================*/ + fts_zip_t* zip) /*!< in: instance that should be closed*/ +{ + ut_a(zip->zp->avail_in == 0); + ut_a(zip->zp->next_in == NULL); + + zip->status = deflate(zip->zp, Z_FINISH); + + ut_a(ib_vector_size(zip->blocks) > 0); + zip->last_big_block = ib_vector_size(zip->blocks) - 1; + + /* Allocate smaller block(s), since this is trailing data. */ + while (zip->status == Z_OK) { + byte* block; + + ut_a(zip->zp->avail_out == 0); + + block = static_cast( + ut_malloc_nokey(FTS_MAX_WORD_LEN + 1)); + + ib_vector_push(zip->blocks, &block); + + zip->zp->next_out = block; + zip->zp->avail_out = FTS_MAX_WORD_LEN; + + zip->status = deflate(zip->zp, Z_FINISH); + } + + ut_a(zip->status == Z_STREAM_END); + + zip->status = deflateEnd(zip->zp); + ut_a(zip->status == Z_OK); + + /* Reset the ZLib data structure. */ + memset(zip->zp, 0, sizeof(*zip->zp)); +} + +/**********************************************************************//** +Read the words from the FTS INDEX. +@return DB_SUCCESS if all OK, DB_TABLE_NOT_FOUND if no more indexes + to search else error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_index_fetch_words( +/*==================*/ + fts_optimize_t* optim, /*!< in: optimize scratch pad */ + const fts_string_t* word, /*!< in: get words greater than this + word */ + ulint n_words)/*!< in: max words to read */ +{ + pars_info_t* info; + que_t* graph; + ulint selected; + fts_zip_t* zip = NULL; + dberr_t error = DB_SUCCESS; + mem_heap_t* heap = static_cast(optim->self_heap->arg); + ibool inited = FALSE; + + optim->trx->op_info = "fetching FTS index words"; + + if (optim->zip == NULL) { + optim->zip = fts_zip_create(heap, FTS_ZIP_BLOCK_SIZE, n_words); + } else { + fts_zip_initialize(optim->zip); + } + + for (selected = fts_select_index( + optim->fts_index_table.charset, word->f_str, word->f_len); + selected < FTS_NUM_AUX_INDEX; + selected++) { + + char table_name[MAX_FULL_NAME_LEN]; + + optim->fts_index_table.suffix = fts_get_suffix(selected); + + info = pars_info_create(); + + pars_info_bind_function( + info, "my_func", fts_fetch_index_words, optim->zip); + + pars_info_bind_varchar_literal( + info, "word", word->f_str, word->f_len); + + fts_get_table_name(&optim->fts_index_table, table_name); + pars_info_bind_id(info, "table_name", table_name); + + graph = fts_parse_sql( + &optim->fts_index_table, + info, + "DECLARE FUNCTION my_func;\n" + "DECLARE CURSOR c IS" + " SELECT word\n" + " FROM $table_name\n" + " WHERE word > :word\n" + " ORDER BY word;\n" + "BEGIN\n" + "\n" + "OPEN c;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH c INTO my_func();\n" + " IF c % NOTFOUND THEN\n" + " EXIT;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE c;"); + + zip = optim->zip; + + for (;;) { + int err; + + if (!inited && ((err = deflateInit(zip->zp, 9)) + != Z_OK)) { + ib::error() << "ZLib deflateInit() failed: " + << err; + + error = DB_ERROR; + break; + } else { + inited = TRUE; + error = fts_eval_sql(optim->trx, graph); + } + + if (UNIV_LIKELY(error == DB_SUCCESS)) { + //FIXME fts_sql_commit(optim->trx); + break; + } else { + //FIXME fts_sql_rollback(optim->trx); + + if (error == DB_LOCK_WAIT_TIMEOUT) { + ib::warn() << "Lock wait timeout" + " reading document. Retrying!"; + + /* We need to reset the ZLib state. */ + inited = FALSE; + deflateEnd(zip->zp); + fts_zip_init(zip); + + optim->trx->error_state = DB_SUCCESS; + } else { + ib::error() << "(" << error + << ") while reading document."; + + break; /* Exit the loop. 
*/ + } + } + } + + que_graph_free(graph); + + /* Check if max word to fetch is exceeded */ + if (optim->zip->n_words >= n_words) { + break; + } + } + + if (error == DB_SUCCESS && zip->status == Z_OK && zip->n_words > 0) { + + /* All data should have been read. */ + ut_a(zip->zp->avail_in == 0); + + fts_zip_deflate_end(zip); + } else { + deflateEnd(zip->zp); + } + + return(error); +} + +/**********************************************************************//** +Callback function to fetch the doc id from the record. +@return always returns TRUE */ +static +ibool +fts_fetch_doc_ids( +/*==============*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: pointer to ib_vector_t */ +{ + que_node_t* exp; + int i = 0; + sel_node_t* sel_node = static_cast(row); + fts_doc_ids_t* fts_doc_ids = static_cast(user_arg); + doc_id_t* update = static_cast( + ib_vector_push(fts_doc_ids->doc_ids, NULL)); + + for (exp = sel_node->select_list; + exp; + exp = que_node_get_next(exp), ++i) { + + dfield_t* dfield = que_node_get_val(exp); + void* data = dfield_get_data(dfield); + ulint len = dfield_get_len(dfield); + + ut_a(len != UNIV_SQL_NULL); + + /* Note: The column numbers below must match the SELECT. */ + switch (i) { + case 0: /* DOC_ID */ + *update = fts_read_doc_id( + static_cast(data)); + break; + + default: + ut_error; + } + } + + return(TRUE); +} + +/**********************************************************************//** +Read the rows from a FTS common auxiliary table. +@return DB_SUCCESS or error code */ +dberr_t +fts_table_fetch_doc_ids( +/*====================*/ + trx_t* trx, /*!< in: transaction */ + fts_table_t* fts_table, /*!< in: table */ + fts_doc_ids_t* doc_ids) /*!< in: For collecting doc ids */ +{ + dberr_t error; + que_t* graph; + pars_info_t* info = pars_info_create(); + ibool alloc_bk_trx = FALSE; + char table_name[MAX_FULL_NAME_LEN]; + + ut_a(fts_table->suffix != NULL); + ut_a(fts_table->type == FTS_COMMON_TABLE); + + if (!trx) { + trx = trx_create(); + alloc_bk_trx = TRUE; + } + + trx->op_info = "fetching FTS doc ids"; + + pars_info_bind_function(info, "my_func", fts_fetch_doc_ids, doc_ids); + + fts_get_table_name(fts_table, table_name); + pars_info_bind_id(info, "table_name", table_name); + + graph = fts_parse_sql( + fts_table, + info, + "DECLARE FUNCTION my_func;\n" + "DECLARE CURSOR c IS" + " SELECT doc_id FROM $table_name;\n" + "BEGIN\n" + "\n" + "OPEN c;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH c INTO my_func();\n" + " IF c % NOTFOUND THEN\n" + " EXIT;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE c;"); + + error = fts_eval_sql(trx, graph); + fts_sql_commit(trx); + que_graph_free(graph); + + if (error == DB_SUCCESS) { + ib_vector_sort(doc_ids->doc_ids, fts_doc_id_cmp); + } + + if (alloc_bk_trx) { + trx->free(); + } + + return(error); +} + +/**********************************************************************//** +Do a binary search for a doc id in the array +@return +ve index if found -ve index where it should be inserted + if not found */ +int +fts_bsearch( +/*========*/ + doc_id_t* array, /*!< in: array to sort */ + int lower, /*!< in: the array lower bound */ + int upper, /*!< in: the array upper bound */ + doc_id_t doc_id) /*!< in: the doc id to search for */ +{ + int orig_size = upper; + + if (upper == 0) { + /* Nothing to search */ + return(-1); + } else { + while (lower < upper) { + int i = (lower + upper) >> 1; + + if (doc_id > array[i]) { + lower = i + 1; + } else if (doc_id < array[i]) { + upper = i - 1; + } else { + return(i); /* Found. 
*/ + } + } + } + + if (lower == upper && lower < orig_size) { + if (doc_id == array[lower]) { + return(lower); + } else if (lower == 0) { + return(-1); + } + } + + /* Not found. */ + return( (lower == 0) ? -1 : -(lower)); +} + +/**********************************************************************//** +Search in the to delete array whether any of the doc ids within +the [first, last] range are to be deleted +@return +ve index if found -ve index where it should be inserted + if not found */ +static +int +fts_optimize_lookup( +/*================*/ + ib_vector_t* doc_ids, /*!< in: array to search */ + ulint lower, /*!< in: lower limit of array */ + doc_id_t first_doc_id, /*!< in: doc id to lookup */ + doc_id_t last_doc_id) /*!< in: doc id to lookup */ +{ + int pos; + int upper = static_cast(ib_vector_size(doc_ids)); + doc_id_t* array = (doc_id_t*) doc_ids->data; + + pos = fts_bsearch(array, static_cast(lower), upper, first_doc_id); + + ut_a(abs(pos) <= upper + 1); + + if (pos < 0) { + + int i = abs(pos); + + /* If i is 1, it could be first_doc_id is less than + either the first or second array item, do a + double check */ + if (i == 1 && array[0] <= last_doc_id + && first_doc_id < array[0]) { + pos = 0; + } else if (i < upper && array[i] <= last_doc_id) { + + /* Check if the "next" doc id is within the + first & last doc id of the node. */ + pos = i; + } + } + + return(pos); +} + +/**********************************************************************//** +Encode the word pos list into the node +@return DB_SUCCESS or error code*/ +static MY_ATTRIBUTE((nonnull)) +dberr_t +fts_optimize_encode_node( +/*=====================*/ + fts_node_t* node, /*!< in: node to fill*/ + doc_id_t doc_id, /*!< in: doc id to encode */ + fts_encode_t* enc) /*!< in: encoding state.*/ +{ + byte* dst; + ulint enc_len; + ulint pos_enc_len; + doc_id_t doc_id_delta; + dberr_t error = DB_SUCCESS; + const byte* src = enc->src_ilist_ptr; + + if (node->first_doc_id == 0) { + ut_a(node->last_doc_id == 0); + + node->first_doc_id = doc_id; + } + + /* Calculate the space required to store the ilist. */ + ut_ad(doc_id > node->last_doc_id); + doc_id_delta = doc_id - node->last_doc_id; + enc_len = fts_get_encoded_len(static_cast(doc_id_delta)); + + /* Calculate the size of the encoded pos array. */ + while (*src) { + fts_decode_vlc(&src); + } + + /* Skip the 0x00 byte at the end of the word positions list. */ + ++src; + + /* Number of encoded pos bytes to copy. */ + pos_enc_len = ulint(src - enc->src_ilist_ptr); + + /* Total number of bytes required for copy. */ + enc_len += pos_enc_len; + + /* Check we have enough space in the destination buffer for + copying the document word list. */ + if (!node->ilist) { + ulint new_size; + + ut_a(node->ilist_size == 0); + + new_size = enc_len > FTS_ILIST_MAX_SIZE + ? enc_len : FTS_ILIST_MAX_SIZE; + + node->ilist = static_cast(ut_malloc_nokey(new_size)); + node->ilist_size_alloc = new_size; + + } else if ((node->ilist_size + enc_len) > node->ilist_size_alloc) { + ulint new_size = node->ilist_size + enc_len; + byte* ilist = static_cast(ut_malloc_nokey(new_size)); + + memcpy(ilist, node->ilist, node->ilist_size); + + ut_free(node->ilist); + + node->ilist = ilist; + node->ilist_size_alloc = new_size; + } + + src = enc->src_ilist_ptr; + dst = node->ilist + node->ilist_size; + + /* Encode the doc id. Cast to ulint, the delta should be small and + therefore no loss of precision. */ + dst = fts_encode_int(doc_id_delta, dst); + + /* Copy the encoded pos array. 
*/ + memcpy(dst, src, pos_enc_len); + + node->last_doc_id = doc_id; + + /* Data copied upto here. */ + node->ilist_size += enc_len; + enc->src_ilist_ptr += pos_enc_len; + + ut_a(node->ilist_size <= node->ilist_size_alloc); + + return(error); +} + +/**********************************************************************//** +Optimize the data contained in a node. +@return DB_SUCCESS or error code*/ +static MY_ATTRIBUTE((nonnull)) +dberr_t +fts_optimize_node( +/*==============*/ + ib_vector_t* del_vec, /*!< in: vector of doc ids to delete*/ + int* del_pos, /*!< in: offset into above vector */ + fts_node_t* dst_node, /*!< in: node to fill*/ + fts_node_t* src_node, /*!< in: source node for data*/ + fts_encode_t* enc) /*!< in: encoding state */ +{ + ulint copied; + dberr_t error = DB_SUCCESS; + doc_id_t doc_id = enc->src_last_doc_id; + + if (!enc->src_ilist_ptr) { + enc->src_ilist_ptr = src_node->ilist; + } + + copied = ulint(enc->src_ilist_ptr - src_node->ilist); + + /* While there is data in the source node and space to copy + into in the destination node. */ + while (copied < src_node->ilist_size + && dst_node->ilist_size < FTS_ILIST_MAX_SIZE) { + + doc_id_t delta; + doc_id_t del_doc_id = FTS_NULL_DOC_ID; + + delta = fts_decode_vlc( + (const byte**)&enc->src_ilist_ptr); + +test_again: + /* Check whether the doc id is in the delete list, if + so then we skip the entries but we need to track the + delta for decoding the entries following this document's + entries. */ + if (*del_pos >= 0 && *del_pos < (int) ib_vector_size(del_vec)) { + doc_id_t* update; + + update = (doc_id_t*) ib_vector_get( + del_vec, ulint(*del_pos)); + + del_doc_id = *update; + } + + if (enc->src_ilist_ptr == src_node->ilist && doc_id == 0) { + ut_a(delta == src_node->first_doc_id); + } + + doc_id += delta; + + if (del_doc_id > 0 && doc_id == del_doc_id) { + + ++*del_pos; + + /* Skip the entries for this document. */ + while (*enc->src_ilist_ptr) { + fts_decode_vlc((const byte**)&enc->src_ilist_ptr); + } + + /* Skip the end of word position marker. */ + ++enc->src_ilist_ptr; + + } else { + + /* DOC ID already becomes larger than + del_doc_id, check the next del_doc_id */ + if (del_doc_id > 0 && doc_id > del_doc_id) { + del_doc_id = 0; + ++*del_pos; + delta = 0; + goto test_again; + } + + /* Decode and copy the word positions into + the dest node. */ + fts_optimize_encode_node(dst_node, doc_id, enc); + + ++dst_node->doc_count; + + ut_a(dst_node->last_doc_id == doc_id); + } + + /* Bytes copied so for from source. */ + copied = ulint(enc->src_ilist_ptr - src_node->ilist); + } + + if (copied >= src_node->ilist_size) { + ut_a(doc_id == src_node->last_doc_id); + } + + enc->src_last_doc_id = doc_id; + + return(error); +} + +/**********************************************************************//** +Determine the starting pos within the deleted doc id vector for a word. +@return delete position */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +int +fts_optimize_deleted_pos( +/*=====================*/ + fts_optimize_t* optim, /*!< in: optimize state data */ + fts_word_t* word) /*!< in: the word data to check */ +{ + int del_pos; + ib_vector_t* del_vec = optim->to_delete->doc_ids; + + /* Get the first and last dict ids for the word, we will use + these values to determine which doc ids need to be removed + when we coalesce the nodes. This way we can reduce the numer + of elements that need to be searched in the deleted doc ids + vector and secondly we can remove the doc ids during the + coalescing phase. 
*/
+	if (ib_vector_size(del_vec) > 0) {
+		fts_node_t*	node;
+		doc_id_t	last_id;
+		doc_id_t	first_id;
+		ulint		size = ib_vector_size(word->nodes);
+
+		node = (fts_node_t*) ib_vector_get(word->nodes, 0);
+		first_id = node->first_doc_id;
+
+		node = (fts_node_t*) ib_vector_get(word->nodes, size - 1);
+		last_id = node->last_doc_id;
+
+		ut_a(first_id <= last_id);
+
+		del_pos = fts_optimize_lookup(
+			del_vec, optim->del_pos, first_id, last_id);
+	} else {
+
+		del_pos = -1; /* Note that there is nothing to delete. */
+	}
+
+	return(del_pos);
+}
+
+#define FTS_DEBUG_PRINT
+/**********************************************************************//**
+Compact the nodes for a word, we also remove any doc ids during the
+compaction pass.
+@return vector of optimized nodes */
+static
+ib_vector_t*
+fts_optimize_word(
+/*==============*/
+	fts_optimize_t*	optim,	/*!< in: optimize state data */
+	fts_word_t*	word)	/*!< in: the word to optimize */
+{
+	fts_encode_t	enc;
+	ib_vector_t*	nodes;
+	ulint		i = 0;
+	int		del_pos;
+	fts_node_t*	dst_node = NULL;
+	ib_vector_t*	del_vec = optim->to_delete->doc_ids;
+	ulint		size = ib_vector_size(word->nodes);
+
+	del_pos = fts_optimize_deleted_pos(optim, word);
+	nodes = ib_vector_create(word->heap_alloc, sizeof(*dst_node), 128);
+
+	enc.src_last_doc_id = 0;
+	enc.src_ilist_ptr = NULL;
+
+	while (i < size) {
+		ulint		copied;
+		fts_node_t*	src_node;
+
+		src_node = (fts_node_t*) ib_vector_get(word->nodes, i);
+
+		if (dst_node == NULL
+		    || dst_node->last_doc_id > src_node->first_doc_id) {
+
+			dst_node = static_cast<fts_node_t*>(
+				ib_vector_push(nodes, NULL));
+			memset(dst_node, 0, sizeof(*dst_node));
+		}
+
+		/* Copy from the src to the dst node. */
+		fts_optimize_node(del_vec, &del_pos, dst_node, src_node, &enc);
+
+		ut_a(enc.src_ilist_ptr != NULL);
+
+		/* Determine the number of bytes copied to dst_node. */
+		copied = ulint(enc.src_ilist_ptr - src_node->ilist);
+
+		/* Can't copy more than what's in the vlc array. */
+		ut_a(copied <= src_node->ilist_size);
+
+		/* We are done with this node, release the resources. */
+		if (copied == src_node->ilist_size) {
+
+			enc.src_last_doc_id = 0;
+			enc.src_ilist_ptr = NULL;
+
+			ut_free(src_node->ilist);
+
+			src_node->ilist = NULL;
+			src_node->ilist_size = src_node->ilist_size_alloc = 0;
+
+			src_node = NULL;
+
+			++i; /* Get next source node to OPTIMIZE. */
+		}
+
+		if (dst_node->ilist_size >= FTS_ILIST_MAX_SIZE || i >= size) {
+
+			dst_node = NULL;
+		}
+	}
+
+	/* All dst nodes created should have been added to the vector. */
+	ut_a(dst_node == NULL);
+
+	/* Return the OPTIMIZED nodes. */
+	return(nodes);
+}
+
+/**********************************************************************//**
+Update the FTS index table. This is a delete followed by an insert.
+@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_optimize_write_word( +/*====================*/ + trx_t* trx, /*!< in: transaction */ + fts_table_t* fts_table, /*!< in: table of FTS index */ + fts_string_t* word, /*!< in: word data to write */ + ib_vector_t* nodes) /*!< in: the nodes to write */ +{ + ulint i; + pars_info_t* info; + que_t* graph; + ulint selected; + dberr_t error = DB_SUCCESS; + char table_name[MAX_FULL_NAME_LEN]; + + info = pars_info_create(); + + ut_ad(fts_table->charset); + + pars_info_bind_varchar_literal( + info, "word", word->f_str, word->f_len); + + selected = fts_select_index(fts_table->charset, + word->f_str, word->f_len); + + fts_table->suffix = fts_get_suffix(selected); + fts_get_table_name(fts_table, table_name); + pars_info_bind_id(info, "table_name", table_name); + + graph = fts_parse_sql( + fts_table, + info, + "BEGIN DELETE FROM $table_name WHERE word = :word;"); + + error = fts_eval_sql(trx, graph); + + if (UNIV_UNLIKELY(error != DB_SUCCESS)) { + ib::error() << "(" << error << ") during optimize," + " when deleting a word from the FTS index."; + } + + que_graph_free(graph); + graph = NULL; + + /* Even if the operation needs to be rolled back and redone, + we iterate over the nodes in order to free the ilist. */ + for (i = 0; i < ib_vector_size(nodes); ++i) { + + fts_node_t* node = (fts_node_t*) ib_vector_get(nodes, i); + + if (error == DB_SUCCESS) { + /* Skip empty node. */ + if (node->ilist == NULL) { + ut_ad(node->ilist_size == 0); + continue; + } + + error = fts_write_node( + trx, &graph, fts_table, word, node); + + if (UNIV_UNLIKELY(error != DB_SUCCESS)) { + ib::error() << "(" << error << ")" + " during optimize, while adding a" + " word to the FTS index."; + } + } + + ut_free(node->ilist); + node->ilist = NULL; + node->ilist_size = node->ilist_size_alloc = 0; + } + + if (graph != NULL) { + que_graph_free(graph); + } + + return(error); +} + +/**********************************************************************//** +Free fts_optimizer_word_t instanace.*/ +void +fts_word_free( +/*==========*/ + fts_word_t* word) /*!< in: instance to free.*/ +{ + mem_heap_t* heap = static_cast(word->heap_alloc->arg); + +#ifdef UNIV_DEBUG + memset(word, 0, sizeof(*word)); +#endif /* UNIV_DEBUG */ + + mem_heap_free(heap); +} + +/**********************************************************************//** +Optimize the word ilist and rewrite data to the FTS index. +@return status one of RESTART, EXIT, ERROR */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_optimize_compact( +/*=================*/ + fts_optimize_t* optim, /*!< in: optimize state data */ + dict_index_t* index, /*!< in: current FTS being optimized */ + time_t start_time) /*!< in: optimize start time */ +{ + ulint i; + dberr_t error = DB_SUCCESS; + ulint size = ib_vector_size(optim->words); + + for (i = 0; i < size && error == DB_SUCCESS && !optim->done; ++i) { + fts_word_t* word; + ib_vector_t* nodes; + trx_t* trx = optim->trx; + + word = (fts_word_t*) ib_vector_get(optim->words, i); + + /* nodes is allocated from the word heap and will be destroyed + when the word is freed. We however have to be careful about + the ilist, that needs to be freed explicitly. */ + nodes = fts_optimize_word(optim, word); + + /* Update the data on disk. 
*/ + error = fts_optimize_write_word( + trx, &optim->fts_index_table, &word->text, nodes); + + if (error == DB_SUCCESS) { + /* Write the last word optimized to the config table, + we use this value for restarting optimize. */ + error = fts_config_set_index_value( + optim->trx, index, + FTS_LAST_OPTIMIZED_WORD, &word->text); + } + + /* Free the word that was optimized. */ + fts_word_free(word); + + ulint interval = ulint(time(NULL) - start_time); + + if (fts_optimize_time_limit > 0 + && (lint(interval) < 0 + || interval > fts_optimize_time_limit)) { + + optim->done = TRUE; + } + } + + return(error); +} + +/**********************************************************************//** +Create an instance of fts_optimize_t. Also create a new +background transaction.*/ +static +fts_optimize_t* +fts_optimize_create( +/*================*/ + dict_table_t* table) /*!< in: table with FTS indexes */ +{ + fts_optimize_t* optim; + mem_heap_t* heap = mem_heap_create(128); + + optim = (fts_optimize_t*) mem_heap_zalloc(heap, sizeof(*optim)); + + optim->self_heap = ib_heap_allocator_create(heap); + + optim->to_delete = fts_doc_ids_create(); + + optim->words = ib_vector_create( + optim->self_heap, sizeof(fts_word_t), 256); + + optim->table = table; + + optim->trx = trx_create(); + trx_start_internal(optim->trx); + + optim->fts_common_table.table_id = table->id; + optim->fts_common_table.type = FTS_COMMON_TABLE; + optim->fts_common_table.table = table; + + optim->fts_index_table.table_id = table->id; + optim->fts_index_table.type = FTS_INDEX_TABLE; + optim->fts_index_table.table = table; + + /* The common prefix for all this parent table's aux tables. */ + char table_id[FTS_AUX_MIN_TABLE_ID_LENGTH]; + const size_t table_id_len = 1 + + size_t(fts_get_table_id(&optim->fts_common_table, table_id)); + dict_sys.freeze(SRW_LOCK_CALL); + /* Include the separator as well. */ + const size_t dbname_len = table->name.dblen() + 1; + ut_ad(dbname_len > 1); + const size_t prefix_name_len = dbname_len + 4 + table_id_len; + char* prefix_name = static_cast( + ut_malloc_nokey(prefix_name_len)); + memcpy(prefix_name, table->name.m_name, dbname_len); + dict_sys.unfreeze(); + memcpy(prefix_name + dbname_len, "FTS_", 4); + memcpy(prefix_name + dbname_len + 4, table_id, table_id_len); + optim->name_prefix =prefix_name; + + return(optim); +} + +#ifdef FTS_OPTIMIZE_DEBUG +/**********************************************************************//** +Get optimize start time of an FTS index. +@return DB_SUCCESS if all OK else error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_optimize_get_index_start_time( +/*==============================*/ + trx_t* trx, /*!< in: transaction */ + dict_index_t* index, /*!< in: FTS index */ + time_t* start_time) /*!< out: time in secs */ +{ + return(fts_config_get_index_ulint( + trx, index, FTS_OPTIMIZE_START_TIME, + (ulint*) start_time)); +} + +/**********************************************************************//** +Set the optimize start time of an FTS index. 
+@return DB_SUCCESS if all OK else error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_optimize_set_index_start_time( +/*==============================*/ + trx_t* trx, /*!< in: transaction */ + dict_index_t* index, /*!< in: FTS index */ + time_t start_time) /*!< in: start time */ +{ + return(fts_config_set_index_ulint( + trx, index, FTS_OPTIMIZE_START_TIME, + (ulint) start_time)); +} + +/**********************************************************************//** +Get optimize end time of an FTS index. +@return DB_SUCCESS if all OK else error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_optimize_get_index_end_time( +/*============================*/ + trx_t* trx, /*!< in: transaction */ + dict_index_t* index, /*!< in: FTS index */ + time_t* end_time) /*!< out: time in secs */ +{ + return(fts_config_get_index_ulint( + trx, index, FTS_OPTIMIZE_END_TIME, (ulint*) end_time)); +} + +/**********************************************************************//** +Set the optimize end time of an FTS index. +@return DB_SUCCESS if all OK else error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_optimize_set_index_end_time( +/*============================*/ + trx_t* trx, /*!< in: transaction */ + dict_index_t* index, /*!< in: FTS index */ + time_t end_time) /*!< in: end time */ +{ + return(fts_config_set_index_ulint( + trx, index, FTS_OPTIMIZE_END_TIME, (ulint) end_time)); +} +#endif + +/**********************************************************************//** +Free the optimize prepared statements.*/ +static +void +fts_optimize_graph_free( +/*====================*/ + fts_optimize_graph_t* graph) /*!< in/out: The graph instances + to free */ +{ + if (graph->commit_graph) { + que_graph_free(graph->commit_graph); + graph->commit_graph = NULL; + } + + if (graph->write_nodes_graph) { + que_graph_free(graph->write_nodes_graph); + graph->write_nodes_graph = NULL; + } + + if (graph->delete_nodes_graph) { + que_graph_free(graph->delete_nodes_graph); + graph->delete_nodes_graph = NULL; + } + + if (graph->read_nodes_graph) { + que_graph_free(graph->read_nodes_graph); + graph->read_nodes_graph = NULL; + } +} + +/**********************************************************************//** +Free all optimize resources. */ +static +void +fts_optimize_free( +/*==============*/ + fts_optimize_t* optim) /*!< in: table with on FTS index */ +{ + mem_heap_t* heap = static_cast(optim->self_heap->arg); + + trx_commit_for_mysql(optim->trx); + optim->trx->free(); + optim->trx = NULL; + + fts_doc_ids_free(optim->to_delete); + fts_optimize_graph_free(&optim->graph); + + ut_free(optim->name_prefix); + + /* This will free the heap from which optim itself was allocated. */ + mem_heap_free(heap); +} + +/**********************************************************************//** +Get the max time optimize should run in millisecs. +@return max optimize time limit in millisecs. */ +static +ulint +fts_optimize_get_time_limit( +/*========================*/ + trx_t* trx, /*!< in: transaction */ + fts_table_t* fts_table) /*!< in: aux table */ +{ + ulint time_limit = 0; + + fts_config_get_ulint( + trx, fts_table, + FTS_OPTIMIZE_LIMIT_IN_SECS, &time_limit); + + /* FIXME: This is returning milliseconds, while the variable + is being stored and interpreted as seconds! */ + return(time_limit * 1000); +} + +/**********************************************************************//** +Run OPTIMIZE on the given table. Note: this can take a very long time +(hours). 
*/ +static +void +fts_optimize_words( +/*===============*/ + fts_optimize_t* optim, /*!< in: optimize instance */ + dict_index_t* index, /*!< in: current FTS being optimized */ + fts_string_t* word) /*!< in: the starting word to optimize */ +{ + fts_fetch_t fetch; + que_t* graph = NULL; + CHARSET_INFO* charset = optim->fts_index_table.charset; + + ut_a(!optim->done); + + /* Get the time limit from the config table. */ + fts_optimize_time_limit = fts_optimize_get_time_limit( + optim->trx, &optim->fts_common_table); + + const time_t start_time = time(NULL); + + /* Setup the callback to use for fetching the word ilist etc. */ + fetch.read_arg = optim->words; + fetch.read_record = fts_optimize_index_fetch_node; + + while (!optim->done) { + dberr_t error; + trx_t* trx = optim->trx; + ulint selected; + + ut_a(ib_vector_size(optim->words) == 0); + + selected = fts_select_index(charset, word->f_str, word->f_len); + + /* Read the index records to optimize. */ + fetch.total_memory = 0; + error = fts_index_fetch_nodes( + trx, &graph, &optim->fts_index_table, word, + &fetch); + ut_ad(fetch.total_memory < fts_result_cache_limit); + + if (error == DB_SUCCESS) { + /* There must be some nodes to read. */ + ut_a(ib_vector_size(optim->words) > 0); + + /* Optimize the nodes that were read and write + back to DB. */ + error = fts_optimize_compact(optim, index, start_time); + + if (error == DB_SUCCESS) { + fts_sql_commit(optim->trx); + } else { + fts_sql_rollback(optim->trx); + } + } + + ib_vector_reset(optim->words); + + if (error == DB_SUCCESS) { + if (!optim->done) { + if (!fts_zip_read_word(optim->zip, word)) { + optim->done = TRUE; + } else if (selected + != fts_select_index( + charset, word->f_str, + word->f_len) + && graph) { + que_graph_free(graph); + graph = NULL; + } + } + } else if (error == DB_LOCK_WAIT_TIMEOUT) { + ib::warn() << "Lock wait timeout during optimize." + " Retrying!"; + + trx->error_state = DB_SUCCESS; + } else if (error == DB_DEADLOCK) { + ib::warn() << "Deadlock during optimize. Retrying!"; + + trx->error_state = DB_SUCCESS; + } else { + optim->done = TRUE; /* Exit the loop. */ + } + } + + if (graph != NULL) { + que_graph_free(graph); + } +} + +/**********************************************************************//** +Optimize is complete. Set the completion time, and reset the optimize +start string for this FTS index to "". +@return DB_SUCCESS if all OK */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_optimize_index_completed( +/*=========================*/ + fts_optimize_t* optim, /*!< in: optimize instance */ + dict_index_t* index) /*!< in: table with one FTS index */ +{ + fts_string_t word; + dberr_t error; + byte buf[sizeof(ulint)]; +#ifdef FTS_OPTIMIZE_DEBUG + time_t end_time = time(NULL); + + error = fts_optimize_set_index_end_time(optim->trx, index, end_time); +#endif + + /* If we've reached the end of the index then set the start + word to the empty string. */ + + word.f_len = 0; + word.f_str = buf; + *word.f_str = '\0'; + + error = fts_config_set_index_value( + optim->trx, index, FTS_LAST_OPTIMIZED_WORD, &word); + + if (UNIV_UNLIKELY(error != DB_SUCCESS)) { + ib::error() << "(" << error << ") while updating" + " last optimized word!"; + } + + return(error); +} + + +/**********************************************************************//** +Read the list of words from the FTS auxiliary index that will be +optimized in this pass. 
+@return DB_SUCCESS if all OK */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_optimize_index_read_words( +/*==========================*/ + fts_optimize_t* optim, /*!< in: optimize instance */ + dict_index_t* index, /*!< in: table with one FTS index */ + fts_string_t* word) /*!< in: buffer to use */ +{ + dberr_t error = DB_SUCCESS; + + if (optim->del_list_regenerated) { + word->f_len = 0; + } else { + + /* Get the last word that was optimized from + the config table. */ + error = fts_config_get_index_value( + optim->trx, index, FTS_LAST_OPTIMIZED_WORD, word); + } + + /* If record not found then we start from the top. */ + if (error == DB_RECORD_NOT_FOUND) { + word->f_len = 0; + error = DB_SUCCESS; + } + + while (error == DB_SUCCESS) { + + error = fts_index_fetch_words( + optim, word, fts_num_word_optimize); + + if (error == DB_SUCCESS) { + /* Reset the last optimized word to '' if no + more words could be read from the FTS index. */ + if (optim->zip->n_words == 0) { + word->f_len = 0; + *word->f_str = 0; + } + + break; + } + } + + return(error); +} + +/**********************************************************************//** +Run OPTIMIZE on the given FTS index. Note: this can take a very long +time (hours). +@return DB_SUCCESS if all OK */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_optimize_index( +/*===============*/ + fts_optimize_t* optim, /*!< in: optimize instance */ + dict_index_t* index) /*!< in: table with one FTS index */ +{ + fts_string_t word; + dberr_t error; + byte str[FTS_MAX_WORD_LEN + 1]; + + /* Set the current index that we have to optimize. */ + optim->fts_index_table.index_id = index->id; + optim->fts_index_table.charset = fts_index_get_charset(index); + + optim->done = FALSE; /* Optimize until !done */ + + /* We need to read the last word optimized so that we start from + the next word. */ + word.f_str = str; + + /* We set the length of word to the size of str since we + need to pass the max len info to the fts_get_config_value() function. */ + word.f_len = sizeof(str) - 1; + + memset(word.f_str, 0x0, word.f_len); + + /* Read the words that will be optimized in this pass. */ + error = fts_optimize_index_read_words(optim, index, &word); + + if (error == DB_SUCCESS) { + int zip_error; + + ut_a(optim->zip->pos == 0); + ut_a(optim->zip->zp->total_in == 0); + ut_a(optim->zip->zp->total_out == 0); + + zip_error = inflateInit(optim->zip->zp); + ut_a(zip_error == Z_OK); + + word.f_len = 0; + word.f_str = str; + + /* Read the first word to optimize from the Zip buffer. */ + if (!fts_zip_read_word(optim->zip, &word)) { + + optim->done = TRUE; + } else { + fts_optimize_words(optim, index, &word); + } + + /* If we couldn't read any records then optimize is + complete. Increment the number of indexes that have + been optimized and set FTS index optimize state to + completed. */ + if (error == DB_SUCCESS && optim->zip->n_words == 0) { + + error = fts_optimize_index_completed(optim, index); + + if (error == DB_SUCCESS) { + ++optim->n_completed; + } + } + } + + return(error); +} + +/**********************************************************************//** +Delete the document ids in the delete, and delete cache tables. 
+@return DB_SUCCESS if all OK */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_optimize_purge_deleted_doc_ids( +/*===============================*/ + fts_optimize_t* optim) /*!< in: optimize instance */ +{ + ulint i; + pars_info_t* info; + que_t* graph; + doc_id_t* update; + doc_id_t write_doc_id; + dberr_t error = DB_SUCCESS; + char deleted[MAX_FULL_NAME_LEN]; + char deleted_cache[MAX_FULL_NAME_LEN]; + + info = pars_info_create(); + + ut_a(ib_vector_size(optim->to_delete->doc_ids) > 0); + + update = static_cast( + ib_vector_get(optim->to_delete->doc_ids, 0)); + + /* Convert to "storage" byte order. */ + fts_write_doc_id((byte*) &write_doc_id, *update); + + /* This is required for the SQL parser to work. It must be able + to find the following variables. So we do it twice. */ + fts_bind_doc_id(info, "doc_id1", &write_doc_id); + fts_bind_doc_id(info, "doc_id2", &write_doc_id); + + /* Make sure the following two names are consistent with the name + used in the fts_delete_doc_ids_sql */ + optim->fts_common_table.suffix = fts_common_tables[3]; + fts_get_table_name(&optim->fts_common_table, deleted); + pars_info_bind_id(info, fts_common_tables[3], deleted); + + optim->fts_common_table.suffix = fts_common_tables[4]; + fts_get_table_name(&optim->fts_common_table, deleted_cache); + pars_info_bind_id(info, fts_common_tables[4], deleted_cache); + + graph = fts_parse_sql(NULL, info, fts_delete_doc_ids_sql); + + /* Delete the doc ids that were copied at the start. */ + for (i = 0; i < ib_vector_size(optim->to_delete->doc_ids); ++i) { + + update = static_cast(ib_vector_get( + optim->to_delete->doc_ids, i)); + + /* Convert to "storage" byte order. */ + fts_write_doc_id((byte*) &write_doc_id, *update); + + fts_bind_doc_id(info, "doc_id1", &write_doc_id); + + fts_bind_doc_id(info, "doc_id2", &write_doc_id); + + error = fts_eval_sql(optim->trx, graph); + + // FIXME: Check whether delete actually succeeded! + if (error != DB_SUCCESS) { + + fts_sql_rollback(optim->trx); + break; + } + } + + que_graph_free(graph); + + return(error); +} + +/**********************************************************************//** +Delete the document ids in the pending delete, and delete tables. +@return DB_SUCCESS if all OK */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_optimize_purge_deleted_doc_id_snapshot( +/*=======================================*/ + fts_optimize_t* optim) /*!< in: optimize instance */ +{ + dberr_t error; + que_t* graph; + pars_info_t* info; + char being_deleted[MAX_FULL_NAME_LEN]; + char being_deleted_cache[MAX_FULL_NAME_LEN]; + + info = pars_info_create(); + + /* Make sure the following two names are consistent with the name + used in the fts_end_delete_sql */ + optim->fts_common_table.suffix = fts_common_tables[0]; + fts_get_table_name(&optim->fts_common_table, being_deleted); + pars_info_bind_id(info, fts_common_tables[0], being_deleted); + + optim->fts_common_table.suffix = fts_common_tables[1]; + fts_get_table_name(&optim->fts_common_table, being_deleted_cache); + pars_info_bind_id(info, fts_common_tables[1], being_deleted_cache); + + /* Delete the doc ids that were copied to delete pending state at + the start of optimize. 
*/
+	graph = fts_parse_sql(NULL, info, fts_end_delete_sql);
+
+	error = fts_eval_sql(optim->trx, graph);
+	que_graph_free(graph);
+
+	return(error);
+}
+
+/**********************************************************************//**
+Get the number of doc ids still in the BEING_DELETED table, i.e. those
+left over from a previous optimize run.
+@return number of rows in the BEING_DELETED table */
+static
+ulint
+fts_optimize_being_deleted_count(
+/*=============================*/
+	fts_optimize_t*	optim)	/*!< in: optimize instance */
+{
+	fts_table_t	fts_table;
+
+	FTS_INIT_FTS_TABLE(&fts_table, "BEING_DELETED", FTS_COMMON_TABLE,
+			   optim->table);
+
+	return(fts_get_rows_count(&fts_table));
+}
+
+/*********************************************************************//**
+Copy the deleted doc ids that will be purged during this optimize run
+to the being deleted FTS auxiliary tables. The transaction is committed
+upon successful copy and rolled back on DB_DUPLICATE_KEY error.
+@return DB_SUCCESS if all OK */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_create_deleted_doc_id_snapshot(
+/*========================================*/
+	fts_optimize_t*	optim)	/*!< in: optimize instance */
+{
+	dberr_t		error;
+	que_t*		graph;
+	pars_info_t*	info;
+	char		being_deleted[MAX_FULL_NAME_LEN];
+	char		deleted[MAX_FULL_NAME_LEN];
+	char		being_deleted_cache[MAX_FULL_NAME_LEN];
+	char		deleted_cache[MAX_FULL_NAME_LEN];
+
+	info = pars_info_create();
+
+	/* Make sure the following four names are consistent with the names
+	used in fts_init_delete_sql */
+	optim->fts_common_table.suffix = fts_common_tables[0];
+	fts_get_table_name(&optim->fts_common_table, being_deleted);
+	pars_info_bind_id(info, fts_common_tables[0], being_deleted);
+
+	optim->fts_common_table.suffix = fts_common_tables[3];
+	fts_get_table_name(&optim->fts_common_table, deleted);
+	pars_info_bind_id(info, fts_common_tables[3], deleted);
+
+	optim->fts_common_table.suffix = fts_common_tables[1];
+	fts_get_table_name(&optim->fts_common_table, being_deleted_cache);
+	pars_info_bind_id(info, fts_common_tables[1], being_deleted_cache);
+
+	optim->fts_common_table.suffix = fts_common_tables[4];
+	fts_get_table_name(&optim->fts_common_table, deleted_cache);
+	pars_info_bind_id(info, fts_common_tables[4], deleted_cache);
+
+	/* Move doc_ids that are to be deleted to state being deleted. */
+	graph = fts_parse_sql(NULL, info, fts_init_delete_sql);
+
+	error = fts_eval_sql(optim->trx, graph);
+
+	que_graph_free(graph);
+
+	if (error != DB_SUCCESS) {
+		fts_sql_rollback(optim->trx);
+	} else {
+		fts_sql_commit(optim->trx);
+	}
+
+	optim->del_list_regenerated = TRUE;
+
+	return(error);
+}
+
+/*********************************************************************//**
+Read in the document ids that are to be purged during optimize. The
+transaction is committed upon a successful read.
+@return DB_SUCCESS if all OK */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+dberr_t
+fts_optimize_read_deleted_doc_id_snapshot(
+/*======================================*/
+	fts_optimize_t*	optim)	/*!< in: optimize instance */
+{
+	dberr_t		error;
+
+	optim->fts_common_table.suffix = "BEING_DELETED";
+
+	/* Read the doc_ids to delete. */
+	error = fts_table_fetch_doc_ids(
+		optim->trx, &optim->fts_common_table, optim->to_delete);
+
+	if (error == DB_SUCCESS) {
+
+		optim->fts_common_table.suffix = "BEING_DELETED_CACHE";
+
+		/* Read additional doc_ids to delete.
*/ + error = fts_table_fetch_doc_ids( + optim->trx, &optim->fts_common_table, optim->to_delete); + } + + if (error != DB_SUCCESS) { + + fts_doc_ids_free(optim->to_delete); + optim->to_delete = NULL; + } + + return(error); +} + +/*********************************************************************//** +Optimze all the FTS indexes, skipping those that have already been +optimized, since the FTS auxiliary indexes are not guaranteed to be +of the same cardinality. +@return DB_SUCCESS if all OK */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_optimize_indexes( +/*=================*/ + fts_optimize_t* optim) /*!< in: optimize instance */ +{ + ulint i; + dberr_t error = DB_SUCCESS; + fts_t* fts = optim->table->fts; + + /* Optimize the FTS indexes. */ + for (i = 0; i < ib_vector_size(fts->indexes); ++i) { + dict_index_t* index; + +#ifdef FTS_OPTIMIZE_DEBUG + time_t end_time; + time_t start_time; + + /* Get the start and end optimize times for this index. */ + error = fts_optimize_get_index_start_time( + optim->trx, index, &start_time); + + if (error != DB_SUCCESS) { + break; + } + + error = fts_optimize_get_index_end_time( + optim->trx, index, &end_time); + + if (error != DB_SUCCESS) { + break; + } + + /* Start time will be 0 only for the first time or after + completing the optimization of all FTS indexes. */ + if (start_time == 0) { + start_time = time(NULL); + + error = fts_optimize_set_index_start_time( + optim->trx, index, start_time); + } + + /* Check if this index needs to be optimized or not. */ + if (difftime(end_time, start_time) < 0) { + error = fts_optimize_index(optim, index); + + if (error != DB_SUCCESS) { + break; + } + } else { + ++optim->n_completed; + } +#endif + index = static_cast( + ib_vector_getp(fts->indexes, i)); + error = fts_optimize_index(optim, index); + } + + if (error == DB_SUCCESS) { + fts_sql_commit(optim->trx); + } else { + fts_sql_rollback(optim->trx); + } + + return(error); +} + +/*********************************************************************//** +Cleanup the snapshot tables and the master deleted table. +@return DB_SUCCESS if all OK */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_optimize_purge_snapshot( +/*========================*/ + fts_optimize_t* optim) /*!< in: optimize instance */ +{ + dberr_t error; + + /* Delete the doc ids from the master deleted tables, that were + in the snapshot that was taken at the start of optimize. */ + error = fts_optimize_purge_deleted_doc_ids(optim); + + if (error == DB_SUCCESS) { + /* Destroy the deleted doc id snapshot. */ + error = fts_optimize_purge_deleted_doc_id_snapshot(optim); + } + + if (error == DB_SUCCESS) { + fts_sql_commit(optim->trx); + } else { + fts_sql_rollback(optim->trx); + } + + return(error); +} + +/*********************************************************************//** +Reset the start time to 0 so that a new optimize can be started. +@return DB_SUCCESS if all OK */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_optimize_reset_start_time( +/*==========================*/ + fts_optimize_t* optim) /*!< in: optimize instance */ +{ + dberr_t error = DB_SUCCESS; +#ifdef FTS_OPTIMIZE_DEBUG + fts_t* fts = optim->table->fts; + + /* Optimization should have been completed for all indexes. */ + ut_a(optim->n_completed == ib_vector_size(fts->indexes)); + + for (uint i = 0; i < ib_vector_size(fts->indexes); ++i) { + dict_index_t* index; + + time_t start_time = 0; + + /* Reset the start time to 0 for this index. 
*/ + error = fts_optimize_set_index_start_time( + optim->trx, index, start_time); + + index = static_cast( + ib_vector_getp(fts->indexes, i)); + } +#endif + + if (error == DB_SUCCESS) { + fts_sql_commit(optim->trx); + } else { + fts_sql_rollback(optim->trx); + } + + return(error); +} + +/*********************************************************************//** +Run OPTIMIZE on the given table by a background thread. +@return DB_SUCCESS if all OK */ +static MY_ATTRIBUTE((nonnull)) +dberr_t +fts_optimize_table_bk( +/*==================*/ + fts_slot_t* slot) /*!< in: table to optimiza */ +{ + const time_t now = time(NULL); + const ulint interval = ulint(now - slot->last_run); + + /* Avoid optimizing tables that were optimized recently. */ + if (slot->last_run > 0 + && lint(interval) >= 0 + && interval < FTS_OPTIMIZE_INTERVAL_IN_SECS) { + + return(DB_SUCCESS); + } + + dict_table_t* table = slot->table; + dberr_t error; + + if (table->is_accessible() + && table->fts && table->fts->cache + && table->fts->cache->deleted >= FTS_OPTIMIZE_THRESHOLD) { + error = fts_optimize_table(table); + + slot->last_run = time(NULL); + + if (error == DB_SUCCESS) { + slot->running = false; + slot->completed = slot->last_run; + } + } else { + /* Note time this run completed. */ + slot->last_run = now; + error = DB_SUCCESS; + } + + return(error); +} +/*********************************************************************//** +Run OPTIMIZE on the given table. +@return DB_SUCCESS if all OK */ +dberr_t +fts_optimize_table( +/*===============*/ + dict_table_t* table) /*!< in: table to optimiza */ +{ + if (srv_read_only_mode) { + return DB_READ_ONLY; + } + + dberr_t error = DB_SUCCESS; + fts_optimize_t* optim = NULL; + fts_t* fts = table->fts; + + if (UNIV_UNLIKELY(fts_enable_diag_print)) { + ib::info() << "FTS start optimize " << table->name; + } + + optim = fts_optimize_create(table); + + // FIXME: Call this only at the start of optimize, currently we + // rely on DB_DUPLICATE_KEY to handle corrupting the snapshot. + + /* Check whether there are still records in BEING_DELETED table */ + if (fts_optimize_being_deleted_count(optim) == 0) { + /* Take a snapshot of the deleted document ids, they are copied + to the BEING_ tables. */ + error = fts_optimize_create_deleted_doc_id_snapshot(optim); + } + + /* A duplicate error is OK, since we don't erase the + doc ids from the being deleted state until all FTS + indexes have been optimized. */ + if (error == DB_DUPLICATE_KEY) { + error = DB_SUCCESS; + } + + if (error == DB_SUCCESS) { + + /* These document ids will be filtered out during the + index optimization phase. They are in the snapshot that we + took above, at the start of the optimize. */ + error = fts_optimize_read_deleted_doc_id_snapshot(optim); + + if (error == DB_SUCCESS) { + + /* Commit the read of being deleted + doc ids transaction. */ + fts_sql_commit(optim->trx); + + /* We would do optimization only if there + are deleted records to be cleaned up */ + if (ib_vector_size(optim->to_delete->doc_ids) > 0) { + error = fts_optimize_indexes(optim); + } + + } else { + ut_a(optim->to_delete == NULL); + } + + /* Only after all indexes have been optimized can we + delete the (snapshot) doc ids in the pending delete, + and master deleted tables. 
*/ + if (error == DB_SUCCESS + && optim->n_completed == ib_vector_size(fts->indexes)) { + + if (UNIV_UNLIKELY(fts_enable_diag_print)) { + ib::info() << "FTS_OPTIMIZE: Completed" + " Optimize, cleanup DELETED table"; + } + + if (ib_vector_size(optim->to_delete->doc_ids) > 0) { + + /* Purge the doc ids that were in the + snapshot from the snapshot tables and + the master deleted table. */ + error = fts_optimize_purge_snapshot(optim); + } + + if (error == DB_SUCCESS) { + /* Reset the start time of all the FTS indexes + so that optimize can be restarted. */ + error = fts_optimize_reset_start_time(optim); + } + } + } + + fts_optimize_free(optim); + + if (UNIV_UNLIKELY(fts_enable_diag_print)) { + ib::info() << "FTS end optimize " << table->name; + } + + return(error); +} + +/********************************************************************//** +Add the table to add to the OPTIMIZER's list. +@return new message instance */ +static +fts_msg_t* +fts_optimize_create_msg( +/*====================*/ + fts_msg_type_t type, /*!< in: type of message */ + void* ptr) /*!< in: message payload */ +{ + mem_heap_t* heap; + fts_msg_t* msg; + + heap = mem_heap_create(sizeof(*msg) + sizeof(ib_list_node_t) + 16); + msg = static_cast(mem_heap_alloc(heap, sizeof(*msg))); + + msg->ptr = ptr; + msg->type = type; + msg->heap = heap; + + return(msg); +} + +/** Add message to wqueue, signal thread pool*/ +static void add_msg(fts_msg_t *msg) +{ + ib_wqueue_add(fts_optimize_wq, msg, msg->heap, true); + srv_thread_pool->submit_task(&task); +} + +/** +Called by "idle" timer. Submits optimize task, which +will only recalculate is_sync_needed, in case the queue is empty. +*/ +static void timer_callback(void*) +{ + srv_thread_pool->submit_task(&task); +} + +/** Add the table to add to the OPTIMIZER's list. +@param[in] table table to add */ +void fts_optimize_add_table(dict_table_t* table) +{ + fts_msg_t* msg; + + if (!fts_optimize_wq) { + return; + } + + /* Make sure table with FTS index cannot be evicted */ + dict_sys.prevent_eviction(table); + + msg = fts_optimize_create_msg(FTS_MSG_ADD_TABLE, table); + + mysql_mutex_lock(&fts_optimize_wq->mutex); + + add_msg(msg); + + table->fts->in_queue = true; + + mysql_mutex_unlock(&fts_optimize_wq->mutex); +} + +/**********************************************************************//** +Remove the table from the OPTIMIZER's list. We do wait for +acknowledgement from the consumer of the message. */ +void +fts_optimize_remove_table( +/*======================*/ + dict_table_t* table) /*!< in: table to remove */ +{ + if (!fts_optimize_wq) + return; + + if (fts_opt_start_shutdown) + { + ib::info() << "Try to remove table " << table->name + << " after FTS optimize thread exiting."; + while (fts_optimize_wq) + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + return; + } + + mysql_mutex_lock(&fts_optimize_wq->mutex); + + if (table->fts->in_queue) + { + fts_msg_t *msg= fts_optimize_create_msg(FTS_MSG_DEL_TABLE, nullptr); + pthread_cond_t cond; + pthread_cond_init(&cond, nullptr); + msg->ptr= new(mem_heap_alloc(msg->heap, sizeof(fts_msg_del_t))) + fts_msg_del_t{table, &cond}; + add_msg(msg); + my_cond_wait(&cond, &fts_optimize_wq->mutex.m_mutex); + pthread_cond_destroy(&cond); + ut_ad(!table->fts->in_queue); + } + + mysql_mutex_unlock(&fts_optimize_wq->mutex); +} + +/** Send sync fts cache for the table. 
+
+/** Request a sync of the FTS cache for the table.
+@param[in]	table	table to sync */
+void
+fts_optimize_request_sync_table(
+	dict_table_t*	table)
+{
+	/* If the optimize system is not yet initialized, return. */
+	if (!fts_optimize_wq) {
+		return;
+	}
+
+	mysql_mutex_lock(&fts_optimize_wq->mutex);
+
+	/* The FTS optimizer thread has already exited. */
+	if (fts_opt_start_shutdown) {
+		ib::info() << "Try to sync table " << table->name
+			   << " after FTS optimize thread exiting.";
+	} else if (table->fts->sync_message) {
+		/* If the table already has a SYNC message in
+		the fts_optimize_wq queue then ignore it. */
+	} else {
+		add_msg(fts_optimize_create_msg(FTS_MSG_SYNC_TABLE, table));
+		table->fts->sync_message = true;
+		DBUG_EXECUTE_IF("fts_optimize_wq_count_check",
+				DBUG_ASSERT(fts_optimize_wq->length <= 1000););
+	}
+
+	mysql_mutex_unlock(&fts_optimize_wq->mutex);
+}
+
+/** Add a table to fts_slots if it doesn't already exist. */
+static bool fts_optimize_new_table(dict_table_t* table)
+{
+	ut_ad(table);
+
+	ulint		i;
+	fts_slot_t*	slot;
+	fts_slot_t*	empty = NULL;
+
+	/* Search for duplicates; also find a free slot if one exists. */
+	for (i = 0; i < ib_vector_size(fts_slots); ++i) {
+
+		slot = static_cast<fts_slot_t*>(ib_vector_get(fts_slots, i));
+
+		if (!slot->table) {
+			empty = slot;
+		} else if (slot->table == table) {
+			/* Already exists in our optimize queue. */
+			return false;
+		}
+	}
+
+	slot = empty ? empty : static_cast<fts_slot_t*>(
+		ib_vector_push(fts_slots, NULL));
+
+	memset(slot, 0x0, sizeof(*slot));
+
+	slot->table = table;
+	return true;
+}
+
+/** Remove a table from fts_slots if it exists.
+@param remove	removal request naming the table to drop from fts_slots */
+static bool fts_optimize_del_table(fts_msg_del_t *remove)
+{
+	const dict_table_t* table = remove->table;
+	ut_ad(table);
+	for (ulint i = 0; i < ib_vector_size(fts_slots); ++i) {
+		fts_slot_t*	slot;
+
+		slot = static_cast<fts_slot_t*>(ib_vector_get(fts_slots, i));
+
+		if (slot->table == table) {
+			if (UNIV_UNLIKELY(fts_enable_diag_print)) {
+				ib::info() << "FTS Optimize Removing table "
+					   << table->name;
+			}
+
+			mysql_mutex_lock(&fts_optimize_wq->mutex);
+			table->fts->in_queue = false;
+			pthread_cond_signal(remove->cond);
+			mysql_mutex_unlock(&fts_optimize_wq->mutex);
+			slot->table = NULL;
+			return true;
+		}
+	}
+
+	mysql_mutex_lock(&fts_optimize_wq->mutex);
+	pthread_cond_signal(remove->cond);
+	mysql_mutex_unlock(&fts_optimize_wq->mutex);
+	return false;
+}
+
+/**********************************************************************//**
+Calculate how many tables in fts_slots need to be optimized.
+@return no. of tables to optimize */
+static ulint fts_optimize_how_many()
+{
+	ulint	n_tables = 0;
+	const time_t current_time = time(NULL);
+
+	for (ulint i = 0; i < ib_vector_size(fts_slots); ++i) {
+		const fts_slot_t* slot = static_cast<const fts_slot_t*>(
+			ib_vector_get_const(fts_slots, i));
+		if (!slot->table) {
+			continue;
+		}
+
+		const time_t end = slot->running
+			? slot->last_run : slot->completed;
+		ulint interval = ulint(current_time - end);
+
+		if (lint(interval) < 0
+		    || interval >= FTS_OPTIMIZE_INTERVAL_IN_SECS) {
+			++n_tables;
+		}
+	}
+
+	return(n_tables);
+}
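fts_optimize_new_table() above prefers to recycle a vacated slot over pushing a new one, so fts_slots stays bounded by the peak number of FTS tables rather than growing with every ADD/DEL cycle. The same find-duplicate-or-reuse idiom in miniature, with std::vector standing in for ib_vector_t (names are illustrative):

#include <vector>

struct Slot { const void* table = nullptr; };

/* Return false if `table` is already tracked; otherwise store it,
preferring a previously vacated slot over growing the vector. */
static bool add_slot(std::vector<Slot>& slots, const void* table)
{
	Slot* empty = nullptr;
	for (Slot& s : slots) {
		if (!s.table) {
			empty = &s;		/* remember a free slot */
		} else if (s.table == table) {
			return false;		/* duplicate */
		}
	}
	if (!empty) {
		slots.push_back(Slot());
		empty = &slots.back();
	}
	empty->table = table;
	return true;
}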
+
+/**********************************************************************//**
+Check if the total memory used by all FTS tables exceeds the maximum limit.
+@return true if a sync is needed, false otherwise */
+static bool fts_is_sync_needed()
+{
+	ulint	total_memory = 0;
+	const time_t now = time(NULL);
+	double time_diff = difftime(now, last_check_sync_time);
+
+	if (fts_need_sync || (time_diff >= 0 && time_diff < 5)) {
+		return(false);
+	}
+
+	last_check_sync_time = now;
+
+	for (ulint i = 0; i < ib_vector_size(fts_slots); ++i) {
+		const fts_slot_t* slot = static_cast<const fts_slot_t*>(
+			ib_vector_get_const(fts_slots, i));
+
+		if (!slot->table) {
+			continue;
+		}
+
+		if (slot->table->fts && slot->table->fts->cache) {
+			total_memory += slot->table->fts->cache->total_size;
+		}
+
+		if (total_memory > fts_max_total_cache_size) {
+			return(true);
+		}
+	}
+
+	return(false);
+}
+
+/** Sync the fts cache of a table.
+@param[in,out]	table		table to be synced
+@param[in]	process_message	whether we are processing a message
+				from fts_optimize_wq */
+static void fts_optimize_sync_table(dict_table_t *table,
+				    bool process_message= false)
+{
+	MDL_ticket* mdl_ticket= nullptr;
+	dict_table_t *sync_table= dict_acquire_mdl_shared(table, fts_opt_thd,
+							  &mdl_ticket);
+
+	if (!sync_table)
+		return;
+
+	if (sync_table->fts && sync_table->fts->cache && sync_table->is_accessible())
+	{
+		fts_sync_table(sync_table, false);
+		if (process_message)
+		{
+			mysql_mutex_lock(&fts_optimize_wq->mutex);
+			sync_table->fts->sync_message = false;
+			mysql_mutex_unlock(&fts_optimize_wq->mutex);
+		}
+	}
+
+	DBUG_EXECUTE_IF("ib_optimize_wq_hang",
+			std::this_thread::sleep_for(std::chrono::seconds(6)););
+
+	if (mdl_ticket)
+		dict_table_close(sync_table, false, fts_opt_thd, mdl_ticket);
+}
+
+/**********************************************************************//**
+Optimize all FTS tables. */
+static void fts_optimize_callback(void *)
+{
+	ut_ad(!srv_read_only_mode);
+
+	static ulint current;
+	static bool done;
+	static ulint n_optimize;
+
+	if (!fts_optimize_wq || done) {
+		/* Possibly a timer-initiated callback; it can arrive
+		after FTS_MSG_STOP. */
+		return;
+	}
+
+	static ulint n_tables = ib_vector_size(fts_slots);
+
+	while (!done && srv_shutdown_state <= SRV_SHUTDOWN_INITIATED) {
+		/* If there is no message in the queue and we have tables
+		to optimize then optimize the tables. */
+
+		if (!done
+		    && ib_wqueue_is_empty(fts_optimize_wq)
+		    && n_tables > 0
+		    && n_optimize > 0) {
+
+			/* The queue is empty but we have tables
+			to optimize. */
+			if (UNIV_UNLIKELY(wsrep_sst_disable_writes)) {
+retry_later:
+				if (fts_is_sync_needed()) {
+					fts_need_sync = true;
+				}
+				if (n_tables) {
+					timer->set_time(5000, 0);
+				}
+				return;
+			}
+
+			fts_slot_t* slot = static_cast<fts_slot_t*>(
+				ib_vector_get(fts_slots, current));
+
+			/* Handle the case of empty slots. */
+			if (slot->table) {
+				slot->running = true;
+				fts_optimize_table_bk(slot);
+			}
+
+			/* Wrap around the counter. */
+			if (++current >= ib_vector_size(fts_slots)) {
+				n_optimize = fts_optimize_how_many();
+				current = 0;
+			}
+		} else if (n_optimize == 0
+			   || !ib_wqueue_is_empty(fts_optimize_wq)) {
+			fts_msg_t* msg = static_cast<fts_msg_t*>
+				(ib_wqueue_nowait(fts_optimize_wq));
+			/* Timeout? */
+			if (!msg) {
+				goto retry_later;
+			}
+
+			switch (msg->type) {
+			case FTS_MSG_STOP:
+				done = true;
+				break;
+
+			case FTS_MSG_ADD_TABLE:
+				ut_a(!done);
+				if (fts_optimize_new_table(
+					    static_cast<dict_table_t*>(
+						    msg->ptr))) {
+					++n_tables;
+				}
+				break;
+
+			case FTS_MSG_DEL_TABLE:
+				if (fts_optimize_del_table(
+					    static_cast<fts_msg_del_t*>(
+						    msg->ptr))) {
+					--n_tables;
+				}
+				break;
+
+			case FTS_MSG_SYNC_TABLE:
+				if (UNIV_UNLIKELY(wsrep_sst_disable_writes)) {
+					add_msg(msg);
+					goto retry_later;
+				}
+
+				DBUG_EXECUTE_IF(
+					"fts_instrument_msg_sync_sleep",
+					std::this_thread::sleep_for(
+						std::chrono::milliseconds(
+							300)););
+
+				fts_optimize_sync_table(
+					static_cast<dict_table_t*>(msg->ptr),
+					true);
+				break;
+
+			default:
+				ut_error;
+			}
+
+			mem_heap_free(msg->heap);
+			n_optimize = done ? 0 : fts_optimize_how_many();
+		}
+	}
+
+	/* The server is being shut down; sync the data from the FTS cache
+	to disk if needed. */
+	if (n_tables > 0) {
+		for (ulint i = 0; i < ib_vector_size(fts_slots); i++) {
+			fts_slot_t* slot = static_cast<fts_slot_t*>(
+				ib_vector_get(fts_slots, i));
+
+			if (slot->table) {
+				fts_optimize_sync_table(slot->table);
+			}
+		}
+	}
+
+	ib_vector_free(fts_slots);
+	mysql_mutex_lock(&fts_optimize_wq->mutex);
+	fts_slots = NULL;
+	pthread_cond_broadcast(&fts_opt_shutdown_cond);
+	mysql_mutex_unlock(&fts_optimize_wq->mutex);
+
+	ib::info() << "FTS optimize thread exiting.";
+}
+
+/**********************************************************************//**
+Start up the optimize thread and create the work queue. */
+void
+fts_optimize_init(void)
+/*===================*/
+{
+	mem_heap_t*	heap;
+	ib_alloc_t*	heap_alloc;
+
+	ut_ad(!srv_read_only_mode);
+
+	/* For now we only support one optimize thread. */
+	ut_a(!fts_optimize_wq);
+
+	/* Create the FTS optimize work queue. */
+	fts_optimize_wq = ib_wqueue_create();
+	timer = srv_thread_pool->create_timer(timer_callback);
+
+	/* Create the FTS vector to store fts_slot_t. */
+	heap = mem_heap_create(sizeof(dict_table_t*) * 64);
+	heap_alloc = ib_heap_allocator_create(heap);
+	fts_slots = ib_vector_create(heap_alloc, sizeof(fts_slot_t), 4);
+
+	fts_opt_thd = innobase_create_background_thd("InnoDB FTS optimizer");
+	/* Add FTS tables to fts_slots that may have been skipped
+	during dict_load_table_one() because fts_optimize_thread
+	wasn't started yet. */
+	dict_sys.freeze(SRW_LOCK_CALL);
+	for (dict_table_t* table = UT_LIST_GET_FIRST(dict_sys.table_LRU);
+	     table != NULL;
+	     table = UT_LIST_GET_NEXT(table_LRU, table)) {
+		if (!table->fts || !dict_table_has_fts_index(table)) {
+			continue;
+		}
+
+		/* fts_optimize_thread is not started yet, so there is no
+		need to acquire fts_optimize_wq->mutex when adding the FTS
+		table to the fts slots. */
+		ut_ad(!table->can_be_evicted);
+		fts_optimize_new_table(table);
+		table->fts->in_queue = true;
+	}
+	dict_sys.unfreeze();
+
+	pthread_cond_init(&fts_opt_shutdown_cond, nullptr);
+	last_check_sync_time = time(NULL);
+}
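fts_optimize_callback() above is, in effect, a small state machine driven by FTS_MSG_* messages: ADD/DEL maintain the slot bookkeeping, SYNC flushes one table's cache, and STOP flips `done` so that fts_optimize_shutdown() below can wait for fts_slots to be freed. A compressed sketch of just that dispatch step, with illustrative names:

enum MsgType { MSG_ADD_TABLE, MSG_DEL_TABLE, MSG_SYNC_TABLE, MSG_STOP };

struct Msg { MsgType type; };

/* Returns false once a STOP message has been seen, mirroring how the
real callback leaves its processing loop. */
static bool dispatch(const Msg& msg, unsigned& n_tables, bool& done)
{
	switch (msg.type) {
	case MSG_ADD_TABLE:	++n_tables; break;
	case MSG_DEL_TABLE:	--n_tables; break;
	case MSG_SYNC_TABLE:	/* flush one table's FTS cache */ break;
	case MSG_STOP:		done = true; break;
	}
	return !done;
}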
+
+/** Shut down the fts optimize thread. */
+void
+fts_optimize_shutdown()
+{
+	ut_ad(!srv_read_only_mode);
+
+	/* If there is ongoing activity on the dictionary, such as
+	srv_master_evict_from_table_cache(), wait for it. */
+	dict_sys.freeze(SRW_LOCK_CALL);
+	mysql_mutex_lock(&fts_optimize_wq->mutex);
+	/* Tell the FTS optimizer system that we are exiting from
+	the optimizer thread; messages sent after this will not be
+	processed. */
+	fts_opt_start_shutdown = true;
+	dict_sys.unfreeze();
+
+	/* We tell the OPTIMIZE thread to switch to the done state, but
+	we cannot delete the work queue here, because the add thread
+	needs to deregister the FTS tables. */
+	timer->disarm();
+	task_group.cancel_pending(&task);
+
+	add_msg(fts_optimize_create_msg(FTS_MSG_STOP, nullptr));
+
+	while (fts_slots) {
+		my_cond_wait(&fts_opt_shutdown_cond,
+			     &fts_optimize_wq->mutex.m_mutex);
+	}
+
+	destroy_background_thd(fts_opt_thd);
+	fts_opt_thd = NULL;
+	pthread_cond_destroy(&fts_opt_shutdown_cond);
+	mysql_mutex_unlock(&fts_optimize_wq->mutex);
+
+	ib_wqueue_free(fts_optimize_wq);
+	fts_optimize_wq = NULL;
+
+	delete timer;
+	timer = NULL;
+}
+
+/** Sync the table during the commit phase.
+@param[in]	table	table to be synced */
+void fts_sync_during_ddl(dict_table_t* table)
+{
+	if (!fts_optimize_wq)
+		return;
+	mysql_mutex_lock(&fts_optimize_wq->mutex);
+	const auto sync_message= table->fts->sync_message;
+	mysql_mutex_unlock(&fts_optimize_wq->mutex);
+	if (!sync_message)
+		return;
+
+	fts_sync_table(table, false);
+
+	mysql_mutex_lock(&fts_optimize_wq->mutex);
+	table->fts->sync_message = false;
+	mysql_mutex_unlock(&fts_optimize_wq->mutex);
+}
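The file added next is the Bison-generated boolean-mode query parser; its grammar source, fts0pars.y, follows later in this patch. As a rough illustration of how its entry points (defined in the hand-written epilogue at the end of the generated file) are driven, here is a hedged sketch; in the server this setup happens inside the FTS query code, and the fts_ast_state_t preparation is abbreviated:

#include <cstring>

/* Sketch only: assumes `state` was prepared the way the FTS query code
prepares one (heap, list head, etc.). TRUE selects the boolean-mode
lexer (fts0blex); FALSE would select the phrase lexer (fts0tlex). */
static int parse_boolean_query_sketch(fts_ast_state_t* state)
{
	const char* query = "+apple -\"apple pie\" banana*";

	state->lexer = fts_lexer_create(
		TRUE,
		reinterpret_cast<const byte*>(query),
		ulint(strlen(query)));

	/* ftsparse() reduces rules such as term/prefix/text in the
	grammar below and builds the AST reachable from state->root. */
	int err = fts_parse(state);

	fts_lexer_free(state->lexer);

	return err;	/* 0 on success; ftserror() reported any failure */
}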
diff --git a/storage/innobase/fts/fts0pars.cc b/storage/innobase/fts/fts0pars.cc
new file mode 100644
index 00000000..cb51784a
--- /dev/null
+++ b/storage/innobase/fts/fts0pars.cc
@@ -0,0 +1,2007 @@
+/* A Bison parser, made by GNU Bison 2.5.  */
+
+/* Bison implementation for Yacc-like parsers in C
+
+      Copyright (C) 1984, 1989-1990, 2000-2011 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/* As a special exception, you may create a larger work that contains
+   part or all of the Bison parser skeleton and distribute that work
+   under terms of your choice, so long as that work isn't itself a
+   parser generator using the skeleton or a modified version thereof
+   as a parser skeleton.  Alternatively, if you modify or redistribute
+   the parser skeleton itself, you may (at your option) remove this
+   special exception, which will cause the skeleton and the resulting
+   Bison output files to be licensed under the GNU General Public
+   License without this special exception.
+
+   This special exception was added by the Free Software Foundation in
+   version 2.2 of Bison.  */
+
+/* C LALR(1) parser skeleton written by Richard Stallman, by
+   simplifying the original so-called "semantic" parser.  */
+
+/* All symbols defined below should begin with yy or YY, to avoid
+   infringing on user name space.  This should be done even for local
+   variables, as they might otherwise be expanded by user macros.
+   There are some unavoidable exceptions within include files to
+   define necessary library symbols; they are noted "INFRINGES ON
+   USER NAME SPACE" below.  */
+
+/* Identify Bison output.  */
+#define YYBISON 1
+
+/* Bison version.  */
+#define YYBISON_VERSION "2.5"
+
+/* Skeleton name.  */
+#define YYSKELETON_NAME "yacc.c"
+
+/* Pure parsers.  */
+#define YYPURE 1
+
+/* Push parsers.  */
+#define YYPUSH 0
+
+/* Pull parsers.  */
+#define YYPULL 1
+
+/* Using locations.  */
+#define YYLSP_NEEDED 0
+
+/* Substitute the variable and function names.  */
+#define yyparse ftsparse
+#define yylex   ftslex
+#define yyerror ftserror
+#define yylval  ftslval
+#define yychar  ftschar
+#define yydebug ftsdebug
+#define yynerrs ftsnerrs
+
+
+/* Copy the first part of user declarations.  */
+
+/* Line 268 of yacc.c  */
+#line 26 "fts0pars.y"
+
+#include "ha_prototypes.h"
+#include "mem0mem.h"
+#include "fts0ast.h"
+#include "fts0blex.h"
+#include "fts0tlex.h"
+#include "fts0pars.h"
+#include
+extern int fts_lexer(YYSTYPE*, fts_lexer_t*);
+extern int fts_blexer(YYSTYPE*, yyscan_t);
+extern int fts_tlexer(YYSTYPE*, yyscan_t);
+#ifdef __GNUC__
+# pragma GCC diagnostic ignored "-Wpragmas"
+# pragma GCC diagnostic ignored "-Wunknown-warning-option"
+# pragma GCC diagnostic ignored "-Wunused-but-set-variable"
+#endif
+extern int ftserror(const char* p);
+/* Required for reentrant parser */
+#define ftslex fts_lexer
+
+#define YYERROR_VERBOSE
+
+/* For passing an argument to yyparse() */
+#define YYPARSE_PARAM state
+#define YYLEX_PARAM ((fts_ast_state_t*) state)->lexer
+
+#define YYTOKENFREE(token) fts_ast_string_free((token))
+
+
+typedef int (*fts_scanner)(YYSTYPE* val, yyscan_t yyscanner);
+
+struct fts_lexer_t {
+	fts_scanner	scanner;
+	void*		yyscanner;
+};
+
+
+
+/* Line 268 of yacc.c  */
+#line 115 "fts0pars.cc"
+
+/* Enabling traces.  */
+#ifndef YYDEBUG
+# define YYDEBUG 0
+#endif
+
+/* Enabling verbose error messages.  */
+#ifdef YYERROR_VERBOSE
+# undef YYERROR_VERBOSE
+# define YYERROR_VERBOSE 1
+#else
+# define YYERROR_VERBOSE 0
+#endif
+
+/* Enabling the token table.  */
+#ifndef YYTOKEN_TABLE
+# define YYTOKEN_TABLE 0
+#endif
+
+
+/* Tokens.  */
+#ifndef YYTOKENTYPE
+# define YYTOKENTYPE
+   /* Put the tokens into the symbol table, so that GDB and other debuggers
+      know about them.  */
+   enum yytokentype {
+     FTS_OPER = 258,
+     FTS_TEXT = 259,
+     FTS_TERM = 260,
+     FTS_NUMB = 261
+   };
+#endif
+
+
+
+#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
+typedef union YYSTYPE
+{
+
+/* Line 293 of yacc.c  */
+#line 61 "fts0pars.y"
+
+	int			oper;
+	fts_ast_string_t*	token;
+	fts_ast_node_t*		node;
+
+
+
+/* Line 293 of yacc.c  */
+#line 165 "fts0pars.cc"
+} YYSTYPE;
+# define YYSTYPE_IS_TRIVIAL 1
+# define yystype YYSTYPE /* obsolescent; will be withdrawn */
+# define YYSTYPE_IS_DECLARED 1
+#endif
+
+
+/* Copy the second part of user declarations.  */
+
+
+/* Line 343 of yacc.c  */
+#line 177 "fts0pars.cc"
+
+#ifdef short
+# undef short
+#endif
+
+#ifdef YYTYPE_UINT8
+typedef YYTYPE_UINT8 yytype_uint8;
+#else
+typedef unsigned char yytype_uint8;
+#endif
+
+#ifdef YYTYPE_INT8
+typedef YYTYPE_INT8 yytype_int8;
+#elif (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+typedef signed char yytype_int8;
+#else
+typedef short int yytype_int8;
+#endif
+
+#ifdef YYTYPE_UINT16
+typedef YYTYPE_UINT16 yytype_uint16;
+#else
+typedef unsigned short int yytype_uint16;
+#endif
+
+#ifdef YYTYPE_INT16
+typedef YYTYPE_INT16 yytype_int16;
+#else
+typedef short int yytype_int16;
+#endif
+
+#ifndef YYSIZE_T
+# ifdef __SIZE_TYPE__
+#  define YYSIZE_T __SIZE_TYPE__
+# elif defined size_t
+#  define YYSIZE_T size_t
+# elif !
defined YYSIZE_T && (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +# include /* INFRINGES ON USER NAME SPACE */ +# define YYSIZE_T size_t +# else +# define YYSIZE_T unsigned int +# endif +#endif + +#define YYSIZE_MAXIMUM ((YYSIZE_T) -1) + +#ifndef YY_ +# if defined YYENABLE_NLS && YYENABLE_NLS +# if ENABLE_NLS +# include /* INFRINGES ON USER NAME SPACE */ +# define YY_(msgid) dgettext ("bison-runtime", msgid) +# endif +# endif +# ifndef YY_ +# define YY_(msgid) msgid +# endif +#endif + +/* Suppress unused-variable warnings by "using" E. */ +#if ! defined lint || defined __GNUC__ +# define YYUSE(e) ((void) (e)) +#else +# define YYUSE(e) /* empty */ +#endif + +/* Identity function, used to suppress warnings about constant conditions. */ +#ifndef lint +# define YYID(n) (n) +#else +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static int +YYID (int yyi) +#else +static int +YYID (yyi) + int yyi; +#endif +{ + return yyi; +} +#endif + +#if ! defined yyoverflow || YYERROR_VERBOSE + +/* The parser invokes alloca or malloc; define the necessary symbols. */ + +# ifdef YYSTACK_USE_ALLOCA +# if YYSTACK_USE_ALLOCA +# ifdef __GNUC__ +# define YYSTACK_ALLOC __builtin_alloca +# elif defined __BUILTIN_VA_ARG_INCR +# include /* INFRINGES ON USER NAME SPACE */ +# elif defined _MSC_VER +# include /* INFRINGES ON USER NAME SPACE */ +# define alloca _alloca +# else +# define YYSTACK_ALLOC alloca +# if ! defined _ALLOCA_H && ! defined EXIT_SUCCESS && (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +# include /* INFRINGES ON USER NAME SPACE */ +# ifndef EXIT_SUCCESS +# define EXIT_SUCCESS 0 +# endif +# endif +# endif +# endif +# endif + +# ifdef YYSTACK_ALLOC + /* Pacify GCC's `empty if-body' warning. */ +# define YYSTACK_FREE(Ptr) do { /* empty */; } while (YYID (0)) +# ifndef YYSTACK_ALLOC_MAXIMUM + /* The OS might guarantee only one guard page at the bottom of the stack, + and a page size can be as small as 4096 bytes. So we cannot safely + invoke alloca (N) if N exceeds 4096. Use a slightly smaller number + to allow for a few compiler-allocated temporary stack slots. */ +# define YYSTACK_ALLOC_MAXIMUM 4032 /* reasonable circa 2006 */ +# endif +# else +# define YYSTACK_ALLOC YYMALLOC +# define YYSTACK_FREE YYFREE +# ifndef YYSTACK_ALLOC_MAXIMUM +# define YYSTACK_ALLOC_MAXIMUM YYSIZE_MAXIMUM +# endif +# if (defined __cplusplus && ! defined EXIT_SUCCESS \ + && ! ((defined YYMALLOC || defined malloc) \ + && (defined YYFREE || defined free))) +# include /* INFRINGES ON USER NAME SPACE */ +# ifndef EXIT_SUCCESS +# define EXIT_SUCCESS 0 +# endif +# endif +# ifndef YYMALLOC +# define YYMALLOC malloc +# if ! defined malloc && ! defined EXIT_SUCCESS && (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +void *malloc (YYSIZE_T); /* INFRINGES ON USER NAME SPACE */ +# endif +# endif +# ifndef YYFREE +# define YYFREE free +# if ! defined free && ! defined EXIT_SUCCESS && (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +void free (void *); /* INFRINGES ON USER NAME SPACE */ +# endif +# endif +# endif +#endif /* ! defined yyoverflow || YYERROR_VERBOSE */ + + +#if (! defined yyoverflow \ + && (! defined __cplusplus \ + || (defined YYSTYPE_IS_TRIVIAL && YYSTYPE_IS_TRIVIAL))) + +/* A type that is properly aligned for any stack member. 
*/ +union yyalloc +{ + yytype_int16 yyss_alloc; + YYSTYPE yyvs_alloc; +}; + +/* The size of the maximum gap between one aligned stack and the next. */ +# define YYSTACK_GAP_MAXIMUM (sizeof (union yyalloc) - 1) + +/* The size of an array large to enough to hold all stacks, each with + N elements. */ +# define YYSTACK_BYTES(N) \ + ((N) * (sizeof (yytype_int16) + sizeof (YYSTYPE)) \ + + YYSTACK_GAP_MAXIMUM) + +# define YYCOPY_NEEDED 1 + +/* Relocate STACK from its old location to the new one. The + local variables YYSIZE and YYSTACKSIZE give the old and new number of + elements in the stack, and YYPTR gives the new location of the + stack. Advance YYPTR to a properly aligned location for the next + stack. */ +# define YYSTACK_RELOCATE(Stack_alloc, Stack) \ + do \ + { \ + YYSIZE_T yynewbytes; \ + YYCOPY (&yyptr->Stack_alloc, Stack, yysize); \ + Stack = &yyptr->Stack_alloc; \ + yynewbytes = yystacksize * sizeof (*Stack) + YYSTACK_GAP_MAXIMUM; \ + yyptr += yynewbytes / sizeof (*yyptr); \ + } \ + while (YYID (0)) + +#endif + +#if defined YYCOPY_NEEDED && YYCOPY_NEEDED +/* Copy COUNT objects from FROM to TO. The source and destination do + not overlap. */ +# ifndef YYCOPY +# if defined __GNUC__ && 1 < __GNUC__ +# define YYCOPY(To, From, Count) \ + __builtin_memcpy (To, From, (Count) * sizeof (*(From))) +# else +# define YYCOPY(To, From, Count) \ + do \ + { \ + YYSIZE_T yyi; \ + for (yyi = 0; yyi < (Count); yyi++) \ + (To)[yyi] = (From)[yyi]; \ + } \ + while (YYID (0)) +# endif +# endif +#endif /* !YYCOPY_NEEDED */ + +/* YYFINAL -- State number of the termination state. */ +#define YYFINAL 3 +/* YYLAST -- Last index in YYTABLE. */ +#define YYLAST 52 + +/* YYNTOKENS -- Number of terminals. */ +#define YYNTOKENS 16 +/* YYNNTS -- Number of nonterminals. */ +#define YYNNTS 8 +/* YYNRULES -- Number of rules. */ +#define YYNRULES 24 +/* YYNRULES -- Number of states. */ +#define YYNSTATES 33 + +/* YYTRANSLATE(YYLEX) -- Bison symbol number corresponding to YYLEX. */ +#define YYUNDEFTOK 2 +#define YYMAXUTOK 261 + +#define YYTRANSLATE(YYX) \ + ((unsigned int) (YYX) <= YYMAXUTOK ? yytranslate[YYX] : YYUNDEFTOK) + +/* YYTRANSLATE[YYLEX] -- Bison symbol number corresponding to YYLEX. */ +static const yytype_uint8 yytranslate[] = +{ + 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 12, 13, 14, 7, 2, 8, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 10, 2, 11, 2, 15, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 9, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 1, 2, 3, 4, + 5, 6 +}; + +#if YYDEBUG +/* YYPRHS[YYN] -- Index of the first RHS symbol of rule number YYN in + YYRHS. */ +static const yytype_uint8 yyprhs[] = +{ + 0, 0, 3, 5, 6, 9, 12, 16, 21, 23, + 25, 28, 32, 36, 39, 44, 47, 49, 51, 53, + 55, 57, 59, 61, 64 +}; + +/* YYRHS -- A `-1'-separated list of the rules' RHS. 
*/ +static const yytype_int8 yyrhs[] = +{ + 17, 0, -1, 18, -1, -1, 18, 20, -1, 18, + 19, -1, 12, 18, 13, -1, 21, 12, 18, 13, + -1, 22, -1, 23, -1, 22, 14, -1, 23, 15, + 6, -1, 21, 22, 14, -1, 21, 22, -1, 21, + 23, 15, 6, -1, 21, 23, -1, 8, -1, 7, + -1, 9, -1, 10, -1, 11, -1, 5, -1, 6, + -1, 14, 22, -1, 4, -1 +}; + +/* YYRLINE[YYN] -- source line where rule number YYN was defined. */ +static const yytype_uint8 yyrline[] = +{ + 0, 79, 79, 85, 89, 99, 111, 119, 129, 133, + 137, 141, 146, 152, 157, 164, 170, 174, 178, 182, + 186, 191, 196, 202, 207 +}; +#endif + +#if YYDEBUG || YYERROR_VERBOSE || YYTOKEN_TABLE +/* YYTNAME[SYMBOL-NUM] -- String name of the symbol SYMBOL-NUM. + First, the terminals, then, starting at YYNTOKENS, nonterminals. */ +static const char *const yytname[] = +{ + "$end", "error", "$undefined", "FTS_OPER", "FTS_TEXT", "FTS_TERM", + "FTS_NUMB", "'+'", "'-'", "'~'", "'<'", "'>'", "'('", "')'", "'*'", + "'@'", "$accept", "query", "expr_lst", "sub_expr", "expr", "prefix", + "term", "text", 0 +}; +#endif + +# ifdef YYPRINT +/* YYTOKNUM[YYLEX-NUM] -- Internal token number corresponding to + token YYLEX-NUM. */ +static const yytype_uint16 yytoknum[] = +{ + 0, 256, 257, 258, 259, 260, 261, 43, 45, 126, + 60, 62, 40, 41, 42, 64 +}; +# endif + +/* YYR1[YYN] -- Symbol number of symbol that rule YYN derives. */ +static const yytype_uint8 yyr1[] = +{ + 0, 16, 17, 18, 18, 18, 19, 19, 20, 20, + 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, + 21, 22, 22, 22, 23 +}; + +/* YYR2[YYN] -- Number of symbols composing right hand side of rule YYN. */ +static const yytype_uint8 yyr2[] = +{ + 0, 2, 1, 0, 2, 2, 3, 4, 1, 1, + 2, 3, 3, 2, 4, 2, 1, 1, 1, 1, + 1, 1, 1, 2, 1 +}; + +/* YYDEFACT[STATE-NAME] -- Default reduction number in state STATE-NUM. + Performed when YYTABLE doesn't specify something else to do. Zero + means the default is an error. */ +static const yytype_uint8 yydefact[] = +{ + 3, 0, 2, 1, 24, 21, 22, 17, 16, 18, + 19, 20, 3, 0, 5, 4, 0, 8, 9, 0, + 23, 3, 13, 15, 10, 0, 6, 0, 12, 0, + 11, 7, 14 +}; + +/* YYDEFGOTO[NTERM-NUM]. */ +static const yytype_int8 yydefgoto[] = +{ + -1, 1, 2, 14, 15, 16, 17, 18 +}; + +/* YYPACT[STATE-NUM] -- Index in YYTABLE of the portion describing + STATE-NUM. */ +#define YYPACT_NINF -5 +static const yytype_int8 yypact[] = +{ + -5, 38, 18, -5, -5, -5, -5, -5, -5, -5, + -5, -5, -5, 31, -5, -5, 29, 30, 32, -4, + -5, -5, 34, 35, -5, 40, -5, 7, -5, 43, + -5, -5, -5 +}; + +/* YYPGOTO[NTERM-NUM]. */ +static const yytype_int8 yypgoto[] = +{ + -5, -5, 19, -5, -5, -5, 26, 36 +}; + +/* YYTABLE[YYPACT[STATE-NUM]]. What to do in state STATE-NUM. If + positive, shift that token. If negative, reduce the rule which + number is the opposite. If YYTABLE_NINF, syntax error. */ +#define YYTABLE_NINF -1 +static const yytype_uint8 yytable[] = +{ + 4, 5, 6, 7, 8, 9, 10, 11, 12, 26, + 13, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 31, 13, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 19, 13, 4, 5, 6, 5, 6, 3, 20, + 27, 21, 22, 13, 24, 13, 30, 25, 28, 32, + 29, 0, 23 +}; + +#define yypact_value_is_default(yystate) \ + ((yystate) == (-5)) + +#define yytable_value_is_error(yytable_value) \ + YYID (0) + +static const yytype_int8 yycheck[] = +{ + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 12, 14, 4, 5, 6, 5, 6, 0, 13, + 21, 12, 16, 14, 14, 14, 6, 15, 14, 6, + 15, -1, 16 +}; + +/* YYSTOS[STATE-NUM] -- The (internal number of the) accessing + symbol of state STATE-NUM. 
*/ +static const yytype_uint8 yystos[] = +{ + 0, 17, 18, 0, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 14, 19, 20, 21, 22, 23, 18, + 22, 12, 22, 23, 14, 15, 13, 18, 14, 15, + 6, 13, 6 +}; + +#define yyerrok (yyerrstatus = 0) +#define yyclearin (yychar = YYEMPTY) +#define YYEMPTY (-2) +#define YYEOF 0 + +#define YYACCEPT goto yyacceptlab +#define YYABORT goto yyabortlab +#define YYERROR goto yyerrorlab + + +/* Like YYERROR except do call yyerror. This remains here temporarily + to ease the transition to the new meaning of YYERROR, for GCC. + Once GCC version 2 has supplanted version 1, this can go. However, + YYFAIL appears to be in use. Nevertheless, it is formally deprecated + in Bison 2.4.2's NEWS entry, where a plan to phase it out is + discussed. */ + +#define YYFAIL goto yyerrlab +#if defined YYFAIL + /* This is here to suppress warnings from the GCC cpp's + -Wunused-macros. Normally we don't worry about that warning, but + some users do, and we want to make it easy for users to remove + YYFAIL uses, which will produce warnings from Bison 2.5. */ +#endif + +#define YYRECOVERING() (!!yyerrstatus) + +#define YYBACKUP(Token, Value) \ +do \ + if (yychar == YYEMPTY && yylen == 1) \ + { \ + yychar = (Token); \ + yylval = (Value); \ + YYPOPSTACK (1); \ + goto yybackup; \ + } \ + else \ + { \ + yyerror (YY_("syntax error: cannot back up")); \ + YYERROR; \ + } \ +while (YYID (0)) + + +#define YYTERROR 1 +#define YYERRCODE 256 + +#define YYERRCLEANUP \ +do \ + switch (yylastchar) \ + { \ + case FTS_NUMB: \ + case FTS_TEXT: \ + case FTS_TERM: \ + YYTOKENFREE(yylval.token); \ + break; \ + default: \ + break; \ + } \ +while (YYID (0)) + +/* YYLLOC_DEFAULT -- Set CURRENT to span from RHS[1] to RHS[N]. + If N is 0, then set CURRENT to the empty location which ends + the previous symbol: RHS[0] (always defined). */ + +#define YYRHSLOC(Rhs, K) ((Rhs)[K]) +#ifndef YYLLOC_DEFAULT +# define YYLLOC_DEFAULT(Current, Rhs, N) \ + do \ + if (YYID (N)) \ + { \ + (Current).first_line = YYRHSLOC (Rhs, 1).first_line; \ + (Current).first_column = YYRHSLOC (Rhs, 1).first_column; \ + (Current).last_line = YYRHSLOC (Rhs, N).last_line; \ + (Current).last_column = YYRHSLOC (Rhs, N).last_column; \ + } \ + else \ + { \ + (Current).first_line = (Current).last_line = \ + YYRHSLOC (Rhs, 0).last_line; \ + (Current).first_column = (Current).last_column = \ + YYRHSLOC (Rhs, 0).last_column; \ + } \ + while (YYID (0)) +#endif + + +/* This macro is provided for backward compatibility. */ + +#ifndef YY_LOCATION_PRINT +# define YY_LOCATION_PRINT(File, Loc) ((void) 0) +#endif + + +/* YYLEX -- calling `yylex' with the right arguments. */ + +#ifdef YYLEX_PARAM +# define YYLEX yylex (&yylval, YYLEX_PARAM) +#else +# define YYLEX yylex (&yylval) +#endif + +/* Enable debugging if requested. */ +#if YYDEBUG + +# ifndef YYFPRINTF +# include /* INFRINGES ON USER NAME SPACE */ +# define YYFPRINTF fprintf +# endif + +# define YYDPRINTF(Args) \ +do { \ + if (yydebug) \ + YYFPRINTF Args; \ +} while (YYID (0)) + +# define YY_SYMBOL_PRINT(Title, Type, Value, Location) \ +do { \ + if (yydebug) \ + { \ + YYFPRINTF (stderr, "%s ", Title); \ + yy_symbol_print (stderr, \ + Type, Value); \ + YYFPRINTF (stderr, "\n"); \ + } \ +} while (YYID (0)) + + +/*--------------------------------. +| Print this symbol on YYOUTPUT. 
| +`--------------------------------*/ + +/*ARGSUSED*/ +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static void +yy_symbol_value_print (FILE *yyoutput, int yytype, YYSTYPE const * const yyvaluep) +#else +static void +yy_symbol_value_print (yyoutput, yytype, yyvaluep) + FILE *yyoutput; + int yytype; + YYSTYPE const * const yyvaluep; +#endif +{ + if (!yyvaluep) + return; +# ifdef YYPRINT + if (yytype < YYNTOKENS) + YYPRINT (yyoutput, yytoknum[yytype], *yyvaluep); +# else + YYUSE (yyoutput); +# endif + switch (yytype) + { + default: + break; + } +} + + +/*--------------------------------. +| Print this symbol on YYOUTPUT. | +`--------------------------------*/ + +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static void +yy_symbol_print (FILE *yyoutput, int yytype, YYSTYPE const * const yyvaluep) +#else +static void +yy_symbol_print (yyoutput, yytype, yyvaluep) + FILE *yyoutput; + int yytype; + YYSTYPE const * const yyvaluep; +#endif +{ + if (yytype < YYNTOKENS) + YYFPRINTF (yyoutput, "token %s (", yytname[yytype]); + else + YYFPRINTF (yyoutput, "nterm %s (", yytname[yytype]); + + yy_symbol_value_print (yyoutput, yytype, yyvaluep); + YYFPRINTF (yyoutput, ")"); +} + +/*------------------------------------------------------------------. +| yy_stack_print -- Print the state stack from its BOTTOM up to its | +| TOP (included). | +`------------------------------------------------------------------*/ + +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static void +yy_stack_print (yytype_int16 *yybottom, yytype_int16 *yytop) +#else +static void +yy_stack_print (yybottom, yytop) + yytype_int16 *yybottom; + yytype_int16 *yytop; +#endif +{ + YYFPRINTF (stderr, "Stack now"); + for (; yybottom <= yytop; yybottom++) + { + int yybot = *yybottom; + YYFPRINTF (stderr, " %d", yybot); + } + YYFPRINTF (stderr, "\n"); +} + +# define YY_STACK_PRINT(Bottom, Top) \ +do { \ + if (yydebug) \ + yy_stack_print ((Bottom), (Top)); \ +} while (YYID (0)) + + +/*------------------------------------------------. +| Report that the YYRULE is going to be reduced. | +`------------------------------------------------*/ + +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static void +yy_reduce_print (YYSTYPE *yyvsp, int yyrule) +#else +static void +yy_reduce_print (yyvsp, yyrule) + YYSTYPE *yyvsp; + int yyrule; +#endif +{ + int yynrhs = yyr2[yyrule]; + int yyi; + unsigned long int yylno = yyrline[yyrule]; + YYFPRINTF (stderr, "Reducing stack by rule %d (line %lu):\n", + yyrule - 1, yylno); + /* The symbols being reduced. */ + for (yyi = 0; yyi < yynrhs; yyi++) + { + YYFPRINTF (stderr, " $%d = ", yyi + 1); + yy_symbol_print (stderr, yyrhs[yyprhs[yyrule] + yyi], + &(yyvsp[(yyi + 1) - (yynrhs)]) + ); + YYFPRINTF (stderr, "\n"); + } +} + +# define YY_REDUCE_PRINT(Rule) \ +do { \ + if (yydebug) \ + yy_reduce_print (yyvsp, Rule); \ +} while (YYID (0)) + +/* Nonzero means print parse trace. It is left uninitialized so that + multiple parsers can coexist. */ +int yydebug; +#else /* !YYDEBUG */ +# define YYDPRINTF(Args) +# define YY_SYMBOL_PRINT(Title, Type, Value, Location) +# define YY_STACK_PRINT(Bottom, Top) +# define YY_REDUCE_PRINT(Rule) +#endif /* !YYDEBUG */ + + +/* YYINITDEPTH -- initial size of the parser's stacks. 
*/ +#ifndef YYINITDEPTH +# define YYINITDEPTH 200 +#endif + +/* YYMAXDEPTH -- maximum size the stacks can grow to (effective only + if the built-in stack extension method is used). + + Do not make this value too large; the results are undefined if + YYSTACK_ALLOC_MAXIMUM < YYSTACK_BYTES (YYMAXDEPTH) + evaluated with infinite-precision integer arithmetic. */ + +#ifndef YYMAXDEPTH +# define YYMAXDEPTH 10000 +#endif + + +#if YYERROR_VERBOSE + +# ifndef yystrlen +# if defined __GLIBC__ && defined _STRING_H +# define yystrlen strlen +# else +/* Return the length of YYSTR. */ +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static YYSIZE_T +yystrlen (const char *yystr) +#else +static YYSIZE_T +yystrlen (yystr) + const char *yystr; +#endif +{ + YYSIZE_T yylen; + for (yylen = 0; yystr[yylen]; yylen++) + continue; + return yylen; +} +# endif +# endif + +# ifndef yystpcpy +# if defined __GLIBC__ && defined _STRING_H && defined _GNU_SOURCE +# define yystpcpy stpcpy +# else +/* Copy YYSRC to YYDEST, returning the address of the terminating '\0' in + YYDEST. */ +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static char * +yystpcpy (char *yydest, const char *yysrc) +#else +static char * +yystpcpy (yydest, yysrc) + char *yydest; + const char *yysrc; +#endif +{ + char *yyd = yydest; + const char *yys = yysrc; + + while ((*yyd++ = *yys++) != '\0') + continue; + + return yyd - 1; +} +# endif +# endif + +# ifndef yytnamerr +/* Copy to YYRES the contents of YYSTR after stripping away unnecessary + quotes and backslashes, so that it's suitable for yyerror. The + heuristic is that double-quoting is unnecessary unless the string + contains an apostrophe, a comma, or backslash (other than + backslash-backslash). YYSTR is taken from yytname. If YYRES is + null, do not copy; instead, return the length of what the result + would have been. */ +static YYSIZE_T +yytnamerr (char *yyres, const char *yystr) +{ + if (*yystr == '"') + { + YYSIZE_T yyn = 0; + char const *yyp = yystr; + + for (;;) + switch (*++yyp) + { + case '\'': + case ',': + goto do_not_strip_quotes; + + case '\\': + if (*++yyp != '\\') + goto do_not_strip_quotes; + /* Fall through. */ + default: + if (yyres) + yyres[yyn] = *yyp; + yyn++; + break; + + case '"': + if (yyres) + yyres[yyn] = '\0'; + return yyn; + } + do_not_strip_quotes: ; + } + + if (! yyres) + return yystrlen (yystr); + + return yystpcpy (yyres, yystr) - yyres; +} +# endif + +/* Copy into *YYMSG, which is of size *YYMSG_ALLOC, an error message + about the unexpected token YYTOKEN for the state stack whose top is + YYSSP. + + Return 0 if *YYMSG was successfully written. Return 1 if *YYMSG is + not large enough to hold the message. In that case, also set + *YYMSG_ALLOC to the required number of bytes. Return 2 if the + required number of bytes is too large to store. */ +static int +yysyntax_error (YYSIZE_T *yymsg_alloc, char **yymsg, + yytype_int16 *yyssp, int yytoken) +{ + YYSIZE_T yysize0 = yytnamerr (0, yytname[yytoken]); + YYSIZE_T yysize = yysize0; + YYSIZE_T yysize1; + enum { YYERROR_VERBOSE_ARGS_MAXIMUM = 5 }; + /* Internationalized format string. */ + const char *yyformat = 0; + /* Arguments of yyformat. */ + char const *yyarg[YYERROR_VERBOSE_ARGS_MAXIMUM]; + /* Number of reported tokens (one for the "unexpected", one per + "expected"). */ + int yycount = 0; + + /* There are many possibilities here to consider: + - Assume YYFAIL is not used. It's too flawed to consider. 
See + + for details. YYERROR is fine as it does not invoke this + function. + - If this state is a consistent state with a default action, then + the only way this function was invoked is if the default action + is an error action. In that case, don't check for expected + tokens because there are none. + - The only way there can be no lookahead present (in yychar) is if + this state is a consistent state with a default action. Thus, + detecting the absence of a lookahead is sufficient to determine + that there is no unexpected or expected token to report. In that + case, just report a simple "syntax error". + - Don't assume there isn't a lookahead just because this state is a + consistent state with a default action. There might have been a + previous inconsistent state, consistent state with a non-default + action, or user semantic action that manipulated yychar. + - Of course, the expected token list depends on states to have + correct lookahead information, and it depends on the parser not + to perform extra reductions after fetching a lookahead from the + scanner and before detecting a syntax error. Thus, state merging + (from LALR or IELR) and default reductions corrupt the expected + token list. However, the list is correct for canonical LR with + one exception: it will still contain any token that will not be + accepted due to an error action in a later state. + */ + if (yytoken != YYEMPTY) + { + int yyn = yypact[*yyssp]; + yyarg[yycount++] = yytname[yytoken]; + if (!yypact_value_is_default (yyn)) + { + /* Start YYX at -YYN if negative to avoid negative indexes in + YYCHECK. In other words, skip the first -YYN actions for + this state because they are default actions. */ + int yyxbegin = yyn < 0 ? -yyn : 0; + /* Stay within bounds of both yycheck and yytname. */ + int yychecklim = YYLAST - yyn + 1; + int yyxend = yychecklim < YYNTOKENS ? yychecklim : YYNTOKENS; + int yyx; + + for (yyx = yyxbegin; yyx < yyxend; ++yyx) + if (yycheck[yyx + yyn] == yyx && yyx != YYTERROR + && !yytable_value_is_error (yytable[yyx + yyn])) + { + if (yycount == YYERROR_VERBOSE_ARGS_MAXIMUM) + { + yycount = 1; + yysize = yysize0; + break; + } + yyarg[yycount++] = yytname[yyx]; + yysize1 = yysize + yytnamerr (0, yytname[yyx]); + if (! (yysize <= yysize1 + && yysize1 <= YYSTACK_ALLOC_MAXIMUM)) + return 2; + yysize = yysize1; + } + } + } + + switch (yycount) + { +# define YYCASE_(N, S) \ + case N: \ + yyformat = S; \ + break + YYCASE_(0, YY_("syntax error")); + YYCASE_(1, YY_("syntax error, unexpected %s")); + YYCASE_(2, YY_("syntax error, unexpected %s, expecting %s")); + YYCASE_(3, YY_("syntax error, unexpected %s, expecting %s or %s")); + YYCASE_(4, YY_("syntax error, unexpected %s, expecting %s or %s or %s")); + YYCASE_(5, YY_("syntax error, unexpected %s, expecting %s or %s or %s or %s")); +# undef YYCASE_ + } + + yysize1 = yysize + yystrlen (yyformat); + if (! (yysize <= yysize1 && yysize1 <= YYSTACK_ALLOC_MAXIMUM)) + return 2; + yysize = yysize1; + + if (*yymsg_alloc < yysize) + { + *yymsg_alloc = 2 * yysize; + if (! (yysize <= *yymsg_alloc + && *yymsg_alloc <= YYSTACK_ALLOC_MAXIMUM)) + *yymsg_alloc = YYSTACK_ALLOC_MAXIMUM; + return 1; + } + + /* Avoid sprintf, as that infringes on the user's name space. + Don't have undefined behavior even if the translation + produced a string with the wrong number of "%s"s. 
*/ + { + char *yyp = *yymsg; + int yyi = 0; + while ((*yyp = *yyformat) != '\0') + if (*yyp == '%' && yyformat[1] == 's' && yyi < yycount) + { + yyp += yytnamerr (yyp, yyarg[yyi++]); + yyformat += 2; + } + else + { + yyp++; + yyformat++; + } + } + return 0; +} +#endif /* YYERROR_VERBOSE */ + +/*-----------------------------------------------. +| Release the memory associated to this symbol. | +`-----------------------------------------------*/ + +/*ARGSUSED*/ +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static void +yydestruct (const char *yymsg, int yytype, YYSTYPE *yyvaluep) +#else +static void +yydestruct (yymsg, yytype, yyvaluep) + const char *yymsg; + int yytype; + YYSTYPE *yyvaluep; +#endif +{ + YYUSE (yyvaluep); + + if (!yymsg) + yymsg = "Deleting"; + YY_SYMBOL_PRINT (yymsg, yytype, yyvaluep, yylocationp); + + switch (yytype) + { + + default: + break; + } +} + + +/* Prevent warnings from -Wmissing-prototypes. */ +#ifdef YYPARSE_PARAM +#if defined __STDC__ || defined __cplusplus +int yyparse (void *YYPARSE_PARAM); +#else +int yyparse (); +#endif +#else /* ! YYPARSE_PARAM */ +#if defined __STDC__ || defined __cplusplus +int yyparse (void); +#else +int yyparse (); +#endif +#endif /* ! YYPARSE_PARAM */ + + +/*----------. +| yyparse. | +`----------*/ + +#ifdef YYPARSE_PARAM +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +int +yyparse (void *YYPARSE_PARAM) +#else +int +yyparse (YYPARSE_PARAM) + void *YYPARSE_PARAM; +#endif +#else /* ! YYPARSE_PARAM */ +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +int +yyparse (void) +#else +int +yyparse () + +#endif +#endif +{ +/* The lookahead symbol. */ +int yychar; +/* The backup of yychar when there is an error and we're in yyerrlab. */ +int yylastchar; + +/* The semantic value of the lookahead symbol. */ +YYSTYPE yylval; + + /* Number of syntax errors so far. */ + int yynerrs; + + int yystate; + /* Number of tokens to shift before error messages enabled. */ + int yyerrstatus; + + /* The stacks and their tools: + `yyss': related to states. + `yyvs': related to semantic values. + + Refer to the stacks thru separate pointers, to allow yyoverflow + to reallocate them elsewhere. */ + + /* The state stack. */ + yytype_int16 yyssa[YYINITDEPTH]; + yytype_int16 *yyss; + yytype_int16 *yyssp; + + /* The semantic value stack. */ + YYSTYPE yyvsa[YYINITDEPTH]; + YYSTYPE *yyvs; + YYSTYPE *yyvsp; + + YYSIZE_T yystacksize; + + int yyn; + int yyresult; + /* Lookahead token as an internal (translated) token number. */ + int yytoken; + /* The variables used to return semantic value and location from the + action routines. */ + YYSTYPE yyval; + +#if YYERROR_VERBOSE + /* Buffer for error messages, and its allocated size. */ + char yymsgbuf[128]; + char *yymsg = yymsgbuf; + YYSIZE_T yymsg_alloc = sizeof yymsgbuf; +#endif + +#define YYPOPSTACK(N) (yyvsp -= (N), yyssp -= (N)) + + /* The number of symbols on the RHS of the reduced rule. + Keep to zero when no symbol should be popped. */ + int yylen = 0; + + yytoken = 0; + yyss = yyssa; + yyvs = yyvsa; + yystacksize = YYINITDEPTH; + + YYDPRINTF ((stderr, "Starting parse\n")); + + yystate = 0; + yyerrstatus = 0; + yynerrs = 0; + yychar = YYEMPTY; /* Cause a token to be read. */ + + /* Initialize stack pointers. + Waste one element of value and location stack + so that they stay on the same level as the state stack. + The wasted elements are never initialized. 
*/ + yyssp = yyss; + yyvsp = yyvs; + + goto yysetstate; + +/*------------------------------------------------------------. +| yynewstate -- Push a new state, which is found in yystate. | +`------------------------------------------------------------*/ + yynewstate: + /* In all cases, when you get here, the value and location stacks + have just been pushed. So pushing a state here evens the stacks. */ + yyssp++; + + yysetstate: + *yyssp = yystate; + + if (yyss + yystacksize - 1 <= yyssp) + { + /* Get the current used size of the three stacks, in elements. */ + YYSIZE_T yysize = yyssp - yyss + 1; + +#ifdef yyoverflow + { + /* Give user a chance to reallocate the stack. Use copies of + these so that the &'s don't force the real ones into + memory. */ + YYSTYPE *yyvs1 = yyvs; + yytype_int16 *yyss1 = yyss; + + /* Each stack pointer address is followed by the size of the + data in use in that stack, in bytes. This used to be a + conditional around just the two extra args, but that might + be undefined if yyoverflow is a macro. */ + yyoverflow (YY_("memory exhausted"), + &yyss1, yysize * sizeof (*yyssp), + &yyvs1, yysize * sizeof (*yyvsp), + &yystacksize); + + yyss = yyss1; + yyvs = yyvs1; + } +#else /* no yyoverflow */ +# ifndef YYSTACK_RELOCATE + goto yyexhaustedlab; +# else + /* Extend the stack our own way. */ + if (YYMAXDEPTH <= yystacksize) + goto yyexhaustedlab; + yystacksize *= 2; + if (YYMAXDEPTH < yystacksize) + yystacksize = YYMAXDEPTH; + + { + yytype_int16 *yyss1 = yyss; + union yyalloc *yyptr = + (union yyalloc *) YYSTACK_ALLOC (YYSTACK_BYTES (yystacksize)); + if (! yyptr) + goto yyexhaustedlab; + YYSTACK_RELOCATE (yyss_alloc, yyss); + YYSTACK_RELOCATE (yyvs_alloc, yyvs); +# undef YYSTACK_RELOCATE + if (yyss1 != yyssa) + YYSTACK_FREE (yyss1); + } +# endif +#endif /* no yyoverflow */ + + yyssp = yyss + yysize - 1; + yyvsp = yyvs + yysize - 1; + + YYDPRINTF ((stderr, "Stack size increased to %lu\n", + (unsigned long int) yystacksize)); + + if (yyss + yystacksize - 1 <= yyssp) + YYABORT; + } + + YYDPRINTF ((stderr, "Entering state %d\n", yystate)); + + if (yystate == YYFINAL) + YYACCEPT; + + goto yybackup; + +/*-----------. +| yybackup. | +`-----------*/ +yybackup: + + /* Do appropriate processing given the current state. Read a + lookahead token if we need one and don't already have one. */ + + /* First try to decide what to do without reference to lookahead token. */ + yyn = yypact[yystate]; + if (yypact_value_is_default (yyn)) + goto yydefault; + + /* Not known => get a lookahead token if don't already have one. */ + + /* YYCHAR is either YYEMPTY or YYEOF or a valid lookahead symbol. */ + if (yychar == YYEMPTY) + { + YYDPRINTF ((stderr, "Reading a token: ")); + yychar = YYLEX; + } + + if (yychar <= YYEOF) + { + yychar = yytoken = YYEOF; + YYDPRINTF ((stderr, "Now at end of input.\n")); + } + else + { + yytoken = YYTRANSLATE (yychar); + YY_SYMBOL_PRINT ("Next token is", yytoken, &yylval, &yylloc); + } + + /* If the proper action on seeing token YYTOKEN is to reduce or to + detect an error, take that action. */ + yyn += yytoken; + if (yyn < 0 || YYLAST < yyn || yycheck[yyn] != yytoken) + goto yydefault; + yyn = yytable[yyn]; + if (yyn <= 0) + { + if (yytable_value_is_error (yyn)) + goto yyerrlab; + yyn = -yyn; + goto yyreduce; + } + + /* Count tokens shifted since error; after three, turn off error + status. */ + if (yyerrstatus) + yyerrstatus--; + + /* Shift the lookahead token. */ + YY_SYMBOL_PRINT ("Shifting", yytoken, &yylval, &yylloc); + + /* Discard the shifted token. 
*/ + yychar = YYEMPTY; + + yystate = yyn; + *++yyvsp = yylval; + + goto yynewstate; + + +/*-----------------------------------------------------------. +| yydefault -- do the default action for the current state. | +`-----------------------------------------------------------*/ +yydefault: + yyn = yydefact[yystate]; + if (yyn == 0) + goto yyerrlab; + goto yyreduce; + + +/*-----------------------------. +| yyreduce -- Do a reduction. | +`-----------------------------*/ +yyreduce: + /* yyn is the number of a rule to reduce with. */ + yylen = yyr2[yyn]; + + /* If YYLEN is nonzero, implement the default value of the action: + `$$ = $1'. + + Otherwise, the following line sets YYVAL to garbage. + This behavior is undocumented and Bison + users should not rely upon it. Assigning to YYVAL + unconditionally makes the parser a bit smaller, and it avoids a + GCC warning that YYVAL may be used uninitialized. */ + yyval = yyvsp[1-yylen]; + + + YY_REDUCE_PRINT (yyn); + switch (yyn) + { + case 2: + +/* Line 1806 of yacc.c */ +#line 79 "fts0pars.y" + { + (yyval.node) = (yyvsp[(1) - (1)].node); + ((fts_ast_state_t*) state)->root = (yyval.node); + } + break; + + case 3: + +/* Line 1806 of yacc.c */ +#line 85 "fts0pars.y" + { + (yyval.node) = NULL; + } + break; + + case 4: + +/* Line 1806 of yacc.c */ +#line 89 "fts0pars.y" + { + (yyval.node) = (yyvsp[(1) - (2)].node); + + if (!(yyval.node)) { + (yyval.node) = fts_ast_create_node_list(state, (yyvsp[(2) - (2)].node)); + } else { + fts_ast_add_node((yyval.node), (yyvsp[(2) - (2)].node)); + } + } + break; + + case 5: + +/* Line 1806 of yacc.c */ +#line 99 "fts0pars.y" + { + (yyval.node) = (yyvsp[(1) - (2)].node); + (yyval.node) = fts_ast_create_node_list(state, (yyvsp[(1) - (2)].node)); + + if (!(yyval.node)) { + (yyval.node) = (yyvsp[(2) - (2)].node); + } else { + fts_ast_add_node((yyval.node), (yyvsp[(2) - (2)].node)); + } + } + break; + + case 6: + +/* Line 1806 of yacc.c */ +#line 111 "fts0pars.y" + { + (yyval.node) = (yyvsp[(2) - (3)].node); + + if ((yyval.node)) { + (yyval.node) = fts_ast_create_node_subexp_list(state, (yyval.node)); + } + } + break; + + case 7: + +/* Line 1806 of yacc.c */ +#line 119 "fts0pars.y" + { + (yyval.node) = fts_ast_create_node_list(state, (yyvsp[(1) - (4)].node)); + + if ((yyvsp[(3) - (4)].node)) { + fts_ast_add_node((yyval.node), + fts_ast_create_node_subexp_list(state, (yyvsp[(3) - (4)].node))); + } + } + break; + + case 8: + +/* Line 1806 of yacc.c */ +#line 129 "fts0pars.y" + { + (yyval.node) = (yyvsp[(1) - (1)].node); + } + break; + + case 9: + +/* Line 1806 of yacc.c */ +#line 133 "fts0pars.y" + { + (yyval.node) = (yyvsp[(1) - (1)].node); + } + break; + + case 10: + +/* Line 1806 of yacc.c */ +#line 137 "fts0pars.y" + { + fts_ast_term_set_wildcard((yyvsp[(1) - (2)].node)); + } + break; + + case 11: + +/* Line 1806 of yacc.c */ +#line 141 "fts0pars.y" + { + fts_ast_text_set_distance((yyvsp[(1) - (3)].node), fts_ast_string_to_ul((yyvsp[(3) - (3)].token), 10)); + fts_ast_string_free((yyvsp[(3) - (3)].token)); + } + break; + + case 12: + +/* Line 1806 of yacc.c */ +#line 146 "fts0pars.y" + { + (yyval.node) = fts_ast_create_node_list(state, (yyvsp[(1) - (3)].node)); + fts_ast_add_node((yyval.node), (yyvsp[(2) - (3)].node)); + fts_ast_term_set_wildcard((yyvsp[(2) - (3)].node)); + } + break; + + case 13: + +/* Line 1806 of yacc.c */ +#line 152 "fts0pars.y" + { + (yyval.node) = fts_ast_create_node_list(state, (yyvsp[(1) - (2)].node)); + fts_ast_add_node((yyval.node), (yyvsp[(2) - (2)].node)); + } + break; + + case 14: + +/* Line 
1806 of yacc.c */ +#line 157 "fts0pars.y" + { + (yyval.node) = fts_ast_create_node_list(state, (yyvsp[(1) - (4)].node)); + fts_ast_add_node((yyval.node), (yyvsp[(2) - (4)].node)); + fts_ast_text_set_distance((yyvsp[(2) - (4)].node), fts_ast_string_to_ul((yyvsp[(4) - (4)].token), 10)); + fts_ast_string_free((yyvsp[(4) - (4)].token)); + } + break; + + case 15: + +/* Line 1806 of yacc.c */ +#line 164 "fts0pars.y" + { + (yyval.node) = fts_ast_create_node_list(state, (yyvsp[(1) - (2)].node)); + fts_ast_add_node((yyval.node), (yyvsp[(2) - (2)].node)); + } + break; + + case 16: + +/* Line 1806 of yacc.c */ +#line 170 "fts0pars.y" + { + (yyval.node) = fts_ast_create_node_oper(state, FTS_IGNORE); + } + break; + + case 17: + +/* Line 1806 of yacc.c */ +#line 174 "fts0pars.y" + { + (yyval.node) = fts_ast_create_node_oper(state, FTS_EXIST); + } + break; + + case 18: + +/* Line 1806 of yacc.c */ +#line 178 "fts0pars.y" + { + (yyval.node) = fts_ast_create_node_oper(state, FTS_NEGATE); + } + break; + + case 19: + +/* Line 1806 of yacc.c */ +#line 182 "fts0pars.y" + { + (yyval.node) = fts_ast_create_node_oper(state, FTS_DECR_RATING); + } + break; + + case 20: + +/* Line 1806 of yacc.c */ +#line 186 "fts0pars.y" + { + (yyval.node) = fts_ast_create_node_oper(state, FTS_INCR_RATING); + } + break; + + case 21: + +/* Line 1806 of yacc.c */ +#line 191 "fts0pars.y" + { + (yyval.node) = fts_ast_create_node_term(state, (yyvsp[(1) - (1)].token)); + fts_ast_string_free((yyvsp[(1) - (1)].token)); + } + break; + + case 22: + +/* Line 1806 of yacc.c */ +#line 196 "fts0pars.y" + { + (yyval.node) = fts_ast_create_node_term(state, (yyvsp[(1) - (1)].token)); + fts_ast_string_free((yyvsp[(1) - (1)].token)); + } + break; + + case 23: + +/* Line 1806 of yacc.c */ +#line 202 "fts0pars.y" + { + (yyval.node) = (yyvsp[(2) - (2)].node); + } + break; + + case 24: + +/* Line 1806 of yacc.c */ +#line 207 "fts0pars.y" + { + (yyval.node) = fts_ast_create_node_text(state, (yyvsp[(1) - (1)].token)); + fts_ast_string_free((yyvsp[(1) - (1)].token)); + } + break; + + + +/* Line 1806 of yacc.c */ +#line 1663 "fts0pars.cc" + default: break; + } + /* User semantic actions sometimes alter yychar, and that requires + that yytoken be updated with the new translation. We take the + approach of translating immediately before every use of yytoken. + One alternative is translating here after every semantic action, + but that translation would be missed if the semantic action invokes + YYABORT, YYACCEPT, or YYERROR immediately after altering yychar or + if it invokes YYBACKUP. In the case of YYABORT or YYACCEPT, an + incorrect destructor might then be invoked immediately. In the + case of YYERROR or YYBACKUP, subsequent parser actions might lead + to an incorrect destructor call or verbose syntax error message + before the lookahead is translated. */ + YY_SYMBOL_PRINT ("-> $$ =", yyr1[yyn], &yyval, &yyloc); + + YYPOPSTACK (yylen); + yylen = 0; + YY_STACK_PRINT (yyss, yyssp); + + *++yyvsp = yyval; + + /* Now `shift' the result of the reduction. Determine what state + that goes to, based on the state we popped back to and the rule + number reduced by. */ + + yyn = yyr1[yyn]; + + yystate = yypgoto[yyn - YYNTOKENS] + *yyssp; + if (0 <= yystate && yystate <= YYLAST && yycheck[yystate] == *yyssp) + yystate = yytable[yystate]; + else + yystate = yydefgoto[yyn - YYNTOKENS]; + + goto yynewstate; + + +/*------------------------------------. 
+| yyerrlab -- here on detecting error | +`------------------------------------*/ +yyerrlab: + /* Backup yychar, in case we would change it. */ + yylastchar = yychar; + /* Make sure we have latest lookahead translation. See comments at + user semantic actions for why this is necessary. */ + yytoken = yychar == YYEMPTY ? YYEMPTY : YYTRANSLATE (yychar); + + /* If not already recovering from an error, report this error. */ + if (!yyerrstatus) + { + ++yynerrs; +#if ! YYERROR_VERBOSE + yyerror (YY_("syntax error")); +#else +# define YYSYNTAX_ERROR yysyntax_error (&yymsg_alloc, &yymsg, \ + yyssp, yytoken) + { + char const *yymsgp = YY_("syntax error"); + int yysyntax_error_status; + yysyntax_error_status = YYSYNTAX_ERROR; + if (yysyntax_error_status == 0) + yymsgp = yymsg; + else if (yysyntax_error_status == 1) + { + if (yymsg != yymsgbuf) + YYSTACK_FREE (yymsg); + yymsg = (char *) YYSTACK_ALLOC (yymsg_alloc); + if (!yymsg) + { + yymsg = yymsgbuf; + yymsg_alloc = sizeof yymsgbuf; + yysyntax_error_status = 2; + } + else + { + yysyntax_error_status = YYSYNTAX_ERROR; + yymsgp = yymsg; + } + } + yyerror (yymsgp); + if (yysyntax_error_status == 2) + goto yyexhaustedlab; + } +# undef YYSYNTAX_ERROR +#endif + } + + + + if (yyerrstatus == 3) + { + /* If just tried and failed to reuse lookahead token after an + error, discard it. */ + + if (yychar <= YYEOF) + { + /* Return failure if at end of input. */ + if (yychar == YYEOF) + { + /* Since we don't need the token, we have to free it first. */ + YYERRCLEANUP; + YYABORT; + } + } + else + { + yydestruct ("Error: discarding", + yytoken, &yylval); + yychar = YYEMPTY; + } + } + + /* Else will try to reuse lookahead token after shifting the error + token. */ + goto yyerrlab1; + + +/*---------------------------------------------------. +| yyerrorlab -- error raised explicitly by YYERROR. | +`---------------------------------------------------*/ +yyerrorlab: + + /* Pacify compilers like GCC when the user code never invokes + YYERROR and the label yyerrorlab therefore never appears in user + code. */ + if (/*CONSTCOND*/ 0) + goto yyerrorlab; + + /* Do not reclaim the symbols of the rule which action triggered + this YYERROR. */ + YYPOPSTACK (yylen); + yylen = 0; + YY_STACK_PRINT (yyss, yyssp); + yystate = *yyssp; + goto yyerrlab1; + + +/*-------------------------------------------------------------. +| yyerrlab1 -- common code for both syntax error and YYERROR. | +`-------------------------------------------------------------*/ +yyerrlab1: + yyerrstatus = 3; /* Each real token shifted decrements this. */ + + for (;;) + { + yyn = yypact[yystate]; + if (!yypact_value_is_default (yyn)) + { + yyn += YYTERROR; + if (0 <= yyn && yyn <= YYLAST && yycheck[yyn] == YYTERROR) + { + yyn = yytable[yyn]; + if (0 < yyn) + break; + } + } + + /* Pop the current state because it cannot handle the error token. */ + if (yyssp == yyss) + { + /* Since we don't need the error token, we have to free it first. */ + YYERRCLEANUP; + YYABORT; + } + + + yydestruct ("Error: popping", + yystos[yystate], yyvsp); + YYPOPSTACK (1); + yystate = *yyssp; + YY_STACK_PRINT (yyss, yyssp); + } + + *++yyvsp = yylval; + + + /* Shift the error token. */ + YY_SYMBOL_PRINT ("Shifting", yystos[yyn], yyvsp, yylsp); + + yystate = yyn; + goto yynewstate; + + +/*-------------------------------------. +| yyacceptlab -- YYACCEPT comes here. | +`-------------------------------------*/ +yyacceptlab: + yyresult = 0; + goto yyreturn; + +/*-----------------------------------. +| yyabortlab -- YYABORT comes here. 
|
+`-----------------------------------*/
+yyabortlab:
+  yyresult = 1;
+  goto yyreturn;
+
+#if !defined(yyoverflow) || YYERROR_VERBOSE
+/*-------------------------------------------------.
+| yyexhaustedlab -- memory exhaustion comes here.  |
+`-------------------------------------------------*/
+yyexhaustedlab:
+  yyerror (YY_("memory exhausted"));
+  yyresult = 2;
+  /* Fall through.  */
+#endif
+
+yyreturn:
+  if (yychar != YYEMPTY)
+    {
+      /* Make sure we have latest lookahead translation.  See comments at
+         user semantic actions for why this is necessary.  */
+      yytoken = YYTRANSLATE (yychar);
+      yydestruct ("Cleanup: discarding lookahead",
+                  yytoken, &yylval);
+    }
+  /* Do not reclaim the symbols of the rule which action triggered
+     this YYABORT or YYACCEPT.  */
+  YYPOPSTACK (yylen);
+  YY_STACK_PRINT (yyss, yyssp);
+  while (yyssp != yyss)
+    {
+      yydestruct ("Cleanup: popping",
+                  yystos[*yyssp], yyvsp);
+      YYPOPSTACK (1);
+    }
+#ifndef yyoverflow
+  if (yyss != yyssa)
+    YYSTACK_FREE (yyss);
+#endif
+#if YYERROR_VERBOSE
+  if (yymsg != yymsgbuf)
+    YYSTACK_FREE (yymsg);
+#endif
+  /* Make sure YYID is used.  */
+  return YYID (yyresult);
+}
+
+
+
+/* Line 2067 of yacc.c  */
+#line 212 "fts0pars.y"
+
+
+/********************************************************************
+*/
+int
+ftserror(
+/*=====*/
+	const char*	p)
+{
+	my_printf_error(ER_PARSE_ERROR, "%s", MYF(0), p);
+	return(0);
+}
+
+/********************************************************************
+Create a fts_lexer_t instance.*/
+fts_lexer_t*
+fts_lexer_create(
+/*=============*/
+	ibool		boolean_mode,
+	const byte*	query,
+	ulint		query_len)
+{
+	fts_lexer_t*	fts_lexer = static_cast<fts_lexer_t*>(
+		ut_malloc_nokey(sizeof(fts_lexer_t)));
+
+	if (boolean_mode) {
+		fts0blex_init(&fts_lexer->yyscanner);
+		fts0b_scan_bytes(
+			reinterpret_cast<const char*>(query),
+			static_cast<int>(query_len),
+			fts_lexer->yyscanner);
+		fts_lexer->scanner = fts_blexer;
+		/* FIXME: Debugging */
+		/* fts0bset_debug(1 , fts_lexer->yyscanner); */
+	} else {
+		fts0tlex_init(&fts_lexer->yyscanner);
+		fts0t_scan_bytes(
+			reinterpret_cast<const char*>(query),
+			static_cast<int>(query_len),
+			fts_lexer->yyscanner);
+		fts_lexer->scanner = fts_tlexer;
+	}
+
+	return(fts_lexer);
+}
+
+/********************************************************************
+Free an fts_lexer_t instance.*/
+void
+
+fts_lexer_free(
+/*===========*/
+	fts_lexer_t*	fts_lexer)
+{
+	if (fts_lexer->scanner == fts_blexer) {
+		fts0blex_destroy(fts_lexer->yyscanner);
+	} else {
+		fts0tlex_destroy(fts_lexer->yyscanner);
+	}
+
+	ut_free(fts_lexer);
+}
+
+/********************************************************************
+Call the appropriate scanner.*/
+int
+fts_lexer(
+/*======*/
+	YYSTYPE*	val,
+	fts_lexer_t*	fts_lexer)
+{
+	fts_scanner	func_ptr;
+
+	func_ptr = fts_lexer->scanner;
+
+	return(func_ptr(val, fts_lexer->yyscanner));
+}
+
+/********************************************************************
+Parse the query.*/
+int
+fts_parse(
+/*======*/
+	fts_ast_state_t*	state)
+{
+	return(ftsparse(state));
+}
+
diff --git a/storage/innobase/fts/fts0pars.y b/storage/innobase/fts/fts0pars.y
new file mode 100644
index 00000000..903c7280
--- /dev/null
+++ b/storage/innobase/fts/fts0pars.y
@@ -0,0 +1,293 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2014, Oracle and/or its affiliates. All rights reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
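Editorial aside on the parser epilogue just above: fts_lexer_create() selects one of the two generated flex scanners and stores it as a function pointer in fts_lexer_t, and fts_lexer() simply forwards through that pointer. The following minimal, self-contained sketch shows the same dispatch pattern; the scanner bodies and types are hypothetical stand-ins, not the real yyscan_t machinery.

#include <cstdio>

/* Hypothetical stand-ins for the two generated scanners
   (the real ones are fts0blex/fts0tlex working on a yyscan_t). */
static int boolean_scan(int* tok) { *tok = 'B'; return 1; }
static int natural_scan(int* tok) { *tok = 'N'; return 1; }

/* Mirrors fts_lexer_struct: the scanner is chosen once at create time. */
struct lexer_t {
	int (*scanner)(int*);
};

static lexer_t lexer_create(bool boolean_mode)
{
	lexer_t	lex;
	/* Like selecting fts_blexer vs. fts_tlexer in fts_lexer_create(). */
	lex.scanner = boolean_mode ? boolean_scan : natural_scan;
	return lex;
}

int main()
{
	lexer_t	lex = lexer_create(true);
	int	tok;
	lex.scanner(&tok);		/* fts_lexer() forwards the same way */
	std::printf("%c\n", tok);	/* prints B */
	return 0;
}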
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**
+ * @file fts/fts0pars.y
+ * FTS parser: input file for the GNU Bison parser generator
+ *
+ * Created 2007/5/9 Sunny Bains
+ */
+
+%{
+#include "ha_prototypes.h"
+#include "mem0mem.h"
+#include "fts0ast.h"
+#include "fts0blex.h"
+#include "fts0tlex.h"
+#include "fts0pars.h"
+#include 
+
+extern int fts_lexer(YYSTYPE*, fts_lexer_t*);
+extern int fts_blexer(YYSTYPE*, yyscan_t);
+extern int fts_tlexer(YYSTYPE*, yyscan_t);
+
+#ifdef __GNUC__
+# pragma GCC diagnostic ignored "-Wpragmas"
+# pragma GCC diagnostic ignored "-Wunknown-warning-option"
+# pragma GCC diagnostic ignored "-Wunused-but-set-variable"
+#endif
+
+extern int ftserror(const char* p);
+
+/* Required for reentrant parser */
+#define ftslex	fts_lexer
+
+#define YYERROR_VERBOSE
+
+/* For passing an argument to yyparse() */
+#define YYPARSE_PARAM state
+#define YYLEX_PARAM ((fts_ast_state_t*) state)->lexer
+
+
+typedef int (*fts_scanner)(YYSTYPE* val, yyscan_t yyscanner);
+
+struct fts_lexer_struct {
+	fts_scanner	scanner;
+	void*		yyscanner;
+};
+
+%}
+
+%union {
+	int			oper;
+	fts_ast_string_t*	token;
+	fts_ast_node_t*		node;
+};
+
+/* Enable re-entrant parser */
+%pure_parser
+
+%token<oper>	FTS_OPER
+%token<token>	FTS_TEXT FTS_TERM FTS_NUMB
+
+%type<node>	prefix term text expr sub_expr expr_lst query
+
+%nonassoc	'+' '-' '~' '<' '>'
+
+%%
+
+query	: expr_lst	{
+		$$ = $1;
+		((fts_ast_state_t*) state)->root = $$;
+	}
+	;
+
+expr_lst: /* Empty */	{
+		$$ = NULL;
+	}
+
+	| expr_lst expr	{
+		$$ = $1;
+
+		if (!$$) {
+			$$ = fts_ast_create_node_list(state, $2);
+		} else {
+			fts_ast_add_node($$, $2);
+		}
+	}
+
+	| expr_lst sub_expr	{
+		$$ = $1;
+		$$ = fts_ast_create_node_list(state, $1);
+
+		if (!$$) {
+			$$ = $2;
+		} else {
+			fts_ast_add_node($$, $2);
+		}
+	}
+	;
+
+sub_expr: '(' expr_lst ')'	{
+		$$ = $2;
+
+		if ($$) {
+			$$ = fts_ast_create_node_subexp_list(state, $$);
+		}
+	}
+
+	| prefix '(' expr_lst ')'	{
+		$$ = fts_ast_create_node_list(state, $1);
+
+		if ($3) {
+			fts_ast_add_node($$,
+				fts_ast_create_node_subexp_list(state, $3));
+		}
+	}
+	;
+
+expr	: term	{
+		$$ = $1;
+	}
+
+	| text	{
+		$$ = $1;
+	}
+
+	| term '*'	{
+		fts_ast_term_set_wildcard($1);
+	}
+
+	| text '@' FTS_NUMB	{
+		fts_ast_text_set_distance($1, fts_ast_string_to_ul($3, 10));
+		fts_ast_string_free($3);
+	}
+
+	| prefix term '*'	{
+		$$ = fts_ast_create_node_list(state, $1);
+		fts_ast_add_node($$, $2);
+		fts_ast_term_set_wildcard($2);
+	}
+
+	| prefix term	{
+		$$ = fts_ast_create_node_list(state, $1);
+		fts_ast_add_node($$, $2);
+	}
+
+	| prefix text '@' FTS_NUMB	{
+		$$ = fts_ast_create_node_list(state, $1);
+		fts_ast_add_node($$, $2);
+		fts_ast_text_set_distance($2, fts_ast_string_to_ul($4, 10));
+		fts_ast_string_free($4);
+	}
+
+	| prefix text	{
+		$$ = fts_ast_create_node_list(state, $1);
+		fts_ast_add_node($$, $2);
+	}
+	;
+
+prefix	: '-'	{
+		$$ = fts_ast_create_node_oper(state, FTS_IGNORE);
+	}
+
+	| '+'	{
+		$$ = fts_ast_create_node_oper(state, FTS_EXIST);
+	}
+
+	| '~'	{
+		$$ = fts_ast_create_node_oper(state, FTS_NEGATE);
+	}
+
+	| '<'	{
+		$$ =
fts_ast_create_node_oper(state, FTS_DECR_RATING);
+	}
+
+	| '>'	{
+		$$ = fts_ast_create_node_oper(state, FTS_INCR_RATING);
+	}
+	;
+
+term	: FTS_TERM	{
+		$$ = fts_ast_create_node_term(state, $1);
+		fts_ast_string_free($1);
+	}
+
+	| FTS_NUMB	{
+		$$ = fts_ast_create_node_term(state, $1);
+		fts_ast_string_free($1);
+	}
+
+	/* Ignore leading '*' */
+	| '*' term	{
+		$$ = $2;
+	}
+	;
+
+text	: FTS_TEXT	{
+		$$ = fts_ast_create_node_text(state, $1);
+		fts_ast_string_free($1);
+	}
+	;
+%%
+
+/********************************************************************
+*/
+int
+ftserror(
+/*=====*/
+	const char*	p)
+{
+	fprintf(stderr, "%s\n", p);
+	return(0);
+}
+
+/********************************************************************
+Create a fts_lexer_t instance.*/
+fts_lexer_t*
+fts_lexer_create(
+/*=============*/
+	ibool		boolean_mode,
+	const byte*	query,
+	ulint		query_len)
+{
+	fts_lexer_t*	fts_lexer = static_cast<fts_lexer_t*>(
+		ut_malloc_nokey(sizeof(fts_lexer_t)));
+
+	if (boolean_mode) {
+		fts0blex_init(&fts_lexer->yyscanner);
+		fts0b_scan_bytes((char*) query, (int) query_len, fts_lexer->yyscanner);
+		fts_lexer->scanner = fts_blexer;
+		/* FIXME: Debugging */
+		/* fts0bset_debug(1 , fts_lexer->yyscanner); */
+	} else {
+		fts0tlex_init(&fts_lexer->yyscanner);
+		fts0t_scan_bytes((char*) query, (int) query_len, fts_lexer->yyscanner);
+		fts_lexer->scanner = fts_tlexer;
+	}
+
+	return(fts_lexer);
+}
+
+/********************************************************************
+Free an fts_lexer_t instance.*/
+void
+
+fts_lexer_free(
+/*===========*/
+	fts_lexer_t*	fts_lexer)
+{
+	if (fts_lexer->scanner == fts_blexer) {
+		fts0blex_destroy(fts_lexer->yyscanner);
+	} else {
+		fts0tlex_destroy(fts_lexer->yyscanner);
+	}
+
+	ut_free(fts_lexer);
+}
+
+/********************************************************************
+Call the appropriate scanner.*/
+int
+fts_lexer(
+/*======*/
+	YYSTYPE*	val,
+	fts_lexer_t*	fts_lexer)
+{
+	fts_scanner	func_ptr;
+
+	func_ptr = fts_lexer->scanner;
+
+	return(func_ptr(val, fts_lexer->yyscanner));
+}
+
+/********************************************************************
+Parse the query.*/
+int
+fts_parse(
+/*======*/
+	fts_ast_state_t*	state)
+{
+	return(ftsparse(state));
+}
diff --git a/storage/innobase/fts/fts0plugin.cc b/storage/innobase/fts/fts0plugin.cc
new file mode 100644
index 00000000..de99d170
--- /dev/null
+++ b/storage/innobase/fts/fts0plugin.cc
@@ -0,0 +1,283 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2015, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file fts/fts0plugin.cc
+Full Text Search plugin support.
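As a reading aid for the grammar above: each prefix operator maps onto one AST operator node ('+' to FTS_EXIST, '-' to FTS_IGNORE, '~' to FTS_NEGATE, '<' and '>' to the rating operators), a trailing '*' marks a term as a wildcard, and '@' FTS_NUMB attaches a proximity distance to a quoted text node. For example, under these rules the boolean query

    +apple -banana "cherry pie" @3

derives roughly as follows (a sketch of the shape only, not the exact node layout):

    query
    └─ expr_lst
       ├─ expr: prefix('+' → FTS_EXIST) list containing term(apple)
       ├─ expr: prefix('-' → FTS_IGNORE) list containing term(banana)
       └─ expr: text("cherry pie") with distance 3 set via
                fts_ast_text_set_distance()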
+
+Created 2013/06/04 Shaohua Wang
+***********************************************************************/
+
+#include "fts0ast.h"
+#include "fts0plugin.h"
+#include "fts0tokenize.h"
+
+#include "ft_global.h"
+
+/******************************************************************//**
+FTS default parser init
+@return 0 */
+static int fts_default_parser_init(MYSQL_FTPARSER_PARAM*) { return 0; }
+
+/******************************************************************//**
+FTS default parser deinit
+@return 0 */
+static int fts_default_parser_deinit(MYSQL_FTPARSER_PARAM*) { return 0; }
+
+/******************************************************************//**
+FTS default parser parse from ft_static.c in MYISAM.
+@return 0 if parse successfully, or return non-zero */
+static
+int
+fts_default_parser_parse(
+/*=====================*/
+	MYSQL_FTPARSER_PARAM	*param)	/*!< in: plugin parser param */
+{
+	return(param->mysql_parse(param, param->doc, param->length));
+}
+
+/* FTS default parser from ft_static.c in MYISAM. */
+struct st_mysql_ftparser fts_default_parser =
+{
+	MYSQL_FTPARSER_INTERFACE_VERSION,
+	fts_default_parser_parse,
+	fts_default_parser_init,
+	fts_default_parser_deinit
+};
+
+/******************************************************************//**
+Get an operator node from token boolean info
+@return node */
+static
+fts_ast_node_t*
+fts_query_get_oper_node(
+/*====================*/
+	MYSQL_FTPARSER_BOOLEAN_INFO*	info,	/*!< in: token info */
+	fts_ast_state_t*	state)	/*!< in/out: query parse state*/
+{
+	fts_ast_node_t*	oper_node = NULL;
+
+	if (info->yesno > 0) {
+		oper_node = fts_ast_create_node_oper(state, FTS_EXIST);
+	} else if (info->yesno < 0) {
+		oper_node = fts_ast_create_node_oper(state, FTS_IGNORE);
+	} else if (info->weight_adjust > 0) {
+		oper_node = fts_ast_create_node_oper(state, FTS_INCR_RATING);
+	} else if (info->weight_adjust < 0) {
+		oper_node = fts_ast_create_node_oper(state, FTS_DECR_RATING);
+	} else if (info->wasign > 0) {
+		oper_node = fts_ast_create_node_oper(state, FTS_NEGATE);
+	}
+
+	return(oper_node);
+}
+
+/******************************************************************//**
+FTS plugin parser 'mysql_add_word' callback function for query parse.
+Refer to 'st_mysql_ftparser_param' for more detail.
+Note:
+a. Parse logic refers to 'ftb_query_add_word' from ft_boolean_search.c in MYISAM;
+b. Parse node or tree refers to fts0pars.y.
+@return 0 if add successfully, or return non-zero.
*/ +static +int +fts_query_add_word_for_parser( +/*==========================*/ + MYSQL_FTPARSER_PARAM* param, /*!< in: parser param */ + const char* word, /*!< in: token */ + int word_len, /*!< in: token length */ + MYSQL_FTPARSER_BOOLEAN_INFO* info) /*!< in: token info */ +{ + fts_ast_state_t* state = + static_cast(param->mysql_ftparam); + fts_ast_node_t* cur_node = state->cur_node; + fts_ast_node_t* oper_node = NULL; + fts_ast_node_t* term_node = NULL; + fts_ast_node_t* node = NULL; + + switch (info->type) { + case FT_TOKEN_STOPWORD: + /* We only handler stopword in phrase */ + if (cur_node->type != FTS_AST_PARSER_PHRASE_LIST) { + break; + } + /* fall through */ + + case FT_TOKEN_WORD: + term_node = fts_ast_create_node_term_for_parser( + state, word, ulint(word_len)); + + if (info->trunc) { + fts_ast_term_set_wildcard(term_node); + } + + if (cur_node->type == FTS_AST_PARSER_PHRASE_LIST) { + /* Ignore operator inside phrase */ + fts_ast_add_node(cur_node, term_node); + } else { + ut_ad(cur_node->type == FTS_AST_LIST + || cur_node->type == FTS_AST_SUBEXP_LIST); + oper_node = fts_query_get_oper_node(info, state); + + if (oper_node) { + node = fts_ast_create_node_list(state, oper_node); + fts_ast_add_node(node, term_node); + fts_ast_add_node(cur_node, node); + } else { + fts_ast_add_node(cur_node, term_node); + } + } + + break; + + case FT_TOKEN_LEFT_PAREN: + /* Check parse error */ + if (cur_node->type != FTS_AST_LIST + && cur_node->type != FTS_AST_SUBEXP_LIST) { + return(1); + } + + /* Set operator */ + oper_node = fts_query_get_oper_node(info, state); + if (oper_node != NULL) { + node = fts_ast_create_node_list(state, oper_node); + fts_ast_add_node(cur_node, node); + node->go_up = true; + node->up_node = cur_node; + cur_node = node; + } + + if (info->quot) { + /* Phrase node */ + node = fts_ast_create_node_phrase_list(state); + } else { + /* Subexp list node */ + node = fts_ast_create_node_subexp_list(state, NULL); + } + + fts_ast_add_node(cur_node, node); + + node->up_node = cur_node; + state->cur_node = node; + state->depth += 1; + + break; + + case FT_TOKEN_RIGHT_PAREN: + info->quot = 0; + + if (cur_node->up_node != NULL) { + cur_node = cur_node->up_node; + + if (cur_node->go_up) { + ut_a(cur_node->up_node + && !(cur_node->up_node->go_up)); + cur_node = cur_node->up_node; + } + } + + state->cur_node = cur_node; + + if (state->depth > 0) { + state->depth--; + } else { + /* Parentheses mismatch */ + return(1); + } + + break; + + case FT_TOKEN_EOF: + default: + break; + } + + return(0); +} + +/******************************************************************//** +FTS plugin parser 'myql_parser' callback function for query parse. +Refer to 'st_mysql_ftparser_param' for more detail. +@return 0 if parse successfully */ +static +int +fts_parse_query_internal( +/*=====================*/ + MYSQL_FTPARSER_PARAM* param, /*!< in: parser param */ + const char* query, /*!< in: query string */ + int len) /*!< in: query length */ +{ + MYSQL_FTPARSER_BOOLEAN_INFO info; + const CHARSET_INFO* cs = param->cs; + uchar** start = (uchar**)(&query); + uchar* end = (uchar*)(query + len); + FT_WORD w = {NULL, 0, 0}; + + info.prev = ' '; + info.quot = 0; + memset(&w, 0, sizeof(w)); + /* Note: We don't handle simple parser mode here, + but user supplied plugin parser should handler it. 
 */
+	while (fts_get_word(cs, start, end, &w, &info)) {
+		int	ret = param->mysql_add_word(
+				param,
+				reinterpret_cast<char*>(w.pos),
+				int(w.len), &info);
+		if (ret) {
+			return(ret);
+		}
+	}
+
+	return(0);
+}
+
+/******************************************************************//**
+FTS parse query by plugin parser.
+@return 0 if parse successfully, or return non-zero. */
+int
+fts_parse_by_parser(
+/*================*/
+	ibool		mode,		/*!< in: parse boolean mode */
+	uchar*		query_str,	/*!< in: query string */
+	ulint		query_len,	/*!< in: query string length */
+	st_mysql_ftparser*	parser,	/*!< in: fts plugin parser */
+	fts_ast_state_t*	state)	/*!< in/out: parser state */
+{
+	MYSQL_FTPARSER_PARAM	param;
+	int	ret;
+
+	ut_ad(parser);
+
+	/* Initialize the parser param */
+	param.mysql_parse = fts_parse_query_internal;
+	param.mysql_add_word = fts_query_add_word_for_parser;
+	param.mysql_ftparam = static_cast<void*>(state);
+	param.cs = state->charset;
+	param.doc = reinterpret_cast<char*>(query_str);
+	param.length = static_cast<int>(query_len);
+	param.flags = 0;
+	param.mode = mode ?
+		MYSQL_FTPARSER_FULL_BOOLEAN_INFO :
+		MYSQL_FTPARSER_SIMPLE_MODE;
+
+	PARSER_INIT(parser, &param);
+	ret = parser->parse(&param);
+	PARSER_DEINIT(parser, &param);
+
+	return(ret | state->depth);
+}
diff --git a/storage/innobase/fts/fts0que.cc b/storage/innobase/fts/fts0que.cc
new file mode 100644
index 00000000..9c92a117
--- /dev/null
+++ b/storage/innobase/fts/fts0que.cc
@@ -0,0 +1,4612 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2020, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file fts/fts0que.cc
+Full Text Search functionality.
+
+Created 2007/03/27 Sunny Bains
+Completed 2011/7/10 Sunny and Jimmy Yang
+*******************************************************/
+
+#include "dict0dict.h"
+#include "ut0rbt.h"
+#include "row0sel.h"
+#include "fts0fts.h"
+#include "fts0priv.h"
+#include "fts0ast.h"
+#include "fts0pars.h"
+#include "fts0types.h"
+#include "fts0plugin.h"
+#include "fts0vlc.h"
+
+#include <iomanip>
+#include <vector>
+
+#define FTS_ELEM(t, n, i, j) (t[(i) * n + (j)])
+
+#define RANK_DOWNGRADE		(-1.0F)
+#define RANK_UPGRADE		(1.0F)
+
+/* Maximum number of words supported in a phrase or proximity search. */
+#define MAX_PROXIMITY_ITEM	128
+
+/* Memory used by rbt itself for create and node add */
+#define SIZEOF_RBT_CREATE	sizeof(ib_rbt_t) + sizeof(ib_rbt_node_t) * 2
+#define SIZEOF_RBT_NODE_ADD	sizeof(ib_rbt_node_t)
+
+/*Initial byte length for 'words' in fts_ranking_t */
+#define RANKING_WORDS_INIT_LEN	4
+
+// FIXME: Need to have a generic iterator that traverses the ilist.
+
+typedef std::vector<fts_string_t, ut_allocator<fts_string_t> > word_vector_t;
+
+struct fts_word_freq_t;
+
+/** State of an FTS query.
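Editorial sketch for fts_parse_by_parser() above: it packages the query into a parser param whose callbacks point back into this file, so the plugin's parse() tokenizes the document and reports every token through mysql_add_word. A minimal model of that round-trip, with stand-in types in place of MYSQL_FTPARSER_PARAM and st_mysql_ftparser:

#include <cstdio>

/* Stand-in for MYSQL_FTPARSER_PARAM: the add-word callback plus state. */
struct param_t {
	int (*add_word)(param_t*, const char*, int);
	void*	state;
};

/* Plays the role of fts_query_add_word_for_parser(): one call per token. */
static int report_word(param_t*, const char* w, int len)
{
	std::printf("token: %.*s\n", len, w);
	return 0;
}

/* Plays the role of the plugin's parse() entry: split and report. */
static int parse(param_t* p, const char* doc)
{
	for (const char* s = doc; *s; ) {
		const char* e = s;
		while (*e && *e != ' ') ++e;
		if (e > s && p->add_word(p, s, int(e - s))) {
			return 1;	/* non-zero from the callback aborts */
		}
		s = *e ? e + 1 : e;
	}
	return 0;
}

int main()
{
	param_t	p = { report_word, nullptr };
	return parse(&p, "apple banana cherry");
}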
*/ +struct fts_query_t { + mem_heap_t* heap; /*!< Heap to use for allocations */ + + trx_t* trx; /*!< The query transaction */ + + dict_index_t* index; /*!< The FTS index to search */ + /*!< FTS auxiliary common table def */ + + fts_table_t fts_common_table; + + fts_table_t fts_index_table;/*!< FTS auxiliary index table def */ + + size_t total_size; /*!< total memory size used by query */ + + fts_doc_ids_t* deleted; /*!< Deleted doc ids that need to be + filtered from the output */ + + fts_ast_node_t* root; /*!< Abstract syntax tree */ + + fts_ast_node_t* cur_node; /*!< Current tree node */ + + ib_rbt_t* word_map; /*!< Matched word map for + searching by word*/ + + word_vector_t* word_vector; /*!< Matched word vector for + searching by index */ + + ib_rbt_t* doc_ids; /*!< The current set of matching + doc ids, elements are of + type fts_ranking_t */ + + ib_rbt_t* intersection; /*!< The doc ids that were found in + doc_ids, this tree will become + the new doc_ids, elements are of type + fts_ranking_t */ + + /*!< Prepared statement to read the + nodes from the FTS INDEX */ + que_t* read_nodes_graph; + + fts_ast_oper_t oper; /*!< Current boolean mode operator */ + + /*!< TRUE if we want to collect the + word positions within the document */ + ibool collect_positions; + + ulint flags; /*!< Specify the full text search type, + such as boolean search, phrase + search, proximity search etc. */ + + ulint distance; /*!< The proximity distance of a + phrase search. */ + + /*!< These doc ids are used as a + boundary condition when searching the + FTS index rows */ + + doc_id_t lower_doc_id; /*!< Lowest doc id in doc_ids */ + + doc_id_t upper_doc_id; /*!< Highest doc id in doc_ids */ + + bool boolean_mode; /*!< TRUE if boolean mode query */ + + ib_vector_t* matched; /*!< Array of matching documents + (fts_match_t) to search for a phrase */ + + ib_vector_t** match_array; /*!< Used for proximity search, contains + position info for each matched word + in the word list */ + + ib_uint64_t total_docs; /*!< The total number of documents */ + + ulint total_words; /*!< The total number of words */ + + dberr_t error; /*!< Error code if any, that is + encountered during query processing */ + + ib_rbt_t* word_freqs; /*!< RB tree of word frequencies per + document, its elements are of type + fts_word_freq_t */ + + ib_rbt_t* wildcard_words; /*!< words with wildcard */ + + bool multi_exist; /*!< multiple FTS_EXIST oper */ + byte visiting_sub_exp; /*!< count of nested + fts_ast_visit_sub_exp() */ + + st_mysql_ftparser* parser; /*!< fts plugin parser */ +}; + +/** For phrase matching, first we collect the documents and the positions +then we match. */ +struct fts_match_t { + doc_id_t doc_id; /*!< Document id */ + + ulint start; /*!< Start the phrase match from + this offset within the positions + vector. */ + + ib_vector_t* positions; /*!< Offsets of a word in a + document */ +}; + +/** For matching tokens in a phrase search. We use this data structure in +the callback that determines whether a document should be accepted or +rejected for a phrase search. */ +struct fts_select_t { + doc_id_t doc_id; /*!< The document id to match */ + + ulint min_pos; /*!< For found to be TRUE at least + one position must be greater than + min_pos. 
 */
+
+	ibool		found;		/*!< TRUE if found */
+
+	fts_word_freq_t*
+			word_freq;	/*!< Word frequency instance of the
+					current word being looked up in
+					the FTS index */
+};
+
+typedef std::vector<ulint, ut_allocator<ulint> > pos_vector_t;
+
+/** This structure defines a set of ranges for original documents, each of
+which has a minimum position and maximum position. Text in such range should
+contain all words in the proximity search. We will need to count the
+words in such range to make sure it is less than the specified distance
+of the proximity search */
+struct fts_proximity_t {
+	ulint		n_pos;		/*!< number of position set, defines
+					a range (min to max) containing all
+					matching words */
+	pos_vector_t	min_pos;	/*!< the minimum position (in bytes)
+					of the range */
+	pos_vector_t	max_pos;	/*!< the maximum position (in bytes)
+					of the range */
+};
+
+/** The match positions and tokens to match */
+struct fts_phrase_t {
+	fts_phrase_t(const dict_table_t* table)
+		:
+		found(false),
+		match(NULL),
+		tokens(NULL),
+		distance(0),
+		charset(NULL),
+		heap(NULL),
+		zip_size(table->space->zip_size()),
+		proximity_pos(NULL),
+		parser(NULL)
+	{
+	}
+
+	/** Match result */
+	ibool			found;
+
+	/** Positions within text */
+	const fts_match_t*	match;
+
+	/** Tokens to match */
+	const ib_vector_t*	tokens;
+
+	/** For matching on proximity distance. Can be 0 for exact match */
+	ulint			distance;
+
+	/** Phrase match charset */
+	CHARSET_INFO*		charset;
+
+	/** Heap for word processing */
+	mem_heap_t*		heap;
+
+	/** ROW_FORMAT=COMPRESSED page size, or 0 */
+	const ulint		zip_size;
+
+	/** Position info for proximity search verification. Records the
+	min and max position of words matched */
+	fts_proximity_t*	proximity_pos;
+
+	/** FTS plugin parser */
+	st_mysql_ftparser*	parser;
+};
+
+/** Parameter passed to fts phrase match by parser */
+struct fts_phrase_param_t {
+	fts_phrase_t*	phrase;		/*!< Match phrase instance */
+	ulint		token_index;	/*!< Index of token to match next */
+	mem_heap_t*	heap;		/*!< Heap for word processing */
+};
+
+/** For storing the frequency of a word/term in a document */
+struct fts_doc_freq_t {
+	doc_id_t	doc_id;		/*!< Document id */
+	ulint		freq;		/*!< Frequency of a word in a document */
+};
+
+/** To determine the word frequency per document. */
+struct fts_word_freq_t {
+	fts_string_t	word;		/*!< Word for which we need the freq,
+					it's allocated on the query heap */
+
+	ib_rbt_t*	doc_freqs;	/*!< RB Tree for storing per document
+					word frequencies. The elements are
+					of type fts_doc_freq_t */
+	ib_uint64_t	doc_count;	/*!< Total number of documents that
+					contain this word */
+	double		idf;		/*!< Inverse document frequency */
+};
+
+/********************************************************************
+Callback function to fetch the rows in an FTS INDEX record.
+@return always TRUE */
+static
+ibool
+fts_query_index_fetch_nodes(
+/*========================*/
+	void*		row,		/*!< in: sel_node_t* */
+	void*		user_arg);	/*!< in: pointer to ib_vector_t */
+
+/********************************************************************
+Read and filter nodes.
+@return fts_node_t instance */ +static +dberr_t +fts_query_filter_doc_ids( +/*=====================*/ + fts_query_t* query, /*!< in: query instance */ + const fts_string_t* word, /*!< in: the current word */ + fts_word_freq_t* word_freq, /*!< in/out: word frequency */ + const fts_node_t* node, /*!< in: current FTS node */ + void* data, /*!< in: doc id ilist */ + ulint len, /*!< in: doc id ilist size */ + ibool calc_doc_count);/*!< in: whether to remember doc + count */ + +/** Process (nested) sub-expression, create a new result set to store the +sub-expression result by processing nodes under current sub-expression +list. Merge the sub-expression result with that of parent expression list. +@param[in,out] node current root node +@param[in,out] visitor callback function +@param[in,out] arg argument for callback +@return DB_SUCCESS if all go well */ +static +dberr_t +fts_ast_visit_sub_exp( + fts_ast_node_t* node, + fts_ast_callback visitor, + void* arg); + +#if 0 +/*****************************************************************//*** +Find a doc_id in a word's ilist. +@return TRUE if found. */ +static +ibool +fts_query_find_doc_id( +/*==================*/ + fts_select_t* select, /*!< in/out: search the doc id selected, + update the frequency if found. */ + void* data, /*!< in: doc id ilist */ + ulint len); /*!< in: doc id ilist size */ +#endif + +/*************************************************************//** +This function implements a simple "blind" query expansion search: +words in documents found in the first search pass will be used as +search arguments to search the document again, thus "expand" +the search result set. +@return DB_SUCCESS if success, otherwise the error code */ +static +dberr_t +fts_expand_query( +/*=============*/ + dict_index_t* index, /*!< in: FTS index to search */ + fts_query_t* query) /*!< in: query result, to be freed + by the client */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*************************************************************//** +This function finds documents that contain all words in a +phrase or proximity search. And if proximity search, verify +the words are close enough to each other, as in specified distance. +This function is called for phrase and proximity search. +@return TRUE if documents are found, FALSE if otherwise */ +static +ibool +fts_phrase_or_proximity_search( +/*===========================*/ + fts_query_t* query, /*!< in/out: query instance + query->doc_ids might be instantiated + with qualified doc IDs */ + ib_vector_t* tokens); /*!< in: Tokens contain words */ +/*************************************************************//** +This function checks whether words in result documents are close to +each other (within proximity range as specified by "distance"). +If "distance" is MAX_ULINT, then it will find all combinations of +positions of matching words and store min and max positions +in the "qualified_pos" for later verification. +@return true if words are close to each other, false if otherwise */ +static +bool +fts_proximity_get_positions( +/*========================*/ + fts_match_t** match, /*!< in: query instance */ + ulint num_match, /*!< in: number of matching + items */ + ulint distance, /*!< in: distance value + for proximity search */ + fts_proximity_t* qualified_pos); /*!< out: the position info + records ranges containing + all matching words. */ +#if 0 +/******************************************************************** +Get the total number of words in a documents. 
*/ +static +ulint +fts_query_terms_in_document( +/*========================*/ + /*!< out: DB_SUCCESS if all go well + else error code */ + fts_query_t* query, /*!< in: FTS query state */ + doc_id_t doc_id, /*!< in: the word to check */ + ulint* total); /*!< out: total words in document */ +#endif + +/******************************************************************** +Compare two fts_doc_freq_t doc_ids. +@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */ +UNIV_INLINE +int +fts_freq_doc_id_cmp( +/*================*/ + const void* p1, /*!< in: id1 */ + const void* p2) /*!< in: id2 */ +{ + const fts_doc_freq_t* fq1 = (const fts_doc_freq_t*) p1; + const fts_doc_freq_t* fq2 = (const fts_doc_freq_t*) p2; + + return((int) (fq1->doc_id - fq2->doc_id)); +} + +#if 0 +/*******************************************************************//** +Print the table used for calculating LCS. */ +static +void +fts_print_lcs_table( +/*================*/ + const ulint* table, /*!< in: array to print */ + ulint n_rows, /*!< in: total no. of rows */ + ulint n_cols) /*!< in: total no. of cols */ +{ + ulint i; + + for (i = 0; i < n_rows; ++i) { + ulint j; + + printf("\n"); + + for (j = 0; j < n_cols; ++j) { + + printf("%2lu ", FTS_ELEM(table, n_cols, i, j)); + } + } +} + +/******************************************************************** +Find the longest common subsequence between the query string and +the document. */ +static +ulint +fts_query_lcs( +/*==========*/ + /*!< out: LCS (length) between + two ilists */ + const ulint* p1, /*!< in: word positions of query */ + ulint len_p1, /*!< in: no. of elements in p1 */ + const ulint* p2, /*!< in: word positions within document */ + ulint len_p2) /*!< in: no. of elements in p2 */ +{ + int i; + ulint len = 0; + ulint r = len_p1; + ulint c = len_p2; + ulint size = (r + 1) * (c + 1) * sizeof(ulint); + ulint* table = (ulint*) ut_malloc_nokey(size); + + /* Traverse the table backwards, from the last row to the first and + also from the last column to the first. We compute the smaller + common subsequeces first, then use the caluclated values to determine + the longest common subsequence. The result will be in TABLE[0][0]. */ + for (i = r; i >= 0; --i) { + int j; + + for (j = c; j >= 0; --j) { + + if (p1[i] == (ulint) -1 || p2[j] == (ulint) -1) { + + FTS_ELEM(table, c, i, j) = 0; + + } else if (p1[i] == p2[j]) { + + FTS_ELEM(table, c, i, j) = FTS_ELEM( + table, c, i + 1, j + 1) + 1; + + } else { + + ulint value; + + value = ut_max( + FTS_ELEM(table, c, i + 1, j), + FTS_ELEM(table, c, i, j + 1)); + + FTS_ELEM(table, c, i, j) = value; + } + } + } + + len = FTS_ELEM(table, c, 0, 0); + + fts_print_lcs_table(table, r, c); + printf("\nLen=" ULINTPF "\n", len); + + ut_free(table); + + return(len); +} +#endif + +/*******************************************************************//** +Compare two fts_ranking_t instance on their rank value and doc ids in +descending order on the rank and ascending order on doc id. 
+@return 0 if p1 == p2, < 0 if p1 < p2, > 0 if p1 > p2 */ +static +int +fts_query_compare_rank( +/*===================*/ + const void* p1, /*!< in: pointer to elem */ + const void* p2) /*!< in: pointer to elem */ +{ + const fts_ranking_t* r1 = (const fts_ranking_t*) p1; + const fts_ranking_t* r2 = (const fts_ranking_t*) p2; + + if (r2->rank < r1->rank) { + return(-1); + } else if (r2->rank == r1->rank) { + + if (r1->doc_id < r2->doc_id) { + return(1); + } else if (r1->doc_id > r2->doc_id) { + return(1); + } + + return(0); + } + + return(1); +} + +/*******************************************************************//** +Create words in ranking */ +static +void +fts_ranking_words_create( +/*=====================*/ + fts_query_t* query, /*!< in: query instance */ + fts_ranking_t* ranking) /*!< in: ranking instance */ +{ + ranking->words = static_cast( + mem_heap_zalloc(query->heap, RANKING_WORDS_INIT_LEN)); + ranking->words_len = RANKING_WORDS_INIT_LEN; +} + +/* +The optimization here is using a char array(bitmap) to replace words rb tree +in fts_ranking_t. + +It can save lots of memory except in some cases of QUERY EXPANSION. + +'word_map' is used as a word dictionary, in which the key is a word, the value +is a number. In 'fts_ranking_words_add', we first check if the word is in 'word_map'. +if not, we add it into 'word_map', and give it a position(actually a number). +then we set the corresponding bit to '1' at the position in the char array 'words'. + +'word_vector' is a useful backup of 'word_map', and we can get a word by its position, +more quickly than searching by value in 'word_map'. we use 'word_vector' +in 'fts_query_calculate_ranking' and 'fts_expand_query'. In the two functions, we need +to scan the bitmap 'words', and get a word when a bit is '1', then we get word_freq +by the word. +*/ + +/*******************************************************************//** +Add a word into ranking */ +static +void +fts_ranking_words_add( +/*==================*/ + fts_query_t* query, /*!< in: query instance */ + fts_ranking_t* ranking, /*!< in: ranking instance */ + const fts_string_t* word) /*!< in: term/word to add */ +{ + ulint pos; + ulint byte_offset; + ulint bit_offset; + ib_rbt_bound_t parent; + + /* Note: we suppose the word map and vector are append-only. */ + ut_ad(query->word_vector->size() == rbt_size(query->word_map)); + + /* We use ib_rbt to simulate a map, f_n_char means position. */ + if (rbt_search(query->word_map, &parent, word) == 0) { + fts_string_t* result_word; + + result_word = rbt_value(fts_string_t, parent.last); + pos = result_word->f_n_char; + ut_ad(pos < rbt_size(query->word_map)); + } else { + /* Add the word to map. 
 */
+		fts_string_t	new_word;
+
+		pos = rbt_size(query->word_map);
+
+		fts_string_dup(&new_word, word, query->heap);
+		new_word.f_n_char = pos;
+
+		rbt_add_node(query->word_map, &parent, &new_word);
+		ut_ad(rbt_validate(query->word_map));
+		query->word_vector->push_back(new_word);
+	}
+
+	/* Check words len */
+	byte_offset = pos / CHAR_BIT;
+	if (byte_offset >= ranking->words_len) {
+		byte*	words = ranking->words;
+		ulint	words_len = ranking->words_len;
+
+		while (byte_offset >= words_len) {
+			words_len *= 2;
+		}
+
+		ranking->words = static_cast<byte*>(
+			mem_heap_zalloc(query->heap, words_len));
+		memcpy(ranking->words, words, ranking->words_len);
+		ranking->words_len = words_len;
+	}
+
+	/* Set ranking words */
+	ut_ad(byte_offset < ranking->words_len);
+	bit_offset = pos % CHAR_BIT;
+	ranking->words[byte_offset] = static_cast<byte>(
+		ranking->words[byte_offset] | 1 << bit_offset);
+}
+
+/*******************************************************************//**
+Get a word from a ranking
+@return true if it's successful */
+static
+bool
+fts_ranking_words_get_next(
+/*=======================*/
+	const fts_query_t*	query,	/*!< in: query instance */
+	fts_ranking_t*	ranking,/*!< in: ranking instance */
+	ulint*		pos,	/*!< in/out: word start pos */
+	fts_string_t*	word)	/*!< in/out: term/word to add */
+{
+	bool	ret = false;
+	ulint	max_pos = ranking->words_len * CHAR_BIT;
+
+	/* Search for next word */
+	while (*pos < max_pos) {
+		ulint byte_offset = *pos / CHAR_BIT;
+		ulint bit_offset = *pos % CHAR_BIT;
+
+		if (ranking->words[byte_offset] & (1 << bit_offset)) {
+			ret = true;
+			break;
+		}
+
+		*pos += 1;
+	};
+
+	/* Get next word from word vector */
+	if (ret) {
+		ut_ad(*pos < query->word_vector->size());
+		*word = query->word_vector->at((size_t)*pos);
+		*pos += 1;
+	}
+
+	return ret;
+}
+
+/*******************************************************************//**
+Add a word if it doesn't exist, to the term freq RB tree. We store
+a pointer to the word that is passed in as the argument.
+@return pointer to word */
+static
+fts_word_freq_t*
+fts_query_add_word_freq(
+/*====================*/
+	fts_query_t*		query,	/*!< in: query instance */
+	const fts_string_t*	word)	/*!< in: term/word to add */
+{
+	ib_rbt_bound_t		parent;
+
+	/* Lookup the word in our rb tree and add if it doesn't exist. */
+	if (rbt_search(query->word_freqs, &parent, word) != 0) {
+		fts_word_freq_t	word_freq;
+
+		memset(&word_freq, 0, sizeof(word_freq));
+
+		fts_string_dup(&word_freq.word, word, query->heap);
+
+		word_freq.doc_count = 0;
+
+		word_freq.doc_freqs = rbt_create(
+			sizeof(fts_doc_freq_t), fts_freq_doc_id_cmp);
+
+		parent.last = rbt_add_node(
+			query->word_freqs, &parent, &word_freq);
+
+		query->total_size += word->f_len
+			+ SIZEOF_RBT_CREATE
+			+ SIZEOF_RBT_NODE_ADD
+			+ sizeof(fts_word_freq_t);
+	}
+
+	return(rbt_value(fts_word_freq_t, parent.last));
+}
+
+/*******************************************************************//**
+Add a doc id if it doesn't exist, to the doc freq RB tree.
+@return pointer to word */
+static
+fts_doc_freq_t*
+fts_query_add_doc_freq(
+/*===================*/
+	fts_query_t*	query,		/*!< in: query instance */
+	ib_rbt_t*	doc_freqs,	/*!< in: rb tree of fts_doc_freq_t */
+	doc_id_t	doc_id)		/*!< in: doc id to add */
+{
+	ib_rbt_bound_t	parent;
+
+	/* Lookup the doc id in our rb tree and add if it doesn't exist.
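The bitmap scheme described in the long comment before fts_ranking_words_add() can be summarized in a few lines. The sketch below mirrors the same three pieces (word_map for word-to-position, word_vector for position-to-word, and a per-ranking byte array grown by doubling), using standard containers in place of the rb tree and heap allocator:

#include <climits>
#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main()
{
	std::map<std::string, size_t>	word_map;	/* word -> position */
	std::vector<std::string>	word_vector;	/* position -> word */
	std::vector<unsigned char>	words(4, 0);	/* one ranking's bitmap */

	const char*	doc_words[] = { "apple", "pie", "apple" };

	for (const char* w : doc_words) {
		size_t	pos;
		std::map<std::string, size_t>::iterator it = word_map.find(w);
		if (it == word_map.end()) {
			/* First sighting: assign the next position. */
			pos = word_map.size();
			word_map[w] = pos;
			word_vector.push_back(w);
		} else {
			pos = it->second;
		}
		size_t	byte_offset = pos / CHAR_BIT;
		while (byte_offset >= words.size()) {
			words.resize(words.size() * 2, 0);	/* grow by doubling */
		}
		words[byte_offset] |= (unsigned char) (1 << (pos % CHAR_BIT));
	}

	/* Walk the set bits back to words, like fts_ranking_words_get_next(). */
	for (size_t pos = 0; pos < word_vector.size(); ++pos) {
		if (words[pos / CHAR_BIT] & (1 << (pos % CHAR_BIT))) {
			std::printf("%s\n", word_vector[pos].c_str());
		}
	}
	return 0;
}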
*/ + if (rbt_search(doc_freqs, &parent, &doc_id) != 0) { + fts_doc_freq_t doc_freq; + + memset(&doc_freq, 0, sizeof(doc_freq)); + + doc_freq.freq = 0; + doc_freq.doc_id = doc_id; + + parent.last = rbt_add_node(doc_freqs, &parent, &doc_freq); + + query->total_size += SIZEOF_RBT_NODE_ADD + + sizeof(fts_doc_freq_t); + } + + return(rbt_value(fts_doc_freq_t, parent.last)); +} + +/*******************************************************************//** +Add the doc id to the query set only if it's not in the +deleted array. */ +static +void +fts_query_union_doc_id( +/*===================*/ + fts_query_t* query, /*!< in: query instance */ + doc_id_t doc_id, /*!< in: the doc id to add */ + fts_rank_t rank) /*!< in: if non-zero, it is the + rank associated with the doc_id */ +{ + ib_rbt_bound_t parent; + ulint size = ib_vector_size(query->deleted->doc_ids); + doc_id_t* updates = (doc_id_t*) query->deleted->doc_ids->data; + + /* Check if the doc id is deleted and it's not already in our set. */ + if (fts_bsearch(updates, 0, static_cast(size), doc_id) < 0 + && rbt_search(query->doc_ids, &parent, &doc_id) != 0) { + + fts_ranking_t ranking; + + ranking.rank = rank; + ranking.doc_id = doc_id; + fts_ranking_words_create(query, &ranking); + + rbt_add_node(query->doc_ids, &parent, &ranking); + + query->total_size += SIZEOF_RBT_NODE_ADD + + sizeof(fts_ranking_t) + RANKING_WORDS_INIT_LEN; + } +} + +/*******************************************************************//** +Remove the doc id from the query set only if it's not in the +deleted set. */ +static +void +fts_query_remove_doc_id( +/*====================*/ + fts_query_t* query, /*!< in: query instance */ + doc_id_t doc_id) /*!< in: the doc id to add */ +{ + ib_rbt_bound_t parent; + ulint size = ib_vector_size(query->deleted->doc_ids); + doc_id_t* updates = (doc_id_t*) query->deleted->doc_ids->data; + + /* Check if the doc id is deleted and it's in our set. */ + if (fts_bsearch(updates, 0, static_cast(size), doc_id) < 0 + && rbt_search(query->doc_ids, &parent, &doc_id) == 0) { + ut_free(rbt_remove_node(query->doc_ids, parent.last)); + + ut_ad(query->total_size >= + SIZEOF_RBT_NODE_ADD + sizeof(fts_ranking_t)); + query->total_size -= SIZEOF_RBT_NODE_ADD + + sizeof(fts_ranking_t); + } +} + +/*******************************************************************//** +Find the doc id in the query set but not in the deleted set, artificialy +downgrade or upgrade its ranking by a value and make/initialize its ranking +under or above its normal range 0 to 1. This is used for Boolean Search +operator such as Negation operator, which makes word's contribution to the +row's relevance to be negative */ +static +void +fts_query_change_ranking( +/*====================*/ + fts_query_t* query, /*!< in: query instance */ + doc_id_t doc_id, /*!< in: the doc id to add */ + ibool downgrade) /*!< in: Whether to downgrade ranking */ +{ + ib_rbt_bound_t parent; + ulint size = ib_vector_size(query->deleted->doc_ids); + doc_id_t* updates = (doc_id_t*) query->deleted->doc_ids->data; + + /* Check if the doc id is deleted and it's in our set. */ + if (fts_bsearch(updates, 0, static_cast(size), doc_id) < 0 + && rbt_search(query->doc_ids, &parent, &doc_id) == 0) { + + fts_ranking_t* ranking; + + ranking = rbt_value(fts_ranking_t, parent.last); + + ranking->rank += downgrade ? 
RANK_DOWNGRADE : RANK_UPGRADE; + + /* Allow at most 2 adjustment by RANK_DOWNGRADE (-0.5) + and RANK_UPGRADE (0.5) */ + if (ranking->rank >= 1.0F) { + ranking->rank = 1.0F; + } else if (ranking->rank <= -1.0F) { + ranking->rank = -1.0F; + } + } +} + +/*******************************************************************//** +Check the doc id in the query set only if it's not in the +deleted array. The doc ids that were found are stored in +another rb tree (fts_query_t::intersect). */ +static +void +fts_query_intersect_doc_id( +/*=======================*/ + fts_query_t* query, /*!< in: query instance */ + doc_id_t doc_id, /*!< in: the doc id to add */ + fts_rank_t rank) /*!< in: if non-zero, it is the + rank associated with the doc_id */ +{ + ib_rbt_bound_t parent; + ulint size = ib_vector_size(query->deleted->doc_ids); + doc_id_t* updates = (doc_id_t*) query->deleted->doc_ids->data; + fts_ranking_t* ranking= NULL; + + /* There are three types of intersect: + 1. '+a': doc_ids is empty, add doc into intersect if it matches 'a'. + 2. 'a +b': docs match 'a' is in doc_ids, add doc into intersect + if it matches 'b'. if the doc is also in doc_ids, then change the + doc's rank, and add 'a' in doc's words. + 3. '+a +b': docs matching '+a' is in doc_ids, add doc into intsersect + if it matches 'b' and it's in doc_ids.(multi_exist = true). */ + + /* Check if the doc id is deleted and it's in our set */ + if (fts_bsearch(updates, 0, static_cast(size), doc_id) < 0) { + fts_ranking_t new_ranking; + + if (rbt_search(query->doc_ids, &parent, &doc_id) != 0) { + if (query->multi_exist) { + return; + } else { + new_ranking.words = NULL; + } + } else { + ranking = rbt_value(fts_ranking_t, parent.last); + + /* We've just checked the doc id before */ + if (ranking->words == NULL) { + ut_ad(rbt_search(query->intersection, &parent, + ranking) == 0); + return; + } + + /* Merge rank */ + rank += ranking->rank; + if (rank >= 1.0F) { + rank = 1.0F; + } else if (rank <= -1.0F) { + rank = -1.0F; + } + + /* Take words */ + new_ranking.words = ranking->words; + new_ranking.words_len = ranking->words_len; + } + + new_ranking.rank = rank; + new_ranking.doc_id = doc_id; + + if (rbt_search(query->intersection, &parent, + &new_ranking) != 0) { + if (new_ranking.words == NULL) { + fts_ranking_words_create(query, &new_ranking); + + query->total_size += RANKING_WORDS_INIT_LEN; + } else { + /* Note that the intersection has taken + ownership of the ranking data. */ + ranking->words = NULL; + } + + rbt_add_node(query->intersection, + &parent, &new_ranking); + + query->total_size += SIZEOF_RBT_NODE_ADD + + sizeof(fts_ranking_t); + } + } +} + +/*******************************************************************//** +Free the document ranking rb tree. 
*/ +static +void +fts_query_free_doc_ids( +/*===================*/ + fts_query_t* query, /*!< in: query instance */ + ib_rbt_t* doc_ids) /*!< in: rb tree to free */ +{ + const ib_rbt_node_t* node; + + for (node = rbt_first(doc_ids); node; node = rbt_first(doc_ids)) { + + fts_ranking_t* ranking; + + ranking = rbt_value(fts_ranking_t, node); + + if (ranking->words) { + ranking->words = NULL; + } + + ut_free(rbt_remove_node(doc_ids, node)); + + ut_ad(query->total_size >= + SIZEOF_RBT_NODE_ADD + sizeof(fts_ranking_t)); + query->total_size -= SIZEOF_RBT_NODE_ADD + + sizeof(fts_ranking_t); + } + + rbt_free(doc_ids); + + ut_ad(query->total_size >= SIZEOF_RBT_CREATE); + query->total_size -= SIZEOF_RBT_CREATE; +} + +/*******************************************************************//** +Add the word to the documents "list" of matching words from +the query. We make a copy of the word from the query heap. */ +static +void +fts_query_add_word_to_document( +/*===========================*/ + fts_query_t* query, /*!< in: query to update */ + doc_id_t doc_id, /*!< in: the document to update */ + const fts_string_t* word) /*!< in: the token to add */ +{ + ib_rbt_bound_t parent; + fts_ranking_t* ranking = NULL; + + if (query->flags == FTS_OPT_RANKING) { + return; + } + + /* First we search the intersection RB tree as it could have + taken ownership of the words rb tree instance. */ + if (query->intersection + && rbt_search(query->intersection, &parent, &doc_id) == 0) { + + ranking = rbt_value(fts_ranking_t, parent.last); + } + + if (ranking == NULL + && rbt_search(query->doc_ids, &parent, &doc_id) == 0) { + + ranking = rbt_value(fts_ranking_t, parent.last); + } + + if (ranking != NULL) { + fts_ranking_words_add(query, ranking, word); + } +} + +/*******************************************************************//** +Check the node ilist. */ +static +void +fts_query_check_node( +/*=================*/ + fts_query_t* query, /*!< in: query to update */ + const fts_string_t* token, /*!< in: the token to search */ + const fts_node_t* node) /*!< in: node to check */ +{ + /* Skip nodes whose doc ids are out range. */ + if (query->oper == FTS_EXIST + && ((query->upper_doc_id > 0 + && node->first_doc_id > query->upper_doc_id) + || (query->lower_doc_id > 0 + && node->last_doc_id < query->lower_doc_id))) { + + /* Ignore */ + + } else { + int ret; + ib_rbt_bound_t parent; + ulint ilist_size = node->ilist_size; + fts_word_freq_t*word_freqs; + + /* The word must exist. */ + ret = rbt_search(query->word_freqs, &parent, token); + ut_a(ret == 0); + + word_freqs = rbt_value(fts_word_freq_t, parent.last); + + query->error = fts_query_filter_doc_ids( + query, token, word_freqs, node, + node->ilist, ilist_size, TRUE); + } +} + +/*****************************************************************//** +Search index cache for word with wildcard match. +@return number of words matched */ +static +ulint +fts_cache_find_wildcard( +/*====================*/ + fts_query_t* query, /*!< in: query instance */ + const fts_index_cache_t*index_cache, /*!< in: cache to search */ + const fts_string_t* token) /*!< in: token to search */ +{ + ib_rbt_bound_t parent; + const ib_vector_t* nodes = NULL; + fts_string_t srch_text; + byte term[FTS_MAX_WORD_LEN + 1]; + ulint num_word = 0; + + srch_text.f_len = (token->f_str[token->f_len - 1] == '%') + ? 
token->f_len - 1 + : token->f_len; + + strncpy((char*) term, (char*) token->f_str, srch_text.f_len); + term[srch_text.f_len] = '\0'; + srch_text.f_str = term; + + /* Lookup the word in the rb tree */ + if (rbt_search_cmp(index_cache->words, &parent, &srch_text, NULL, + innobase_fts_text_cmp_prefix) == 0) { + const fts_tokenizer_word_t* word; + ulint i; + const ib_rbt_node_t* cur_node; + ibool forward = FALSE; + + word = rbt_value(fts_tokenizer_word_t, parent.last); + cur_node = parent.last; + + while (innobase_fts_text_cmp_prefix( + index_cache->charset, &srch_text, &word->text) == 0) { + + nodes = word->nodes; + + for (i = 0; nodes && i < ib_vector_size(nodes); ++i) { + int ret; + const fts_node_t* node; + ib_rbt_bound_t freq_parent; + fts_word_freq_t* word_freqs; + + node = static_cast( + ib_vector_get_const(nodes, i)); + + ret = rbt_search(query->word_freqs, + &freq_parent, + &srch_text); + + ut_a(ret == 0); + + word_freqs = rbt_value( + fts_word_freq_t, + freq_parent.last); + + query->error = fts_query_filter_doc_ids( + query, &srch_text, + word_freqs, node, + node->ilist, node->ilist_size, TRUE); + + if (query->error != DB_SUCCESS) { + return(0); + } + } + + num_word++; + + if (!forward) { + cur_node = rbt_prev( + index_cache->words, cur_node); + } else { +cont_search: + cur_node = rbt_next( + index_cache->words, cur_node); + } + + if (!cur_node) { + break; + } + + word = rbt_value(fts_tokenizer_word_t, cur_node); + } + + if (!forward) { + forward = TRUE; + cur_node = parent.last; + goto cont_search; + } + } + + return(num_word); +} + +/*****************************************************************//** +Set difference. +@return DB_SUCCESS if all go well */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_query_difference( +/*=================*/ + fts_query_t* query, /*!< in: query instance */ + const fts_string_t* token) /*!< in: token to search */ +{ + ulint n_doc_ids= 0; + trx_t* trx = query->trx; + dict_table_t* table = query->index->table; + + ut_a(query->oper == FTS_IGNORE); + +#ifdef FTS_INTERNAL_DIAG_PRINT + { + ib::info out; + out << "DIFFERENCE: Searching: '"; + out.write(token->f_str, token->f_len); + out << "'"; + } +#endif + + if (query->doc_ids) { + n_doc_ids = rbt_size(query->doc_ids); + } + + /* There is nothing we can substract from an empty set. */ + if (query->doc_ids && !rbt_empty(query->doc_ids)) { + ulint i; + fts_fetch_t fetch; + const ib_vector_t* nodes; + const fts_index_cache_t*index_cache; + que_t* graph = NULL; + fts_cache_t* cache = table->fts->cache; + dberr_t error; + + mysql_mutex_lock(&cache->lock); + + index_cache = fts_find_index_cache(cache, query->index); + + /* Must find the index cache */ + ut_a(index_cache != NULL); + + /* Search the cache for a matching word first. */ + if (query->cur_node->term.wildcard + && query->flags != FTS_PROXIMITY + && query->flags != FTS_PHRASE) { + fts_cache_find_wildcard(query, index_cache, token); + } else { + nodes = fts_cache_find_word(index_cache, token); + + for (i = 0; nodes && i < ib_vector_size(nodes) + && query->error == DB_SUCCESS; ++i) { + const fts_node_t* node; + + node = static_cast( + ib_vector_get_const(nodes, i)); + + fts_query_check_node(query, token, node); + } + } + + mysql_mutex_unlock(&cache->lock); + + /* error is passed by 'query->error' */ + if (query->error != DB_SUCCESS) { + ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT); + return(query->error); + } + + /* Setup the callback args for filtering and + consolidating the ilist. 
*/ + fetch.read_arg = query; + fetch.read_record = fts_query_index_fetch_nodes; + + error = fts_index_fetch_nodes( + trx, &graph, &query->fts_index_table, token, &fetch); + + /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */ + ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS)); + if (error != DB_SUCCESS) { + query->error = error; + } + + que_graph_free(graph); + } + + /* The size can't increase. */ + ut_a(rbt_size(query->doc_ids) <= n_doc_ids); + + return(query->error); +} + +/* Free the query intersection +@param query query instance */ +static void fts_query_free_intersection(fts_query_t* query) +{ + fts_query_free_doc_ids(query, query->intersection); + query->intersection = NULL; +} + +/*****************************************************************//** +Intersect the token doc ids with the current set. +@return DB_SUCCESS if all go well */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_query_intersect( +/*================*/ + fts_query_t* query, /*!< in: query instance */ + const fts_string_t* token) /*!< in: the token to search */ +{ + trx_t* trx = query->trx; + dict_table_t* table = query->index->table; + + ut_a(query->oper == FTS_EXIST); + +#ifdef FTS_INTERNAL_DIAG_PRINT + { + ib::info out; + out << "INTERSECT: Searching: '"; + out.write(token->f_str, token->f_len); + out << "'"; + } +#endif + + /* If the words set is not empty and multi exist is true, + we know the intersection set is empty in advance. */ + if (!(rbt_empty(query->doc_ids) && query->multi_exist)) { + ulint n_doc_ids = 0; + ulint i; + fts_fetch_t fetch; + const ib_vector_t* nodes; + const fts_index_cache_t*index_cache; + que_t* graph = NULL; + fts_cache_t* cache = table->fts->cache; + dberr_t error; + + ut_a(!query->intersection); + + n_doc_ids = rbt_size(query->doc_ids); + + /* Create the rb tree that will hold the doc ids of + the intersection. */ + query->intersection = rbt_create( + sizeof(fts_ranking_t), fts_ranking_doc_id_cmp); + + query->total_size += SIZEOF_RBT_CREATE; + + /* This is to avoid decompressing the ilist if the + node's ilist doc ids are out of range. */ + if (!rbt_empty(query->doc_ids) && query->multi_exist) { + const ib_rbt_node_t* node; + doc_id_t* doc_id; + + node = rbt_first(query->doc_ids); + doc_id = rbt_value(doc_id_t, node); + query->lower_doc_id = *doc_id; + + node = rbt_last(query->doc_ids); + doc_id = rbt_value(doc_id_t, node); + query->upper_doc_id = *doc_id; + + } else { + query->lower_doc_id = 0; + query->upper_doc_id = 0; + } + + /* Search the cache for a matching word first. */ + + mysql_mutex_lock(&cache->lock); + + /* Search for the index specific cache. */ + index_cache = fts_find_index_cache(cache, query->index); + + /* Must find the index cache. */ + ut_a(index_cache != NULL); + + if (query->cur_node->term.wildcard) { + /* Wildcard search the index cache */ + fts_cache_find_wildcard(query, index_cache, token); + } else { + nodes = fts_cache_find_word(index_cache, token); + + for (i = 0; nodes && i < ib_vector_size(nodes) + && query->error == DB_SUCCESS; ++i) { + const fts_node_t* node; + + node = static_cast( + ib_vector_get_const(nodes, i)); + + fts_query_check_node(query, token, node); + } + } + + mysql_mutex_unlock(&cache->lock); + + /* error is passed by 'query->error' */ + if (query->error != DB_SUCCESS) { + ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT); + fts_query_free_intersection(query); + return(query->error); + } + + /* Setup the callback args for filtering and + consolidating the ilist. 
*/ + fetch.read_arg = query; + fetch.read_record = fts_query_index_fetch_nodes; + + error = fts_index_fetch_nodes( + trx, &graph, &query->fts_index_table, token, &fetch); + + /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */ + ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS)); + if (error != DB_SUCCESS) { + query->error = error; + } + + que_graph_free(graph); + + if (query->error == DB_SUCCESS) { + /* Make the intesection (rb tree) the current doc id + set and free the old set. */ + fts_query_free_doc_ids(query, query->doc_ids); + query->doc_ids = query->intersection; + query->intersection = NULL; + + ut_a(!query->multi_exist || (query->multi_exist + && rbt_size(query->doc_ids) <= n_doc_ids)); + } else if (query->intersection) { + fts_query_free_intersection(query); + } + } + + return(query->error); +} + +/*****************************************************************//** +Query index cache. +@return DB_SUCCESS if all go well */ +static +dberr_t +fts_query_cache( +/*============*/ + fts_query_t* query, /*!< in/out: query instance */ + const fts_string_t* token) /*!< in: token to search */ +{ + const fts_index_cache_t*index_cache; + dict_table_t* table = query->index->table; + fts_cache_t* cache = table->fts->cache; + + /* Search the cache for a matching word first. */ + mysql_mutex_lock(&cache->lock); + + /* Search for the index specific cache. */ + index_cache = fts_find_index_cache(cache, query->index); + + /* Must find the index cache. */ + ut_a(index_cache != NULL); + + if (query->cur_node->term.wildcard + && query->flags != FTS_PROXIMITY + && query->flags != FTS_PHRASE) { + /* Wildcard search the index cache */ + fts_cache_find_wildcard(query, index_cache, token); + } else { + const ib_vector_t* nodes; + ulint i; + + nodes = fts_cache_find_word(index_cache, token); + + for (i = 0; nodes && i < ib_vector_size(nodes) + && query->error == DB_SUCCESS; ++i) { + const fts_node_t* node; + + node = static_cast( + ib_vector_get_const(nodes, i)); + + fts_query_check_node(query, token, node); + } + } + + mysql_mutex_unlock(&cache->lock); + + return(query->error); +} + +/*****************************************************************//** +Set union. +@return DB_SUCCESS if all go well */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_query_union( +/*============*/ + fts_query_t* query, /*!< in: query instance */ + fts_string_t* token) /*!< in: token to search */ +{ + fts_fetch_t fetch; + ulint n_doc_ids = 0; + trx_t* trx = query->trx; + que_t* graph = NULL; + dberr_t error; + + ut_a(query->oper == FTS_NONE || query->oper == FTS_DECR_RATING || + query->oper == FTS_NEGATE || query->oper == FTS_INCR_RATING); + +#ifdef FTS_INTERNAL_DIAG_PRINT + { + ib::info out; + out << "UNION: Searching: '"; + out.write(token->f_str, token->f_len); + out << "'"; + } +#endif + + if (query->doc_ids) { + n_doc_ids = rbt_size(query->doc_ids); + } + + if (token->f_len == 0) { + return(query->error); + } + + fts_query_cache(query, token); + + /* Setup the callback args for filtering and + consolidating the ilist. */ + fetch.read_arg = query; + fetch.read_record = fts_query_index_fetch_nodes; + + /* Read the nodes from disk. 
+ +/*****************************************************************//** +Set union. +@return DB_SUCCESS if all go well */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_query_union( +/*============*/ + fts_query_t* query, /*!< in: query instance */ + fts_string_t* token) /*!< in: token to search */ +{ + fts_fetch_t fetch; + ulint n_doc_ids = 0; + trx_t* trx = query->trx; + que_t* graph = NULL; + dberr_t error; + + ut_a(query->oper == FTS_NONE || query->oper == FTS_DECR_RATING || + query->oper == FTS_NEGATE || query->oper == FTS_INCR_RATING); + +#ifdef FTS_INTERNAL_DIAG_PRINT + { + ib::info out; + out << "UNION: Searching: '"; + out.write(token->f_str, token->f_len); + out << "'"; + } +#endif + + if (query->doc_ids) { + n_doc_ids = rbt_size(query->doc_ids); + } + + if (token->f_len == 0) { + return(query->error); + } + + fts_query_cache(query, token); + + /* Setup the callback args for filtering and + consolidating the ilist. */ + fetch.read_arg = query; + fetch.read_record = fts_query_index_fetch_nodes; + + /* Read the nodes from disk. */ + error = fts_index_fetch_nodes( + trx, &graph, &query->fts_index_table, token, &fetch); + + /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */ + ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS)); + if (error != DB_SUCCESS) { + query->error = error; + } + + que_graph_free(graph); + + if (query->error == DB_SUCCESS) { + + /* The size can't decrease. */ + ut_a(rbt_size(query->doc_ids) >= n_doc_ids); + + /* Calculate the number of doc ids that were added to + the current doc id set. */ + if (query->doc_ids) { + n_doc_ids = rbt_size(query->doc_ids) - n_doc_ids; + } + } + + return(query->error); +} + +/*****************************************************************//** +Depending upon the current query operator process the doc id. +@return DB_SUCCESS if all go well, +or DB_FTS_EXCEED_RESULT_CACHE_LIMIT */ +static +dberr_t +fts_query_process_doc_id( +/*=====================*/ + fts_query_t* query, /*!< in: query instance */ + doc_id_t doc_id, /*!< in: doc id to process */ + fts_rank_t rank) /*!< in: if non-zero, it is the + rank associated with the doc_id */ +{ + if (query->flags == FTS_OPT_RANKING) { + return(DB_SUCCESS); + } + + switch (query->oper) { + case FTS_NONE: + fts_query_union_doc_id(query, doc_id, rank); + break; + + case FTS_EXIST: + fts_query_intersect_doc_id(query, doc_id, rank); + break; + + case FTS_IGNORE: + fts_query_remove_doc_id(query, doc_id); + break; + + case FTS_NEGATE: + fts_query_change_ranking(query, doc_id, TRUE); + break; + + case FTS_DECR_RATING: + fts_query_union_doc_id(query, doc_id, rank); + fts_query_change_ranking(query, doc_id, TRUE); + break; + + case FTS_INCR_RATING: + fts_query_union_doc_id(query, doc_id, rank); + fts_query_change_ranking(query, doc_id, FALSE); + break; + + default: + ut_error; + } + + if (query->total_size > fts_result_cache_limit) { + return(DB_FTS_EXCEED_RESULT_CACHE_LIMIT); + } else { + return(DB_SUCCESS); + } +}
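The dispatch in fts_query_process_doc_id() reads naturally as set algebra on the running result: a plain term unions, '+term' (FTS_EXIST) collects into a separate intersection set that later replaces the result, and '-term' (FTS_IGNORE) removes. A simplified sketch under those assumptions, with std::set standing in for the rb trees:

    #include <set>

    // Illustrative only: route one doc id according to the current
    // boolean-mode operator, as fts_query_process_doc_id() does.
    struct QueryState {
        std::set<unsigned long> result;        // stands in for query->doc_ids
        std::set<unsigned long> intersection;  // stands in for query->intersection
    };

    enum class Oper { Union, Intersect, Remove };

    void process_doc_id(QueryState& q, Oper oper, unsigned long doc_id)
    {
        switch (oper) {
        case Oper::Union:                  // plain term
            q.result.insert(doc_id);
            break;
        case Oper::Intersect:              // '+term'
            if (q.result.count(doc_id)) {  // keep only ids seen before
                q.intersection.insert(doc_id);
            }
            break;
        case Oper::Remove:                 // '-term'
            q.result.erase(doc_id);
            break;
        }
    }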
+ +/*****************************************************************//** +Merge two result sets. */ +static +dberr_t +fts_merge_doc_ids( +/*==============*/ + fts_query_t* query, /*!< in,out: query instance */ + const ib_rbt_t* doc_ids) /*!< in: result set to merge */ +{ + const ib_rbt_node_t* node; + + DBUG_ENTER("fts_merge_doc_ids"); + + ut_a(!query->intersection); + + /* To process FTS_EXIST operation (intersection), we need + to create a new result set for fts_query_intersect(). */ + if (query->oper == FTS_EXIST) { + + query->intersection = rbt_create( + sizeof(fts_ranking_t), fts_ranking_doc_id_cmp); + + query->total_size += SIZEOF_RBT_CREATE; + } + + /* Merge the elements to the result set. */ + for (node = rbt_first(doc_ids); node; node = rbt_next(doc_ids, node)) { + fts_ranking_t* ranking; + ulint pos = 0; + fts_string_t word; + + ranking = rbt_value(fts_ranking_t, node); + + query->error = fts_query_process_doc_id( + query, ranking->doc_id, ranking->rank); + + if (query->error != DB_SUCCESS) { + if (query->intersection) { + ut_a(query->oper == FTS_EXIST); + fts_query_free_intersection(query); + } + DBUG_RETURN(query->error); + } + + /* Merge words. Don't need to take operator into account. */ + ut_a(ranking->words); + while (fts_ranking_words_get_next(query, ranking, &pos, &word)) { + fts_query_add_word_to_document(query, ranking->doc_id, + &word); + } + } + + /* If it is an intersection operation, reset query->doc_ids + to query->intersection and free the old result list. */ + if (query->oper == FTS_EXIST && query->intersection != NULL) { + fts_query_free_doc_ids(query, query->doc_ids); + query->doc_ids = query->intersection; + query->intersection = NULL; + } + + DBUG_RETURN(DB_SUCCESS); +} + +/*****************************************************************//** +Skip non-whitespace in a string. Move ptr to the next word boundary. +@return pointer to first whitespace character or end */ +UNIV_INLINE +byte* +fts_query_skip_word( +/*================*/ + byte* ptr, /*!< in: start of scan */ + const byte* end) /*!< in: pointer to end of string */ +{ + /* TODO: Does this have to be UTF-8 too ? */ + while (ptr < end && !(ispunct(*ptr) || isspace(*ptr))) { + ++ptr; + } + + return(ptr); +}
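The TODO in fts_query_skip_word() is warranted: byte-wise ispunct()/isspace() only classifies single-byte characters, and passing a value outside the unsigned char range to them is undefined behaviour. A hedged sketch of the usual defensive form (not the InnoDB code; multi-byte text would still need charset-aware classification):

    #include <cctype>

    // Illustrative only: taking the input as unsigned char keeps the
    // <cctype> calls well defined; a UTF-8 aware version would have to
    // decode whole characters instead of single bytes.
    const unsigned char* skip_word(const unsigned char* ptr,
                                   const unsigned char* end)
    {
        while (ptr < end && !std::ispunct(*ptr) && !std::isspace(*ptr)) {
            ++ptr;
        }
        return ptr;
    }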
+ +/*****************************************************************//** +Check whether the remaining terms in the phrase match the text. +@return TRUE if matched else FALSE */ +static +ibool +fts_query_match_phrase_terms( +/*=========================*/ + fts_phrase_t* phrase, /*!< in: phrase to match */ + byte** start, /*!< in/out: text to search, we can't + make this const because we need to + first convert the string to + lowercase */ + const byte* end, /*!< in: pointer to the end of + the string to search */ + mem_heap_t* heap) /*!< in: heap */ +{ + ulint i; + byte* ptr = *start; + const ib_vector_t* tokens = phrase->tokens; + ulint distance = phrase->distance; + + /* We check only from the second term onwards, since the first + must have matched otherwise we wouldn't be here. */ + for (i = 1; ptr < end && i < ib_vector_size(tokens); /* No op */) { + fts_string_t match; + fts_string_t cmp_str; + const fts_string_t* token; + int result; + ulint ret; + + ret = innobase_mysql_fts_get_token( + phrase->charset, ptr, + const_cast<byte*>(end), &match); + + if (match.f_len > 0) { + /* Get next token to match. */ + token = static_cast<const fts_string_t*>( + ib_vector_get_const(tokens, i)); + + fts_string_dup(&cmp_str, &match, heap); + + result = innobase_fts_text_case_cmp( + phrase->charset, token, &cmp_str); + + /* Skip the rest of the tokens if this one doesn't + match and the proximity distance is exceeded. */ + if (result + && (distance == ULINT_UNDEFINED + || distance == 0)) { + + break; + } + + /* This token matched move to the next token. */ + if (result == 0) { + /* Advance the text to search by the length + of the last token. */ + ptr += ret; + + /* Advance to the next token. */ + ++i; + } else { + + ut_a(distance != ULINT_UNDEFINED); + + ptr = fts_query_skip_word(ptr, end); + } + + /* Distance can be 0 for exact matches. */ + if (distance != ULINT_UNDEFINED && distance > 0) { + --distance; + } + } else { + ptr += ret; + } + } + + *start = ptr; + + /* Can't be greater than the number of elements. */ + ut_a(i <= ib_vector_size(tokens)); + + /* This is the case for multiple words. */ + if (i == ib_vector_size(tokens)) { + phrase->found = TRUE; + } + + return(phrase->found); +} + +/*****************************************************************//** +Callback function to count the number of words in position ranges, +and see whether the word count is within the specified "phrase->distance" +@return true if the number of words is less than the "distance" */ +static +bool +fts_proximity_is_word_in_range( +/*===========================*/ + const fts_phrase_t* + phrase, /*!< in: phrase with the search info */ + byte* start, /*!< in: text to search */ + ulint total_len) /*!< in: length of text */ +{ + fts_proximity_t* proximity_pos = phrase->proximity_pos; + + ut_ad(proximity_pos->n_pos == proximity_pos->min_pos.size()); + ut_ad(proximity_pos->n_pos == proximity_pos->max_pos.size()); + + /* Search each matched position pair (with min and max positions) + and count the number of words in the range */ + for (ulint i = 0; i < proximity_pos->n_pos; i++) { + ulint cur_pos = proximity_pos->min_pos[i]; + ulint n_word = 0; + + ut_ad(proximity_pos->max_pos[i] <= total_len); + + /* Walk through words in the range and count them */ + while (cur_pos <= proximity_pos->max_pos[i]) { + ulint len; + fts_string_t str; + + len = innobase_mysql_fts_get_token( + phrase->charset, + start + cur_pos, + start + total_len, &str); + + if (len == 0) { + break; + } + + /* Advances position with "len" bytes */ + cur_pos += len; + + /* Record the number of words */ + if (str.f_n_char > 0) { + n_word++; + } + + if (n_word > phrase->distance) { + break; + } + } + + /* Check if the number of words is less than specified + "distance" */ + if (n_word && n_word <= phrase->distance) { + return(true); + } + } + + return(false); +}
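To make the range check concrete: for a proximity query such as '"apple pear" @ 5', each candidate span [min_pos, max_pos] covers one occurrence of every term, and the span qualifies when the tokenizer counts at most `distance` words inside it. A hedged standalone sketch, with a whitespace tokenizer standing in for innobase_mysql_fts_get_token():

    #include <cstddef>
    #include <sstream>
    #include <string>

    // Illustrative only: count the words in text[min_pos, max_pos) and
    // accept the span when the count stays within the distance, mirroring
    // the loop in fts_proximity_is_word_in_range().
    bool within_distance(const std::string& text, std::size_t min_pos,
                         std::size_t max_pos, std::size_t distance)
    {
        std::istringstream words(text.substr(min_pos, max_pos - min_pos));
        std::size_t n_word = 0;
        std::string w;

        while (words >> w && ++n_word <= distance) {}

        return n_word > 0 && n_word <= distance;
    }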
+ +/*****************************************************************//** +FTS plugin parser 'mysql_add_word' callback function for phrase match +Refer to 'st_mysql_ftparser_param' for more detail. +@return 0 if match, or return non-zero */ +static +int +fts_query_match_phrase_add_word_for_parser( +/*=======================================*/ + MYSQL_FTPARSER_PARAM* param, /*!< in: parser param */ + const char* word, /*!< in: token */ + int word_len, /*!< in: token length */ + MYSQL_FTPARSER_BOOLEAN_INFO*) +{ + fts_phrase_param_t* phrase_param; + fts_phrase_t* phrase; + const ib_vector_t* tokens; + fts_string_t match; + fts_string_t cmp_str; + const fts_string_t* token; + int result; + mem_heap_t* heap; + + phrase_param = static_cast<fts_phrase_param_t*>(param->mysql_ftparam); + heap = phrase_param->heap; + phrase = phrase_param->phrase; + tokens = phrase->tokens; + + /* In case plugin parser doesn't check return value */ + if (phrase_param->token_index == ib_vector_size(tokens)) { + return(1); + } + + match.f_str = (uchar *)(word); + match.f_len = ulint(word_len); + match.f_n_char= fts_get_token_size(phrase->charset, word, match.f_len); + + if (match.f_len > 0) { + /* Get next token to match. */ + ut_a(phrase_param->token_index < ib_vector_size(tokens)); + token = static_cast<const fts_string_t*>( + ib_vector_get_const(tokens, phrase_param->token_index)); + + fts_string_dup(&cmp_str, &match, heap); + + result = innobase_fts_text_case_cmp( + phrase->charset, token, &cmp_str); + + if (result == 0) { + phrase_param->token_index++; + } else { + return(1); + } + } + + /* Can't be greater than the number of elements. */ + ut_a(phrase_param->token_index <= ib_vector_size(tokens)); + + /* This is the case for multiple words. */ + if (phrase_param->token_index == ib_vector_size(tokens)) { + phrase->found = TRUE; + } + + return(static_cast<int>(phrase->found)); +} + +/*****************************************************************//** +Check whether the terms in the phrase match the text. +@return TRUE if matched else FALSE */ +static +ibool +fts_query_match_phrase_terms_by_parser( +/*===================================*/ + fts_phrase_param_t* phrase_param, /* in/out: phrase param */ + st_mysql_ftparser* parser, /* in: plugin fts parser */ + byte* text, /* in: text to check */ + ulint len) /* in: text length */ +{ + MYSQL_FTPARSER_PARAM param; + + ut_a(parser); + + /* Set parameters for param */ + param.mysql_parse = fts_tokenize_document_internal; + param.mysql_add_word = fts_query_match_phrase_add_word_for_parser; + param.mysql_ftparam = phrase_param; + param.cs = phrase_param->phrase->charset; + param.doc = reinterpret_cast<char*>(text); + param.length = static_cast<int>(len); + param.mode= MYSQL_FTPARSER_WITH_STOPWORDS; + + PARSER_INIT(parser, &param); + parser->parse(&param); + PARSER_DEINIT(parser, &param); + + return(phrase_param->phrase->found); +}
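fts_query_match_phrase() below anchors on the stored positions of the phrase's first token and then verifies the remaining tokens in document order. A hedged sketch of that anchor-and-verify shape over plain token vectors (illustrative only, not the InnoDB data structures):

    #include <cstddef>
    #include <string>
    #include <vector>

    // Illustrative only: check whether `phrase` occurs in `doc` starting
    // at any recorded position of its first token, the way the entries in
    // fts_match_t::positions seed fts_query_match_phrase().
    bool phrase_at_positions(const std::vector<std::string>& doc,
                             const std::vector<std::string>& phrase,
                             const std::vector<std::size_t>& first_token_pos)
    {
        for (std::size_t pos : first_token_pos) {
            if (pos + phrase.size() > doc.size()) {
                continue;
            }
            std::size_t i = 0;
            while (i < phrase.size() && doc[pos + i] == phrase[i]) {
                ++i;
            }
            if (i == phrase.size()) {
                return true;  // every term matched in sequence
            }
        }
        return false;
    }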
+ +/*****************************************************************//** +Callback function to fetch and search the document. +@return TRUE if matched else FALSE */ +static +ibool +fts_query_match_phrase( +/*===================*/ + fts_phrase_t* phrase, /*!< in: phrase to match */ + byte* start, /*!< in: text to search, we can't make + this const because we need to first + convert the string to lowercase */ + ulint cur_len, /*!< in: length of text */ + ulint prev_len, /*!< in: total length for searched + doc fields*/ + mem_heap_t* heap) /* heap */ +{ + ulint i; + const fts_string_t* first; + const byte* end = start + cur_len; + const ib_vector_t* tokens = phrase->tokens; + const ib_vector_t* positions = phrase->match->positions; + + ut_a(!phrase->found); + ut_a(phrase->match->doc_id > 0); + ut_a(ib_vector_size(tokens) > 0); + ut_a(ib_vector_size(positions) > 0); + + first = static_cast<const fts_string_t*>( + ib_vector_get_const(tokens, 0)); + + ut_a(phrase->match->start < ib_vector_size(positions)); + + for (i = phrase->match->start; i < ib_vector_size(positions); ++i) { + ulint pos; + byte* ptr = start; + + pos = *(ulint*) ib_vector_get_const(positions, i); + + if (pos == ULINT_UNDEFINED) { + break; + } + + if (pos < prev_len) { + continue; + } + + /* Document positions are calculated from the beginning + of the first field, need to save the length for each + searched field to adjust the doc position when searching + phrases. */ + pos -= prev_len; + ptr = start + pos; + + /* Within limits ? */ + if (ptr >= end) { + break; + } + + if (phrase->parser) { + fts_phrase_param_t phrase_param; + + phrase_param.phrase = phrase; + phrase_param.token_index = 0; + phrase_param.heap = heap; + + if (fts_query_match_phrase_terms_by_parser( + &phrase_param, + phrase->parser, + ptr, + ulint(end - ptr))) { + break; + } + } else { + fts_string_t match; + fts_string_t cmp_str; + ulint ret; + + match.f_str = ptr; + ret = innobase_mysql_fts_get_token( + phrase->charset, start + pos, + const_cast<byte*>(end), &match); + + if (match.f_len == 0) { + break; + } + + fts_string_dup(&cmp_str, &match, heap); + + if (innobase_fts_text_case_cmp( + phrase->charset, first, &cmp_str) == 0) { + + /* This is the case for the single word + in the phrase. */ + if (ib_vector_size(phrase->tokens) == 1) { + phrase->found = TRUE; + break; + } + + ptr += ret; + + /* Match the remaining terms in the phrase. */ + if (fts_query_match_phrase_terms(phrase, &ptr, + end, heap)) { + break; + } + } + } + } + + return(phrase->found); +} + +/*****************************************************************//** +Callback function to fetch and search the document. +@return whether the phrase is found */ +static +ibool +fts_query_fetch_document( +/*=====================*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: fts_doc_t* */ +{ + + que_node_t* exp; + sel_node_t* node = static_cast<sel_node_t*>(row); + fts_phrase_t* phrase = static_cast<fts_phrase_t*>(user_arg); + ulint prev_len = 0; + ulint total_len = 0; + byte* document_text = NULL; + + exp = node->select_list; + + phrase->found = FALSE; + + /* For proximity search, we will need to get the whole document + from all fields, so first count the total length of the document + from all the fields */ + if (phrase->proximity_pos) { + while (exp) { + ulint field_len; + dfield_t* dfield = que_node_get_val(exp); + byte* data = static_cast<byte*>( + dfield_get_data(dfield)); + + if (dfield_is_ext(dfield)) { + ulint local_len = dfield_get_len(dfield); + + local_len -= BTR_EXTERN_FIELD_REF_SIZE; + + field_len = mach_read_from_4( + data + local_len + BTR_EXTERN_LEN + 4); + } else { + field_len = dfield_get_len(dfield); + } + + if (field_len != UNIV_SQL_NULL) { + total_len += field_len + 1; + } + + exp = que_node_get_next(exp); + } + + document_text = static_cast<byte*>(mem_heap_zalloc( + phrase->heap, total_len)); + + if (!document_text) { + return(FALSE); + } + } + + exp = node->select_list; + + while (exp) { + dfield_t* dfield = que_node_get_val(exp); + byte* data = static_cast<byte*>( + dfield_get_data(dfield)); + ulint cur_len; + + if (dfield_is_ext(dfield)) { + data = btr_copy_externally_stored_field( + &cur_len, data, phrase->zip_size, + dfield_get_len(dfield), phrase->heap); + } else { + cur_len = dfield_get_len(dfield); + } + + if (cur_len != UNIV_SQL_NULL && cur_len != 0) { + if (phrase->proximity_pos) { + ut_ad(prev_len + cur_len <= total_len); + memcpy(document_text + prev_len, data, cur_len); + } else { + /* For phrase search */ + phrase->found = + fts_query_match_phrase( + phrase, + static_cast<byte*>(data), + cur_len, prev_len, + phrase->heap); + } + + /* Document positions are calculated from the beginning + of the first field, need to save the length for each + searched field to adjust the doc position when searching + phrases. */ + prev_len += cur_len + 1; + } + + if (phrase->found) { + break; + } + + exp = que_node_get_next(exp); + } + + if (phrase->proximity_pos) { + ut_ad(prev_len <= total_len); + + phrase->found = fts_proximity_is_word_in_range( + phrase, document_text, total_len); + } + + return(phrase->found); +} + +#if 0 +/******************************************************************** +Callback function to check whether a record was found or not.
*/ +static +ibool +fts_query_select( +/*=============*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: fts_doc_t* */ +{ + int i; + que_node_t* exp; + sel_node_t* node = row; + fts_select_t* select = user_arg; + + ut_a(select->word_freq); + ut_a(select->word_freq->doc_freqs); + + exp = node->select_list; + + for (i = 0; exp && !select->found; ++i) { + dfield_t* dfield = que_node_get_val(exp); + void* data = dfield_get_data(dfield); + ulint len = dfield_get_len(dfield); + + switch (i) { + case 0: /* DOC_COUNT */ + if (len != UNIV_SQL_NULL && len != 0) { + + select->word_freq->doc_count += + mach_read_from_4(data); + } + break; + + case 1: /* ILIST */ + if (len != UNIV_SQL_NULL && len != 0) { + + fts_query_find_doc_id(select, data, len); + } + break; + + default: + ut_error; + } + + exp = que_node_get_next(exp); + } + + return(FALSE); +} + +/******************************************************************** +Read the rows from the FTS index, that match word and where the +doc id is between first and last doc id. +@return DB_SUCCESS if all go well else error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_query_find_term( +/*================*/ + fts_query_t* query, /*!< in: FTS query state */ + que_t** graph, /*!< in: prepared statement */ + const fts_string_t* word, /*!< in: the word to fetch */ + doc_id_t doc_id, /*!< in: doc id to match */ + ulint* min_pos,/*!< in/out: pos found must be + greater than this minimum value. */ + ibool* found) /*!< out: TRUE if found else FALSE */ +{ + pars_info_t* info; + dberr_t error; + fts_select_t select; + doc_id_t match_doc_id; + trx_t* trx = query->trx; + char table_name[MAX_FULL_NAME_LEN]; + + trx->op_info = "fetching FTS index matching nodes"; + + if (*graph) { + info = (*graph)->info; + } else { + ulint selected; + + info = pars_info_create(); + + selected = fts_select_index(*word->f_str); + query->fts_index_table.suffix = fts_get_suffix(selected); + + fts_get_table_name(&query->fts_index_table, table_name); + pars_info_bind_id(info, "index_table_name", table_name); + } + + select.found = FALSE; + select.doc_id = doc_id; + select.min_pos = *min_pos; + select.word_freq = fts_query_add_word_freq(query, word->f_str); + + pars_info_bind_function(info, "my_func", fts_query_select, &select); + pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len); + + /* Convert to "storage" byte order. */ + fts_write_doc_id((byte*) &match_doc_id, doc_id); + + fts_bind_doc_id(info, "min_doc_id", &match_doc_id); + + fts_bind_doc_id(info, "max_doc_id", &match_doc_id); + + if (!*graph) { + + *graph = fts_parse_sql( + &query->fts_index_table, + info, + "DECLARE FUNCTION my_func;\n" + "DECLARE CURSOR c IS" + " SELECT doc_count, ilist\n" + " FROM $index_table_name\n" + " WHERE word LIKE :word AND" + " first_doc_id <= :min_doc_id AND" + " last_doc_id >= :max_doc_id\n" + " ORDER BY first_doc_id;\n" + "BEGIN\n" + "\n" + "OPEN c;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH c INTO my_func();\n" + " IF c % NOTFOUND THEN\n" + " EXIT;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE c;"); + } + + for (;;) { + error = fts_eval_sql(trx, *graph); + + if (error == DB_SUCCESS) { + + break; /* Exit the loop. */ + } else { + + if (error == DB_LOCK_WAIT_TIMEOUT) { + ib::warn() << "lock wait timeout reading FTS" + " index. Retrying!"; + + trx->error_state = DB_SUCCESS; + } else { + ib::error() << error + << " while reading FTS index."; + + break; /* Exit the loop. 
*/ + } + } + } + + /* Value to return */ + *found = select.found; + + if (*found) { + *min_pos = select.min_pos; + } + + return(error); +} + +/******************************************************************** +Callback aggregator for int columns. */ +static +ibool +fts_query_sum( +/*==========*/ + /*!< out: always returns TRUE */ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: ulint* */ +{ + + que_node_t* exp; + sel_node_t* node = row; + ulint* total = user_arg; + + exp = node->select_list; + + while (exp) { + dfield_t* dfield = que_node_get_val(exp); + void* data = dfield_get_data(dfield); + ulint len = dfield_get_len(dfield); + + if (len != UNIV_SQL_NULL && len != 0) { + *total += mach_read_from_4(data); + } + + exp = que_node_get_next(exp); + } + + return(TRUE); +} + +/******************************************************************** +Calculate the total documents that contain a particular word (term). +@return DB_SUCCESS if all go well else error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_query_total_docs_containing_term( +/*=================================*/ + fts_query_t* query, /*!< in: FTS query state */ + const fts_string_t* word, /*!< in: the word to check */ + ulint* total) /*!< out: documents containing word */ +{ + pars_info_t* info; + dberr_t error; + que_t* graph; + ulint selected; + trx_t* trx = query->trx; + char table_name[MAX_FULL_NAME_LEN]; + + trx->op_info = "fetching FTS index document count"; + + *total = 0; + + info = pars_info_create(); + + pars_info_bind_function(info, "my_func", fts_query_sum, total); + pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len); + + selected = fts_select_index(*word->f_str); + + query->fts_index_table.suffix = fts_get_suffix(selected); + + fts_get_table_name(&query->fts_index_table, table_name); + + pars_info_bind_id(info, "index_table_name", table_name); + + graph = fts_parse_sql( + &query->fts_index_table, + info, + "DECLARE FUNCTION my_func;\n" + "DECLARE CURSOR c IS" + " SELECT doc_count\n" + " FROM $index_table_name\n" + " WHERE word = :word" + " ORDER BY first_doc_id;\n" + "BEGIN\n" + "\n" + "OPEN c;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH c INTO my_func();\n" + " IF c % NOTFOUND THEN\n" + " EXIT;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE c;"); + + for (;;) { + error = fts_eval_sql(trx, graph); + + if (error == DB_SUCCESS) { + + break; /* Exit the loop. */ + } else { + + if (error == DB_LOCK_WAIT_TIMEOUT) { + ib::warn() << "lock wait timeout reading FTS" + " index. Retrying!"; + + trx->error_state = DB_SUCCESS; + } else { + ib::error() << error + << " while reading FTS index."; + + break; /* Exit the loop. */ + } + } + } + + que_graph_free(graph); + + return(error); +} + +/******************************************************************** +Get the total number of words in a document. +@return DB_SUCCESS if all go well else error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_query_terms_in_document( +/*========================*/ + fts_query_t* query, /*!< in: FTS query state */ + doc_id_t doc_id, /*!< in: the word to check */ + ulint* total) /*!< out: total words in document */ +{ + pars_info_t* info; + dberr_t error; + que_t* graph; + doc_id_t read_doc_id; + trx_t* trx = query->trx; + char table_name[MAX_FULL_NAME_LEN]; + + trx->op_info = "fetching FTS document term count"; + + *total = 0; + + info = pars_info_create(); + + pars_info_bind_function(info, "my_func", fts_query_sum, total); + + /* Convert to "storage" byte order.
*/ + fts_write_doc_id((byte*) &read_doc_id, doc_id); + fts_bind_doc_id(info, "doc_id", &read_doc_id); + + query->fts_index_table.suffix = "DOC_ID"; + + fts_get_table_name(&query->fts_index_table, table_name); + + pars_info_bind_id(info, "index_table_name", table_name); + + graph = fts_parse_sql( + &query->fts_index_table, + info, + "DECLARE FUNCTION my_func;\n" + "DECLARE CURSOR c IS" + " SELECT count\n" + " FROM $index_table_name\n" + " WHERE doc_id = :doc_id" + " BEGIN\n" + "\n" + "OPEN c;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH c INTO my_func();\n" + " IF c % NOTFOUND THEN\n" + " EXIT;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE c;"); + + for (;;) { + error = fts_eval_sql(trx, graph); + + if (error == DB_SUCCESS) { + + break; /* Exit the loop. */ + } else { + + if (error == DB_LOCK_WAIT_TIMEOUT) { + ib::warn() << "lock wait timeout reading FTS" + " doc id table. Retrying!"; + + trx->error_state = DB_SUCCESS; + } else { + ib::error() << error << " while reading FTS" + " doc id table."; + + break; /* Exit the loop. */ + } + } + } + + que_graph_free(graph); + + return(error); +} +#endif + +/*****************************************************************//** +Retrieve the document and match the phrase tokens. +@return DB_SUCCESS or error code */ +MY_ATTRIBUTE((nonnull(1,2,3,6), warn_unused_result)) +static +dberr_t +fts_query_match_document( +/*=====================*/ + ib_vector_t* tokens, /*!< in: phrase tokens */ + fts_get_doc_t* get_doc, /*!< in: table and prepared statements */ + fts_match_t* match, /*!< in: doc id and positions */ + ulint distance, /*!< in: proximity distance */ + st_mysql_ftparser* parser, /*!< in: fts plugin parser */ + ibool* found) /*!< out: TRUE if phrase found */ +{ + dberr_t error; + fts_phrase_t phrase(get_doc->index_cache->index->table); + + phrase.match = match; /* Positions to match */ + phrase.tokens = tokens; /* Tokens to match */ + phrase.distance = distance; + phrase.charset = get_doc->index_cache->charset; + phrase.heap = mem_heap_create(512); + phrase.parser = parser; + + *found = phrase.found = FALSE; + + error = fts_doc_fetch_by_doc_id( + get_doc, match->doc_id, NULL, FTS_FETCH_DOC_BY_ID_EQUAL, + fts_query_fetch_document, &phrase); + + if (UNIV_UNLIKELY(error != DB_SUCCESS)) { + ib::error() << "(" << error << ") matching document."; + } else { + *found = phrase.found; + } + + mem_heap_free(phrase.heap); + + return(error); +} + +/*****************************************************************//** +This function fetches the original documents and count the +words in between matching words to see that is in specified distance +@return DB_SUCCESS if all OK */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +bool +fts_query_is_in_proximity_range( +/*============================*/ + const fts_query_t* query, /*!< in: query instance */ + fts_match_t** match, /*!< in: query instance */ + fts_proximity_t* qualified_pos) /*!< in: position info for + qualified ranges */ +{ + fts_get_doc_t get_doc; + fts_cache_t* cache = query->index->table->fts->cache; + dberr_t err; + + memset(&get_doc, 0x0, sizeof(get_doc)); + + mysql_mutex_lock(&cache->lock); + get_doc.index_cache = fts_find_index_cache(cache, query->index); + mysql_mutex_unlock(&cache->lock); + ut_a(get_doc.index_cache != NULL); + + fts_phrase_t phrase(get_doc.index_cache->index->table); + + phrase.distance = query->distance; + phrase.charset = get_doc.index_cache->charset; + phrase.heap = mem_heap_create(512); + phrase.proximity_pos = qualified_pos; + phrase.found = FALSE; + + err = 
fts_doc_fetch_by_doc_id( + &get_doc, match[0]->doc_id, NULL, FTS_FETCH_DOC_BY_ID_EQUAL, + fts_query_fetch_document, &phrase); + + if (UNIV_UNLIKELY(err != DB_SUCCESS)) { + ib::error() << "(" << err << ") in verification" + " phase of proximity search"; + } + + /* Free the prepared statement. */ + if (get_doc.get_document_graph) { + que_graph_free(get_doc.get_document_graph); + get_doc.get_document_graph = NULL; + } + + mem_heap_free(phrase.heap); + + return(err == DB_SUCCESS && phrase.found); +} + +/*****************************************************************//** +Iterate over the matched document ids and search for the +actual phrase in the text. +@return DB_SUCCESS if all OK */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_query_search_phrase( +/*====================*/ + fts_query_t* query, /*!< in: query instance */ + ib_vector_t* orig_tokens, /*!< in: tokens to search, + with any stopwords in the + original phrase */ + ib_vector_t* tokens) /*!< in: tokens that do + not include stopwords and + can be used to calculate + ranking */ +{ + ulint i; + fts_get_doc_t get_doc; + ulint n_matched; + fts_cache_t* cache = query->index->table->fts->cache; + + n_matched = ib_vector_size(query->matched); + + /* Setup the doc retrieval infrastructure. */ + memset(&get_doc, 0x0, sizeof(get_doc)); + + mysql_mutex_lock(&cache->lock); + + get_doc.index_cache = fts_find_index_cache(cache, query->index); + + /* Must find the index cache */ + ut_a(get_doc.index_cache != NULL); + + mysql_mutex_unlock(&cache->lock); + +#ifdef FTS_INTERNAL_DIAG_PRINT + ib::info() << "Start phrase search"; +#endif + + /* Read the document from disk and do the actual + match, matching documents will be added to the current + doc id set. */ + for (i = 0; i < n_matched && query->error == DB_SUCCESS; ++i) { + fts_match_t* match; + ibool found = FALSE; + + match = static_cast<fts_match_t*>( + ib_vector_get(query->matched, i)); + + /* Skip the document ids that were filtered out by + an earlier pass. */ + if (match->doc_id != 0) { + + query->error = fts_query_match_document( + orig_tokens, &get_doc, match, + query->distance, query->parser, &found); + + if (query->error == DB_SUCCESS && found) { + ulint z; + + query->error = fts_query_process_doc_id(query, + match->doc_id, 0); + if (query->error != DB_SUCCESS) { + goto func_exit; + } + + for (z = 0; z < ib_vector_size(tokens); z++) { + fts_string_t* token; + token = static_cast<fts_string_t*>( + ib_vector_get(tokens, z)); + fts_query_add_word_to_document( + query, match->doc_id, token); + } + } + } + } + +func_exit: + /* Free the prepared statement. */ + if (get_doc.get_document_graph) { + que_graph_free(get_doc.get_document_graph); + get_doc.get_document_graph = NULL; + } + + return(query->error); +}
+ +/** Split the phrase into tokens +@param[in,out] query query instance +@param[in] node query node to search +@param[in,out] tokens token vector +@param[in,out] orig_tokens original node tokens, including stopwords +@param[in,out] heap mem heap */ +static +void +fts_query_phrase_split( + fts_query_t* query, + const fts_ast_node_t* node, + ib_vector_t* tokens, + ib_vector_t* orig_tokens, + mem_heap_t* heap) +{ + fts_string_t phrase; + ulint len = 0; + ulint cur_pos = 0; + fts_ast_node_t* term_node = NULL; + + if (node->type == FTS_AST_TEXT) { + phrase.f_str = node->text.ptr->str; + phrase.f_len = node->text.ptr->len; + len = phrase.f_len; + } else { + ut_ad(node->type == FTS_AST_PARSER_PHRASE_LIST); + phrase.f_str = NULL; + phrase.f_len = 0; + term_node = node->list.head; + } + + while (true) { + fts_cache_t* cache = query->index->table->fts->cache; + ulint cur_len; + fts_string_t result_str; + + if (node->type == FTS_AST_TEXT) { + if (cur_pos >= len) { + break; + } + + cur_len = innobase_mysql_fts_get_token( + query->fts_index_table.charset, + reinterpret_cast<const byte*>(phrase.f_str) + + cur_pos, + reinterpret_cast<const byte*>(phrase.f_str) + + len, + &result_str); + + if (cur_len == 0) { + break; + } + + cur_pos += cur_len; + } else { + ut_ad(node->type == FTS_AST_PARSER_PHRASE_LIST); + /* Term node in parser phrase list */ + if (term_node == NULL) { + break; + } + + ut_a(term_node->type == FTS_AST_TERM); + result_str.f_str = term_node->term.ptr->str; + result_str.f_len = term_node->term.ptr->len; + result_str.f_n_char = fts_get_token_size( + query->fts_index_table.charset, + reinterpret_cast<char*>(result_str.f_str), + result_str.f_len); + + term_node = term_node->next; + } + + if (result_str.f_n_char == 0) { + continue; + } + + fts_string_t* token = static_cast<fts_string_t*>( + ib_vector_push(tokens, NULL)); + fts_string_dup(token, &result_str, heap); + + if (fts_check_token( + &result_str, + cache->stopword_info.cached_stopword, + query->fts_index_table.charset)) { + /* Add the word to the RB tree so that we can + calculate its frequency within a document. */ + fts_query_add_word_freq(query, token); + } else { + ib_vector_pop(tokens); + } + + /* we will start to store all words including stopwords + in the "orig_tokens" vector, but skip any leading words + that are stopwords */ + if (!ib_vector_is_empty(tokens)) { + fts_string_t* orig_token = static_cast<fts_string_t*>( + ib_vector_push(orig_tokens, NULL)); + + orig_token->f_str = token->f_str; + orig_token->f_len = token->f_len; + } + } +}
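fts_query_phrase_split() keeps two parallel vectors: `tokens` drops stopwords and feeds ranking, while `orig_tokens` preserves the phrase as typed (stopwords included, once the first non-stopword has been seen) for exact phrase matching. A hedged sketch of that dual bookkeeping:

    #include <set>
    #include <string>
    #include <vector>

    // Illustrative only: split a phrase the way fts_query_phrase_split()
    // does; ranked tokens stay stopword-free while orig keeps the phrase
    // shape, with leading stopwords skipped.
    void split_phrase(const std::vector<std::string>& words,
                      const std::set<std::string>& stopwords,
                      std::vector<std::string>& tokens,
                      std::vector<std::string>& orig)
    {
        for (const std::string& w : words) {
            if (!stopwords.count(w)) {
                tokens.push_back(w);  // used for ranking
            }
            if (!tokens.empty()) {
                orig.push_back(w);    // used for phrase matching
            }
        }
    }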
+ +/*****************************************************************//** +Text/Phrase search. +@return DB_SUCCESS or error code */ +static MY_ATTRIBUTE((warn_unused_result)) +dberr_t +fts_query_phrase_search( +/*====================*/ + fts_query_t* query, /*!< in: query instance */ + const fts_ast_node_t* node) /*!< in: node to search */ +{ + ib_vector_t* tokens; + ib_vector_t* orig_tokens; + mem_heap_t* heap = mem_heap_create(sizeof(fts_string_t)); + ib_alloc_t* heap_alloc; + ulint num_token; + + heap_alloc = ib_heap_allocator_create(heap); + + tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4); + orig_tokens = ib_vector_create(heap_alloc, sizeof(fts_string_t), 4); + + if (query->distance != ULINT_UNDEFINED && query->distance > 0) { + query->flags = FTS_PROXIMITY; + } else { + query->flags = FTS_PHRASE; + } + + /* Split the phrase into tokens. */ + fts_query_phrase_split(query, node, tokens, orig_tokens, heap); + + num_token = ib_vector_size(tokens); + if (num_token > MAX_PROXIMITY_ITEM) { + query->error = DB_FTS_TOO_MANY_WORDS_IN_PHRASE; + goto func_exit; + } + + ut_ad(ib_vector_size(orig_tokens) >= num_token); + + /* Ignore empty strings. */ + if (num_token > 0) { + fts_string_t* token = NULL; + fts_fetch_t fetch; + trx_t* trx = query->trx; + fts_ast_oper_t oper = query->oper; + que_t* graph = NULL; + ulint i; + dberr_t error; + + /* Create the vector for storing matching document ids + and the positions of the first token of the phrase. */ + if (!query->matched) { + ib_alloc_t* heap_alloc; + + heap_alloc = ib_heap_allocator_create(heap); + + if (!(query->flags & FTS_PROXIMITY) + && !(query->flags & FTS_PHRASE)) { + query->matched = ib_vector_create( + heap_alloc, sizeof(fts_match_t), + 64); + } else { + ut_a(num_token <= MAX_PROXIMITY_ITEM); + query->match_array = + (ib_vector_t**) mem_heap_alloc( + heap, + num_token * + sizeof(query->matched)); + + for (i = 0; i < num_token; i++) { + query->match_array[i] = + ib_vector_create( + heap_alloc, sizeof(fts_match_t), + 64); + } + + query->matched = query->match_array[0]; + } + } + + /* Setup the callback args for filtering and consolidating + the ilist. */ + fetch.read_arg = query; + fetch.read_record = fts_query_index_fetch_nodes; + + for (i = 0; i < num_token; i++) { + /* Search for the first word from the phrase. */ + token = static_cast<fts_string_t*>( + ib_vector_get(tokens, i)); + + if (query->flags & FTS_PROXIMITY + || query->flags & FTS_PHRASE) { + query->matched = query->match_array[i]; + } + + error = fts_index_fetch_nodes( + trx, &graph, &query->fts_index_table, + token, &fetch); + + /* DB_FTS_EXCEED_RESULT_CACHE_LIMIT passed by 'query->error' */ + ut_ad(!(query->error != DB_SUCCESS && error != DB_SUCCESS)); + if (error != DB_SUCCESS) { + query->error = error; + } + + que_graph_free(graph); + graph = NULL; + + fts_query_cache(query, token); + + if (!(query->flags & FTS_PHRASE) + && !(query->flags & FTS_PROXIMITY)) { + break; + } + + /* If any of the tokens can't be found, + no need to continue the match */ + if (ib_vector_is_empty(query->match_array[i]) + || query->error != DB_SUCCESS) { + goto func_exit; + } + } + + /* Just a single word, no need to fetch the original + documents to do phrase matching */ + if (ib_vector_size(orig_tokens) == 1 + && !ib_vector_is_empty(query->match_array[0])) { + fts_match_t* match; + ulint n_matched; + + n_matched = ib_vector_size(query->match_array[0]); + + for (i = 0; i < n_matched; i++) { + match = static_cast<fts_match_t*>( + ib_vector_get( + query->match_array[0], i)); + + query->error = fts_query_process_doc_id( + query, match->doc_id, 0); + if (query->error != DB_SUCCESS) { + goto func_exit; + } + + fts_query_add_word_to_document( + query, match->doc_id, token); + } + query->oper = oper; + goto func_exit; + } + + /* If we are doing proximity search, verify the distance + between all words, and check they are in specified distance. */ + if (query->flags & FTS_PROXIMITY) { + fts_phrase_or_proximity_search(query, tokens); + } else { + ibool matched; + + /* Phrase Search case: + We filter out the doc ids that don't contain + all the tokens in the phrase. It's cheaper to + search the ilist than bringing the documents in + and then doing a search through the text. Isolated + testing shows this also helps in mitigating disruption + of the buffer cache. */ + matched = fts_phrase_or_proximity_search(query, tokens); + query->matched = query->match_array[0]; + + /* Read the actual text in and search for the phrase. */ + if (matched) { + ut_ad(query->error == DB_SUCCESS); + query->error = fts_query_search_phrase( + query, orig_tokens, tokens); + } + } + + /* Restore original operation. */ + query->oper = oper; + + if (query->error != DB_SUCCESS) { + goto func_exit; + } + } + +func_exit: + mem_heap_free(heap); + + /* Don't need it anymore. */ + query->matched = NULL; + + return(query->error); +}
+ +/*****************************************************************//** +Find the word and evaluate. +@return DB_SUCCESS if all go well */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_query_execute( +/*==============*/ + fts_query_t* query, /*!< in: query instance */ + fts_string_t* token) /*!< in: token to search */ +{ + switch (query->oper) { + case FTS_NONE: + case FTS_NEGATE: + case FTS_INCR_RATING: + case FTS_DECR_RATING: + query->error = fts_query_union(query, token); + break; + + case FTS_EXIST: + query->error = fts_query_intersect(query, token); + break; + + case FTS_IGNORE: + query->error = fts_query_difference(query, token); + break; + + default: + ut_error; + } + + return(query->error); +} + +/*****************************************************************//** +Create a wildcard string. It's the responsibility of the caller to +free the byte* pointer. It's allocated using ut_malloc_nokey(). +@return ptr to allocated memory */ +static +byte* +fts_query_get_token( +/*================*/ + fts_ast_node_t* node, /*!< in: the current sub tree */ + fts_string_t* token) /*!< in: token to create */ +{ + ulint str_len; + byte* new_ptr = NULL; + + str_len = node->term.ptr->len; + + ut_a(node->type == FTS_AST_TERM); + + token->f_len = str_len; + token->f_str = node->term.ptr->str; + + if (node->term.wildcard) { + + token->f_str = static_cast<byte*>(ut_malloc_nokey(str_len + 2)); + token->f_len = str_len + 1; + + memcpy(token->f_str, node->term.ptr->str, str_len); + + token->f_str[str_len] = '%'; + token->f_str[token->f_len] = 0; + + new_ptr = token->f_str; + } + + return(new_ptr); +} + +static dberr_t fts_ast_visit_sub_exp(fts_ast_node_t*, fts_ast_callback, void*);
+ +/*****************************************************************//** +Visit every node of the AST. */ +static +dberr_t +fts_query_visitor( +/*==============*/ + fts_ast_oper_t oper, /*!< in: current operator */ + fts_ast_node_t* node, /*!< in: The root of the current subtree*/ + void* arg) /*!< in: callback arg*/ +{ + byte* ptr; + fts_string_t token; + fts_query_t* query = static_cast<fts_query_t*>(arg); + + ut_a(node); + DBUG_ENTER("fts_query_visitor"); + DBUG_PRINT("fts", ("nodetype: %s", fts_ast_node_type_get(node->type))); + + token.f_n_char = 0; + query->oper = oper; + query->cur_node = node; + + switch (node->type) { + case FTS_AST_TEXT: + case FTS_AST_PARSER_PHRASE_LIST: + + if (query->oper == FTS_EXIST) { + ut_ad(query->intersection == NULL); + query->intersection = rbt_create( + sizeof(fts_ranking_t), fts_ranking_doc_id_cmp); + + query->total_size += SIZEOF_RBT_CREATE; + } + + /* Set the current proximity distance. */ + query->distance = node->text.distance; + + /* Force collection of doc ids and the positions. */ + query->collect_positions = TRUE; + + query->error = fts_query_phrase_search(query, node); + + query->collect_positions = FALSE; + + if (query->oper == FTS_EXIST) { + fts_query_free_doc_ids(query, query->doc_ids); + query->doc_ids = query->intersection; + query->intersection = NULL; + } + + break; + + case FTS_AST_TERM: + token.f_str = node->term.ptr->str; + token.f_len = node->term.ptr->len; + + /* Collect wildcard words for QUERY EXPANSION. */ + if (node->term.wildcard && query->wildcard_words != NULL) { + ib_rbt_bound_t parent; + + if (rbt_search(query->wildcard_words, &parent, &token) + != 0) { + fts_string_t word; + + fts_string_dup(&word, &token, query->heap); + rbt_add_node(query->wildcard_words, &parent, + &word); + } + } + + /* Add the word to our RB tree that will be used to + calculate this term's per-document frequency. */ + fts_query_add_word_freq(query, &token); + + ptr = fts_query_get_token(node, &token); + query->error = fts_query_execute(query, &token); + + if (ptr) { + ut_free(ptr); + } + + break; + + case FTS_AST_SUBEXP_LIST: + query->error = fts_ast_visit_sub_exp(node, fts_query_visitor, arg); + break; + + default: + ut_error; + } + + if (query->oper == FTS_EXIST) { + query->multi_exist = true; + } + + DBUG_RETURN(query->error); +} + +/** Process (nested) sub-expression, create a new result set to store the +sub-expression result by processing nodes under current sub-expression +list. Merge the sub-expression result with that of parent expression list. +@param[in,out] node current root node +@param[in,out] visitor callback function +@param[in,out] arg argument for callback +@return DB_SUCCESS if all go well */ +static +dberr_t +fts_ast_visit_sub_exp( + fts_ast_node_t* node, + fts_ast_callback visitor, + void* arg) +{ + fts_ast_oper_t cur_oper; + fts_query_t* query = static_cast<fts_query_t*>(arg); + ib_rbt_t* parent_doc_ids; + ib_rbt_t* subexpr_doc_ids; + dberr_t error = DB_SUCCESS; + bool will_be_ignored = false; + bool multi_exist; + + DBUG_ENTER("fts_ast_visit_sub_exp"); + + ut_a(node->type == FTS_AST_SUBEXP_LIST); + + /* To avoid stack overflow, we limit the mutual recursion + depth between fts_ast_visit(), fts_query_visitor() and + fts_ast_visit_sub_exp(). */ + if (query->visiting_sub_exp++ > 31) { + query->error = DB_OUT_OF_MEMORY; + DBUG_RETURN(query->error); + } + + cur_oper = query->oper; + + /* Save current result set */ + parent_doc_ids = query->doc_ids; + + /* Create new result set to store the sub-expression result. We + will merge this result set with the parent after processing. */ + query->doc_ids = rbt_create(sizeof(fts_ranking_t), + fts_ranking_doc_id_cmp); + + query->total_size += SIZEOF_RBT_CREATE; + + multi_exist = query->multi_exist; + query->multi_exist = false; + /* Process nodes in current sub-expression and store its + result set in query->doc_ids we created above. */ + error = fts_ast_visit(FTS_NONE, node, visitor, + arg, &will_be_ignored); + + /* Reinstate parent node state */ + query->multi_exist = multi_exist; + query->oper = cur_oper; + query->visiting_sub_exp--; + + /* Merge the sub-expression result with the parent result set. */ + subexpr_doc_ids = query->doc_ids; + query->doc_ids = parent_doc_ids; + if (error == DB_SUCCESS) { + error = fts_merge_doc_ids(query, subexpr_doc_ids); + } + + /* Free current result set. Result already merged into parent. */ + fts_query_free_doc_ids(query, subexpr_doc_ids); + + DBUG_RETURN(error); +}
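A sub-expression therefore evaluates into a fresh result set that is merged back into the parent once the recursion returns, with a hard depth cap of 32 guarding the mutual recursion. A hedged sketch of that save-evaluate-merge shape (std::set and the names below are illustrative, not the InnoDB types):

    #include <set>
    #include <stdexcept>
    #include <utility>

    // Illustrative only: evaluate a nested sub-expression into its own
    // result set, then merge it into the parent, the way
    // fts_ast_visit_sub_exp() swaps query->doc_ids around the recursion.
    std::set<unsigned long> evaluate_subexpr(int depth)
    {
        if (depth > 31) {  // cap the mutual recursion depth
            throw std::runtime_error("expression nested too deeply");
        }
        std::set<unsigned long> local;  // stands in for the new rb tree
        // ... visit child nodes, filling `local` ...
        return local;
    }

    void visit_subexpr(std::set<unsigned long>& parent, int depth)
    {
        std::set<unsigned long> sub = evaluate_subexpr(depth + 1);
        parent.merge(std::move(sub));   // merge the result into the parent
    }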
+ +#if 0 +/*****************************************************************//*** +Check if the doc id exists in the ilist. +@return TRUE if doc id found */ +static +ulint +fts_query_find_doc_id( +/*==================*/ + fts_select_t* select, /*!< in/out: contains the doc id to + find, we update the word freq if + document found */ + void* data, /*!< in: doc id ilist */ + ulint len) /*!< in: doc id ilist size */ +{ + byte* ptr = data; + doc_id_t doc_id = 0; + ulint decoded = 0; + + /* Decode the ilist and search for selected doc_id. We also + calculate the frequency of the word in the document if found. */ + while (decoded < len && !select->found) { + ulint freq = 0; + ulint min_pos = 0; + ulint last_pos = 0; + ulint pos = fts_decode_vlc(&ptr); + + /* Add the delta. */ + doc_id += pos; + + while (*ptr) { + ++freq; + last_pos += fts_decode_vlc(&ptr); + + /* Only if min_pos is not set and the current + term exists in a position greater than the + min_pos of the previous term. */ + if (min_pos == 0 && last_pos > select->min_pos) { + min_pos = last_pos; + } + } + + /* Skip the end of word position marker. */ + ++ptr; + + /* Bytes decoded so far. */ + decoded = ptr - (byte*) data; + + /* A word may exist in the document but we only consider a + match if it exists in a position that is greater than the + position of the previous term. */ + if (doc_id == select->doc_id && min_pos > 0) { + fts_doc_freq_t* doc_freq; + + /* Add the doc id to the doc freq rb tree, if + the doc id doesn't exist it will be created. */ + doc_freq = fts_query_add_doc_freq( + select->word_freq->doc_freqs, doc_id); + + /* Avoid duplicating the frequency tally */ + if (doc_freq->freq == 0) { + doc_freq->freq = freq; + } + + select->found = TRUE; + select->min_pos = min_pos; + } + } + + return(select->found); +} +#endif
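fts_query_filter_doc_ids() below walks the same ilist layout the disabled code above reads: a delta-encoded doc id, that document's position list, and a 0 terminator, repeated. A hedged sketch of the walk, with single bytes standing in for the fts_decode_vlc() varints and well-formed input assumed:

    #include <cstdint>
    #include <vector>

    // Illustrative only: decode a toy ilist of the shape
    //   [doc id delta][position deltas...][0 terminator] ...
    std::vector<std::uint64_t> decode_doc_ids(const std::uint8_t* p,
                                              const std::uint8_t* end)
    {
        std::vector<std::uint64_t> doc_ids;
        std::uint64_t doc_id = 0;

        while (p < end) {
            doc_id += *p++;      // doc ids are stored as deltas
            while (*p) {         // position list, also delta-encoded
                ++p;             // (a phrase search would collect these)
            }
            ++p;                 // skip the 0 end-of-positions marker
            doc_ids.push_back(doc_id);
        }
        return doc_ids;
    }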
+ +/*****************************************************************//** +Read and filter nodes. +@return DB_SUCCESS if all go well, +or return DB_FTS_EXCEED_RESULT_CACHE_LIMIT */ +static +dberr_t +fts_query_filter_doc_ids( +/*=====================*/ + fts_query_t* query, /*!< in: query instance */ + const fts_string_t* word, /*!< in: the current word */ + fts_word_freq_t* word_freq, /*!< in/out: word frequency */ + const fts_node_t* node, /*!< in: current FTS node */ + void* data, /*!< in: doc id ilist */ + ulint len, /*!< in: doc id ilist size */ + ibool calc_doc_count) /*!< in: whether to remember doc count */ +{ + const byte* ptr = static_cast<const byte*>(data); + doc_id_t doc_id = 0; + ulint decoded = 0; + ib_rbt_t* doc_freqs = word_freq->doc_freqs; + + /* Decode the ilist and add the doc ids to the query doc_id set. */ + while (decoded < len) { + ulint freq = 0; + fts_doc_freq_t* doc_freq; + fts_match_t* match = NULL; + doc_id_t last_pos = 0; + doc_id_t pos = fts_decode_vlc(&ptr); + + /* Some sanity checks. */ + if (doc_id == 0) { + ut_a(pos == node->first_doc_id); + } + + /* Add the delta. */ + doc_id += pos; + + if (calc_doc_count) { + word_freq->doc_count++; + } + + /* We simply collect the matching instances here. */ + if (query->collect_positions) { + ib_alloc_t* heap_alloc; + + /* Create a new fts_match_t instance. */ + match = static_cast<fts_match_t*>( + ib_vector_push(query->matched, NULL)); + + match->start = 0; + match->doc_id = doc_id; + heap_alloc = ib_vector_allocator(query->matched); + + /* Allocate from the same heap as the + parent container. */ + match->positions = ib_vector_create( + heap_alloc, sizeof(ulint), 64); + + query->total_size += sizeof(fts_match_t) + + sizeof(ib_vector_t) + + sizeof(ulint) * 64; + } + + /* Unpack the positions within the document. */ + while (*ptr) { + last_pos += fts_decode_vlc(&ptr); + + /* Collect the matching word positions, for phrase + matching later. */ + if (query->collect_positions) { + ib_vector_push(match->positions, &last_pos); + } + + ++freq; + } + + /* End of list marker. */ + last_pos = (ulint) -1; + + if (query->collect_positions) { + ut_a(match != NULL); + ib_vector_push(match->positions, &last_pos); + } + + /* Add the doc id to the doc freq rb tree, if the doc id + doesn't exist it will be created. */ + doc_freq = fts_query_add_doc_freq(query, doc_freqs, doc_id); + + /* Avoid duplicating frequency tally. */ + if (doc_freq->freq == 0) { + doc_freq->freq = freq; + } + + /* Skip the end of word position marker. */ + ++ptr; + + /* Bytes decoded so far */ + decoded = ulint(ptr - (byte*) data); + + /* We simply collect the matching documents and the + positions here and match later. */ + if (!query->collect_positions) { + /* We ignore error here and will check it later */ + fts_query_process_doc_id(query, doc_id, 0); + + /* Add the word to the document's matched RB tree. */ + fts_query_add_word_to_document(query, doc_id, word); + } + } + + /* Some sanity checks. */ + ut_a(doc_id == node->last_doc_id); + + if (query->total_size > fts_result_cache_limit) { + return(DB_FTS_EXCEED_RESULT_CACHE_LIMIT); + } else { + return(DB_SUCCESS); + } +}
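fts_query_read_node() below relies on the FIRST_DOC_ID and LAST_DOC_ID columns to skip index nodes whose ilists cannot overlap the current result. The pruning test it applies for FTS_EXIST amounts to:

    #include <cstdint>

    // Illustrative only: a node whose [first, last] doc id range lies
    // entirely outside the query bounds can be skipped without
    // decompressing its ilist.
    bool can_skip_node(std::uint64_t node_first, std::uint64_t node_last,
                       std::uint64_t lower_doc_id, std::uint64_t upper_doc_id)
    {
        return (upper_doc_id > 0 && node_first > upper_doc_id)
            || (lower_doc_id > 0 && node_last < lower_doc_id);
    }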
+ +/*****************************************************************//** +Read the FTS INDEX row. +@return DB_SUCCESS if all go well. */ +static +dberr_t +fts_query_read_node( +/*================*/ + fts_query_t* query, /*!< in: query instance */ + const fts_string_t* word, /*!< in: current word */ + que_node_t* exp) /*!< in: query graph node */ +{ + int i; + int ret; + fts_node_t node; + ib_rbt_bound_t parent; + fts_word_freq_t* word_freq; + ibool skip = FALSE; + fts_string_t term; + byte buf[FTS_MAX_WORD_LEN + 1]; + dberr_t error = DB_SUCCESS; + + ut_a(query->cur_node->type == FTS_AST_TERM + || query->cur_node->type == FTS_AST_TEXT + || query->cur_node->type == FTS_AST_PARSER_PHRASE_LIST); + + memset(&node, 0, sizeof(node)); + term.f_str = buf; + + /* Need to consider the wildcard search case, the word frequency + is created on the search string not the actual word. So we need + to assign the frequency to the search string instead. */ + if (query->cur_node->type == FTS_AST_TERM + && query->cur_node->term.wildcard) { + + term.f_len = query->cur_node->term.ptr->len; + ut_ad(FTS_MAX_WORD_LEN >= term.f_len); + memcpy(term.f_str, query->cur_node->term.ptr->str, term.f_len); + } else { + term.f_len = word->f_len; + ut_ad(FTS_MAX_WORD_LEN >= word->f_len); + memcpy(term.f_str, word->f_str, word->f_len); + } + + /* Lookup the word in our rb tree, it must exist. */ + ret = rbt_search(query->word_freqs, &parent, &term); + + ut_a(ret == 0); + + word_freq = rbt_value(fts_word_freq_t, parent.last); + + /* Start from 1 since the first column has been read by the caller. + Also, we rely on the order of the columns projected, to filter + out ilists that are out of range and we always want to read + the doc_count irrespective of the suitability of the row. */ + + for (i = 1; exp && !skip; exp = que_node_get_next(exp), ++i) { + + dfield_t* dfield = que_node_get_val(exp); + byte* data = static_cast<byte*>( + dfield_get_data(dfield)); + ulint len = dfield_get_len(dfield); + + ut_a(len != UNIV_SQL_NULL); + + /* Note: The column numbers below must match the SELECT. */ + + switch (i) { + case 1: /* DOC_COUNT */ + word_freq->doc_count += mach_read_from_4(data); + break; + + case 2: /* FIRST_DOC_ID */ + node.first_doc_id = fts_read_doc_id(data); + + /* Skip nodes whose doc ids are out of range. */ + if (query->oper == FTS_EXIST + && query->upper_doc_id > 0 + && node.first_doc_id > query->upper_doc_id) { + skip = TRUE; + } + break; + + case 3: /* LAST_DOC_ID */ + node.last_doc_id = fts_read_doc_id(data); + + /* Skip nodes whose doc ids are out of range. */ + if (query->oper == FTS_EXIST + && query->lower_doc_id > 0 + && node.last_doc_id < query->lower_doc_id) { + skip = TRUE; + } + break; + + case 4: /* ILIST */ + + error = fts_query_filter_doc_ids( + query, &word_freq->word, word_freq, + &node, data, len, FALSE); + + break; + + default: + ut_error; + } + } + + if (!skip) { + /* Make sure all columns were read. */ + + ut_a(i == 5); + } + + return error; +} + +/*****************************************************************//** +Callback function to fetch the rows in an FTS INDEX record. +@return always returns TRUE */ +static +ibool +fts_query_index_fetch_nodes( +/*========================*/ + void* row, /*!< in: sel_node_t* */ + void* user_arg) /*!< in: pointer to fts_fetch_t */ +{ + fts_string_t key; + sel_node_t* sel_node = static_cast<sel_node_t*>(row); + fts_fetch_t* fetch = static_cast<fts_fetch_t*>(user_arg); + fts_query_t* query = static_cast<fts_query_t*>(fetch->read_arg); + que_node_t* exp = sel_node->select_list; + dfield_t* dfield = que_node_get_val(exp); + void* data = dfield_get_data(dfield); + ulint dfield_len = dfield_get_len(dfield); + + key.f_str = static_cast<byte*>(data); + key.f_len = dfield_len; + + ut_a(dfield_len <= FTS_MAX_WORD_LEN); + + /* Note: we pass error out by 'query->error' */ + query->error = fts_query_read_node(query, &key, que_node_get_next(exp)); + + if (query->error != DB_SUCCESS) { + ut_ad(query->error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT); + return(FALSE); + } else { + return(TRUE); + } +} + +/*****************************************************************//** +Calculate the inverse document frequency (IDF) for all the terms. */ +static +void +fts_query_calculate_idf( +/*====================*/ + fts_query_t* query) /*!< in: Query state */ +{ + const ib_rbt_node_t* node; + ib_uint64_t total_docs = query->total_docs; + + /* Iterate over the terms and calculate each term's IDF. */ + for (node = rbt_first(query->word_freqs); + node; + node = rbt_next(query->word_freqs, node)) { + + fts_word_freq_t* word_freq; + + word_freq = rbt_value(fts_word_freq_t, node); + + if (word_freq->doc_count > 0) { + if (total_docs == word_freq->doc_count) { + /* QP assume ranking > 0 if we find + a match. Since Log10(1) = 0, we cannot + make IDF a zero value if we do find a + word in all documents. So let's make + it an arbitrary very small number */ + word_freq->idf = log10(1.0001); + } else { + word_freq->idf = log10( + static_cast<double>(total_docs) + / static_cast<double>( + word_freq->doc_count)); + } + } + } +} + +/*****************************************************************//** +Calculate the ranking of the document.
*/ +static +void +fts_query_calculate_ranking( +/*========================*/ + const fts_query_t* query, /*!< in: query state */ + fts_ranking_t* ranking) /*!< in: Document to rank */ +{ + ulint pos = 0; + fts_string_t word; + + /* At this stage, ranking->rank should not exceed the 1.0 + bound */ + ut_ad(ranking->rank <= 1.0 && ranking->rank >= -1.0); + ut_ad(rbt_size(query->word_map) == query->word_vector->size()); + + while (fts_ranking_words_get_next(query, ranking, &pos, &word)) { + int ret; + ib_rbt_bound_t parent; + double weight; + fts_doc_freq_t* doc_freq; + fts_word_freq_t* word_freq; + + ret = rbt_search(query->word_freqs, &parent, &word); + + /* It must exist. */ + ut_a(ret == 0); + + word_freq = rbt_value(fts_word_freq_t, parent.last); + + ret = rbt_search( + word_freq->doc_freqs, &parent, &ranking->doc_id); + + /* It must exist. */ + ut_a(ret == 0); + + doc_freq = rbt_value(fts_doc_freq_t, parent.last); + + weight = (double) doc_freq->freq * word_freq->idf; + + ranking->rank += (fts_rank_t) (weight * word_freq->idf); + } +} + +/*****************************************************************//** +Add ranking to the result set. */ +static +void +fts_query_add_ranking( +/*==================*/ + fts_query_t* query, /*!< in: query state */ + ib_rbt_t* ranking_tree, /*!< in: ranking tree */ + const fts_ranking_t* new_ranking) /*!< in: ranking of a document */ +{ + ib_rbt_bound_t parent; + + /* Lookup the ranking in our rb tree and add if it doesn't exist. */ + if (rbt_search(ranking_tree, &parent, new_ranking) == 0) { + fts_ranking_t* ranking; + + ranking = rbt_value(fts_ranking_t, parent.last); + + ranking->rank += new_ranking->rank; + + ut_a(ranking->words == NULL); + } else { + rbt_add_node(ranking_tree, &parent, new_ranking); + + query->total_size += SIZEOF_RBT_NODE_ADD + + sizeof(fts_ranking_t); + } +} + +/*****************************************************************//** +Retrieve the FTS Relevance Ranking result for doc with doc_id +@return the relevance ranking value, 0 if no ranking value +present. */ +float +fts_retrieve_ranking( +/*=================*/ + fts_result_t* result, /*!< in: FTS result structure */ + doc_id_t doc_id) /*!< in: doc_id of the item to retrieve */ +{ + ib_rbt_bound_t parent; + fts_ranking_t new_ranking; + + DBUG_ENTER("fts_retrieve_ranking"); + + if (!result || !result->rankings_by_id) { + DBUG_RETURN(0); + } + + new_ranking.doc_id = doc_id; + + /* Lookup the ranking in our rb tree */ + if (rbt_search(result->rankings_by_id, &parent, &new_ranking) == 0) { + fts_ranking_t* ranking; + + ranking = rbt_value(fts_ranking_t, parent.last); + + DBUG_RETURN(ranking->rank); + } + + DBUG_RETURN(0); +} + +/*****************************************************************//** +Create the result and copy the data to it. 
*/ +static +fts_result_t* +fts_query_prepare_result( +/*=====================*/ + fts_query_t* query, /*!< in: Query state */ + fts_result_t* result) /*!< in: result this can contain + data from a previous search on + another FTS index */ +{ + const ib_rbt_node_t* node; + bool result_is_null = false; + + DBUG_ENTER("fts_query_prepare_result"); + + if (result == NULL) { + result = static_cast<fts_result_t*>( + ut_zalloc_nokey(sizeof(*result))); + + result->rankings_by_id = rbt_create( + sizeof(fts_ranking_t), fts_ranking_doc_id_cmp); + + query->total_size += sizeof(fts_result_t) + SIZEOF_RBT_CREATE; + result_is_null = true; + } + + if (query->flags == FTS_OPT_RANKING) { + fts_word_freq_t* word_freq; + ulint size = ib_vector_size(query->deleted->doc_ids); + doc_id_t* updates = + (doc_id_t*) query->deleted->doc_ids->data; + + node = rbt_first(query->word_freqs); + ut_ad(node); + word_freq = rbt_value(fts_word_freq_t, node); + + for (node = rbt_first(word_freq->doc_freqs); + node; + node = rbt_next(word_freq->doc_freqs, node)) { + fts_doc_freq_t* doc_freq; + fts_ranking_t ranking; + + doc_freq = rbt_value(fts_doc_freq_t, node); + + /* Don't put deleted docs into result */ + if (fts_bsearch(updates, 0, static_cast<int>(size), + doc_freq->doc_id) >= 0) { + /* one less matching doc count */ + --word_freq->doc_count; + continue; + } + + ranking.doc_id = doc_freq->doc_id; + ranking.rank = static_cast<fts_rank_t>(doc_freq->freq); + ranking.words = NULL; + + fts_query_add_ranking(query, result->rankings_by_id, + &ranking); + + if (query->total_size > fts_result_cache_limit) { + query->error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT; + fts_query_free_result(result); + DBUG_RETURN(NULL); + } + } + + /* Calculate IDF only after we exclude the deleted items */ + fts_query_calculate_idf(query); + + node = rbt_first(query->word_freqs); + word_freq = rbt_value(fts_word_freq_t, node); + + /* Calculate the ranking for each doc */ + for (node = rbt_first(result->rankings_by_id); + node != NULL; + node = rbt_next(result->rankings_by_id, node)) { + + fts_ranking_t* ranking; + + ranking = rbt_value(fts_ranking_t, node); + + ranking->rank = static_cast<fts_rank_t>( + ranking->rank * word_freq->idf * word_freq->idf); + } + + DBUG_RETURN(result); + } + + ut_a(rbt_size(query->doc_ids) > 0); + + for (node = rbt_first(query->doc_ids); + node; + node = rbt_next(query->doc_ids, node)) { + + fts_ranking_t* ranking; + + ranking = rbt_value(fts_ranking_t, node); + fts_query_calculate_ranking(query, ranking); + + // FIXME: I think we may require this information to improve the + // ranking of doc ids which have more word matches from + // different FTS indexes. + + /* We don't need these anymore; free the resources. */ + ranking->words = NULL; + + if (!result_is_null) { + fts_query_add_ranking(query, result->rankings_by_id, ranking); + + if (query->total_size > fts_result_cache_limit) { + query->error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT; + fts_query_free_result(result); + DBUG_RETURN(NULL); + } + } + } + + if (result_is_null) { + /* Use doc_ids directly */ + rbt_free(result->rankings_by_id); + result->rankings_by_id = query->doc_ids; + query->doc_ids = NULL; + } + + DBUG_RETURN(result); +}
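The optimized single-term path above rescales raw frequencies by idf squared, the same weight fts_query_calculate_ranking() accumulates per word (rank += freq * idf * idf). A short numeric check of that weight:

    #include <cmath>
    #include <cstdio>

    // With 1000 documents total and a term occurring in 10 of them,
    // idf = log10(1000 / 10) = 2, so a document containing the term
    // 3 times contributes 3 * 2 * 2 = 12 before normalization.
    int main()
    {
        double total_docs = 1000.0;
        double doc_count = 10.0;  // documents containing the term
        double idf = std::log10(total_docs / doc_count);

        double freq = 3.0;        // occurrences in this document
        double rank = freq * idf * idf;

        std::printf("idf=%.2f rank=%.2f\n", idf, rank);
    }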
*/
+static
+fts_result_t*
+fts_query_get_result(
+/*=================*/
+	fts_query_t*	query,	/*!< in: query instance */
+	fts_result_t*	result)	/*!< in: result */
+{
+	DBUG_ENTER("fts_query_get_result");
+
+	if (rbt_size(query->doc_ids) > 0 || query->flags == FTS_OPT_RANKING) {
+		/* Copy the doc ids to the result. */
+		result = fts_query_prepare_result(query, result);
+	} else {
+		/* Create an empty result instance. */
+		result = static_cast<fts_result_t*>(
+			ut_zalloc_nokey(sizeof(*result)));
+	}
+
+	DBUG_RETURN(result);
+}
+
+/*****************************************************************//**
+FTS Query free resources and reset. */
+static
+void
+fts_query_free(
+/*===========*/
+	fts_query_t*	query)	/*!< in: query instance to free */
+{
+
+	if (query->read_nodes_graph) {
+		que_graph_free(query->read_nodes_graph);
+	}
+
+	if (query->root) {
+		fts_ast_free_node(query->root);
+	}
+
+	if (query->deleted) {
+		fts_doc_ids_free(query->deleted);
+	}
+
+	if (query->intersection) {
+		fts_query_free_doc_ids(query, query->intersection);
+	}
+
+	if (query->doc_ids) {
+		fts_query_free_doc_ids(query, query->doc_ids);
+	}
+
+	if (query->word_freqs) {
+		const ib_rbt_node_t*	node;
+
+		/* We need to free any instances of fts_doc_freq_t that we
+		may have allocated. */
+		for (node = rbt_first(query->word_freqs);
+		     node;
+		     node = rbt_next(query->word_freqs, node)) {
+
+			fts_word_freq_t*	word_freq;
+
+			word_freq = rbt_value(fts_word_freq_t, node);
+
+			/* We need to cast away the const. */
+			rbt_free(word_freq->doc_freqs);
+		}
+
+		rbt_free(query->word_freqs);
+	}
+
+	if (query->wildcard_words != NULL) {
+		rbt_free(query->wildcard_words);
+	}
+
+	ut_a(!query->intersection);
+
+	if (query->word_map) {
+		rbt_free(query->word_map);
+	}
+
+	if (query->word_vector != NULL) {
+		UT_DELETE(query->word_vector);
+	}
+
+	if (query->heap) {
+		mem_heap_free(query->heap);
+	}
+
+	memset(query, 0, sizeof(*query));
+}
+
+/*****************************************************************//**
+Parse the query using flex/bison or plugin parser.
+@return parse tree node. */
+static
+fts_ast_node_t*
+fts_query_parse(
+/*============*/
+	fts_query_t*	query,		/*!< in: query instance */
+	byte*		query_str,	/*!< in: query string */
+	ulint		query_len)	/*!< in: query string length */
+{
+	int		error;
+	fts_ast_state_t	state;
+	bool		mode = query->boolean_mode;
+	DBUG_ENTER("fts_query_parse");
+
+	memset(&state, 0x0, sizeof(state));
+
+	state.charset = query->fts_index_table.charset;
+
+	DBUG_EXECUTE_IF("fts_instrument_query_disable_parser",
+			query->parser = NULL;);
+
+	if (query->parser) {
+		state.root = state.cur_node =
+			fts_ast_create_node_list(&state, NULL);
+		error = fts_parse_by_parser(mode, query_str, query_len,
+					    query->parser, &state);
+	} else {
+		/* Setup the scanner to use, this depends on the mode flag. */
+		state.lexer = fts_lexer_create(mode, query_str, query_len);
+		state.charset = query->fts_index_table.charset;
+		error = fts_parse(&state);
+		fts_lexer_free(state.lexer);
+		state.lexer = NULL;
+	}
+
+	/* Error during parsing? */
+	if (error) {
+		/* Free the nodes that were allocated during parsing.
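fts_query_free() above has to release each per-word doc_freqs tree before releasing the word_freqs tree that owns the word entries. A minimal sketch of the same two-level ownership pattern (standalone C++, plain std::map in place of the rbt API; it assumes each inner tree was allocated with new):

#include <map>
#include <string>

struct word_freq {
	std::map<unsigned long, unsigned>* doc_freqs;	/* owned inner tree */
};

static void free_word_freqs(std::map<std::string, word_freq>& word_freqs)
{
	for (auto& w : word_freqs) {
		delete w.second.doc_freqs;	/* inner trees first */
		w.second.doc_freqs = nullptr;
	}
	word_freqs.clear();	/* then the outer tree */
}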
*/ + fts_ast_state_free(&state); + } else { + query->root = state.root; + + if (UNIV_UNLIKELY(fts_enable_diag_print) && query->root) { + fts_ast_node_print(query->root); + } + } + + DBUG_RETURN(state.root); +} + +/*******************************************************************//** +FTS Query optimization +Set FTS_OPT_RANKING if it is a simple term query */ +static +void +fts_query_can_optimize( +/*===================*/ + fts_query_t* query, /*!< in/out: query instance */ + uint flags) /*!< In: FTS search mode */ +{ + fts_ast_node_t* node = query->root; + + if (flags & FTS_EXPAND) { + return; + } + + /* Check if it has only a term without oper */ + ut_ad(node->type == FTS_AST_LIST); + node = node->list.head; + if (node != NULL && node->type == FTS_AST_TERM && node->next == NULL) { + query->flags = FTS_OPT_RANKING; + } +} + +/** FTS Query entry point. +@param[in,out] trx transaction +@param[in] index fts index to search +@param[in] flags FTS search mode +@param[in] query_str FTS query +@param[in] query_len FTS query string len in bytes +@param[in,out] result result doc ids +@return DB_SUCCESS if successful otherwise error code */ +dberr_t +fts_query( + trx_t* trx, + dict_index_t* index, + uint flags, + const byte* query_str, + ulint query_len, + fts_result_t** result) +{ + fts_query_t query; + dberr_t error = DB_SUCCESS; + byte* lc_query_str; + ulint lc_query_str_len; + ulint result_len; + bool boolean_mode; + trx_t* query_trx; /* FIXME: use provided trx */ + CHARSET_INFO* charset; + ulint start_time_ms; + bool will_be_ignored = false; + + boolean_mode = flags & FTS_BOOL; + + *result = NULL; + memset(&query, 0x0, sizeof(query)); + query_trx = trx_create(); + query_trx->op_info = "FTS query"; + + start_time_ms = ut_time_ms(); + + query.trx = query_trx; + query.index = index; + query.boolean_mode = boolean_mode; + query.deleted = fts_doc_ids_create(); + query.cur_node = NULL; + + query.fts_common_table.type = FTS_COMMON_TABLE; + query.fts_common_table.table_id = index->table->id; + query.fts_common_table.table = index->table; + + charset = fts_index_get_charset(index); + + query.fts_index_table.type = FTS_INDEX_TABLE; + query.fts_index_table.index_id = index->id; + query.fts_index_table.table_id = index->table->id; + query.fts_index_table.charset = charset; + query.fts_index_table.table = index->table; + + query.word_map = rbt_create_arg_cmp( + sizeof(fts_string_t), innobase_fts_text_cmp, (void*)charset); + query.word_vector = UT_NEW_NOKEY(word_vector_t()); + query.error = DB_SUCCESS; + + /* Setup the RB tree that will be used to collect per term + statistics. */ + query.word_freqs = rbt_create_arg_cmp( + sizeof(fts_word_freq_t), innobase_fts_text_cmp, + (void*) charset); + + if (flags & FTS_EXPAND) { + query.wildcard_words = rbt_create_arg_cmp( + sizeof(fts_string_t), innobase_fts_text_cmp, (void *)charset); + } + + query.total_size += SIZEOF_RBT_CREATE; + + query.total_docs = dict_table_get_n_rows(index->table); + + query.fts_common_table.suffix = "DELETED"; + + /* Read the deleted doc_ids, we need these for filtering. */ + error = fts_table_fetch_doc_ids( + NULL, &query.fts_common_table, query.deleted); + + if (error != DB_SUCCESS) { + goto func_exit; + } + + query.fts_common_table.suffix = "DELETED_CACHE"; + + error = fts_table_fetch_doc_ids( + NULL, &query.fts_common_table, query.deleted); + + if (error != DB_SUCCESS) { + goto func_exit; + } + + /* Get the deleted doc ids that are in the cache. 
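Putting the pieces together, a hedged usage sketch of the fts_query() entry point declared above (not part of the patch; it assumes the InnoDB headers are in scope, trx is a valid transaction, and index is a fulltext index obtained elsewhere; error handling is reduced to the minimum):

#include <cstring>

dberr_t run_boolean_search(trx_t* trx, dict_index_t* index)
{
	fts_result_t*	result = NULL;
	const char*	search = "+database -cache";

	dberr_t	err = fts_query(trx, index, FTS_BOOL,
				reinterpret_cast<const byte*>(search),
				strlen(search), &result);

	if (err == DB_SUCCESS && result != NULL) {
		fts_query_sort_result_on_rank(result);
		/* ... iterate result->rankings_by_rank ... */
		fts_query_free_result(result);
	}

	return(err);
}

fts_query_sort_result_on_rank() and fts_query_free_result() are the public helpers defined later in this file.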
*/
+	fts_cache_append_deleted_doc_ids(
+		index->table->fts->cache, query.deleted->doc_ids);
+	DEBUG_SYNC_C("fts_deleted_doc_ids_append");
+
+	/* Sort the vector so that we can do a binary search over the ids. */
+	ib_vector_sort(query.deleted->doc_ids, fts_doc_id_cmp);
+
+	/* Convert the query string to lower case before parsing. We own
+	the ut_malloc'ed result and so remember to free it before return. */
+
+	lc_query_str_len = query_len * charset->casedn_multiply() + 1;
+	lc_query_str = static_cast<byte*>(ut_malloc_nokey(lc_query_str_len));
+
+	/* For binary collations, a case sensitive search is
+	performed. Hence don't convert to lower case. */
+	if (my_binary_compare(charset)) {
+		memcpy(lc_query_str, query_str, query_len);
+		lc_query_str[query_len]= 0;
+		result_len= query_len;
+	} else {
+		result_len = innobase_fts_casedn_str(
+			charset, (char*) query_str, query_len,
+			(char*) lc_query_str, lc_query_str_len);
+	}
+
+	ut_ad(result_len < lc_query_str_len);
+
+	lc_query_str[result_len] = 0;
+
+	query.heap = mem_heap_create(128);
+
+	/* Create the rb tree for the doc id (current) set. */
+	query.doc_ids = rbt_create(
+		sizeof(fts_ranking_t), fts_ranking_doc_id_cmp);
+	query.parser = index->parser;
+
+	query.total_size += SIZEOF_RBT_CREATE;
+
+	/* Parse the input query string. */
+	if (fts_query_parse(&query, lc_query_str, result_len)) {
+		fts_ast_node_t*	ast = query.root;
+		ast->trx = trx;
+
+		/* Optimize query to check if it's a single term */
+		fts_query_can_optimize(&query, flags);
+
+		DBUG_EXECUTE_IF("fts_instrument_result_cache_limit",
+				fts_result_cache_limit = 2048;
+				);
+
+		/* Traverse the Abstract Syntax Tree (AST) and execute
+		the query. */
+		query.error = fts_ast_visit(
+			FTS_NONE, ast, fts_query_visitor,
+			&query, &will_be_ignored);
+		if (query.error == DB_INTERRUPTED) {
+			error = DB_INTERRUPTED;
+			ut_free(lc_query_str);
+			goto func_exit;
+		}
+
+		/* If query expansion is requested, extend the search
+		with first search pass result */
+		if (query.error == DB_SUCCESS && (flags & FTS_EXPAND)) {
+			query.error = fts_expand_query(index, &query);
+		}
+
+		/* Calculate the inverse document frequency of the terms. */
+		if (query.error == DB_SUCCESS
+		    && query.flags != FTS_OPT_RANKING) {
+			fts_query_calculate_idf(&query);
+		}
+
+		/* Copy the result from the query state, so that we can
+		return it to the caller. */
+		if (query.error == DB_SUCCESS) {
+			*result = fts_query_get_result(&query, *result);
+		}
+
+		error = query.error;
+	} else {
+		/* still return an empty result set */
+		*result = static_cast<fts_result_t*>(
+			ut_zalloc_nokey(sizeof(**result)));
+	}
+
+	if (trx_is_interrupted(trx)) {
+		error = DB_INTERRUPTED;
+		ut_free(lc_query_str);
+		if (*result) {
+			fts_query_free_result(*result);
+		}
+		goto func_exit;
+	}
+
+	ut_free(lc_query_str);
+
+	if (UNIV_UNLIKELY(fts_enable_diag_print) && (*result)) {
+		ulint	diff_time = ut_time_ms() - start_time_ms;
+
+		ib::info() << "FTS Search Processing time: "
+			<< diff_time / 1000 << " secs: " << diff_time % 1000
+			<< " millisec: row(s) "
+			<< ((*result)->rankings_by_id
+			    ? lint(rbt_size((*result)->rankings_by_id))
+			    : -1);
+
+		/* Log memory consumption & result size */
+		ib::info() << "Full Search Memory: " << query.total_size
+			<< " (bytes), Row: "
+			<< ((*result)->rankings_by_id
+			    ? rbt_size((*result)->rankings_by_id)
+			    : 0)
+			<< ".";
+	}
+
+func_exit:
+	fts_query_free(&query);
+
+	query_trx->free();
+
+	return(error);
+}
+
+/*****************************************************************//**
+FTS Query free result, returned by fts_query().
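One detail worth calling out from fts_query() above: the lower-casing buffer is sized as query_len * charset->casedn_multiply() + 1, because case folding can expand a multi-byte string. A standalone sketch of the same sizing rule (not part of the patch; the memcpy stands in for the charset's real case-folding routine, and the multiplier is a made-up input):

#include <cstdlib>
#include <cstring>

char* alloc_folded_buf(const char* src, size_t len, size_t casedn_multiply)
{
	/* Worst-case growth plus one byte for the NUL terminator. */
	size_t	dst_len = len * casedn_multiply + 1;
	char*	dst = static_cast<char*>(malloc(dst_len));

	if (dst != NULL) {
		/* A real implementation calls the charset's casedn()
		here; this sketch just copies and terminates. */
		memcpy(dst, src, len);
		dst[len] = '\0';
	}

	return dst;
}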
*/ +void +fts_query_free_result( +/*==================*/ + fts_result_t* result) /*!< in: result instance to free.*/ +{ + if (result) { + if (result->rankings_by_id != NULL) { + rbt_free(result->rankings_by_id); + result->rankings_by_id = NULL; + } + if (result->rankings_by_rank != NULL) { + rbt_free(result->rankings_by_rank); + result->rankings_by_rank = NULL; + } + + ut_free(result); + result = NULL; + } +} + +/*****************************************************************//** +FTS Query sort result, returned by fts_query() on fts_ranking_t::rank. */ +void +fts_query_sort_result_on_rank( +/*==========================*/ + fts_result_t* result) /*!< out: result instance to sort.*/ +{ + const ib_rbt_node_t* node; + ib_rbt_t* ranked; + + ut_a(result->rankings_by_id != NULL); + if (result->rankings_by_rank) { + rbt_free(result->rankings_by_rank); + } + + ranked = rbt_create(sizeof(fts_ranking_t), fts_query_compare_rank); + + /* We need to free any instances of fts_doc_freq_t that we + may have allocated. */ + for (node = rbt_first(result->rankings_by_id); + node; + node = rbt_next(result->rankings_by_id, node)) { + + fts_ranking_t* ranking; + + ranking = rbt_value(fts_ranking_t, node); + + ut_a(ranking->words == NULL); + + rbt_insert(ranked, ranking, ranking); + } + + /* Reset the current node too. */ + result->current = NULL; + result->rankings_by_rank = ranked; +} + +/*******************************************************************//** +A debug function to print result doc_id set. */ +static +void +fts_print_doc_id( +/*=============*/ + fts_query_t* query) /*!< in : tree that stores doc_ids.*/ +{ + const ib_rbt_node_t* node; + + /* Iterate each member of the doc_id set */ + for (node = rbt_first(query->doc_ids); + node; + node = rbt_next(query->doc_ids, node)) { + fts_ranking_t* ranking; + ranking = rbt_value(fts_ranking_t, node); + + ib::info() << "doc_ids info, doc_id: " << ranking->doc_id; + + ulint pos = 0; + fts_string_t word; + + while (fts_ranking_words_get_next(query, ranking, &pos, &word)) { + ib::info() << "doc_ids info, value: " << word.f_str; + } + } +} + +/*************************************************************//** +This function implements a simple "blind" query expansion search: +words in documents found in the first search pass will be used as +search arguments to search the document again, thus "expand" +the search result set. 
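The expansion scheme described above is plain blind relevance feedback: tokens harvested from the first-pass matches, minus the words already searched, become the second-pass search arguments. A compressed standalone sketch with std containers (InnoDB keeps these sets in rb-trees and also strips words covered by wildcard prefixes, which this sketch omits):

#include <set>
#include <string>
#include <vector>

std::set<std::string> expand_terms(
	const std::vector<std::vector<std::string>>& first_pass_docs,
	const std::set<std::string>& query_words)
{
	std::set<std::string> expanded;

	for (const auto& doc : first_pass_docs) {	/* tokens per match */
		for (const auto& tok : doc) {
			if (query_words.count(tok) == 0) {
				expanded.insert(tok);	/* new argument */
			}
		}
	}

	return expanded;	/* each term is then OR-ed into the query */
}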
+@return DB_SUCCESS if success, otherwise the error code */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +fts_expand_query( +/*=============*/ + dict_index_t* index, /*!< in: FTS index to search */ + fts_query_t* query) /*!< in: FTS query instance */ +{ + const ib_rbt_node_t* node; + const ib_rbt_node_t* token_node; + fts_doc_t result_doc; + dberr_t error = DB_SUCCESS; + const fts_index_cache_t*index_cache; + + /* If no doc is found in first search pass, return */ + if (!rbt_size(query->doc_ids)) { + return(error); + } + + /* Init "result_doc", to hold words from the first search pass */ + fts_doc_init(&result_doc); + + mysql_mutex_lock(&index->table->fts->cache->lock); + index_cache = fts_find_index_cache(index->table->fts->cache, index); + mysql_mutex_unlock(&index->table->fts->cache->lock); + + ut_a(index_cache); + + result_doc.tokens = rbt_create_arg_cmp( + sizeof(fts_token_t), innobase_fts_text_cmp, + (void*) index_cache->charset); + + result_doc.charset = index_cache->charset; + result_doc.parser = index_cache->index->parser; + + query->total_size += SIZEOF_RBT_CREATE; + + if (UNIV_UNLIKELY(fts_enable_diag_print)) { + fts_print_doc_id(query); + } + + for (node = rbt_first(query->doc_ids); + node; + node = rbt_next(query->doc_ids, node)) { + + fts_ranking_t* ranking; + ulint prev_token_size; + ulint estimate_size; + + prev_token_size = rbt_size(result_doc.tokens); + + ranking = rbt_value(fts_ranking_t, node); + + /* Fetch the documents with the doc_id from the + result of first seach pass. Since we do not + store document-to-word mapping, we need to + fetch the original document and parse them. + Future optimization could be done here if we + support some forms of document-to-word mapping */ + fts_doc_fetch_by_doc_id(NULL, ranking->doc_id, index, + FTS_FETCH_DOC_BY_ID_EQUAL, + fts_query_expansion_fetch_doc, + &result_doc); + + /* Estimate memory used, see fts_process_token and fts_token_t. + We ignore token size here. */ + estimate_size = (rbt_size(result_doc.tokens) - prev_token_size) + * (SIZEOF_RBT_NODE_ADD + sizeof(fts_token_t) + + sizeof(ib_vector_t) + sizeof(ulint) * 32); + query->total_size += estimate_size; + + if (query->total_size > fts_result_cache_limit) { + error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT; + goto func_exit; + } + } + + /* Remove words that have already been searched in the first pass */ + for (ulint i = 0; i < query->word_vector->size(); i++) { + fts_string_t word = query->word_vector->at(i); + ib_rbt_bound_t parent; + + if (query->wildcard_words + && rbt_search(query->wildcard_words, &parent, &word) == 0) { + /* If it's a wildcard word, remove words having + it as prefix. */ + while (rbt_search_cmp(result_doc.tokens, + &parent, &word, NULL, + innobase_fts_text_cmp_prefix) + == 0) { + ut_free(rbt_remove_node(result_doc.tokens, + parent.last)); + } + } else { + /* We don't check return value, because the word may + have been deleted by a previous wildcard word as its + prefix, e.g. ('g * good'). */ + rbt_delete(result_doc.tokens, &word); + } + } + + /* Search the table the second time with expanded search list */ + for (token_node = rbt_first(result_doc.tokens); + token_node; + token_node = rbt_next(result_doc.tokens, token_node)) { + fts_token_t* mytoken; + mytoken = rbt_value(fts_token_t, token_node); + + /* '%' in the end is treated as prefix search, + it can cause assert failure, so we skip it. 
*/ + if (mytoken->text.f_str[mytoken->text.f_len - 1] == '%') { + continue; + } + + ut_ad(mytoken->text.f_str[mytoken->text.f_len] == 0); + fts_query_add_word_freq(query, &mytoken->text); + error = fts_query_union(query, &mytoken->text); + + if (error != DB_SUCCESS) { + break; + } + } + +func_exit: + fts_doc_free(&result_doc); + + return(error); +} +/*************************************************************//** +This function finds documents that contain all words in a +phrase or proximity search. And if proximity search, verify +the words are close enough to each other, as in specified distance. +This function is called for phrase and proximity search. +@return TRUE if documents are found, FALSE if otherwise */ +static +ibool +fts_phrase_or_proximity_search( +/*===========================*/ + fts_query_t* query, /*!< in/out: query instance. + query->doc_ids might be instantiated + with qualified doc IDs */ + ib_vector_t* tokens) /*!< in: Tokens contain words */ +{ + ulint n_matched; + ulint i; + ibool matched = FALSE; + ulint num_token = ib_vector_size(tokens); + fts_match_t* match[MAX_PROXIMITY_ITEM]; + ibool end_list = FALSE; + + /* Number of matched documents for the first token */ + n_matched = ib_vector_size(query->match_array[0]); + + /* We have a set of match list for each word, we shall + walk through the list and find common documents that + contain all the matching words. */ + for (i = 0; i < n_matched; i++) { + ulint j; + ulint k = 0; + fts_proximity_t qualified_pos; + + match[0] = static_cast( + ib_vector_get(query->match_array[0], i)); + + /* For remaining match list for the token(word), we + try to see if there is a document with the same + doc id */ + for (j = 1; j < num_token; j++) { + match[j] = static_cast( + ib_vector_get(query->match_array[j], k)); + + while (match[j]->doc_id < match[0]->doc_id + && k < ib_vector_size(query->match_array[j])) { + match[j] = static_cast( + ib_vector_get( + query->match_array[j], k)); + k++; + } + + if (match[j]->doc_id > match[0]->doc_id) { + /* no match */ + if (query->flags & FTS_PHRASE) { + match[0]->doc_id = 0; + } + break; + } + + if (k == ib_vector_size(query->match_array[j])) { + end_list = TRUE; + + if (query->flags & FTS_PHRASE) { + ulint s; + /* Since i is the last doc id in the + match_array[j], remove all doc ids > i + from the match_array[0]. */ + fts_match_t* match_temp; + for (s = i + 1; s < n_matched; s++) { + match_temp = static_cast< + fts_match_t*>(ib_vector_get( + query->match_array[0], s)); + match_temp->doc_id = 0; + } + + if (match[j]->doc_id != + match[0]->doc_id) { + /* no match */ + match[0]->doc_id = 0; + } + } + + if (match[j]->doc_id != match[0]->doc_id) { + goto func_exit; + } + } + + /* FIXME: A better solution will be a counter array + remember each run's last position. 
So we don't + reset it here very time */ + k = 0; + } + + if (j != num_token) { + continue; + } + + /* For this matching doc, we need to further + verify whether the words in the doc are close + to each other, and within the distance specified + in the proximity search */ + if (query->flags & FTS_PHRASE) { + matched = TRUE; + } else if (fts_proximity_get_positions( + match, num_token, ULINT_MAX, &qualified_pos)) { + + /* Fetch the original documents and count the + words in between matching words to see that is in + specified distance */ + if (fts_query_is_in_proximity_range( + query, match, &qualified_pos)) { + /* If so, mark we find a matching doc */ + query->error = fts_query_process_doc_id( + query, match[0]->doc_id, 0); + if (query->error != DB_SUCCESS) { + matched = FALSE; + goto func_exit; + } + + matched = TRUE; + for (ulint z = 0; z < num_token; z++) { + fts_string_t* token; + token = static_cast( + ib_vector_get(tokens, z)); + fts_query_add_word_to_document( + query, match[0]->doc_id, token); + } + } + } + + if (end_list) { + break; + } + } + +func_exit: + return(matched); +} + +/*************************************************************//** +This function checks whether words in result documents are close to +each other (within proximity range as specified by "distance"). +If "distance" is MAX_ULINT, then it will find all combinations of +positions of matching words and store min and max positions +in the "qualified_pos" for later verification. +@return true if words are close to each other, false if otherwise */ +static +bool +fts_proximity_get_positions( +/*========================*/ + fts_match_t** match, /*!< in: query instance */ + ulint num_match, /*!< in: number of matching + items */ + ulint distance, /*!< in: distance value + for proximity search */ + fts_proximity_t* qualified_pos) /*!< out: the position info + records ranges containing + all matching words. */ +{ + ulint i; + ulint idx[MAX_PROXIMITY_ITEM]; + ulint num_pos[MAX_PROXIMITY_ITEM]; + ulint min_idx; + + qualified_pos->n_pos = 0; + + ut_a(num_match <= MAX_PROXIMITY_ITEM); + + /* Each word could appear multiple times in a doc. So + we need to walk through each word's position list, and find + closest distance between different words to see if + they are in the proximity distance. 
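fts_proximity_get_positions(), whose walk is being described here, advances one cursor per word over sorted position lists and tests the min/max window at each step. A standalone C++ sketch of that check (not part of the patch; it assumes every word has at least one position and returns at the first window that fits, whereas the real function records every qualifying window for later verification):

#include <cstddef>
#include <vector>

bool within_distance(const std::vector<std::vector<unsigned long>>& pos,
		     unsigned long distance)
{
	std::vector<size_t> idx(pos.size(), 0);	/* one cursor per word */

	for (;;) {
		unsigned long	min_pos = ~0UL;
		unsigned long	max_pos = 0;
		size_t		min_word = 0;

		/* Current window: one position per word. */
		for (size_t w = 0; w < pos.size(); w++) {
			unsigned long p = pos[w][idx[w]];
			if (p < min_pos) { min_pos = p; min_word = w; }
			if (p > max_pos) { max_pos = p; }
		}

		if (max_pos - min_pos <= distance) {
			return true;	/* all words fit in one window */
		}

		/* Advance the list holding the smallest position. */
		if (++idx[min_word] >= pos[min_word].size()) {
			return false;	/* that list is exhausted */
		}
	}
}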
*/ + + /* Assume each word's position list is sorted, we + will just do a walk through to all words' lists + similar to a the merge phase of a merge sort */ + for (i = 0; i < num_match; i++) { + /* idx is the current position we are checking + for a particular word */ + idx[i] = 0; + + /* Number of positions for this word */ + num_pos[i] = ib_vector_size(match[i]->positions); + } + + /* Start with the first word */ + min_idx = 0; + + while (idx[min_idx] < num_pos[min_idx]) { + ulint position[MAX_PROXIMITY_ITEM]; + ulint min_pos = ULINT_MAX; + ulint max_pos = 0; + + /* Check positions in each word position list, and + record the max/min position */ + for (i = 0; i < num_match; i++) { + position[i] = *(ulint*) ib_vector_get_const( + match[i]->positions, idx[i]); + + if (position[i] == ULINT_UNDEFINED) { + break; + } + + if (position[i] < min_pos) { + min_pos = position[i]; + min_idx = i; + } + + if (position[i] > max_pos) { + max_pos = position[i]; + } + } + + /* If max and min position are within range, we + find a good match */ + if (max_pos - min_pos <= distance + && (i >= num_match || position[i] != ULINT_UNDEFINED)) { + /* The charset has variable character + length encoding, record the min_pos and + max_pos, we will need to verify the actual + number of characters */ + qualified_pos->min_pos.push_back(min_pos); + qualified_pos->max_pos.push_back(max_pos); + qualified_pos->n_pos++; + } + + /* Otherwise, move to the next position is the + list for the word with the smallest position */ + idx[min_idx]++; + } + + return(qualified_pos->n_pos != 0); +} diff --git a/storage/innobase/fts/fts0sql.cc b/storage/innobase/fts/fts0sql.cc new file mode 100644 index 00000000..1970f6f5 --- /dev/null +++ b/storage/innobase/fts/fts0sql.cc @@ -0,0 +1,208 @@ +/***************************************************************************** + +Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file fts/fts0sql.cc +Full Text Search functionality. + +Created 2007-03-27 Sunny Bains +*******************************************************/ + +#include "que0que.h" +#include "trx0roll.h" +#include "pars0pars.h" +#include "dict0dict.h" +#include "fts0types.h" +#include "fts0priv.h" + +/** SQL statements for creating the ancillary FTS tables. */ + +/** Preamble to all SQL statements. */ +static const char* fts_sql_begin= + "PROCEDURE P() IS\n"; + +/** Postamble to non-committing SQL statements. */ +static const char* fts_sql_end= + "\n" + "END;\n"; + +/******************************************************************//** +Get the table id. 
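Every statement handed to fts_parse_sql() below is bracketed by fts_sql_begin and fts_sql_end via ut_str3cat, so the InnoDB parser always receives a complete procedure. A trivial standalone sketch of that wrapping (the DELETE body in the comment is an invented example):

#include <string>

std::string wrap_fts_sql(const std::string& body)
{
	/* Mirrors ut_str3cat(fts_sql_begin, sql, fts_sql_end). */
	return std::string("PROCEDURE P() IS\n") + body + "\nEND;\n";
}

/* e.g. wrap_fts_sql("BEGIN\nDELETE FROM $table_name;\nEND;") yields the
full procedure text that pars_sql() is given. */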
+@return number of bytes written */ +int +fts_get_table_id( +/*=============*/ + const fts_table_t* + fts_table, /*!< in: FTS Auxiliary table */ + char* table_id) /*!< out: table id, must be at least + FTS_AUX_MIN_TABLE_ID_LENGTH bytes + long */ +{ + int len; + + ut_a(fts_table->table != NULL); + + switch (fts_table->type) { + case FTS_COMMON_TABLE: + len = fts_write_object_id(fts_table->table_id, table_id); + break; + + case FTS_INDEX_TABLE: + + len = fts_write_object_id(fts_table->table_id, table_id); + + table_id[len] = '_'; + ++len; + table_id += len; + + len += fts_write_object_id(fts_table->index_id, table_id); + break; + + default: + ut_error; + } + + ut_a(len >= 16); + ut_a(len < FTS_AUX_MIN_TABLE_ID_LENGTH); + + return(len); +} + +/** Construct the name of an internal FTS table for the given table. +@param[in] fts_table metadata on fulltext-indexed table +@param[out] table_name a name up to MAX_FULL_NAME_LEN +@param[in] dict_locked whether dict_sys.latch is being held */ +void fts_get_table_name(const fts_table_t* fts_table, char* table_name, + bool dict_locked) +{ + if (!dict_locked) { + dict_sys.freeze(SRW_LOCK_CALL); + } + ut_ad(dict_sys.frozen()); + /* Include the separator as well. */ + const size_t dbname_len = fts_table->table->name.dblen() + 1; + ut_ad(dbname_len > 1); + memcpy(table_name, fts_table->table->name.m_name, dbname_len); + if (!dict_locked) { + dict_sys.unfreeze(); + } + memcpy(table_name += dbname_len, "FTS_", 4); + table_name += 4; + table_name += fts_get_table_id(fts_table, table_name); + *table_name++ = '_'; + strcpy(table_name, fts_table->suffix); +} + +/******************************************************************//** +Parse an SQL string. +@return query graph */ +que_t* +fts_parse_sql( +/*==========*/ + fts_table_t* fts_table, /*!< in: FTS auxiliarry table info */ + pars_info_t* info, /*!< in: info struct, or NULL */ + const char* sql) /*!< in: SQL string to evaluate */ +{ + char* str; + que_t* graph; + ibool dict_locked; + + str = ut_str3cat(fts_sql_begin, sql, fts_sql_end); + + dict_locked = (fts_table && fts_table->table->fts + && fts_table->table->fts->dict_locked); + + if (!dict_locked) { + /* The InnoDB SQL parser is not re-entrant. */ + dict_sys.lock(SRW_LOCK_CALL); + } + + graph = pars_sql(info, str); + ut_a(graph); + + if (!dict_locked) { + dict_sys.unlock(); + } + + ut_free(str); + + return(graph); +} + +/******************************************************************//** +Evaluate an SQL query graph. +@return DB_SUCCESS or error code */ +dberr_t +fts_eval_sql( +/*=========*/ + trx_t* trx, /*!< in: transaction */ + que_t* graph) /*!< in: Query graph to evaluate */ +{ + que_thr_t* thr; + + graph->trx = trx; + + ut_a(thr = que_fork_start_command(graph)); + + que_run_threads(thr); + + return(trx->error_state); +} + +/******************************************************************//** +Construct the column specification part of the SQL string for selecting the +indexed FTS columns for the given table. Adds the necessary bound +ids to the given 'info' and returns the SQL string. 
Examples: + +One indexed column named "text": + + "$sel0", + info/ids: sel0 -> "text" + +Two indexed columns named "subject" and "content": + + "$sel0, $sel1", + info/ids: sel0 -> "subject", sel1 -> "content", +@return heap-allocated WHERE string */ +const char* +fts_get_select_columns_str( +/*=======================*/ + dict_index_t* index, /*!< in: index */ + pars_info_t* info, /*!< in/out: parser info */ + mem_heap_t* heap) /*!< in: memory heap */ +{ + ulint i; + const char* str = ""; + + for (i = 0; i < index->n_user_defined_cols; i++) { + char* sel_str; + + dict_field_t* field = dict_index_get_nth_field(index, i); + + sel_str = mem_heap_printf(heap, "sel%lu", (ulong) i); + + /* Set copy_name to TRUE since it's dynamic. */ + pars_info_bind_id(info, sel_str, field->name); + + str = mem_heap_printf( + heap, "%s%s$%s", str, (*str) ? ", " : "", sel_str); + } + + return(str); +} diff --git a/storage/innobase/fts/fts0tlex.cc b/storage/innobase/fts/fts0tlex.cc new file mode 100644 index 00000000..29f73f23 --- /dev/null +++ b/storage/innobase/fts/fts0tlex.cc @@ -0,0 +1,2169 @@ +#include "univ.i" +#line 2 "fts0tlex.cc" + +#line 4 "fts0tlex.cc" + +#define YY_INT_ALIGNED short int + +/* A lexical scanner generated by flex */ + +#define FLEX_SCANNER +#define YY_FLEX_MAJOR_VERSION 2 +#define YY_FLEX_MINOR_VERSION 6 +#define YY_FLEX_SUBMINOR_VERSION 4 +#if YY_FLEX_SUBMINOR_VERSION > 0 +#define FLEX_BETA +#endif + +#ifdef yy_create_buffer +#define fts0t_create_buffer_ALREADY_DEFINED +#else +#define yy_create_buffer fts0t_create_buffer +#endif + +#ifdef yy_delete_buffer +#define fts0t_delete_buffer_ALREADY_DEFINED +#else +#define yy_delete_buffer fts0t_delete_buffer +#endif + +#ifdef yy_scan_buffer +#define fts0t_scan_buffer_ALREADY_DEFINED +#else +#define yy_scan_buffer fts0t_scan_buffer +#endif + +#ifdef yy_scan_string +#define fts0t_scan_string_ALREADY_DEFINED +#else +#define yy_scan_string fts0t_scan_string +#endif + +#ifdef yy_scan_bytes +#define fts0t_scan_bytes_ALREADY_DEFINED +#else +#define yy_scan_bytes fts0t_scan_bytes +#endif + +#ifdef yy_init_buffer +#define fts0t_init_buffer_ALREADY_DEFINED +#else +#define yy_init_buffer fts0t_init_buffer +#endif + +#ifdef yy_flush_buffer +#define fts0t_flush_buffer_ALREADY_DEFINED +#else +#define yy_flush_buffer fts0t_flush_buffer +#endif + +#ifdef yy_load_buffer_state +#define fts0t_load_buffer_state_ALREADY_DEFINED +#else +#define yy_load_buffer_state fts0t_load_buffer_state +#endif + +#ifdef yy_switch_to_buffer +#define fts0t_switch_to_buffer_ALREADY_DEFINED +#else +#define yy_switch_to_buffer fts0t_switch_to_buffer +#endif + +#ifdef yypush_buffer_state +#define fts0tpush_buffer_state_ALREADY_DEFINED +#else +#define yypush_buffer_state fts0tpush_buffer_state +#endif + +#ifdef yypop_buffer_state +#define fts0tpop_buffer_state_ALREADY_DEFINED +#else +#define yypop_buffer_state fts0tpop_buffer_state +#endif + +#ifdef yyensure_buffer_stack +#define fts0tensure_buffer_stack_ALREADY_DEFINED +#else +#define yyensure_buffer_stack fts0tensure_buffer_stack +#endif + +#ifdef yylex +#define fts0tlex_ALREADY_DEFINED +#else +#define yylex fts0tlex +#endif + +#ifdef yyrestart +#define fts0trestart_ALREADY_DEFINED +#else +#define yyrestart fts0trestart +#endif + +#ifdef yylex_init +#define fts0tlex_init_ALREADY_DEFINED +#else +#define yylex_init fts0tlex_init +#endif + +#ifdef yylex_init_extra +#define fts0tlex_init_extra_ALREADY_DEFINED +#else +#define yylex_init_extra fts0tlex_init_extra +#endif + +#ifdef yylex_destroy +#define fts0tlex_destroy_ALREADY_DEFINED 
+#else +#define yylex_destroy fts0tlex_destroy +#endif + +#ifdef yyget_debug +#define fts0tget_debug_ALREADY_DEFINED +#else +#define yyget_debug fts0tget_debug +#endif + +#ifdef yyset_debug +#define fts0tset_debug_ALREADY_DEFINED +#else +#define yyset_debug fts0tset_debug +#endif + +#ifdef yyget_extra +#define fts0tget_extra_ALREADY_DEFINED +#else +#define yyget_extra fts0tget_extra +#endif + +#ifdef yyset_extra +#define fts0tset_extra_ALREADY_DEFINED +#else +#define yyset_extra fts0tset_extra +#endif + +#ifdef yyget_in +#define fts0tget_in_ALREADY_DEFINED +#else +#define yyget_in fts0tget_in +#endif + +#ifdef yyset_in +#define fts0tset_in_ALREADY_DEFINED +#else +#define yyset_in fts0tset_in +#endif + +#ifdef yyget_out +#define fts0tget_out_ALREADY_DEFINED +#else +#define yyget_out fts0tget_out +#endif + +#ifdef yyset_out +#define fts0tset_out_ALREADY_DEFINED +#else +#define yyset_out fts0tset_out +#endif + +#ifdef yyget_leng +#define fts0tget_leng_ALREADY_DEFINED +#else +#define yyget_leng fts0tget_leng +#endif + +#ifdef yyget_text +#define fts0tget_text_ALREADY_DEFINED +#else +#define yyget_text fts0tget_text +#endif + +#ifdef yyget_lineno +#define fts0tget_lineno_ALREADY_DEFINED +#else +#define yyget_lineno fts0tget_lineno +#endif + +#ifdef yyset_lineno +#define fts0tset_lineno_ALREADY_DEFINED +#else +#define yyset_lineno fts0tset_lineno +#endif + +#ifdef yyget_column +#define fts0tget_column_ALREADY_DEFINED +#else +#define yyget_column fts0tget_column +#endif + +#ifdef yyset_column +#define fts0tset_column_ALREADY_DEFINED +#else +#define yyset_column fts0tset_column +#endif + +#ifdef yywrap +#define fts0twrap_ALREADY_DEFINED +#else +#define yywrap fts0twrap +#endif + +#ifdef yyalloc +#define fts0talloc_ALREADY_DEFINED +#else +#define yyalloc fts0talloc +#endif + +#ifdef yyrealloc +#define fts0trealloc_ALREADY_DEFINED +#else +#define yyrealloc fts0trealloc +#endif + +#ifdef yyfree +#define fts0tfree_ALREADY_DEFINED +#else +#define yyfree fts0tfree +#endif + +/* First, we deal with platform-specific or compiler-specific issues. */ + +/* begin standard C headers. */ +#include +#include +#include +#include + +/* end standard C headers. */ + +/* flex integer type definitions */ + +#ifndef FLEXINT_H +#define FLEXINT_H + +/* C99 systems have . Non-C99 systems may or may not. */ + +#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + +/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h, + * if you want the limit (max/min) macros for int types. + */ +#ifndef __STDC_LIMIT_MACROS +#define __STDC_LIMIT_MACROS 1 +#endif + +#include +typedef int8_t flex_int8_t; +typedef uint8_t flex_uint8_t; +typedef int16_t flex_int16_t; +typedef uint16_t flex_uint16_t; +typedef int32_t flex_int32_t; +typedef uint32_t flex_uint32_t; +#else +typedef signed char flex_int8_t; +typedef short int flex_int16_t; +typedef int flex_int32_t; +typedef unsigned char flex_uint8_t; +typedef unsigned short int flex_uint16_t; +typedef unsigned int flex_uint32_t; + +/* Limits of integral types. 
*/ +#ifndef INT8_MIN +#define INT8_MIN (-128) +#endif +#ifndef INT16_MIN +#define INT16_MIN (-32767-1) +#endif +#ifndef INT32_MIN +#define INT32_MIN (-2147483647-1) +#endif +#ifndef INT8_MAX +#define INT8_MAX (127) +#endif +#ifndef INT16_MAX +#define INT16_MAX (32767) +#endif +#ifndef INT32_MAX +#define INT32_MAX (2147483647) +#endif +#ifndef UINT8_MAX +#define UINT8_MAX (255U) +#endif +#ifndef UINT16_MAX +#define UINT16_MAX (65535U) +#endif +#ifndef UINT32_MAX +#define UINT32_MAX (4294967295U) +#endif + +#ifndef SIZE_MAX +#define SIZE_MAX (~(size_t)0) +#endif + +#endif /* ! C99 */ + +#endif /* ! FLEXINT_H */ + +/* begin standard C++ headers. */ + +/* TODO: this is always defined, so inline it */ +#define yyconst const + +#if defined(__GNUC__) && __GNUC__ >= 3 +#define yynoreturn __attribute__((__noreturn__)) +#else +#define yynoreturn +#endif + +/* Returned upon end-of-file. */ +#define YY_NULL 0 + +/* Promotes a possibly negative, possibly signed char to an + * integer in range [0..255] for use as an array index. + */ +#define YY_SC_TO_UI(c) ((YY_CHAR) (c)) + +/* An opaque pointer. */ +#ifndef YY_TYPEDEF_YY_SCANNER_T +#define YY_TYPEDEF_YY_SCANNER_T +typedef void* yyscan_t; +#endif + +/* For convenience, these vars (plus the bison vars far below) + are macros in the reentrant scanner. */ +#define yyin yyg->yyin_r +#define yyout yyg->yyout_r +#define yyextra yyg->yyextra_r +#define yyleng yyg->yyleng_r +#define yytext yyg->yytext_r +#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno) +#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column) +#define yy_flex_debug yyg->yy_flex_debug_r + +/* Enter a start condition. This macro really ought to take a parameter, + * but we do it the disgusting crufty way forced on us by the ()-less + * definition of BEGIN. + */ +#define BEGIN yyg->yy_start = 1 + 2 * +/* Translate the current start state into a value that can be later handed + * to BEGIN to return to the state. The YYSTATE alias is for lex + * compatibility. + */ +#define YY_START ((yyg->yy_start - 1) / 2) +#define YYSTATE YY_START +/* Action number for EOF rule of a given start state. */ +#define YY_STATE_EOF(state) (YY_END_OF_BUFFER + state + 1) +/* Special action meaning "start processing a new file". */ +#define YY_NEW_FILE yyrestart( yyin , yyscanner ) +#define YY_END_OF_BUFFER_CHAR 0 + +/* Size of default input buffer. */ +#ifndef YY_BUF_SIZE +#ifdef __ia64__ +/* On IA-64, the buffer size is 16k, not 8k. + * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case. + * Ditto for the __ia64__ case accordingly. + */ +#define YY_BUF_SIZE 32768 +#else +#define YY_BUF_SIZE 16384 +#endif /* __ia64__ */ +#endif + +/* The state buf must be large enough to hold one state per character in the main buffer. + */ +#define YY_STATE_BUF_SIZE ((YY_BUF_SIZE + 2) * sizeof(yy_state_type)) + +#ifndef YY_TYPEDEF_YY_BUFFER_STATE +#define YY_TYPEDEF_YY_BUFFER_STATE +typedef struct yy_buffer_state *YY_BUFFER_STATE; +#endif + +#ifndef YY_TYPEDEF_YY_SIZE_T +#define YY_TYPEDEF_YY_SIZE_T +typedef size_t yy_size_t; +#endif + +#define EOB_ACT_CONTINUE_SCAN 0 +#define EOB_ACT_END_OF_FILE 1 +#define EOB_ACT_LAST_MATCH 2 + + #define YY_LESS_LINENO(n) + #define YY_LINENO_REWIND_TO(ptr) + +/* Return all but the first "n" matched characters back to the input stream. */ +#define yyless(n) \ + do \ + { \ + /* Undo effects of setting up yytext. 
*/ \ + int yyless_macro_arg = (n); \ + YY_LESS_LINENO(yyless_macro_arg);\ + *yy_cp = yyg->yy_hold_char; \ + YY_RESTORE_YY_MORE_OFFSET \ + yyg->yy_c_buf_p = yy_cp = yy_bp + yyless_macro_arg - YY_MORE_ADJ; \ + YY_DO_BEFORE_ACTION; /* set up yytext again */ \ + } \ + while ( 0 ) +#define unput(c) yyunput( c, yyg->yytext_ptr , yyscanner ) + +#ifndef YY_STRUCT_YY_BUFFER_STATE +#define YY_STRUCT_YY_BUFFER_STATE +struct yy_buffer_state + { + FILE *yy_input_file; + + char *yy_ch_buf; /* input buffer */ + char *yy_buf_pos; /* current position in input buffer */ + + /* Size of input buffer in bytes, not including room for EOB + * characters. + */ + int yy_buf_size; + + /* Number of characters read into yy_ch_buf, not including EOB + * characters. + */ + int yy_n_chars; + + /* Whether we "own" the buffer - i.e., we know we created it, + * and can realloc() it to grow it, and should free() it to + * delete it. + */ + int yy_is_our_buffer; + + /* Whether this is an "interactive" input source; if so, and + * if we're using stdio for input, then we want to use getc() + * instead of fread(), to make sure we stop fetching input after + * each newline. + */ + int yy_is_interactive; + + /* Whether we're considered to be at the beginning of a line. + * If so, '^' rules will be active on the next match, otherwise + * not. + */ + int yy_at_bol; + + int yy_bs_lineno; /**< The line count. */ + int yy_bs_column; /**< The column count. */ + + /* Whether to try to fill the input buffer when we reach the + * end of it. + */ + int yy_fill_buffer; + + int yy_buffer_status; + +#define YY_BUFFER_NEW 0 +#define YY_BUFFER_NORMAL 1 + /* When an EOF's been seen but there's still some text to process + * then we mark the buffer as YY_EOF_PENDING, to indicate that we + * shouldn't try reading from the input source any more. We might + * still have a bunch of tokens to match, though, because of + * possible backing-up. + * + * When we actually see the EOF, we change the status to "new" + * (via yyrestart()), so that the user can continue scanning by + * just pointing yyin at a new input file. + */ +#define YY_BUFFER_EOF_PENDING 2 + + }; +#endif /* !YY_STRUCT_YY_BUFFER_STATE */ + +/* We provide macros for accessing buffer states in case in the + * future we want to put the buffer states in a more general + * "scanner state". + * + * Returns the top of the stack, or NULL. + */ +#define YY_CURRENT_BUFFER ( yyg->yy_buffer_stack \ + ? yyg->yy_buffer_stack[yyg->yy_buffer_stack_top] \ + : NULL) +/* Same as previous macro, but useful when we know that the buffer stack is not + * NULL or when we need an lvalue. For internal use only. 
+ */ +#define YY_CURRENT_BUFFER_LVALUE yyg->yy_buffer_stack[yyg->yy_buffer_stack_top] + +void yyrestart ( FILE *input_file , yyscan_t yyscanner ); +void yy_switch_to_buffer ( YY_BUFFER_STATE new_buffer , yyscan_t yyscanner ); +YY_BUFFER_STATE yy_create_buffer ( FILE *file, int size , yyscan_t yyscanner ); +void yy_delete_buffer ( YY_BUFFER_STATE b , yyscan_t yyscanner ); +void yy_flush_buffer ( YY_BUFFER_STATE b , yyscan_t yyscanner ); +void yypush_buffer_state ( YY_BUFFER_STATE new_buffer , yyscan_t yyscanner ); +void yypop_buffer_state ( yyscan_t yyscanner ); + +static void yyensure_buffer_stack ( yyscan_t yyscanner ); +static void yy_load_buffer_state ( yyscan_t yyscanner ); +static void yy_init_buffer ( YY_BUFFER_STATE b, FILE *file , yyscan_t yyscanner ); +#define YY_FLUSH_BUFFER yy_flush_buffer( YY_CURRENT_BUFFER , yyscanner) + +YY_BUFFER_STATE yy_scan_buffer ( char *base, yy_size_t size , yyscan_t yyscanner ); +YY_BUFFER_STATE yy_scan_string ( const char *yy_str , yyscan_t yyscanner ); +YY_BUFFER_STATE yy_scan_bytes ( const char *bytes, int len , yyscan_t yyscanner ); + +void *yyalloc ( yy_size_t , yyscan_t yyscanner ); +void *yyrealloc ( void *, yy_size_t , yyscan_t yyscanner ); +void yyfree ( void * , yyscan_t yyscanner ); + +#define yy_new_buffer yy_create_buffer +#define yy_set_interactive(is_interactive) \ + { \ + if ( ! YY_CURRENT_BUFFER ){ \ + yyensure_buffer_stack (yyscanner); \ + YY_CURRENT_BUFFER_LVALUE = \ + yy_create_buffer( yyin, YY_BUF_SIZE , yyscanner); \ + } \ + YY_CURRENT_BUFFER_LVALUE->yy_is_interactive = is_interactive; \ + } +#define yy_set_bol(at_bol) \ + { \ + if ( ! YY_CURRENT_BUFFER ){\ + yyensure_buffer_stack (yyscanner); \ + YY_CURRENT_BUFFER_LVALUE = \ + yy_create_buffer( yyin, YY_BUF_SIZE , yyscanner); \ + } \ + YY_CURRENT_BUFFER_LVALUE->yy_at_bol = at_bol; \ + } +#define YY_AT_BOL() (YY_CURRENT_BUFFER_LVALUE->yy_at_bol) + +/* Begin user sect3 */ + +#define fts0twrap(yyscanner) (/*CONSTCOND*/1) +#define YY_SKIP_YYWRAP +typedef flex_uint8_t YY_CHAR; + +typedef int yy_state_type; + +#define yytext_ptr yytext_r + +static yy_state_type yy_get_previous_state ( yyscan_t yyscanner ); +static yy_state_type yy_try_NUL_trans ( yy_state_type current_state , yyscan_t yyscanner); +static int yy_get_next_buffer ( yyscan_t yyscanner ); +static void yynoreturn yy_fatal_error ( const char* msg , yyscan_t yyscanner ); + +/* Done after the current pattern has been matched and before the + * corresponding action - sets up yytext. + */ +#define YY_DO_BEFORE_ACTION \ + yyg->yytext_ptr = yy_bp; \ + yyleng = (int) (yy_cp - yy_bp); \ + yyg->yy_hold_char = *yy_cp; \ + *yy_cp = '\0'; \ + yyg->yy_c_buf_p = yy_cp; +#define YY_NUM_RULES 7 +#define YY_END_OF_BUFFER 8 +/* This struct is not used in this scanner, + but its presence is necessary. 
*/ +struct yy_trans_info + { + flex_int32_t yy_verify; + flex_int32_t yy_nxt; + }; +static const flex_int16_t yy_accept[17] = + { 0, + 4, 4, 8, 4, 1, 6, 1, 5, 5, 2, + 4, 1, 1, 0, 3, 0 + } ; + +static const YY_CHAR yy_ec[256] = + { 0, + 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 4, 1, 5, 1, 1, 6, 1, 1, 1, + 1, 7, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1 + } ; + +static const YY_CHAR yy_meta[8] = + { 0, + 1, 2, 3, 4, 5, 5, 1 + } ; + +static const flex_int16_t yy_base[20] = + { 0, + 0, 0, 18, 0, 6, 21, 0, 9, 21, 0, + 0, 0, 0, 4, 21, 21, 10, 11, 15 + } ; + +static const flex_int16_t yy_def[20] = + { 0, + 16, 1, 16, 17, 17, 16, 18, 19, 16, 17, + 17, 5, 18, 19, 16, 0, 16, 16, 16 + } ; + +static const flex_int16_t yy_nxt[29] = + { 0, + 4, 5, 6, 7, 8, 9, 10, 12, 15, 13, + 11, 11, 13, 15, 13, 14, 14, 16, 14, 14, + 3, 16, 16, 16, 16, 16, 16, 16 + } ; + +static const flex_int16_t yy_chk[29] = + { 0, + 1, 1, 1, 1, 1, 1, 1, 5, 14, 5, + 17, 17, 18, 8, 18, 19, 19, 3, 19, 19, + 16, 16, 16, 16, 16, 16, 16, 16 + } ; + +/* The intent behind this definition is that it'll catch + * any uses of REJECT which flex missed. + */ +#define REJECT reject_used_but_not_detected +#define yymore() yymore_used_but_not_detected +#define YY_MORE_ADJ 0 +#define YY_RESTORE_YY_MORE_OFFSET +#line 1 "fts0tlex.l" +/***************************************************************************** + +Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ +/** + * @file fts/fts0tlex.l + * FTS parser lexical analyzer + * + * Created 2007/5/9 Sunny Bains + */ +#line 27 "fts0tlex.l" + +#include "fts0ast.h" +#include "fts0pars.h" + +/* Required for reentrant parser */ +#define YY_DECL int fts_tlexer(YYSTYPE* val, yyscan_t yyscanner) +#define exit(A) ut_error + +#line 671 "fts0tlex.cc" +#define YY_NO_INPUT 1 +#line 673 "fts0tlex.cc" + +#define INITIAL 0 + +#ifndef YY_NO_UNISTD_H +/* Special case for "unistd.h", since it is non-ANSI. We include it way + * down here because we want the user's section 1 to have been scanned first. + * The user has a chance to override it with an option. 
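This generated scanner is reentrant; the prologue above remaps the yy* entry points to fts0t-prefixed names and defines fts_tlexer() as the YY_DECL. A hedged sketch of how a caller can drive it (the real glue lives in fts_lexer_create() in fts0que.cc; YYSTYPE comes from fts0pars.h, and error checks are omitted):

static void scan_tokens(const char* query, int len)
{
	yyscan_t	scanner;
	YYSTYPE		val;

	fts0tlex_init(&scanner);
	fts0t_scan_bytes(query, len, scanner);

	/* fts_tlexer() is the YY_DECL entry point defined above; it
	returns 0 at end of input. */
	while (int tok = fts_tlexer(&val, scanner)) {
		(void) tok;	/* FTS_TERM, FTS_TEXT or an operator char */
	}

	fts0tlex_destroy(scanner);
}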
+ */ +#include +#endif + +#ifndef YY_EXTRA_TYPE +#define YY_EXTRA_TYPE void * +#endif + +/* Holds the entire state of the reentrant scanner. */ +struct yyguts_t + { + + /* User-defined. Not touched by flex. */ + YY_EXTRA_TYPE yyextra_r; + + /* The rest are the same as the globals declared in the non-reentrant scanner. */ + FILE *yyin_r, *yyout_r; + size_t yy_buffer_stack_top; /**< index of top of stack. */ + size_t yy_buffer_stack_max; /**< capacity of stack. */ + YY_BUFFER_STATE * yy_buffer_stack; /**< Stack as an array. */ + char yy_hold_char; + int yy_n_chars; + int yyleng_r; + char *yy_c_buf_p; + int yy_init; + int yy_start; + int yy_did_buffer_switch_on_eof; + int yy_start_stack_ptr; + int yy_start_stack_depth; + int *yy_start_stack; + yy_state_type yy_last_accepting_state; + char* yy_last_accepting_cpos; + + int yylineno_r; + int yy_flex_debug_r; + + char *yytext_r; + int yy_more_flag; + int yy_more_len; + + }; /* end struct yyguts_t */ + +static int yy_init_globals ( yyscan_t yyscanner ); + +int yylex_init (yyscan_t* scanner); + +int yylex_init_extra ( YY_EXTRA_TYPE user_defined, yyscan_t* scanner); + +/* Accessor methods to globals. + These are made visible to non-reentrant scanners for convenience. */ + +int yylex_destroy ( yyscan_t yyscanner ); + +int yyget_debug ( yyscan_t yyscanner ); + +void yyset_debug ( int debug_flag , yyscan_t yyscanner ); + +YY_EXTRA_TYPE yyget_extra ( yyscan_t yyscanner ); + +void yyset_extra ( YY_EXTRA_TYPE user_defined , yyscan_t yyscanner ); + +FILE *yyget_in ( yyscan_t yyscanner ); + +void yyset_in ( FILE * _in_str , yyscan_t yyscanner ); + +FILE *yyget_out ( yyscan_t yyscanner ); + +void yyset_out ( FILE * _out_str , yyscan_t yyscanner ); + + int yyget_leng ( yyscan_t yyscanner ); + +char *yyget_text ( yyscan_t yyscanner ); + +int yyget_lineno ( yyscan_t yyscanner ); + +void yyset_lineno ( int _line_number , yyscan_t yyscanner ); + +int yyget_column ( yyscan_t yyscanner ); + +void yyset_column ( int _column_no , yyscan_t yyscanner ); + +/* Macros after this point can all be overridden by user definitions in + * section 1. + */ + +#ifndef YY_SKIP_YYWRAP +#ifdef __cplusplus +extern "C" int yywrap ( yyscan_t yyscanner ); +#else +extern int yywrap ( yyscan_t yyscanner ); +#endif +#endif + +#ifndef YY_NO_UNPUT + +#endif + +#ifndef yytext_ptr +static void yy_flex_strncpy ( char *, const char *, int , yyscan_t yyscanner); +#endif + +#ifdef YY_NEED_STRLEN +static int yy_flex_strlen ( const char * , yyscan_t yyscanner); +#endif + +#ifndef YY_NO_INPUT +#ifdef __cplusplus +static int yyinput ( yyscan_t yyscanner ); +#else +static int input ( yyscan_t yyscanner ); +#endif + +#endif + +/* Amount of stuff to slurp up with each read. */ +#ifndef YY_READ_BUF_SIZE +#ifdef __ia64__ +/* On IA-64, the buffer size is 16k, not 8k */ +#define YY_READ_BUF_SIZE 16384 +#else +#define YY_READ_BUF_SIZE 8192 +#endif /* __ia64__ */ +#endif + +/* Copy whatever the last rule matched to the standard output. */ +#ifndef ECHO +/* This used to be an fputs(), but since the string might contain NUL's, + * we now use fwrite(). + */ +#define ECHO do { if (fwrite( yytext, (size_t) yyleng, 1, yyout )) {} } while (0) +#endif + +/* Gets input and stuffs it into "buf". number of characters read, or YY_NULL, + * is returned in "result". 
+ */ +#ifndef YY_INPUT +#define YY_INPUT(buf,result,max_size) \ + if ( YY_CURRENT_BUFFER_LVALUE->yy_is_interactive ) \ + { \ + int c = '*'; \ + int n; \ + for ( n = 0; n < max_size && \ + (c = getc( yyin )) != EOF && c != '\n'; ++n ) \ + buf[n] = (char) c; \ + if ( c == '\n' ) \ + buf[n++] = (char) c; \ + if ( c == EOF && ferror( yyin ) ) \ + YY_FATAL_ERROR( "input in flex scanner failed" ); \ + result = n; \ + } \ + else \ + { \ + errno=0; \ + while ( (result = (int) fread(buf, 1, (yy_size_t) max_size, yyin)) == 0 && ferror(yyin)) \ + { \ + if( errno != EINTR) \ + { \ + YY_FATAL_ERROR( "input in flex scanner failed" ); \ + break; \ + } \ + errno=0; \ + clearerr(yyin); \ + } \ + }\ +\ + +#endif + +/* No semi-colon after return; correct usage is to write "yyterminate();" - + * we don't want an extra ';' after the "return" because that will cause + * some compilers to complain about unreachable statements. + */ +#ifndef yyterminate +#define yyterminate() return YY_NULL +#endif + +/* Number of entries by which start-condition stack grows. */ +#ifndef YY_START_STACK_INCR +#define YY_START_STACK_INCR 25 +#endif + +/* Report a fatal error. */ +#ifndef YY_FATAL_ERROR +#define YY_FATAL_ERROR(msg) yy_fatal_error( msg , yyscanner) +#endif + +/* end tables serialization structures and prototypes */ + +/* Default declaration of generated scanner - a define so the user can + * easily add parameters. + */ +#ifndef YY_DECL +#define YY_DECL_IS_OURS 1 + +extern int yylex (yyscan_t yyscanner); + +#define YY_DECL int yylex (yyscan_t yyscanner) +#endif /* !YY_DECL */ + +/* Code executed at the beginning of each rule, after yytext and yyleng + * have been set up. + */ +#ifndef YY_USER_ACTION +#define YY_USER_ACTION +#endif + +/* Code executed at the end of each rule. */ +#ifndef YY_BREAK +#define YY_BREAK /*LINTED*/break; +#endif + +#define YY_RULE_SETUP \ + YY_USER_ACTION + +/** The main scanner function which does all the work. + */ +YY_DECL +{ + yy_state_type yy_current_state; + char *yy_cp, *yy_bp; + int yy_act; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + if ( !yyg->yy_init ) + { + yyg->yy_init = 1; + +#ifdef YY_USER_INIT + YY_USER_INIT; +#endif + + if ( ! yyg->yy_start ) + yyg->yy_start = 1; /* first start state */ + + if ( ! yyin ) + yyin = stdin; + + if ( ! yyout ) + yyout = stdout; + + if ( ! YY_CURRENT_BUFFER ) { + yyensure_buffer_stack (yyscanner); + YY_CURRENT_BUFFER_LVALUE = + yy_create_buffer( yyin, YY_BUF_SIZE , yyscanner); + } + + yy_load_buffer_state( yyscanner ); + } + + { +#line 45 "fts0tlex.l" + + +#line 934 "fts0tlex.cc" + + while ( /*CONSTCOND*/1 ) /* loops until end-of-file is reached */ + { + yy_cp = yyg->yy_c_buf_p; + + /* Support of yytext. */ + *yy_cp = yyg->yy_hold_char; + + /* yy_bp points to the position in yy_ch_buf of the start of + * the current run. 
+ */ + yy_bp = yy_cp; + + yy_current_state = yyg->yy_start; +yy_match: + do + { + YY_CHAR yy_c = yy_ec[YY_SC_TO_UI(*yy_cp)] ; + if ( yy_accept[yy_current_state] ) + { + yyg->yy_last_accepting_state = yy_current_state; + yyg->yy_last_accepting_cpos = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 17 ) + yy_c = yy_meta[yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + yy_c]; + ++yy_cp; + } + while ( yy_current_state != 16 ); + yy_cp = yyg->yy_last_accepting_cpos; + yy_current_state = yyg->yy_last_accepting_state; + +yy_find_action: + yy_act = yy_accept[yy_current_state]; + + YY_DO_BEFORE_ACTION; + +do_action: /* This label is used only to access EOF actions. */ + + switch ( yy_act ) + { /* beginning of action switch */ + case 0: /* must back up */ + /* undo the effects of YY_DO_BEFORE_ACTION */ + *yy_cp = yyg->yy_hold_char; + yy_cp = yyg->yy_last_accepting_cpos; + yy_current_state = yyg->yy_last_accepting_state; + goto yy_find_action; + +case 1: +YY_RULE_SETUP +#line 47 "fts0tlex.l" +/* Ignore whitespace */ ; + YY_BREAK +case 2: +YY_RULE_SETUP +#line 49 "fts0tlex.l" +{ + val->oper = fts0tget_text(yyscanner)[0]; + + return(val->oper); +} + YY_BREAK +case 3: +YY_RULE_SETUP +#line 55 "fts0tlex.l" +{ + val->token = fts_ast_string_create(reinterpret_cast(fts0tget_text(yyscanner)), fts0tget_leng(yyscanner)); + + return(FTS_TEXT); +} + YY_BREAK +case 4: +YY_RULE_SETUP +#line 61 "fts0tlex.l" +{ + val->token = fts_ast_string_create(reinterpret_cast(fts0tget_text(yyscanner)), fts0tget_leng(yyscanner)); + + return(FTS_TERM); +} + YY_BREAK +case 5: +YY_RULE_SETUP +#line 66 "fts0tlex.l" +; + YY_BREAK +case 6: +/* rule 6 can match eol */ +YY_RULE_SETUP +#line 67 "fts0tlex.l" + + YY_BREAK +case 7: +YY_RULE_SETUP +#line 69 "fts0tlex.l" +ECHO; + YY_BREAK +#line 1035 "fts0tlex.cc" +case YY_STATE_EOF(INITIAL): + yyterminate(); + + case YY_END_OF_BUFFER: + { + /* Amount of text matched not including the EOB char. */ + int yy_amount_of_matched_text = (int) (yy_cp - yyg->yytext_ptr) - 1; + + /* Undo the effects of YY_DO_BEFORE_ACTION. */ + *yy_cp = yyg->yy_hold_char; + YY_RESTORE_YY_MORE_OFFSET + + if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_NEW ) + { + /* We're scanning a new file or input source. It's + * possible that this happened because the user + * just pointed yyin at a new source and called + * yylex(). If so, then we have to assure + * consistency between YY_CURRENT_BUFFER and our + * globals. Here is the right place to do so, because + * this is the first action (other than possibly a + * back-up) that will match for the new input source. + */ + yyg->yy_n_chars = YY_CURRENT_BUFFER_LVALUE->yy_n_chars; + YY_CURRENT_BUFFER_LVALUE->yy_input_file = yyin; + YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = YY_BUFFER_NORMAL; + } + + /* Note that here we test for yy_c_buf_p "<=" to the position + * of the first EOB in the buffer, since yy_c_buf_p will + * already have been incremented past the NUL character + * (since all states make transitions on EOB to the + * end-of-buffer state). Contrast this with the test + * in input(). + */ + if ( yyg->yy_c_buf_p <= &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] ) + { /* This was really a NUL. 
*/ + yy_state_type yy_next_state; + + yyg->yy_c_buf_p = yyg->yytext_ptr + yy_amount_of_matched_text; + + yy_current_state = yy_get_previous_state( yyscanner ); + + /* Okay, we're now positioned to make the NUL + * transition. We couldn't have + * yy_get_previous_state() go ahead and do it + * for us because it doesn't know how to deal + * with the possibility of jamming (and we don't + * want to build jamming into it because then it + * will run more slowly). + */ + + yy_next_state = yy_try_NUL_trans( yy_current_state , yyscanner); + + yy_bp = yyg->yytext_ptr + YY_MORE_ADJ; + + if ( yy_next_state ) + { + /* Consume the NUL. */ + yy_cp = ++yyg->yy_c_buf_p; + yy_current_state = yy_next_state; + goto yy_match; + } + + else + { + yy_cp = yyg->yy_last_accepting_cpos; + yy_current_state = yyg->yy_last_accepting_state; + goto yy_find_action; + } + } + + else switch ( yy_get_next_buffer( yyscanner ) ) + { + case EOB_ACT_END_OF_FILE: + { + yyg->yy_did_buffer_switch_on_eof = 0; + + if ( yywrap( yyscanner ) ) + { + /* Note: because we've taken care in + * yy_get_next_buffer() to have set up + * yytext, we can now set up + * yy_c_buf_p so that if some total + * hoser (like flex itself) wants to + * call the scanner after we return the + * YY_NULL, it'll still work - another + * YY_NULL will get returned. + */ + yyg->yy_c_buf_p = yyg->yytext_ptr + YY_MORE_ADJ; + + yy_act = YY_STATE_EOF(YY_START); + goto do_action; + } + + else + { + if ( ! yyg->yy_did_buffer_switch_on_eof ) + YY_NEW_FILE; + } + break; + } + + case EOB_ACT_CONTINUE_SCAN: + yyg->yy_c_buf_p = + yyg->yytext_ptr + yy_amount_of_matched_text; + + yy_current_state = yy_get_previous_state( yyscanner ); + + yy_cp = yyg->yy_c_buf_p; + yy_bp = yyg->yytext_ptr + YY_MORE_ADJ; + goto yy_match; + + case EOB_ACT_LAST_MATCH: + yyg->yy_c_buf_p = + &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars]; + + yy_current_state = yy_get_previous_state( yyscanner ); + + yy_cp = yyg->yy_c_buf_p; + yy_bp = yyg->yytext_ptr + YY_MORE_ADJ; + goto yy_find_action; + } + break; + } + + default: + YY_FATAL_ERROR( + "fatal flex scanner internal error--no action found" ); + } /* end of action switch */ + } /* end of scanning one token */ + } /* end of user's declarations */ +} /* end of yylex */ + +/* yy_get_next_buffer - try to read in a new buffer + * + * Returns a code representing an action: + * EOB_ACT_LAST_MATCH - + * EOB_ACT_CONTINUE_SCAN - continue scanning from current position + * EOB_ACT_END_OF_FILE - end of file + */ +static int yy_get_next_buffer (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + char *dest = YY_CURRENT_BUFFER_LVALUE->yy_ch_buf; + char *source = yyg->yytext_ptr; + int number_to_move, i; + int ret_val; + + if ( yyg->yy_c_buf_p > &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars + 1] ) + YY_FATAL_ERROR( + "fatal flex scanner internal error--end of buffer missed" ); + + if ( YY_CURRENT_BUFFER_LVALUE->yy_fill_buffer == 0 ) + { /* Don't try to fill the buffer, so this is an EOF. */ + if ( yyg->yy_c_buf_p - yyg->yytext_ptr - YY_MORE_ADJ == 1 ) + { + /* We matched a single character, the EOB, so + * treat this as a final EOF. + */ + return EOB_ACT_END_OF_FILE; + } + + else + { + /* We matched some text prior to the EOB, first + * process it. + */ + return EOB_ACT_LAST_MATCH; + } + } + + /* Try to read more data. */ + + /* First move last chars to start of buffer. 
*/ + number_to_move = (int) (yyg->yy_c_buf_p - yyg->yytext_ptr - 1); + + for ( i = 0; i < number_to_move; ++i ) + *(dest++) = *(source++); + + if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_EOF_PENDING ) + /* don't do the read, it's not guaranteed to return an EOF, + * just force an EOF + */ + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars = 0; + + else + { + int num_to_read = + YY_CURRENT_BUFFER_LVALUE->yy_buf_size - number_to_move - 1; + + while ( num_to_read <= 0 ) + { /* Not enough room in the buffer - grow it. */ + + /* just a shorter name for the current buffer */ + YY_BUFFER_STATE b = YY_CURRENT_BUFFER_LVALUE; + + int yy_c_buf_p_offset = + (int) (yyg->yy_c_buf_p - b->yy_ch_buf); + + if ( b->yy_is_our_buffer ) + { + int new_size = b->yy_buf_size * 2; + + if ( new_size <= 0 ) + b->yy_buf_size += b->yy_buf_size / 8; + else + b->yy_buf_size *= 2; + + b->yy_ch_buf = (char *) + /* Include room in for 2 EOB chars. */ + yyrealloc( (void *) b->yy_ch_buf, + (yy_size_t) (b->yy_buf_size + 2) , yyscanner ); + } + else + /* Can't grow it, we don't own it. */ + b->yy_ch_buf = NULL; + + if ( ! b->yy_ch_buf ) + YY_FATAL_ERROR( + "fatal error - scanner input buffer overflow" ); + + yyg->yy_c_buf_p = &b->yy_ch_buf[yy_c_buf_p_offset]; + + num_to_read = YY_CURRENT_BUFFER_LVALUE->yy_buf_size - + number_to_move - 1; + + } + + if ( num_to_read > YY_READ_BUF_SIZE ) + num_to_read = YY_READ_BUF_SIZE; + + /* Read in more data. */ + YY_INPUT( (&YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move]), + yyg->yy_n_chars, num_to_read ); + + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars; + } + + if ( yyg->yy_n_chars == 0 ) + { + if ( number_to_move == YY_MORE_ADJ ) + { + ret_val = EOB_ACT_END_OF_FILE; + yyrestart( yyin , yyscanner); + } + + else + { + ret_val = EOB_ACT_LAST_MATCH; + YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = + YY_BUFFER_EOF_PENDING; + } + } + + else + ret_val = EOB_ACT_CONTINUE_SCAN; + + if ((yyg->yy_n_chars + number_to_move) > YY_CURRENT_BUFFER_LVALUE->yy_buf_size) { + /* Extend the array by 50%, plus the number we really need. */ + int new_size = yyg->yy_n_chars + number_to_move + (yyg->yy_n_chars >> 1); + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf = (char *) yyrealloc( + (void *) YY_CURRENT_BUFFER_LVALUE->yy_ch_buf, (yy_size_t) new_size , yyscanner ); + if ( ! YY_CURRENT_BUFFER_LVALUE->yy_ch_buf ) + YY_FATAL_ERROR( "out of dynamic memory in yy_get_next_buffer()" ); + /* "- 2" to take care of EOB's */ + YY_CURRENT_BUFFER_LVALUE->yy_buf_size = (int) (new_size - 2); + } + + yyg->yy_n_chars += number_to_move; + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] = YY_END_OF_BUFFER_CHAR; + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars + 1] = YY_END_OF_BUFFER_CHAR; + + yyg->yytext_ptr = &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[0]; + + return ret_val; +} + +/* yy_get_previous_state - get the state just before the EOB char was reached */ + + static yy_state_type yy_get_previous_state (yyscan_t yyscanner) +{ + yy_state_type yy_current_state; + char *yy_cp; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + yy_current_state = yyg->yy_start; + + for ( yy_cp = yyg->yytext_ptr + YY_MORE_ADJ; yy_cp < yyg->yy_c_buf_p; ++yy_cp ) + { + YY_CHAR yy_c = (*yy_cp ? 
yy_ec[YY_SC_TO_UI(*yy_cp)] : 1); + if ( yy_accept[yy_current_state] ) + { + yyg->yy_last_accepting_state = yy_current_state; + yyg->yy_last_accepting_cpos = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 17 ) + yy_c = yy_meta[yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + yy_c]; + } + + return yy_current_state; +} + +/* yy_try_NUL_trans - try to make a transition on the NUL character + * + * synopsis + * next_state = yy_try_NUL_trans( current_state ); + */ + static yy_state_type yy_try_NUL_trans (yy_state_type yy_current_state , yyscan_t yyscanner) +{ + int yy_is_jam; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; /* This var may be unused depending upon options. */ + char *yy_cp = yyg->yy_c_buf_p; + + YY_CHAR yy_c = 1; + if ( yy_accept[yy_current_state] ) + { + yyg->yy_last_accepting_state = yy_current_state; + yyg->yy_last_accepting_cpos = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 17 ) + yy_c = yy_meta[yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + yy_c]; + yy_is_jam = (yy_current_state == 16); + + (void)yyg; + return yy_is_jam ? 0 : yy_current_state; +} + +#ifndef YY_NO_UNPUT + +#endif + +#ifndef YY_NO_INPUT +#ifdef __cplusplus + static int yyinput (yyscan_t yyscanner) +#else + static int input (yyscan_t yyscanner) +#endif + +{ + int c; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + *yyg->yy_c_buf_p = yyg->yy_hold_char; + + if ( *yyg->yy_c_buf_p == YY_END_OF_BUFFER_CHAR ) + { + /* yy_c_buf_p now points to the character we want to return. + * If this occurs *before* the EOB characters, then it's a + * valid NUL; if not, then we've hit the end of the buffer. + */ + if ( yyg->yy_c_buf_p < &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[yyg->yy_n_chars] ) + /* This was really a NUL. */ + *yyg->yy_c_buf_p = '\0'; + + else + { /* need more input */ + int offset = (int) (yyg->yy_c_buf_p - yyg->yytext_ptr); + ++yyg->yy_c_buf_p; + + switch ( yy_get_next_buffer( yyscanner ) ) + { + case EOB_ACT_LAST_MATCH: + /* This happens because yy_g_n_b() + * sees that we've accumulated a + * token and flags that we need to + * try matching the token before + * proceeding. But for input(), + * there's no matching to consider. + * So convert the EOB_ACT_LAST_MATCH + * to EOB_ACT_END_OF_FILE. + */ + + /* Reset buffer status. */ + yyrestart( yyin , yyscanner); + + /*FALLTHROUGH*/ + + case EOB_ACT_END_OF_FILE: + { + if ( yywrap( yyscanner ) ) + return 0; + + if ( ! yyg->yy_did_buffer_switch_on_eof ) + YY_NEW_FILE; +#ifdef __cplusplus + return yyinput(yyscanner); +#else + return input(yyscanner); +#endif + } + + case EOB_ACT_CONTINUE_SCAN: + yyg->yy_c_buf_p = yyg->yytext_ptr + offset; + break; + } + } + } + + c = *(unsigned char *) yyg->yy_c_buf_p; /* cast for 8-bit char's */ + *yyg->yy_c_buf_p = '\0'; /* preserve yytext */ + yyg->yy_hold_char = *++yyg->yy_c_buf_p; + + return c; +} +#endif /* ifndef YY_NO_INPUT */ + +/** Immediately switch to a different input stream. + * @param input_file A readable stream. + * @param yyscanner The scanner object. + * @note This function does not reset the start condition to @c INITIAL . + */ + void yyrestart (FILE * input_file , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + if ( ! 
YY_CURRENT_BUFFER ){ + yyensure_buffer_stack (yyscanner); + YY_CURRENT_BUFFER_LVALUE = + yy_create_buffer( yyin, YY_BUF_SIZE , yyscanner); + } + + yy_init_buffer( YY_CURRENT_BUFFER, input_file , yyscanner); + yy_load_buffer_state( yyscanner ); +} + +/** Switch to a different input buffer. + * @param new_buffer The new input buffer. + * @param yyscanner The scanner object. + */ + void yy_switch_to_buffer (YY_BUFFER_STATE new_buffer , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + /* TODO. We should be able to replace this entire function body + * with + * yypop_buffer_state(); + * yypush_buffer_state(new_buffer); + */ + yyensure_buffer_stack (yyscanner); + if ( YY_CURRENT_BUFFER == new_buffer ) + return; + + if ( YY_CURRENT_BUFFER ) + { + /* Flush out information for old buffer. */ + *yyg->yy_c_buf_p = yyg->yy_hold_char; + YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = yyg->yy_c_buf_p; + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars; + } + + YY_CURRENT_BUFFER_LVALUE = new_buffer; + yy_load_buffer_state( yyscanner ); + + /* We don't actually know whether we did this switch during + * EOF (yywrap()) processing, but the only time this flag + * is looked at is after yywrap() is called, so it's safe + * to go ahead and always set it. + */ + yyg->yy_did_buffer_switch_on_eof = 1; +} + +static void yy_load_buffer_state (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + yyg->yy_n_chars = YY_CURRENT_BUFFER_LVALUE->yy_n_chars; + yyg->yytext_ptr = yyg->yy_c_buf_p = YY_CURRENT_BUFFER_LVALUE->yy_buf_pos; + yyin = YY_CURRENT_BUFFER_LVALUE->yy_input_file; + yyg->yy_hold_char = *yyg->yy_c_buf_p; +} + +/** Allocate and initialize an input buffer state. + * @param file A readable stream. + * @param size The character buffer size in bytes. When in doubt, use @c YY_BUF_SIZE. + * @param yyscanner The scanner object. + * @return the allocated buffer state. + */ + YY_BUFFER_STATE yy_create_buffer (FILE * file, int size , yyscan_t yyscanner) +{ + YY_BUFFER_STATE b; + + b = (YY_BUFFER_STATE) yyalloc( sizeof( struct yy_buffer_state ) , yyscanner ); + if ( ! b ) + YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" ); + + b->yy_buf_size = size; + + /* yy_ch_buf has to be 2 characters longer than the size given because + * we need to put in 2 end-of-buffer characters. + */ + b->yy_ch_buf = (char *) yyalloc( (yy_size_t) (b->yy_buf_size + 2) , yyscanner ); + if ( ! b->yy_ch_buf ) + YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" ); + + b->yy_is_our_buffer = 1; + + yy_init_buffer( b, file , yyscanner); + + return b; +} + +/** Destroy the buffer. + * @param b a buffer created with yy_create_buffer() + * @param yyscanner The scanner object. + */ + void yy_delete_buffer (YY_BUFFER_STATE b , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + if ( ! b ) + return; + + if ( b == YY_CURRENT_BUFFER ) /* Not sure if we should pop here. */ + YY_CURRENT_BUFFER_LVALUE = (YY_BUFFER_STATE) 0; + + if ( b->yy_is_our_buffer ) + yyfree( (void *) b->yy_ch_buf , yyscanner ); + + yyfree( (void *) b , yyscanner ); +} + +/* Initializes or reinitializes a buffer. + * This function is sometimes called more than once on the same buffer, + * such as during a yyrestart() or at EOF. 
+ */ + static void yy_init_buffer (YY_BUFFER_STATE b, FILE * file , yyscan_t yyscanner) + +{ + int oerrno = errno; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + yy_flush_buffer( b , yyscanner); + + b->yy_input_file = file; + b->yy_fill_buffer = 1; + + /* If b is the current buffer, then yy_init_buffer was _probably_ + * called from yyrestart() or through yy_get_next_buffer. + * In that case, we don't want to reset the lineno or column. + */ + if (b != YY_CURRENT_BUFFER){ + b->yy_bs_lineno = 1; + b->yy_bs_column = 0; + } + + b->yy_is_interactive = 0; + + errno = oerrno; +} + +/** Discard all buffered characters. On the next scan, YY_INPUT will be called. + * @param b the buffer state to be flushed, usually @c YY_CURRENT_BUFFER. + * @param yyscanner The scanner object. + */ + void yy_flush_buffer (YY_BUFFER_STATE b , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + if ( ! b ) + return; + + b->yy_n_chars = 0; + + /* We always need two end-of-buffer characters. The first causes + * a transition to the end-of-buffer state. The second causes + * a jam in that state. + */ + b->yy_ch_buf[0] = YY_END_OF_BUFFER_CHAR; + b->yy_ch_buf[1] = YY_END_OF_BUFFER_CHAR; + + b->yy_buf_pos = &b->yy_ch_buf[0]; + + b->yy_at_bol = 1; + b->yy_buffer_status = YY_BUFFER_NEW; + + if ( b == YY_CURRENT_BUFFER ) + yy_load_buffer_state( yyscanner ); +} + +/** Pushes the new state onto the stack. The new state becomes + * the current state. This function will allocate the stack + * if necessary. + * @param new_buffer The new state. + * @param yyscanner The scanner object. + */ +void yypush_buffer_state (YY_BUFFER_STATE new_buffer , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + if (new_buffer == NULL) + return; + + yyensure_buffer_stack(yyscanner); + + /* This block is copied from yy_switch_to_buffer. */ + if ( YY_CURRENT_BUFFER ) + { + /* Flush out information for old buffer. */ + *yyg->yy_c_buf_p = yyg->yy_hold_char; + YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = yyg->yy_c_buf_p; + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = yyg->yy_n_chars; + } + + /* Only push if top exists. Otherwise, replace top. */ + if (YY_CURRENT_BUFFER) + yyg->yy_buffer_stack_top++; + YY_CURRENT_BUFFER_LVALUE = new_buffer; + + /* copied from yy_switch_to_buffer. */ + yy_load_buffer_state( yyscanner ); + yyg->yy_did_buffer_switch_on_eof = 1; +} + +/** Removes and deletes the top of the stack, if present. + * The next element becomes the new top. + * @param yyscanner The scanner object. + */ +void yypop_buffer_state (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + if (!YY_CURRENT_BUFFER) + return; + + yy_delete_buffer(YY_CURRENT_BUFFER , yyscanner); + YY_CURRENT_BUFFER_LVALUE = NULL; + if (yyg->yy_buffer_stack_top > 0) + --yyg->yy_buffer_stack_top; + + if (YY_CURRENT_BUFFER) { + yy_load_buffer_state( yyscanner ); + yyg->yy_did_buffer_switch_on_eof = 1; + } +} + +/* Allocates the stack if it does not exist. + * Guarantees space for at least one push. + */ +static void yyensure_buffer_stack (yyscan_t yyscanner) +{ + yy_size_t num_to_alloc; + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + if (!yyg->yy_buffer_stack) { + + /* First allocation is just for 2 elements, since we don't know if this + * scanner will even need a stack. We use 2 instead of 1 to avoid an + * immediate realloc on the next call. + */ + num_to_alloc = 1; /* After all that talk, this was set to 1 anyways... 
*/ + yyg->yy_buffer_stack = (struct yy_buffer_state**)yyalloc + (num_to_alloc * sizeof(struct yy_buffer_state*) + , yyscanner); + if ( ! yyg->yy_buffer_stack ) + YY_FATAL_ERROR( "out of dynamic memory in yyensure_buffer_stack()" ); + + memset(yyg->yy_buffer_stack, 0, num_to_alloc * sizeof(struct yy_buffer_state*)); + + yyg->yy_buffer_stack_max = num_to_alloc; + yyg->yy_buffer_stack_top = 0; + return; + } + + if (yyg->yy_buffer_stack_top >= (yyg->yy_buffer_stack_max) - 1){ + + /* Increase the buffer to prepare for a possible push. */ + yy_size_t grow_size = 8 /* arbitrary grow size */; + + num_to_alloc = yyg->yy_buffer_stack_max + grow_size; + yyg->yy_buffer_stack = (struct yy_buffer_state**)yyrealloc + (yyg->yy_buffer_stack, + num_to_alloc * sizeof(struct yy_buffer_state*) + , yyscanner); + if ( ! yyg->yy_buffer_stack ) + YY_FATAL_ERROR( "out of dynamic memory in yyensure_buffer_stack()" ); + + /* zero only the new slots.*/ + memset(yyg->yy_buffer_stack + yyg->yy_buffer_stack_max, 0, grow_size * sizeof(struct yy_buffer_state*)); + yyg->yy_buffer_stack_max = num_to_alloc; + } +} + +/** Setup the input buffer state to scan directly from a user-specified character buffer. + * @param base the character buffer + * @param size the size in bytes of the character buffer + * @param yyscanner The scanner object. + * @return the newly allocated buffer state object. + */ +YY_BUFFER_STATE yy_scan_buffer (char * base, yy_size_t size , yyscan_t yyscanner) +{ + YY_BUFFER_STATE b; + + if ( size < 2 || + base[size-2] != YY_END_OF_BUFFER_CHAR || + base[size-1] != YY_END_OF_BUFFER_CHAR ) + /* They forgot to leave room for the EOB's. */ + return NULL; + + b = (YY_BUFFER_STATE) yyalloc( sizeof( struct yy_buffer_state ) , yyscanner ); + if ( ! b ) + YY_FATAL_ERROR( "out of dynamic memory in yy_scan_buffer()" ); + + b->yy_buf_size = (int) (size - 2); /* "- 2" to take care of EOB's */ + b->yy_buf_pos = b->yy_ch_buf = base; + b->yy_is_our_buffer = 0; + b->yy_input_file = NULL; + b->yy_n_chars = b->yy_buf_size; + b->yy_is_interactive = 0; + b->yy_at_bol = 1; + b->yy_fill_buffer = 0; + b->yy_buffer_status = YY_BUFFER_NEW; + + yy_switch_to_buffer( b , yyscanner ); + + return b; +} + +/** Setup the input buffer state to scan a string. The next call to yylex() will + * scan from a @e copy of @a str. + * @param yystr a NUL-terminated string to scan + * @param yyscanner The scanner object. + * @return the newly allocated buffer state object. + * @note If you want to scan bytes that may contain NUL values, then use + * yy_scan_bytes() instead. + */ +YY_BUFFER_STATE yy_scan_string (const char * yystr , yyscan_t yyscanner) +{ + + return yy_scan_bytes( yystr, (int) strlen(yystr) , yyscanner); +} + +/** Setup the input buffer state to scan the given bytes. The next call to yylex() will + * scan from a @e copy of @a bytes. + * @param yybytes the byte buffer to scan + * @param _yybytes_len the number of bytes in the buffer pointed to by @a bytes. + * @param yyscanner The scanner object. + * @return the newly allocated buffer state object. + */ +YY_BUFFER_STATE yy_scan_bytes (const char * yybytes, int _yybytes_len , yyscan_t yyscanner) +{ + YY_BUFFER_STATE b; + char *buf; + yy_size_t n; + int i; + + /* Get memory for full buffer, including space for trailing EOB's. */ + n = (yy_size_t) (_yybytes_len + 2); + buf = (char *) yyalloc( n , yyscanner ); + if ( ! 
buf ) + YY_FATAL_ERROR( "out of dynamic memory in yy_scan_bytes()" ); + + for ( i = 0; i < _yybytes_len; ++i ) + buf[i] = yybytes[i]; + + buf[_yybytes_len] = buf[_yybytes_len+1] = YY_END_OF_BUFFER_CHAR; + + b = yy_scan_buffer( buf, n , yyscanner); + if ( ! b ) + YY_FATAL_ERROR( "bad buffer in yy_scan_bytes()" ); + + /* It's okay to grow etc. this buffer, and we should throw it + * away when we're done. + */ + b->yy_is_our_buffer = 1; + + return b; +} + +#ifndef YY_EXIT_FAILURE +#define YY_EXIT_FAILURE 2 +#endif + +static void yynoreturn yy_fatal_error (const char* msg , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + (void)yyg; + fprintf( stderr, "%s\n", msg ); + exit( YY_EXIT_FAILURE ); +} + +/* Redefine yyless() so it works in section 3 code. */ + +#undef yyless +#define yyless(n) \ + do \ + { \ + /* Undo effects of setting up yytext. */ \ + int yyless_macro_arg = (n); \ + YY_LESS_LINENO(yyless_macro_arg);\ + yytext[yyleng] = yyg->yy_hold_char; \ + yyg->yy_c_buf_p = yytext + yyless_macro_arg; \ + yyg->yy_hold_char = *yyg->yy_c_buf_p; \ + *yyg->yy_c_buf_p = '\0'; \ + yyleng = yyless_macro_arg; \ + } \ + while ( 0 ) + +/* Accessor methods (get/set functions) to struct members. */ + +/** Get the user-defined data for this scanner. + * @param yyscanner The scanner object. + */ +YY_EXTRA_TYPE yyget_extra (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + return yyextra; +} + +/** Get the current line number. + * @param yyscanner The scanner object. + */ +int yyget_lineno (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + if (! YY_CURRENT_BUFFER) + return 0; + + return yylineno; +} + +/** Get the current column number. + * @param yyscanner The scanner object. + */ +int yyget_column (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + if (! YY_CURRENT_BUFFER) + return 0; + + return yycolumn; +} + +/** Get the input stream. + * @param yyscanner The scanner object. + */ +FILE *yyget_in (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + return yyin; +} + +/** Get the output stream. + * @param yyscanner The scanner object. + */ +FILE *yyget_out (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + return yyout; +} + +/** Get the length of the current token. + * @param yyscanner The scanner object. + */ +int yyget_leng (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + return yyleng; +} + +/** Get the current token. + * @param yyscanner The scanner object. + */ + +char *yyget_text (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + return yytext; +} + +/** Set the user-defined data. This data is never touched by the scanner. + * @param user_defined The data to be associated with this scanner. + * @param yyscanner The scanner object. + */ +void yyset_extra (YY_EXTRA_TYPE user_defined , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + yyextra = user_defined ; +} + +/** Set the current line number. + * @param _line_number line number + * @param yyscanner The scanner object. + */ +void yyset_lineno (int _line_number , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + /* lineno is only valid if an input buffer exists. */ + if (! YY_CURRENT_BUFFER ) + YY_FATAL_ERROR( "yyset_lineno called with no buffer" ); + + yylineno = _line_number; +} + +/** Set the current column. 
+ * @param _column_no column number + * @param yyscanner The scanner object. + */ +void yyset_column (int _column_no , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + /* column is only valid if an input buffer exists. */ + if (! YY_CURRENT_BUFFER ) + YY_FATAL_ERROR( "yyset_column called with no buffer" ); + + yycolumn = _column_no; +} + +/** Set the input stream. This does not discard the current + * input buffer. + * @param _in_str A readable stream. + * @param yyscanner The scanner object. + * @see yy_switch_to_buffer + */ +void yyset_in (FILE * _in_str , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + yyin = _in_str ; +} + +void yyset_out (FILE * _out_str , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + yyout = _out_str ; +} + +int yyget_debug (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + return yy_flex_debug; +} + +void yyset_debug (int _bdebug , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + yy_flex_debug = _bdebug ; +} + +/* Accessor methods for yylval and yylloc */ + +/* User-visible API */ + +/* yylex_init is special because it creates the scanner itself, so it is + * the ONLY reentrant function that doesn't take the scanner as the last argument. + * That's why we explicitly handle the declaration, instead of using our macros. + */ +int yylex_init(yyscan_t* ptr_yy_globals) +{ + if (ptr_yy_globals == NULL){ + errno = EINVAL; + return 1; + } + + *ptr_yy_globals = (yyscan_t) yyalloc ( sizeof( struct yyguts_t ), NULL ); + + if (*ptr_yy_globals == NULL){ + errno = ENOMEM; + return 1; + } + + /* By setting to 0xAA, we expose bugs in yy_init_globals. Leave at 0x00 for releases. */ + memset(*ptr_yy_globals,0x00,sizeof(struct yyguts_t)); + + return yy_init_globals ( *ptr_yy_globals ); +} + +/* yylex_init_extra has the same functionality as yylex_init, but follows the + * convention of taking the scanner as the last argument. Note however, that + * this is a *pointer* to a scanner, as it will be allocated by this call (and + * is the reason, too, why this function also must handle its own declaration). + * The user defined value in the first argument will be available to yyalloc in + * the yyextra field. + */ +int yylex_init_extra( YY_EXTRA_TYPE yy_user_defined, yyscan_t* ptr_yy_globals ) +{ + struct yyguts_t dummy_yyguts; + + yyset_extra (yy_user_defined, &dummy_yyguts); + + if (ptr_yy_globals == NULL){ + errno = EINVAL; + return 1; + } + + *ptr_yy_globals = (yyscan_t) yyalloc ( sizeof( struct yyguts_t ), &dummy_yyguts ); + + if (*ptr_yy_globals == NULL){ + errno = ENOMEM; + return 1; + } + + /* By setting to 0xAA, we expose bugs in + yy_init_globals. Leave at 0x00 for releases. */ + memset(*ptr_yy_globals,0x00,sizeof(struct yyguts_t)); + + yyset_extra (yy_user_defined, *ptr_yy_globals); + + return yy_init_globals ( *ptr_yy_globals ); +} + +static int yy_init_globals (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + /* Initialization is the same as for the non-reentrant scanner. + * This function is called from yylex_destroy(), so don't allocate here. 
+ */ + + yyg->yy_buffer_stack = NULL; + yyg->yy_buffer_stack_top = 0; + yyg->yy_buffer_stack_max = 0; + yyg->yy_c_buf_p = NULL; + yyg->yy_init = 0; + yyg->yy_start = 0; + + yyg->yy_start_stack_ptr = 0; + yyg->yy_start_stack_depth = 0; + yyg->yy_start_stack = NULL; + +/* Defined in main.c */ +#ifdef YY_STDINIT + yyin = stdin; + yyout = stdout; +#else + yyin = NULL; + yyout = NULL; +#endif + + /* For future reference: Set errno on error, since we are called by + * yylex_init() + */ + return 0; +} + +/* yylex_destroy is for both reentrant and non-reentrant scanners. */ +int yylex_destroy (yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + + /* Pop the buffer stack, destroying each element. */ + while(YY_CURRENT_BUFFER){ + yy_delete_buffer( YY_CURRENT_BUFFER , yyscanner ); + YY_CURRENT_BUFFER_LVALUE = NULL; + yypop_buffer_state(yyscanner); + } + + /* Destroy the stack itself. */ + yyfree(yyg->yy_buffer_stack , yyscanner); + yyg->yy_buffer_stack = NULL; + + /* Destroy the start condition stack. */ + yyfree( yyg->yy_start_stack , yyscanner ); + yyg->yy_start_stack = NULL; + + /* Reset the globals. This is important in a non-reentrant scanner so the next time + * yylex() is called, initialization will occur. */ + yy_init_globals( yyscanner); + + /* Destroy the main struct (reentrant only). */ + yyfree ( yyscanner , yyscanner ); + yyscanner = NULL; + return 0; +} + +/* + * Internal utility routines. + */ + +#ifndef yytext_ptr +static void yy_flex_strncpy (char* s1, const char * s2, int n , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + (void)yyg; + + int i; + for ( i = 0; i < n; ++i ) + s1[i] = s2[i]; +} +#endif + +#ifdef YY_NEED_STRLEN +static int yy_flex_strlen (const char * s , yyscan_t yyscanner) +{ + int n; + for ( n = 0; s[n]; ++n ) + ; + + return n; +} +#endif + +void *yyalloc (yy_size_t size , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + (void)yyg; + return malloc(size); +} + +void *yyrealloc (void * ptr, yy_size_t size , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + (void)yyg; + + /* The cast to (char *) in the following accommodates both + * implementations that use char* generic pointers, and those + * that use void* generic pointers. It works with the latter + * because both ANSI C and C++ allow castless assignment from + * any pointer type to void*, and deal with argument conversions + * as though doing an assignment. + */ + return realloc(ptr, size); +} + +void yyfree (void * ptr , yyscan_t yyscanner) +{ + struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; + (void)yyg; + free( (char *) ptr ); /* see yyrealloc() for (char *) cast */ +} + +#define YYTABLES_NAME "yytables" + +#line 69 "fts0tlex.l" + + diff --git a/storage/innobase/fts/fts0tlex.l b/storage/innobase/fts/fts0tlex.l new file mode 100644 index 00000000..e19e907f --- /dev/null +++ b/storage/innobase/fts/fts0tlex.l @@ -0,0 +1,69 @@ +/***************************************************************************** + +Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/** + * @file fts/fts0tlex.l + * FTS parser lexical analyzer + * + * Created 2007/5/9 Sunny Bains + */ + +%{ + +#include "fts0ast.h" +#include "fts0pars.h" + +/* Required for reentrant parser */ +#define YY_DECL int fts_tlexer(YYSTYPE* val, yyscan_t yyscanner) +#define exit(A) ut_error + +%} + +%option noinput +%option nounput +%option noyywrap +%option nostdinit +%option reentrant +%option never-interactive + + +%% + +[\t ]+ /* Ignore whitespace */ ; + +[*] { + val->oper = fts0tget_text(yyscanner)[0]; + + return(val->oper); +} + +\"[^\"\n]*\" { + val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0tget_text(yyscanner)), fts0tget_leng(yyscanner)); + + return(FTS_TEXT); +} + +[^" \n\%]* { + val->token = fts_ast_string_create(reinterpret_cast<const byte*>(fts0tget_text(yyscanner)), fts0tget_leng(yyscanner)); + + return(FTS_TERM); +} +. ; +\n + +%% diff --git a/storage/innobase/fts/make_parser.sh b/storage/innobase/fts/make_parser.sh new file mode 100755 index 00000000..6b82c5ba --- /dev/null +++ b/storage/innobase/fts/make_parser.sh @@ -0,0 +1,49 @@ +#!/bin/sh +# +# Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify it under +# the terms of the GNU General Public License as published by the Free Software +# Foundation; version 2 of the License. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along with +# this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + + +TMPF=t.$$ + +make -f Makefile.query + +echo '#include "univ.i"' > $TMPF + +# This is to avoid compiler warning about unused parameters. +# FIXME: gcc extension "MY_ATTRIBUTE" causing compilation errors on windows +# platform. Quote them out for now. 
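+# As an illustration, the first expression below rewrites the generated
+# declaration "static void yynoreturn yy_fatal_error (const char* msg ,
+# yyscan_t yyscanner)" so that its parameter list ends in
+# "yyscan_t yyscanner MY_ATTRIBUTE((unused)))", which silences
+# unused-parameter warnings without changing any function body.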
+sed -e ' +s/^\(static.*void.*yy_fatal_error.*msg.*,\)\(.*yyscanner\)/\1 \2 MY_ATTRIBUTE((unused))/; +s/^\(static.*void.*yy_flex_strncpy.*n.*,\)\(.*yyscanner\)/\1 \2 MY_ATTRIBUTE((unused))/; +s/^\(static.*int.*yy_flex_strlen.*s.*,\)\(.*yyscanner\)/\1 \2 MY_ATTRIBUTE((unused))/; +s/^\(\(static\|void\).*fts0[bt]alloc.*,\)\(.*yyscanner\)/\1 \3 MY_ATTRIBUTE((unused))/; +s/^\(\(static\|void\).*fts0[bt]realloc.*,\)\(.*yyscanner\)/\1 \3 MY_ATTRIBUTE((unused))/; +s/^\(\(static\|void\).*fts0[bt]free.*,\)\(.*yyscanner\)/\1 \3 MY_ATTRIBUTE((unused))/; +' < fts0blex.cc >> $TMPF + +mv $TMPF fts0blex.cc + +echo '#include "univ.i"' > $TMPF + +sed -e ' +s/^\(static.*void.*yy_fatal_error.*msg.*,\)\(.*yyscanner\)/\1 \2 MY_ATTRIBUTE((unused))/; +s/^\(static.*void.*yy_flex_strncpy.*n.*,\)\(.*yyscanner\)/\1 \2 MY_ATTRIBUTE((unused))/; +s/^\(static.*int.*yy_flex_strlen.*s.*,\)\(.*yyscanner\)/\1 \2 MY_ATTRIBUTE((unused))/; +s/^\(\(static\|void\).*fts0[bt]alloc.*,\)\(.*yyscanner\)/\1 \3 MY_ATTRIBUTE((unused))/; +s/^\(\(static\|void\).*fts0[bt]realloc.*,\)\(.*yyscanner\)/\1 \3 MY_ATTRIBUTE((unused))/; +s/^\(\(static\|void\).*fts0[bt]free.*,\)\(.*yyscanner\)/\1 \3 MY_ATTRIBUTE((unused))/; +' < fts0tlex.cc >> $TMPF + +mv $TMPF fts0tlex.cc diff --git a/storage/innobase/fut/fut0lst.cc b/storage/innobase/fut/fut0lst.cc new file mode 100644 index 00000000..a52027f2 --- /dev/null +++ b/storage/innobase/fut/fut0lst.cc @@ -0,0 +1,416 @@ +/***************************************************************************** + +Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file fut/fut0lst.cc +File-based list utilities + +Created 11/28/1995 Heikki Tuuri +***********************************************************************/ + +#include "fut0lst.h" +#include "buf0buf.h" +#include "page0page.h" + + +/** Write a file address. 
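+A file address is stored as a 4-byte page number followed by a 2-byte byte
+offset within the page. To keep the redo log small, only the half that
+changes is logged: an unchanged page number results in a 2-byte write of
+the offset, an unchanged offset in a 4-byte write of the page number, and
+a fully changed address in a single 6-byte memcpy().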
+@param[in] block file page +@param[in,out] faddr file address location +@param[in] page page number +@param[in] boffset byte offset +@param[in,out] mtr mini-transaction */ +static void flst_write_addr(const buf_block_t& block, byte *faddr, + uint32_t page, uint16_t boffset, mtr_t* mtr) +{ + ut_ad(mtr->memo_contains_page_flagged(faddr, MTR_MEMO_PAGE_X_FIX | + MTR_MEMO_PAGE_SX_FIX)); + ut_a(page == FIL_NULL || boffset >= FIL_PAGE_DATA); + ut_a(ut_align_offset(faddr, srv_page_size) >= FIL_PAGE_DATA); + + static_assert(FIL_ADDR_PAGE == 0, "compatibility"); + static_assert(FIL_ADDR_BYTE == 4, "compatibility"); + static_assert(FIL_ADDR_SIZE == 6, "compatibility"); + + const bool same_page= mach_read_from_4(faddr + FIL_ADDR_PAGE) == page; + const bool same_offset= mach_read_from_2(faddr + FIL_ADDR_BYTE) == boffset; + if (same_page) + { + if (!same_offset) + mtr->write<2>(block, faddr + FIL_ADDR_BYTE, boffset); + return; + } + if (same_offset) + mtr->write<4>(block, faddr + FIL_ADDR_PAGE, page); + else + { + alignas(4) byte fil_addr[6]; + mach_write_to_4(fil_addr + FIL_ADDR_PAGE, page); + mach_write_to_2(fil_addr + FIL_ADDR_BYTE, boffset); + mtr->memcpy(block, faddr + FIL_ADDR_PAGE, fil_addr, 6); + } +} + +/** Write 2 null file addresses. +@param[in] b file page +@param[in,out] addr file address to be zeroed out +@param[in,out] mtr mini-transaction */ +static void flst_zero_both(const buf_block_t& b, byte *addr, mtr_t *mtr) +{ + if (mach_read_from_4(addr + FIL_ADDR_PAGE) != FIL_NULL) + mtr->memset(&b, ulint(addr - b.page.frame) + FIL_ADDR_PAGE, 4, 0xff); + mtr->write<2,mtr_t::MAYBE_NOP>(b, addr + FIL_ADDR_BYTE, 0U); + /* Initialize the other address by (MEMMOVE|0x80,offset,FIL_ADDR_SIZE,source) + which is 4 bytes, or less than FIL_ADDR_SIZE. */ + memcpy(addr + FIL_ADDR_SIZE, addr, FIL_ADDR_SIZE); + const uint16_t boffset= page_offset(addr); + mtr->memmove(b, boffset + FIL_ADDR_SIZE, boffset, FIL_ADDR_SIZE); +} + +/** Add a node to an empty list. */ +static void flst_add_to_empty(buf_block_t *base, uint16_t boffset, + buf_block_t *add, uint16_t aoffset, mtr_t *mtr) +{ + ut_ad(base != add || boffset != aoffset); + ut_ad(boffset < base->physical_size()); + ut_ad(aoffset < add->physical_size()); + ut_ad(mtr->memo_contains_flagged(base, MTR_MEMO_PAGE_X_FIX | + MTR_MEMO_PAGE_SX_FIX)); + ut_ad(mtr->memo_contains_flagged(add, MTR_MEMO_PAGE_X_FIX | + MTR_MEMO_PAGE_SX_FIX)); + + ut_ad(!mach_read_from_4(base->page.frame + boffset + FLST_LEN)); + mtr->write<1>(*base, base->page.frame + boffset + (FLST_LEN + 3), 1U); + /* Update first and last fields of base node */ + flst_write_addr(*base, base->page.frame + boffset + FLST_FIRST, + add->page.id().page_no(), aoffset, mtr); + memcpy(base->page.frame + boffset + FLST_LAST, + base->page.frame + boffset + FLST_FIRST, + FIL_ADDR_SIZE); + /* Initialize FLST_LAST by (MEMMOVE|0x80,offset,FIL_ADDR_SIZE,source) + which is 4 bytes, or less than FIL_ADDR_SIZE. */ + mtr->memmove(*base, boffset + FLST_LAST, boffset + FLST_FIRST, + FIL_ADDR_SIZE); + + /* Set prev and next fields of node to add */ + static_assert(FLST_NEXT == FLST_PREV + FIL_ADDR_SIZE, "compatibility"); + flst_zero_both(*add, add->page.frame + aoffset + FLST_PREV, mtr); +} + +/** Insert a node after another one. 
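+The list is doubly linked through such 6-byte addresses. Inserting after
+cur rewires four links: FLST_PREV and FLST_NEXT of the added node,
+FLST_NEXT of cur, and either FLST_LAST in the base node (when cur was the
+tail) or FLST_PREV of the old successor, whose page may first have to be
+fetched with buf_page_get_gen().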
+@param[in,out] base base node block +@param[in] boffset byte offset of the base node +@param[in,out] cur insert position block +@param[in] coffset byte offset of the insert position +@param[in,out] add block to be added +@param[in] aoffset byte offset of the block to be added +@param[in,out] mtr mini-transaction */ +static dberr_t flst_insert_after(buf_block_t *base, uint16_t boffset, + buf_block_t *cur, uint16_t coffset, + buf_block_t *add, uint16_t aoffset, + mtr_t *mtr) +{ + ut_ad(base != cur || boffset != coffset); + ut_ad(base != add || boffset != aoffset); + ut_ad(cur != add || coffset != aoffset); + ut_ad(boffset < base->physical_size()); + ut_ad(coffset < cur->physical_size()); + ut_ad(aoffset < add->physical_size()); + ut_ad(mtr->memo_contains_flagged(base, MTR_MEMO_PAGE_X_FIX | + MTR_MEMO_PAGE_SX_FIX)); + ut_ad(mtr->memo_contains_flagged(cur, MTR_MEMO_PAGE_X_FIX | + MTR_MEMO_PAGE_SX_FIX)); + ut_ad(mtr->memo_contains_flagged(add, MTR_MEMO_PAGE_X_FIX | + MTR_MEMO_PAGE_SX_FIX)); + + fil_addr_t next_addr= flst_get_next_addr(cur->page.frame + coffset); + + flst_write_addr(*add, add->page.frame + aoffset + FLST_PREV, + cur->page.id().page_no(), coffset, mtr); + flst_write_addr(*add, add->page.frame + aoffset + FLST_NEXT, + next_addr.page, next_addr.boffset, mtr); + + dberr_t err= DB_SUCCESS; + + if (next_addr.page == FIL_NULL) + flst_write_addr(*base, base->page.frame + boffset + FLST_LAST, + add->page.id().page_no(), aoffset, mtr); + else if (buf_block_t *block= + buf_page_get_gen(page_id_t{add->page.id().space(), next_addr.page}, + add->zip_size(), RW_SX_LATCH, nullptr, + BUF_GET_POSSIBLY_FREED, mtr, &err)) + flst_write_addr(*block, block->page.frame + + next_addr.boffset + FLST_PREV, + add->page.id().page_no(), aoffset, mtr); + + flst_write_addr(*cur, cur->page.frame + coffset + FLST_NEXT, + add->page.id().page_no(), aoffset, mtr); + + byte *len= &base->page.frame[boffset + FLST_LEN]; + mtr->write<4>(*base, len, mach_read_from_4(len) + 1); + return err; +} + +/** Insert a node before another one. 
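+This is the mirror image of flst_insert_after(): the added node takes cur
+as its successor, and either FLST_FIRST in the base node (when cur was the
+head) or FLST_NEXT of the old predecessor is redirected to the added node.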
+@param[in,out] base base node block +@param[in] boffset byte offset of the base node +@param[in,out] cur insert position block +@param[in] coffset byte offset of the insert position +@param[in,out] add block to be added +@param[in] aoffset byte offset of the block to be added +@param[in,out] mtr mini-transaction +@return error code */ +static dberr_t flst_insert_before(buf_block_t *base, uint16_t boffset, + buf_block_t *cur, uint16_t coffset, + buf_block_t *add, uint16_t aoffset, + mtr_t *mtr) +{ + ut_ad(base != cur || boffset != coffset); + ut_ad(base != add || boffset != aoffset); + ut_ad(cur != add || coffset != aoffset); + ut_ad(boffset < base->physical_size()); + ut_ad(coffset < cur->physical_size()); + ut_ad(aoffset < add->physical_size()); + ut_ad(mtr->memo_contains_flagged(base, MTR_MEMO_PAGE_X_FIX | + MTR_MEMO_PAGE_SX_FIX)); + ut_ad(mtr->memo_contains_flagged(cur, MTR_MEMO_PAGE_X_FIX | + MTR_MEMO_PAGE_SX_FIX)); + ut_ad(mtr->memo_contains_flagged(add, MTR_MEMO_PAGE_X_FIX | + MTR_MEMO_PAGE_SX_FIX)); + + fil_addr_t prev_addr= flst_get_prev_addr(cur->page.frame + coffset); + + flst_write_addr(*add, add->page.frame + aoffset + FLST_PREV, + prev_addr.page, prev_addr.boffset, mtr); + flst_write_addr(*add, add->page.frame + aoffset + FLST_NEXT, + cur->page.id().page_no(), coffset, mtr); + + dberr_t err= DB_SUCCESS; + + if (prev_addr.page == FIL_NULL) + flst_write_addr(*base, base->page.frame + boffset + FLST_FIRST, + add->page.id().page_no(), aoffset, mtr); + else if (buf_block_t *block= + buf_page_get_gen(page_id_t{add->page.id().space(), prev_addr.page}, + add->zip_size(), RW_SX_LATCH, nullptr, + BUF_GET_POSSIBLY_FREED, mtr, &err)) + flst_write_addr(*block, block->page.frame + + prev_addr.boffset + FLST_NEXT, + add->page.id().page_no(), aoffset, mtr); + + flst_write_addr(*cur, cur->page.frame + coffset + FLST_PREV, + add->page.id().page_no(), aoffset, mtr); + + byte *len= &base->page.frame[boffset + FLST_LEN]; + mtr->write<4>(*base, len, mach_read_from_4(len) + 1); + return err; +} + +/** Initialize a list base node. +@param[in] block file page +@param[in,out] base base node +@param[in,out] mtr mini-transaction */ +void flst_init(const buf_block_t& block, byte *base, mtr_t *mtr) +{ + ut_ad(mtr->memo_contains_page_flagged(base, MTR_MEMO_PAGE_X_FIX | + MTR_MEMO_PAGE_SX_FIX)); + mtr->write<4,mtr_t::MAYBE_NOP>(block, base + FLST_LEN, 0U); + static_assert(FLST_LAST == FLST_FIRST + FIL_ADDR_SIZE, "compatibility"); + flst_zero_both(block, base + FLST_FIRST, mtr); +} + +/** Append a file list node to a list. 
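+A minimal usage sketch, with hypothetical blocks and offsets (both pages
+must already be X- or SX-latched in mtr):
+
+	if (dberr_t err= flst_add_last(base_block, base_boffset,
+				       node_block, node_boffset, &mtr))
+		return err;
+
+On an empty list this reduces to flst_add_to_empty(); otherwise the node
+is inserted after the current tail, whose page may have to be read first.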
+@param[in,out] base base node block +@param[in] boffset byte offset of the base node +@param[in,out] add block to be added +@param[in] aoffset byte offset of the node to be added +@param[in,out] mtr mini-transaction */ +dberr_t flst_add_last(buf_block_t *base, uint16_t boffset, + buf_block_t *add, uint16_t aoffset, mtr_t *mtr) +{ + ut_ad(base != add || boffset != aoffset); + ut_ad(boffset < base->physical_size()); + ut_ad(aoffset < add->physical_size()); + ut_ad(mtr->memo_contains_flagged(base, MTR_MEMO_PAGE_X_FIX | + MTR_MEMO_PAGE_SX_FIX)); + ut_ad(mtr->memo_contains_flagged(add, MTR_MEMO_PAGE_X_FIX | + MTR_MEMO_PAGE_SX_FIX)); + if (!flst_get_len(base->page.frame + boffset)) + { + flst_add_to_empty(base, boffset, add, aoffset, mtr); + return DB_SUCCESS; + } + else + { + fil_addr_t addr= flst_get_last(base->page.frame + boffset); + buf_block_t *cur= add; + dberr_t err; + if (addr.page != add->page.id().page_no() && + !(cur= buf_page_get_gen(page_id_t{add->page.id().space(), addr.page}, + add->zip_size(), RW_SX_LATCH, nullptr, + BUF_GET_POSSIBLY_FREED, mtr, &err))) + return err; + return flst_insert_after(base, boffset, cur, addr.boffset, + add, aoffset, mtr); + } +} + +/** Prepend a file list node to a list. +@param[in,out] base base node block +@param[in] boffset byte offset of the base node +@param[in,out] add block to be added +@param[in] aoffset byte offset of the node to be added +@param[in,out] mtr mini-transaction +@return error code */ +dberr_t flst_add_first(buf_block_t *base, uint16_t boffset, + buf_block_t *add, uint16_t aoffset, mtr_t *mtr) +{ + ut_ad(base != add || boffset != aoffset); + ut_ad(boffset < base->physical_size()); + ut_ad(aoffset < add->physical_size()); + ut_ad(mtr->memo_contains_flagged(base, MTR_MEMO_PAGE_X_FIX | + MTR_MEMO_PAGE_SX_FIX)); + ut_ad(mtr->memo_contains_flagged(add, MTR_MEMO_PAGE_X_FIX | + MTR_MEMO_PAGE_SX_FIX)); + + if (!flst_get_len(base->page.frame + boffset)) + { + flst_add_to_empty(base, boffset, add, aoffset, mtr); + return DB_SUCCESS; + } + else + { + fil_addr_t addr= flst_get_first(base->page.frame + boffset); + buf_block_t *cur= add; + dberr_t err; + if (addr.page != add->page.id().page_no() && + !(cur= buf_page_get_gen(page_id_t{add->page.id().space(), addr.page}, + add->zip_size(), RW_SX_LATCH, nullptr, + BUF_GET_POSSIBLY_FREED, mtr, &err))) + return err; + return flst_insert_before(base, boffset, cur, addr.boffset, + add, aoffset, mtr); + } +} + +/** Remove a file list node. 
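+Unlinking updates the predecessor's FLST_NEXT (or FLST_FIRST in the base
+node), the successor's FLST_PREV (or FLST_LAST in the base node), and
+decrements the list length. A length of zero before the removal indicates
+a corrupted list and is reported as DB_CORRUPTION.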
+@param[in,out] base base node block +@param[in] boffset byte offset of the base node +@param[in,out] cur block to be removed +@param[in] coffset byte offset of the current record to be removed +@param[in,out] mtr mini-transaction +@return error code */ +dberr_t flst_remove(buf_block_t *base, uint16_t boffset, + buf_block_t *cur, uint16_t coffset, mtr_t *mtr) +{ + ut_ad(boffset < base->physical_size()); + ut_ad(coffset < cur->physical_size()); + ut_ad(mtr->memo_contains_flagged(base, MTR_MEMO_PAGE_X_FIX | + MTR_MEMO_PAGE_SX_FIX)); + ut_ad(mtr->memo_contains_flagged(cur, MTR_MEMO_PAGE_X_FIX | + MTR_MEMO_PAGE_SX_FIX)); + + const fil_addr_t prev_addr= flst_get_prev_addr(cur->page.frame + coffset); + const fil_addr_t next_addr= flst_get_next_addr(cur->page.frame + coffset); + dberr_t err= DB_SUCCESS; + + if (prev_addr.page == FIL_NULL) + flst_write_addr(*base, base->page.frame + boffset + FLST_FIRST, + next_addr.page, next_addr.boffset, mtr); + else + { + buf_block_t *b= cur; + if (prev_addr.page == b->page.id().page_no() || + (b= buf_page_get_gen(page_id_t(b->page.id().space(), prev_addr.page), + b->zip_size(), RW_SX_LATCH, nullptr, + BUF_GET_POSSIBLY_FREED, mtr, &err))) + flst_write_addr(*b, b->page.frame + prev_addr.boffset + FLST_NEXT, + next_addr.page, next_addr.boffset, mtr); + } + + if (next_addr.page == FIL_NULL) + flst_write_addr(*base, base->page.frame + boffset + FLST_LAST, + prev_addr.page, prev_addr.boffset, mtr); + else + { + dberr_t err2; + if (next_addr.page == cur->page.id().page_no() || + (cur= buf_page_get_gen(page_id_t(cur->page.id().space(), + next_addr.page), + cur->zip_size(), RW_SX_LATCH, nullptr, + BUF_GET_POSSIBLY_FREED, mtr, &err2))) + flst_write_addr(*cur, cur->page.frame + next_addr.boffset + FLST_PREV, + prev_addr.page, prev_addr.boffset, mtr); + else if (err == DB_SUCCESS) + err= err2; + } + + byte *len= &base->page.frame[boffset + FLST_LEN]; + if (UNIV_UNLIKELY(!mach_read_from_4(len))) + return DB_CORRUPTION; + mtr->write<4>(*base, len, mach_read_from_4(len) - 1); + return err; +} + +#ifdef UNIV_DEBUG +/** Validate a file-based list. */ +void flst_validate(const buf_block_t *base, uint16_t boffset, mtr_t *mtr) +{ + ut_ad(boffset < base->physical_size()); + ut_ad(mtr->memo_contains_flagged(base, MTR_MEMO_PAGE_X_FIX | + MTR_MEMO_PAGE_SX_FIX)); + + /* We use two mini-transaction handles: the first is used to lock + the base node, and prevent other threads from modifying the list. + The second is used to traverse the list. We cannot run the second + mtr without committing it at times, because if the list is long, + the x-locked pages could fill the buffer, resulting in a deadlock. 
*/ + mtr_t mtr2; + + const uint32_t len= flst_get_len(base->page.frame + boffset); + fil_addr_t addr= flst_get_first(base->page.frame + boffset); + + for (uint32_t i= len; i--; ) + { + mtr2.start(); + const buf_block_t *b= + buf_page_get_gen(page_id_t(base->page.id().space(), addr.page), + base->zip_size(), RW_SX_LATCH, nullptr, BUF_GET, &mtr2); + ut_ad(b); + addr= flst_get_next_addr(b->page.frame + addr.boffset); + mtr2.commit(); + } + + ut_ad(addr.page == FIL_NULL); + + addr= flst_get_last(base->page.frame + boffset); + + for (uint32_t i= len; i--; ) + { + mtr2.start(); + const buf_block_t *b= + buf_page_get_gen(page_id_t(base->page.id().space(), addr.page), + base->zip_size(), RW_SX_LATCH, nullptr, BUF_GET, &mtr2); + ut_ad(b); + addr= flst_get_prev_addr(b->page.frame + addr.boffset); + mtr2.commit(); + } + + ut_ad(addr.page == FIL_NULL); +} +#endif diff --git a/storage/innobase/gis/gis0geo.cc b/storage/innobase/gis/gis0geo.cc new file mode 100644 index 00000000..4c3ff188 --- /dev/null +++ b/storage/innobase/gis/gis0geo.cc @@ -0,0 +1,650 @@ +/***************************************************************************** + +Copyright (c) 2013, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file gis/gis0geo.cc +InnoDB R-tree related functions. + +Created 2013/03/27 Allen Lai and Jimmy Yang +*******************************************************/ + +#include "page0types.h" +#include "gis0geo.h" +#include "page0cur.h" +#include "ut0rnd.h" +#include "mach0data.h" + +#include <spatial.h> +#include <cmath> + +/* These definitions are for comparing 2 mbrs. */ + +/* Check if a intersects b. +Return false if a intersects b, otherwise true. */ +#define INTERSECT_CMP(amin, amax, bmin, bmax) \ +(((amin) > (bmax)) || ((bmin) > (amax))) + +/* Check if b contains a. +Return false if b contains a, otherwise true. */ +#define CONTAIN_CMP(amin, amax, bmin, bmax) \ +(((bmin) > (amin)) || ((bmax) < (amax))) + +/* Check if b is within a. +Return false if b is within a, otherwise true. */ +#define WITHIN_CMP(amin, amax, bmin, bmax) \ +(((amin) > (bmin)) || ((amax) < (bmax))) + +/* Check if a disjoints b. +Return false if a disjoints b, otherwise true. */ +#define DISJOINT_CMP(amin, amax, bmin, bmax) \ +(((amin) <= (bmax)) && ((bmin) <= (amax))) + +/* Check if a equals b. +Return false if equal, otherwise true. */ +#define EQUAL_CMP(amin, amax, bmin, bmax) \ +(((amin) != (bmin)) || ((amax) != (bmax))) + +/**************************************************************** +Functions for generating mbr +****************************************************************/ +/*************************************************************//** +Add one point stored in wkb to a given mbr. 
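+The mbr array holds a [min, max] pair per dimension. For example, with
+n_dims == 2, adding the point (2.0, 7.0) to the mbr [0.0, 1.0, 5.0, 6.0]
+widens it to [0.0, 2.0, 5.0, 7.0].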
+@return 0 if the point in wkb is valid, otherwise -1. */ +static +int +rtree_add_point_to_mbr( +/*===================*/ + const uchar** wkb, /*!< in: pointer to wkb, + where point is stored */ + const uchar* end, /*!< in: end of wkb. */ + uint n_dims, /*!< in: dimensions. */ + double* mbr) /*!< in/out: mbr, which + must be of length n_dims * 2. */ +{ + double ord; + double* mbr_end = mbr + n_dims * 2; + + while (mbr < mbr_end) { + if ((*wkb) + sizeof(double) > end) { + return(-1); + } + + ord = mach_double_read(*wkb); + (*wkb) += sizeof(double); + + if (ord < *mbr) { + *mbr = ord; + } + mbr++; + + if (ord > *mbr) { + *mbr = ord; + } + mbr++; + } + + return(0); +} + +/*************************************************************//** +Get mbr of point stored in wkb. +@return 0 if ok, otherwise -1. */ +static +int +rtree_get_point_mbr( +/*================*/ + const uchar** wkb, /*!< in: pointer to wkb, + where point is stored. */ + const uchar* end, /*!< in: end of wkb. */ + uint n_dims, /*!< in: dimensions. */ + double* mbr) /*!< in/out: mbr, + must be of length n_dims * 2. */ +{ + return rtree_add_point_to_mbr(wkb, end, n_dims, mbr); +} + + +/*************************************************************//** +Get mbr of linestring stored in wkb. +@return 0 if the linestring is valid, otherwise -1. */ +static +int +rtree_get_linestring_mbr( +/*=====================*/ + const uchar** wkb, /*!< in: pointer to wkb, + where point is stored. */ + const uchar* end, /*!< in: end of wkb. */ + uint n_dims, /*!< in: dimensions. */ + double* mbr) /*!< in/out: mbr, + must be of length n_dims * 2. */ +{ + uint n_points; + + n_points = uint4korr(*wkb); + (*wkb) += 4; + + for (; n_points > 0; --n_points) { + /* Add next point to mbr */ + if (rtree_add_point_to_mbr(wkb, end, n_dims, mbr)) { + return(-1); + } + } + + return(0); +} + +/*************************************************************//** +Get mbr of polygon stored in wkb. +@return 0 if the polygon is valid, otherwise -1. */ +static +int +rtree_get_polygon_mbr( +/*==================*/ + const uchar** wkb, /*!< in: pointer to wkb, + where point is stored. */ + const uchar* end, /*!< in: end of wkb. */ + uint n_dims, /*!< in: dimensions. */ + double* mbr) /*!< in/out: mbr, + must be of length n_dims * 2. */ +{ + uint n_linear_rings; + uint n_points; + + n_linear_rings = uint4korr((*wkb)); + (*wkb) += 4; + + for (; n_linear_rings > 0; --n_linear_rings) { + n_points = uint4korr((*wkb)); + (*wkb) += 4; + + for (; n_points > 0; --n_points) { + /* Add next point to mbr */ + if (rtree_add_point_to_mbr(wkb, end, n_dims, mbr)) { + return(-1); + } + } + } + + return(0); +} + +/*************************************************************//** +Get mbr of geometry stored in wkb. +@return 0 if the geometry is valid, otherwise -1. */ +static +int +rtree_get_geometry_mbr( +/*===================*/ + const uchar** wkb, /*!< in: pointer to wkb, + where point is stored. */ + const uchar* end, /*!< in: end of wkb. */ + uint n_dims, /*!< in: dimensions. */ + double* mbr, /*!< in/out: mbr. */ + int top) /*!< in: if it is the top, + which means it's not called + by itself. 
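+ A nested geometry collection is therefore rejected: the wkbGeometryCollection case returns -1 when top == 0.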
*/ +{ + int res; + uint wkb_type = 0; + uint n_items; + + /* byte_order = *(*wkb); */ + ++(*wkb); + + wkb_type = uint4korr((*wkb)); + (*wkb) += 4; + + switch ((enum wkbType) wkb_type) { + case wkbPoint: + res = rtree_get_point_mbr(wkb, end, n_dims, mbr); + break; + case wkbLineString: + res = rtree_get_linestring_mbr(wkb, end, n_dims, mbr); + break; + case wkbPolygon: + res = rtree_get_polygon_mbr(wkb, end, n_dims, mbr); + break; + case wkbMultiPoint: + n_items = uint4korr((*wkb)); + (*wkb) += 4; + for (; n_items > 0; --n_items) { + /* byte_order = *(*wkb); */ + ++(*wkb); + (*wkb) += 4; + if (rtree_get_point_mbr(wkb, end, n_dims, mbr)) { + return(-1); + } + } + res = 0; + break; + case wkbMultiLineString: + n_items = uint4korr((*wkb)); + (*wkb) += 4; + for (; n_items > 0; --n_items) { + /* byte_order = *(*wkb); */ + ++(*wkb); + (*wkb) += 4; + if (rtree_get_linestring_mbr(wkb, end, n_dims, mbr)) { + return(-1); + } + } + res = 0; + break; + case wkbMultiPolygon: + n_items = uint4korr((*wkb)); + (*wkb) += 4; + for (; n_items > 0; --n_items) { + /* byte_order = *(*wkb); */ + ++(*wkb); + (*wkb) += 4; + if (rtree_get_polygon_mbr(wkb, end, n_dims, mbr)) { + return(-1); + } + } + res = 0; + break; + case wkbGeometryCollection: + if (!top) { + return(-1); + } + + n_items = uint4korr((*wkb)); + (*wkb) += 4; + for (; n_items > 0; --n_items) { + if (rtree_get_geometry_mbr(wkb, end, n_dims, + mbr, 0)) { + return(-1); + } + } + res = 0; + break; + default: + res = -1; + } + + return(res); +} + +/*************************************************************//** +Calculate Minimal Bounding Rectangle (MBR) of the spatial object +stored in "well-known binary representation" (wkb) format. +@return 0 if ok. */ +int +rtree_mbr_from_wkb( +/*===============*/ + const uchar* wkb, /*!< in: wkb */ + uint size, /*!< in: size of wkb. */ + uint n_dims, /*!< in: dimensions. */ + double* mbr) /*!< in/out: mbr, which must + be of length n_dim2 * 2. */ +{ + for (uint i = 0; i < n_dims; ++i) { + mbr[i * 2] = DBL_MAX; + mbr[i * 2 + 1] = -DBL_MAX; + } + + return rtree_get_geometry_mbr(&wkb, wkb + size, n_dims, mbr, 1); +} + + +/**************************************************************** +Functions for Rtree split +****************************************************************/ +/*************************************************************//** +Join 2 mbrs of dimensions n_dim. */ +static +void +mbr_join( +/*=====*/ + double* a, /*!< in/out: the first mbr, + where the joined result will be. */ + const double* b, /*!< in: the second mbr. */ + int n_dim) /*!< in: dimensions. */ +{ + double* end = a + n_dim * 2; + + do { + if (a[0] > b[0]) { + a[0] = b[0]; + } + + if (a[1] < b[1]) { + a[1] = b[1]; + } + + a += 2; + b += 2; + + } while (a != end); +} + +/*************************************************************//** +Counts the square of mbr which is the join of a and b. Both a and b +are of dimensions n_dim. */ +static +double +mbr_join_square( +/*============*/ + const double* a, /*!< in: the first mbr. */ + const double* b, /*!< in: the second mbr. */ + int n_dim) /*!< in: dimensions. 
*/ +{ + const double* end = a + n_dim * 2; + double square = 1.0; + + do { + square *= std::max(a[1], b[1]) - std::min(a[0], b[0]); + + a += 2; + b += 2; + } while (a != end); + + /* Check if finite (not infinity or NaN), + so we don't get NaN in calculations */ + if (!std::isfinite(square)) { + return DBL_MAX; + } + + return square; +} + +/*************************************************************//** +Counts the square of mbr of dimension n_dim. */ +static +double +count_square( +/*=========*/ + const double* a, /*!< in: the mbr. */ + int n_dim) /*!< in: dimensions. */ +{ + const double* end = a + n_dim * 2; + double square = 1.0; + + do { + square *= a[1] - a[0]; + a += 2; + } while (a != end); + + return square; +} + +/*************************************************************//** +Copy mbr of dimension n_dim from src to dst. */ +inline +static +void +copy_coords( +/*========*/ + double* dst, /*!< in/out: destination. */ + const double* src, /*!< in: source. */ + int) +{ + memcpy(dst, src, DATA_MBR_LEN); +} + +/*************************************************************//** +Select two nodes to collect group upon */ +static +void +pick_seeds( +/*=======*/ + rtr_split_node_t* node, /*!< in: split nodes. */ + int n_entries, /*!< in: entries number. */ + rtr_split_node_t** seed_a, /*!< out: seed 1. */ + rtr_split_node_t** seed_b, /*!< out: seed 2. */ + int n_dim) /*!< in: dimensions. */ +{ + rtr_split_node_t* cur1; + rtr_split_node_t* lim1 = node + (n_entries - 1); + rtr_split_node_t* cur2; + rtr_split_node_t* lim2 = node + n_entries; + + double max_d = -DBL_MAX; + double d; + + *seed_a = node; + *seed_b = node + 1; + + for (cur1 = node; cur1 < lim1; ++cur1) { + for (cur2 = cur1 + 1; cur2 < lim2; ++cur2) { + d = mbr_join_square(cur1->coords, cur2->coords, n_dim) - + cur1->square - cur2->square; + if (d > max_d) { + max_d = d; + *seed_a = cur1; + *seed_b = cur2; + } + } + } +} + +/*************************************************************//** +Select next node and group where to add. */ +static +void +pick_next( +/*======*/ + rtr_split_node_t* node, /*!< in: split nodes. */ + int n_entries, /*!< in: entries number. */ + double* g1, /*!< in: mbr of group 1. */ + double* g2, /*!< in: mbr of group 2. */ + rtr_split_node_t** choice, /*!< out: the next node.*/ + int* n_group, /*!< out: group number.*/ + int n_dim) /*!< in: dimensions. */ +{ + rtr_split_node_t* cur = node; + rtr_split_node_t* end = node + n_entries; + double max_diff = -DBL_MAX; + + for (; cur < end; ++cur) { + double diff; + double abs_diff; + + if (cur->n_node != 0) { + continue; + } + + diff = mbr_join_square(g1, cur->coords, n_dim) - + mbr_join_square(g2, cur->coords, n_dim); + + abs_diff = fabs(diff); + if (abs_diff > max_diff) { + max_diff = abs_diff; + + /* Introduce some randomness if the record + is identical */ + if (diff == 0) { + diff = static_cast<double>(ut_rnd_gen() & 1); + } + + *n_group = 1 + (diff > 0); + *choice = cur; + } + } +} + +/*************************************************************//** +Mark not-in-group entries as n_group. */ +static +void +mark_all_entries( +/*=============*/ + rtr_split_node_t* node, /*!< in/out: split nodes. */ + int n_entries, /*!< in: entries number. */ + int n_group) /*!< in: group number. */ +{ + rtr_split_node_t* cur = node; + rtr_split_node_t* end = node + n_entries; + for (; cur < end; ++cur) { + if (cur->n_node != 0) { + continue; + } + cur->n_node = n_group; + } +} + +/*************************************************************//** +Split rtree node. 
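+This follows the quadratic split scheme: pick_seeds() picks as the initial
+groups the two entries whose joint bounding box wastes the most area, and
+pick_next() repeatedly assigns the remaining entry with the strongest
+preference for one group, subject to the minimal group size.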
+Return which group the first rec is in. */ +int +split_rtree_node( +/*=============*/ + rtr_split_node_t* node, /*!< in: split nodes. */ + int n_entries, /*!< in: entries number. */ + int all_size, /*!< in: total key's size. */ + int key_size, /*!< in: key's size. */ + int min_size, /*!< in: minimal group size. */ + int size1, /*!< in: size of group. */ + int size2, /*!< in: initial group sizes */ + double** d_buffer, /*!< in/out: buffer. */ + int n_dim, /*!< in: dimensions. */ + uchar* first_rec) /*!< in: the first rec. */ +{ + rtr_split_node_t* cur; + rtr_split_node_t* a = NULL; + rtr_split_node_t* b = NULL; + double* g1 = reserve_coords(d_buffer, n_dim); + double* g2 = reserve_coords(d_buffer, n_dim); + rtr_split_node_t* next = NULL; + int next_node = 0; + int i; + int first_rec_group = 1; + rtr_split_node_t* end = node + n_entries; + + if (all_size < min_size * 2) { + return 1; + } + + cur = node; + for (; cur < end; ++cur) { + cur->square = count_square(cur->coords, n_dim); + cur->n_node = 0; + } + + pick_seeds(node, n_entries, &a, &b, n_dim); + a->n_node = 1; + b->n_node = 2; + + copy_coords(g1, a->coords, n_dim); + size1 += key_size; + copy_coords(g2, b->coords, n_dim); + size2 += key_size; + + for (i = n_entries - 2; i > 0; --i) { + /* Can't write into group 2 */ + if (all_size - (size2 + key_size) < min_size) { + mark_all_entries(node, n_entries, 1); + break; + } + + /* Can't write into group 1 */ + if (all_size - (size1 + key_size) < min_size) { + mark_all_entries(node, n_entries, 2); + break; + } + + pick_next(node, n_entries, g1, g2, &next, &next_node, n_dim); + if (next_node == 1) { + size1 += key_size; + mbr_join(g1, next->coords, n_dim); + } else { + size2 += key_size; + mbr_join(g2, next->coords, n_dim); + } + + next->n_node = next_node; + + /* Find out where the first rec (of the page) will be at, + and inform the caller */ + if (first_rec && first_rec == next->key) { + first_rec_group = next_node; + } + } + + return(first_rec_group); +} + +/** Compare two minimum bounding rectangles. 
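+Each operator is evaluated per dimension on the serialized (min, max)
+double pairs. An illustrative example (values only; assuming the MBRs
+were serialized with mach_double_write()):
+
+  a = [0,2]x[0,2], b = [1,3]x[1,3]
+  rtree_key_cmp(PAGE_CUR_INTERSECT, b, a) == 0  (the rectangles overlap)
+  rtree_key_cmp(PAGE_CUR_WITHIN, b, a) == 1     (a does not lie within b)
+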
+@param mode comparison operator
+	MBR_INTERSECT(a,b)	a overlaps b
+	MBR_CONTAIN(a,b)	a contains b
+	MBR_DISJOINT(a,b)	a disjoint b
+	MBR_WITHIN(a,b)		a within b
+	MBR_EQUAL(a,b)		All coordinates of MBRs are equal
+	MBR_DATA(a,b)		Data reference is the same
+@param b first MBR
+@param a second MBR
+@retval 0 if the predicate holds
+@retval 1 if the predicate does not hold */
+int rtree_key_cmp(page_cur_mode_t mode, const void *b, const void *a)
+{
+  const byte *b_= static_cast<const byte*>(b);
+  const byte *a_= static_cast<const byte*>(a);
+
+  static_assert(DATA_MBR_LEN == SPDIMS * 2 * sizeof(double), "compatibility");
+
+  for (auto i = SPDIMS; i--; )
+  {
+    double amin= mach_double_read(a_);
+    double bmin= mach_double_read(b_);
+    a_+= sizeof(double);
+    b_+= sizeof(double);
+    double amax= mach_double_read(a_);
+    double bmax= mach_double_read(b_);
+    a_+= sizeof(double);
+    b_+= sizeof(double);
+
+    switch (mode) {
+    case PAGE_CUR_INTERSECT:
+      if (INTERSECT_CMP(amin, amax, bmin, bmax))
+        return 1;
+      continue;
+    case PAGE_CUR_CONTAIN:
+      if (CONTAIN_CMP(amin, amax, bmin, bmax))
+        return 1;
+      continue;
+    case PAGE_CUR_WITHIN:
+      if (WITHIN_CMP(amin, amax, bmin, bmax))
+        return 1;
+      continue;
+    case PAGE_CUR_MBR_EQUAL:
+      if (EQUAL_CMP(amin, amax, bmin, bmax))
+        return 1;
+      continue;
+    case PAGE_CUR_DISJOINT:
+      if (!DISJOINT_CMP(amin, amax, bmin, bmax))
+        return 0;
+      if (!i)
+        return 1;
+      continue;
+    case PAGE_CUR_UNSUPP:
+    case PAGE_CUR_G:
+    case PAGE_CUR_GE:
+    case PAGE_CUR_L:
+    case PAGE_CUR_LE:
+    case PAGE_CUR_RTREE_LOCATE:
+    case PAGE_CUR_RTREE_GET_FATHER:
+    case PAGE_CUR_RTREE_INSERT:
+      break;
+    }
+    ut_ad("unknown comparison operator" == 0);
+  }
+
+  return 0;
+}
diff --git a/storage/innobase/gis/gis0rtree.cc b/storage/innobase/gis/gis0rtree.cc
new file mode 100644
index 00000000..83afd732
--- /dev/null
+++ b/storage/innobase/gis/gis0rtree.cc
@@ -0,0 +1,1934 @@
+/*****************************************************************************
+
+Copyright (c) 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file gis/gis0rtree.cc
+InnoDB R-tree interfaces
+
+Created 2013/03/27 Allen Lai and Jimmy Yang
+***********************************************************************/
+
+#include "fsp0fsp.h"
+#include "page0page.h"
+#include "page0cur.h"
+#include "page0zip.h"
+#include "gis0rtree.h"
+#include "btr0cur.h"
+#include "btr0sea.h"
+#include "btr0pcur.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "ibuf0ibuf.h"
+#include "trx0undo.h"
+#include "srv0mon.h"
+#include "gis0geo.h"
+#include <cmath>
+
+/*************************************************************//**
+Initial split nodes info for R-tree split.
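+The work buffer allocated below holds (n_recs + 3) MBRs of
+SPDIMS * 2 doubles each: one per existing record, one for the entry to
+be inserted, and two scratch MBRs that split_rtree_node() reserves for
+its group MBRs g1 and g2. The rtr_split_node_t array itself follows the
+MBRs within the same allocation.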
+@return initialized split nodes array */
+static
+rtr_split_node_t*
+rtr_page_split_initialize_nodes(
+/*============================*/
+	mem_heap_t*	heap,	/*!< in: pointer to memory heap, or NULL */
+	btr_cur_t*	cursor,	/*!< in: cursor at which to insert; when the
+				function returns, the cursor is positioned
+				on the predecessor of the inserted record */
+	rec_offs**	offsets,/*!< in: offsets on inserted record */
+	const dtuple_t*	tuple,	/*!< in: tuple to insert */
+	double**	buf_pos)/*!< in/out: current buffer position */
+{
+	rtr_split_node_t*	split_node_array;
+	double*			buf;
+	ulint			n_recs;
+	rtr_split_node_t*	task;
+	rtr_split_node_t*	stop;
+	rtr_split_node_t*	cur;
+	rec_t*			rec;
+	buf_block_t*		block;
+	page_t*			page;
+	ulint			n_uniq;
+	ulint			len;
+	const byte*		source_cur;
+
+	block = btr_cur_get_block(cursor);
+	page = buf_block_get_frame(block);
+	n_uniq = dict_index_get_n_unique_in_tree(cursor->index());
+
+	n_recs = ulint(page_get_n_recs(page)) + 1;
+
+	/* We reserve memory for two temporary MBRs used by the split
+	algorithm, plus the new MBR that is to be inserted, so we need
+	(n_recs + 3) * MBR size for storing all MBRs. */
+	buf = static_cast<double*>(mem_heap_alloc(
+			heap, DATA_MBR_LEN * (n_recs + 3)
+			+ sizeof(rtr_split_node_t) * (n_recs + 1)));
+
+	split_node_array = (rtr_split_node_t*)(buf + SPDIMS * 2 * (n_recs + 3));
+	task = split_node_array;
+	*buf_pos = buf;
+	stop = task + n_recs;
+
+	rec = page_rec_get_next(page_get_infimum_rec(page));
+	const ulint n_core = page_is_leaf(page)
+		? cursor->index()->n_core_fields : 0;
+	*offsets = rec_get_offsets(rec, cursor->index(), *offsets, n_core,
+				   n_uniq, &heap);
+
+	source_cur = rec_get_nth_field(rec, *offsets, 0, &len);
+
+	for (cur = task; cur < stop - 1; ++cur) {
+		cur->coords = reserve_coords(buf_pos, SPDIMS);
+		cur->key = rec;
+
+		memcpy(cur->coords, source_cur, DATA_MBR_LEN);
+
+		rec = page_rec_get_next(rec);
+		*offsets = rec_get_offsets(rec, cursor->index(), *offsets,
+					   n_core, n_uniq, &heap);
+		source_cur = rec_get_nth_field(rec, *offsets, 0, &len);
+	}
+
+	/* Put the key to insert into the node list */
+	source_cur = static_cast<const byte*>(dfield_get_data(
+		dtuple_get_nth_field(tuple, 0)));
+	cur->coords = reserve_coords(buf_pos, SPDIMS);
+	rec = (byte*) mem_heap_alloc(
+		heap, rec_get_converted_size(cursor->index(), tuple, 0));
+
+	rec = rec_convert_dtuple_to_rec(rec, cursor->index(), tuple, 0);
+	cur->key = rec;
+
+	memcpy(cur->coords, source_cur, DATA_MBR_LEN);
+
+	return split_node_array;
+}
+
+/**********************************************************************//**
+Builds an R-tree node pointer out of a physical record and a page number.
+Note: for an R-tree, only the MBR and the page number fields are kept on
+a non-leaf page, unlike a B-tree, whose node pointers still carry the
+primary key fields.
+@return	own: node pointer */
+dtuple_t*
+rtr_index_build_node_ptr(
+/*=====================*/
+	const dict_index_t*	index,	/*!< in: index */
+	const rtr_mbr_t*	mbr,	/*!< in: mbr of lower page */
+	const rec_t*		rec,	/*!< in: record for which to build node
+					pointer */
+	ulint			page_no,/*!< in: page number to put in node
+					pointer */
+	mem_heap_t*		heap)	/*!< in: memory heap where pointer
+					created */
+{
+	dtuple_t*	tuple;
+	dfield_t*	field;
+	byte*		buf;
+	ulint		n_unique;
+	ulint		info_bits;
+
+	ut_ad(dict_index_is_spatial(index));
+
+	n_unique = DICT_INDEX_SPATIAL_NODEPTR_SIZE;
+
+	tuple = dtuple_create(heap, n_unique + 1);
+
+	/* For an R-tree internal node, we need to compare the page
+	number fields.
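+	Two node pointers may carry identical MBRs while pointing to
+	different child pages, so the child page number acts as the
+	tie-breaker that keeps the entries distinguishable.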
*/ + dtuple_set_n_fields_cmp(tuple, n_unique + 1); + + dict_index_copy_types(tuple, index, n_unique); + + /* Write page no field */ + buf = static_cast(mem_heap_alloc(heap, 4)); + + mach_write_to_4(buf, page_no); + + field = dtuple_get_nth_field(tuple, n_unique); + dfield_set_data(field, buf, 4); + + dtype_set(dfield_get_type(field), DATA_SYS_CHILD, DATA_NOT_NULL, 4); + + /* Set info bits. */ + info_bits = rec_get_info_bits(rec, dict_table_is_comp(index->table)); + dtuple_set_info_bits(tuple, info_bits | REC_STATUS_NODE_PTR); + + /* Set mbr as index entry data */ + field = dtuple_get_nth_field(tuple, 0); + + buf = static_cast(mem_heap_alloc(heap, DATA_MBR_LEN)); + + rtr_write_mbr(buf, mbr); + + dfield_set_data(field, buf, DATA_MBR_LEN); + + ut_ad(dtuple_check_typed(tuple)); + + return(tuple); +} + +/**************************************************************//** +Update the mbr field of a spatial index row. */ +void +rtr_update_mbr_field( +/*=================*/ + btr_cur_t* cursor, /*!< in/out: cursor pointed to rec.*/ + rec_offs* offsets, /*!< in/out: offsets on rec. */ + btr_cur_t* cursor2, /*!< in/out: cursor pointed to rec + that should be deleted. + this cursor is for btr_compress to + delete the merged page's father rec.*/ + page_t* child_page, /*!< in: child page. */ + rtr_mbr_t* mbr, /*!< in: the new mbr. */ + rec_t* new_rec, /*!< in: rec to use */ + mtr_t* mtr) /*!< in: mtr */ +{ + dict_index_t* index = cursor->index(); + mem_heap_t* heap; + page_t* page; + rec_t* rec; + constexpr ulint flags = BTR_NO_UNDO_LOG_FLAG + | BTR_NO_LOCKING_FLAG + | BTR_KEEP_SYS_FLAG; + dberr_t err; + big_rec_t* dummy_big_rec; + buf_block_t* block; + rec_t* child_rec; + ulint up_match = 0; + ulint low_match = 0; + ulint child; + ulint rec_info; + bool ins_suc = true; + ulint cur2_pos = 0; + ulint del_page_no = 0; + rec_offs* offsets2; + + rec = btr_cur_get_rec(cursor); + page = page_align(rec); + + rec_info = rec_get_info_bits(rec, rec_offs_comp(offsets)); + + heap = mem_heap_create(100); + block = btr_cur_get_block(cursor); + ut_ad(page == buf_block_get_frame(block)); + + child = btr_node_ptr_get_child_page_no(rec, offsets); + const ulint n_core = page_is_leaf(block->page.frame) + ? index->n_core_fields : 0; + + if (new_rec) { + child_rec = new_rec; + } else { + child_rec = page_rec_get_next(page_get_infimum_rec(child_page)); + } + + dtuple_t* node_ptr = rtr_index_build_node_ptr( + index, mbr, child_rec, child, heap); + + /* We need to remember the child page no of cursor2, since page could be + reorganized or insert a new rec before it. */ + if (cursor2) { + ut_ad(cursor2->index() == index); + rec_t* del_rec = btr_cur_get_rec(cursor2); + offsets2 = rec_get_offsets(btr_cur_get_rec(cursor2), + index, NULL, 0, + ULINT_UNDEFINED, &heap); + del_page_no = btr_node_ptr_get_child_page_no(del_rec, offsets2); + cur2_pos = page_rec_get_n_recs_before(btr_cur_get_rec(cursor2)); + } + + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(rec_offs_base(offsets)[0 + 1] == DATA_MBR_LEN); + ut_ad(node_ptr->fields[0].len == DATA_MBR_LEN); + + if (rec_info & REC_INFO_MIN_REC_FLAG) { + /* When the rec is minimal rec in this level, we do + in-place update for avoiding it move to other place. */ + page_zip_des_t* page_zip = buf_block_get_page_zip(block); + + if (UNIV_LIKELY_NULL(page_zip)) { + /* Check if there's enough space for in-place + update the zip page. 
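+			The MBR is fixed-length (DATA_MBR_LEN), so the
+			record itself does not change size; on a
+			ROW_FORMAT=COMPRESSED page the rewrite must
+			still fit into the space that
+			btr_cur_update_alloc_zip() reserves, and that
+			call may reorganize the page as a side effect.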
*/ + if (!btr_cur_update_alloc_zip( + page_zip, + btr_cur_get_page_cur(cursor), + offsets, + rec_offs_size(offsets), + false, mtr)) { + + /* If there's not enought space for + inplace update zip page, we do delete + insert. */ + ins_suc = false; + + /* Since btr_cur_update_alloc_zip could + reorganize the page, we need to repositon + cursor2. */ + if (cursor2) { + cursor2->page_cur.rec = + page_rec_get_nth(page, + cur2_pos); + } + + goto update_mbr; + } + + /* Record could be repositioned */ + rec = btr_cur_get_rec(cursor); + +#ifdef UNIV_DEBUG + /* Make sure it is still the first record */ + rec_info = rec_get_info_bits( + rec, rec_offs_comp(offsets)); + ut_ad(rec_info & REC_INFO_MIN_REC_FLAG); +#endif /* UNIV_DEBUG */ + memcpy(rec, node_ptr->fields[0].data, DATA_MBR_LEN); + page_zip_write_rec(block, rec, index, offsets, 0, mtr); + } else { + mtr->memcpy(*block, rec, + node_ptr->fields[0].data, + DATA_MBR_LEN); + } + + if (cursor2) { + rec_offs* offsets2; + + if (UNIV_LIKELY_NULL(page_zip)) { + cursor2->page_cur.rec + = page_rec_get_nth(page, cur2_pos); + } + offsets2 = rec_get_offsets(btr_cur_get_rec(cursor2), + index, NULL, 0, + ULINT_UNDEFINED, &heap); + ut_ad(del_page_no == btr_node_ptr_get_child_page_no( + cursor2->page_cur.rec, + offsets2)); + + page_cur_delete_rec(btr_cur_get_page_cur(cursor2), + offsets2, mtr); + } + } else if (page_get_n_recs(page) == 1) { + /* When there's only one rec in the page, we do insert/delete to + avoid page merge. */ + + page_cur_t page_cur; + rec_t* insert_rec; + rec_offs* insert_offsets = NULL; + ulint old_pos; + rec_t* old_rec; + + ut_ad(cursor2 == NULL); + + /* Insert the new mbr rec. */ + old_pos = page_rec_get_n_recs_before(rec); + + err = btr_cur_optimistic_insert( + flags, + cursor, &insert_offsets, &heap, + node_ptr, &insert_rec, &dummy_big_rec, 0, NULL, mtr); + + ut_ad(err == DB_SUCCESS); + + btr_cur_position(index, insert_rec, block, cursor); + + /* Delete the old mbr rec. */ + old_rec = page_rec_get_nth(page, old_pos); + ut_ad(old_rec != insert_rec); + + page_cur_position(old_rec, block, &page_cur); + page_cur.index = index; + offsets2 = rec_get_offsets(old_rec, index, NULL, n_core, + ULINT_UNDEFINED, &heap); + page_cur_delete_rec(&page_cur, offsets2, mtr); + + } else { +update_mbr: + /* When there're not only 1 rec in the page, we do delete/insert + to avoid page split. */ + rec_t* insert_rec; + rec_offs* insert_offsets = NULL; + rec_t* next_rec; + + /* Delete the rec which cursor point to. */ + next_rec = page_rec_get_next(rec); + page_cur_delete_rec(&cursor->page_cur, offsets, mtr); + if (!ins_suc) { + ut_ad(rec_info & REC_INFO_MIN_REC_FLAG); + + btr_set_min_rec_mark(next_rec, *block, mtr); + } + + /* If there's more than 1 rec left in the page, delete + the rec which cursor2 point to. 
Otherwise, delete it later.*/
+		if (cursor2 && page_get_n_recs(page) > 1) {
+			ulint		cur2_rec_info;
+			rec_t*		cur2_rec;
+
+			cur2_rec = cursor2->page_cur.rec;
+			offsets2 = rec_get_offsets(cur2_rec, index, NULL,
+						   n_core,
+						   ULINT_UNDEFINED, &heap);
+
+			cur2_rec_info = rec_get_info_bits(cur2_rec,
+						rec_offs_comp(offsets2));
+			if (cur2_rec_info & REC_INFO_MIN_REC_FLAG) {
+				/* If we delete the leftmost node
+				pointer on a non-leaf level, we must
+				mark the new leftmost node pointer as
+				the predefined minimum record */
+				rec_t*	next_rec = page_rec_get_next(cur2_rec);
+				btr_set_min_rec_mark(next_rec, *block, mtr);
+			}
+
+			ut_ad(del_page_no
+			      == btr_node_ptr_get_child_page_no(cur2_rec,
+								offsets2));
+			page_cur_delete_rec(btr_cur_get_page_cur(cursor2),
+					    offsets2, mtr);
+			cursor2 = NULL;
+		}
+
+		/* Insert the new rec. */
+		if (page_cur_search_with_match(node_ptr, PAGE_CUR_LE,
+					       &up_match, &low_match,
+					       btr_cur_get_page_cur(cursor),
+					       NULL)) {
+			goto err_exit;
+		}
+
+		err = btr_cur_optimistic_insert(flags, cursor, &insert_offsets,
+						&heap, node_ptr, &insert_rec,
+						&dummy_big_rec, 0, NULL, mtr);
+
+		/* If the optimistic insert fails, try to reorganize the
+		page and insert again. */
+		if (err == DB_SUCCESS) {
+			ins_suc = true;
+		} else if (ins_suc) {
+			ut_ad(err == DB_FAIL);
+			err = btr_page_reorganize(btr_cur_get_page_cur(cursor),
+						  mtr);
+			if (err == DB_SUCCESS) {
+				err = btr_cur_optimistic_insert(
+					flags, cursor, &insert_offsets, &heap,
+					node_ptr, &insert_rec, &dummy_big_rec,
+					0, NULL, mtr);
+			}
+
+			/* Will do pessimistic insert */
+			if (err != DB_SUCCESS) {
+				ut_ad(err == DB_FAIL);
+				ins_suc = false;
+			}
+		}
+
+		/* If the insert succeeded, position the cursor on the
+		inserted rec. */
+		if (ins_suc) {
+			btr_cur_position(index, insert_rec, block, cursor);
+			offsets = rec_get_offsets(insert_rec,
+						  index, offsets, n_core,
+						  ULINT_UNDEFINED, &heap);
+		}
+
+		/* Delete the rec which cursor2 points to. */
+		if (cursor2) {
+			ulint		cur2_pno;
+			rec_t*		cur2_rec;
+
+			cursor2->page_cur.rec = page_rec_get_nth(page,
+								 cur2_pos);
+
+			cur2_rec = btr_cur_get_rec(cursor2);
+
+			offsets2 = rec_get_offsets(cur2_rec, index, NULL,
+						   n_core,
+						   ULINT_UNDEFINED, &heap);
+
+			/* If the cursor2 position is on a wrong rec, we
+			need to reposition it.
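+			The earlier delete/insert may have shifted
+			records around, so the node pointer is relocated
+			below by matching its child page number
+			(del_page_no) rather than by trusting the saved
+			position.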
*/ + cur2_pno = btr_node_ptr_get_child_page_no(cur2_rec, offsets2); + if ((del_page_no != cur2_pno) + || (cur2_rec == insert_rec)) { + cur2_rec = page_get_infimum_rec(page); + + while ((cur2_rec + = page_rec_get_next(cur2_rec))) { + if (page_rec_is_supremum(cur2_rec)) { + break; + } + + offsets2 = rec_get_offsets(cur2_rec, index, + NULL, + n_core, + ULINT_UNDEFINED, + &heap); + cur2_pno = btr_node_ptr_get_child_page_no( + cur2_rec, offsets2); + if (cur2_pno == del_page_no) { + if (insert_rec != cur2_rec) { + cursor2->page_cur.rec = + cur2_rec; + break; + } + } + } + } + + rec_info = rec_get_info_bits(cur2_rec, + rec_offs_comp(offsets2)); + if (rec_info & REC_INFO_MIN_REC_FLAG) { + /* If we delete the leftmost node + pointer on a non-leaf level, we must + mark the new leftmost node pointer as + the predefined minimum record */ + rec_t* next_rec = page_rec_get_next(cur2_rec); + btr_set_min_rec_mark(next_rec, *block, mtr); + } + + ut_ad(cur2_pno == del_page_no && cur2_rec != insert_rec); + + page_cur_delete_rec(btr_cur_get_page_cur(cursor2), + offsets2, mtr); + } + + if (!ins_suc) { + mem_heap_t* new_heap = NULL; + + err = btr_cur_pessimistic_insert( + flags, + cursor, &insert_offsets, &new_heap, + node_ptr, &insert_rec, &dummy_big_rec, + 0, NULL, mtr); + + ut_ad(err == DB_SUCCESS); + + if (new_heap) { + mem_heap_free(new_heap); + } + + } + + if (cursor2) { + btr_cur_compress_if_useful(cursor, FALSE, mtr); + } + } + + ut_ad(page_has_prev(page) + || (REC_INFO_MIN_REC_FLAG & rec_get_info_bits( + page_rec_get_next(page_get_infimum_rec(page)), + page_is_comp(page)))); +err_exit: + mem_heap_free(heap); +} + +MY_ATTRIBUTE((nonnull, warn_unused_result)) +/**************************************************************//** +Update parent page's MBR and Predicate lock information during a split */ +static +dberr_t +rtr_adjust_upper_level( +/*===================*/ + btr_cur_t* sea_cur, /*!< in: search cursor */ + ulint flags, /*!< in: undo logging and + locking flags */ + buf_block_t* block, /*!< in/out: page to be split */ + buf_block_t* new_block, /*!< in/out: the new half page */ + rtr_mbr_t* mbr, /*!< in: MBR on the old page */ + rtr_mbr_t* new_mbr, /*!< in: MBR on the new page */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint page_no; + ulint new_page_no; + btr_cur_t cursor; + rec_offs* offsets; + mem_heap_t* heap; + ulint level; + dtuple_t* node_ptr_upper = nullptr; + page_cur_t* page_cursor; + lock_prdt_t prdt; + lock_prdt_t new_prdt; + big_rec_t* dummy_big_rec; + rec_t* rec; + + /* Create a memory heap where the data tuple is stored */ + heap = mem_heap_create(1024); + + cursor.thr = sea_cur->thr; + cursor.page_cur.index = sea_cur->index(); + cursor.page_cur.block = block; + + /* Get the level of the split pages */ + level = btr_page_get_level(buf_block_get_frame(block)); + ut_ad(level == btr_page_get_level(buf_block_get_frame(new_block))); + + page_no = block->page.id().page_no(); + + new_page_no = new_block->page.id().page_no(); + + /* Set new mbr for the old page on the upper level. 
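+	This happens in two steps: first rewrite the existing node pointer
+	of the split page with its shrunken MBR, then insert a node pointer
+	built from new_mbr for the new sibling page, falling back to a
+	pessimistic insert if the optimistic insert fails.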
*/ + /* Look up the index for the node pointer to page */ + offsets = rtr_page_get_father_block(NULL, heap, mtr, sea_cur, &cursor); + + page_cursor = btr_cur_get_page_cur(&cursor); + + rtr_update_mbr_field(&cursor, offsets, nullptr, block->page.frame, mbr, + nullptr, mtr); + + /* Already updated parent MBR, reset in our path */ + if (sea_cur->rtr_info) { + node_visit_t* node_visit = rtr_get_parent_node( + sea_cur, level + 1, true); + if (node_visit) { + node_visit->mbr_inc = 0; + } + } + + dberr_t err; + + if (const rec_t* first = page_rec_get_next_const( + page_get_infimum_rec(new_block->page.frame))) { + /* Insert the node for the new page. */ + node_ptr_upper = rtr_index_build_node_ptr( + sea_cur->index(), new_mbr, first, new_page_no, heap); + ulint up_match = 0, low_match = 0; + err = page_cur_search_with_match(node_ptr_upper, + PAGE_CUR_LE, + &up_match, &low_match, + btr_cur_get_page_cur(&cursor), + NULL) + ? DB_CORRUPTION + : btr_cur_optimistic_insert(flags + | BTR_NO_LOCKING_FLAG + | BTR_KEEP_SYS_FLAG + | BTR_NO_UNDO_LOG_FLAG, + &cursor, &offsets, &heap, + node_ptr_upper, &rec, + &dummy_big_rec, 0, NULL, + mtr); + } else { + err = DB_CORRUPTION; + } + + if (err == DB_FAIL) { + cursor.rtr_info = sea_cur->rtr_info; + cursor.tree_height = sea_cur->tree_height; + + /* Recreate a memory heap as input parameter for + btr_cur_pessimistic_insert(), because the heap may be + emptied in btr_cur_pessimistic_insert(). */ + mem_heap_t* new_heap = mem_heap_create(1024); + + err = btr_cur_pessimistic_insert(flags + | BTR_NO_LOCKING_FLAG + | BTR_KEEP_SYS_FLAG + | BTR_NO_UNDO_LOG_FLAG, + &cursor, &offsets, &new_heap, + node_ptr_upper, &rec, + &dummy_big_rec, 0, NULL, mtr); + cursor.rtr_info = NULL; + mem_heap_free(new_heap); + } + + if (err == DB_SUCCESS) { + prdt.data = static_cast(mbr); + prdt.op = 0; + new_prdt.data = static_cast(new_mbr); + new_prdt.op = 0; + + lock_prdt_update_parent(block, new_block, &prdt, &new_prdt, + page_cursor->block->page.id()); + } + + mem_heap_free(heap); + + ut_ad(block->zip_size() == sea_cur->index()->table->space->zip_size()); + + if (err != DB_SUCCESS) { + return err; + } + + const uint32_t next_page_no = btr_page_get_next(block->page.frame); + + if (next_page_no == FIL_NULL) { + } else if (buf_block_t* next_block = + btr_block_get(*sea_cur->index(), next_page_no, RW_X_LATCH, + false, mtr, &err)) { + if (UNIV_UNLIKELY(memcmp_aligned<4>(next_block->page.frame + + FIL_PAGE_PREV, + block->page.frame + + FIL_PAGE_OFFSET, 4))) { + return DB_CORRUPTION; + } + btr_page_set_prev(next_block, new_page_no, mtr); + } else { + return err; + } + + btr_page_set_next(block, new_page_no, mtr); + + btr_page_set_prev(new_block, page_no, mtr); + btr_page_set_next(new_block, next_page_no, mtr); + return DB_SUCCESS; +} + +/*************************************************************//** +Moves record list to another page for rtree splitting. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return error code +@retval DB_FAIL on ROW_FORMAT=COMPRESSED compression failure */ +static +dberr_t +rtr_split_page_move_rec_list( +/*=========================*/ + rtr_split_node_t* node_array, /*!< in: split node array. */ + int first_rec_group,/*!< in: group number of the + first rec. 
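+					(1 or 2, as returned by
+					split_rtree_node()).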
*/ + buf_block_t* new_block, /*!< in/out: index page + where to move */ + buf_block_t* block, /*!< in/out: page containing + split_rec */ + rec_t* first_rec, /*!< in: first record not to + move */ + dict_index_t* index, /*!< in: record descriptor */ + mem_heap_t* heap, /*!< in: pointer to memory + heap, or NULL */ + mtr_t* mtr) /*!< in: mtr */ +{ + rtr_split_node_t* cur_split_node; + rtr_split_node_t* end_split_node; + page_cur_t page_cursor; + page_cur_t new_page_cursor; + page_t* page; + page_t* new_page; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + page_zip_des_t* new_page_zip + = buf_block_get_page_zip(new_block); + rec_t* rec; + ulint moved = 0; + ulint max_to_move = 0; + rtr_rec_move_t* rec_move = NULL; + + ut_ad(!dict_index_is_ibuf(index)); + ut_ad(dict_index_is_spatial(index)); + + rec_offs_init(offsets_); + + page_cur_set_before_first(block, &page_cursor); + page_cur_set_before_first(new_block, &new_page_cursor); + page_cursor.index = new_page_cursor.index = index; + + page = buf_block_get_frame(block); + new_page = buf_block_get_frame(new_block); + + end_split_node = node_array + page_get_n_recs(page); + + mtr_log_t log_mode = MTR_LOG_NONE; + + if (new_page_zip) { + log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE); + } + + max_to_move = page_get_n_recs(buf_block_get_frame(block)); + rec_move = static_cast(mem_heap_alloc( + heap, + sizeof (*rec_move) * max_to_move)); + const ulint n_core = page_is_leaf(page) + ? index->n_core_fields : 0; + + /* Insert the recs in group 2 to new page. */ + for (cur_split_node = node_array; + cur_split_node < end_split_node; ++cur_split_node) { + if (cur_split_node->n_node != first_rec_group) { + lock_rec_store_on_page_infimum( + block, cur_split_node->key); + + offsets = rec_get_offsets(cur_split_node->key, + index, offsets, n_core, + ULINT_UNDEFINED, &heap); + + ut_ad(!n_core || cur_split_node->key != first_rec); + + rec = page_cur_insert_rec_low( + &new_page_cursor, + cur_split_node->key, offsets, mtr); + + if (UNIV_UNLIKELY + (!rec + || !page_cur_move_to_next(&new_page_cursor))) { + return DB_CORRUPTION; + } + + lock_rec_restore_from_page_infimum( + *new_block, rec, block->page.id()); + + rec_move[moved].new_rec = rec; + rec_move[moved].old_rec = cur_split_node->key; + rec_move[moved].moved = false; + moved++; + + if (moved > max_to_move) { + ut_ad(0); + break; + } + } + } + + /* Update PAGE_MAX_TRX_ID on the uncompressed page. + Modifications will be redo logged and copied to the compressed + page in page_zip_compress() or page_zip_reorganize() below. + Multiple transactions cannot simultaneously operate on the + same temp-table in parallel. + max_trx_id is ignored for temp tables because it not required + for MVCC. */ + if (n_core && !index->table->is_temporary()) { + page_update_max_trx_id(new_block, NULL, + page_get_max_trx_id(page), + mtr); + } + + if (new_page_zip) { + mtr_set_log_mode(mtr, log_mode); + + if (!page_zip_compress(new_block, index, + page_zip_level, mtr)) { + if (dberr_t err = + page_zip_reorganize(new_block, index, + page_zip_level, mtr)) { + if (err == DB_FAIL) { + ut_a(page_zip_decompress(new_page_zip, + new_page, + FALSE)); + } + return err; + } + } + } + + /* Update the lock table */ + lock_rtr_move_rec_list(new_block, block, rec_move, moved); + + /* Delete recs in second group from the old page. 
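+	The records were copied rather than moved, so the group-2
+	originals must now be removed from the old page to complete the
+	transfer.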
*/ + for (cur_split_node = node_array; + cur_split_node < end_split_node; ++cur_split_node) { + if (cur_split_node->n_node != first_rec_group) { + page_cur_position(cur_split_node->key, + block, &page_cursor); + offsets = rec_get_offsets( + page_cur_get_rec(&page_cursor), index, + offsets, n_core, ULINT_UNDEFINED, + &heap); + page_cur_delete_rec(&page_cursor, offsets, mtr); + } + } + + return DB_SUCCESS; +} + +/*************************************************************//** +Splits an R-tree index page to halves and inserts the tuple. It is assumed +that mtr holds an x-latch to the index tree. NOTE: the tree x-latch is +released within this function! NOTE that the operation of this +function must always succeed, we cannot reverse it: therefore enough +free disk space (2 pages) must be guaranteed to be available before +this function is called. +@return inserted record */ +rec_t* +rtr_page_split_and_insert( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in/out: cursor at which to insert; when the + function returns, the cursor is positioned + on the predecessor of the inserted record */ + rec_offs** offsets,/*!< out: offsets on inserted record */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ + const dtuple_t* tuple, /*!< in: tuple to insert */ + ulint n_ext, /*!< in: number of externally stored columns */ + mtr_t* mtr, /*!< in: mtr */ + dberr_t* err) /*!< out: error code */ +{ + buf_block_t* block; + page_t* page; + page_t* new_page; + buf_block_t* new_block; + page_zip_des_t* page_zip; + page_zip_des_t* new_page_zip; + page_cur_t* page_cursor; + rec_t* rec = 0; + ulint n_recs; + ulint total_data; + ulint insert_size; + rtr_split_node_t* rtr_split_node_array; + rtr_split_node_t* cur_split_node; + rtr_split_node_t* end_split_node; + double* buf_pos; + node_seq_t current_ssn; + node_seq_t next_ssn; + buf_block_t* root_block; + rtr_mbr_t mbr; + rtr_mbr_t new_mbr; + lock_prdt_t prdt; + lock_prdt_t new_prdt; + rec_t* first_rec = NULL; + int first_rec_group = 1; + IF_DBUG(bool iterated = false,); + + if (!*heap) { + *heap = mem_heap_create(1024); + } + +func_start: + mem_heap_empty(*heap); + *offsets = NULL; + + ut_ad(mtr->memo_contains_flagged(&cursor->index()->lock, + MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)); + ut_ad(!dict_index_is_online_ddl(cursor->index())); + ut_ad(cursor->index()->lock.have_u_or_x()); + + block = btr_cur_get_block(cursor); + page = buf_block_get_frame(block); + page_zip = buf_block_get_page_zip(block); + current_ssn = page_get_ssn_id(page); + + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(page_get_n_recs(page) >= 1); + + const page_id_t page_id(block->page.id()); + + if (!page_has_prev(page) && !page_is_leaf(page)) { + first_rec = page_rec_get_next( + page_get_infimum_rec(buf_block_get_frame(block))); + if (UNIV_UNLIKELY(!first_rec)) { +corrupted: + *err = DB_CORRUPTION; + return nullptr; + } + } + + /* Initial split nodes array. */ + rtr_split_node_array = rtr_page_split_initialize_nodes( + *heap, cursor, offsets, tuple, &buf_pos); + + /* Divide all mbrs to two groups. 
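+	split_rtree_node() assigns each entry to one of two groups; the
+	group containing the page's first record stays on the original
+	page, while the other group is moved to the newly allocated
+	sibling page.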
*/ + n_recs = ulint(page_get_n_recs(page)) + 1; + + end_split_node = rtr_split_node_array + n_recs; + +#ifdef UNIV_GIS_DEBUG + fprintf(stderr, "Before split a page:\n"); + for (cur_split_node = rtr_split_node_array; + cur_split_node < end_split_node; ++cur_split_node) { + for (int i = 0; i < SPDIMS * 2; i++) { + fprintf(stderr, "%.2lf ", + *(cur_split_node->coords + i)); + } + fprintf(stderr, "\n"); + } +#endif + + insert_size = rec_get_converted_size(cursor->index(), tuple, n_ext); + total_data = page_get_data_size(page) + insert_size; + first_rec_group = split_rtree_node(rtr_split_node_array, + static_cast(n_recs), + static_cast(total_data), + static_cast(insert_size), + 0, 2, 2, &buf_pos, SPDIMS, + static_cast(first_rec)); + + /* Allocate a new page to the index */ + const uint16_t page_level = btr_page_get_level(page); + new_block = btr_page_alloc(cursor->index(), page_id.page_no() + 1, + FSP_UP, page_level, mtr, mtr, err); + if (UNIV_UNLIKELY(!new_block)) { + return nullptr; + } + + new_page_zip = buf_block_get_page_zip(new_block); + if (page_level && UNIV_LIKELY_NULL(new_page_zip)) { + /* ROW_FORMAT=COMPRESSED non-leaf pages are not expected + to contain FIL_NULL in FIL_PAGE_PREV at this stage. */ + memset_aligned<4>(new_block->page.frame + FIL_PAGE_PREV, 0, 4); + } + btr_page_create(new_block, new_page_zip, cursor->index(), + page_level, mtr); + + new_page = buf_block_get_frame(new_block); + ut_ad(page_get_ssn_id(new_page) == 0); + + /* Set new ssn to the new page and page. */ + page_set_ssn_id(new_block, new_page_zip, current_ssn, mtr); + next_ssn = rtr_get_new_ssn_id(cursor->index()); + + page_set_ssn_id(block, page_zip, next_ssn, mtr); + + /* Keep recs in first group to the old page, move recs in second + groups to the new page. */ + if (0 +#ifdef UNIV_ZIP_COPY + || page_zip +#endif + || (*err = rtr_split_page_move_rec_list(rtr_split_node_array, + first_rec_group, + new_block, block, + first_rec, cursor->index(), + *heap, mtr))) { + if (*err != DB_FAIL) { + return nullptr; + } + + *err = DB_SUCCESS; + + ulint n = 0; + rec_t* rec; + ulint moved = 0; + ulint max_to_move = 0; + rtr_rec_move_t* rec_move = NULL; + ulint pos; + + /* For some reason, compressing new_page failed, + even though it should contain fewer records than + the original page. Copy the page byte for byte + and then delete the records from both pages + as appropriate. Deleting will always succeed. */ + ut_a(new_page_zip); + + page_zip_copy_recs(new_block, + page_zip, page, cursor->index(), mtr); + + page_cursor = btr_cur_get_page_cur(cursor); + + /* Move locks on recs. */ + max_to_move = page_get_n_recs(page); + rec_move = static_cast(mem_heap_alloc( + *heap, + sizeof (*rec_move) * max_to_move)); + + /* Init the rec_move array for moving lock on recs. */ + for (cur_split_node = rtr_split_node_array; + cur_split_node < end_split_node - 1; ++cur_split_node) { + if (cur_split_node->n_node != first_rec_group) { + pos = page_rec_get_n_recs_before( + cur_split_node->key); + rec = page_rec_get_nth(new_page, pos); + ut_a(rec); + + rec_move[moved].new_rec = rec; + rec_move[moved].old_rec = cur_split_node->key; + rec_move[moved].moved = false; + moved++; + + if (moved > max_to_move) { + ut_ad(0); + break; + } + } + } + + /* Update the lock table */ + lock_rtr_move_rec_list(new_block, block, rec_move, moved); + + const ulint n_core = page_level + ? 0 : cursor->index()->n_core_fields; + + /* Delete recs in first group from the new page. 
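+	After page_zip_copy_recs() the new page is a byte-for-byte copy
+	of the old one and still contains both groups, so the first group
+	is deleted from the new page here, and the second group is
+	deleted from the old page below.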
*/ + for (cur_split_node = rtr_split_node_array; + cur_split_node < end_split_node - 1; ++cur_split_node) { + if (cur_split_node->n_node == first_rec_group) { + ulint pos; + + pos = page_rec_get_n_recs_before( + cur_split_node->key); + ut_a(pos > 0); + rec_t* new_rec = page_rec_get_nth(new_page, + pos - n); + + ut_a(new_rec && page_rec_is_user_rec(new_rec)); + page_cur_position(new_rec, new_block, + page_cursor); + + *offsets = rec_get_offsets( + page_cur_get_rec(page_cursor), + cursor->index(), *offsets, n_core, + ULINT_UNDEFINED, heap); + + page_cur_delete_rec(page_cursor, + *offsets, mtr); + n++; + } + } + + /* Delete recs in second group from the old page. */ + for (cur_split_node = rtr_split_node_array; + cur_split_node < end_split_node - 1; ++cur_split_node) { + if (cur_split_node->n_node != first_rec_group) { + page_cur_position(cur_split_node->key, + block, page_cursor); + *offsets = rec_get_offsets( + page_cur_get_rec(page_cursor), + page_cursor->index, *offsets, n_core, + ULINT_UNDEFINED, heap); + page_cur_delete_rec(page_cursor, *offsets, + mtr); + } + } + +#ifdef UNIV_GIS_DEBUG + ut_ad(page_validate(new_page, cursor->index())); + ut_ad(page_validate(page, cursor->index())); +#endif + } + + /* Insert the new rec to the proper page. */ + cur_split_node = end_split_node - 1; + + /* Reposition the cursor for insert and try insertion */ + page_cursor = btr_cur_get_page_cur(cursor); + page_cursor->block = cur_split_node->n_node != first_rec_group + ? new_block : block; + + ulint up_match = 0, low_match = 0; + + if (page_cur_search_with_match(tuple, + PAGE_CUR_LE, &up_match, &low_match, + page_cursor, nullptr)) { + goto corrupted; + } + + /* It's possible that the new record is too big to be inserted into + the page, and it'll need the second round split in this case. + We test this scenario here*/ + DBUG_EXECUTE_IF("rtr_page_need_second_split", + if (!iterated) { + rec = NULL; + goto after_insert; } + ); + + rec = page_cur_tuple_insert(page_cursor, tuple, + offsets, heap, n_ext, mtr); + + /* If insert did not fit, try page reorganization. + For compressed pages, page_cur_tuple_insert() will have + attempted this already. */ + if (rec == NULL) { + if (!is_page_cur_get_page_zip(page_cursor) + && btr_page_reorganize(page_cursor, mtr)) { + rec = page_cur_tuple_insert(page_cursor, tuple, + offsets, + heap, n_ext, mtr); + + } + /* If insert fail, we will try to split the block again. */ + } + +#ifdef UNIV_DEBUG +after_insert: +#endif + /* Calculate the mbr on the upper half-page, and the mbr on + original page. */ + rtr_page_cal_mbr(cursor->index(), block, &mbr, *heap); + rtr_page_cal_mbr(cursor->index(), new_block, &new_mbr, *heap); + prdt.data = &mbr; + new_prdt.data = &new_mbr; + + /* Check any predicate locks need to be moved/copied to the + new page */ + lock_prdt_update_split(new_block, &prdt, &new_prdt, page_id); + + /* Adjust the upper level. */ + *err = rtr_adjust_upper_level(cursor, flags, block, new_block, + &mbr, &new_mbr, mtr); + if (UNIV_UNLIKELY(*err != DB_SUCCESS)) { + return nullptr; + } + + /* Save the new ssn to the root page, since we need to reinit + the first ssn value from it after restart server. */ + + root_block = btr_root_block_get(cursor->index(), RW_SX_LATCH, + mtr, err); + if (UNIV_UNLIKELY(!root_block)) { + return nullptr; + } + + page_zip = buf_block_get_page_zip(root_block); + page_set_ssn_id(root_block, page_zip, next_ssn, mtr); + + /* If the new res insert fail, we need to do another split + again. 
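+	This can happen when the record is too large to fit on either
+	half after the first split. The parent path is cleaned up and the
+	search restarts from func_start, so that the target page can be
+	split once more.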
*/
+	if (!rec) {
+		/* We play safe and reset the free bits for new_page */
+		if (!dict_index_is_clust(cursor->index())
+		    && !cursor->index()->table->is_temporary()) {
+			ibuf_reset_free_bits(new_block);
+			ibuf_reset_free_bits(block);
+		}
+
+		/* We need to clean the parent path here and search for
+		the father node later; otherwise, it is possible to find
+		a wrong parent. */
+		rtr_clean_rtr_info(cursor->rtr_info, true);
+		cursor->rtr_info = NULL;
+		IF_DBUG(iterated=true,);
+
+		rec_t* i_rec = page_rec_get_next(page_get_infimum_rec(
+			buf_block_get_frame(block)));
+		if (UNIV_UNLIKELY(!i_rec)) {
+			goto corrupted;
+		}
+		btr_cur_position(cursor->index(), i_rec, block, cursor);
+
+		goto func_start;
+	}
+
+#ifdef UNIV_GIS_DEBUG
+	ut_ad(page_validate(buf_block_get_frame(block), cursor->index()));
+	ut_ad(page_validate(buf_block_get_frame(new_block), cursor->index()));
+
+	ut_ad(!rec || rec_offs_validate(rec, cursor->index(), *offsets));
+#endif
+	return(rec);
+}
+
+/****************************************************************//**
+Enlarge the MBRs of the parent pages along the R-tree search path if
+the newly inserted entry extended them.
+@return error code */
+dberr_t
+rtr_ins_enlarge_mbr(
+/*================*/
+	btr_cur_t*	btr_cur,	/*!< in: btr cursor */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	dberr_t			err = DB_SUCCESS;
+	rtr_mbr_t		new_mbr;
+	buf_block_t*		block;
+	mem_heap_t*		heap;
+	page_cur_t*		page_cursor;
+	rec_offs*		offsets;
+	node_visit_t*		node_visit;
+	btr_cur_t		cursor;
+	page_t*			page;
+
+	ut_ad(btr_cur->index()->is_spatial());
+
+	/* If there is no rtr_info, or the R-tree is a one-level tree,
+	there is nothing to do. */
+	if (!btr_cur->rtr_info || btr_cur->tree_height == 1) {
+		return(err);
+	}
+
+	/* Check that the path info is not empty. */
+	ut_ad(!btr_cur->rtr_info->parent_path->empty());
+
+	/* Create a memory heap. */
+	heap = mem_heap_create(1024);
+
+	/* The leaf level page is stored in the cursor */
+	page_cursor = btr_cur_get_page_cur(btr_cur);
+	block = page_cur_get_block(page_cursor);
+
+	for (ulint i = 1; i < btr_cur->tree_height; i++) {
+		node_visit = rtr_get_parent_node(btr_cur, i, true);
+		ut_ad(node_visit != NULL);
+
+		/* If the MBR was not enlarged at this level, skip it. */
+		if (node_visit->mbr_inc == 0) {
+			block = btr_pcur_get_block(node_visit->cursor);
+			continue;
+		}
+
+		/* Calculate the mbr of the child page. */
+		rtr_page_cal_mbr(page_cursor->index, block, &new_mbr, heap);
+
+		/* Get father block. */
+		cursor.page_cur.index = page_cursor->index;
+		cursor.page_cur.block = block;
+		offsets = rtr_page_get_father_block(
+			NULL, heap, mtr, btr_cur, &cursor);
+
+		page = buf_block_get_frame(block);
+
+		/* Update the mbr field of the rec. */
+		rtr_update_mbr_field(&cursor, offsets, NULL, page,
+				     &new_mbr, NULL, mtr);
+		block = btr_cur_get_block(&cursor);
+	}
+
+	mem_heap_free(heap);
+
+	return(err);
+}
+
+/*************************************************************//**
+Copy recs from a page to new_block of rtree.
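+The copy keeps the records ordered by cmp_rec_rec(); if an identical
+leaf record already exists on the target page, it is not copied again
+but has its delete mark cleared instead (see btr_rec_set_deleted()
+below).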
+ +@return error code */ +dberr_t +rtr_page_copy_rec_list_end_no_locks( +/*================================*/ + buf_block_t* new_block, /*!< in: index page to copy to */ + buf_block_t* block, /*!< in: index page of rec */ + rec_t* rec, /*!< in: record on page */ + dict_index_t* index, /*!< in: record descriptor */ + mem_heap_t* heap, /*!< in/out: heap memory */ + rtr_rec_move_t* rec_move, /*!< in: recording records moved */ + ulint max_move, /*!< in: num of rec to move */ + ulint* num_moved, /*!< out: num of rec to move */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t* new_page = buf_block_get_frame(new_block); + page_cur_t page_cur; + page_cur_t cur1; + rec_t* cur_rec; + rec_offs offsets_1[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets1 = offsets_1; + rec_offs offsets_2[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets2 = offsets_2; + ulint moved = 0; + const ulint n_core = page_is_leaf(new_page) + ? index->n_core_fields : 0; + + rec_offs_init(offsets_1); + rec_offs_init(offsets_2); + + page_cur_position(rec, block, &cur1); + + if (page_cur_is_before_first(&cur1) && !page_cur_move_to_next(&cur1)) { + return DB_CORRUPTION; + } + + ut_a(page_is_comp(new_page) == page_rec_is_comp(rec)); + ut_a(mach_read_from_2(new_page + srv_page_size - 10) == (ulint) + (page_is_comp(new_page) ? PAGE_NEW_INFIMUM : PAGE_OLD_INFIMUM)); + + cur_rec = page_rec_get_next( + page_get_infimum_rec(buf_block_get_frame(new_block))); + if (UNIV_UNLIKELY(!cur_rec)) { + return DB_CORRUPTION; + } + page_cur_position(cur_rec, new_block, &page_cur); + page_cur.index = index; + + /* Copy records from the original page to the new page */ + while (!page_cur_is_after_last(&cur1)) { + rec_t* cur1_rec = page_cur_get_rec(&cur1); + rec_t* ins_rec; + + if (page_rec_is_infimum(cur_rec)) { + cur_rec = page_rec_get_next(cur_rec); + if (UNIV_UNLIKELY(!cur_rec)) { + return DB_CORRUPTION; + } + } + + offsets1 = rec_get_offsets(cur1_rec, index, offsets1, n_core, + ULINT_UNDEFINED, &heap); + while (!page_rec_is_supremum(cur_rec)) { + ulint cur_matched_fields = 0; + int cmp; + + offsets2 = rec_get_offsets(cur_rec, index, offsets2, + n_core, + ULINT_UNDEFINED, &heap); + cmp = cmp_rec_rec(cur1_rec, cur_rec, + offsets1, offsets2, index, false, + &cur_matched_fields); + if (cmp < 0) { + goto move_to_prev; + } else if (cmp > 0) { + /* Skip small recs. */ + cur_rec = page_cur_move_to_next(&page_cur); + } else if (n_core) { + if (rec_get_deleted_flag(cur1_rec, + dict_table_is_comp(index->table))) { + goto next; + } else { + /* We have two identical leaf records, + skip copying the undeleted one, and + unmark deleted on the current page */ + btr_rec_set_deleted( + new_block, cur_rec, mtr); + goto next; + } + } + } + + /* If position is on suprenum rec, need to move to + previous rec. 
*/ + if (page_rec_is_supremum(cur_rec)) { +move_to_prev: + cur_rec = page_cur_move_to_prev(&page_cur); + } else { + cur_rec = page_cur_get_rec(&page_cur); + } + + if (UNIV_UNLIKELY(!cur_rec)) { + return DB_CORRUPTION; + } + + offsets1 = rec_get_offsets(cur1_rec, index, offsets1, n_core, + ULINT_UNDEFINED, &heap); + + ins_rec = page_cur_insert_rec_low(&page_cur, + cur1_rec, offsets1, mtr); + if (UNIV_UNLIKELY(!ins_rec || moved >= max_move)) { + return DB_CORRUPTION; + } + + rec_move[moved].new_rec = ins_rec; + rec_move[moved].old_rec = cur1_rec; + rec_move[moved].moved = false; + moved++; +next: + if (UNIV_UNLIKELY(!page_cur_move_to_next(&cur1))) { + return DB_CORRUPTION; + } + } + + *num_moved = moved; + return DB_SUCCESS; +} + +/*************************************************************//** +Copy recs till a specified rec from a page to new_block of rtree. + +@return error code */ +dberr_t +rtr_page_copy_rec_list_start_no_locks( +/*==================================*/ + buf_block_t* new_block, /*!< in: index page to copy to */ + buf_block_t* block, /*!< in: index page of rec */ + rec_t* rec, /*!< in: record on page */ + dict_index_t* index, /*!< in: record descriptor */ + mem_heap_t* heap, /*!< in/out: heap memory */ + rtr_rec_move_t* rec_move, /*!< in: recording records moved */ + ulint max_move, /*!< in: num of rec to move */ + ulint* num_moved, /*!< out: num of rec to move */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_cur_t cur1; + rec_t* cur_rec; + rec_offs offsets_1[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets1 = offsets_1; + rec_offs offsets_2[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets2 = offsets_2; + page_cur_t page_cur; + ulint moved = 0; + const ulint n_core = page_is_leaf(buf_block_get_frame(block)) + ? index->n_core_fields : 0; + + rec_offs_init(offsets_1); + rec_offs_init(offsets_2); + + page_cur_set_before_first(block, &cur1); + if (UNIV_UNLIKELY(!page_cur_move_to_next(&cur1))) { + return DB_CORRUPTION; + } + + cur_rec = page_rec_get_next( + page_get_infimum_rec(buf_block_get_frame(new_block))); + if (UNIV_UNLIKELY(!cur_rec)) { + return DB_CORRUPTION; + } + page_cur_position(cur_rec, new_block, &page_cur); + page_cur.index = index; + + while (page_cur_get_rec(&cur1) != rec) { + rec_t* cur1_rec = page_cur_get_rec(&cur1); + rec_t* ins_rec; + + if (page_rec_is_infimum(cur_rec)) { + cur_rec = page_rec_get_next(cur_rec); + if (UNIV_UNLIKELY(!cur_rec)) { + return DB_CORRUPTION; + } + } + + offsets1 = rec_get_offsets(cur1_rec, index, offsets1, n_core, + ULINT_UNDEFINED, &heap); + + while (!page_rec_is_supremum(cur_rec)) { + ulint cur_matched_fields = 0; + + offsets2 = rec_get_offsets(cur_rec, index, offsets2, + n_core, + ULINT_UNDEFINED, &heap); + int cmp = cmp_rec_rec(cur1_rec, cur_rec, + offsets1, offsets2, index, false, + &cur_matched_fields); + if (cmp < 0) { + goto move_to_prev; + } else if (cmp > 0) { + /* Skip small recs. */ + cur_rec = page_cur_move_to_next(&page_cur); + } else if (n_core) { + if (rec_get_deleted_flag( + cur1_rec, + dict_table_is_comp(index->table))) { + goto next; + } else { + /* We have two identical leaf records, + skip copying the undeleted one, and + unmark deleted on the current page */ + btr_rec_set_deleted( + new_block, cur_rec, mtr); + goto next; + } + } + } + + /* If position is on suprenum rec, need to move to + previous rec. 
*/ + if (page_rec_is_supremum(cur_rec)) { +move_to_prev: + cur_rec = page_cur_move_to_prev(&page_cur); + } else { + cur_rec = page_cur_get_rec(&page_cur); + } + + if (UNIV_UNLIKELY(!cur_rec)) { + return DB_CORRUPTION; + } + + offsets1 = rec_get_offsets(cur1_rec, index, offsets1, n_core, + ULINT_UNDEFINED, &heap); + + ins_rec = page_cur_insert_rec_low(&page_cur, + cur1_rec, offsets1, mtr); + if (UNIV_UNLIKELY(!ins_rec || moved >= max_move)) { + return DB_CORRUPTION; + } + + rec_move[moved].new_rec = ins_rec; + rec_move[moved].old_rec = cur1_rec; + rec_move[moved].moved = false; + moved++; +next: + if (UNIV_UNLIKELY(!page_cur_move_to_next(&cur1))) { + return DB_CORRUPTION; + } + } + + *num_moved = moved; + return DB_SUCCESS; +} + +/****************************************************************//** +Check two MBRs are identical or need to be merged */ +bool +rtr_merge_mbr_changed( +/*==================*/ + btr_cur_t* cursor, /*!< in/out: cursor */ + btr_cur_t* cursor2, /*!< in: the other cursor */ + rec_offs* offsets, /*!< in: rec offsets */ + rec_offs* offsets2, /*!< in: rec offsets */ + rtr_mbr_t* new_mbr) /*!< out: MBR to update */ +{ + double* mbr; + double mbr1[SPDIMS * 2]; + double mbr2[SPDIMS * 2]; + rec_t* rec; + ulint len; + bool changed = false; + + ut_ad(cursor->index()->is_spatial()); + + rec = btr_cur_get_rec(cursor); + + rtr_read_mbr(rec_get_nth_field(rec, offsets, 0, &len), + reinterpret_cast(mbr1)); + + rec = btr_cur_get_rec(cursor2); + + rtr_read_mbr(rec_get_nth_field(rec, offsets2, 0, &len), + reinterpret_cast(mbr2)); + + mbr = reinterpret_cast(new_mbr); + + for (int i = 0; i < SPDIMS * 2; i += 2) { + changed = (changed || mbr1[i] != mbr2[i]); + *mbr = mbr1[i] < mbr2[i] ? mbr1[i] : mbr2[i]; + mbr++; + changed = (changed || mbr1[i + 1] != mbr2 [i + 1]); + *mbr = mbr1[i + 1] > mbr2[i + 1] ? mbr1[i + 1] : mbr2[i + 1]; + mbr++; + } + + return(changed); +} + +/****************************************************************//** +Merge 2 mbrs and update the the mbr that cursor is on. */ +void +rtr_merge_and_update_mbr( +/*=====================*/ + btr_cur_t* cursor, /*!< in/out: cursor */ + btr_cur_t* cursor2, /*!< in: the other cursor */ + rec_offs* offsets, /*!< in: rec offsets */ + rec_offs* offsets2, /*!< in: rec offsets */ + page_t* child_page, /*!< in: the page. */ + mtr_t* mtr) /*!< in: mtr */ +{ + rtr_mbr_t new_mbr; + + if (rtr_merge_mbr_changed(cursor, cursor2, offsets, offsets2, + &new_mbr)) { + rtr_update_mbr_field(cursor, offsets, cursor2, child_page, + &new_mbr, NULL, mtr); + } else { + rtr_node_ptr_delete(cursor2, mtr); + } +} + +/*************************************************************//** +Deletes on the upper level the node pointer to a page. 
*/
+void
+rtr_node_ptr_delete(
+/*================*/
+	btr_cur_t*	cursor, /*!< in: search cursor, contains information
+				about parent nodes in search */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	ibool		compressed;
+	dberr_t		err;
+
+	compressed = btr_cur_pessimistic_delete(&err, TRUE, cursor,
+						BTR_CREATE_FLAG, false, mtr);
+	ut_a(err == DB_SUCCESS);
+
+	if (!compressed) {
+		btr_cur_compress_if_useful(cursor, FALSE, mtr);
+	}
+}
+
+/**************************************************************//**
+Check whether an R-tree page is a child of a parent page
+@return true if there is a child/parent relationship */
+bool
+rtr_check_same_block(
+/*================*/
+	dict_index_t*	index,	/*!< in: index tree */
+	btr_cur_t*	cursor,	/*!< in/out: position at the parent entry
+				pointing to the child if successful */
+	buf_block_t*	parentb,/*!< in: parent page to check */
+	mem_heap_t*	heap)	/*!< in: memory heap */
+
+{
+	const uint32_t	page_no =
+		btr_cur_get_block(cursor)->page.id().page_no();
+	rec_offs*	offsets;
+	rec_t*		rec = page_get_infimum_rec(parentb->page.frame);
+
+	while ((rec = page_rec_get_next(rec)) && !page_rec_is_supremum(rec)) {
+		offsets = rec_get_offsets(
+			rec, index, NULL, 0, ULINT_UNDEFINED, &heap);
+
+		if (btr_node_ptr_get_child_page_no(rec, offsets) == page_no) {
+			btr_cur_position(index, rec, parentb, cursor);
+			return(true);
+		}
+	}
+
+	return(false);
+}
+
+/*************************************************************//**
+Calculates MBR_AREA(a+b) - MBR_AREA(a)
+Note: when the 'a' and 'b' objects are far from each other,
+the area increase can be really big, so this function
+can return 'inf' as a result.
+Returns the area increase. */
+static double
+rtree_area_increase(
+	const uchar*	a,		/*!< in: original mbr. */
+	const uchar*	b,		/*!< in: new mbr. */
+	double*		ab_area)	/*!< out: increased area. */
+{
+	double		a_area = 1.0;
+	double		loc_ab_area = 1.0;
+	double		amin, amax, bmin, bmax;
+	double		data_round = 1.0;
+
+	static_assert(DATA_MBR_LEN == SPDIMS * 2 * sizeof(double),
+		      "compatibility");
+
+	for (auto i = SPDIMS; i--; ) {
+		double	area;
+
+		amin = mach_double_read(a);
+		bmin = mach_double_read(b);
+		amax = mach_double_read(a + sizeof(double));
+		bmax = mach_double_read(b + sizeof(double));
+
+		a += 2 * sizeof(double);
+		b += 2 * sizeof(double);
+
+		area = amax - amin;
+		if (area == 0) {
+			a_area *= LINE_MBR_WEIGHTS;
+		} else {
+			a_area *= area;
+		}
+
+		area = (double)std::max(amax, bmax) -
+		       (double)std::min(amin, bmin);
+		if (area == 0) {
+			loc_ab_area *= LINE_MBR_WEIGHTS;
+		} else {
+			loc_ab_area *= area;
+		}
+
+		/* The value of amax or bmin can be so large that a
+		small difference is ignored. For example:
+		3.2884281489988079e+284 - 100 = 3.2884281489988079e+284.
+		As a result, some area differences are not detected. */
+		if (loc_ab_area == a_area) {
+			if (bmin < amin || bmax > amax) {
+				data_round *= ((double)std::max(amax, bmax)
+					       - amax
+					       + (amin - (double)std::min(
+							 amin, bmin)));
+			} else {
+				data_round *= area;
+			}
+		}
+	}
+
+	*ab_area = loc_ab_area;
+
+	if (loc_ab_area == a_area && data_round != 1.0) {
+		return(data_round);
+	}
+
+	return(loc_ab_area - a_area);
+}
+
+/** Calculates overlapping area
+@param[in]	a	mbr a
+@param[in]	b	mbr b
+@return overlapping area */
+static double rtree_area_overlapping(const byte *a, const byte *b)
+{
+	double	area = 1.0;
+	double	amin;
+	double	amax;
+	double	bmin;
+	double	bmax;
+
+	static_assert(DATA_MBR_LEN == SPDIMS * 2 * sizeof(double),
+		      "compatibility");
+
+	for (auto i = SPDIMS; i--; ) {
+		amin = mach_double_read(a);
+		bmin = mach_double_read(b);
+		amax = mach_double_read(a + sizeof(double));
+		bmax = mach_double_read(b + sizeof(double));
+		a += 2 * sizeof(double);
+		b += 2 * sizeof(double);
+
+		amin = std::max(amin, bmin);
+		amax = std::min(amax, bmax);
+
+		if (amin > amax) {
+			return(0);
+		} else {
+			area *= (amax - amin);
+		}
+	}
+
+	return(area);
+}
+
+/****************************************************************//**
+Calculate the area increase caused by a new record
+@return area increased */
+double
+rtr_rec_cal_increase(
+/*=================*/
+	const dtuple_t*	dtuple,	/*!< in: data tuple to insert, which
+				causes the area increase */
+	const rec_t*	rec,	/*!< in: physical record which differs from
+				dtuple in some of the common fields, or which
+				has an equal number or more fields than
+				dtuple */
+	double*		area)	/*!< out: increased area */
+{
+	const dfield_t*	dtuple_field;
+
+	ut_ad(!page_rec_is_supremum(rec));
+	ut_ad(!page_rec_is_infimum(rec));
+
+	dtuple_field = dtuple_get_nth_field(dtuple, 0);
+	ut_ad(dfield_get_len(dtuple_field) == DATA_MBR_LEN);
+
+	return rtree_area_increase(rec,
+				   static_cast<const uchar*>(
+					   dfield_get_data(dtuple_field)),
+				   area);
+}
+
+/** Estimates the number of rows in a given area.
+@param[in]	index	index
+@param[in]	tuple	range tuple containing mbr, may also be empty tuple
+@param[in]	mode	search mode
+@return estimated number of rows */
+ha_rows
+rtr_estimate_n_rows_in_range(
+	dict_index_t*	index,
+	const dtuple_t*	tuple,
+	page_cur_mode_t	mode)
+{
+	ut_ad(dict_index_is_spatial(index));
+
+	/* Check tuple & mode */
+	if (tuple->n_fields == 0) {
+		return(HA_POS_ERROR);
+	}
+
+	switch (mode) {
+	case PAGE_CUR_DISJOINT:
+	case PAGE_CUR_CONTAIN:
+	case PAGE_CUR_INTERSECT:
+	case PAGE_CUR_WITHIN:
+	case PAGE_CUR_MBR_EQUAL:
+		break;
+	default:
+		return(HA_POS_ERROR);
+	}
+
+	DBUG_EXECUTE_IF("rtr_pcur_move_to_next_return",
+		return(2);
+	);
+
+	/* Read mbr from tuple. */
+	rtr_mbr_t	range_mbr;
+	double		range_area;
+
+	const dfield_t* dtuple_field = dtuple_get_nth_field(tuple, 0);
+	ut_ad(dfield_get_len(dtuple_field) >= DATA_MBR_LEN);
+	const byte* range_mbr_ptr = reinterpret_cast<const byte*>(
+		dfield_get_data(dtuple_field));
+
+	rtr_read_mbr(range_mbr_ptr, &range_mbr);
+	range_area = (range_mbr.xmax - range_mbr.xmin)
+		 * (range_mbr.ymax - range_mbr.ymin);
+
+	/* Get index root page.
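+	Only the root page is sampled: each node pointer MBR on the root
+	is compared with the search MBR, the per-entry estimates are
+	summed and divided by the number of root entries, and the result
+	is scaled by the table row count.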
*/ + mtr_t mtr; + + mtr.start(); + index->set_modified(mtr); + mtr_s_lock_index(index, &mtr); + + dberr_t err; + buf_block_t* block = btr_root_block_get(index, RW_S_LATCH, &mtr, &err); + if (!block) { +err_exit: + mtr.commit(); + return HA_POS_ERROR; + } + const page_t* page = buf_block_get_frame(block); + const unsigned n_recs = page_header_get_field(page, PAGE_N_RECS); + + if (n_recs == 0) { + goto err_exit; + } + + /* Scan records in root page and calculate area. */ + double area = 0; + for (const rec_t* rec = page_rec_get_next_const( + page_get_infimum_rec(block->page.frame)); + rec && !page_rec_is_supremum(rec); + rec = page_rec_get_next_const(rec)) { + rtr_mbr_t mbr; + double rec_area; + + rtr_read_mbr(rec, &mbr); + + rec_area = (mbr.xmax - mbr.xmin) * (mbr.ymax - mbr.ymin); + + if (rec_area == 0) { + switch (mode) { + case PAGE_CUR_CONTAIN: + case PAGE_CUR_INTERSECT: + area += 1; + break; + + case PAGE_CUR_DISJOINT: + break; + + case PAGE_CUR_WITHIN: + case PAGE_CUR_MBR_EQUAL: + if (!rtree_key_cmp( + PAGE_CUR_WITHIN, range_mbr_ptr, + rec)) { + area += 1; + } + + break; + + default: + ut_error; + } + } else { + switch (mode) { + case PAGE_CUR_CONTAIN: + case PAGE_CUR_INTERSECT: + area += rtree_area_overlapping( + range_mbr_ptr, rec) + / rec_area; + break; + + case PAGE_CUR_DISJOINT: + area += 1; + area -= rtree_area_overlapping( + range_mbr_ptr, rec) + / rec_area; + break; + + case PAGE_CUR_WITHIN: + case PAGE_CUR_MBR_EQUAL: + if (!rtree_key_cmp( + PAGE_CUR_WITHIN, range_mbr_ptr, + rec)) { + area += range_area / rec_area; + } + + break; + default: + ut_error; + } + } + } + + mtr.commit(); + + if (!std::isfinite(area)) { + return(HA_POS_ERROR); + } + + area /= n_recs; + return ha_rows(static_cast(dict_table_get_n_rows(index->table)) + * area); +} diff --git a/storage/innobase/gis/gis0sea.cc b/storage/innobase/gis/gis0sea.cc new file mode 100644 index 00000000..8ca8681b --- /dev/null +++ b/storage/innobase/gis/gis0sea.cc @@ -0,0 +1,2403 @@ +/***************************************************************************** + +Copyright (c) 2016, 2018, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file gis/gis0sea.cc +InnoDB R-tree search interfaces + +Created 2014/01/16 Jimmy Yang +***********************************************************************/ + +#include "fsp0fsp.h" +#include "page0page.h" +#include "page0cur.h" +#include "page0zip.h" +#include "gis0rtree.h" +#include "btr0cur.h" +#include "btr0sea.h" +#include "btr0pcur.h" +#include "rem0cmp.h" +#include "lock0lock.h" +#include "ibuf0ibuf.h" +#include "trx0trx.h" +#include "srv0mon.h" +#include "que0que.h" +#include "gis0geo.h" + +/** Restore the stored position of a persistent cursor bufferfixing the page */ +static +bool +rtr_cur_restore_position( + btr_cur_t* cursor, /*!< in: detached persistent cursor */ + ulint level, /*!< in: index level */ + mtr_t* mtr); /*!< in: mtr */ + +/*************************************************************//** +Pop out used parent path entry, until we find the parent with matching +page number */ +static +void +rtr_adjust_parent_path( +/*===================*/ + rtr_info_t* rtr_info, /* R-Tree info struct */ + ulint page_no) /* page number to look for */ +{ + while (!rtr_info->parent_path->empty()) { + if (rtr_info->parent_path->back().child_no == page_no) { + break; + } else { + if (rtr_info->parent_path->back().cursor) { + btr_pcur_close( + rtr_info->parent_path->back().cursor); + ut_free(rtr_info->parent_path->back().cursor); + } + + rtr_info->parent_path->pop_back(); + } + } +} + +/** Latches the leaf page or pages requested. +@param[in] block_savepoint leaf page where the search converged +@param[in] latch_mode BTR_SEARCH_LEAF, ... 
+@param[in] cursor cursor +@param[in] mtr mini-transaction */ +static void +rtr_latch_leaves( + ulint block_savepoint, + btr_latch_mode latch_mode, + btr_cur_t* cursor, + mtr_t* mtr) +{ + compile_time_assert(int(MTR_MEMO_PAGE_S_FIX) == int(RW_S_LATCH)); + compile_time_assert(int(MTR_MEMO_PAGE_X_FIX) == int(RW_X_LATCH)); + compile_time_assert(int(MTR_MEMO_PAGE_SX_FIX) == int(RW_SX_LATCH)); + + buf_block_t* block = mtr->at_savepoint(block_savepoint); + + ut_ad(block->page.id().space() == cursor->index()->table->space->id); + ut_ad(block->page.in_file()); + ut_ad(mtr->memo_contains_flagged(&cursor->index()->lock, + MTR_MEMO_S_LOCK + | MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + + switch (latch_mode) { + uint32_t left_page_no; + uint32_t right_page_no; + default: + ut_ad(latch_mode == BTR_CONT_MODIFY_TREE); + break; + case BTR_MODIFY_TREE: + /* It is exclusive for other operations which calls + btr_page_set_prev() */ + ut_ad(mtr->memo_contains_flagged(&cursor->index()->lock, + MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + /* x-latch also siblings from left to right */ + left_page_no = btr_page_get_prev(block->page.frame); + + if (left_page_no != FIL_NULL) { + btr_block_get(*cursor->index(), left_page_no, RW_X_LATCH, + true, mtr); + } + + mtr->upgrade_buffer_fix(block_savepoint, RW_X_LATCH); + + right_page_no = btr_page_get_next(block->page.frame); + + if (right_page_no != FIL_NULL) { + btr_block_get(*cursor->index(), right_page_no, + RW_X_LATCH, true, mtr); + } + break; + case BTR_SEARCH_LEAF: + case BTR_MODIFY_LEAF: + rw_lock_type_t mode = + rw_lock_type_t(latch_mode & (RW_X_LATCH | RW_S_LATCH)); + static_assert(int{RW_S_LATCH} == int{BTR_SEARCH_LEAF}, ""); + static_assert(int{RW_X_LATCH} == int{BTR_MODIFY_LEAF}, ""); + mtr->upgrade_buffer_fix(block_savepoint, mode); + } +} + +/*************************************************************//** +Find the next matching record. This function is used by search +or record locating during index delete/update. +@return true if there is suitable record found, otherwise false */ +TRANSACTIONAL_TARGET +static +bool +rtr_pcur_getnext_from_path( +/*=======================*/ + const dtuple_t* tuple, /*!< in: data tuple */ + page_cur_mode_t mode, /*!< in: cursor search mode */ + btr_cur_t* btr_cur,/*!< in: persistent cursor; NOTE that the + function may release the page latch */ + ulint target_level, + /*!< in: target level */ + ulint latch_mode, + /*!< in: latch_mode */ + bool index_locked, + /*!< in: index tree locked */ + mtr_t* mtr) /*!< in: mtr */ +{ + dict_index_t* index = btr_cur->index(); + bool found = false; + page_cur_t* page_cursor; + ulint level = 0; + node_visit_t next_rec; + rtr_info_t* rtr_info = btr_cur->rtr_info; + node_seq_t page_ssn; + ulint skip_parent = false; + bool new_split = false; + bool for_delete = false; + bool for_undo_ins = false; + + /* exhausted all the pages to be searched */ + if (rtr_info->path->empty()) { + return(false); + } + + ut_ad(dtuple_get_n_fields_cmp(tuple)); + + const auto my_latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode); + + for_delete = latch_mode & BTR_RTREE_DELETE_MARK; + for_undo_ins = latch_mode & BTR_RTREE_UNDO_INS; + + /* There should be no insert coming to this function. Only + mode with BTR_MODIFY_* should be delete */ + ut_ad(mode != PAGE_CUR_RTREE_INSERT); + ut_ad(my_latch_mode == BTR_SEARCH_LEAF + || my_latch_mode == BTR_MODIFY_LEAF + || my_latch_mode == BTR_MODIFY_TREE + || my_latch_mode == BTR_CONT_MODIFY_TREE); + + /* Whether need to track parent information. 
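The "path" and "parent_path" consumed by this loop are stacks of visited nodes. As a simplified mental model only (the real node_visit_t also carries a saved persistent cursor and an MBR-increase value, and access is serialized by rtr_path_mutex):

    #include <cstdint>
    #include <vector>

    struct node_visit_model {     // simplified stand-in, not the InnoDB type
        uint32_t page_no;         // page to (re)visit
        uint32_t seq_no;          // page SSN observed when the entry was pushed
        int      level;           // tree level of that page
    };
    using path_model = std::vector<node_visit_model>;

Each iteration pops one entry, latches that page, searches it, and may push split siblings or qualifying children back onto the stack.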
Only need so + when we do tree altering operations (such as index page merge) */ + static_assert(BTR_CONT_MODIFY_TREE == (4 | BTR_MODIFY_TREE), ""); + + const bool need_parent = mode == PAGE_CUR_RTREE_LOCATE + && (my_latch_mode | 4) == BTR_CONT_MODIFY_TREE; + + if (!index_locked) { + ut_ad(mtr->is_empty()); + mtr_s_lock_index(index, mtr); + } else { + ut_ad(mtr->memo_contains_flagged(&index->lock, + MTR_MEMO_SX_LOCK + | MTR_MEMO_S_LOCK + | MTR_MEMO_X_LOCK)); + } + + const ulint zip_size = index->table->space->zip_size(); + + /* Pop each node/page to be searched from "path" structure + and do a search on it. Please note, any pages that are in + the "path" structure are protected by "page" lock, so tey + cannot be shrunk away */ + do { + buf_block_t* block; + node_seq_t path_ssn; + const page_t* page; + rw_lock_type_t rw_latch; + + mysql_mutex_lock(&rtr_info->rtr_path_mutex); + next_rec = rtr_info->path->back(); + rtr_info->path->pop_back(); + level = next_rec.level; + path_ssn = next_rec.seq_no; + + /* Maintain the parent path info as well, if needed */ + if (need_parent && !skip_parent && !new_split) { + ulint old_level; + ulint new_level; + + ut_ad(!rtr_info->parent_path->empty()); + + /* Cleanup unused parent info */ + if (rtr_info->parent_path->back().cursor) { + btr_pcur_close( + rtr_info->parent_path->back().cursor); + ut_free(rtr_info->parent_path->back().cursor); + } + + old_level = rtr_info->parent_path->back().level; + + rtr_info->parent_path->pop_back(); + + ut_ad(!rtr_info->parent_path->empty()); + + /* check whether there is a level change. If so, + the current parent path needs to pop enough + nodes to adjust to the new search page */ + new_level = rtr_info->parent_path->back().level; + + if (old_level < new_level) { + rtr_adjust_parent_path( + rtr_info, next_rec.page_no); + } + + ut_ad(!rtr_info->parent_path->empty()); + + ut_ad(next_rec.page_no + == rtr_info->parent_path->back().child_no); + } + + mysql_mutex_unlock(&rtr_info->rtr_path_mutex); + + skip_parent = false; + new_split = false; + + /* Once we have pages in "path", these pages are + predicate page locked, so they can't be shrunk away. + They also have SSN (split sequence number) to detect + splits, so we can directly latch single page while + getting them. They can be unlatched if not qualified. + One reason for pre-latch is that we might need to position + some parent position (requires latch) during search */ + if (level == 0) { + static_assert(ulint{BTR_SEARCH_LEAF} == + ulint{RW_S_LATCH}, ""); + static_assert(ulint{BTR_MODIFY_LEAF} == + ulint{RW_X_LATCH}, ""); + rw_latch = (my_latch_mode | 4) == BTR_CONT_MODIFY_TREE + ? RW_NO_LATCH + : rw_lock_type_t(my_latch_mode); + } else { + rw_latch = RW_X_LATCH; + } + + if (my_latch_mode == BTR_MODIFY_LEAF) { + mtr->rollback_to_savepoint(1); + } + + ut_ad((my_latch_mode | 4) == BTR_CONT_MODIFY_TREE + || !page_is_leaf(btr_cur_get_page(btr_cur)) + || !btr_cur->page_cur.block->page.lock.have_any()); + + const auto block_savepoint = mtr->get_savepoint(); + block = buf_page_get_gen( + page_id_t(index->table->space_id, + next_rec.page_no), zip_size, + rw_latch, NULL, BUF_GET, mtr); + + if (!block) { + found = false; + break; + } + + page = buf_block_get_frame(block); + page_ssn = page_get_ssn_id(page); + + /* If there are splits, push the splitted page. 
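The split check that this comment introduces rests on one invariant: a page split moves records to a new right sibling and advances the page's split sequence number (SSN). A one-line restatement of the rule, with illustrative names:

    #include <cstdint>

    // If the page's current SSN exceeds the SSN recorded when the page
    // number was pushed onto the path, the page has split in between,
    // so the right sibling must be queued as well.
    inline bool must_queue_right_sibling(uint32_t page_ssn, uint32_t path_ssn)
    {
        return page_ssn > path_ssn;
    }
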
+ Note that we have SX lock on index->lock, there + should not be any split/shrink happening here */ + if (page_ssn > path_ssn) { + uint32_t next_page_no = btr_page_get_next(page); + rtr_non_leaf_stack_push( + rtr_info->path, next_page_no, path_ssn, + level, 0, NULL, 0); + + if (!srv_read_only_mode + && mode != PAGE_CUR_RTREE_INSERT + && mode != PAGE_CUR_RTREE_LOCATE) { + ut_ad(rtr_info->thr); + lock_place_prdt_page_lock( + page_id_t(block->page.id().space(), + next_page_no), + index, + rtr_info->thr); + } + new_split = true; +#if defined(UNIV_GIS_DEBUG) + fprintf(stderr, + "GIS_DIAG: Splitted page found: %d, %ld\n", + static_cast(need_parent), next_page_no); +#endif + } + + page_cursor = btr_cur_get_page_cur(btr_cur); + page_cursor->rec = NULL; + page_cursor->block = block; + + if (mode == PAGE_CUR_RTREE_LOCATE) { + if (target_level == 0 && level == 0) { + ulint low_match = 0, up_match = 0; + + found = false; + + if (!page_cur_search_with_match( + tuple, PAGE_CUR_LE, + &up_match, &low_match, + btr_cur_get_page_cur(btr_cur), nullptr) + && low_match + == dtuple_get_n_fields_cmp(tuple)) { + rec_t* rec = btr_cur_get_rec(btr_cur); + + if (!rec_get_deleted_flag(rec, + dict_table_is_comp(index->table)) + || (!for_delete && !for_undo_ins)) { + found = true; + btr_cur->low_match = low_match; + } else { + /* mark we found deleted row */ + btr_cur->rtr_info->fd_del + = true; + } + } + } else { + page_cur_mode_t page_mode = mode; + + if (level == target_level + && target_level != 0) { + page_mode = PAGE_CUR_RTREE_GET_FATHER; + } + found = rtr_cur_search_with_match( + block, index, tuple, page_mode, + page_cursor, btr_cur->rtr_info); + + /* Save the position of parent if needed */ + if (found && need_parent) { + btr_pcur_t* r_cursor = + rtr_get_parent_cursor( + btr_cur, level, false); + + rec_t* rec = page_cur_get_rec( + page_cursor); + page_cur_position( + rec, block, + btr_pcur_get_page_cur(r_cursor)); + r_cursor->pos_state = + BTR_PCUR_IS_POSITIONED; + r_cursor->latch_mode = my_latch_mode; + btr_pcur_store_position(r_cursor, mtr); + ut_d(ulint num_stored =) + rtr_store_parent_path( + block, btr_cur, + btr_latch_mode(rw_latch), + level, mtr); + ut_ad(num_stored > 0); + } + } + } else { + found = rtr_cur_search_with_match( + block, index, tuple, mode, page_cursor, + btr_cur->rtr_info); + } + + /* Attach predicate lock if needed, no matter whether + there are matched records */ + if (mode != PAGE_CUR_RTREE_INSERT + && mode != PAGE_CUR_RTREE_LOCATE + && mode >= PAGE_CUR_CONTAIN + && btr_cur->rtr_info->need_prdt_lock) { + lock_prdt_t prdt; + + trx_t* trx = thr_get_trx( + btr_cur->rtr_info->thr); + { + TMLockTrxGuard g{TMLockTrxArgs(*trx)}; + lock_init_prdt_from_mbr( + &prdt, &btr_cur->rtr_info->mbr, + mode, trx->lock.lock_heap); + } + + if (rw_latch == RW_NO_LATCH) { + block->page.lock.s_lock(); + } + + lock_prdt_lock(block, &prdt, index, LOCK_S, + LOCK_PREDICATE, btr_cur->rtr_info->thr); + + if (rw_latch == RW_NO_LATCH) { + block->page.lock.s_unlock(); + } + } + + if (found) { + if (level == target_level) { + ut_ad(block + == mtr->at_savepoint(block_savepoint)); + + if (my_latch_mode == BTR_MODIFY_TREE + && level == 0) { + ut_ad(rw_latch == RW_NO_LATCH); + + rtr_latch_leaves( + block_savepoint, + BTR_MODIFY_TREE, + btr_cur, mtr); + } + + page_cur_position( + page_cur_get_rec(page_cursor), + page_cur_get_block(page_cursor), + btr_cur_get_page_cur(btr_cur)); + + btr_cur->low_match = level != 0 ? 
+ DICT_INDEX_SPATIAL_NODEPTR_SIZE + 1 + : btr_cur->low_match; + break; + } + + /* Keep the parent path node, which points to + last node just located */ + skip_parent = true; + } else { + mtr->release_last_page(); + } + + } while (!rtr_info->path->empty()); + + const rec_t* rec = btr_cur_get_rec(btr_cur); + + if (!page_rec_is_user_rec(rec)) { + mtr->commit(); + mtr->start(); + } else if (!index_locked) { + mtr->release(index->lock); + } + + return(found); +} + +/*************************************************************//** +Find the next matching record. This function will first exhaust +the copied record listed in the rtr_info->matches vector before +moving to the next page +@return true if there is suitable record found, otherwise false */ +bool +rtr_pcur_move_to_next( +/*==================*/ + const dtuple_t* tuple, /*!< in: data tuple; NOTE: n_fields_cmp in + tuple must be set so that it cannot get + compared to the node ptr page number field! */ + page_cur_mode_t mode, /*!< in: cursor search mode */ + btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the + function may release the page latch */ + ulint level, /*!< in: target level */ + mtr_t* mtr) /*!< in: mtr */ +{ + rtr_info_t* rtr_info = cursor->btr_cur.rtr_info; + + ut_a(cursor->pos_state == BTR_PCUR_IS_POSITIONED); + + mysql_mutex_lock(&rtr_info->matches->rtr_match_mutex); + /* First retrieve the next record on the current page */ + if (!rtr_info->matches->matched_recs->empty()) { + rtr_rec_t rec; + rec = rtr_info->matches->matched_recs->back(); + rtr_info->matches->matched_recs->pop_back(); + mysql_mutex_unlock(&rtr_info->matches->rtr_match_mutex); + + cursor->btr_cur.page_cur.rec = rec.r_rec; + cursor->btr_cur.page_cur.block = &rtr_info->matches->block; + + DEBUG_SYNC_C("rtr_pcur_move_to_next_return"); + return(true); + } + + mysql_mutex_unlock(&rtr_info->matches->rtr_match_mutex); + + /* Fetch the next page */ + return(rtr_pcur_getnext_from_path(tuple, mode, &cursor->btr_cur, + level, cursor->latch_mode, + false, mtr)); +} + +#ifdef UNIV_DEBUG +/*************************************************************//** +Check if the cursor holds record pointing to the specified child page +@return true if it is (pointing to the child page) false otherwise */ +static void rtr_compare_cursor_rec(const rec_t *rec, dict_index_t *index, + ulint page_no) +{ + if (!rec) + return; + mem_heap_t *heap= nullptr; + rec_offs *offsets= rec_get_offsets(rec, index, nullptr, 0, + ULINT_UNDEFINED, &heap); + ut_ad(btr_node_ptr_get_child_page_no(rec, offsets) == page_no); + mem_heap_free(heap); +} +#endif + +TRANSACTIONAL_TARGET +dberr_t rtr_search_to_nth_level(ulint level, const dtuple_t *tuple, + page_cur_mode_t mode, + btr_latch_mode latch_mode, + btr_cur_t *cur, mtr_t *mtr) +{ + page_cur_mode_t page_mode; + page_cur_mode_t search_mode= PAGE_CUR_UNSUPP; + + bool mbr_adj= false; + bool found= false; + dict_index_t *const index= cur->index(); + + mem_heap_t *heap= nullptr; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs *offsets= offsets_; + rec_offs_init(offsets_); + ut_ad(level == 0 || mode == PAGE_CUR_LE || RTREE_SEARCH_MODE(mode)); + ut_ad(dict_index_check_search_tuple(index, tuple)); + ut_ad(dtuple_check_typed(tuple)); + ut_ad(index->is_spatial()); + ut_ad(index->page != FIL_NULL); + + MEM_UNDEFINED(&cur->up_match, sizeof cur->up_match); + MEM_UNDEFINED(&cur->up_bytes, sizeof cur->up_bytes); + MEM_UNDEFINED(&cur->low_match, sizeof cur->low_match); + MEM_UNDEFINED(&cur->low_bytes, sizeof cur->low_bytes); + ut_d(cur->up_match= 
ULINT_UNDEFINED); + ut_d(cur->low_match= ULINT_UNDEFINED); + + const bool latch_by_caller= latch_mode & BTR_ALREADY_S_LATCHED; + + ut_ad(!latch_by_caller + || mtr->memo_contains_flagged(&index->lock, MTR_MEMO_S_LOCK + | MTR_MEMO_SX_LOCK)); + latch_mode= BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode); + + ut_ad(!latch_by_caller || latch_mode == BTR_SEARCH_LEAF || + latch_mode == BTR_MODIFY_LEAF); + + cur->flag= BTR_CUR_BINARY; + +#ifndef BTR_CUR_ADAPT + buf_block_t *guess= nullptr; +#else + btr_search_t *const info= btr_search_get_info(index); + buf_block_t *guess= info->root_guess; +#endif + + /* Store the position of the tree latch we push to mtr so that we + know how to release it when we have latched leaf node(s) */ + + const ulint savepoint= mtr->get_savepoint(); + + rw_lock_type_t upper_rw_latch, root_leaf_rw_latch= RW_NO_LATCH; + + switch (latch_mode) { + case BTR_MODIFY_TREE: + mtr_x_lock_index(index, mtr); + upper_rw_latch= root_leaf_rw_latch= RW_X_LATCH; + break; + case BTR_CONT_MODIFY_TREE: + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK | + MTR_MEMO_SX_LOCK)); + upper_rw_latch= RW_X_LATCH; + break; + default: + ut_ad(latch_mode != BTR_MODIFY_PREV); + ut_ad(latch_mode != BTR_SEARCH_PREV); + if (!latch_by_caller) + mtr_s_lock_index(index, mtr); + upper_rw_latch= root_leaf_rw_latch= RW_S_LATCH; + if (latch_mode == BTR_MODIFY_LEAF) + root_leaf_rw_latch= RW_X_LATCH; + } + + auto root_savepoint= mtr->get_savepoint(); + const ulint zip_size= index->table->space->zip_size(); + + /* Start with the root page. */ + page_id_t page_id(index->table->space_id, index->page); + + ulint up_match= 0, up_bytes= 0, low_match= 0, low_bytes= 0; + ulint height= ULINT_UNDEFINED; + + /* We use these modified search modes on non-leaf levels of the + B-tree. These let us end up in the right B-tree leaf. In that leaf + we use the original search mode. */ + + switch (mode) { + case PAGE_CUR_GE: + page_mode= PAGE_CUR_L; + break; + case PAGE_CUR_G: + page_mode= PAGE_CUR_LE; + break; + default: +#ifdef PAGE_CUR_LE_OR_EXTENDS + ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE + || RTREE_SEARCH_MODE(mode) + || mode == PAGE_CUR_LE_OR_EXTENDS); +#else /* PAGE_CUR_LE_OR_EXTENDS */ + ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE + || RTREE_SEARCH_MODE(mode)); +#endif /* PAGE_CUR_LE_OR_EXTENDS */ + page_mode= mode; + break; + } + + search_loop: + auto buf_mode= BUF_GET; + ulint rw_latch= RW_NO_LATCH; + + if (height) + { + /* We are about to fetch the root or a non-leaf page. */ + if (latch_mode != BTR_MODIFY_TREE || height == level) + /* If doesn't have SX or X latch of index, + each page should be latched before reading. 
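The latch choice being made here can be summarized as a hedged helper (a restatement of the branches below, not a function in the source; it leans on the static_asserts earlier in the file that make BTR_SEARCH_LEAF and BTR_MODIFY_LEAF numerically equal to RW_S_LATCH and RW_X_LATCH):

    static rw_lock_type_t pick_latch(ulint height, ulint level,
                                     btr_latch_mode latch_mode,
                                     rw_lock_type_t upper_rw_latch)
    {
      if (height)                        /* root or other non-leaf page */
        return latch_mode != BTR_MODIFY_TREE || height == level
               ? upper_rw_latch
               : RW_NO_LATCH;            /* tree is X-locked: buffer-fix only */
      if (latch_mode <= BTR_MODIFY_LEAF)
        return rw_lock_type_t(latch_mode);
      return RW_NO_LATCH;
    }
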
*/ + rw_latch= upper_rw_latch; + } + else if (latch_mode <= BTR_MODIFY_LEAF) + rw_latch= latch_mode; + + dberr_t err; + auto block_savepoint= mtr->get_savepoint(); + buf_block_t *block= buf_page_get_gen(page_id, zip_size, rw_latch, guess, + buf_mode, mtr, &err, false); + if (!block) + { + if (err == DB_DECRYPTION_FAILED) + btr_decryption_failed(*index); + func_exit: + if (UNIV_LIKELY_NULL(heap)) + mem_heap_free(heap); + + if (mbr_adj) + /* remember that we will need to adjust parent MBR */ + cur->rtr_info->mbr_adj= true; + + return err; + } + + const page_t *page= buf_block_get_frame(block); +#ifdef UNIV_ZIP_DEBUG + if (rw_latch != RW_NO_LATCH) { + const page_zip_des_t *page_zip= buf_block_get_page_zip(block); + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); + } +#endif /* UNIV_ZIP_DEBUG */ + + ut_ad(fil_page_index_page_check(page)); + ut_ad(index->id == btr_page_get_index_id(page)); + + if (height != ULINT_UNDEFINED); + else if (page_is_leaf(page) && + rw_latch != RW_NO_LATCH && rw_latch != root_leaf_rw_latch) + { + /* The root page is also a leaf page (root_leaf). + We should reacquire the page, because the root page + is latched differently from leaf pages. */ + ut_ad(root_leaf_rw_latch != RW_NO_LATCH); + ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_SX_LATCH); + + ut_ad(block == mtr->at_savepoint(block_savepoint)); + mtr->rollback_to_savepoint(block_savepoint); + + upper_rw_latch= root_leaf_rw_latch; + goto search_loop; + } + else + { + /* We are in the root node */ + + height= btr_page_get_level(page); + cur->tree_height= height + 1; + + ut_ad(cur->rtr_info); + + /* If SSN in memory is not initialized, fetch it from root page */ + if (!rtr_get_current_ssn_id(index)) + /* FIXME: do this in dict_load_table_one() */ + index->set_ssn(page_get_ssn_id(page) + 1); + + /* Save the MBR */ + cur->rtr_info->thr= cur->thr; + rtr_get_mbr_from_tuple(tuple, &cur->rtr_info->mbr); + +#ifdef BTR_CUR_ADAPT + info->root_guess= block; +#endif + } + + if (height == 0) { + if (rw_latch == RW_NO_LATCH) + { + ut_ad(block == mtr->at_savepoint(block_savepoint)); + rtr_latch_leaves(block_savepoint, latch_mode, cur, mtr); + } + + switch (latch_mode) { + case BTR_MODIFY_TREE: + case BTR_CONT_MODIFY_TREE: + break; + default: + if (!latch_by_caller) + { + /* Release the tree s-latch */ + mtr->rollback_to_savepoint(savepoint, + savepoint + 1); + block_savepoint--; + root_savepoint--; + } + /* release upper blocks */ + if (savepoint < block_savepoint) + mtr->rollback_to_savepoint(savepoint, block_savepoint); + } + + page_mode= mode; + } + + /* Remember the page search mode */ + search_mode= page_mode; + + /* Some adjustment on search mode, when the page search mode is + PAGE_CUR_RTREE_LOCATE or PAGE_CUR_RTREE_INSERT, as we are searching + with MBRs. When it is not the target level, we should search all + sub-trees that "CONTAIN" the search range/MBR. When it is at the + target level, the search becomes PAGE_CUR_LE */ + + if (page_mode == PAGE_CUR_RTREE_INSERT) + { + page_mode= (level == height) + ? PAGE_CUR_LE + : PAGE_CUR_RTREE_INSERT; + + ut_ad(!page_is_leaf(page) || page_mode == PAGE_CUR_LE); + } + else if (page_mode == PAGE_CUR_RTREE_LOCATE && level == height) + page_mode= level == 0 ? 
PAGE_CUR_LE : PAGE_CUR_RTREE_GET_FATHER; + + up_match= 0; + low_match= 0; + + if (latch_mode == BTR_MODIFY_TREE || latch_mode == BTR_CONT_MODIFY_TREE) + /* Tree are locked, no need for Page Lock to protect the "path" */ + cur->rtr_info->need_page_lock= false; + + cur->page_cur.block= block; + + if (page_mode >= PAGE_CUR_CONTAIN) + { + found= rtr_cur_search_with_match(block, index, tuple, page_mode, + &cur->page_cur, cur->rtr_info); + + /* Need to use BTR_MODIFY_TREE to do the MBR adjustment */ + if (search_mode == PAGE_CUR_RTREE_INSERT && cur->rtr_info->mbr_adj) { + static_assert(BTR_MODIFY_TREE == (8 | BTR_MODIFY_LEAF), ""); + + if (!(latch_mode & 8)) + /* Parent MBR needs updated, should retry with BTR_MODIFY_TREE */ + goto func_exit; + + cur->rtr_info->mbr_adj= false; + mbr_adj= true; + } + + if (found && page_mode == PAGE_CUR_RTREE_GET_FATHER) + cur->low_match= DICT_INDEX_SPATIAL_NODEPTR_SIZE + 1; + } + else + { + /* Search for complete index fields. */ + up_bytes= low_bytes= 0; + if (page_cur_search_with_match(tuple, page_mode, &up_match, + &low_match, &cur->page_cur, nullptr)) { + err= DB_CORRUPTION; + goto func_exit; + } + } + + /* If this is the desired level, leave the loop */ + + ut_ad(height == btr_page_get_level(btr_cur_get_page(cur))); + + /* Add Predicate lock if it is serializable isolation + and only if it is in the search case */ + if (mode >= PAGE_CUR_CONTAIN && mode != PAGE_CUR_RTREE_INSERT && + mode != PAGE_CUR_RTREE_LOCATE && cur->rtr_info->need_prdt_lock) + { + lock_prdt_t prdt; + + { + trx_t* trx= thr_get_trx(cur->thr); + TMLockTrxGuard g{TMLockTrxArgs(*trx)}; + lock_init_prdt_from_mbr(&prdt, &cur->rtr_info->mbr, mode, + trx->lock.lock_heap); + } + + if (rw_latch == RW_NO_LATCH && height != 0) + block->page.lock.s_lock(); + + lock_prdt_lock(block, &prdt, index, LOCK_S, LOCK_PREDICATE, cur->thr); + + if (rw_latch == RW_NO_LATCH && height != 0) + block->page.lock.s_unlock(); + } + + if (level != height) + { + ut_ad(height > 0); + + height--; + guess= nullptr; + + const rec_t *node_ptr= btr_cur_get_rec(cur); + + offsets= rec_get_offsets(node_ptr, index, offsets, 0, + ULINT_UNDEFINED, &heap); + + if (page_rec_is_supremum(node_ptr)) + { + cur->low_match= 0; + cur->up_match= 0; + goto func_exit; + } + + /* If we are doing insertion or record locating, + remember the tree nodes we visited */ + if (page_mode == PAGE_CUR_RTREE_INSERT || + (search_mode == PAGE_CUR_RTREE_LOCATE && + latch_mode != BTR_MODIFY_LEAF)) + { + const bool add_latch= latch_mode == BTR_MODIFY_TREE && + rw_latch == RW_NO_LATCH; + + if (add_latch) + { + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK | + MTR_MEMO_SX_LOCK)); + block->page.lock.s_lock(); + } + + /* Store the parent cursor location */ + ut_d(auto num_stored=) + rtr_store_parent_path(block, cur, latch_mode, height + 1, mtr); + + if (page_mode == PAGE_CUR_RTREE_INSERT) + { + btr_pcur_t *r_cursor= rtr_get_parent_cursor(cur, height + 1, true); + /* If it is insertion, there should be only one parent for + each level traverse */ + ut_ad(num_stored == 1); + node_ptr= btr_pcur_get_rec(r_cursor); + } + + if (add_latch) + block->page.lock.s_unlock(); + + ut_ad(!page_rec_is_supremum(node_ptr)); + } + + ut_ad(page_mode == search_mode || + (page_mode == PAGE_CUR_WITHIN && + search_mode == PAGE_CUR_RTREE_LOCATE)); + page_mode= search_mode; + + if (height == level && latch_mode == BTR_MODIFY_TREE) + { + ut_ad(upper_rw_latch == RW_X_LATCH); + for (auto i= root_savepoint, n= mtr->get_savepoint(); i < n; i++) + mtr->upgrade_buffer_fix(i, 
RW_X_LATCH); + } + + /* Go to the child node */ + page_id.set_page_no(btr_node_ptr_get_child_page_no(node_ptr, offsets)); + + if (page_mode >= PAGE_CUR_CONTAIN && page_mode != PAGE_CUR_RTREE_INSERT) + { + rtr_node_path_t *path= cur->rtr_info->path; + + if (found && !path->empty()) + { + ut_ad(path->back().page_no == page_id.page_no()); + path->pop_back(); +#ifdef UNIV_DEBUG + if (page_mode == PAGE_CUR_RTREE_LOCATE && + latch_mode != BTR_MODIFY_LEAF) + { + btr_pcur_t* pcur= cur->rtr_info->parent_path->back().cursor; + rec_t *my_node_ptr= btr_pcur_get_rec(pcur); + + offsets= rec_get_offsets(my_node_ptr, index, offsets, + 0, ULINT_UNDEFINED, &heap); + + ut_ad(page_id.page_no() == + btr_node_ptr_get_child_page_no(my_node_ptr, offsets)); + } +#endif + } + } + + goto search_loop; + } + + if (level) + { + if (upper_rw_latch == RW_NO_LATCH) + { + ut_ad(latch_mode == BTR_CONT_MODIFY_TREE); + btr_block_get(*index, page_id.page_no(), RW_X_LATCH, false, mtr, &err); + } + else + { + ut_ad(mtr->memo_contains_flagged(block, upper_rw_latch)); + ut_ad(!latch_by_caller); + } + + if (page_mode <= PAGE_CUR_LE) + { + cur->low_match= low_match; + cur->up_match= up_match; + } + } + else + { + cur->low_match= low_match; + cur->low_bytes= low_bytes; + cur->up_match= up_match; + cur->up_bytes= up_bytes; + + ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_GE); + ut_ad(up_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE); + ut_ad(low_match != ULINT_UNDEFINED || mode != PAGE_CUR_LE); + } + + goto func_exit; +} + +dberr_t rtr_search_leaf(btr_cur_t *cur, const dtuple_t *tuple, + btr_latch_mode latch_mode, + mtr_t *mtr, page_cur_mode_t mode) +{ + return rtr_search_to_nth_level(0, tuple, mode, latch_mode, cur, mtr); +} + +/** Search for a spatial index leaf page record. +@param pcur cursor +@param tuple search tuple +@param mode search mode +@param mtr mini-transaction */ +dberr_t rtr_search_leaf(btr_pcur_t *pcur, const dtuple_t *tuple, + page_cur_mode_t mode, mtr_t *mtr) +{ +#ifdef UNIV_DEBUG + switch (mode) { + case PAGE_CUR_CONTAIN: + case PAGE_CUR_INTERSECT: + case PAGE_CUR_WITHIN: + case PAGE_CUR_DISJOINT: + case PAGE_CUR_MBR_EQUAL: + break; + default: + ut_ad("invalid mode" == 0); + } +#endif + pcur->latch_mode= BTR_SEARCH_LEAF; + pcur->search_mode= mode; + pcur->pos_state= BTR_PCUR_IS_POSITIONED; + pcur->trx_if_known= nullptr; + return rtr_search_leaf(&pcur->btr_cur, tuple, BTR_SEARCH_LEAF, mtr, mode); +} + +/**************************************************************//** +Initializes and opens a persistent cursor to an index tree. It should be +closed with btr_pcur_close. */ +bool rtr_search( + const dtuple_t* tuple, /*!< in: tuple on which search done */ + btr_latch_mode latch_mode,/*!< in: BTR_MODIFY_LEAF, ... 
*/ + btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */ + mtr_t* mtr) /*!< in: mtr */ +{ + static_assert(BTR_MODIFY_TREE == (8 | BTR_MODIFY_LEAF), ""); + ut_ad(latch_mode & BTR_MODIFY_LEAF); + ut_ad(!(latch_mode & BTR_ALREADY_S_LATCHED)); + ut_ad(mtr->is_empty()); + + /* Initialize the cursor */ + + btr_pcur_init(cursor); + + cursor->latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode); + cursor->search_mode = PAGE_CUR_RTREE_LOCATE; + cursor->trx_if_known = nullptr; + + if (latch_mode & 8) { + mtr_x_lock_index(cursor->index(), mtr); + } else { + latch_mode + = btr_latch_mode(latch_mode | BTR_ALREADY_S_LATCHED); + mtr_sx_lock_index(cursor->index(), mtr); + } + + /* Search with the tree cursor */ + + btr_cur_t* btr_cursor = btr_pcur_get_btr_cur(cursor); + + btr_cursor->rtr_info + = rtr_create_rtr_info(false, false, + btr_cursor, cursor->index()); + + if (btr_cursor->thr) { + btr_cursor->rtr_info->need_page_lock = true; + btr_cursor->rtr_info->thr = btr_cursor->thr; + } + + if (rtr_search_leaf(btr_cursor, tuple, latch_mode, mtr) + != DB_SUCCESS) { + return true; + } + + cursor->pos_state = BTR_PCUR_IS_POSITIONED; + + const rec_t* rec = btr_pcur_get_rec(cursor); + + const bool d= rec_get_deleted_flag( + rec, cursor->index()->table->not_redundant()); + + if (page_rec_is_infimum(rec) + || btr_pcur_get_low_match(cursor) != dtuple_get_n_fields(tuple) + || (d && latch_mode + & (BTR_RTREE_DELETE_MARK | BTR_RTREE_UNDO_INS))) { + + if (d && latch_mode & BTR_RTREE_DELETE_MARK) { + btr_cursor->rtr_info->fd_del = true; + btr_cursor->low_match = 0; + } + + mtr->rollback_to_savepoint(1); + + if (!rtr_pcur_getnext_from_path(tuple, PAGE_CUR_RTREE_LOCATE, + btr_cursor, 0, latch_mode, + true, mtr)) { + return true; + } + + ut_ad(btr_pcur_get_low_match(cursor) + == dtuple_get_n_fields(tuple)); + } + + if (!(latch_mode & 8)) { + mtr->rollback_to_savepoint(0, 1); + } + + return false; +} + +/* Get the rtree page father. +@param[in,out] mtr mtr +@param[in] sea_cur search cursor, contains information + about parent nodes in search +@param[out] cursor cursor on node pointer record, + its page x-latched +@return whether the cursor was successfully positioned */ +bool rtr_page_get_father(mtr_t *mtr, btr_cur_t *sea_cur, btr_cur_t *cursor) +{ + mem_heap_t *heap = mem_heap_create(100); + rec_offs *offsets= rtr_page_get_father_block(nullptr, heap, + mtr, sea_cur, cursor); + mem_heap_free(heap); + return offsets != nullptr; +} + +MY_ATTRIBUTE((warn_unused_result)) +/********************************************************************//** +Returns the upper level node pointer to a R-Tree page. It is assumed +that mtr holds an x-latch on the tree. */ +static const rec_t* rtr_get_father_node( + ulint level, /*!< in: the tree level of search */ + const dtuple_t* tuple, /*!< in: data tuple; NOTE: n_fields_cmp in + tuple must be set so that it cannot get + compared to the node ptr page number field! */ + btr_cur_t* sea_cur,/*!< in: search cursor */ + btr_cur_t* btr_cur,/*!< in/out: tree cursor; the cursor page is + s- or x-latched, but see also above! */ + ulint page_no,/*!< Current page no */ + mtr_t* mtr) /*!< in: mtr */ +{ + const rec_t* rec = nullptr; + auto had_rtr = btr_cur->rtr_info; + dict_index_t* const index = btr_cur->index(); + + /* Try to optimally locate the parent node. 
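The parent lookup below is possible because a spatial node-pointer record stores the DATA_MBR_LEN bytes of the bounding rectangle followed immediately by the 4-byte child page number; cmp_dtuple_rec_with_gis_internal() later in this file compares exactly that trailing field. An illustrative accessor (hypothetical helper, not InnoDB API):

    // Read the child page number stored right after the serialized MBR,
    // mirroring the mach_read_from_4(rec + DATA_MBR_LEN) pattern used
    // elsewhere in this file.
    inline uint32_t node_ptr_child_page(const byte* rec)
    {
        return mach_read_from_4(rec + DATA_MBR_LEN);
    }
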
Level should always + less than sea_cur->tree_height unless the root is splitting */ + if (sea_cur && sea_cur->tree_height > level) { + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + if (rtr_cur_restore_position(sea_cur, level, mtr)) { + btr_pcur_t* r_cursor = rtr_get_parent_cursor( + sea_cur, level, false); + + rec = btr_pcur_get_rec(r_cursor); + + ut_ad(r_cursor->rel_pos == BTR_PCUR_ON); + page_cur_position(rec, + btr_pcur_get_block(r_cursor), + btr_cur_get_page_cur(btr_cur)); + had_rtr = btr_cur->rtr_info = sea_cur->rtr_info; + btr_cur->tree_height = sea_cur->tree_height; + } + goto func_exit; + } + + /* We arrive here in one of two scenario + 1) check table and btr_valide + 2) index root page being raised */ + + if (btr_cur->rtr_info) { + rtr_clean_rtr_info(btr_cur->rtr_info, true); + } + + btr_cur->rtr_info = rtr_create_rtr_info(false, false, btr_cur, index); + + if (rtr_search_to_nth_level(level, tuple, PAGE_CUR_RTREE_LOCATE, + BTR_CONT_MODIFY_TREE, btr_cur, mtr) + != DB_SUCCESS) { + } else if (sea_cur && sea_cur->tree_height == level) { + rec = btr_cur_get_rec(btr_cur); + } else { + /* btr_validate */ + ut_ad(level >= 1); + ut_ad(!sea_cur); + + rec = btr_cur_get_rec(btr_cur); + const ulint n_fields = dtuple_get_n_fields_cmp(tuple); + + if (page_rec_is_infimum(rec) + || (btr_cur->low_match != n_fields)) { + if (!rtr_pcur_getnext_from_path( + tuple, PAGE_CUR_RTREE_LOCATE, btr_cur, + level, BTR_CONT_MODIFY_TREE, true, mtr)) { + rec = nullptr; + } else { + ut_ad(btr_cur->low_match == n_fields); + rec = btr_cur_get_rec(btr_cur); + } + } + } + +func_exit: + ut_d(rtr_compare_cursor_rec(rec, index, page_no)); + + if (!had_rtr && btr_cur->rtr_info) { + rtr_clean_rtr_info(btr_cur->rtr_info, true); + btr_cur->rtr_info = NULL; + } + + return rec; +} + +/** Returns the upper level node pointer to a R-Tree page. It is assumed +that mtr holds an SX-latch or X-latch on the tree. +@return rec_get_offsets() of the node pointer record */ +static +rec_offs* +rtr_page_get_father_node_ptr( + rec_offs* offsets,/*!< in: work area for the return value */ + mem_heap_t* heap, /*!< in: memory heap to use */ + btr_cur_t* sea_cur,/*!< in: search cursor */ + btr_cur_t* cursor, /*!< in: cursor pointing to user record, + out: cursor on node pointer record, + its page x-latched */ + mtr_t* mtr) /*!< in: mtr */ +{ + dtuple_t* tuple; + ulint level; + ulint page_no; + dict_index_t* index; + rtr_mbr_t mbr; + + page_no = btr_cur_get_block(cursor)->page.id().page_no(); + index = btr_cur_get_index(cursor); + + ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK + | MTR_MEMO_SX_LOCK)); + + ut_ad(dict_index_get_page(index) != page_no); + + level = btr_page_get_level(btr_cur_get_page(cursor)); + + const rec_t* user_rec = btr_cur_get_rec(cursor); + ut_a(page_rec_is_user_rec(user_rec)); + + offsets = rec_get_offsets(user_rec, index, offsets, + level ? 
0 : index->n_fields, + ULINT_UNDEFINED, &heap); + rtr_get_mbr_from_rec(user_rec, offsets, &mbr); + + tuple = rtr_index_build_node_ptr( + index, &mbr, user_rec, page_no, heap); + + if (sea_cur && !sea_cur->rtr_info) { + sea_cur = NULL; + } + + const rec_t* node_ptr = rtr_get_father_node(level + 1, tuple, + sea_cur, cursor, + page_no, mtr); + if (!node_ptr) { + return nullptr; + } + + ut_ad(!page_rec_is_comp(node_ptr) + || rec_get_status(node_ptr) == REC_STATUS_NODE_PTR); + offsets = rec_get_offsets(node_ptr, index, offsets, 0, + ULINT_UNDEFINED, &heap); + + if (btr_node_ptr_get_child_page_no(node_ptr, offsets) != page_no) { + offsets = nullptr; + } + + return(offsets); +} + +/************************************************************//** +Returns the father block to a page. It is assumed that mtr holds +an X or SX latch on the tree. +@return rec_get_offsets() of the node pointer record */ +rec_offs* +rtr_page_get_father_block( +/*======================*/ + rec_offs* offsets,/*!< in: work area for the return value */ + mem_heap_t* heap, /*!< in: memory heap to use */ + mtr_t* mtr, /*!< in: mtr */ + btr_cur_t* sea_cur,/*!< in: search cursor, contains information + about parent nodes in search */ + btr_cur_t* cursor) /*!< out: cursor on node pointer record, + its page x-latched */ +{ + rec_t *rec= + page_rec_get_next(page_get_infimum_rec(cursor->block()->page.frame)); + if (!rec) + return nullptr; + cursor->page_cur.rec= rec; + return rtr_page_get_father_node_ptr(offsets, heap, sea_cur, cursor, mtr); +} + +/*******************************************************************//** +Create a RTree search info structure */ +rtr_info_t* +rtr_create_rtr_info( +/******************/ + bool need_prdt, /*!< in: Whether predicate lock + is needed */ + bool init_matches, /*!< in: Whether to initiate the + "matches" structure for collecting + matched leaf records */ + btr_cur_t* cursor, /*!< in: tree search cursor */ + dict_index_t* index) /*!< in: index struct */ +{ + rtr_info_t* rtr_info; + + index = index ? 
index : cursor->index(); + ut_ad(index); + + rtr_info = static_cast<rtr_info_t*>(ut_zalloc_nokey(sizeof(*rtr_info))); + + rtr_info->allocated = true; + rtr_info->cursor = cursor; + rtr_info->index = index; + + if (init_matches) { + rtr_info->heap = mem_heap_create(sizeof(*(rtr_info->matches))); + rtr_info->matches = static_cast<matched_rec_t*>( + mem_heap_zalloc( + rtr_info->heap, + sizeof(*rtr_info->matches))); + + rtr_info->matches->matched_recs + = UT_NEW_NOKEY(rtr_rec_vector()); + + rtr_info->matches->bufp = page_align(rtr_info->matches->rec_buf + + UNIV_PAGE_SIZE_MAX + 1); + mysql_mutex_init(rtr_match_mutex_key, + &rtr_info->matches->rtr_match_mutex, + nullptr); + rtr_info->matches->block.page.lock.init(); + } + + rtr_info->path = UT_NEW_NOKEY(rtr_node_path_t()); + rtr_info->parent_path = UT_NEW_NOKEY(rtr_node_path_t()); + rtr_info->need_prdt_lock = need_prdt; + mysql_mutex_init(rtr_path_mutex_key, &rtr_info->rtr_path_mutex, + nullptr); + + mysql_mutex_lock(&index->rtr_track->rtr_active_mutex); + index->rtr_track->rtr_active.push_front(rtr_info); + mysql_mutex_unlock(&index->rtr_track->rtr_active_mutex); + return(rtr_info); +} + +/*******************************************************************//** +Update a btr_cur_t with rtr_info */ +void +rtr_info_update_btr( +/******************/ + btr_cur_t* cursor, /*!< in/out: tree cursor */ + rtr_info_t* rtr_info) /*!< in: rtr_info to set to the + cursor */ +{ + ut_ad(rtr_info); + + cursor->rtr_info = rtr_info; +} + +/*******************************************************************//** +Initialize an R-Tree search structure */ +void +rtr_init_rtr_info( +/****************/ + rtr_info_t* rtr_info, /*!< in: rtr_info to set to the + cursor */ + bool need_prdt, /*!< in: Whether predicate lock is + needed */ + btr_cur_t* cursor, /*!< in: tree search cursor */ + dict_index_t* index, /*!< in: index structure */ + bool reinit) /*!< in: Whether this is a reinit */ +{ + ut_ad(rtr_info); + + if (!reinit) { + /* Reset all members.
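Both rtr_create_rtr_info() above and rtr_init_rtr_info() register the search in index->rtr_track->rtr_active under rtr_active_mutex, so that rtr_check_discard_page() can later walk every live search and purge references to a page that is about to disappear. A simplified model of that bookkeeping (std::mutex and std::list stand in for the actual types):

    #include <list>
    #include <mutex>

    struct rtr_track_model {
        std::mutex       mtx;
        std::list<void*> active;                    // live rtr_info objects
        void track(void* info)
        { std::lock_guard<std::mutex> g(mtx); active.push_front(info); }
        void untrack(void* info)
        { std::lock_guard<std::mutex> g(mtx); active.remove(info); }
    };
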
*/ + memset(rtr_info, 0, sizeof *rtr_info); + static_assert(PAGE_CUR_UNSUPP == 0, "compatibility"); + mysql_mutex_init(rtr_path_mutex_key, &rtr_info->rtr_path_mutex, + nullptr); + } + + ut_ad(!rtr_info->matches || rtr_info->matches->matched_recs->empty()); + + rtr_info->path = UT_NEW_NOKEY(rtr_node_path_t()); + rtr_info->parent_path = UT_NEW_NOKEY(rtr_node_path_t()); + rtr_info->need_prdt_lock = need_prdt; + rtr_info->cursor = cursor; + rtr_info->index = index; + + mysql_mutex_lock(&index->rtr_track->rtr_active_mutex); + index->rtr_track->rtr_active.push_front(rtr_info); + mysql_mutex_unlock(&index->rtr_track->rtr_active_mutex); +} + +/**************************************************************//** +Clean up R-Tree search structure */ +void +rtr_clean_rtr_info( +/*===============*/ + rtr_info_t* rtr_info, /*!< in: RTree search info */ + bool free_all) /*!< in: need to free rtr_info itself */ +{ + dict_index_t* index; + bool initialized = false; + + if (!rtr_info) { + return; + } + + index = rtr_info->index; + + if (index) { + mysql_mutex_lock(&index->rtr_track->rtr_active_mutex); + } + + while (rtr_info->parent_path && !rtr_info->parent_path->empty()) { + btr_pcur_t* cur = rtr_info->parent_path->back().cursor; + rtr_info->parent_path->pop_back(); + + if (cur) { + btr_pcur_close(cur); + ut_free(cur); + } + } + + UT_DELETE(rtr_info->parent_path); + rtr_info->parent_path = NULL; + + if (rtr_info->path != NULL) { + UT_DELETE(rtr_info->path); + rtr_info->path = NULL; + initialized = true; + } + + if (rtr_info->matches) { + rtr_info->matches->used = false; + rtr_info->matches->locked = false; + rtr_info->matches->valid = false; + rtr_info->matches->matched_recs->clear(); + } + + if (index) { + index->rtr_track->rtr_active.remove(rtr_info); + mysql_mutex_unlock(&index->rtr_track->rtr_active_mutex); + } + + if (free_all) { + if (rtr_info->matches) { + if (rtr_info->matches->matched_recs != NULL) { + UT_DELETE(rtr_info->matches->matched_recs); + } + + rtr_info->matches->block.page.lock.free(); + + mysql_mutex_destroy( + &rtr_info->matches->rtr_match_mutex); + } + + if (rtr_info->heap) { + mem_heap_free(rtr_info->heap); + } + + if (initialized) { + mysql_mutex_destroy(&rtr_info->rtr_path_mutex); + } + + if (rtr_info->allocated) { + ut_free(rtr_info); + } + } +} + +/**************************************************************//** +Rebuilt the "path" to exclude the removing page no */ +static +void +rtr_rebuild_path( +/*=============*/ + rtr_info_t* rtr_info, /*!< in: RTree search info */ + ulint page_no) /*!< in: need to free rtr_info itself */ +{ + rtr_node_path_t* new_path + = UT_NEW_NOKEY(rtr_node_path_t()); + + rtr_node_path_t::iterator rit; +#ifdef UNIV_DEBUG + ulint before_size = rtr_info->path->size(); +#endif /* UNIV_DEBUG */ + + for (rit = rtr_info->path->begin(); + rit != rtr_info->path->end(); ++rit) { + node_visit_t next_rec = *rit; + + if (next_rec.page_no == page_no) { + continue; + } + + new_path->push_back(next_rec); +#ifdef UNIV_DEBUG + node_visit_t rec = new_path->back(); + ut_ad(rec.level < rtr_info->cursor->tree_height + && rec.page_no > 0); +#endif /* UNIV_DEBUG */ + } + + UT_DELETE(rtr_info->path); + + ut_ad(new_path->size() == before_size - 1); + + rtr_info->path = new_path; + + if (!rtr_info->parent_path->empty()) { + rtr_node_path_t* new_parent_path = UT_NEW_NOKEY( + rtr_node_path_t()); + + for (rit = rtr_info->parent_path->begin(); + rit != rtr_info->parent_path->end(); ++rit) { + node_visit_t next_rec = *rit; + + if (next_rec.child_no == page_no) { + btr_pcur_t* cur = 
next_rec.cursor; + + if (cur) { + btr_pcur_close(cur); + ut_free(cur); + } + + continue; + } + + new_parent_path->push_back(next_rec); + } + UT_DELETE(rtr_info->parent_path); + rtr_info->parent_path = new_parent_path; + } + +} + +/**************************************************************//** +Check whether a discarding page is in anyone's search path */ +void +rtr_check_discard_page( +/*===================*/ + dict_index_t* index, /*!< in: index */ + btr_cur_t* cursor, /*!< in: cursor on the page to discard: not on + the root page */ + buf_block_t* block) /*!< in: block of page to be discarded */ +{ + const page_id_t id{block->page.id()}; + + mysql_mutex_lock(&index->rtr_track->rtr_active_mutex); + + for (const auto& rtr_info : index->rtr_track->rtr_active) { + if (cursor && rtr_info == cursor->rtr_info) { + continue; + } + + mysql_mutex_lock(&rtr_info->rtr_path_mutex); + for (const node_visit_t& node : *rtr_info->path) { + if (node.page_no == id.page_no()) { + rtr_rebuild_path(rtr_info, node.page_no); + break; + } + } + mysql_mutex_unlock(&rtr_info->rtr_path_mutex); + + if (auto matches = rtr_info->matches) { + mysql_mutex_lock(&matches->rtr_match_mutex); + + if (matches->block.page.id() == id) { + matches->matched_recs->clear(); + matches->valid = false; + } + + mysql_mutex_unlock(&matches->rtr_match_mutex); + } + } + + mysql_mutex_unlock(&index->rtr_track->rtr_active_mutex); + + lock_sys.prdt_page_free_from_discard(id, true); +} + +/** Structure acts as functor to get the optimistic access of the page. +It returns true if it successfully gets the page. */ +struct optimistic_get +{ + btr_pcur_t *const r_cursor; + mtr_t *const mtr; + + optimistic_get(btr_pcur_t *r_cursor,mtr_t *mtr) + :r_cursor(r_cursor), mtr(mtr) {} + + bool operator()(buf_block_t *hint) const + { + return hint && buf_page_optimistic_get( + RW_X_LATCH, hint, r_cursor->modify_clock, mtr); + } +}; + +/** Restore the stored position of a persistent cursor bufferfixing the page */ +static +bool +rtr_cur_restore_position( + btr_cur_t* btr_cur, /*!< in: detached persistent cursor */ + ulint level, /*!< in: index level */ + mtr_t* mtr) /*!< in: mtr */ +{ + dict_index_t* index; + mem_heap_t* heap; + btr_pcur_t* r_cursor = rtr_get_parent_cursor(btr_cur, level, false); + dtuple_t* tuple; + bool ret = false; + + ut_ad(mtr); + ut_ad(r_cursor); + ut_ad(mtr->is_active()); + + index = btr_cur_get_index(btr_cur); + ut_ad(r_cursor->index() == btr_cur->index()); + + if (r_cursor->rel_pos == BTR_PCUR_AFTER_LAST_IN_TREE + || r_cursor->rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE) { + return(false); + } + + DBUG_EXECUTE_IF( + "rtr_pessimistic_position", + r_cursor->modify_clock = 100; + ); + + if (r_cursor->block_when_stored.run_with_hint( + optimistic_get(r_cursor, mtr))) { + ut_ad(r_cursor->pos_state == BTR_PCUR_IS_POSITIONED); + + ut_ad(r_cursor->rel_pos == BTR_PCUR_ON); +#ifdef UNIV_DEBUG + do { + const rec_t* rec; + const rec_offs* offsets1; + const rec_offs* offsets2; + ulint comp; + + rec = btr_pcur_get_rec(r_cursor); + + heap = mem_heap_create(256); + offsets1 = rec_get_offsets( + r_cursor->old_rec, index, NULL, + level ? 0 : r_cursor->old_n_fields, + r_cursor->old_n_fields, &heap); + offsets2 = rec_get_offsets( + rec, index, NULL, + level ? 
0 : r_cursor->old_n_fields, + r_cursor->old_n_fields, &heap); + + comp = rec_offs_comp(offsets1); + + if (rec_get_info_bits(r_cursor->old_rec, comp) + & REC_INFO_MIN_REC_FLAG) { + ut_ad(rec_get_info_bits(rec, comp) + & REC_INFO_MIN_REC_FLAG); + } else { + + ut_ad(!cmp_rec_rec(r_cursor->old_rec, + rec, offsets1, offsets2, + index)); + } + + mem_heap_free(heap); + } while (0); +#endif /* UNIV_DEBUG */ + + return(true); + } + + /* Page has changed, for R-Tree, the page cannot be shrunk away, + so we search the page and its right siblings */ + node_seq_t page_ssn; + const page_t* page; + page_cur_t* page_cursor; + node_visit_t* node = rtr_get_parent_node(btr_cur, level, false); + node_seq_t path_ssn = node->seq_no; + const unsigned zip_size = index->table->space->zip_size(); + uint32_t page_no = node->page_no; + + heap = mem_heap_create(256); + + tuple = dict_index_build_data_tuple(r_cursor->old_rec, index, !level, + r_cursor->old_n_fields, heap); + + page_cursor = btr_pcur_get_page_cur(r_cursor); + ut_ad(r_cursor == node->cursor); + +search_again: + ulint up_match = 0, low_match = 0; + + page_cursor->block = buf_page_get_gen( + page_id_t(index->table->space_id, page_no), + zip_size, RW_X_LATCH, NULL, BUF_GET, mtr); + + if (!page_cursor->block) { +corrupted: + ret = false; + goto func_exit; + } + + /* Get the page SSN */ + page = buf_block_get_frame(page_cursor->block); + page_ssn = page_get_ssn_id(page); + + if (page_cur_search_with_match(tuple, PAGE_CUR_LE, + &up_match, &low_match, page_cursor, + nullptr)) { + goto corrupted; + } + + if (low_match == r_cursor->old_n_fields) { + const rec_t* rec; + const rec_offs* offsets1; + const rec_offs* offsets2; + ulint comp; + + rec = btr_pcur_get_rec(r_cursor); + + offsets1 = rec_get_offsets(r_cursor->old_rec, index, NULL, + level ? 0 : r_cursor->old_n_fields, + r_cursor->old_n_fields, &heap); + offsets2 = rec_get_offsets(rec, index, NULL, + level ? 
0 : r_cursor->old_n_fields, + r_cursor->old_n_fields, &heap); + + comp = rec_offs_comp(offsets1); + + if ((rec_get_info_bits(r_cursor->old_rec, comp) + & REC_INFO_MIN_REC_FLAG) + && (rec_get_info_bits(rec, comp) & REC_INFO_MIN_REC_FLAG)) { + r_cursor->pos_state = BTR_PCUR_IS_POSITIONED; + ret = true; + } else if (!cmp_rec_rec(r_cursor->old_rec, rec, offsets1, offsets2, + index)) { + r_cursor->pos_state = BTR_PCUR_IS_POSITIONED; + ret = true; + } + } + + /* Check the page SSN to see if it has been splitted, if so, search + the right page */ + if (!ret && page_ssn > path_ssn) { + page_no = btr_page_get_next(page); + goto search_again; + } + +func_exit: + mem_heap_free(heap); + + return(ret); +} + +/****************************************************************//** +Copy the leaf level R-tree record, and push it to matched_rec in rtr_info */ +static +void +rtr_leaf_push_match_rec( +/*====================*/ + const rec_t* rec, /*!< in: record to copy */ + rtr_info_t* rtr_info, /*!< in/out: search stack */ + rec_offs* offsets, /*!< in: offsets */ + bool is_comp) /*!< in: is compact format */ +{ + byte* buf; + matched_rec_t* match_rec = rtr_info->matches; + rec_t* copy; + ulint data_len; + rtr_rec_t rtr_rec; + + buf = match_rec->block.page.frame + match_rec->used; + ut_ad(page_rec_is_leaf(rec)); + + copy = rec_copy(buf, rec, offsets); + + if (is_comp) { + rec_set_next_offs_new(copy, PAGE_NEW_SUPREMUM); + } else { + rec_set_next_offs_old(copy, PAGE_OLD_SUPREMUM); + } + + rtr_rec.r_rec = copy; + rtr_rec.locked = false; + + match_rec->matched_recs->push_back(rtr_rec); + match_rec->valid = true; + + data_len = rec_offs_data_size(offsets) + rec_offs_extra_size(offsets); + match_rec->used += data_len; + + ut_ad(match_rec->used < srv_page_size); +} + +/**************************************************************//** +Store the parent path cursor +@return number of cursor stored */ +ulint +rtr_store_parent_path( +/*==================*/ + const buf_block_t* block, /*!< in: block of the page */ + btr_cur_t* btr_cur,/*!< in/out: persistent cursor */ + btr_latch_mode latch_mode, + /*!< in: latch_mode */ + ulint level, /*!< in: index level */ + mtr_t* mtr) /*!< in: mtr */ +{ + ulint num = btr_cur->rtr_info->parent_path->size(); + ulint num_stored = 0; + + while (num >= 1) { + node_visit_t* node = &(*btr_cur->rtr_info->parent_path)[ + num - 1]; + btr_pcur_t* r_cursor = node->cursor; + buf_block_t* cur_block; + + if (node->level > level) { + break; + } + + r_cursor->pos_state = BTR_PCUR_IS_POSITIONED; + r_cursor->latch_mode = latch_mode; + + cur_block = btr_pcur_get_block(r_cursor); + + if (cur_block == block) { + btr_pcur_store_position(r_cursor, mtr); + num_stored++; + } else { + break; + } + + num--; + } + + return(num_stored); +} +/**************************************************************//** +push a nonleaf index node to the search path for insertion */ +static +void +rtr_non_leaf_insert_stack_push( +/*===========================*/ + dict_index_t* index, /*!< in: index descriptor */ + rtr_node_path_t* path, /*!< in/out: search path */ + ulint level, /*!< in: index page level */ + uint32_t child_no,/*!< in: child page no */ + const buf_block_t* block, /*!< in: block of the page */ + const rec_t* rec, /*!< in: positioned record */ + double mbr_inc)/*!< in: MBR needs to be enlarged */ +{ + node_seq_t new_seq; + btr_pcur_t* my_cursor; + + my_cursor = static_cast( + ut_malloc_nokey(sizeof(*my_cursor))); + + btr_pcur_init(my_cursor); + + page_cur_position(rec, block, btr_pcur_get_page_cur(my_cursor)); 
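Stepping back to rtr_cur_restore_position() above: it is the usual optimistic-then-pessimistic restore. A generic restatement of the pattern (hypothetical helper, not the actual function):

    // Try the cheap path first: revisit the remembered block and trust it
    // if its modify_clock is unchanged. Otherwise fall back to a full
    // re-search from the stored key, following right siblings whenever
    // the page SSN shows a split since the position was stored.
    template <typename Optimistic, typename Pessimistic>
    bool restore_position(Optimistic&& fast, Pessimistic&& slow)
    {
        return fast() || slow();
    }
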
+ + btr_pcur_get_page_cur(my_cursor)->index = index; + + new_seq = rtr_get_current_ssn_id(index); + rtr_non_leaf_stack_push(path, block->page.id().page_no(), + new_seq, level, child_no, my_cursor, mbr_inc); +} + +/** Copy a buf_block_t, except "block->page.lock". +@param[in,out] matches copy to match->block +@param[in] block block to copy */ +static +void +rtr_copy_buf( + matched_rec_t* matches, + const buf_block_t* block) +{ + /* Copy all members of "block" to "matches->block" except "lock". + We skip "lock" because it is not used + from the dummy buf_block_t we create here and because memcpy()ing + it generates (valid) compiler warnings that the vtable pointer + will be copied. */ + matches->block.page.lock.free(); + new (&matches->block.page) buf_page_t(block->page); + matches->block.page.frame = block->page.frame; + matches->block.unzip_LRU = block->unzip_LRU; + + ut_d(matches->block.in_unzip_LRU_list = block->in_unzip_LRU_list); + ut_d(matches->block.in_withdraw_list = block->in_withdraw_list); + + /* Skip buf_block_t::lock */ + matches->block.modify_clock = block->modify_clock; +#ifdef BTR_CUR_HASH_ADAPT + matches->block.n_hash_helps = block->n_hash_helps; + matches->block.n_fields = block->n_fields; + matches->block.left_side = block->left_side; +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + matches->block.n_pointers = 0; +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + matches->block.curr_n_fields = block->curr_n_fields; + matches->block.curr_left_side = block->curr_left_side; + matches->block.index = block->index; +#endif /* BTR_CUR_HASH_ADAPT */ +} + +/****************************************************************//** +Generate a shadow copy of the page block header to save the +matched records */ +static +void +rtr_init_match( +/*===========*/ + matched_rec_t* matches,/*!< in/out: match to initialize */ + const buf_block_t* block, /*!< in: buffer block */ + const page_t* page) /*!< in: buffer page */ +{ + ut_ad(matches->matched_recs->empty()); + matches->locked = false; + rtr_copy_buf(matches, block); + matches->block.page.frame = matches->bufp; + matches->valid = false; + /* We have to copy PAGE_*_SUPREMUM_END bytes so that we can + use infimum/supremum of this page as normal btr page for search. */ + memcpy(matches->block.page.frame, page, page_is_comp(page) + ? PAGE_NEW_SUPREMUM_END : PAGE_OLD_SUPREMUM_END); + matches->used = page_is_comp(page) + ? 
PAGE_NEW_SUPREMUM_END + : PAGE_OLD_SUPREMUM_END; +#ifdef RTR_SEARCH_DIAGNOSTIC + ulint pageno = page_get_page_no(page); + fprintf(stderr, "INNODB_RTR: Searching leaf page %d\n", + static_cast<int>(pageno)); +#endif /* RTR_SEARCH_DIAGNOSTIC */ +} + +/****************************************************************//** +Get the bounding box content from an index record */ +void +rtr_get_mbr_from_rec( +/*=================*/ + const rec_t* rec, /*!< in: index record */ + const rec_offs* offsets,/*!< in: offsets array */ + rtr_mbr_t* mbr) /*!< out: MBR */ +{ + ulint rec_f_len; + const byte* data; + + data = rec_get_nth_field(rec, offsets, 0, &rec_f_len); + + rtr_read_mbr(data, mbr); +} + +/****************************************************************//** +Get the bounding box content from an MBR data record */ +void +rtr_get_mbr_from_tuple( +/*===================*/ + const dtuple_t* dtuple, /*!< in: data tuple */ + rtr_mbr* mbr) /*!< out: mbr to fill */ +{ + const dfield_t* dtuple_field; + ulint dtuple_f_len; + + dtuple_field = dtuple_get_nth_field(dtuple, 0); + dtuple_f_len = dfield_get_len(dtuple_field); + ut_a(dtuple_f_len >= 4 * sizeof(double)); + + rtr_read_mbr(static_cast<const byte*>(dfield_get_data(dtuple_field)), + mbr); +} + +/** Compare minimum bounding rectangles. +@return 1, 0 or -1 if mode == PAGE_CUR_MBR_EQUAL; otherwise 1 or 0, +depending on whether a and b qualify for the relationship +(CONTAINS, WITHIN etc.) */ +static int cmp_gis_field(page_cur_mode_t mode, const void *a, const void *b) +{ + return mode == PAGE_CUR_MBR_EQUAL + ? cmp_geometry_field(a, b) + : rtree_key_cmp(mode, a, b); +} + +/** Compare a GIS data tuple to a physical record in rtree non-leaf node. +We need to check the page number field, since we don't store pk field in +rtree non-leaf node. +@param[in] dtuple data tuple +@param[in] rec R-tree record +@return whether dtuple is less than rec */ +static bool +cmp_dtuple_rec_with_gis_internal(const dtuple_t* dtuple, const rec_t* rec) +{ + const dfield_t *dtuple_field= dtuple_get_nth_field(dtuple, 0); + ut_ad(dfield_get_len(dtuple_field) == DATA_MBR_LEN); + + if (cmp_gis_field(PAGE_CUR_WITHIN, dfield_get_data(dtuple_field), rec)) + return true; + + dtuple_field= dtuple_get_nth_field(dtuple, 1); + ut_ad(dfield_get_len(dtuple_field) == 4); /* child page number */ + ut_ad(dtuple_field->type.mtype == DATA_SYS_CHILD); + ut_ad(!(dtuple_field->type.prtype & ~DATA_NOT_NULL)); + + return memcmp(dtuple_field->data, rec + DATA_MBR_LEN, 4) != 0; +} + +#ifndef UNIV_DEBUG +static +#endif +/** Compare a GIS data tuple to a physical record. +@param[in] dtuple data tuple +@param[in] rec R-tree record +@param[in] mode compare mode +@retval negative if dtuple is less than rec */ +int cmp_dtuple_rec_with_gis(const dtuple_t *dtuple, const rec_t *rec, + page_cur_mode_t mode) +{ + const dfield_t *dtuple_field= dtuple_get_nth_field(dtuple, 0); + /* FIXME: TABLE_SHARE::init_from_binary_frm_image() is adding + field->key_part_length_bytes() to the key length */ + ut_ad(dfield_get_len(dtuple_field) == DATA_MBR_LEN || + dfield_get_len(dtuple_field) == DATA_MBR_LEN + 2); + + return cmp_gis_field(mode, dfield_get_data(dtuple_field), rec); +} + +/****************************************************************//** +Searches the right position in rtree for a page cursor.
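Before the implementation, a hedged summary of how non-leaf levels translate the caller's search mode (this restates the switch in the function body below and is not a function in the source; cmp_dtuple_rec_with_gis() returns 0 on a match):

    static bool subtree_may_match(page_cur_mode_t mode,
                                  const dtuple_t* tuple, const rec_t* rec)
    {
        switch (mode) {
        case PAGE_CUR_CONTAIN:
        case PAGE_CUR_INTERSECT:
        case PAGE_CUR_MBR_EQUAL:
            return !cmp_dtuple_rec_with_gis(tuple, rec, PAGE_CUR_CONTAIN)
                || !cmp_dtuple_rec_with_gis(tuple, rec, PAGE_CUR_INTERSECT);
        case PAGE_CUR_DISJOINT:
            /* even a disjoint query must descend into subtrees whose
            MBR intersects the search MBR */
            return !cmp_dtuple_rec_with_gis(tuple, rec, mode)
                || !cmp_dtuple_rec_with_gis(tuple, rec, PAGE_CUR_INTERSECT);
        default:        /* WITHIN etc. */
            return !cmp_dtuple_rec_with_gis(tuple, rec, mode);
        }
    }
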
*/ +bool +rtr_cur_search_with_match( +/*======================*/ + const buf_block_t* block, /*!< in: buffer block */ + dict_index_t* index, /*!< in: index descriptor */ + const dtuple_t* tuple, /*!< in: data tuple */ + page_cur_mode_t mode, /*!< in: PAGE_CUR_RTREE_INSERT, + PAGE_CUR_RTREE_LOCATE etc. */ + page_cur_t* cursor, /*!< in/out: page cursor */ + rtr_info_t* rtr_info)/*!< in/out: search stack */ +{ + bool found = false; + const page_t* page; + const rec_t* rec; + const rec_t* last_rec; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + mem_heap_t* heap = NULL; + int cmp = 1; + double least_inc = DBL_MAX; + const rec_t* best_rec; + const rec_t* last_match_rec = NULL; + bool match_init = false; + page_cur_mode_t orig_mode = mode; + const rec_t* first_rec = NULL; + + rec_offs_init(offsets_); + + ut_ad(RTREE_SEARCH_MODE(mode)); + + ut_ad(dict_index_is_spatial(index)); + + page = buf_block_get_frame(block); + + const ulint level = btr_page_get_level(page); + const ulint n_core = level ? 0 : index->n_fields; + + if (mode == PAGE_CUR_RTREE_LOCATE) { + ut_ad(level != 0); + mode = PAGE_CUR_WITHIN; + } + + rec = page_dir_slot_get_rec_validate(page_dir_get_nth_slot(page, 0)); + + if (UNIV_UNLIKELY(!rec)) { + return false; + } + + last_rec = rec; + best_rec = rec; + + if (page_rec_is_infimum(rec)) { + rec = page_rec_get_next_const(rec); + if (UNIV_UNLIKELY(!rec)) { + return false; + } + } + + /* Check insert tuple size is larger than first rec, and try to + avoid it if possible */ + if (mode == PAGE_CUR_RTREE_INSERT && !page_rec_is_supremum(rec)) { + + ulint new_rec_size = rec_get_converted_size(index, tuple, 0); + + offsets = rec_get_offsets(rec, index, offsets, n_core, + dtuple_get_n_fields_cmp(tuple), + &heap); + + if (rec_offs_size(offsets) < new_rec_size) { + first_rec = rec; + } + + /* If this is the left-most page of this index level + and the table is a compressed table, try to avoid + first page as much as possible, as there will be problem + when update MIN_REC rec in compress table */ + if (is_buf_block_get_page_zip(block) + && !page_has_prev(page) + && page_get_n_recs(page) >= 2) { + + rec = page_rec_get_next_const(rec); + } + } + + while (!page_rec_is_supremum(rec)) { + if (!n_core) { + switch (mode) { + case PAGE_CUR_CONTAIN: + case PAGE_CUR_INTERSECT: + case PAGE_CUR_MBR_EQUAL: + /* At non-leaf level, we will need to check + both CONTAIN and INTERSECT for either of + the search mode */ + cmp = cmp_dtuple_rec_with_gis( + tuple, rec, PAGE_CUR_CONTAIN); + + if (cmp != 0) { + cmp = cmp_dtuple_rec_with_gis( + tuple, rec, + PAGE_CUR_INTERSECT); + } + break; + case PAGE_CUR_DISJOINT: + cmp = cmp_dtuple_rec_with_gis( + tuple, rec, mode); + + if (cmp != 0) { + cmp = cmp_dtuple_rec_with_gis( + tuple, rec, + PAGE_CUR_INTERSECT); + } + break; + case PAGE_CUR_RTREE_INSERT: + double increase; + double area; + + cmp = cmp_dtuple_rec_with_gis( + tuple, rec, PAGE_CUR_WITHIN); + + if (cmp != 0) { + increase = rtr_rec_cal_increase( + tuple, rec, &area); + /* Once it goes beyond DBL_MAX, + it would not make sense to record + such value, just make it + DBL_MAX / 2 */ + if (increase >= DBL_MAX) { + increase = DBL_MAX / 2; + } + + if (increase < least_inc) { + least_inc = increase; + best_rec = rec; + } else if (best_rec + && best_rec == first_rec) { + /* if first_rec is set, + we will try to avoid it */ + least_inc = increase; + best_rec = rec; + } + } + break; + case PAGE_CUR_RTREE_GET_FATHER: + cmp = cmp_dtuple_rec_with_gis_internal( + tuple, rec); + break; + default: + 
/* WITHIN etc. */ + cmp = cmp_dtuple_rec_with_gis( + tuple, rec, mode); + } + } else { + /* At leaf level, INSERT should translate to LE */ + ut_ad(mode != PAGE_CUR_RTREE_INSERT); + + cmp = cmp_dtuple_rec_with_gis( + tuple, rec, mode); + } + + if (cmp == 0) { + found = true; + + /* If located, the matching node/rec will be pushed + to rtr_info->path for non-leaf nodes, or + rtr_info->matches for leaf nodes */ + if (rtr_info && mode != PAGE_CUR_RTREE_INSERT) { + if (!n_core) { + uint32_t page_no; + node_seq_t new_seq; + bool is_loc; + + is_loc = (orig_mode + == PAGE_CUR_RTREE_LOCATE + || orig_mode + == PAGE_CUR_RTREE_GET_FATHER); + + offsets = rec_get_offsets( + rec, index, offsets, 0, + ULINT_UNDEFINED, &heap); + + page_no = btr_node_ptr_get_child_page_no( + rec, offsets); + + ut_ad(level >= 1); + + /* Get current SSN, before we insert + it into the path stack */ + new_seq = rtr_get_current_ssn_id(index); + + rtr_non_leaf_stack_push( + rtr_info->path, + page_no, + new_seq, level - 1, 0, + NULL, 0); + + if (is_loc) { + rtr_non_leaf_insert_stack_push( + index, + rtr_info->parent_path, + level, page_no, block, + rec, 0); + } + + if (!srv_read_only_mode + && (rtr_info->need_page_lock + || !is_loc)) { + + /* Lock the page, preventing it + from being shrunk */ + lock_place_prdt_page_lock( + page_id_t(block->page + .id() + .space(), + page_no), + index, + rtr_info->thr); + } + } else { + ut_ad(orig_mode + != PAGE_CUR_RTREE_LOCATE); + + if (!match_init) { + rtr_init_match( + rtr_info->matches, + block, page); + match_init = true; + } + + /* Collect matched records on page */ + offsets = rec_get_offsets( + rec, index, offsets, + index->n_fields, + ULINT_UNDEFINED, &heap); + rtr_leaf_push_match_rec( + rec, rtr_info, offsets, + page_is_comp(page)); + } + + last_match_rec = rec; + } else { + /* This is the insertion case, it will break + once it finds the first MBR that can accomodate + the inserting rec */ + break; + } + } + + last_rec = rec; + + rec = page_rec_get_next_const(rec); + } + + /* All records on page are searched */ + if (rec && page_rec_is_supremum(rec)) { + if (!n_core) { + if (!found) { + /* No match case, if it is for insertion, + then we select the record that result in + least increased area */ + if (mode == PAGE_CUR_RTREE_INSERT) { + ut_ad(least_inc < DBL_MAX); + offsets = rec_get_offsets( + best_rec, index, offsets, + 0, ULINT_UNDEFINED, &heap); + uint32_t child_no = + btr_node_ptr_get_child_page_no( + best_rec, offsets); + + rtr_non_leaf_insert_stack_push( + index, rtr_info->parent_path, + level, child_no, block, + best_rec, least_inc); + + page_cur_position(best_rec, block, + cursor); + rtr_info->mbr_adj = true; + } else { + /* Position at the last rec of the + page, if it is not the leaf page */ + page_cur_position(last_rec, block, + cursor); + } + } else { + /* There are matching records, position + in the last matching records */ + if (rtr_info) { + rec = last_match_rec; + page_cur_position( + rec, block, cursor); + } + } + } else if (rtr_info) { + /* Leaf level, no match, position at the + last (supremum) rec */ + if (!last_match_rec) { + page_cur_position(rec, block, cursor); + goto func_exit; + } + + /* There are matched records */ + matched_rec_t* match_rec = rtr_info->matches; + + rtr_rec_t test_rec; + + test_rec = match_rec->matched_recs->back(); +#ifdef UNIV_DEBUG + rec_offs offsets_2[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets2 = offsets_2; + rec_offs_init(offsets_2); + + ut_ad(found); + + /* Verify the record to be positioned is the same + as the last record in 
matched_rec vector */ + offsets2 = rec_get_offsets(test_rec.r_rec, index, + offsets2, index->n_fields, + ULINT_UNDEFINED, &heap); + + offsets = rec_get_offsets(last_match_rec, index, + offsets, index->n_fields, + ULINT_UNDEFINED, &heap); + + ut_ad(cmp_rec_rec(test_rec.r_rec, last_match_rec, + offsets2, offsets, index) == 0); +#endif /* UNIV_DEBUG */ + /* Pop the last match record and position on it */ + match_rec->matched_recs->pop_back(); + page_cur_position(test_rec.r_rec, &match_rec->block, + cursor); + } + } else { + + if (mode == PAGE_CUR_RTREE_INSERT) { + ut_ad(!last_match_rec); + rtr_non_leaf_insert_stack_push( + index, rtr_info->parent_path, level, + mach_read_from_4(rec + DATA_MBR_LEN), + block, rec, 0); + + } else if (rtr_info && found && !n_core) { + rec = last_match_rec; + } + + page_cur_position(rec, block, cursor); + } + +#ifdef UNIV_DEBUG + /* Verify that we are positioned at the same child page as pushed in + the path stack */ + if (!n_core && (!page_rec_is_supremum(rec) || found) + && mode != PAGE_CUR_RTREE_INSERT) { + ulint page_no; + + offsets = rec_get_offsets(rec, index, offsets, 0, + ULINT_UNDEFINED, &heap); + page_no = btr_node_ptr_get_child_page_no(rec, offsets); + + if (rtr_info && found) { + rtr_node_path_t* path = rtr_info->path; + node_visit_t last_visit = path->back(); + + ut_ad(last_visit.page_no == page_no); + } + } +#endif /* UNIV_DEBUG */ + +func_exit: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + return(found); +} diff --git a/storage/innobase/ha/ha0storage.cc b/storage/innobase/ha/ha0storage.cc new file mode 100644 index 00000000..acde71b0 --- /dev/null +++ b/storage/innobase/ha/ha0storage.cc @@ -0,0 +1,178 @@ +/***************************************************************************** + +Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file ha/ha0storage.cc +Hash storage. +Provides a data structure that stores chunks of data in +its own storage, avoiding duplicates. + +Created September 22, 2007 Vasil Dimov +*******************************************************/ + +#include "ha0storage.h" +#include "hash0hash.h" +#include "mem0mem.h" +#include "ut0rnd.h" + +/*******************************************************************//** +Retrieves a data from a storage. If it is present, a pointer to the +stored copy of data is returned, otherwise NULL is returned. 
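
Before the implementation, here is the idea of ha0storage in miniature: keep one canonical copy of every distinct byte chunk, hand out stable pointers to those copies, and refuse new allocations past a byte budget (the "memlim" of ha_storage_put_memlim() below, where 0 means no limit). This sketch uses std::unordered_set in place of InnoDB's fold-based hash table and memory heap; DedupStorage is a stand-in name.

```cpp
#include <string>
#include <unordered_set>

class DedupStorage {
public:
  explicit DedupStorage(size_t memlim = 0) : memlim_(memlim) {}

  // Return a stable pointer to the stored copy of [data, data+len),
  // or nullptr if storing a new chunk would exceed the memory limit.
  const void* put(const void* data, size_t len)
  {
    std::string chunk(static_cast<const char*>(data), len);
    auto it = pool_.find(chunk);
    if (it != pool_.end())
      return it->data();            // duplicate: no new memory needed
    if (memlim_ && size_ + len > memlim_)
      return nullptr;               // over budget, like memlim
    size_ += len;
    return pool_.insert(std::move(chunk)).first->data();
  }

private:
  std::unordered_set<std::string> pool_;  // node-based: pointers stay valid
  size_t size_ = 0;                       // payload bytes stored so far
  size_t memlim_;                         // 0 = unlimited
};
```

The ha_storage_put() used in the test function further down is presumably the no-limit convenience form of ha_storage_put_memlim().
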
*/
+static
+const void*
+ha_storage_get(
+/*===========*/
+	ha_storage_t*	storage,	/*!< in: hash storage */
+	const void*	data,		/*!< in: data to check for */
+	ulint		data_len)	/*!< in: data length */
+{
+	ha_storage_node_t*	node;
+	ulint			fold;
+
+	/* avoid repetitive calls to ut_fold_binary() in the HASH_SEARCH
+	macro */
+	fold = ut_fold_binary(static_cast<const byte*>(data), data_len);
+
+#define IS_FOUND	\
+	node->data_len == data_len && memcmp(node->data, data, data_len) == 0
+
+	HASH_SEARCH(
+		next,			/* node->"next" */
+		&storage->hash,		/* the hash table */
+		fold,			/* key */
+		ha_storage_node_t*,	/* type of node->next */
+		node,			/* auxiliary variable */
+		,			/* assertion */
+		IS_FOUND);		/* search criteria */
+
+	if (node == NULL) {
+
+		return(NULL);
+	}
+	/* else */
+
+	return(node->data);
+}
+
+/*******************************************************************//**
+Copies data into the storage and returns a pointer to the copy. If the
+same data chunk is already present, then a pointer to it is returned.
+Data chunks are considered to be equal if len1 == len2 and
+memcmp(data1, data2, len1) == 0. If "data" is not present (and thus
+data_len bytes need to be allocated) and the size of the storage would
+become more than "memlim", then "data" is not added and NULL is returned.
+To disable this behavior "memlim" can be set to 0, which stands for
+"no limit". */
+const void*
+ha_storage_put_memlim(
+/*==================*/
+	ha_storage_t*	storage,	/*!< in/out: hash storage */
+	const void*	data,		/*!< in: data to store */
+	ulint		data_len,	/*!< in: data length */
+	ulint		memlim)		/*!< in: memory limit to obey */
+{
+	void*			raw;
+	ha_storage_node_t*	node;
+	const void*		data_copy;
+	ulint			fold;
+
+	/* check if data chunk is already present */
+	data_copy = ha_storage_get(storage, data, data_len);
+	if (data_copy != NULL) {
+
+		return(data_copy);
+	}
+
+	/* not present */
+
+	/* check if we are allowed to allocate data_len bytes */
+	if (memlim > 0
+	    && ha_storage_get_size(storage) + data_len > memlim) {
+
+		return(NULL);
+	}
+
+	/* we put the auxiliary node struct and the data itself in one
+	contiguous block */
+	raw = mem_heap_alloc(storage->heap,
+			     sizeof(ha_storage_node_t) + data_len);
+
+	node = (ha_storage_node_t*) raw;
+	data_copy = (byte*) raw + sizeof(*node);
+
+	memcpy((byte*) raw + sizeof(*node), data, data_len);
+
+	node->data_len = data_len;
+	node->data = data_copy;
+
+	/* avoid repetitive calls to ut_fold_binary() in the HASH_INSERT
+	macro */
+	fold = ut_fold_binary(static_cast<const byte*>(data), data_len);
+
+	HASH_INSERT(
+		ha_storage_node_t,	/* type used in the hash chain */
+		next,			/* node->"next" */
+		&storage->hash,		/* the hash table */
+		fold,			/* key */
+		node);			/* add this data to the hash */
+
+	/* the returned chunk must not be modified, because that would
+	corrupt the hash table */
+	return(data_copy);
+}
+
+#ifdef UNIV_COMPILE_TEST_FUNCS
+
+void
+test_ha_storage()
+{
+	ha_storage_t*	storage;
+	char		buf[1024];
+	int		i;
+	const void*	stored[256];
+	const void*	p;
+
+	storage = ha_storage_create(0, 0);
+
+	for (i = 0; i < 256; i++) {
+
+		memset(buf, i, sizeof(buf));
+		stored[i] = ha_storage_put(storage, buf, sizeof(buf));
+	}
+
+	//ha_storage_empty(&storage);
+
+	for (i = 255; i >= 0; i--) {
+
+		memset(buf, i, sizeof(buf));
+		p = ha_storage_put(storage, buf, sizeof(buf));
+
+		if (p != stored[i]) {
+			ib::warn() << "ha_storage_put() returned " << p
+				<< " instead of " << stored[i] << ", i=" << i;
+			return;
+		}
+	}
+
+	ib::info() << "all ok";
+
+	ha_storage_free(storage);
+}
+
+#endif /*
UNIV_COMPILE_TEST_FUNCS */ diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc new file mode 100644 index 00000000..21bf10a1 --- /dev/null +++ b/storage/innobase/handler/ha_innodb.cc @@ -0,0 +1,21217 @@ +/***************************************************************************** + +Copyright (c) 2000, 2020, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2009, Percona Inc. +Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, 2023, MariaDB Corporation. + +Portions of this file contain modifications contributed and copyrighted +by Percona Inc.. Those modifications are +gratefully acknowledged and are described briefly in the InnoDB +documentation. The contributions by Percona Inc. are incorporated with +their permission, and subject to the conditions contained in the file +COPYING.Percona. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/** @file ha_innodb.cc */ + +#include "univ.i" + +/* Include necessary SQL headers */ +#include "ha_prototypes.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "sql_type_geom.h" +#include "scope.h" +#include "srv0srv.h" + +// MYSQL_PLUGIN_IMPORT extern my_bool lower_case_file_system; +// MYSQL_PLUGIN_IMPORT extern char mysql_unpacked_real_data_home[]; + +#include +#include +#include + +/* Include necessary InnoDB headers */ +#include "btr0btr.h" +#include "btr0cur.h" +#include "btr0bulk.h" +#include "btr0sea.h" +#include "buf0dblwr.h" +#include "buf0dump.h" +#include "buf0buf.h" +#include "buf0flu.h" +#include "buf0lru.h" +#include "dict0boot.h" +#include "dict0load.h" +#include "btr0defragment.h" +#include "dict0crea.h" +#include "dict0stats.h" +#include "dict0stats_bg.h" +#include "fil0fil.h" +#include "fsp0fsp.h" +#include "fts0fts.h" +#include "fts0plugin.h" +#include "fts0priv.h" +#include "fts0types.h" +#include "ibuf0ibuf.h" +#include "lock0lock.h" +#include "log0crypt.h" +#include "mtr0mtr.h" +#include "os0file.h" +#include "page0zip.h" +#include "row0import.h" +#include "row0ins.h" +#include "row0log.h" +#include "row0merge.h" +#include "row0mysql.h" +#include "row0quiesce.h" +#include "row0sel.h" +#include "row0upd.h" +#include "fil0crypt.h" +#include "srv0mon.h" +#include "srv0start.h" +#include "rem0rec.h" +#include "trx0purge.h" +#include "trx0roll.h" +#include "trx0rseg.h" +#include "trx0trx.h" +#include "fil0pagecompress.h" +#include "ut0mem.h" +#include "row0ext.h" +#include "mariadb_stats.h" +thread_local ha_handler_stats mariadb_dummy_stats; +thread_local ha_handler_stats *mariadb_stats= &mariadb_dummy_stats; + +#include "lz4.h" +#include "lzo/lzo1x.h" +#include "lzma.h" +#include "bzlib.h" +#include "snappy-c.h" + +#include + +#define thd_get_trx_isolation(X) 
((enum_tx_isolation)thd_tx_isolation(X)) + +extern "C" void thd_mark_transaction_to_rollback(MYSQL_THD thd, bool all); +unsigned long long thd_get_query_id(const MYSQL_THD thd); +void thd_clear_error(MYSQL_THD thd); + +TABLE *find_fk_open_table(THD *thd, const char *db, size_t db_len, + const char *table, size_t table_len); +MYSQL_THD create_background_thd(); +void reset_thd(MYSQL_THD thd); +TABLE *get_purge_table(THD *thd); +TABLE *open_purge_table(THD *thd, const char *db, size_t dblen, + const char *tb, size_t tblen); +void close_thread_tables(THD* thd); + +#ifdef MYSQL_DYNAMIC_PLUGIN +#define tc_size 400 +#endif + +#include +#include + +#include "ha_innodb.h" +#include "i_s.h" + +#include +#include + +#ifdef WITH_WSREP +#include +#include "wsrep_sst.h" +#endif /* WITH_WSREP */ + +#ifdef HAVE_URING +/** The Linux kernel version if io_uring() is considered unsafe */ +const char *io_uring_may_be_unsafe; +#endif + +#define INSIDE_HA_INNOBASE_CC + +#define EQ_CURRENT_THD(thd) ((thd) == current_thd) + +struct handlerton* innodb_hton_ptr; + +static const long AUTOINC_OLD_STYLE_LOCKING = 0; +static const long AUTOINC_NEW_STYLE_LOCKING = 1; +static const long AUTOINC_NO_LOCKING = 2; + +static constexpr size_t buf_pool_chunk_min_size= 1U << 20; + +static ulong innobase_open_files; +static long innobase_autoinc_lock_mode; + +ulonglong innobase_buffer_pool_size; + +/** Percentage of the buffer pool to reserve for 'old' blocks. +Connected to buf_LRU_old_ratio. */ +static uint innobase_old_blocks_pct; + +static char* innobase_data_file_path; +static char* innobase_temp_data_file_path; + +/* The default values for the following char* start-up parameters +are determined in innodb_init_params(). */ + +static char* innobase_data_home_dir; +static char* innobase_enable_monitor_counter; +static char* innobase_disable_monitor_counter; +static char* innobase_reset_monitor_counter; +static char* innobase_reset_all_monitor_counter; + +/* This variable can be set in the server configure file, specifying +stopword table to be used */ +static char* innobase_server_stopword_table; + +my_bool innobase_rollback_on_timeout; +static my_bool innobase_create_status_file; +my_bool innobase_stats_on_metadata; +static my_bool innodb_optimize_fulltext_only; + +extern uint srv_fil_crypt_rotate_key_age; +extern uint srv_n_fil_crypt_iops; + +#ifdef UNIV_DEBUG +my_bool innodb_evict_tables_on_commit_debug; +#endif + +/** File format constraint for ALTER TABLE */ +ulong innodb_instant_alter_column_allowed; + +/** Note we cannot use rec_format_enum because we do not allow +COMPRESSED row format for innodb_default_row_format option. */ +enum default_row_format_enum { + DEFAULT_ROW_FORMAT_REDUNDANT = 0, + DEFAULT_ROW_FORMAT_COMPACT = 1, + DEFAULT_ROW_FORMAT_DYNAMIC = 2, +}; + +/** Whether ROW_FORMAT=COMPRESSED tables are read-only */ +static my_bool innodb_read_only_compressed; + +/** A dummy variable */ +static uint innodb_max_purge_lag_wait; + +/** Wait for trx_sys.history_size() to be below a limit. 
*/ +static void innodb_max_purge_lag_wait_update(THD *thd, st_mysql_sys_var *, + void *, const void *limit) +{ + if (high_level_read_only) + return; + const uint l= *static_cast(limit); + if (!trx_sys.history_exceeds(l)) + return; + mysql_mutex_unlock(&LOCK_global_system_variables); + while (trx_sys.history_exceeds(l)) + { + if (thd_kill_level(thd)) + break; + /* Adjust for purge_coordinator_state::refresh() */ + log_sys.latch.rd_lock(SRW_LOCK_CALL); + const lsn_t last= log_sys.last_checkpoint_lsn, + max_age= log_sys.max_checkpoint_age; + log_sys.latch.rd_unlock(); + const lsn_t lsn= log_sys.get_lsn(); + if ((lsn - last) / 4 >= max_age / 5) + buf_flush_ahead(last + max_age / 5, false); + purge_sys.wake_if_not_active(); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + mysql_mutex_lock(&LOCK_global_system_variables); +} + +static +void set_my_errno(int err) +{ + errno = err; +} + +/** Checks whether the file name belongs to a partition of a table. +@param[in] file_name file name +@return pointer to the end of the table name part of the file name, or NULL */ +static +char* +is_partition( +/*=========*/ + char* file_name) +{ + /* We look for pattern #P# to see if the table is partitioned + MariaDB table. */ + return strstr(file_name, table_name_t::part_suffix); +} + + + +/** Return the InnoDB ROW_FORMAT enum value +@param[in] row_format row_format from "innodb_default_row_format" +@return InnoDB ROW_FORMAT value from rec_format_t enum. */ +static +rec_format_t +get_row_format( + ulong row_format) +{ + switch(row_format) { + case DEFAULT_ROW_FORMAT_REDUNDANT: + return(REC_FORMAT_REDUNDANT); + case DEFAULT_ROW_FORMAT_COMPACT: + return(REC_FORMAT_COMPACT); + case DEFAULT_ROW_FORMAT_DYNAMIC: + return(REC_FORMAT_DYNAMIC); + default: + ut_ad(0); + return(REC_FORMAT_DYNAMIC); + } +} + +static ulong innodb_default_row_format = DEFAULT_ROW_FORMAT_DYNAMIC; + +/** Possible values for system variable "innodb_stats_method". The values +are defined the same as its corresponding MyISAM system variable +"myisam_stats_method"(see "myisam_stats_method_names"), for better usability */ +static const char* innodb_stats_method_names[] = { + "nulls_equal", + "nulls_unequal", + "nulls_ignored", + NullS +}; + +/** Used to define an enumerate type of the system variable innodb_stats_method. +This is the same as "myisam_stats_method_typelib" */ +static TYPELIB innodb_stats_method_typelib = { + array_elements(innodb_stats_method_names) - 1, + "innodb_stats_method_typelib", + innodb_stats_method_names, + NULL +}; + +/** Possible values of the parameter innodb_checksum_algorithm */ +const char* innodb_checksum_algorithm_names[] = { + "crc32", + "strict_crc32", + "full_crc32", + "strict_full_crc32", + NullS +}; + +/** Used to define an enumerate type of the system variable +innodb_checksum_algorithm. */ +TYPELIB innodb_checksum_algorithm_typelib = { + array_elements(innodb_checksum_algorithm_names) - 1, + "innodb_checksum_algorithm_typelib", + innodb_checksum_algorithm_names, + NULL +}; + +/** Possible values for system variable "innodb_default_row_format". */ +static const char* innodb_default_row_format_names[] = { + "redundant", + "compact", + "dynamic", + NullS +}; + +/** Used to define an enumerate type of the system variable +innodb_default_row_format. 
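
The update callback above follows a pattern worth noting: it must release LOCK_global_system_variables before sleeping and re-acquire it before returning, and it must honor the session kill flag while polling. A minimal sketch of that shape, with std::atomic stand-ins for trx_sys.history_size() and thd_kill_level() (the flush-ahead heuristic is omitted):

```cpp
#include <atomic>
#include <chrono>
#include <mutex>
#include <thread>

// 'global_lock' plays the role of LOCK_global_system_variables and is
// assumed to be held by the caller on entry and on return.
inline void wait_below_limit(std::atomic<size_t>& history_size,
                             std::atomic<bool>& killed,
                             size_t limit, std::mutex& global_lock)
{
  if (history_size.load() <= limit)
    return;
  global_lock.unlock();                 // never sleep under the hot lock
  while (history_size.load() > limit && !killed.load())
    std::this_thread::sleep_for(std::chrono::milliseconds(100));
  global_lock.lock();                   // restore the caller's invariant
}
```
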
*/ +static TYPELIB innodb_default_row_format_typelib = { + array_elements(innodb_default_row_format_names) - 1, + "innodb_default_row_format_typelib", + innodb_default_row_format_names, + NULL +}; + +/** Names of allowed values of innodb_flush_method */ +const char* innodb_flush_method_names[] = { + "fsync", + "O_DSYNC", + "littlesync", + "nosync", + "O_DIRECT", + "O_DIRECT_NO_FSYNC", +#ifdef _WIN32 + "unbuffered", + "async_unbuffered" /* alias for "unbuffered" */, + "normal" /* alias for "fsync" */, +#endif + NullS +}; + +/** Enumeration of innodb_flush_method */ +TYPELIB innodb_flush_method_typelib = { + array_elements(innodb_flush_method_names) - 1, + "innodb_flush_method_typelib", + innodb_flush_method_names, + NULL +}; + +/** Names of allowed values of innodb_deadlock_report */ +static const char *innodb_deadlock_report_names[]= { + "off", /* Do not report any details of deadlocks */ + "basic", /* Report waiting transactions and lock requests */ + "full", /* Also report blocking locks */ + NullS +}; + +static_assert(Deadlock::REPORT_OFF == 0, "compatibility"); +static_assert(Deadlock::REPORT_BASIC == 1, "compatibility"); +static_assert(Deadlock::REPORT_FULL == 2, "compatibility"); + +/** Enumeration of innodb_deadlock_report */ +static TYPELIB innodb_deadlock_report_typelib = { + array_elements(innodb_deadlock_report_names) - 1, + "innodb_deadlock_report_typelib", + innodb_deadlock_report_names, + NULL +}; + +/** Allowed values of innodb_change_buffering */ +static const char* innodb_change_buffering_names[] = { + "none", /* IBUF_USE_NONE */ + "inserts", /* IBUF_USE_INSERT */ + "deletes", /* IBUF_USE_DELETE_MARK */ + "changes", /* IBUF_USE_INSERT_DELETE_MARK */ + "purges", /* IBUF_USE_DELETE */ + "all", /* IBUF_USE_ALL */ + NullS +}; + +/** Enumeration of innodb_change_buffering */ +static TYPELIB innodb_change_buffering_typelib = { + array_elements(innodb_change_buffering_names) - 1, + "innodb_change_buffering_typelib", + innodb_change_buffering_names, + NULL +}; + +/** Allowed values of innodb_instant_alter_column_allowed */ +const char* innodb_instant_alter_column_allowed_names[] = { + "never", /* compatible with MariaDB 5.5 to 10.2 */ + "add_last",/* allow instant ADD COLUMN ... LAST */ + "add_drop_reorder", /* allow instant ADD anywhere & DROP & reorder */ + NullS +}; + +/** Enumeration of innodb_instant_alter_column_allowed */ +static TYPELIB innodb_instant_alter_column_allowed_typelib = { + array_elements(innodb_instant_alter_column_allowed_names) - 1, + "innodb_instant_alter_column_allowed_typelib", + innodb_instant_alter_column_allowed_names, + NULL +}; + +/** Retrieve the FTS Relevance Ranking result for doc with doc_id +of m_prebuilt->fts_doc_id +@param[in,out] fts_hdl FTS handler +@return the relevance ranking value */ +static +float +innobase_fts_retrieve_ranking( + FT_INFO* fts_hdl); +/** Free the memory for the FTS handler +@param[in,out] fts_hdl FTS handler */ +static +void +innobase_fts_close_ranking( + FT_INFO* fts_hdl); +/** Find and Retrieve the FTS Relevance Ranking result for doc with doc_id +of m_prebuilt->fts_doc_id +@param[in,out] fts_hdl FTS handler +@return the relevance ranking value */ +static +float +innobase_fts_find_ranking( + FT_INFO* fts_hdl, + uchar*, + uint); + +/* Call back function array defined by MySQL and used to +retrieve FTS results. 
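
All of these typelibs follow the same convention: the variable's numeric value is an index into a NullS-terminated name array, and array_elements(names) - 1 is the option count. A self-contained sketch of that mapping, reusing the innodb_change_buffering option names (the enum tags here are illustrative, not the real IBUF_USE_* constants):

```cpp
#include <cstring>

enum class ChangeBuffering { NONE, INSERTS, DELETES, CHANGES, PURGES, ALL };

static const char* change_buffering_names[] = {
  "none", "inserts", "deletes", "changes", "purges", "all", nullptr };

inline const char* change_buffering_name(ChangeBuffering v)
{ return change_buffering_names[static_cast<size_t>(v)]; }

// Parse a name back to its enum value; returns false on an unknown name.
inline bool change_buffering_parse(const char* s, ChangeBuffering& out)
{
  for (size_t i = 0; change_buffering_names[i]; i++)
    if (!std::strcmp(s, change_buffering_names[i])) {
      out = static_cast<ChangeBuffering>(i);
      return true;
    }
  return false;
}
```
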
*/ +const struct _ft_vft ft_vft_result = {NULL, + innobase_fts_find_ranking, + innobase_fts_close_ranking, + innobase_fts_retrieve_ranking, + NULL}; + +/** @return version of the extended FTS API */ +static +uint +innobase_fts_get_version() +{ + /* Currently this doesn't make much sense as returning + HA_CAN_FULLTEXT_EXT automatically mean this version is supported. + This supposed to ease future extensions. */ + return(2); +} + +/** @return Which part of the extended FTS API is supported */ +static +ulonglong +innobase_fts_flags() +{ + return(FTS_ORDERED_RESULT | FTS_DOCID_IN_RESULT); +} + +/** Find and Retrieve the FTS doc_id for the current result row +@param[in,out] fts_hdl FTS handler +@return the document ID */ +static +ulonglong +innobase_fts_retrieve_docid( + FT_INFO_EXT* fts_hdl); + +/** Find and retrieve the size of the current result +@param[in,out] fts_hdl FTS handler +@return number of matching rows */ +static +ulonglong +innobase_fts_count_matches( + FT_INFO_EXT* fts_hdl) /*!< in: FTS handler */ +{ + NEW_FT_INFO* handle = reinterpret_cast(fts_hdl); + + if (handle->ft_result->rankings_by_id != NULL) { + return(rbt_size(handle->ft_result->rankings_by_id)); + } else { + return(0); + } +} + +const struct _ft_vft_ext ft_vft_ext_result = {innobase_fts_get_version, + innobase_fts_flags, + innobase_fts_retrieve_docid, + innobase_fts_count_matches}; + +#ifdef HAVE_PSI_INTERFACE +# define PSI_KEY(n) {&n##_key, #n, 0} +/* Keys to register pthread mutexes in the current file with +performance schema */ +static mysql_pfs_key_t pending_checkpoint_mutex_key; + +# ifdef UNIV_PFS_MUTEX +mysql_pfs_key_t buf_pool_mutex_key; +mysql_pfs_key_t dict_foreign_err_mutex_key; +mysql_pfs_key_t fil_system_mutex_key; +mysql_pfs_key_t flush_list_mutex_key; +mysql_pfs_key_t fts_cache_mutex_key; +mysql_pfs_key_t fts_cache_init_mutex_key; +mysql_pfs_key_t fts_delete_mutex_key; +mysql_pfs_key_t fts_doc_id_mutex_key; +mysql_pfs_key_t ibuf_bitmap_mutex_key; +mysql_pfs_key_t ibuf_mutex_key; +mysql_pfs_key_t ibuf_pessimistic_insert_mutex_key; +mysql_pfs_key_t recalc_pool_mutex_key; +mysql_pfs_key_t purge_sys_pq_mutex_key; +mysql_pfs_key_t recv_sys_mutex_key; +mysql_pfs_key_t page_zip_stat_per_index_mutex_key; +mysql_pfs_key_t rtr_active_mutex_key; +mysql_pfs_key_t rtr_match_mutex_key; +mysql_pfs_key_t rtr_path_mutex_key; +mysql_pfs_key_t srv_innodb_monitor_mutex_key; +mysql_pfs_key_t srv_misc_tmpfile_mutex_key; +mysql_pfs_key_t srv_monitor_file_mutex_key; +mysql_pfs_key_t buf_dblwr_mutex_key; +mysql_pfs_key_t trx_pool_mutex_key; +mysql_pfs_key_t trx_pool_manager_mutex_key; +mysql_pfs_key_t lock_wait_mutex_key; +mysql_pfs_key_t trx_sys_mutex_key; +mysql_pfs_key_t srv_threads_mutex_key; +mysql_pfs_key_t tpool_cache_mutex_key; + +/* all_innodb_mutexes array contains mutexes that are +performance schema instrumented if "UNIV_PFS_MUTEX" +is defined */ +static PSI_mutex_info all_innodb_mutexes[] = { + PSI_KEY(pending_checkpoint_mutex), + PSI_KEY(buf_pool_mutex), + PSI_KEY(dict_foreign_err_mutex), + PSI_KEY(recalc_pool_mutex), + PSI_KEY(fil_system_mutex), + PSI_KEY(flush_list_mutex), + PSI_KEY(fts_cache_mutex), + PSI_KEY(fts_cache_init_mutex), + PSI_KEY(fts_delete_mutex), + PSI_KEY(fts_doc_id_mutex), + PSI_KEY(ibuf_mutex), + PSI_KEY(ibuf_pessimistic_insert_mutex), + PSI_KEY(index_online_log), + PSI_KEY(page_zip_stat_per_index_mutex), + PSI_KEY(purge_sys_pq_mutex), + PSI_KEY(recv_sys_mutex), + PSI_KEY(srv_innodb_monitor_mutex), + PSI_KEY(srv_misc_tmpfile_mutex), + PSI_KEY(srv_monitor_file_mutex), + 
PSI_KEY(buf_dblwr_mutex), + PSI_KEY(trx_pool_mutex), + PSI_KEY(trx_pool_manager_mutex), + PSI_KEY(lock_wait_mutex), + PSI_KEY(srv_threads_mutex), + PSI_KEY(rtr_active_mutex), + PSI_KEY(rtr_match_mutex), + PSI_KEY(rtr_path_mutex), + PSI_KEY(trx_sys_mutex), + PSI_KEY(tpool_cache_mutex), +}; +# endif /* UNIV_PFS_MUTEX */ + +# ifdef UNIV_PFS_RWLOCK +mysql_pfs_key_t dict_operation_lock_key; +mysql_pfs_key_t index_tree_rw_lock_key; +mysql_pfs_key_t index_online_log_key; +mysql_pfs_key_t fil_space_latch_key; +mysql_pfs_key_t trx_i_s_cache_lock_key; +mysql_pfs_key_t trx_purge_latch_key; +mysql_pfs_key_t trx_rseg_latch_key; +mysql_pfs_key_t lock_latch_key; +mysql_pfs_key_t log_latch_key; + +/* all_innodb_rwlocks array contains rwlocks that are +performance schema instrumented if "UNIV_PFS_RWLOCK" +is defined */ +static PSI_rwlock_info all_innodb_rwlocks[] = +{ +# ifdef BTR_CUR_HASH_ADAPT + { &btr_search_latch_key, "btr_search_latch", 0 }, +# endif + { &dict_operation_lock_key, "dict_operation_lock", 0 }, + { &fil_space_latch_key, "fil_space_latch", 0 }, + { &trx_i_s_cache_lock_key, "trx_i_s_cache_lock", 0 }, + { &trx_purge_latch_key, "trx_purge_latch", 0 }, + { &trx_rseg_latch_key, "trx_rseg_latch", 0 }, + { &lock_latch_key, "lock_latch", 0 }, + { &log_latch_key, "log_latch", 0 }, + { &index_tree_rw_lock_key, "index_tree_rw_lock", PSI_RWLOCK_FLAG_SX } +}; +# endif /* UNIV_PFS_RWLOCK */ + +# ifdef UNIV_PFS_THREAD +/* all_innodb_threads array contains threads that are +performance schema instrumented if "UNIV_PFS_THREAD" +is defined */ +static PSI_thread_info all_innodb_threads[] = { + PSI_KEY(page_cleaner_thread), + PSI_KEY(trx_rollback_clean_thread), + PSI_KEY(thread_pool_thread) +}; +# endif /* UNIV_PFS_THREAD */ + +# ifdef UNIV_PFS_IO +/* all_innodb_files array contains the type of files that are +performance schema instrumented if "UNIV_PFS_IO" is defined */ +static PSI_file_info all_innodb_files[] = { + PSI_KEY(innodb_data_file), + PSI_KEY(innodb_temp_file) +}; +# endif /* UNIV_PFS_IO */ +#endif /* HAVE_PSI_INTERFACE */ + +static void innodb_remember_check_sysvar_funcs(); +mysql_var_check_func check_sysvar_enum; +mysql_var_check_func check_sysvar_int; + +// should page compression be used by default for new tables +static MYSQL_THDVAR_BOOL(compression_default, PLUGIN_VAR_OPCMDARG, + "Is compression the default for new tables", + NULL, NULL, FALSE); + +/** Update callback for SET [SESSION] innodb_default_encryption_key_id */ +static void +innodb_default_encryption_key_id_update(THD* thd, st_mysql_sys_var* var, + void* var_ptr, const void *save) +{ + uint key_id = *static_cast(save); + if (key_id != FIL_DEFAULT_ENCRYPTION_KEY + && !encryption_key_id_exists(key_id)) { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "innodb_default_encryption_key=%u" + " is not available", key_id); + } + *static_cast(var_ptr) = key_id; +} + +static MYSQL_THDVAR_UINT(default_encryption_key_id, PLUGIN_VAR_RQCMDARG, + "Default encryption key id used for table encryption.", + NULL, innodb_default_encryption_key_id_update, + FIL_DEFAULT_ENCRYPTION_KEY, 1, UINT_MAX32, 0); + +/** + Structure for CREATE TABLE options (table options). + It needs to be called ha_table_option_struct. + + The option values can be specified in the CREATE TABLE at the end: + CREATE TABLE ( ... 
) *here* +*/ + +ha_create_table_option innodb_table_option_list[]= +{ + /* With this option user can enable page compression feature for the + table */ + HA_TOPTION_SYSVAR("PAGE_COMPRESSED", page_compressed, compression_default), + /* With this option user can set zip compression level for page + compression for this table*/ + HA_TOPTION_NUMBER("PAGE_COMPRESSION_LEVEL", page_compression_level, 0, 1, 9, 1), + /* With this option the user can enable encryption for the table */ + HA_TOPTION_ENUM("ENCRYPTED", encryption, "DEFAULT,YES,NO", 0), + /* With this option the user defines the key identifier using for the encryption */ + HA_TOPTION_SYSVAR("ENCRYPTION_KEY_ID", encryption_key_id, default_encryption_key_id), + + HA_TOPTION_END +}; + +/*************************************************************//** +Check whether valid argument given to innodb_ft_*_stopword_table. +This function is registered as a callback with MySQL. +@return 0 for valid stopword table */ +static +int +innodb_stopword_table_validate( +/*===========================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to system + variable */ + void* save, /*!< out: immediate result + for update function */ + struct st_mysql_value* value); /*!< in: incoming string */ + +static +void innodb_ft_cache_size_update(THD*, st_mysql_sys_var*, void*, const void* save) +{ + fts_max_cache_size= *static_cast(save); +} + +static +void innodb_ft_total_cache_size_update(THD*, st_mysql_sys_var*, void*, const void* save) +{ + fts_max_total_cache_size= *static_cast(save); +} + +static bool is_mysql_datadir_path(const char *path); + +/** Validate passed-in "value" is a valid directory name. +This function is registered as a callback with MySQL. +@param[in,out] thd thread handle +@param[in] var pointer to system variable +@param[out] save immediate result for update +@param[in] value incoming string +@return 0 for valid name */ +static +int +innodb_tmpdir_validate( + THD* thd, + struct st_mysql_sys_var*, + void* save, + struct st_mysql_value* value) +{ + + char* alter_tmp_dir; + char* innodb_tmp_dir; + char buff[OS_FILE_MAX_PATH]; + int len = sizeof(buff); + char tmp_abs_path[FN_REFLEN + 2]; + + ut_ad(save != NULL); + ut_ad(value != NULL); + + if (check_global_access(thd, FILE_ACL)) { + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "InnoDB: FILE Permissions required"); + *static_cast(save) = NULL; + return(1); + } + + alter_tmp_dir = (char*) value->val_str(value, buff, &len); + + if (!alter_tmp_dir) { + *static_cast(save) = alter_tmp_dir; + return(0); + } + + if (strlen(alter_tmp_dir) > FN_REFLEN) { + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "Path length should not exceed %d bytes", FN_REFLEN); + *static_cast(save) = NULL; + return(1); + } + + my_realpath(tmp_abs_path, alter_tmp_dir, 0); + size_t tmp_abs_len = strlen(tmp_abs_path); + + if (my_access(tmp_abs_path, F_OK)) { + + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "InnoDB: Path doesn't exist."); + *static_cast(save) = NULL; + return(1); + } else if (my_access(tmp_abs_path, R_OK | W_OK)) { + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "InnoDB: Server doesn't have permission in " + "the given location."); + *static_cast(save) = NULL; + return(1); + } + + MY_STAT stat_info_dir; + + if (my_stat(tmp_abs_path, &stat_info_dir, MYF(0))) { + if ((stat_info_dir.st_mode & S_IFDIR) != S_IFDIR) { + + 
push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "Given path is not a directory. "); + *static_cast(save) = NULL; + return(1); + } + } + + if (!is_mysql_datadir_path(tmp_abs_path)) { + + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "InnoDB: Path Location should not be same as " + "mysql data directory location."); + *static_cast(save) = NULL; + return(1); + } + + innodb_tmp_dir = static_cast( + thd_memdup(thd, tmp_abs_path, tmp_abs_len + 1)); + *static_cast(save) = innodb_tmp_dir; + return(0); +} + +/******************************************************************//** +Maps a MySQL trx isolation level code to the InnoDB isolation level code +@return InnoDB isolation level */ +static inline +uint +innobase_map_isolation_level( +/*=========================*/ + enum_tx_isolation iso); /*!< in: MySQL isolation level code */ + +/** Gets field offset for a field in a table. +@param[in] table MySQL table object +@param[in] field MySQL field object (from table->field array) +@return offset */ +static inline +uint +get_field_offset( + const TABLE* table, + const Field* field) +{ + return field->offset(table->record[0]); +} + + +/*************************************************************//** +Check for a valid value of innobase_compression_algorithm. +@return 0 for valid innodb_compression_algorithm. */ +static +int +innodb_compression_algorithm_validate( +/*==================================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to system + variable */ + void* save, /*!< out: immediate result + for update function */ + struct st_mysql_value* value); /*!< in: incoming string */ + +static ibool innodb_have_punch_hole=IF_PUNCH_HOLE(1, 0); + +static +int +innodb_encrypt_tables_validate( +/*==================================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to system + variable */ + void* save, /*!< out: immediate result + for update function */ + struct st_mysql_value* value); /*!< in: incoming string */ + +static const char innobase_hton_name[]= "InnoDB"; + +static MYSQL_THDVAR_BOOL(table_locks, PLUGIN_VAR_OPCMDARG, + "Enable InnoDB locking in LOCK TABLES", + /* check_func */ NULL, /* update_func */ NULL, + /* default */ TRUE); + +static MYSQL_THDVAR_BOOL(strict_mode, PLUGIN_VAR_OPCMDARG, + "Use strict mode when evaluating create options.", + NULL, NULL, TRUE); + +static MYSQL_THDVAR_BOOL(ft_enable_stopword, PLUGIN_VAR_OPCMDARG, + "Create FTS index with stopword.", + NULL, NULL, + /* default */ TRUE); + +static MYSQL_THDVAR_UINT(lock_wait_timeout, PLUGIN_VAR_RQCMDARG, + "Timeout in seconds an InnoDB transaction may wait for a lock before being rolled back. 
The value 100000000 is infinite timeout.", + NULL, NULL, 50, 0, 100000000, 0); + +static MYSQL_THDVAR_STR(ft_user_stopword_table, + PLUGIN_VAR_OPCMDARG|PLUGIN_VAR_MEMALLOC, + "User supplied stopword table name, effective in the session level.", + innodb_stopword_table_validate, NULL, NULL); + +static MYSQL_THDVAR_STR(tmpdir, + PLUGIN_VAR_OPCMDARG|PLUGIN_VAR_MEMALLOC, + "Directory for temporary non-tablespace files.", + innodb_tmpdir_validate, NULL, NULL); + +static size_t truncated_status_writes; + +static SHOW_VAR innodb_status_variables[]= { +#ifdef BTR_CUR_HASH_ADAPT + {"adaptive_hash_hash_searches", &export_vars.innodb_ahi_hit, SHOW_SIZE_T}, + {"adaptive_hash_non_hash_searches", + &export_vars.innodb_ahi_miss, SHOW_SIZE_T}, +#endif + {"background_log_sync", &srv_log_writes_and_flush, SHOW_SIZE_T}, + {"buffer_pool_dump_status", + (char*) &export_vars.innodb_buffer_pool_dump_status, SHOW_CHAR}, + {"buffer_pool_load_status", + (char*) &export_vars.innodb_buffer_pool_load_status, SHOW_CHAR}, + {"buffer_pool_resize_status", + (char*) &export_vars.innodb_buffer_pool_resize_status, SHOW_CHAR}, + {"buffer_pool_load_incomplete", + &export_vars.innodb_buffer_pool_load_incomplete, SHOW_BOOL}, + {"buffer_pool_pages_data", &UT_LIST_GET_LEN(buf_pool.LRU), SHOW_SIZE_T}, + {"buffer_pool_bytes_data", + &export_vars.innodb_buffer_pool_bytes_data, SHOW_SIZE_T}, + {"buffer_pool_pages_dirty", + &UT_LIST_GET_LEN(buf_pool.flush_list), SHOW_SIZE_T}, + {"buffer_pool_bytes_dirty", &buf_pool.flush_list_bytes, SHOW_SIZE_T}, + {"buffer_pool_pages_flushed", &buf_pool.stat.n_pages_written, SHOW_SIZE_T}, + {"buffer_pool_pages_free", &UT_LIST_GET_LEN(buf_pool.free), SHOW_SIZE_T}, +#ifdef UNIV_DEBUG + {"buffer_pool_pages_latched", + &export_vars.innodb_buffer_pool_pages_latched, SHOW_SIZE_T}, +#endif /* UNIV_DEBUG */ + {"buffer_pool_pages_made_not_young", + &buf_pool.stat.n_pages_not_made_young, SHOW_SIZE_T}, + {"buffer_pool_pages_made_young", + &buf_pool.stat.n_pages_made_young, SHOW_SIZE_T}, + {"buffer_pool_pages_misc", + &export_vars.innodb_buffer_pool_pages_misc, SHOW_SIZE_T}, + {"buffer_pool_pages_old", &buf_pool.LRU_old_len, SHOW_SIZE_T}, + {"buffer_pool_pages_total", + &export_vars.innodb_buffer_pool_pages_total, SHOW_SIZE_T}, + {"buffer_pool_pages_LRU_flushed", &buf_lru_flush_page_count, SHOW_SIZE_T}, + {"buffer_pool_pages_LRU_freed", &buf_lru_freed_page_count, SHOW_SIZE_T}, + {"buffer_pool_pages_split", &buf_pool.pages_split, SHOW_SIZE_T}, + {"buffer_pool_read_ahead_rnd", + &buf_pool.stat.n_ra_pages_read_rnd, SHOW_SIZE_T}, + {"buffer_pool_read_ahead", &buf_pool.stat.n_ra_pages_read, SHOW_SIZE_T}, + {"buffer_pool_read_ahead_evicted", + &buf_pool.stat.n_ra_pages_evicted, SHOW_SIZE_T}, + {"buffer_pool_read_requests", + &export_vars.innodb_buffer_pool_read_requests, SHOW_SIZE_T}, + {"buffer_pool_reads", &buf_pool.stat.n_pages_read, SHOW_SIZE_T}, + {"buffer_pool_wait_free", &buf_pool.stat.LRU_waits, SHOW_SIZE_T}, + {"buffer_pool_write_requests", &buf_pool.flush_list_requests, SHOW_SIZE_T}, + {"checkpoint_age", &export_vars.innodb_checkpoint_age, SHOW_SIZE_T}, + {"checkpoint_max_age", &export_vars.innodb_checkpoint_max_age, SHOW_SIZE_T}, + {"data_fsyncs", (size_t*) &os_n_fsyncs, SHOW_SIZE_T}, + {"data_pending_fsyncs", + (size_t*) &fil_n_pending_tablespace_flushes, SHOW_SIZE_T}, + {"data_pending_reads", &export_vars.innodb_data_pending_reads, SHOW_SIZE_T}, + {"data_pending_writes", &export_vars.innodb_data_pending_writes,SHOW_SIZE_T}, + {"data_read", &export_vars.innodb_data_read, SHOW_SIZE_T}, + {"data_reads", 
&export_vars.innodb_data_reads, SHOW_SIZE_T}, + {"data_writes", &export_vars.innodb_data_writes, SHOW_SIZE_T}, + {"data_written", &export_vars.innodb_data_written, SHOW_SIZE_T}, + {"dblwr_pages_written", &export_vars.innodb_dblwr_pages_written,SHOW_SIZE_T}, + {"dblwr_writes", &export_vars.innodb_dblwr_writes, SHOW_SIZE_T}, + {"deadlocks", &lock_sys.deadlocks, SHOW_SIZE_T}, + {"history_list_length", &export_vars.innodb_history_list_length,SHOW_SIZE_T}, + {"ibuf_discarded_delete_marks", &ibuf.n_discarded_ops[IBUF_OP_DELETE_MARK], + SHOW_SIZE_T}, + {"ibuf_discarded_deletes", &ibuf.n_discarded_ops[IBUF_OP_DELETE], + SHOW_SIZE_T}, + {"ibuf_discarded_inserts", &ibuf.n_discarded_ops[IBUF_OP_INSERT], + SHOW_SIZE_T}, + {"ibuf_free_list", &ibuf.free_list_len, SHOW_SIZE_T}, + {"ibuf_merged_delete_marks", &ibuf.n_merged_ops[IBUF_OP_DELETE_MARK], + SHOW_SIZE_T}, + {"ibuf_merged_deletes", &ibuf.n_merged_ops[IBUF_OP_DELETE], SHOW_SIZE_T}, + {"ibuf_merged_inserts", &ibuf.n_merged_ops[IBUF_OP_INSERT], SHOW_SIZE_T}, + {"ibuf_merges", &ibuf.n_merges, SHOW_SIZE_T}, + {"ibuf_segment_size", &ibuf.seg_size, SHOW_SIZE_T}, + {"ibuf_size", &ibuf.size, SHOW_SIZE_T}, + {"log_waits", &log_sys.waits, SHOW_SIZE_T}, + {"log_write_requests", &log_sys.write_to_buf, SHOW_SIZE_T}, + {"log_writes", &log_sys.write_to_log, SHOW_SIZE_T}, + {"lsn_current", &export_vars.innodb_lsn_current, SHOW_ULONGLONG}, + {"lsn_flushed", &export_vars.innodb_lsn_flushed, SHOW_ULONGLONG}, + {"lsn_last_checkpoint", &export_vars.innodb_lsn_last_checkpoint, + SHOW_ULONGLONG}, + {"master_thread_active_loops", &srv_main_active_loops, SHOW_SIZE_T}, + {"master_thread_idle_loops", &srv_main_idle_loops, SHOW_SIZE_T}, + {"max_trx_id", &export_vars.innodb_max_trx_id, SHOW_ULONGLONG}, +#ifdef BTR_CUR_HASH_ADAPT + {"mem_adaptive_hash", &export_vars.innodb_mem_adaptive_hash, SHOW_SIZE_T}, +#endif + {"mem_dictionary", &export_vars.innodb_mem_dictionary, SHOW_SIZE_T}, + {"os_log_written", &export_vars.innodb_os_log_written, SHOW_SIZE_T}, + {"page_size", &srv_page_size, SHOW_ULONG}, + {"pages_created", &buf_pool.stat.n_pages_created, SHOW_SIZE_T}, + {"pages_read", &buf_pool.stat.n_pages_read, SHOW_SIZE_T}, + {"pages_written", &buf_pool.stat.n_pages_written, SHOW_SIZE_T}, + {"row_lock_current_waits", &export_vars.innodb_row_lock_current_waits, + SHOW_SIZE_T}, + {"row_lock_time", &export_vars.innodb_row_lock_time, SHOW_LONGLONG}, + {"row_lock_time_avg", &export_vars.innodb_row_lock_time_avg, SHOW_ULONGLONG}, + {"row_lock_time_max", &export_vars.innodb_row_lock_time_max, SHOW_ULONGLONG}, + {"row_lock_waits", &export_vars.innodb_row_lock_waits, SHOW_SIZE_T}, + {"num_open_files", &fil_system.n_open, SHOW_SIZE_T}, + {"truncated_status_writes", &truncated_status_writes, SHOW_SIZE_T}, + {"available_undo_logs", &srv_available_undo_logs, SHOW_ULONG}, + {"undo_truncations", &export_vars.innodb_undo_truncations, SHOW_ULONG}, + + /* Status variables for page compression */ + {"page_compression_saved", + &export_vars.innodb_page_compression_saved, SHOW_LONGLONG}, + {"num_pages_page_compressed", + &export_vars.innodb_pages_page_compressed, SHOW_LONGLONG}, + {"num_page_compressed_trim_op", + &export_vars.innodb_page_compressed_trim_op, SHOW_LONGLONG}, + {"num_pages_page_decompressed", + &export_vars.innodb_pages_page_decompressed, SHOW_LONGLONG}, + {"num_pages_page_compression_error", + &export_vars.innodb_pages_page_compression_error, SHOW_LONGLONG}, + {"num_pages_encrypted", + &export_vars.innodb_pages_encrypted, SHOW_LONGLONG}, + {"num_pages_decrypted", + 
&export_vars.innodb_pages_decrypted, SHOW_LONGLONG}, + {"have_lz4", &(provider_service_lz4->is_loaded), SHOW_BOOL}, + {"have_lzo", &(provider_service_lzo->is_loaded), SHOW_BOOL}, + {"have_lzma", &(provider_service_lzma->is_loaded), SHOW_BOOL}, + {"have_bzip2", &(provider_service_bzip2->is_loaded), SHOW_BOOL}, + {"have_snappy", &(provider_service_snappy->is_loaded), SHOW_BOOL}, + {"have_punch_hole", &innodb_have_punch_hole, SHOW_BOOL}, + + /* Defragmentation */ + {"defragment_compression_failures", + &export_vars.innodb_defragment_compression_failures, SHOW_SIZE_T}, + {"defragment_failures", &export_vars.innodb_defragment_failures,SHOW_SIZE_T}, + {"defragment_count", &export_vars.innodb_defragment_count, SHOW_SIZE_T}, + + {"instant_alter_column", + &export_vars.innodb_instant_alter_column, SHOW_ULONG}, + + /* Online alter table status variables */ + {"onlineddl_rowlog_rows", + &export_vars.innodb_onlineddl_rowlog_rows, SHOW_SIZE_T}, + {"onlineddl_rowlog_pct_used", + &export_vars.innodb_onlineddl_rowlog_pct_used, SHOW_SIZE_T}, + {"onlineddl_pct_progress", + &export_vars.innodb_onlineddl_pct_progress, SHOW_SIZE_T}, + + /* Encryption */ + {"encryption_rotation_pages_read_from_cache", + &export_vars.innodb_encryption_rotation_pages_read_from_cache, SHOW_SIZE_T}, + {"encryption_rotation_pages_read_from_disk", + &export_vars.innodb_encryption_rotation_pages_read_from_disk, SHOW_SIZE_T}, + {"encryption_rotation_pages_modified", + &export_vars.innodb_encryption_rotation_pages_modified, SHOW_SIZE_T}, + {"encryption_rotation_pages_flushed", + &export_vars.innodb_encryption_rotation_pages_flushed, SHOW_SIZE_T}, + {"encryption_rotation_estimated_iops", + &export_vars.innodb_encryption_rotation_estimated_iops, SHOW_SIZE_T}, + {"encryption_n_merge_blocks_encrypted", + &export_vars.innodb_n_merge_blocks_encrypted, SHOW_LONGLONG}, + {"encryption_n_merge_blocks_decrypted", + &export_vars.innodb_n_merge_blocks_decrypted, SHOW_LONGLONG}, + {"encryption_n_rowlog_blocks_encrypted", + &export_vars.innodb_n_rowlog_blocks_encrypted, SHOW_LONGLONG}, + {"encryption_n_rowlog_blocks_decrypted", + &export_vars.innodb_n_rowlog_blocks_decrypted, SHOW_LONGLONG}, + {"encryption_n_temp_blocks_encrypted", + &export_vars.innodb_n_temp_blocks_encrypted, SHOW_LONGLONG}, + {"encryption_n_temp_blocks_decrypted", + &export_vars.innodb_n_temp_blocks_decrypted, SHOW_LONGLONG}, + {"encryption_num_key_requests", &export_vars.innodb_encryption_key_requests, + SHOW_LONGLONG}, + + {NullS, NullS, SHOW_LONG} +}; + +/*****************************************************************//** +Frees a possible InnoDB trx object associated with the current THD. +@return 0 or error number */ +static +int +innobase_close_connection( +/*======================*/ + handlerton* hton, /*!< in/out: InnoDB handlerton */ + THD* thd); /*!< in: MySQL thread handle for + which to close the connection */ + +/** Cancel any pending lock request associated with the current THD. +@sa THD::awake() @sa ha_kill_query() */ +static void innobase_kill_query(handlerton*, THD* thd, enum thd_kill_levels); +static void innobase_commit_ordered(handlerton *hton, THD* thd, bool all); + +/*****************************************************************//** +Commits a transaction in an InnoDB database or marks an SQL statement +ended. 
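
The commit callback documented just below is invoked both at the end of each SQL statement and at real transaction commit; per its parameter comment, the commit_trx flag (together with the session's autocommit state) tells the two cases apart. A rough sketch of that dispatch, with a stand-in Trx type and simplified autocommit test, not InnoDB's actual logic:

```cpp
struct Trx { bool active = false; };

inline void engine_commit(Trx& trx, bool commit_trx, bool autocommit_active)
{
  if (commit_trx || autocommit_active) {
    // real commit: make the whole transaction durable and end it
    trx.active = false;
  } else {
    // statement boundary only: mark the statement as ended,
    // keep the transaction open
  }
}
```
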
+@return 0 */ +static +int +innobase_commit( +/*============*/ + handlerton* hton, /*!< in/out: InnoDB handlerton */ + THD* thd, /*!< in: MySQL thread handle of the + user for whom the transaction should + be committed */ + bool commit_trx); /*!< in: true - commit transaction + false - the current SQL statement + ended */ + +/*****************************************************************//** +Rolls back a transaction to a savepoint. +@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the +given name */ +static +int +innobase_rollback( +/*==============*/ + handlerton* hton, /*!< in/out: InnoDB handlerton */ + THD* thd, /*!< in: handle to the MySQL thread + of the user whose transaction should + be rolled back */ + bool rollback_trx); /*!< in: TRUE - rollback entire + transaction FALSE - rollback the current + statement only */ + +/*****************************************************************//** +Rolls back a transaction to a savepoint. +@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the +given name */ +static +int +innobase_rollback_to_savepoint( +/*===========================*/ + handlerton* hton, /*!< in/out: InnoDB handlerton */ + THD* thd, /*!< in: handle to the MySQL thread of + the user whose XA transaction should + be rolled back to savepoint */ + void* savepoint); /*!< in: savepoint data */ + +/*****************************************************************//** +Check whether innodb state allows to safely release MDL locks after +rollback to savepoint. +@return true if it is safe, false if its not safe. */ +static +bool +innobase_rollback_to_savepoint_can_release_mdl( +/*===========================================*/ + handlerton* hton, /*!< in/out: InnoDB handlerton */ + THD* thd); /*!< in: handle to the MySQL thread of + the user whose XA transaction should + be rolled back to savepoint */ + +/*****************************************************************//** +Sets a transaction savepoint. +@return always 0, that is, always succeeds */ +static +int +innobase_savepoint( +/*===============*/ + handlerton* hton, /*!< in/out: InnoDB handlerton */ + THD* thd, /*!< in: handle to the MySQL thread of + the user's XA transaction for which + we need to take a savepoint */ + void* savepoint); /*!< in: savepoint data */ + +/*****************************************************************//** +Release transaction savepoint name. +@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the +given name */ +static +int +innobase_release_savepoint( +/*=======================*/ + handlerton* hton, /*!< in/out: handlerton for InnoDB */ + THD* thd, /*!< in: handle to the MySQL thread + of the user whose transaction's + savepoint should be released */ + void* savepoint); /*!< in: savepoint data */ + +/** Request notification of log writes */ +static void innodb_log_flush_request(void *cookie); + +/** Requests for log flushes */ +struct log_flush_request +{ + /** earlier request (for a smaller LSN) */ + log_flush_request *next; + /** parameter provided to innodb_log_flush_request() */ + void *cookie; + /** log sequence number that is being waited for */ + lsn_t lsn; +}; + +/** Buffer of pending innodb_log_flush_request() */ +alignas(CPU_LEVEL1_DCACHE_LINESIZE) static +struct +{ + /** first request */ + std::atomic start; + /** last request */ + log_flush_request *end; + /** mutex protecting this object */ + mysql_mutex_t mutex; +} +log_requests; + +/** @brief Adjust some InnoDB startup parameters based on file contents +or innodb_page_size. 
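
log_requests above is a mutex-protected list of (cookie, lsn) waiters that gets drained once the redo log becomes durable up to a given LSN. A simplified, self-contained version of that bookkeeping (the real structure additionally keeps requests ordered by LSN with an atomic head pointer; FlushRequest and FlushQueue are stand-in names):

```cpp
#include <cstdint>
#include <mutex>

using lsn_t = uint64_t;

struct FlushRequest { FlushRequest* next; void* cookie; lsn_t lsn; };

struct FlushQueue {
  FlushRequest* head = nullptr;
  std::mutex mutex;

  void add(FlushRequest* req) {
    std::lock_guard<std::mutex> g(mutex);
    req->next = head;                   // newest first in this sketch
    head = req;
  }

  // Complete and unlink every request whose LSN is now durable.
  template <typename F>
  void complete_up_to(lsn_t flushed, F&& notify) {
    std::lock_guard<std::mutex> g(mutex);
    FlushRequest** pp = &head;
    while (*pp) {
      if ((*pp)->lsn <= flushed) {
        FlushRequest* done = *pp;
        *pp = done->next;               // unlink while iterating
        notify(done->cookie);
      } else {
        pp = &(*pp)->next;
      }
    }
  }
};
```
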
*/ +static +void +innodb_params_adjust(); + +/*******************************************************************//** +This function is used to prepare an X/Open XA distributed transaction. +@return 0 or error number */ +static +int +innobase_xa_prepare( +/*================*/ + handlerton* hton, /*!< in: InnoDB handlerton */ + THD* thd, /*!< in: handle to the MySQL thread of + the user whose XA transaction should + be prepared */ + bool all); /*!< in: true - prepare transaction + false - the current SQL statement + ended */ +/*******************************************************************//** +This function is used to recover X/Open XA distributed transactions. +@return number of prepared transactions stored in xid_list */ +static +int +innobase_xa_recover( +/*================*/ + handlerton* hton, /*!< in: InnoDB handlerton */ + XID* xid_list, /*!< in/out: prepared transactions */ + uint len); /*!< in: number of slots in xid_list */ +/*******************************************************************//** +This function is used to commit one X/Open XA distributed transaction +which is in the prepared state +@return 0 or error number */ +static +int +innobase_commit_by_xid( +/*===================*/ + handlerton* hton, /*!< in: InnoDB handlerton */ + XID* xid); /*!< in: X/Open XA transaction + identification */ + +/** Ignore FOREIGN KEY constraints that would be violated by DROP DATABASE */ +static ibool innodb_drop_database_ignore_fk(void*,void*) { return false; } + +/** FOREIGN KEY error reporting context for DROP DATABASE */ +struct innodb_drop_database_fk_report +{ + /** database name, with trailing '/' */ + const span name; + /** whether errors were found */ + bool violated; +}; + +/** Report FOREIGN KEY constraints that would be violated by DROP DATABASE +@return whether processing should continue */ +static ibool innodb_drop_database_fk(void *node, void *report) +{ + auto s= static_cast(node); + auto r= static_cast(report); + const dfield_t *name= que_node_get_val(s->select_list); + ut_ad(name->type.mtype == DATA_VARCHAR); + + if (name->len == UNIV_SQL_NULL || name->len <= r->name.size() || + memcmp(static_cast(name->data), r->name.data(), + r->name.size())) + return false; /* End of matches */ + + node= que_node_get_next(s->select_list); + const dfield_t *id= que_node_get_val(node); + ut_ad(id->type.mtype == DATA_VARCHAR); + ut_ad(!que_node_get_next(node)); + + if (id->len != UNIV_SQL_NULL) + sql_print_error("DROP DATABASE: table %.*s is referenced" + " by FOREIGN KEY %.*s", + static_cast(name->len), + static_cast(name->data), + static_cast(id->len), + static_cast(id->data)); + else + ut_ad("corrupted SYS_FOREIGN record" == 0); + + return true; +} + +/** After DROP DATABASE executed ha_innobase::delete_table() on all +tables that it was aware of, drop any leftover tables inside InnoDB. 
+@param path database path */ +static void innodb_drop_database(handlerton*, char *path) +{ + if (high_level_read_only) + return; + + ulint len= 0; + char *ptr; + + for (ptr= strend(path) - 2; ptr >= path && +#ifdef _WIN32 + *ptr != '\\' && +#endif + *ptr != '/'; ptr--) + len++; + + ptr++; + char *namebuf= static_cast + (my_malloc(PSI_INSTRUMENT_ME, len + 2, MYF(0))); + if (!namebuf) + return; + memcpy(namebuf, ptr, len); + namebuf[len] = '/'; + namebuf[len + 1] = '\0'; + +#ifdef _WIN32 + innobase_casedn_str(namebuf); +#endif /* _WIN32 */ + + THD * const thd= current_thd; + trx_t *trx= innobase_trx_allocate(thd); + dberr_t err= DB_SUCCESS; + + dict_sys.lock(SRW_LOCK_CALL); + + for (auto i= dict_sys.table_id_hash.n_cells; i--; ) + { + for (dict_table_t *next, *table= static_cast + (dict_sys.table_id_hash.array[i].node); table; table= next) + { + ut_ad(table->cached); + next= table->id_hash; + if (strncmp(table->name.m_name, namebuf, len + 1)) + continue; + const auto n_handles= table->get_ref_count(); + const bool locks= !n_handles && lock_table_has_locks(table); + if (n_handles || locks) + { + err= DB_ERROR; + ib::error errmsg; + errmsg << "DROP DATABASE: cannot DROP TABLE " << table->name; + if (n_handles) + errmsg << " due to " << n_handles << " open handles"; + else + errmsg << " due to locks"; + continue; + } + dict_sys.remove(table); + } + } + + dict_sys.unlock(); + + dict_table_t *table_stats, *index_stats; + MDL_ticket *mdl_table= nullptr, *mdl_index= nullptr; + table_stats= dict_table_open_on_name(TABLE_STATS_NAME, false, + DICT_ERR_IGNORE_NONE); + if (table_stats) + { + dict_sys.freeze(SRW_LOCK_CALL); + table_stats= dict_acquire_mdl_shared(table_stats, + thd, &mdl_table); + dict_sys.unfreeze(); + } + index_stats= dict_table_open_on_name(INDEX_STATS_NAME, false, + DICT_ERR_IGNORE_NONE); + if (index_stats) + { + dict_sys.freeze(SRW_LOCK_CALL); + index_stats= dict_acquire_mdl_shared(index_stats, + thd, &mdl_index); + dict_sys.unfreeze(); + } + + trx_start_for_ddl(trx); + + uint errors= 0; + char db[NAME_LEN + 1]; + strconvert(&my_charset_filename, namebuf, len, system_charset_info, db, + sizeof db, &errors); + if (!errors && table_stats && index_stats && + !strcmp(table_stats->name.m_name, TABLE_STATS_NAME) && + !strcmp(index_stats->name.m_name, INDEX_STATS_NAME) && + lock_table_for_trx(table_stats, trx, LOCK_X) == DB_SUCCESS && + lock_table_for_trx(index_stats, trx, LOCK_X) == DB_SUCCESS) + { + row_mysql_lock_data_dictionary(trx); + if (dict_stats_delete(db, trx)) + { + /* Ignore this error. Leaving garbage statistics behind is a + lesser evil. Carry on to try to remove any garbage tables. 
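
innodb_drop_database() above begins by reducing the filesystem path to its last component plus a trailing '/', so that InnoDB table names such as db/t1 can be matched by prefix. The same computation with std::string (the original scans backwards from strend(path) - 2 into a my_malloc buffer; this sketch also tolerates a missing trailing separator):

```cpp
#include <string>

// Turn ".../datadir/db/" (or ".../datadir/db") into "db/".
inline std::string db_prefix_from_path(const std::string& path)
{
  const size_t end = path.find_last_not_of("/\\");
  size_t start = path.find_last_of("/\\", end);
  start = (start == std::string::npos) ? 0 : start + 1;
  return path.substr(start, end - start + 1) + '/';
}
```
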
*/ + trx->rollback(); + trx_start_for_ddl(trx); + } + row_mysql_unlock_data_dictionary(trx); + } + + if (err == DB_SUCCESS) + err= lock_sys_tables(trx); + row_mysql_lock_data_dictionary(trx); + + static const char drop_database[] = + "PROCEDURE DROP_DATABASE_PROC () IS\n" + "fk CHAR;\n" + "name CHAR;\n" + "tid CHAR;\n" + "iid CHAR;\n" + + "DECLARE FUNCTION fk_report;\n" + + "DECLARE CURSOR fkf IS\n" + "SELECT ID FROM SYS_FOREIGN WHERE ID >= :db FOR UPDATE;\n" + + "DECLARE CURSOR fkr IS\n" + "SELECT REF_NAME,ID FROM SYS_FOREIGN WHERE REF_NAME >= :db FOR UPDATE\n" + "ORDER BY REF_NAME;\n" + + "DECLARE CURSOR tab IS\n" + "SELECT ID,NAME FROM SYS_TABLES WHERE NAME >= :db FOR UPDATE;\n" + + "DECLARE CURSOR idx IS\n" + "SELECT ID FROM SYS_INDEXES WHERE TABLE_ID = tid FOR UPDATE;\n" + + "BEGIN\n" + + "OPEN fkf;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH fkf INTO fk;\n" + " IF (SQL % NOTFOUND) THEN EXIT; END IF;\n" + " IF TO_BINARY(SUBSTR(fk, 0, LENGTH(:db)))<>TO_BINARY(:db)" + " THEN EXIT; END IF;\n" + " DELETE FROM SYS_FOREIGN_COLS WHERE TO_BINARY(ID)=TO_BINARY(fk);\n" + " DELETE FROM SYS_FOREIGN WHERE CURRENT OF fkf;\n" + "END LOOP;\n" + "CLOSE fkf;\n" + + "OPEN fkr;\n" + "FETCH fkr INTO fk_report();\n" + "CLOSE fkr;\n" + + "OPEN tab;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH tab INTO tid,name;\n" + " IF (SQL % NOTFOUND) THEN EXIT; END IF;\n" + " IF TO_BINARY(SUBSTR(name, 0, LENGTH(:db))) <> TO_BINARY(:db)" + " THEN EXIT; END IF;\n" + " DELETE FROM SYS_COLUMNS WHERE TABLE_ID=tid;\n" + " DELETE FROM SYS_TABLES WHERE ID=tid;\n" + " OPEN idx;\n" + " WHILE 1 = 1 LOOP\n" + " FETCH idx INTO iid;\n" + " IF (SQL % NOTFOUND) THEN EXIT; END IF;\n" + " DELETE FROM SYS_FIELDS WHERE INDEX_ID=iid;\n" + " DELETE FROM SYS_INDEXES WHERE CURRENT OF idx;\n" + " END LOOP;\n" + " CLOSE idx;\n" + "END LOOP;\n" + "CLOSE tab;\n" + + "END;\n"; + + innodb_drop_database_fk_report report{{namebuf, len + 1}, false}; + + if (err == DB_SUCCESS) + { + pars_info_t* pinfo = pars_info_create(); + pars_info_bind_function(pinfo, "fk_report", trx->check_foreigns + ? innodb_drop_database_fk + : innodb_drop_database_ignore_fk, &report); + pars_info_add_str_literal(pinfo, "db", namebuf); + err= que_eval_sql(pinfo, drop_database, trx); + if (err == DB_SUCCESS && report.violated) + err= DB_CANNOT_DROP_CONSTRAINT; + } + + const trx_id_t trx_id= trx->id; + + if (err != DB_SUCCESS) + { + trx->rollback(); + namebuf[len] = '\0'; + ib::error() << "DROP DATABASE " << namebuf << ": " << err; + } + else + trx->commit(); + + if (table_stats) + dict_table_close(table_stats, true, thd, mdl_table); + if (index_stats) + dict_table_close(index_stats, true, thd, mdl_index); + row_mysql_unlock_data_dictionary(trx); + + trx->free(); + + if (err == DB_SUCCESS) + { + /* Eventually after the DELETE FROM SYS_INDEXES was committed, + purge would invoke dict_drop_index_tree() to delete the associated + tablespaces. Because the SQL layer expects the directory to be empty, + we will "manually" purge the tablespaces that belong to the + records that we delete-marked. 
*/ + + dfield_t dfield; + dtuple_t tuple{ + 0,1,1,&dfield,0,nullptr +#ifdef UNIV_DEBUG + , DATA_TUPLE_MAGIC_N +#endif + }; + dict_index_t* sys_index= UT_LIST_GET_FIRST(dict_sys.sys_tables->indexes); + btr_pcur_t pcur; + namebuf[len++]= '/'; + dfield_set_data(&dfield, namebuf, len); + dict_index_copy_types(&tuple, sys_index, 1); + std::vector to_close; + std::vector space_ids; + mtr_t mtr; + mtr.start(); + pcur.btr_cur.page_cur.index = sys_index; + err= btr_pcur_open_on_user_rec(&tuple, BTR_SEARCH_LEAF, &pcur, &mtr); + if (err != DB_SUCCESS) + goto err_exit; + + for (; btr_pcur_is_on_user_rec(&pcur); + btr_pcur_move_to_next_user_rec(&pcur, &mtr)) + { + const rec_t *rec= btr_pcur_get_rec(&pcur); + if (rec_get_n_fields_old(rec) != DICT_NUM_FIELDS__SYS_TABLES) + { + ut_ad("corrupted SYS_TABLES record" == 0); + break; + } + if (!rec_get_deleted_flag(rec, false)) + continue; + ulint flen; + static_assert(DICT_FLD__SYS_TABLES__NAME == 0, "compatibility"); + rec_get_nth_field_offs_old(rec, 0, &flen); + if (flen == UNIV_SQL_NULL || flen <= len || memcmp(rec, namebuf, len)) + /* We ran out of tables that had existed in the database. */ + break; + const byte *db_trx_id= + rec_get_nth_field_old(rec, DICT_FLD__SYS_TABLES__DB_TRX_ID, &flen); + if (flen != 6) + { + ut_ad("corrupted SYS_TABLES.SPACE" == 0); + break; + } + if (mach_read_from_6(db_trx_id) != trx_id) + /* This entry was modified by some other transaction than us. + Unfortunately, because SYS_TABLES.NAME is the PRIMARY KEY, + we cannot distinguish RENAME and DROP here. It is possible + that the table had been renamed to some other database. */ + continue; + const byte *s= + rec_get_nth_field_old(rec, DICT_FLD__SYS_TABLES__SPACE, &flen); + if (flen != 4) + ut_ad("corrupted SYS_TABLES.SPACE" == 0); + else if (uint32_t space_id= mach_read_from_4(s)) + { + space_ids.emplace_back(space_id); + pfs_os_file_t detached= fil_delete_tablespace(space_id); + if (detached != OS_FILE_CLOSED) + to_close.emplace_back(detached); + } + } + err_exit: + mtr.commit(); + for (pfs_os_file_t detached : to_close) + os_file_close(detached); + for (const auto id : space_ids) + ibuf_delete_for_discarded_space(id); + + /* Any changes must be persisted before we return. */ + log_write_up_to(mtr.commit_lsn(), true); + } + + my_free(namebuf); +} + +/** Shut down the InnoDB storage engine. +@return 0 */ +static +int +innobase_end(handlerton*, ha_panic_function); + +/*****************************************************************//** +Creates an InnoDB transaction struct for the thd if it does not yet have one. +Starts a new InnoDB transaction if a transaction is not yet started. And +assigns a new snapshot for a consistent read if the transaction does not yet +have one. +@return 0 */ +static +int +innobase_start_trx_and_assign_read_view( +/*====================================*/ + handlerton* hton, /* in: InnoDB handlerton */ + THD* thd); /* in: MySQL thread handle of the + user for whom the transaction should + be committed */ + +/** Flush InnoDB redo logs to the file system. +@return false */ +static bool innobase_flush_logs(handlerton*) +{ + if (!srv_read_only_mode && srv_flush_log_at_trx_commit) + /* Write any outstanding redo log. Durably if + innodb_flush_log_at_trx_commit=1. */ + log_buffer_flush_to_disk(srv_flush_log_at_trx_commit == 1); + return false; +} + +/************************************************************************//** +Implements the SHOW ENGINE INNODB STATUS command. Sends the output of the +InnoDB Monitor to the client. 
+@return 0 on success */ +static +int +innodb_show_status( +/*===============*/ + handlerton* hton, /*!< in: the innodb handlerton */ + THD* thd, /*!< in: the MySQL query thread of + the caller */ + stat_print_fn* stat_print); +/************************************************************************//** +Return 0 on success and non-zero on failure. Note: the bool return type +seems to be abused here, should be an int. */ +static +bool +innobase_show_status( +/*=================*/ + handlerton* hton, /*!< in: the innodb handlerton */ + THD* thd, /*!< in: the MySQL query thread of + the caller */ + stat_print_fn* stat_print, + enum ha_stat_type stat_type); + +/** After ALTER TABLE, recompute statistics. */ +inline void ha_innobase::reload_statistics() +{ + if (dict_table_t *table= m_prebuilt ? m_prebuilt->table : nullptr) + { + if (table->is_readable()) + dict_stats_init(table); + else + table->stat_initialized= 1; + } +} + +/** After ALTER TABLE, recompute statistics. */ +static int innodb_notify_tabledef_changed(handlerton *, + LEX_CSTRING *, LEX_CSTRING *, + LEX_CUSTRING *, LEX_CUSTRING *, + handler *handler) +{ + DBUG_ENTER("innodb_notify_tabledef_changed"); + if (handler) + static_cast(handler)->reload_statistics(); + DBUG_RETURN(0); +} + +/****************************************************************//** +Parse and enable InnoDB monitor counters during server startup. +User can enable monitor counters/groups by specifying +"loose-innodb_monitor_enable = monitor_name1;monitor_name2..." +in server configuration file or at the command line. */ +static +void +innodb_enable_monitor_at_startup( +/*=============================*/ + char* str); /*!< in: monitor counter enable list */ + +#ifdef MYSQL_STORE_FTS_DOC_ID +/** Store doc_id value into FTS_DOC_ID field +@param[in,out] tbl table containing FULLTEXT index +@param[in] doc_id FTS_DOC_ID value */ +static +void +innobase_fts_store_docid( + TABLE* tbl, + ulonglong doc_id) +{ + my_bitmap_map* old_map + = dbug_tmp_use_all_columns(tbl, tbl->write_set); + + tbl->fts_doc_id_field->store(static_cast(doc_id), true); + + dbug_tmp_restore_column_map(tbl->write_set, old_map); +} +#endif + +/*******************************************************************//** +Function for constructing an InnoDB table handler instance. */ +static +handler* +innobase_create_handler( +/*====================*/ + handlerton* hton, /*!< in: InnoDB handlerton */ + TABLE_SHARE* table, + MEM_ROOT* mem_root) +{ + return(new (mem_root) ha_innobase(hton, table)); +} + +/* General functions */ + +/** Check that a page_size is correct for InnoDB. +If correct, set the associated page_size_shift which is the power of 2 +for this page size. +@param[in] page_size Page Size to evaluate +@return an associated page_size_shift if valid, 0 if invalid. */ +inline uint32_t innodb_page_size_validate(ulong page_size) +{ + DBUG_ENTER("innodb_page_size_validate"); + + for (uint32_t n = UNIV_PAGE_SIZE_SHIFT_MIN; + n <= UNIV_PAGE_SIZE_SHIFT_MAX; + n++) { + if (page_size == static_cast(1 << n)) { + DBUG_RETURN(n); + } + } + + DBUG_RETURN(0); +} + +/******************************************************************//** +Returns true if transaction should be flagged as read-only. 
+@return true if the thd is marked as read-only */ +bool +thd_trx_is_read_only( +/*=================*/ + THD* thd) /*!< in: thread handle */ +{ + return(thd != 0 && thd_tx_is_read_only(thd)); +} + +static MYSQL_THDVAR_BOOL(background_thread, + PLUGIN_VAR_NOCMDOPT | PLUGIN_VAR_NOSYSVAR, + "Internal (not user visible) flag to mark " + "background purge threads", NULL, NULL, 0); + +/** Create a MYSQL_THD for a background thread and mark it as such. +@param name thread info for SHOW PROCESSLIST +@return new MYSQL_THD */ +MYSQL_THD innobase_create_background_thd(const char* name) +{ + MYSQL_THD thd= create_background_thd(); + thd_proc_info(thd, name); + THDVAR(thd, background_thread) = true; + return thd; +} + + +/** Close opened tables, free memory, delete items for a MYSQL_THD. +@param[in] thd MYSQL_THD to reset */ +void +innobase_reset_background_thd(MYSQL_THD thd) +{ + if (!thd) { + thd = current_thd; + } + + ut_ad(thd); + ut_ad(THDVAR(thd, background_thread)); + + /* background purge thread */ + const char *proc_info= thd_proc_info(thd, "reset"); + reset_thd(thd); + thd_proc_info(thd, proc_info); +} + + +/******************************************************************//** +Check if the transaction is an auto-commit transaction. TRUE also +implies that it is a SELECT (read-only) transaction. +@return true if the transaction is an auto commit read-only transaction. */ +ibool +thd_trx_is_auto_commit( +/*===================*/ + THD* thd) /*!< in: thread handle, can be NULL */ +{ + return(thd != NULL + && !thd_test_options( + thd, + OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN) + && thd_sql_command(thd) == SQLCOM_SELECT); +} + +/******************************************************************//** +Returns the NUL terminated value of glob_hostname. +@return pointer to glob_hostname. */ +const char* +server_get_hostname() +/*=================*/ +{ + return(glob_hostname); +} + +/******************************************************************//** +Returns true if the transaction this thread is processing has edited +non-transactional tables. Used by the deadlock detector when deciding +which transaction to rollback in case of a deadlock - we try to avoid +rolling back transactions that have edited non-transactional tables. +@return true if non-transactional tables have been edited */ +ibool +thd_has_edited_nontrans_tables( +/*===========================*/ + THD* thd) /*!< in: thread handle */ +{ + return((ibool) thd_non_transactional_update(thd)); +} + +/******************************************************************//** +Returns the lock wait timeout for the current connection. +@return the lock wait timeout, in seconds */ +uint& +thd_lock_wait_timeout( +/*==================*/ + THD* thd) /*!< in: thread handle, or NULL to query + the global innodb_lock_wait_timeout */ +{ + /* According to , passing thd == NULL + returns the global value of the session variable. */ + return(THDVAR(thd, lock_wait_timeout)); +} + +/** Get the value of innodb_tmpdir. +@param[in] thd thread handle, or NULL to query + the global innodb_tmpdir. +@retval NULL if innodb_tmpdir="" */ +const char *thd_innodb_tmpdir(THD *thd) +{ + const char* tmp_dir = THDVAR(thd, tmpdir); + + if (tmp_dir != NULL && *tmp_dir == '\0') { + tmp_dir = NULL; + } + + return(tmp_dir); +} + +/** Obtain the InnoDB transaction of a MySQL thread. 
+@param[in,out] thd thread handle +@return reference to transaction pointer */ +static trx_t* thd_to_trx(THD* thd) +{ + return reinterpret_cast(thd_get_ha_data(thd, innodb_hton_ptr)); +} + +#ifdef WITH_WSREP +/********************************************************************//** +Obtain the InnoDB transaction id of a MySQL thread. +@return transaction id */ +__attribute__((warn_unused_result, nonnull)) +ulonglong +thd_to_trx_id( + THD* thd) /*!< in: MySQL thread */ +{ + return(thd_to_trx(thd)->id); +} + +Atomic_relaxed wsrep_sst_disable_writes; + +static void sst_disable_innodb_writes() +{ + const uint old_count= srv_n_fil_crypt_threads; + fil_crypt_set_thread_cnt(0); + srv_n_fil_crypt_threads= old_count; + + wsrep_sst_disable_writes= true; + dict_stats_shutdown(); + purge_sys.stop(); + /* We are holding a global MDL thanks to FLUSH TABLES WITH READ LOCK. + + That will prevent any writes from arriving into InnoDB, but it will + not prevent writes of modified pages from the buffer pool, or log + checkpoints. + + Let us perform a log checkpoint to ensure that the entire buffer + pool is clean, so that no writes to persistent files will be + possible during the snapshot, and to guarantee that no crash + recovery will be necessary when starting up on the snapshot. */ + log_make_checkpoint(); + /* If any FILE_MODIFY records were written by the checkpoint, an + extra write of a FILE_CHECKPOINT record could still be invoked by + buf_flush_page_cleaner(). Let us prevent that by invoking another + checkpoint (which will write the FILE_CHECKPOINT record). */ + log_make_checkpoint(); + ut_d(recv_no_log_write= true); + /* If this were not a no-op, an assertion would fail due to + recv_no_log_write. */ + ut_d(log_make_checkpoint()); +} + +static void sst_enable_innodb_writes() +{ + ut_ad(recv_no_log_write); + ut_d(recv_no_log_write= false); + dict_stats_start(); + purge_sys.resume(); + wsrep_sst_disable_writes= false; + const uint old_count= srv_n_fil_crypt_threads; + srv_n_fil_crypt_threads= 0; + fil_crypt_set_thread_cnt(old_count); +} + +static void innodb_disable_internal_writes(bool disable) +{ + if (disable) + sst_disable_innodb_writes(); + else + sst_enable_innodb_writes(); +} + +static void wsrep_abort_transaction(handlerton *, THD *, THD *, my_bool) + __attribute__((nonnull)); +static int innobase_wsrep_set_checkpoint(handlerton *hton, const XID *xid); +static int innobase_wsrep_get_checkpoint(handlerton* hton, XID* xid); +#endif /* WITH_WSREP */ + +#define normalize_table_name(a,b) \ + normalize_table_name_c_low(a,b,IF_WIN(true,false)) + +ulonglong ha_innobase::table_version() const +{ + /* This is either "garbage" or something that was assigned + on a successful ha_innobase::prepare_inplace_alter_table(). 
*/ + return m_prebuilt->trx_id; +} + +#ifdef UNIV_DEBUG +/** whether the DDL log recovery has been completed */ +static bool ddl_recovery_done; +#endif + +static int innodb_check_version(handlerton *hton, const char *path, + const LEX_CUSTRING *version, + ulonglong create_id) +{ + DBUG_ENTER("innodb_check_version"); + DBUG_ASSERT(hton == innodb_hton_ptr); + ut_ad(!ddl_recovery_done); + + if (!create_id) + DBUG_RETURN(0); + + char norm_path[FN_REFLEN]; + normalize_table_name(norm_path, path); + + if (dict_table_t *table= dict_table_open_on_name(norm_path, false, + DICT_ERR_IGNORE_NONE)) + { + const trx_id_t trx_id= table->def_trx_id; + DBUG_ASSERT(trx_id <= create_id); + dict_table_close(table); + DBUG_PRINT("info", ("create_id: %llu trx_id: %llu", create_id, trx_id)); + DBUG_RETURN(create_id != trx_id); + } + else + DBUG_RETURN(2); +} + +/** Drop any garbage intermediate tables that existed in the system +after a backup was restored. + +In a final phase of Mariabackup, the commit of DDL operations is blocked, +and those DDL operations will have to be rolled back. Because the +normal DDL recovery will not run due to the lack of the log file, +at least some #sql-alter- garbage tables may remain in the InnoDB +data dictionary (while the data files themselves are missing). +We will attempt to drop the tables here. */ +static void drop_garbage_tables_after_restore() +{ + btr_pcur_t pcur; + mtr_t mtr; + trx_t *trx= trx_create(); + + ut_ad(!purge_sys.enabled()); + ut_d(purge_sys.stop_FTS()); + + mtr.start(); + if (pcur.open_leaf(true, dict_sys.sys_tables->indexes.start, BTR_SEARCH_LEAF, + &mtr) != DB_SUCCESS) + goto all_fail; + for (;;) + { + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + + if (!btr_pcur_is_on_user_rec(&pcur)) + break; + + const rec_t *rec= btr_pcur_get_rec(&pcur); + if (rec_get_deleted_flag(rec, 0)) + continue; + + static_assert(DICT_FLD__SYS_TABLES__NAME == 0, "compatibility"); + size_t len; + if (rec_get_1byte_offs_flag(rec)) + { + len= rec_1_get_field_end_info(rec, 0); + if (len & REC_1BYTE_SQL_NULL_MASK) + continue; /* corrupted SYS_TABLES.NAME */ + } + else + { + len= rec_2_get_field_end_info(rec, 0); + static_assert(REC_2BYTE_EXTERN_MASK == 16384, "compatibility"); + if (len >= REC_2BYTE_EXTERN_MASK) + continue; /* corrupted SYS_TABLES.NAME */ + } + + if (len < tmp_file_prefix_length) + continue; + if (const char *f= static_cast + (memchr(rec, '/', len - tmp_file_prefix_length))) + { + if (memcmp(f + 1, tmp_file_prefix, tmp_file_prefix_length)) + continue; + } + else + continue; + + btr_pcur_store_position(&pcur, &mtr); + btr_pcur_commit_specify_mtr(&pcur, &mtr); + + trx_start_for_ddl(trx); + std::vector deleted; + dberr_t err= DB_TABLE_NOT_FOUND; + row_mysql_lock_data_dictionary(trx); + + if (dict_table_t *table= dict_sys.load_table + ({reinterpret_cast(pcur.old_rec), len}, + DICT_ERR_IGNORE_DROP)) + { + table->acquire(); + row_mysql_unlock_data_dictionary(trx); + err= lock_table_for_trx(table, trx, LOCK_X); + if (err == DB_SUCCESS && + (table->flags2 & (DICT_TF2_FTS_HAS_DOC_ID | DICT_TF2_FTS))) + { + fts_optimize_remove_table(table); + err= fts_lock_tables(trx, *table); + } + if (err == DB_SUCCESS) + err= lock_sys_tables(trx); + row_mysql_lock_data_dictionary(trx); + table->release(); + + if (err == DB_SUCCESS) + err= trx->drop_table(*table); + if (err != DB_SUCCESS) + goto fail; + trx->commit(deleted); + } + else + { +fail: + trx->rollback(); + sql_print_error("InnoDB: cannot drop %.*s: %s", + static_cast(len), pcur.old_rec, ut_strerr(err)); + } + + 
row_mysql_unlock_data_dictionary(trx); + for (pfs_os_file_t d : deleted) + os_file_close(d); + + mtr.start(); + if (pcur.restore_position(BTR_SEARCH_LEAF, &mtr) == btr_pcur_t::CORRUPTED) + break; + } + +all_fail: + mtr.commit(); + trx->free(); + ut_free(pcur.old_rec_buf); + ut_d(purge_sys.resume_FTS()); +} + +static void innodb_ddl_recovery_done(handlerton*) +{ + ut_ad(!ddl_recovery_done); + ut_d(ddl_recovery_done= true); + if (!srv_read_only_mode && srv_operation <= SRV_OPERATION_EXPORT_RESTORED && + srv_force_recovery < SRV_FORCE_NO_BACKGROUND) + { + if (srv_start_after_restore && !high_level_read_only) + drop_garbage_tables_after_restore(); + srv_init_purge_tasks(); + } +} + +/********************************************************************//** +Converts an InnoDB error code to a MySQL error code and also tells to MySQL +about a possible transaction rollback inside InnoDB caused by a lock wait +timeout or a deadlock. +@return MySQL error code */ +static int +convert_error_code_to_mysql( +/*========================*/ + dberr_t error, /*!< in: InnoDB error code */ + ulint flags, /*!< in: InnoDB table flags, or 0 */ + THD* thd) /*!< in: user thread handle or NULL */ +{ + switch (error) { + case DB_SUCCESS: + return(0); + + case DB_INTERRUPTED: + return(HA_ERR_ABORTED_BY_USER); + + case DB_FOREIGN_EXCEED_MAX_CASCADE: + ut_ad(thd); + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + HA_ERR_ROW_IS_REFERENCED, + "InnoDB: Cannot delete/update " + "rows with cascading foreign key " + "constraints that exceed max " + "depth of %d. Please " + "drop extra constraints and try " + "again", FK_MAX_CASCADE_DEL); + return(HA_ERR_FK_DEPTH_EXCEEDED); + + case DB_CANT_CREATE_GEOMETRY_OBJECT: + my_error(ER_CANT_CREATE_GEOMETRY_OBJECT, MYF(0)); + return(HA_ERR_NULL_IN_SPATIAL); + + case DB_ERROR: + default: + return(HA_ERR_GENERIC); /* unspecified error */ + + case DB_DUPLICATE_KEY: + /* Be cautious with returning this error, since + mysql could re-enter the storage layer to get + duplicated key info, the operation requires a + valid table handle and/or transaction information, + which might not always be available in the error + handling stage. */ + return(HA_ERR_FOUND_DUPP_KEY); + + case DB_READ_ONLY: + return(HA_ERR_TABLE_READONLY); + + case DB_FOREIGN_DUPLICATE_KEY: + return(HA_ERR_FOREIGN_DUPLICATE_KEY); + + case DB_MISSING_HISTORY: + return(HA_ERR_TABLE_DEF_CHANGED); + + case DB_RECORD_NOT_FOUND: + return(HA_ERR_NO_ACTIVE_RECORD); + + case DB_DEADLOCK: + /* Since we rolled back the whole transaction, we must + tell it also to MySQL so that MySQL knows to empty the + cached binlog for this transaction */ + + if (thd != NULL) { + thd_mark_transaction_to_rollback(thd, 1); + } + + return(HA_ERR_LOCK_DEADLOCK); + + case DB_LOCK_WAIT_TIMEOUT: + /* Starting from 5.0.13, we let MySQL just roll back the + latest SQL statement in a lock wait timeout. Previously, we + rolled back the whole transaction. 
*/ + + if (thd) { + thd_mark_transaction_to_rollback( + thd, innobase_rollback_on_timeout); + } + + return(HA_ERR_LOCK_WAIT_TIMEOUT); + + case DB_NO_REFERENCED_ROW: + return(HA_ERR_NO_REFERENCED_ROW); + + case DB_ROW_IS_REFERENCED: + return(HA_ERR_ROW_IS_REFERENCED); + + case DB_NO_FK_ON_S_BASE_COL: + case DB_CANNOT_ADD_CONSTRAINT: + case DB_CHILD_NO_INDEX: + case DB_PARENT_NO_INDEX: + return(HA_ERR_CANNOT_ADD_FOREIGN); + + case DB_CANNOT_DROP_CONSTRAINT: + + return(HA_ERR_ROW_IS_REFERENCED); /* TODO: This is a bit + misleading, a new MySQL error + code should be introduced */ + + case DB_CORRUPTION: + case DB_PAGE_CORRUPTED: + return(HA_ERR_CRASHED); + + case DB_OUT_OF_FILE_SPACE: + return(HA_ERR_RECORD_FILE_FULL); + + case DB_TEMP_FILE_WRITE_FAIL: + my_error(ER_GET_ERRMSG, MYF(0), + DB_TEMP_FILE_WRITE_FAIL, + ut_strerr(DB_TEMP_FILE_WRITE_FAIL), + "InnoDB"); + return(HA_ERR_INTERNAL_ERROR); + + case DB_TABLE_NOT_FOUND: + return(HA_ERR_NO_SUCH_TABLE); + + case DB_DECRYPTION_FAILED: + return(HA_ERR_DECRYPTION_FAILED); + + case DB_TABLESPACE_NOT_FOUND: + return(HA_ERR_TABLESPACE_MISSING); + + case DB_TOO_BIG_RECORD: { + /* If prefix is true then a 768-byte prefix is stored + locally for BLOB fields. Refer to dict_table_get_format(). + We limit max record size to 16k for 64k page size. */ + bool prefix = !DICT_TF_HAS_ATOMIC_BLOBS(flags); + bool comp = !!(flags & DICT_TF_COMPACT); + ulint free_space = page_get_free_space_of_empty(comp) / 2; + + if (free_space >= ulint(comp ? COMPRESSED_REC_MAX_DATA_SIZE : + REDUNDANT_REC_MAX_DATA_SIZE)) { + free_space = (comp ? COMPRESSED_REC_MAX_DATA_SIZE : + REDUNDANT_REC_MAX_DATA_SIZE) - 1; + } + + my_printf_error(ER_TOO_BIG_ROWSIZE, + "Row size too large (> " ULINTPF "). Changing some columns " + "to TEXT or BLOB %smay help. In current row " + "format, BLOB prefix of %d bytes is stored inline.", + MYF(0), + free_space, + prefix + ? "or using ROW_FORMAT=DYNAMIC or" + " ROW_FORMAT=COMPRESSED " + : "", + prefix + ? DICT_MAX_FIXED_COL_LEN + : 0); + return(HA_ERR_TO_BIG_ROW); + } + + case DB_TOO_BIG_INDEX_COL: + my_error(ER_INDEX_COLUMN_TOO_LONG, MYF(0), + (ulong) DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags)); + return(HA_ERR_INDEX_COL_TOO_LONG); + + case DB_NO_SAVEPOINT: + return(HA_ERR_NO_SAVEPOINT); + + case DB_LOCK_TABLE_FULL: + /* Since we rolled back the whole transaction, we must + tell it also to MySQL so that MySQL knows to empty the + cached binlog for this transaction */ + + if (thd) { + thd_mark_transaction_to_rollback(thd, 1); + } + + return(HA_ERR_LOCK_TABLE_FULL); + + case DB_FTS_INVALID_DOCID: + return(HA_FTS_INVALID_DOCID); + case DB_FTS_EXCEED_RESULT_CACHE_LIMIT: + return(HA_ERR_OUT_OF_MEM); + case DB_TOO_MANY_CONCURRENT_TRXS: + return(HA_ERR_TOO_MANY_CONCURRENT_TRXS); + case DB_UNSUPPORTED: + return(HA_ERR_UNSUPPORTED); + case DB_INDEX_CORRUPT: + return(HA_ERR_INDEX_CORRUPT); + case DB_UNDO_RECORD_TOO_BIG: + return(HA_ERR_UNDO_REC_TOO_BIG); + case DB_OUT_OF_MEMORY: + return(HA_ERR_OUT_OF_MEM); + case DB_TABLESPACE_EXISTS: + return(HA_ERR_TABLESPACE_EXISTS); + case DB_TABLESPACE_DELETED: + return(HA_ERR_TABLESPACE_MISSING); + case DB_IDENTIFIER_TOO_LONG: + return(HA_ERR_INTERNAL_ERROR); + case DB_TABLE_CORRUPT: + return(HA_ERR_TABLE_CORRUPT); + case DB_FTS_TOO_MANY_WORDS_IN_PHRASE: + return(HA_ERR_FTS_TOO_MANY_WORDS_IN_PHRASE); + case DB_COMPUTE_VALUE_FAILED: + return(HA_ERR_GENERIC); // impossible + } +} + +/*************************************************************//** +Prints info of a THD object (== user session thread) to the given file. 
*/ +void +innobase_mysql_print_thd( +/*=====================*/ + FILE* f, /*!< in: output stream */ + THD* thd, /*!< in: MySQL THD object */ + uint max_query_len) /*!< in: max query length to print, or 0 to + use the default max length */ +{ + char buffer[1024]; + + fputs(thd_get_error_context_description(thd, buffer, sizeof buffer, + max_query_len), f); + putc('\n', f); +} + +/******************************************************************//** +Get the variable length bounds of the given character set. */ +static void +innobase_get_cset_width( +/*====================*/ + ulint cset, /*!< in: MySQL charset-collation code */ + unsigned*mbminlen, /*!< out: minimum length of a char (in bytes) */ + unsigned*mbmaxlen) /*!< out: maximum length of a char (in bytes) */ +{ + CHARSET_INFO* cs; + ut_ad(cset <= MAX_CHAR_COLL_NUM); + ut_ad(mbminlen); + ut_ad(mbmaxlen); + + cs = cset ? get_charset((uint)cset, MYF(MY_WME)) : NULL; + if (cs) { + *mbminlen = cs->mbminlen; + *mbmaxlen = cs->mbmaxlen; + ut_ad(*mbminlen < DATA_MBMAX); + ut_ad(*mbmaxlen < DATA_MBMAX); + } else { + THD* thd = current_thd; + + if (thd && thd_sql_command(thd) == SQLCOM_DROP_TABLE) { + + /* Fix bug#46256: allow tables to be dropped if the + collation is not found, but issue a warning. */ + if (cset != 0) { + + sql_print_warning( + "Unknown collation #" ULINTPF ".", + cset); + } + } else { + + ut_a(cset == 0); + } + + *mbminlen = *mbmaxlen = 0; + } +} + +/*********************************************************************//** +Compute the mbminlen and mbmaxlen members of a data type structure. */ +void +dtype_get_mblen( +/*============*/ + ulint mtype, /*!< in: main type */ + ulint prtype, /*!< in: precise type (and collation) */ + unsigned*mbminlen, /*!< out: minimum length of a + multi-byte character */ + unsigned*mbmaxlen) /*!< out: maximum length of a + multi-byte character */ +{ + if (dtype_is_string_type(mtype)) { + innobase_get_cset_width(dtype_get_charset_coll(prtype), + mbminlen, mbmaxlen); + ut_ad(*mbminlen <= *mbmaxlen); + ut_ad(*mbminlen < DATA_MBMAX); + ut_ad(*mbmaxlen < DATA_MBMAX); + } else { + *mbminlen = *mbmaxlen = 0; + } +} + +/******************************************************************//** +Converts an identifier to a table name. */ +void +innobase_convert_from_table_id( +/*===========================*/ + CHARSET_INFO* cs, /*!< in: the 'from' character set */ + char* to, /*!< out: converted identifier */ + const char* from, /*!< in: identifier to convert */ + ulint len) /*!< in: length of 'to', in bytes */ +{ + uint errors; + + strconvert(cs, from, FN_REFLEN, &my_charset_filename, to, (uint) len, &errors); +} + +/********************************************************************** +Check if the length of the identifier exceeds the maximum allowed. +return true when length of identifier is too long. */ +my_bool +innobase_check_identifier_length( +/*=============================*/ + const char* id) /* in: FK identifier to check excluding the + database portion. */ +{ + int well_formed_error = 0; + CHARSET_INFO *cs = system_charset_info; + DBUG_ENTER("innobase_check_identifier_length"); + + size_t len = my_well_formed_length( + cs, id, id + strlen(id), + NAME_CHAR_LEN, &well_formed_error); + + if (well_formed_error || len == NAME_CHAR_LEN) { + my_error(ER_TOO_LONG_IDENT, MYF(0), id); + DBUG_RETURN(true); + } + DBUG_RETURN(false); +} + +/******************************************************************//** +Converts an identifier to UTF-8. 
*/ +void +innobase_convert_from_id( +/*=====================*/ + CHARSET_INFO* cs, /*!< in: the 'from' character set */ + char* to, /*!< out: converted identifier */ + const char* from, /*!< in: identifier to convert */ + ulint len) /*!< in: length of 'to', in bytes */ +{ + uint errors; + + strconvert(cs, from, FN_REFLEN, system_charset_info, to, (uint) len, &errors); +} + +/******************************************************************//** +Compares NUL-terminated UTF-8 strings case insensitively. +@return 0 if a=b, <0 if a1 if a>b */ +int +innobase_strcasecmp( +/*================*/ + const char* a, /*!< in: first string to compare */ + const char* b) /*!< in: second string to compare */ +{ + if (!a) { + if (!b) { + return(0); + } else { + return(-1); + } + } else if (!b) { + return(1); + } + + return(my_strcasecmp(system_charset_info, a, b)); +} + +/******************************************************************//** +Compares NUL-terminated UTF-8 strings case insensitively. The +second string contains wildcards. +@return 0 if a match is found, 1 if not */ +static +int +innobase_wildcasecmp( +/*=================*/ + const char* a, /*!< in: string to compare */ + const char* b) /*!< in: wildcard string to compare */ +{ + return(wild_case_compare(system_charset_info, a, b)); +} + +/** Strip dir name from a full path name and return only the file name +@param[in] path_name full path name +@return file name or "null" if no file name */ +const char* +innobase_basename( + const char* path_name) +{ + const char* name = base_name(path_name); + + return((name) ? name : "null"); +} + +/******************************************************************//** +Makes all characters in a NUL-terminated UTF-8 string lower case. */ +void +innobase_casedn_str( +/*================*/ + char* a) /*!< in/out: string to put in lower case */ +{ + my_casedn_str(system_charset_info, a); +} + +/** Determines the current SQL statement. +Thread unsafe, can only be called from the thread owning the THD. +@param[in] thd MySQL thread handle +@param[out] length Length of the SQL statement +@return SQL statement string */ +const char* +innobase_get_stmt_unsafe( + THD* thd, + size_t* length) +{ + if (const LEX_STRING *stmt = thd_query_string(thd)) { + *length = stmt->length; + return stmt->str; + } + + *length = 0; + return NULL; +} + +/** + Test a file path whether it is same as mysql data directory path. + + @param path null terminated character string + + @return + @retval TRUE The path is different from mysql data directory. + @retval FALSE The path is same as mysql data directory. +*/ +static bool is_mysql_datadir_path(const char *path) +{ + if (path == NULL) + return false; + + char mysql_data_dir[FN_REFLEN], path_dir[FN_REFLEN]; + convert_dirname(path_dir, path, NullS); + convert_dirname(mysql_data_dir, mysql_unpacked_real_data_home, NullS); + size_t mysql_data_home_len= dirname_length(mysql_data_dir); + size_t path_len = dirname_length(path_dir); + + if (path_len < mysql_data_home_len) + return true; + + if (!lower_case_file_system) + return(memcmp(mysql_data_dir, path_dir, mysql_data_home_len)); + + return(files_charset_info->strnncoll((uchar *) path_dir, path_len, + (uchar *) mysql_data_dir, + mysql_data_home_len, + TRUE)); +} + +/*********************************************************************//** +Wrapper around MySQL's copy_and_convert function. 
+@return number of bytes copied to 'to' */ +static +ulint +innobase_convert_string( +/*====================*/ + void* to, /*!< out: converted string */ + ulint to_length, /*!< in: number of bytes reserved + for the converted string */ + CHARSET_INFO* to_cs, /*!< in: character set to convert to */ + const void* from, /*!< in: string to convert */ + ulint from_length, /*!< in: number of bytes to convert */ + CHARSET_INFO* from_cs, /*!< in: character set to convert + from */ + uint* errors) /*!< out: number of errors encountered + during the conversion */ +{ + return(copy_and_convert( + (char*) to, (uint32) to_length, to_cs, + (const char*) from, (uint32) from_length, from_cs, + errors)); +} + +/*******************************************************************//** +Formats the raw data in "data" (in InnoDB on-disk format) that is of +type DATA_(CHAR|VARCHAR|MYSQL|VARMYSQL) using "charset_coll" and writes +the result to "buf". The result is converted to "system_charset_info". +Not more than "buf_size" bytes are written to "buf". +The result is always NUL-terminated (provided buf_size > 0) and the +number of bytes that were written to "buf" is returned (including the +terminating NUL). +@return number of bytes that were written */ +ulint +innobase_raw_format( +/*================*/ + const char* data, /*!< in: raw data */ + ulint data_len, /*!< in: raw data length + in bytes */ + ulint charset_coll, /*!< in: charset collation */ + char* buf, /*!< out: output buffer */ + ulint buf_size) /*!< in: output buffer size + in bytes */ +{ + /* XXX we use a hard limit instead of allocating + but_size bytes from the heap */ + CHARSET_INFO* data_cs; + char buf_tmp[8192]; + ulint buf_tmp_used; + uint num_errors; + + data_cs = all_charsets[charset_coll]; + + buf_tmp_used = innobase_convert_string(buf_tmp, sizeof(buf_tmp), + system_charset_info, + data, data_len, data_cs, + &num_errors); + + return(ut_str_sql_format(buf_tmp, buf_tmp_used, buf, buf_size)); +} + +/* +The helper function nlz(x) calculates the number of leading zeros +in the binary representation of the number "x", either using a +built-in compiler function or a substitute trick based on the use +of the multiplication operation and a table indexed by the prefix +of the multiplication result: +*/ +#ifdef __GNUC__ +#define nlz(x) __builtin_clzll(x) +#elif defined(_MSC_VER) && !defined(_M_CEE_PURE) && \ + (defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64)) +#ifndef __INTRIN_H_ +#pragma warning(push, 4) +#pragma warning(disable: 4255 4668) +#include +#pragma warning(pop) +#endif +__forceinline unsigned int nlz (ulonglong x) +{ +#if defined(_M_IX86) || defined(_M_X64) + unsigned long n; +#ifdef _M_X64 + _BitScanReverse64(&n, x); + return (unsigned int) n ^ 63; +#else + unsigned long y = (unsigned long) (x >> 32); + unsigned int m = 31; + if (y == 0) + { + y = (unsigned long) x; + m = 63; + } + _BitScanReverse(&n, y); + return (unsigned int) n ^ m; +#endif +#elif defined(_M_ARM64) + return _CountLeadingZeros64(x); +#endif +} +#else +inline unsigned int nlz (ulonglong x) +{ + static unsigned char table [48] = { + 32, 6, 5, 0, 4, 12, 0, 20, + 15, 3, 11, 0, 0, 18, 25, 31, + 8, 14, 2, 0, 10, 0, 0, 0, + 0, 0, 0, 21, 0, 0, 19, 26, + 7, 0, 13, 0, 16, 1, 22, 27, + 9, 0, 17, 23, 28, 24, 29, 30 + }; + unsigned int y= (unsigned int) (x >> 32); + unsigned int n= 0; + if (y == 0) { + y= (unsigned int) x; + n= 32; + } + y = y | (y >> 1); // Propagate leftmost 1-bit to the right. 
+ y = y | (y >> 2); + y = y | (y >> 4); + y = y | (y >> 8); + y = y & ~(y >> 16); + y = y * 0x3EF5D037; + return n + table[y >> 26]; +} +#endif + +/*********************************************************************//** +Compute the next autoinc value. + +For MySQL replication the autoincrement values can be partitioned among +the nodes. The offset is the start or origin of the autoincrement value +for a particular node. For n nodes the increment will be n and the offset +will be in the interval [1, n]. The formula tries to allocate the next +value for a particular node. + +Note: This function is also called with increment set to the number of +values we want to reserve for multi-value inserts e.g., + + INSERT INTO T VALUES(), (), (); + +innobase_next_autoinc() will be called with increment set to 3 where +autoinc_lock_mode != TRADITIONAL because we want to reserve 3 values for +the multi-value INSERT above. +@return the next value */ +ulonglong +innobase_next_autoinc( +/*==================*/ + ulonglong current, /*!< in: Current value */ + ulonglong need, /*!< in: count of values needed */ + ulonglong step, /*!< in: AUTOINC increment step */ + ulonglong offset, /*!< in: AUTOINC offset */ + ulonglong max_value) /*!< in: max value for type */ +{ + ulonglong next_value; + ulonglong block; + + /* Should never be 0. */ + ut_a(need > 0); + ut_a(step > 0); + ut_a(max_value > 0); + + /* + We need to calculate the "block" value equal to the product + "step * need". However, when calculating this product, an integer + overflow can occur, so we cannot simply use the usual multiplication + operation. The snippet below calculates the product of two numbers + and detects an unsigned integer overflow: + */ + unsigned int m= nlz(need); + unsigned int n= nlz(step); + if (m + n <= 8 * sizeof(ulonglong) - 2) { + // The bit width of the original values is too large, + // therefore we are guaranteed to get an overflow. + goto overflow; + } + block = need * (step >> 1); + if ((longlong) block < 0) { + goto overflow; + } + block += block; + if (step & 1) { + block += need; + if (block < need) { + goto overflow; + } + } + + /* Check for overflow. Current can be > max_value if the value + is in reality a negative value. Also, the visual studio compiler + converts large double values (which hypothetically can then be + passed here as the values of the "current" parameter) automatically + into unsigned long long datatype maximum value: */ + if (current > max_value) { + goto overflow; + } + + /* According to MySQL documentation, if the offset is greater than + the step then the offset is ignored. */ + if (offset > step) { + offset = 0; + } + + /* + Let's round the current value to within a step-size block: + */ + if (current > offset) { + next_value = current - offset; + } else { + next_value = offset - current; + } + next_value -= next_value % step; + + /* + Add an offset to the next value and check that the addition + does not cause an integer overflow: + */ + next_value += offset; + if (next_value < offset) { + goto overflow; + } + + /* + Add a block to the next value and check that the addition + does not cause an integer overflow: + */ + next_value += block; + if (next_value < block) { + goto overflow; + } + + return(next_value); + +overflow: + /* + Allow auto_increment to go over max_value up to max ulonglong. + This allows us to detect that all values are exhausted. 
+ If we don't do this, we will return max_value several times + and get duplicate key errors instead of auto increment value + out of range: + */ + return(~(ulonglong) 0); +} + +/*********************************************************************//** +Initializes some fields in an InnoDB transaction object. */ +static +void +innobase_trx_init( +/*==============*/ + THD* thd, /*!< in: user thread handle */ + trx_t* trx) /*!< in/out: InnoDB transaction handle */ +{ + DBUG_ENTER("innobase_trx_init"); + DBUG_ASSERT(thd == trx->mysql_thd); + + /* Ensure that thd_lock_wait_timeout(), which may be called + while holding lock_sys.latch, by lock_rec_enqueue_waiting(), + will not end up acquiring LOCK_global_system_variables in + intern_sys_var_ptr(). */ + (void) THDVAR(thd, lock_wait_timeout); + + trx->check_foreigns = !thd_test_options( + thd, OPTION_NO_FOREIGN_KEY_CHECKS); + + trx->check_unique_secondary = !thd_test_options( + thd, OPTION_RELAXED_UNIQUE_CHECKS); +#ifdef WITH_WSREP + trx->wsrep = wsrep_on(thd); +#endif + + DBUG_VOID_RETURN; +} + +/*********************************************************************//** +Allocates an InnoDB transaction for a MySQL handler object for DML. +@return InnoDB transaction handle */ +trx_t* +innobase_trx_allocate( +/*==================*/ + THD* thd) /*!< in: user thread handle */ +{ + trx_t* trx; + + DBUG_ENTER("innobase_trx_allocate"); + DBUG_ASSERT(thd != NULL); + DBUG_ASSERT(EQ_CURRENT_THD(thd)); + + trx = trx_create(); + + trx->mysql_thd = thd; + + innobase_trx_init(thd, trx); + + DBUG_RETURN(trx); +} + +/*********************************************************************//** +Gets the InnoDB transaction handle for a MySQL handler object, creates +an InnoDB transaction struct if the corresponding MySQL thread struct still +lacks one. +@return InnoDB transaction handle */ +static inline +trx_t* +check_trx_exists( +/*=============*/ + THD* thd) /*!< in: user thread handle */ +{ + if (trx_t* trx = thd_to_trx(thd)) { + ut_a(trx->magic_n == TRX_MAGIC_N); + innobase_trx_init(thd, trx); + return trx; + } else { + trx = innobase_trx_allocate(thd); + thd_set_ha_data(thd, innodb_hton_ptr, trx); + return trx; + } +} + +/** + Gets current trx. + + This function may be called during InnoDB initialisation, when + innodb_hton_ptr->slot is not yet set to meaningful value. +*/ + +trx_t *current_trx() +{ + THD *thd=current_thd; + if (likely(thd != 0) && innodb_hton_ptr->slot != HA_SLOT_UNDEF) { + return thd_to_trx(thd); + } else { + return(NULL); + } +} + +/*********************************************************************//** +Note that a transaction has been registered with MySQL. +@return true if transaction is registered with MySQL 2PC coordinator */ +static inline +bool +trx_is_registered_for_2pc( +/*======================*/ + const trx_t* trx) /* in: transaction */ +{ + return(trx->is_registered == 1); +} + +/*********************************************************************//** +Note that a transaction has been deregistered. */ +static inline +void +trx_deregister_from_2pc( +/*====================*/ + trx_t* trx) /* in: transaction */ +{ + trx->is_registered= false; + trx->active_commit_ordered= false; +} + +/*********************************************************************//** +Copy table flags from MySQL's HA_CREATE_INFO into an InnoDB table object. +Those flags are stored in .frm file and end up in the MySQL table object, +but are frequently used inside InnoDB so we keep their copies into the +InnoDB table object. 
*/ +static +void +innobase_copy_frm_flags_from_create_info( +/*=====================================*/ + dict_table_t* innodb_table, /*!< in/out: InnoDB table */ + const HA_CREATE_INFO* create_info) /*!< in: create info */ +{ + ibool ps_on; + ibool ps_off; + + if (innodb_table->is_temporary() + || innodb_table->no_rollback()) { + /* Temp tables do not use persistent stats. */ + ps_on = FALSE; + ps_off = TRUE; + } else { + ps_on = create_info->table_options + & HA_OPTION_STATS_PERSISTENT; + ps_off = create_info->table_options + & HA_OPTION_NO_STATS_PERSISTENT; + } + + dict_stats_set_persistent(innodb_table, ps_on, ps_off); + + dict_stats_auto_recalc_set( + innodb_table, + create_info->stats_auto_recalc == HA_STATS_AUTO_RECALC_ON, + create_info->stats_auto_recalc == HA_STATS_AUTO_RECALC_OFF); + + innodb_table->stats_sample_pages = create_info->stats_sample_pages; +} + +/*********************************************************************//** +Copy table flags from MySQL's TABLE_SHARE into an InnoDB table object. +Those flags are stored in .frm file and end up in the MySQL table object, +but are frequently used inside InnoDB so we keep their copies into the +InnoDB table object. */ +void +innobase_copy_frm_flags_from_table_share( +/*=====================================*/ + dict_table_t* innodb_table, /*!< in/out: InnoDB table */ + const TABLE_SHARE* table_share) /*!< in: table share */ +{ + ibool ps_on; + ibool ps_off; + + if (innodb_table->is_temporary()) { + /* Temp tables do not use persistent stats */ + ps_on = FALSE; + ps_off = TRUE; + } else { + ps_on = table_share->db_create_options + & HA_OPTION_STATS_PERSISTENT; + ps_off = table_share->db_create_options + & HA_OPTION_NO_STATS_PERSISTENT; + } + + dict_stats_set_persistent(innodb_table, ps_on, ps_off); + + dict_stats_auto_recalc_set( + innodb_table, + table_share->stats_auto_recalc == HA_STATS_AUTO_RECALC_ON, + table_share->stats_auto_recalc == HA_STATS_AUTO_RECALC_OFF); + + innodb_table->stats_sample_pages = table_share->stats_sample_pages; +} + +/*********************************************************************//** +Construct ha_innobase handler. */ + +ha_innobase::ha_innobase( +/*=====================*/ + handlerton* hton, + TABLE_SHARE* table_arg) + :handler(hton, table_arg), + m_prebuilt(), + m_user_thd(), + m_int_table_flags(HA_REC_NOT_IN_SEQ + | HA_NULL_IN_KEY + | HA_CAN_VIRTUAL_COLUMNS + | HA_CAN_INDEX_BLOBS + | HA_CAN_SQL_HANDLER + | HA_REQUIRES_KEY_COLUMNS_FOR_DELETE + | HA_PRIMARY_KEY_REQUIRED_FOR_POSITION + | HA_PRIMARY_KEY_IN_READ_INDEX + | HA_BINLOG_ROW_CAPABLE + | HA_CAN_GEOMETRY + | HA_PARTIAL_COLUMN_READ + | HA_TABLE_SCAN_ON_INDEX + | HA_CAN_FULLTEXT + | HA_CAN_FULLTEXT_EXT + /* JAN: TODO: MySQL 5.7 + | HA_CAN_FULLTEXT_HINTS + */ + | HA_CAN_EXPORT + | HA_ONLINE_ANALYZE + | HA_CAN_RTREEKEYS + | HA_CAN_TABLES_WITHOUT_ROLLBACK + | HA_CAN_ONLINE_BACKUPS + | HA_CONCURRENT_OPTIMIZE + | HA_CAN_SKIP_LOCKED + | (srv_force_primary_key ? HA_REQUIRE_PRIMARY_KEY : 0) + ), + m_start_of_scan(), + m_mysql_has_locked() +{} + +/*********************************************************************//** +Destruct ha_innobase handler. */ + +ha_innobase::~ha_innobase() = default; +/*======================*/ + +/*********************************************************************//** +Updates the user_thd field in a handle and also allocates a new InnoDB +transaction handle if needed, and updates the transaction fields in the +m_prebuilt struct. 
*/ +void +ha_innobase::update_thd( +/*====================*/ + THD* thd) /*!< in: thd to use the handle */ +{ + DBUG_ENTER("ha_innobase::update_thd"); + DBUG_PRINT("ha_innobase::update_thd", ("user_thd: %p -> %p", + m_user_thd, thd)); + + /* The table should have been opened in ha_innobase::open(). */ + DBUG_ASSERT(m_prebuilt->table->get_ref_count() > 0); + + trx_t* trx = check_trx_exists(thd); + + ut_ad(!trx->dict_operation_lock_mode); + ut_ad(!trx->dict_operation); + + if (m_prebuilt->trx != trx) { + + row_update_prebuilt_trx(m_prebuilt, trx); + } + + m_user_thd = thd; + + DBUG_ASSERT(m_prebuilt->trx->magic_n == TRX_MAGIC_N); + DBUG_ASSERT(m_prebuilt->trx == thd_to_trx(m_user_thd)); + + DBUG_VOID_RETURN; +} + +/*********************************************************************//** +Updates the user_thd field in a handle and also allocates a new InnoDB +transaction handle if needed, and updates the transaction fields in the +m_prebuilt struct. */ + +void +ha_innobase::update_thd() +/*=====================*/ +{ + THD* thd = ha_thd(); + + ut_ad(EQ_CURRENT_THD(thd)); + update_thd(thd); +} + +/*********************************************************************//** +Registers an InnoDB transaction with the MySQL 2PC coordinator, so that +the MySQL XA code knows to call the InnoDB prepare and commit, or rollback +for the transaction. This MUST be called for every transaction for which +the user may call commit or rollback. Calling this several times to register +the same transaction is allowed, too. This function also registers the +current SQL statement. */ +static inline +void +innobase_register_trx( +/*==================*/ + handlerton* hton, /* in: Innobase handlerton */ + THD* thd, /* in: MySQL thd (connection) object */ + trx_t* trx) /* in: transaction to register */ +{ + ut_ad(!trx->active_commit_ordered); + const trx_id_t trx_id= trx->id; + + trans_register_ha(thd, false, hton, trx_id); + + if (!trx->is_registered) + { + trx->is_registered= true; + if (thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) + trans_register_ha(thd, true, hton, trx_id); + } +} + +/* BACKGROUND INFO: HOW THE MYSQL QUERY CACHE WORKS WITH INNODB + ------------------------------------------------------------ + +1) The use of the query cache for TBL is disabled when there is an +uncommitted change to TBL. + +2) When a change to TBL commits, InnoDB stores the current value of +its global trx id counter, let us denote it by INV_TRX_ID, to the table object +in the InnoDB data dictionary, and does only allow such transactions whose +id <= INV_TRX_ID to use the query cache. + +3) When InnoDB does an INSERT/DELETE/UPDATE to a table TBL, or an implicit +modification because an ON DELETE CASCADE, we invalidate the MySQL query cache +of TBL immediately. + +How this is implemented inside InnoDB: + +1) Since every modification always sets an IX type table lock on the InnoDB +table, it is easy to check if there can be uncommitted modifications for a +table: just check if there are locks in the lock list of the table. + +2) When a transaction inside InnoDB commits, it reads the global trx id +counter and stores the value INV_TRX_ID to the tables on which it had a lock. + +3) If there is an implicit table change from ON DELETE CASCADE or SET NULL, +InnoDB calls an invalidate method for the MySQL query cache for that table. + +How this is implemented inside sql_cache.cc: + +1) The query cache for an InnoDB table TBL is invalidated immediately at an +INSERT/UPDATE/DELETE, just like in the case of MyISAM. 
No need to delay +invalidation to the transaction commit. + +2) To store or retrieve a value from the query cache of an InnoDB table TBL, +any query must first ask InnoDB's permission. We must pass the thd as a +parameter because InnoDB will look at the trx id, if any, associated with +that thd. Also the full_name which is used as key to search for the table +object. The full_name is a string containing the normalized path to the +table in the canonical format. + +3) Use of the query cache for InnoDB tables is now allowed also when +AUTOCOMMIT==0 or we are inside BEGIN ... COMMIT. Thus transactions no longer +put restrictions on the use of the query cache. +*/ + +/** Check if mysql can allow the transaction to read from/store to +the query cache. +@param[in] table table object +@param[in] trx transaction object +@return whether the storing or retrieving from the query cache is permitted */ +TRANSACTIONAL_TARGET +static bool innobase_query_caching_table_check_low( + dict_table_t* table, trx_t* trx) +{ + /* The following conditions will decide the query cache + retrieval or storing into: + + (1) There should not be any locks on the table. + (2) Someother trx shouldn't invalidate the cache before this + transaction started. + (3) Read view shouldn't exist. If exists then the view + low_limit_id should be greater than or equal to the transaction that + invalidates the cache for the particular table. + + For read-only transaction: should satisfy (1) and (3) + For read-write transaction: should satisfy (1), (2), (3) */ + + const trx_id_t inv = table->query_cache_inv_trx_id; + + if (trx->id && trx->id < inv) { + return false; + } + + if (trx->read_view.is_open() && trx->read_view.low_limit_id() < inv) { + return false; + } + +#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC + if (xbegin()) { + if (table->lock_mutex_is_locked()) + xabort(); + auto len = UT_LIST_GET_LEN(table->locks); + xend(); + return len == 0; + } +#endif + + table->lock_mutex_lock(); + auto len= UT_LIST_GET_LEN(table->locks); + table->lock_mutex_unlock(); + return len == 0; +} + +/** Checks if MySQL at the moment is allowed for this table to retrieve a +consistent read result, or store it to the query cache. +@param[in,out] trx transaction +@param[in] norm_name concatenation of database name, + '/' char, table name +@return whether storing or retrieving from the query cache is permitted */ +static bool innobase_query_caching_table_check( + trx_t* trx, + const char* norm_name) +{ + dict_table_t* table = dict_table_open_on_name( + norm_name, false, DICT_ERR_IGNORE_FK_NOKEY); + + if (table == NULL) { + return false; + } + + /* Start the transaction if it is not started yet */ + trx_start_if_not_started(trx, false); + + bool allow = innobase_query_caching_table_check_low(table, trx); + + dict_table_close(table); + + if (allow) { + /* If the isolation level is high, assign a read view for the + transaction if it does not yet have one */ + + if (trx->isolation_level >= TRX_ISO_REPEATABLE_READ + && !srv_read_only_mode + && !trx->read_view.is_open()) { + + /* Start the transaction if it is not started yet */ + trx_start_if_not_started(trx, false); + + trx->read_view.open(trx); + } + } + + return allow; +} + +/******************************************************************//** +The MySQL query cache uses this to check from InnoDB if the query cache at +the moment is allowed to operate on an InnoDB table. The SQL query must +be a non-locking SELECT. 
+ +The query cache is allowed to operate on certain query only if this function +returns TRUE for all tables in the query. + +If thd is not in the autocommit state, this function also starts a new +transaction for thd if there is no active trx yet, and assigns a consistent +read view to it if there is no read view yet. + +Why a deadlock of threads is not possible: the query cache calls this function +at the start of a SELECT processing. Then the calling thread cannot be +holding any InnoDB semaphores. The calling thread is holding the +query cache mutex, and this function will reserve the trx_sys.mutex. +@return TRUE if permitted, FALSE if not; note that the value FALSE +does not mean we should invalidate the query cache: invalidation is +called explicitly */ +static +my_bool +innobase_query_caching_of_table_permitted( +/*======================================*/ + THD* thd, /*!< in: thd of the user who is trying to + store a result to the query cache or + retrieve it */ + const char* full_name, /*!< in: normalized path to the table */ + uint full_name_len, /*!< in: length of the normalized path + to the table */ + ulonglong *) +{ + char norm_name[1000]; + trx_t* trx = check_trx_exists(thd); + + ut_a(full_name_len < 999); + + if (trx->isolation_level == TRX_ISO_SERIALIZABLE) { + /* In the SERIALIZABLE mode we add LOCK IN SHARE MODE to every + plain SELECT if AUTOCOMMIT is not on. */ + + return(false); + } + + if (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN) + && trx->n_mysql_tables_in_use == 0) { + /* We are going to retrieve the query result from the query + cache. This cannot be a store operation to the query cache + because then MySQL would have locks on tables already. + + TODO: if the user has used LOCK TABLES to lock the table, + then we open a transaction in the call of row_.. below. + That trx can stay open until UNLOCK TABLES. The same problem + exists even if we do not use the query cache. MySQL should be + modified so that it ALWAYS calls some cleanup function when + the processing of a query ends! + + We can imagine we instantaneously serialize this consistent + read trx to the current trx id counter. If trx2 would have + changed the tables of a query result stored in the cache, and + trx2 would have already committed, making the result obsolete, + then trx2 would have already invalidated the cache. Thus we + can trust the result in the cache is ok for this query. */ + + return(true); + } + + /* Normalize the table name to InnoDB format */ + normalize_table_name(norm_name, full_name); + + innobase_register_trx(innodb_hton_ptr, thd, trx); + + return innobase_query_caching_table_check(trx, norm_name); +} + +/*****************************************************************//** +Invalidates the MySQL query cache for the table. */ +void +innobase_invalidate_query_cache( +/*============================*/ + trx_t* trx, /*!< in: transaction which + modifies the table */ + const char* full_name) /*!< in: concatenation of + database name, path separator, + table name, null char NUL; + NOTE that in Windows this is + always in LOWER CASE! */ +{ + /* Note that the query cache mutex is just above the trx_sys.mutex. + The caller of this function must not have latches of a lower rank. */ + +#ifdef HAVE_QUERY_CACHE + char qcache_key_name[2 * (NAME_LEN + 1)]; + char db_name[NAME_CHAR_LEN * MY_CS_MBMAXLEN + 1]; + const char *key_ptr; + size_t tabname_len; + + // Extract the database name. 
+ key_ptr= strchr(full_name, '/'); + DBUG_ASSERT(key_ptr != NULL); // Database name should be present + size_t dbname_len= size_t(key_ptr - full_name); + memcpy(db_name, full_name, dbname_len); + db_name[dbname_len]= '\0'; + + /* Construct the key("db-name\0table$name\0") for the query cache using + the path name("db@002dname\0table@0024name\0") of the table in its + canonical form. */ + dbname_len = filename_to_tablename(db_name, qcache_key_name, + sizeof(qcache_key_name)); + tabname_len = filename_to_tablename(++key_ptr, + (qcache_key_name + dbname_len + 1), + sizeof(qcache_key_name) - + dbname_len - 1); + + /* Argument TRUE below means we are using transactions */ + mysql_query_cache_invalidate4(trx->mysql_thd, + qcache_key_name, + uint(dbname_len + tabname_len + 2), + TRUE); +#endif +} + +/** Quote a standard SQL identifier like index or column name. +@param[in] file output stream +@param[in] trx InnoDB transaction, or NULL +@param[in] id identifier to quote */ +void +innobase_quote_identifier( + FILE* file, + trx_t* trx, + const char* id) +{ + const int q = trx != NULL && trx->mysql_thd != NULL + ? get_quote_char_for_identifier(trx->mysql_thd, id, strlen(id)) + : '`'; + + if (q == EOF) { + fputs(id, file); + } else { + putc(q, file); + + while (int c = *id++) { + if (c == q) { + putc(c, file); + } + putc(c, file); + } + + putc(q, file); + } +} + +/** Quote a standard SQL identifier like tablespace, index or column name. +@param[in] trx InnoDB transaction, or NULL +@param[in] id identifier to quote +@return quoted identifier */ +std::string +innobase_quote_identifier( +/*======================*/ + trx_t* trx, + const char* id) +{ + std::string quoted_identifier; + const int q = trx != NULL && trx->mysql_thd != NULL + ? get_quote_char_for_identifier(trx->mysql_thd, id, strlen(id)) + : '`'; + + if (q == EOF) { + quoted_identifier.append(id); + } else { + quoted_identifier += char(q); + quoted_identifier.append(id); + quoted_identifier += char(q); + } + + return (quoted_identifier); +} + +/** Convert a table name to the MySQL system_charset_info (UTF-8) +and quote it. +@param[out] buf buffer for converted identifier +@param[in] buflen length of buf, in bytes +@param[in] id identifier to convert +@param[in] idlen length of id, in bytes +@param[in] thd MySQL connection thread, or NULL +@return pointer to the end of buf */ +static +char* +innobase_convert_identifier( + char* buf, + ulint buflen, + const char* id, + ulint idlen, + THD* thd) +{ + const char* s = id; + + char nz[MAX_TABLE_NAME_LEN + 1]; + char nz2[MAX_TABLE_NAME_LEN + 1]; + + /* Decode the table name. The MySQL function expects + a NUL-terminated string. The input and output strings + buffers must not be shared. */ + ut_a(idlen <= MAX_TABLE_NAME_LEN); + memcpy(nz, id, idlen); + nz[idlen] = 0; + + s = nz2; + idlen = explain_filename(thd, nz, nz2, sizeof nz2, + EXPLAIN_PARTITIONS_AS_COMMENT); + if (idlen > buflen) { + idlen = buflen; + } + memcpy(buf, s, idlen); + return(buf + idlen); +} + +/*****************************************************************//** +Convert a table name to the MySQL system_charset_info (UTF-8). 
+@return pointer to the end of buf */
+char*
+innobase_convert_name(
+/*==================*/
+    char*       buf,    /*!< out: buffer for converted identifier */
+    ulint       buflen, /*!< in: length of buf, in bytes */
+    const char* id,     /*!< in: table name to convert */
+    ulint       idlen,  /*!< in: length of id, in bytes */
+    THD*        thd)    /*!< in: MySQL connection thread, or NULL */
+{
+    char*       s = buf;
+    const char* bufend = buf + buflen;
+
+    const char* slash = (const char*) memchr(id, '/', idlen);
+
+    if (slash == NULL) {
+        return(innobase_convert_identifier(
+            buf, buflen, id, idlen, thd));
+    }
+
+    /* Print the database name and table name separately. */
+    s = innobase_convert_identifier(s, ulint(bufend - s),
+                                    id, ulint(slash - id), thd);
+    if (s < bufend) {
+        *s++ = '.';
+        s = innobase_convert_identifier(s, ulint(bufend - s),
+                                        slash + 1, idlen
+                                        - ulint(slash - id) - 1,
+                                        thd);
+    }
+
+    return(s);
+}
+
+/*****************************************************************//**
+A wrapper function of innobase_convert_name(), converting a table name
+to the MySQL system_charset_info (UTF-8) and quoting it if needed. */
+void
+innobase_format_name(
+/*==================*/
+    char*       buf,    /*!< out: buffer for converted identifier */
+    ulint       buflen, /*!< in: length of buf, in bytes */
+    const char* name)   /*!< in: table name to format */
+{
+    const char* bufend;
+
+    bufend = innobase_convert_name(buf, buflen, name, strlen(name), NULL);
+
+    ut_ad((ulint) (bufend - buf) < buflen);
+
+    buf[bufend - buf] = '\0';
+}
+
+/**********************************************************************//**
+Determines if the currently running transaction has been interrupted.
+@return true if interrupted */
+bool
+trx_is_interrupted(
+/*===============*/
+    const trx_t* trx)   /*!< in: transaction */
+{
+    return(trx && trx->mysql_thd && thd_kill_level(trx->mysql_thd));
+}
+
+/**************************************************************//**
+Resets some fields of the m_prebuilt struct. The template is used in fast
+retrieval of just those column values MySQL needs in its processing. */
+void
+ha_innobase::reset_template(void)
+/*=============================*/
+{
+    ut_ad(m_prebuilt->magic_n == ROW_PREBUILT_ALLOCATED);
+    ut_ad(m_prebuilt->magic_n2 == m_prebuilt->magic_n);
+
+    /* Force table to be freed in close_thread_table(). */
+    DBUG_EXECUTE_IF("free_table_in_fts_query",
+        if (m_prebuilt->in_fts_query) {
+            table->mark_table_for_reopen();
+        }
+    );
+
+    m_prebuilt->keep_other_fields_on_keyread = false;
+    m_prebuilt->read_just_key = 0;
+    m_prebuilt->in_fts_query = 0;
+
+    /* Reset index condition pushdown state. */
+    if (m_prebuilt->idx_cond) {
+        m_prebuilt->idx_cond = NULL;
+        m_prebuilt->idx_cond_n_cols = 0;
+        /* Invalidate m_prebuilt->mysql_template
+        in ha_innobase::write_row(). */
+        m_prebuilt->template_type = ROW_MYSQL_NO_TEMPLATE;
+    }
+    if (m_prebuilt->pk_filter) {
+        m_prebuilt->pk_filter = NULL;
+        m_prebuilt->template_type = ROW_MYSQL_NO_TEMPLATE;
+    }
+}
+
+/*****************************************************************//**
+Call this when you have opened a new table handle in HANDLER, before you
+call index_read_map() etc. Actually, we can let the cursor stay open even
+over a transaction commit! Then you should call this before every operation,
+fetch next etc. This function initializes the necessary things even after a
+transaction commit. */
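+/* For example (illustration): the SQL HANDLER interface, as in
+HANDLER t1 OPEN; HANDLER t1 READ FIRST; reaches this path, and the
+handle can remain open across an intervening COMMIT. */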
+
+void
+ha_innobase::init_table_handle_for_HANDLER(void)
+/*============================================*/
+{
+    /* If the current thd does not yet have a trx struct, create one.
+    If the current handle does not yet have a m_prebuilt struct, create
+    one. Update the trx pointers in the m_prebuilt struct. Normally
+    this operation is done in external_lock. */
+
+    update_thd(ha_thd());
+
+    /* Initialize the m_prebuilt struct much like it would be inited in
+    external_lock */
+
+    /* If the transaction is not started yet, start it */
+
+    trx_start_if_not_started_xa(m_prebuilt->trx, false);
+
+    /* Assign a read view if the transaction does not have it yet */
+
+    m_prebuilt->trx->read_view.open(m_prebuilt->trx);
+
+    innobase_register_trx(ht, m_user_thd, m_prebuilt->trx);
+
+    /* We did the necessary inits in this function, no need to repeat them
+    in row_search_mvcc() */
+
+    m_prebuilt->sql_stat_start = FALSE;
+
+    /* We always let HANDLER do the reads as consistent reads, even
+    if the trx isolation level would have been specified as SERIALIZABLE */
+
+    m_prebuilt->select_lock_type = LOCK_NONE;
+    m_prebuilt->stored_select_lock_type = LOCK_NONE;
+
+    /* Always fetch all columns in the index record */
+
+    m_prebuilt->hint_need_to_fetch_extra_cols = ROW_RETRIEVE_ALL_COLS;
+
+    /* Do we always want to fetch all columns in the whole row? Or do
+    we???? */
+
+    m_prebuilt->used_in_HANDLER = TRUE;
+
+    reset_template();
+    m_prebuilt->trx->bulk_insert = false;
+}
+
+/*********************************************************************//**
+Free any resources that were allocated and return failure.
+@return always 1 */
+static int innodb_init_abort()
+{
+    DBUG_ENTER("innodb_init_abort");
+
+    if (fil_system.temp_space) {
+        fil_system.temp_space->close();
+    }
+
+    srv_sys_space.shutdown();
+    if (srv_tmp_space.get_sanity_check_status()) {
+        srv_tmp_space.delete_files();
+    }
+    srv_tmp_space.shutdown();
+
+    DBUG_RETURN(1);
+}
+
+/** Return the minimum buffer pool size based on page size */
+static inline ulint min_buffer_pool_size()
+{
+  ulint s= (BUF_LRU_MIN_LEN + BUF_LRU_MIN_LEN / 4) * srv_page_size;
+  /* buf_pool_chunk_size minimum is 1M, so round up to a multiple */
+  ulint alignment= 1U << 20;
+  return UT_CALC_ALIGN(s, alignment);
+}
+
+/** Validate the requested buffer pool size. Also, reserve the necessary
+memory needed for buffer pool resize.
+@param[in]	thd	thread handle
+@param[in]	var	pointer to system variable
+@param[out]	save	immediate result for update function
+@param[in]	value	incoming string
+@return 0 on success, 1 on failure.
+*/
+static
+int
+innodb_buffer_pool_size_validate(
+    THD*                        thd,
+    struct st_mysql_sys_var*    var,
+    void*                       save,
+    struct st_mysql_value*      value);
+
+/** Update the system variable innodb_buffer_pool_size using the "saved"
+value. This function is registered as a callback with MySQL.
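+It is invoked, for example, on SET GLOBAL innodb_buffer_pool_size=...,
+after innodb_buffer_pool_size_validate() has accepted the new value.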
+@param[in]	thd	thread handle
+@param[in]	var	pointer to system variable
+@param[out]	var_ptr	where the formal string goes
+@param[in]	save	immediate result from check function */
+static
+void
+innodb_buffer_pool_size_update(
+    THD*                        thd,
+    struct st_mysql_sys_var*    var,
+    void*                       var_ptr,
+    const void*                 save);
+
+static MYSQL_SYSVAR_ULONGLONG(buffer_pool_size, innobase_buffer_pool_size,
+  PLUGIN_VAR_RQCMDARG,
+  "The size of the memory buffer InnoDB uses to cache data and indexes of its tables.",
+  innodb_buffer_pool_size_validate,
+  innodb_buffer_pool_size_update,
+  128ULL << 20,
+  2ULL << 20,
+  LLONG_MAX, 1024*1024L);
+
+/****************************************************************//**
+Gives the file extension of an InnoDB single-table tablespace. */
+static const char* ha_innobase_exts[] = {
+    dot_ext[IBD],
+    dot_ext[ISL],
+    NullS
+};
+
+/** Determine if system-versioned data was modified by the transaction.
+@param[in,out]	thd	current session
+@param[out]	trx_id	transaction start ID
+@return transaction commit ID
+@retval	0	if no system-versioned data was affected by the transaction */
+static ulonglong innodb_prepare_commit_versioned(THD* thd, ulonglong *trx_id)
+{
+  if (trx_t *trx= thd_to_trx(thd))
+  {
+    *trx_id= trx->id;
+    bool versioned= false;
+
+    for (auto &t : trx->mod_tables)
+    {
+      if (t.second.is_versioned())
+      {
+        DBUG_ASSERT(t.first->versioned_by_id());
+        DBUG_ASSERT(trx->rsegs.m_redo.rseg);
+        versioned= true;
+        if (!trx->bulk_insert)
+          break;
+      }
+      if (t.second.is_bulk_insert())
+      {
+        ut_ad(trx->bulk_insert);
+        if (t.second.write_bulk(t.first, trx))
+          return ULONGLONG_MAX;
+      }
+    }
+
+    return versioned ? trx_sys.get_new_trx_id() : 0;
+  }
+
+  *trx_id= 0;
+  return 0;
+}
+
+/** Initialize and normalize innodb_buffer_pool_{chunk_,}size. */
+static void innodb_buffer_pool_size_init()
+{
+  if (srv_buf_pool_chunk_unit > srv_buf_pool_size)
+  {
+    /* The size unit of the buffer pool is larger than srv_buf_pool_size.
+    Adjust srv_buf_pool_chunk_unit for srv_buf_pool_size. */
+    srv_buf_pool_chunk_unit = srv_buf_pool_size;
+  }
+  else if (srv_buf_pool_chunk_unit == 0)
+  {
+    srv_buf_pool_chunk_unit = srv_buf_pool_size / 64;
+    my_large_page_truncate(&srv_buf_pool_chunk_unit);
+  }
+
+  if (srv_buf_pool_chunk_unit < buf_pool_chunk_min_size)
+    srv_buf_pool_chunk_unit = buf_pool_chunk_min_size;
+
+  srv_buf_pool_size = buf_pool_size_align(srv_buf_pool_size);
+  innobase_buffer_pool_size = srv_buf_pool_size;
+}
+
+static bool
+compression_algorithm_is_not_loaded(ulong compression_algorithm, myf flags)
+{
+  bool is_loaded[PAGE_ALGORITHM_LAST+1]= { 1, 1, provider_service_lz4->is_loaded,
+    provider_service_lzo->is_loaded, provider_service_lzma->is_loaded,
+    provider_service_bzip2->is_loaded, provider_service_snappy->is_loaded };
+
+  DBUG_ASSERT(compression_algorithm <= PAGE_ALGORITHM_LAST);
+
+  if (is_loaded[compression_algorithm])
+    return 0;
+
+  my_printf_error(HA_ERR_UNSUPPORTED, "InnoDB: compression algorithm %s (%u)"
+    " is not available. Please load the corresponding provider plugin.", flags,
+    page_compression_algorithms[compression_algorithm], compression_algorithm);
+  return 1;
+}
+
+/** Initialize, validate and normalize the InnoDB startup parameters.
+@return failure code
+@retval 0 on success
+@retval HA_ERR_OUT_OF_MEM when out of memory
+@retval HA_ERR_INITIALIZATION when some parameters are out of range */
+static int innodb_init_params()
+{
+    DBUG_ENTER("innodb_init_params");
+
+    ulong   num_pll_degree;
+
+    /* Check that values don't overflow on 32-bit systems. */
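+    /* For example, if ulint is a 32-bit type, any innodb_buffer_pool_size
+    above UINT_MAX32 bytes (4 GiB - 1) would not be representable in
+    srv_buf_pool_size. */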
+    if (sizeof(ulint) == 4) {
+        if (innobase_buffer_pool_size > UINT_MAX32) {
+            sql_print_error(
+                "innodb_buffer_pool_size can't be over 4GB"
+                " on 32-bit systems");
+            DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+        }
+    }
+
+    /* The buffer pool needs to be able to accommodate enough
+    pages, even for larger page sizes */
+    MYSQL_SYSVAR_NAME(buffer_pool_size).min_val= min_buffer_pool_size();
+
+    if (innobase_buffer_pool_size < MYSQL_SYSVAR_NAME(buffer_pool_size).min_val) {
+        ib::error() << "innodb_page_size="
+            << srv_page_size << " requires "
+            << "innodb_buffer_pool_size >= "
+            << (MYSQL_SYSVAR_NAME(buffer_pool_size).min_val >> 20)
+            << "MiB, current " << (innobase_buffer_pool_size >> 20)
+            << "MiB";
+        DBUG_RETURN(HA_ERR_INITIALIZATION);
+    }
+
+    if (compression_algorithm_is_not_loaded(innodb_compression_algorithm, ME_ERROR_LOG))
+        DBUG_RETURN(HA_ERR_INITIALIZATION);
+
+    if ((srv_encrypt_tables || srv_encrypt_log
+         || innodb_encrypt_temporary_tables)
+        && !encryption_key_id_exists(FIL_DEFAULT_ENCRYPTION_KEY)) {
+        sql_print_error("InnoDB: cannot enable encryption, "
+                        "encryption plugin is not available");
+        DBUG_RETURN(HA_ERR_INITIALIZATION);
+    }
+
+#ifdef _WIN32
+    if (!is_filename_allowed(srv_buf_dump_filename,
+                             strlen(srv_buf_dump_filename), FALSE)) {
+        sql_print_error("InnoDB: innodb_buffer_pool_filename"
+                        " cannot have colon (:) in the file name.");
+        DBUG_RETURN(HA_ERR_INITIALIZATION);
+    }
+#endif
+
+    /* First calculate the default path for innodb_data_home_dir etc.,
+    in case the user has not given any value.
+
+    Note that when using the embedded server, the datadirectory is not
+    necessarily the current directory of this program. */
+
+    fil_path_to_mysql_datadir =
+#ifndef HAVE_REPLICATION
+        mysqld_embedded ? mysql_real_data_home :
+#endif
+        "./";
+
+    /* Set InnoDB initialization parameters according to the values
+    read from MySQL .cnf file */
+
+    /* The default dir for data files is the datadir of MySQL */
+
+    srv_data_home = innobase_data_home_dir
+        ? innobase_data_home_dir
+        : const_cast<char*>(fil_path_to_mysql_datadir);
+#ifdef WITH_WSREP
+    /* If we use the wsrep API, then we need to tell the server
+    the path to the data files (for passing it to the SST scripts): */
+    wsrep_set_data_home_dir(srv_data_home);
+#endif /* WITH_WSREP */
+
+    /*--------------- Shared tablespaces -------------------------*/
+
+    /* Check that the value of system variable innodb_page_size was
+    set correctly. Its value was put into srv_page_size. If valid,
+    return the associated srv_page_size_shift. */
+    srv_page_size_shift = innodb_page_size_validate(srv_page_size);
+    if (!srv_page_size_shift) {
+        sql_print_error("InnoDB: Invalid page size=%lu.\n",
+                        srv_page_size);
+        DBUG_RETURN(HA_ERR_INITIALIZATION);
+    }
+
+    srv_sys_space.set_space_id(TRX_SYS_SPACE);
+
+    switch (srv_checksum_algorithm) {
+    case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
+    case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
+        srv_sys_space.set_flags(FSP_FLAGS_FCRC32_MASK_MARKER
+                                | FSP_FLAGS_FCRC32_PAGE_SSIZE());
+        break;
+    default:
+        srv_sys_space.set_flags(FSP_FLAGS_PAGE_SSIZE());
+    }
+
+    srv_sys_space.set_path(srv_data_home);
+
+    /* Supports raw devices */
+    if (!srv_sys_space.parse_params(innobase_data_file_path, true)) {
+        ib::error() << "Unable to parse innodb_data_file_path="
+                    << innobase_data_file_path;
+        DBUG_RETURN(HA_ERR_INITIALIZATION);
+    }
+
+    srv_tmp_space.set_path(srv_data_home);
+
+    /* Temporary tablespace is in full crc32 format. */
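+    /* That is, srv_tmp_space always gets FSP_FLAGS_FCRC32_MASK_MARKER and
+    the page-size shift from FSP_FLAGS_FCRC32_PAGE_SSIZE(), independently
+    of the chosen innodb_checksum_algorithm. */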
+    srv_tmp_space.set_flags(FSP_FLAGS_FCRC32_MASK_MARKER
+                            | FSP_FLAGS_FCRC32_PAGE_SSIZE());
+
+    if (!srv_tmp_space.parse_params(innobase_temp_data_file_path, false)) {
+        ib::error() << "Unable to parse innodb_temp_data_file_path="
+                    << innobase_temp_data_file_path;
+        DBUG_RETURN(HA_ERR_INITIALIZATION);
+    }
+
+    /* Perform all sanity checks before we take the action of deleting files. */
+    if (srv_sys_space.intersection(&srv_tmp_space)) {
+        sql_print_error("innodb_temporary and innodb_system"
+                        " file names seem to be the same.");
+        DBUG_RETURN(HA_ERR_INITIALIZATION);
+    }
+
+    srv_sys_space.normalize_size();
+    srv_tmp_space.normalize_size();
+
+    /* ------------ UNDO tablespaces files ---------------------*/
+    if (!srv_undo_dir) {
+        srv_undo_dir = const_cast<char*>(fil_path_to_mysql_datadir);
+    }
+
+    if (strchr(srv_undo_dir, ';')) {
+        sql_print_error("syntax error in innodb_undo_directory");
+        DBUG_RETURN(HA_ERR_INITIALIZATION);
+    }
+
+    /* -------------- All log files ---------------------------*/
+
+    /* The default dir for log files is the datadir of MySQL */
+
+    if (!srv_log_group_home_dir) {
+        srv_log_group_home_dir
+            = const_cast<char*>(fil_path_to_mysql_datadir);
+    }
+
+    if (strchr(srv_log_group_home_dir, ';')) {
+        sql_print_error("syntax error in innodb_log_group_home_dir");
+        DBUG_RETURN(HA_ERR_INITIALIZATION);
+    }
+
+    DBUG_ASSERT(innodb_change_buffering <= IBUF_USE_ALL);
+
+    /* Check that interdependent parameters have sane values. */
+    if (srv_max_buf_pool_modified_pct < srv_max_dirty_pages_pct_lwm) {
+        sql_print_warning("InnoDB: innodb_max_dirty_pages_pct_lwm"
+                          " cannot be set higher than"
+                          " innodb_max_dirty_pages_pct.\n"
+                          "InnoDB: Setting"
+                          " innodb_max_dirty_pages_pct_lwm to %lf\n",
+                          srv_max_buf_pool_modified_pct);
+
+        srv_max_dirty_pages_pct_lwm = srv_max_buf_pool_modified_pct;
+    }
+
+    if (srv_max_io_capacity == SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT) {
+
+        if (srv_io_capacity >= SRV_MAX_IO_CAPACITY_LIMIT / 2) {
+            /* Avoid overflow. */
+            srv_max_io_capacity = SRV_MAX_IO_CAPACITY_LIMIT;
+        } else {
+            /* The user has not set the value. We should
+            set it based on innodb_io_capacity. */
+            srv_max_io_capacity =
+                ut_max(2 * srv_io_capacity, 2000UL);
+        }
+
+    } else if (srv_max_io_capacity < srv_io_capacity) {
+        sql_print_warning("InnoDB: innodb_io_capacity"
+                          " cannot be set higher than"
+                          " innodb_io_capacity_max."
+                          " Setting innodb_io_capacity=%lu",
+                          srv_max_io_capacity);
+
+        srv_io_capacity = srv_max_io_capacity;
+    }
+
+    if (UNIV_PAGE_SIZE_DEF != srv_page_size) {
+        ib::info() << "innodb_page_size=" << srv_page_size;
+
+        srv_max_undo_log_size = std::max(
+            srv_max_undo_log_size,
+            ulonglong(SRV_UNDO_TABLESPACE_SIZE_IN_PAGES)
+            << srv_page_size_shift);
+    }
+
+    srv_buf_pool_size = ulint(innobase_buffer_pool_size);
+
+    if (innobase_open_files < 10) {
+        innobase_open_files = 300;
+        if (srv_file_per_table && tc_size > 300 && tc_size < open_files_limit) {
+            innobase_open_files = tc_size;
+        }
+    }
+
+    if (innobase_open_files > open_files_limit) {
+        ib::warn() << "innodb_open_files " << innobase_open_files
+                   << " should not be greater"
+                   << " than the open_files_limit " << open_files_limit;
+        if (innobase_open_files > tc_size) {
+            innobase_open_files = tc_size;
+        }
+    }
+
+    srv_max_n_open_files = innobase_open_files;
+    srv_innodb_status = (ibool) innobase_create_status_file;
+
+    srv_print_verbose_log = mysqld_embedded ? 0 : 1;
+
+    /* Round up fts_sort_pll_degree to the nearest power of 2 */
+    for (num_pll_degree = 1;
+         num_pll_degree < fts_sort_pll_degree;
+         num_pll_degree <<= 1) {
+
+        /* No op */
+    }
+
+    fts_sort_pll_degree = num_pll_degree;
+
+    /* Store the default charset-collation number of this MySQL
+    installation */
+
+    data_mysql_default_charset_coll = (ulint) default_charset_info->number;
+
+#ifndef _WIN32
+    if (srv_use_atomic_writes && my_may_have_atomic_write) {
+        /*
+          Force O_DIRECT on Unixes (on Windows writes are always
+          unbuffered)
+        */
+        switch (srv_file_flush_method) {
+        case SRV_O_DIRECT:
+        case SRV_O_DIRECT_NO_FSYNC:
+            break;
+        default:
+            srv_file_flush_method = SRV_O_DIRECT;
+            fprintf(stderr, "InnoDB: using O_DIRECT due to atomic writes.\n");
+        }
+    }
+#endif
+
+#if defined __linux__ || defined _WIN32
+    if (srv_flush_log_at_trx_commit == 2) {
+        /* Do not disable the file system cache if
+        innodb_flush_log_at_trx_commit=2. */
+        log_sys.log_buffered = true;
+    }
+#endif
+
+    if (srv_read_only_mode) {
+        ib::info() << "Started in read only mode";
+        srv_use_doublewrite_buf = FALSE;
+    }
+
+#if !defined LINUX_NATIVE_AIO && !defined HAVE_URING && !defined _WIN32
+    /* Currently native AIO is supported only on Windows and Linux,
+    and only when the support is compiled in. In all other
+    cases, we ignore the setting of innodb_use_native_aio. */
+    srv_use_native_aio = FALSE;
+#endif
+#ifdef HAVE_URING
+    if (srv_use_native_aio && io_uring_may_be_unsafe) {
+        sql_print_warning("innodb_use_native_aio may cause "
+                          "hangs with this kernel %s; see "
+                          "https://jira.mariadb.org/browse/MDEV-26674",
+                          io_uring_may_be_unsafe);
+    }
+#endif
+
+#ifndef _WIN32
+    ut_ad(srv_file_flush_method <= SRV_O_DIRECT_NO_FSYNC);
+#else
+    switch (srv_file_flush_method) {
+    case SRV_ALL_O_DIRECT_FSYNC + 1 /* "async_unbuffered"="unbuffered" */:
+        srv_file_flush_method = SRV_ALL_O_DIRECT_FSYNC;
+        break;
+    case SRV_ALL_O_DIRECT_FSYNC + 2 /* "normal"="fsync" */:
+        srv_file_flush_method = SRV_FSYNC;
+        break;
+    default:
+        ut_ad(srv_file_flush_method <= SRV_ALL_O_DIRECT_FSYNC);
+    }
+#endif
+    innodb_buffer_pool_size_init();
+
+    srv_lock_table_size = 5 * (srv_buf_pool_size >> srv_page_size_shift);
+    DBUG_RETURN(0);
+}
+
+/** Initialize the InnoDB storage engine plugin.
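+Called once when the server loads the plugin: it fills in the handlerton
+callbacks and then starts InnoDB via srv_start().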
+@param[in,out]	p	InnoDB handlerton
+@return error code
+@retval 0 on success */
+static int innodb_init(void* p)
+{
+    DBUG_ENTER("innodb_init");
+    handlerton* innobase_hton= static_cast<handlerton*>(p);
+    innodb_hton_ptr = innobase_hton;
+
+    innobase_hton->db_type = DB_TYPE_INNODB;
+    innobase_hton->savepoint_offset = sizeof(trx_named_savept_t);
+    innobase_hton->close_connection = innobase_close_connection;
+    innobase_hton->kill_query = innobase_kill_query;
+    innobase_hton->savepoint_set = innobase_savepoint;
+    innobase_hton->savepoint_rollback = innobase_rollback_to_savepoint;
+
+    innobase_hton->savepoint_rollback_can_release_mdl =
+        innobase_rollback_to_savepoint_can_release_mdl;
+
+    innobase_hton->savepoint_release = innobase_release_savepoint;
+    innobase_hton->prepare_ordered= NULL;
+    innobase_hton->commit_ordered= innobase_commit_ordered;
+    innobase_hton->commit = innobase_commit;
+    innobase_hton->rollback = innobase_rollback;
+    innobase_hton->prepare = innobase_xa_prepare;
+    innobase_hton->recover = innobase_xa_recover;
+    innobase_hton->commit_by_xid = innobase_commit_by_xid;
+    innobase_hton->rollback_by_xid = innobase_rollback_by_xid;
+    innobase_hton->commit_checkpoint_request = innodb_log_flush_request;
+    innobase_hton->create = innobase_create_handler;
+
+    innobase_hton->drop_database = innodb_drop_database;
+    innobase_hton->panic = innobase_end;
+    innobase_hton->pre_shutdown = innodb_preshutdown;
+
+    innobase_hton->start_consistent_snapshot =
+        innobase_start_trx_and_assign_read_view;
+
+    innobase_hton->flush_logs = innobase_flush_logs;
+    innobase_hton->show_status = innobase_show_status;
+    innobase_hton->notify_tabledef_changed= innodb_notify_tabledef_changed;
+    innobase_hton->flags =
+        HTON_SUPPORTS_EXTENDED_KEYS | HTON_SUPPORTS_FOREIGN_KEYS |
+        HTON_NATIVE_SYS_VERSIONING |
+        HTON_WSREP_REPLICATION |
+        HTON_REQUIRES_CLOSE_AFTER_TRUNCATE |
+        HTON_TRUNCATE_REQUIRES_EXCLUSIVE_USE |
+        HTON_REQUIRES_NOTIFY_TABLEDEF_CHANGED_AFTER_COMMIT;
+
+#ifdef WITH_WSREP
+    innobase_hton->abort_transaction=wsrep_abort_transaction;
+    innobase_hton->set_checkpoint=innobase_wsrep_set_checkpoint;
+    innobase_hton->get_checkpoint=innobase_wsrep_get_checkpoint;
+    innobase_hton->disable_internal_writes=innodb_disable_internal_writes;
+#endif /* WITH_WSREP */
+
+    innobase_hton->check_version = innodb_check_version;
+    innobase_hton->signal_ddl_recovery_done = innodb_ddl_recovery_done;
+
+    innobase_hton->tablefile_extensions = ha_innobase_exts;
+    innobase_hton->table_options = innodb_table_option_list;
+
+    /* System Versioning */
+    innobase_hton->prepare_commit_versioned
+        = innodb_prepare_commit_versioned;
+
+    innodb_remember_check_sysvar_funcs();
+
+    compile_time_assert(DATA_MYSQL_TRUE_VARCHAR == MYSQL_TYPE_VARCHAR);
+
+#ifndef DBUG_OFF
+    static const char test_filename[] = "-@";
+    char test_tablename[sizeof test_filename
+                        + sizeof(srv_mysql50_table_name_prefix) - 1];
+    DBUG_ASSERT(sizeof test_tablename - 1
+                == filename_to_tablename(test_filename,
+                                         test_tablename,
+                                         sizeof test_tablename, true));
+    DBUG_ASSERT(!strncmp(test_tablename,
+                         srv_mysql50_table_name_prefix,
+                         sizeof srv_mysql50_table_name_prefix - 1));
+    DBUG_ASSERT(!strcmp(test_tablename
+                        + sizeof srv_mysql50_table_name_prefix - 1,
+                        test_filename));
+#endif /* DBUG_OFF */
+
+    os_file_set_umask(my_umask);
+
+    /* Set up the memory alloc/free tracing mechanisms before calling
+    any functions that could possibly allocate memory. */
+    ut_new_boot();
+
+    if (int error = innodb_init_params()) {
+        DBUG_RETURN(error);
+    }
+
+    /* After this point, error handling has to use
+    innodb_init_abort(). */
+
+#ifdef HAVE_PSI_INTERFACE
+    /* Register keys with MySQL performance schema */
+    int count;
+
+# ifdef UNIV_PFS_MUTEX
+    count = array_elements(all_innodb_mutexes);
+    mysql_mutex_register("innodb", all_innodb_mutexes, count);
+# endif /* UNIV_PFS_MUTEX */
+
+# ifdef UNIV_PFS_RWLOCK
+    count = array_elements(all_innodb_rwlocks);
+    mysql_rwlock_register("innodb", all_innodb_rwlocks, count);
+# endif /* UNIV_PFS_RWLOCK */
+
+# ifdef UNIV_PFS_THREAD
+    count = array_elements(all_innodb_threads);
+    mysql_thread_register("innodb", all_innodb_threads, count);
+# endif /* UNIV_PFS_THREAD */
+
+# ifdef UNIV_PFS_IO
+    count = array_elements(all_innodb_files);
+    mysql_file_register("innodb", all_innodb_files, count);
+# endif /* UNIV_PFS_IO */
+#endif /* HAVE_PSI_INTERFACE */
+
+    bool create_new_db = false;
+
+    /* Check whether the data files exist. */
+    dberr_t err = srv_sys_space.check_file_spec(&create_new_db, 5U << 20);
+
+    if (err != DB_SUCCESS) {
+        DBUG_RETURN(innodb_init_abort());
+    }
+
+    err = srv_start(create_new_db);
+
+    if (err != DB_SUCCESS) {
+        innodb_shutdown();
+        DBUG_RETURN(innodb_init_abort());
+    }
+
+    srv_was_started = true;
+    innodb_params_adjust();
+
+    innobase_old_blocks_pct = buf_LRU_old_ratio_update(
+        innobase_old_blocks_pct, true);
+
+    ibuf_max_size_update(srv_change_buffer_max_size);
+
+    mysql_mutex_init(pending_checkpoint_mutex_key,
+                     &log_requests.mutex,
+                     MY_MUTEX_INIT_FAST);
+#ifdef MYSQL_DYNAMIC_PLUGIN
+    if (innobase_hton != p) {
+        innobase_hton = reinterpret_cast<handlerton*>(p);
+        *innobase_hton = *innodb_hton_ptr;
+    }
+#endif /* MYSQL_DYNAMIC_PLUGIN */
+
+    memset(innodb_counter_value, 0, sizeof innodb_counter_value);
+
+    /* Do this as late as possible so that the server has fully started
+    up, since we might get some initial stats if the user chooses to
+    turn on some counters from startup */
+    if (innobase_enable_monitor_counter) {
+        innodb_enable_monitor_at_startup(
+            innobase_enable_monitor_counter);
+    }
+
+    /* Turn on monitor counters that are default on */
+    srv_mon_default_on();
+
+    /* Unit Tests */
+#ifdef UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR
+    unit_test_os_file_get_parent_dir();
+#endif /* UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR */
+
+#ifdef UNIV_ENABLE_UNIT_TEST_MAKE_FILEPATH
+    test_make_filepath();
+#endif /* UNIV_ENABLE_UNIT_TEST_MAKE_FILEPATH */
+
+#ifdef UNIV_ENABLE_DICT_STATS_TEST
+    test_dict_stats_all();
+#endif /* UNIV_ENABLE_DICT_STATS_TEST */
+
+#ifdef UNIV_ENABLE_UNIT_TEST_ROW_RAW_FORMAT_INT
+# ifdef HAVE_UT_CHRONO_T
+    test_row_raw_format_int();
+# endif /* HAVE_UT_CHRONO_T */
+#endif /* UNIV_ENABLE_UNIT_TEST_ROW_RAW_FORMAT_INT */
+
+    DBUG_RETURN(0);
+}
+
+/** Shut down the InnoDB storage engine.
+@return	0 */
+static
+int
+innobase_end(handlerton*, ha_panic_function)
+{
+    DBUG_ENTER("innobase_end");
+
+    if (srv_was_started) {
+        THD *thd= current_thd;
+        if (thd) { // may be UNINSTALL PLUGIN statement
+            if (trx_t* trx = thd_to_trx(thd)) {
+                trx->free();
+            }
+        }
+
+        innodb_shutdown();
+        mysql_mutex_destroy(&log_requests.mutex);
+    }
+
+    DBUG_RETURN(0);
+}
+
+/*****************************************************************//**
+Commits a transaction in an InnoDB database. */
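+/* The actual work happens in trx_commit_for_mysql(); if the transaction
+was never started, only its state flags are reset. */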
+void
+innobase_commit_low(
+/*================*/
+    trx_t*  trx)    /*!< in: transaction handle */
+{
+#ifdef WITH_WSREP
+    const char* tmp = 0;
+    const bool is_wsrep = trx->is_wsrep();
+    if (is_wsrep) {
+        tmp = thd_proc_info(trx->mysql_thd, "innobase_commit_low()");
+    }
+#endif /* WITH_WSREP */
+    if (trx_is_started(trx)) {
+        trx_commit_for_mysql(trx);
+    } else {
+        trx->will_lock = false;
+#ifdef WITH_WSREP
+        trx->wsrep = false;
+#endif /* WITH_WSREP */
+    }
+
+#ifdef WITH_WSREP
+    if (is_wsrep) {
+        thd_proc_info(trx->mysql_thd, tmp);
+    }
+#endif /* WITH_WSREP */
+}
+
+/*****************************************************************//**
+Creates an InnoDB transaction struct for the thd if it does not yet have one.
+Starts a new InnoDB transaction if a transaction is not yet started, and
+assigns a new snapshot for a consistent read if the transaction does not yet
+have one.
+@return	0 */
+static
+int
+innobase_start_trx_and_assign_read_view(
+/*====================================*/
+    handlerton* hton,   /*!< in: InnoDB handlerton */
+    THD*        thd)    /*!< in: MySQL thread handle of the user for
+                        whom the transaction is started */
+{
+    DBUG_ENTER("innobase_start_trx_and_assign_read_view");
+    DBUG_ASSERT(hton == innodb_hton_ptr);
+
+    /* Create a new trx struct for thd, if it does not yet have one */
+
+    trx_t*  trx = check_trx_exists(thd);
+
+    /* The transaction should not be active yet, start it */
+
+    ut_ad(!trx_is_started(trx));
+
+    trx_start_if_not_started_xa(trx, false);
+
+    /* Assign a read view if the transaction does not have it yet.
+    Do this only if the transaction is using the REPEATABLE READ
+    isolation level. */
+    trx->isolation_level = innobase_map_isolation_level(
+        thd_get_trx_isolation(thd));
+
+    if (trx->isolation_level == TRX_ISO_REPEATABLE_READ) {
+        trx->read_view.open(trx);
+    } else {
+        push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+                            HA_ERR_UNSUPPORTED,
+                            "InnoDB: WITH CONSISTENT SNAPSHOT"
+                            " was ignored because this phrase"
+                            " can only be used with"
+                            " REPEATABLE READ isolation level.");
+    }
+
+    /* Set the MySQL flag to mark that there is an active transaction */
+
+    innobase_register_trx(hton, current_thd, trx);
+
+    DBUG_RETURN(0);
+}
+
+static
+void
+innobase_commit_ordered_2(
+/*======================*/
+    trx_t*  trx,    /*!< in: InnoDB transaction */
+    THD*    thd)    /*!< in: MySQL thread handle */
+{
+    DBUG_ENTER("innobase_commit_ordered_2");
+
+    if (trx->id) {
+        /* The following call reads the binary log position of
+        the transaction being committed.
+
+        Binary logging of other engines is not relevant to
+        InnoDB as all InnoDB requires is that committing
+        InnoDB transactions appear in the same order in the
+        MySQL binary log as they appear in InnoDB logs, which
+        is guaranteed by the server.
+
+        If the binary log is not enabled, or the transaction
+        is not written to the binary log, the file name will
+        be a NULL pointer. */
+        thd_binlog_pos(thd, &trx->mysql_log_file_name,
+                       &trx->mysql_log_offset);
+
+        /* Don't do write + flush right now. For group commit
+        to work we want to do the flush later. */
+        trx->flush_log_later = true;
+    }
+
+#ifdef WITH_WSREP
+    /* If the transaction is not run in 2pc, we must assign the wsrep
+    XID here in order to get it written to the rollback segment. */
+    if (trx->is_wsrep()) {
+        thd_get_xid(thd, &reinterpret_cast<MYSQL_XID&>(trx->xid));
+    }
+#endif /* WITH_WSREP */
+
+    innobase_commit_low(trx);
+    trx->mysql_log_file_name = NULL;
+    trx->flush_log_later = false;
+
+    DBUG_VOID_RETURN;
+}
+
+/*****************************************************************//**
+Perform the first, fast part of InnoDB commit.
+
+Doing it in this call ensures that we get the same commit order here
+as in binlog and any other participating transactional storage engines.
+
+Note that we want to do as little as really needed here, as we run
+under a global mutex. The expensive fsync() is done later, in
+innobase_commit(), without a lock so group commit can take place.
+
+Note also that this method can be called from a different thread than
+the one handling the rest of the transaction. */
+static
+void
+innobase_commit_ordered(
+/*====================*/
+    handlerton  *hton,  /*!< in: InnoDB handlerton */
+    THD*        thd,    /*!< in: MySQL thread handle of the user for whom
+                        the transaction should be committed */
+    bool        all)    /*!< in: TRUE - commit transaction
+                        FALSE - the current SQL statement ended */
+{
+    trx_t*  trx;
+    DBUG_ENTER("innobase_commit_ordered");
+    DBUG_ASSERT(hton == innodb_hton_ptr);
+
+    trx = check_trx_exists(thd);
+
+    if (!trx_is_registered_for_2pc(trx) && trx_is_started(trx)) {
+        /* We cannot throw error here; instead we will catch this error
+        again in innobase_commit() and report it from there. */
+        DBUG_VOID_RETURN;
+    }
+
+    /* commit_ordered is only called when committing the whole transaction
+    (or an SQL statement when autocommit is on). */
+    DBUG_ASSERT(all ||
+        (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)));
+
+    innobase_commit_ordered_2(trx, thd);
+    trx->active_commit_ordered = true;
+
+    DBUG_VOID_RETURN;
+}
+
+/** Mark the end of a statement.
+@param trx	transaction
+@return whether an error occurred */
+static bool end_of_statement(trx_t *trx)
+{
+  trx_mark_sql_stat_end(trx);
+  if (UNIV_LIKELY(trx->error_state == DB_SUCCESS))
+    return false;
+
+  trx_savept_t savept;
+  savept.least_undo_no= 0;
+  trx->rollback(&savept);
+  /* MariaDB will roll back the entire transaction. */
+  trx->bulk_insert= false;
+  trx->last_sql_stat_start.least_undo_no= 0;
+  trx->savepoints_discard();
+  return true;
+}
+
+/*****************************************************************//**
+Commits a transaction in an InnoDB database or marks an SQL statement
+ended.
+@return 0 or deadlock error if the transaction was aborted by another
+	higher priority transaction. */
+static
+int
+innobase_commit(
+/*============*/
+    handlerton* hton,   /*!< in: InnoDB handlerton */
+    THD*        thd,    /*!< in: MySQL thread handle of the
+                        user for whom the transaction should
+                        be committed */
+    bool        commit_trx) /*!< in: true - commit transaction
+                        false - the current SQL statement
+                        ended */
+{
+    DBUG_ENTER("innobase_commit");
+    DBUG_PRINT("enter", ("commit_trx: %d", commit_trx));
+    DBUG_ASSERT(hton == innodb_hton_ptr);
+    DBUG_PRINT("trans", ("ending transaction"));
+
+    trx_t*  trx = check_trx_exists(thd);
+
+    ut_ad(!trx->dict_operation_lock_mode);
+    ut_ad(!trx->dict_operation);
+
+    /* The transaction is deregistered only in a commit or a rollback. If
+    it is deregistered we know there cannot be resources to be freed
+    and we could return immediately. For the time being, we play safe
+    and do the cleanup though there should be nothing to clean up. */
+
+    if (!trx_is_registered_for_2pc(trx) && trx_is_started(trx)) {
+
+        sql_print_error("Transaction not registered for MariaDB 2PC,"
+                        " but transaction is active");
+    }
+
+    bool read_only = trx->read_only || trx->id == 0;
+    DBUG_PRINT("info", ("readonly: %d", read_only));
+
+    if (commit_trx
+        || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) {
+
+        /* Run the fast part of commit if we did not already. */
+        if (!trx->active_commit_ordered) {
+            innobase_commit_ordered_2(trx, thd);
+        }
+
+        /* We were instructed to commit the whole transaction, or
+        this is an SQL statement end and autocommit is on */
+
+        /* At this point the commit order is fixed and the transaction is
+        visible to others. So we can wake up other commits waiting for
+        this one, to allow them to group commit with us. */
+        thd_wakeup_subsequent_commits(thd, 0);
+
+        /* Now do a write + flush of logs. */
+        trx_commit_complete_for_mysql(trx);
+
+        trx_deregister_from_2pc(trx);
+    } else {
+        /* We just mark the SQL statement ended and do not do a
+        transaction commit */
+
+        /* If we had reserved the auto-inc lock for some
+        table in this SQL statement we release it now */
+
+        if (!read_only) {
+            lock_unlock_table_autoinc(trx);
+        }
+
+        /* Store the current undo_no of the transaction so that we
+        know where to roll back if we have to roll back the next
+        SQL statement */
+        if (UNIV_UNLIKELY(end_of_statement(trx))) {
+            DBUG_RETURN(1);
+        }
+    }
+
+    /* Reset the number of AUTO-INC rows required */
+    trx->n_autoinc_rows = 0;
+
+    /* This is a statement level variable. */
+    trx->fts_next_doc_id = 0;
+
+    DBUG_RETURN(0);
+}
+
+/*****************************************************************//**
+Rolls back a transaction or the latest SQL statement.
+@return 0 or error number */
+static
+int
+innobase_rollback(
+/*==============*/
+    handlerton* hton,   /*!< in: InnoDB handlerton */
+    THD*        thd,    /*!< in: handle to the MySQL thread
+                        of the user whose transaction should
+                        be rolled back */
+    bool        rollback_trx)   /*!< in: TRUE - rollback entire
+                        transaction FALSE - rollback the current
+                        statement only */
+{
+    DBUG_ENTER("innobase_rollback");
+    DBUG_ASSERT(hton == innodb_hton_ptr);
+    DBUG_PRINT("trans", ("aborting transaction"));
+
+    trx_t*  trx = check_trx_exists(thd);
+
+    ut_ad(!trx->dict_operation_lock_mode);
+    ut_ad(!trx->dict_operation);
+
+    /* Reset the number of AUTO-INC rows required */
+
+    trx->n_autoinc_rows = 0;
+
+    /* If we had reserved the auto-inc lock for some table (if
+    we come here to roll back the latest SQL statement) we
+    release it now before a possibly lengthy rollback */
+    lock_unlock_table_autoinc(trx);
+
+    /* This is a statement level variable. */
+
+    trx->fts_next_doc_id = 0;
+
+    dberr_t error;
+
+#ifdef WITH_WSREP
+    /* If the trx was assigned a wsrep XID in the prepare phase and the
+    trx is being rolled back due to a BF abort, clear the XID in order
+    to avoid writing it to the rollback segment out of order. The XID
+    will be reassigned when the transaction is replayed. */
+    if (trx->state != TRX_STATE_NOT_STARTED
+        && wsrep_is_wsrep_xid(&trx->xid)) {
+        trx->xid.null();
+    }
+#endif /* WITH_WSREP */
+    if (rollback_trx
+        || !thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
+
+        error = trx_rollback_for_mysql(trx);
+
+        trx_deregister_from_2pc(trx);
+    } else {
+
+        error = trx_rollback_last_sql_stat_for_mysql(trx);
+    }
+
+    DBUG_RETURN(convert_error_code_to_mysql(error, 0, trx->mysql_thd));
+}
+
+/*****************************************************************//**
+Rolls back a transaction
+@return 0 or error number */
+static
+int
+innobase_rollback_trx(
+/*==================*/
+    trx_t*  trx)    /*!< in: transaction */
+{
+    DBUG_ENTER("innobase_rollback_trx");
+    DBUG_PRINT("trans", ("aborting transaction"));
+
+    /* If we had reserved the auto-inc lock for some table (if
+    we come here to roll back the latest SQL statement) we
+    release it now before a possibly lengthy rollback */
+    lock_unlock_table_autoinc(trx);
+    trx_deregister_from_2pc(trx);
+
+    DBUG_RETURN(convert_error_code_to_mysql(trx_rollback_for_mysql(trx),
+                                            0, trx->mysql_thd));
+}
+
+/** Invoke commit_checkpoint_notify_ha() on completed log flush requests.
+@param pending	log_requests.start
+@param lsn	log_sys.get_flushed_lsn() */
+static void log_flush_notify_and_unlock(log_flush_request *pending, lsn_t lsn)
+{
+  mysql_mutex_assert_owner(&log_requests.mutex);
+  ut_ad(pending == log_requests.start.load(std::memory_order_relaxed));
+  log_flush_request *entry= pending, *last= nullptr;
+  /* Process the first requests that have been completed. Since
+  the list is not necessarily in ascending order of LSN, we may
+  fail to notify some requests that have already been completed.
+  But there is no harm in delaying notifications for those a bit.
+  And in practice, the list is unlikely to have more than one
+  element anyway, because the redo log would be flushed every
+  srv_flush_log_at_timeout seconds (1 by default). */
+  for (; entry && entry->lsn <= lsn; last= entry, entry= entry->next);
+
+  if (!last)
+  {
+    mysql_mutex_unlock(&log_requests.mutex);
+    return;
+  }
+
+  /* Detach the head of the list that corresponds to persisted log writes. */
+  if (!entry)
+    log_requests.end= entry;
+  log_requests.start.store(entry, std::memory_order_relaxed);
+  mysql_mutex_unlock(&log_requests.mutex);
+
+  /* Now that we have released the mutex, notify the submitters
+  and free the head of the list. */
+  do
+  {
+    entry= pending;
+    pending= pending->next;
+    commit_checkpoint_notify_ha(entry->cookie);
+    my_free(entry);
+  }
+  while (entry != last);
+}
+
+/** Invoke commit_checkpoint_notify_ha() to notify that outstanding
+log writes have been completed. */
+void log_flush_notify(lsn_t flush_lsn)
+{
+  if (auto pending= log_requests.start.load(std::memory_order_acquire))
+  {
+    mysql_mutex_lock(&log_requests.mutex);
+    pending= log_requests.start.load(std::memory_order_relaxed);
+    log_flush_notify_and_unlock(pending, flush_lsn);
+  }
+}
+
+/** Handle a commit checkpoint request from the server layer.
+We put the request in a queue, so that we can notify the upper layer about
+checkpoint completion when we have flushed the redo log.
+If we have already flushed all relevant redo log, we notify immediately. */
+static void innodb_log_flush_request(void *cookie)
+{
+  lsn_t flush_lsn= log_sys.get_flushed_lsn();
+  /* Load lsn relaxed after flush_lsn was loaded from the same cache line */
+  const lsn_t lsn= log_sys.get_lsn();
+
+  if (flush_lsn >= lsn)
+    /* All log is already persistent. */;
+  else if (UNIV_UNLIKELY(srv_force_recovery >= SRV_FORCE_NO_BACKGROUND))
+    /* Normally, srv_master_callback() should periodically invoke
+    srv_sync_log_buffer_in_background(), which should initiate a log
+    flush about once every srv_flush_log_at_timeout seconds. But,
+    starting with the innodb_force_recovery=2 level, that background
+    task will not run. */
+    log_write_up_to(flush_lsn= lsn, true);
+  else if (log_flush_request *req= static_cast<log_flush_request*>
+           (my_malloc(PSI_INSTRUMENT_ME, sizeof *req, MYF(MY_WME))))
+  {
+    req->next= nullptr;
+    req->cookie= cookie;
+    req->lsn= lsn;
+
+    log_flush_request *start= nullptr;
+
+    mysql_mutex_lock(&log_requests.mutex);
+    /* In order to prevent a race condition where log_flush_notify()
+    would miss a notification, we must update log_requests.start from
+    nullptr (empty) to the first req using std::memory_order_release. */
+    if (log_requests.start.compare_exchange_strong(start, req,
+                                                   std::memory_order_release,
+                                                   std::memory_order_relaxed))
+    {
+      ut_ad(!log_requests.end);
+      start= req;
+      /* In case log_flush_notify() executed
+      log_requests.start.load(std::memory_order_acquire) right before
+      our successful compare_exchange, we must re-read flush_lsn to
+      ensure that our request will be notified immediately if applicable. */
+      flush_lsn= log_sys.get_flushed_lsn();
+    }
+    else
+    {
+      /* Append the entry to the list. Because we determined req->lsn before
+      acquiring the mutex, this list may not be ordered by req->lsn,
+      even though log_flush_notify_and_unlock() assumes so. */
+      log_requests.end->next= req;
+    }
+
+    log_requests.end= req;
+
+    /* This hopefully addresses the hang that was reported in MDEV-24302.
+    Upon receiving a new request, we will notify old requests of
+    completion. */
+    log_flush_notify_and_unlock(start, flush_lsn);
+    return;
+  }
+  else
+    sql_print_error("Failed to allocate %zu bytes."
+                    " Commit checkpoint will be skipped.", sizeof *req);
+
+  /* This hopefully addresses the hang that was reported in MDEV-24302.
+  Upon receiving a new request to notify of log writes becoming
+  persistent, we will notify old requests of completion. Note:
+  log_flush_notify() may skip some notifications because it is
+  basically assuming that the list is in ascending order of LSN. */
+  log_flush_notify(flush_lsn);
+  commit_checkpoint_notify_ha(cookie);
+}
+
+/*****************************************************************//**
+Rolls back a transaction to a savepoint.
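+The InnoDB savepoint name is derived from the savepoint data pointer:
+longlong2str() below renders the pointer value in base 36, so, as an
+illustration, a pointer value of 0x2a would yield the name "16".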
+@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the
+given name */
+static
+int
+innobase_rollback_to_savepoint(
+/*===========================*/
+    handlerton* hton,   /*!< in: InnoDB handlerton */
+    THD*        thd,    /*!< in: handle to the MySQL thread
+                        of the user whose transaction should
+                        be rolled back to savepoint */
+    void*       savepoint)  /*!< in: savepoint data */
+{
+    DBUG_ENTER("innobase_rollback_to_savepoint");
+    DBUG_ASSERT(hton == innodb_hton_ptr);
+
+    trx_t*  trx = check_trx_exists(thd);
+
+    /* TODO: use provided savepoint data area to store savepoint data */
+
+    char    name[64];
+
+    longlong2str(longlong(savepoint), name, 36);
+
+    int64_t mysql_binlog_cache_pos;
+
+    dberr_t error = trx_rollback_to_savepoint_for_mysql(
+        trx, name, &mysql_binlog_cache_pos);
+
+    if (error == DB_SUCCESS && trx->fts_trx != NULL) {
+        fts_savepoint_rollback(trx, name);
+    }
+
+    DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
+}
+
+/*****************************************************************//**
+Check whether InnoDB state allows safely releasing MDL locks after
+rollback to savepoint.
+When binlog is on, MDL locks acquired after the savepoint are not
+released if there are any locks held in InnoDB.
+@return true if it is safe, false if it is not safe. */
+static
+bool
+innobase_rollback_to_savepoint_can_release_mdl(
+/*===========================================*/
+    handlerton* hton,   /*!< in: InnoDB handlerton */
+    THD*        thd)    /*!< in: handle to the MySQL thread
+                        of the user whose transaction should
+                        be rolled back to savepoint */
+{
+    DBUG_ENTER("innobase_rollback_to_savepoint_can_release_mdl");
+    DBUG_ASSERT(hton == innodb_hton_ptr);
+
+    trx_t*  trx = check_trx_exists(thd);
+
+    /* If the transaction has not acquired any locks then it is safe
+    to release MDL after rollback to savepoint */
+    if (UT_LIST_GET_LEN(trx->lock.trx_locks) == 0) {
+
+        DBUG_RETURN(true);
+    }
+
+    DBUG_RETURN(false);
+}
+
+/*****************************************************************//**
+Release transaction savepoint name.
+@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the
+given name */
+static
+int
+innobase_release_savepoint(
+/*=======================*/
+    handlerton* hton,   /*!< in: handlerton for InnoDB */
+    THD*        thd,    /*!< in: handle to the MySQL thread
+                        of the user whose transaction's
+                        savepoint should be released */
+    void*       savepoint)  /*!< in: savepoint data */
+{
+    dberr_t error;
+    trx_t*  trx;
+    char    name[64];
+
+    DBUG_ENTER("innobase_release_savepoint");
+    DBUG_ASSERT(hton == innodb_hton_ptr);
+
+    trx = check_trx_exists(thd);
+
+    /* TODO: use provided savepoint data area to store savepoint data */
+
+    longlong2str(longlong(savepoint), name, 36);
+
+    error = trx_release_savepoint_for_mysql(trx, name);
+
+    if (error == DB_SUCCESS && trx->fts_trx != NULL) {
+        fts_savepoint_release(trx, name);
+    }
+
+    DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
+}
+
+/*****************************************************************//**
+Sets a transaction savepoint.
+@return always 0, that is, always succeeds */
+static
+int
+innobase_savepoint(
+/*===============*/
+    handlerton* hton,   /*!< in: handle to the InnoDB handlerton */
+    THD*        thd,    /*!< in: handle to the MySQL thread */
+    void*       savepoint)/*!< in: savepoint data */
+{
+    DBUG_ENTER("innobase_savepoint");
+    DBUG_ASSERT(hton == innodb_hton_ptr);
+
+    /* In autocommit mode there is no sense in setting a savepoint
+    (unless we are in a sub-statement), so the SQL layer ensures that
+    this method is never called in such a situation. */
+
+    trx_t*  trx = check_trx_exists(thd);
+
+    /* Cannot happen outside of transaction */
+    DBUG_ASSERT(trx_is_registered_for_2pc(trx));
+
+    /* TODO: use provided savepoint data area to store savepoint data */
+    char    name[64];
+
+    longlong2str(longlong(savepoint), name, 36);
+
+    dberr_t error = trx_savepoint_for_mysql(trx, name, 0);
+
+    if (error == DB_SUCCESS && trx->fts_trx != NULL) {
+        fts_savepoint_take(trx->fts_trx, name);
+    }
+
+    DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
+}
+
+/**
+  Frees a possible InnoDB trx object associated with the current THD.
+
+  @param hton	innobase handlerton
+  @param thd	server thread descriptor, whose resources should be freed
+
+  @return 0 always
+*/
+static int innobase_close_connection(handlerton *hton, THD *thd)
+{
+  DBUG_ASSERT(hton == innodb_hton_ptr);
+  if (auto trx= thd_to_trx(thd))
+  {
+    thd_set_ha_data(thd, innodb_hton_ptr, NULL);
+    if (trx->state == TRX_STATE_PREPARED && trx->has_logged_persistent())
+    {
+      trx_disconnect_prepared(trx);
+      return 0;
+    }
+    innobase_rollback_trx(trx);
+    trx->free();
+    DEBUG_SYNC(thd, "innobase_connection_closed");
+  }
+  return 0;
+}
+
+/** Cancel any pending lock request associated with the current THD.
+@sa THD::awake() @sa ha_kill_query() */
+static void innobase_kill_query(handlerton*, THD *thd, enum thd_kill_levels)
+{
+  DBUG_ENTER("innobase_kill_query");
+
+  if (trx_t* trx= thd_to_trx(thd))
+  {
+    ut_ad(trx->mysql_thd == thd);
+    mysql_mutex_lock(&lock_sys.wait_mutex);
+    lock_t *lock= trx->lock.wait_lock;
+
+    if (!lock)
+      /* The transaction is not waiting for any lock. */;
+#ifdef WITH_WSREP
+    else if (trx->is_wsrep() && wsrep_thd_is_aborting(thd))
+      /* If the victim has been signaled by a BF thread and/or the abort
+      is already in progress, further query aborting is unnecessary.
+      Also, the BF thread should own the trx mutex for the victim. */;
+#endif /* WITH_WSREP */
+    else
+    {
+      if (!trx->dict_operation)
+      {
+        /* Dictionary transactions must be immune to KILL, because they
+        may be executed as part of a multi-transaction DDL operation, such
+        as rollback_inplace_alter_table() or ha_innobase::delete_table(). */
+        trx->error_state= DB_INTERRUPTED;
+        lock_sys_t::cancel<false>(trx, lock);
+      }
+      lock_sys.deadlock_check();
+    }
+    mysql_mutex_unlock(&lock_sys.wait_mutex);
+  }
+
+  DBUG_VOID_RETURN;
+}
+
+/*************************************************************************//**
+** InnoDB database tables
+*****************************************************************************/
+
+/** Get the record format from the data dictionary.
+@return one of ROW_TYPE_REDUNDANT, ROW_TYPE_COMPACT,
+ROW_TYPE_COMPRESSED, ROW_TYPE_DYNAMIC */
+
+enum row_type
+ha_innobase::get_row_type() const
+{
+    if (m_prebuilt && m_prebuilt->table) {
+        const ulint flags = m_prebuilt->table->flags;
+
+        switch (dict_tf_get_rec_format(flags)) {
+        case REC_FORMAT_REDUNDANT:
+            return(ROW_TYPE_REDUNDANT);
+        case REC_FORMAT_COMPACT:
+            return(ROW_TYPE_COMPACT);
+        case REC_FORMAT_COMPRESSED:
+            return(ROW_TYPE_COMPRESSED);
+        case REC_FORMAT_DYNAMIC:
+            return(ROW_TYPE_DYNAMIC);
+        }
+    }
+    ut_ad(0);
+    return(ROW_TYPE_NOT_USED);
+}
+
+/****************************************************************//**
+Get the table flags to use for the statement.
+@return table flags */
+
+handler::Table_flags
+ha_innobase::table_flags() const
+/*============================*/
+{
+    THD*                    thd = ha_thd();
+    handler::Table_flags    flags = m_int_table_flags;
+
+    /* Need to use tx_isolation here since table_flags() is (also)
+    called before m_prebuilt is initialized. */
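+    /* For example, at tx_isolation=READ-COMMITTED or READ-UNCOMMITTED,
+    HA_BINLOG_STMT_CAPABLE is not advertised below, so statement-based
+    binary logging of InnoDB changes is refused at those levels. */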
+
+    if (thd_tx_isolation(thd) <= ISO_READ_COMMITTED) {
+        return(flags);
+    }
+
+    return(flags | HA_BINLOG_STMT_CAPABLE);
+}
+
+/****************************************************************//**
+Returns the table type (storage engine name).
+@return table type */
+
+const char*
+ha_innobase::table_type() const
+/*===========================*/
+{
+    return(innobase_hton_name);
+}
+
+/****************************************************************//**
+Returns the index type.
+@return index type */
+
+const char*
+ha_innobase::index_type(
+/*====================*/
+    uint    keynr)  /*!< in: index number */
+{
+    dict_index_t*   index = innobase_get_index(keynr);
+
+    if (!index) {
+        return "Corrupted";
+    }
+
+    if (index->type & DICT_FTS) {
+        return("FULLTEXT");
+    }
+
+    if (dict_index_is_spatial(index)) {
+        return("SPATIAL");
+    }
+
+    return("BTREE");
+}
+
+/****************************************************************//**
+Returns the operations supported for indexes.
+@return flags of supported operations */
+
+ulong
+ha_innobase::index_flags(
+/*=====================*/
+    uint    key,
+    uint,
+    bool) const
+{
+    if (table_share->key_info[key].algorithm == HA_KEY_ALG_FULLTEXT) {
+        return(0);
+    }
+
+    /* For spatial index, we don't support descending scan
+    and ICP so far. */
+    if (table_share->key_info[key].flags & HA_SPATIAL) {
+        return HA_READ_NEXT | HA_READ_ORDER | HA_READ_RANGE
+            | HA_KEYREAD_ONLY | HA_KEY_SCAN_NOT_ROR;
+    }
+
+    ulong flags= key == table_share->primary_key
+        ? HA_CLUSTERED_INDEX : 0;
+
+    flags |= HA_READ_NEXT | HA_READ_PREV | HA_READ_ORDER
+        | HA_READ_RANGE | HA_KEYREAD_ONLY
+        | HA_DO_INDEX_COND_PUSHDOWN
+        | HA_DO_RANGE_FILTER_PUSHDOWN;
+
+    return(flags);
+}
+
+/****************************************************************//**
+Returns the maximum number of keys.
+@return MAX_KEY */
+
+uint
+ha_innobase::max_supported_keys() const
+/*===================================*/
+{
+    return(MAX_KEY);
+}
+
+/****************************************************************//**
+Returns the maximum key length.
+@return maximum supported key length, in bytes */
+
+uint
+ha_innobase::max_supported_key_length() const
+/*=========================================*/
+{
+    /* An InnoDB page must store >= 2 keys; a secondary key record
+    must also contain the primary key value. Therefore, if both
+    the primary key and the secondary key are at this maximum length,
+    it must be less than 1/4th of the free space on a page including
+    record overhead.
+
+    MySQL imposes its own limit to this number; MAX_KEY_LENGTH = 3072.
+
+    For page sizes >= 16k, InnoDB historically reported 3500 bytes here,
+    but the MySQL limit of 3072 was always used through the handler
+    interface.
+
+    Note: Handle 16k and 32k pages the same here since the limits
+    are higher than imposed by MySQL. */
+
+    switch (srv_page_size) {
+    case 4096:
+        /* Hack: allow mysql.innodb_index_stats to be created. */
+        /* FIXME: rewrite this API, and in sql_table.cc consider
+        that in index-organized tables (such as InnoDB), secondary
+        index records will be padded with the PRIMARY KEY, instead
+        of some short ROWID or record heap address. */
+        return(1173);
+    case 8192:
+        return(1536);
+    default:
+        return(3500);
+    }
+}
+
+/****************************************************************//**
+Returns the key map of keys that are usable for scanning.
+@return key_map_full */
+
+const key_map*
+ha_innobase::keys_to_use_for_scanning()
+/*===================================*/
+{
+    return(&key_map_full);
+}
+
+/** Ensure that indexed virtual columns will be computed. */
+void ha_innobase::column_bitmaps_signal()
+{
+  if (!table->vfield || table->current_lock != F_WRLCK)
+    return;
+
+  dict_index_t* clust_index= dict_table_get_first_index(m_prebuilt->table);
+  uint num_v= 0;
+  for (uint j = 0; j < table->s->virtual_fields; j++)
+  {
+    if (table->vfield[j]->stored_in_db())
+      continue;
+
+    dict_col_t *col= &m_prebuilt->table->v_cols[num_v].m_col;
+    if (col->ord_part ||
+        (dict_index_is_online_ddl(clust_index) &&
+         row_log_col_is_indexed(clust_index, num_v)))
+      table->mark_virtual_column_with_deps(table->vfield[j]);
+    num_v++;
+  }
+}
+
+/****************************************************************//**
+Determines if table caching is supported.
+@return HA_CACHE_TBL_ASKTRANSACT */
+
+uint8
+ha_innobase::table_cache_type()
+/*===========================*/
+{
+    return(HA_CACHE_TBL_ASKTRANSACT);
+}
+
+/** Normalizes a table name string.
+A normalized name consists of the database name concatenated with '/'
+and the table name. For example: test/mytable.
+On Windows, normalization always folds both the database name and the
+table name to lower case if "set_lower_case" is set to TRUE.
+@param[out]	norm_name	Normalized name, null-terminated.
+@param[in]	name		Name to normalize.
+@param[in]	set_lower_case	True if we also should fold to lower case. */
+void
+normalize_table_name_c_low(
+/*=======================*/
+    char*       norm_name,      /* out: normalized name as a
+                                null-terminated string */
+    const char* name,           /* in: table name string */
+    bool        set_lower_case) /* in: TRUE if we want to set
+                                name to lower case */
+{
+    char*   name_ptr;
+    ulint   name_len;
+    char*   db_ptr;
+    ulint   db_len;
+    char*   ptr;
+    ulint   norm_len;
+
+    /* Scan name from the end */
+
+    ptr = strend(name) - 1;
+
+    /* seek to the last path separator */
+    while (ptr >= name && *ptr != '\\' && *ptr != '/') {
+        ptr--;
+    }
+
+    name_ptr = ptr + 1;
+    name_len = strlen(name_ptr);
+
+    /* skip any number of path separators */
+    while (ptr >= name && (*ptr == '\\' || *ptr == '/')) {
+        ptr--;
+    }
+
+    DBUG_ASSERT(ptr >= name);
+
+    /* seek to the last but one path separator or one char before
+    the beginning of name */
+    db_len = 0;
+    while (ptr >= name && *ptr != '\\' && *ptr != '/') {
+        ptr--;
+        db_len++;
+    }
+
+    db_ptr = ptr + 1;
+
+    norm_len = db_len + name_len + sizeof "/";
+    ut_a(norm_len < FN_REFLEN - 1);
+
+    memcpy(norm_name, db_ptr, db_len);
+
+    norm_name[db_len] = '/';
+
+    /* Copy the name and null-byte. */
+    memcpy(norm_name + db_len + 1, name_ptr, name_len + 1);
+
+    if (set_lower_case) {
+        innobase_casedn_str(norm_name);
+    }
+}
+
+create_table_info_t::create_table_info_t(
+    THD*            thd,
+    const TABLE*    form,
+    HA_CREATE_INFO* create_info,
+    char*           table_name,
+    char*           remote_path,
+    bool            file_per_table,
+    trx_t*          trx)
+    : m_thd(thd),
+    m_trx(trx),
+    m_form(form),
+    m_default_row_format(innodb_default_row_format),
+    m_create_info(create_info),
+    m_table_name(table_name), m_table(NULL),
+    m_remote_path(remote_path),
+    m_innodb_file_per_table(file_per_table)
+{
+}
+
+#if !defined(DBUG_OFF)
+/*********************************************************************
+Test normalize_table_name_c_low(). */
+static
+void
+test_normalize_table_name_low()
+/*===========================*/
+{
+    char        norm_name[FN_REFLEN];
+    const char* test_data[][2] = {
+        /* input, expected result */
+        {"./mysqltest/t1", "mysqltest/t1"},
+        {"./test/#sql-842b_2", "test/#sql-842b_2"},
+        {"./test/#sql-85a3_10", "test/#sql-85a3_10"},
+        {"./test/#sql2-842b-2", "test/#sql2-842b-2"},
+        {"./test/bug29807", "test/bug29807"},
+        {"./test/foo", "test/foo"},
+        {"./test/innodb_bug52663", "test/innodb_bug52663"},
+        {"./test/t", "test/t"},
+        {"./test/t1", "test/t1"},
+        {"./test/t10", "test/t10"},
+        {"/a/b/db/table", "db/table"},
+        {"/a/b/db///////table", "db/table"},
+        {"/a/b////db///////table", "db/table"},
+        {"/var/tmp/mysqld.1/#sql842b_2_10", "mysqld.1/#sql842b_2_10"},
+        {"db/table", "db/table"},
+        {"ddd/t", "ddd/t"},
+        {"d/ttt", "d/ttt"},
+        {"d/t", "d/t"},
+        {".\\mysqltest\\t1", "mysqltest/t1"},
+        {".\\test\\#sql-842b_2", "test/#sql-842b_2"},
+        {".\\test\\#sql-85a3_10", "test/#sql-85a3_10"},
+        {".\\test\\#sql2-842b-2", "test/#sql2-842b-2"},
+        {".\\test\\bug29807", "test/bug29807"},
+        {".\\test\\foo", "test/foo"},
+        {".\\test\\innodb_bug52663", "test/innodb_bug52663"},
+        {".\\test\\t", "test/t"},
+        {".\\test\\t1", "test/t1"},
+        {".\\test\\t10", "test/t10"},
+        {"C:\\a\\b\\db\\table", "db/table"},
+        {"C:\\a\\b\\db\\\\\\\\\\\\\\table", "db/table"},
+        {"C:\\a\\b\\\\\\\\db\\\\\\\\\\\\\\table", "db/table"},
+        {"C:\\var\\tmp\\mysqld.1\\#sql842b_2_10", "mysqld.1/#sql842b_2_10"},
+        {"db\\table", "db/table"},
+        {"ddd\\t", "ddd/t"},
+        {"d\\ttt", "d/ttt"},
+        {"d\\t", "d/t"},
+    };
+
+    for (size_t i = 0; i < UT_ARR_SIZE(test_data); i++) {
+        printf("test_normalize_table_name_low():"
+               " testing \"%s\", expected \"%s\"... ",
+               test_data[i][0], test_data[i][1]);
+
+        normalize_table_name_c_low(
+            norm_name, test_data[i][0], FALSE);
+
+        if (strcmp(norm_name, test_data[i][1]) == 0) {
+            printf("ok\n");
+        } else {
+            printf("got \"%s\"\n", norm_name);
+            ut_error;
+        }
+    }
+}
+
+/*********************************************************************
+Test ut_format_name(). */
+static
+void
+test_ut_format_name()
+/*=================*/
+{
+    char    buf[NAME_LEN * 3];
+
+    struct {
+        const char* name;
+        ulint       buf_size;
+        const char* expected;
+    } test_data[] = {
+        {"test/t1", sizeof(buf), "`test`.`t1`"},
+        {"test/t1", 12, "`test`.`t1`"},
+        {"test/t1", 11, "`test`.`t1"},
+        {"test/t1", 10, "`test`.`t"},
+        {"test/t1", 9, "`test`.`"},
+        {"test/t1", 8, "`test`."},
+        {"test/t1", 7, "`test`"},
+        {"test/t1", 6, "`test"},
+        {"test/t1", 5, "`tes"},
+        {"test/t1", 4, "`te"},
+        {"test/t1", 3, "`t"},
+        {"test/t1", 2, "`"},
+        {"test/t1", 1, ""},
+        {"test/t1", 0, "BUF_NOT_CHANGED"},
+        {"table", sizeof(buf), "`table`"},
+        {"ta'le", sizeof(buf), "`ta'le`"},
+        {"ta\"le", sizeof(buf), "`ta\"le`"},
+        {"ta`le", sizeof(buf), "`ta``le`"},
+    };
+
+    for (size_t i = 0; i < UT_ARR_SIZE(test_data); i++) {
+
+        memcpy(buf, "BUF_NOT_CHANGED", strlen("BUF_NOT_CHANGED") + 1);
+
+        char*   ret;
+
+        ret = ut_format_name(test_data[i].name,
+                             buf,
+                             test_data[i].buf_size);
+
+        ut_a(ret == buf);
+
+        if (strcmp(buf, test_data[i].expected) == 0) {
+            ib::info() << "ut_format_name(" << test_data[i].name
+                << ", buf, " << test_data[i].buf_size << "),"
+                " expected " << test_data[i].expected
+                << ", OK";
+        } else {
+            ib::error() << "ut_format_name(" << test_data[i].name
+                << ", buf, " << test_data[i].buf_size << "),"
+                " expected " << test_data[i].expected
+                << ", ERROR: got " << buf;
+            ut_error;
+        }
+    }
+}
+#endif /* !DBUG_OFF */
+
+/** Match index columns between MySQL and InnoDB.
+This function checks whether the index column information
+is consistent between KEY info from mysql and that from innodb index.
+@param[in]	key_info	Index info from mysql
+@param[in]	index_info	Index info from InnoDB
+@return true if all column types match. */
+static
+bool
+innobase_match_index_columns(
+	const KEY*		key_info,
+	const dict_index_t*	index_info)
+{
+	const KEY_PART_INFO*	key_part;
+	const KEY_PART_INFO*	key_end;
+	const dict_field_t*	innodb_idx_fld;
+	const dict_field_t*	innodb_idx_fld_end;
+
+	DBUG_ENTER("innobase_match_index_columns");
+
+	/* Check whether the user defined index column count matches */
+	if (key_info->user_defined_key_parts !=
+	    index_info->n_user_defined_cols) {
+		DBUG_RETURN(FALSE);
+	}
+
+	key_part = key_info->key_part;
+	key_end = key_part + key_info->user_defined_key_parts;
+	innodb_idx_fld = index_info->fields;
+	innodb_idx_fld_end = index_info->fields + index_info->n_fields;
+
+	/* Check each index column's datatype. We do not check the
+	column name, because an index column may have been renamed in
+	MySQL without the change being propagated to InnoDB.
+	One hidden assumption here is that the index column sequences
+	match between MySQL and InnoDB. */
+	for (; key_part != key_end; ++key_part) {
+		unsigned	is_unsigned;
+		auto		mtype = innodb_idx_fld->col->mtype;
+
+		/* Need to translate to the InnoDB column type before
+		the comparison. */
+		auto		col_type = get_innobase_type_from_mysql_type(
+			&is_unsigned, key_part->field);
+
+		/* Ignore InnoDB specific system columns. */
+		while (mtype == DATA_SYS) {
+			innodb_idx_fld++;
+
+			if (innodb_idx_fld >= innodb_idx_fld_end) {
+				DBUG_RETURN(FALSE);
+			}
+
+			/* Re-read the type of the field that we
+			advanced to; without this the loop could
+			never terminate. */
+			mtype = innodb_idx_fld->col->mtype;
+		}
+
+		/* MariaDB-5.5 compatibility */
+		if ((key_part->field->real_type() == MYSQL_TYPE_ENUM ||
+		     key_part->field->real_type() == MYSQL_TYPE_SET) &&
+		    mtype == DATA_FIXBINARY) {
+			col_type= DATA_FIXBINARY;
+		}
+
+		if (innodb_idx_fld->descending
+		    != !!(key_part->key_part_flag & HA_REVERSE_SORT)) {
+			DBUG_RETURN(FALSE);
+		}
+
+		if (col_type != mtype) {
+			/* If the col_type we get from the MySQL type is a
+			geometry data type, we should check whether mtype is
+			a legacy type from 5.6, either upgraded to
+			DATA_GEOMETRY or not. This is indeed not an accurate
+			check, but it should be safe, since DATA_BLOB would be
+			upgraded once we create a spatial index on it, and we
+			intend to use DATA_GEOMETRY for legacy GIS data types,
+			which are of variable length.
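+
+			A hedged example of the acceptance rule in the
+			switch below: a POINT column created before such
+			an upgrade may still carry mtype DATA_BLOB while
+			get_innobase_type_from_mysql_type() now reports
+			DATA_GEOMETRY; that single combination is treated
+			as a match, and any other col_type != mtype pair
+			is reported as a mismatch.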
*/
+			switch (col_type) {
+			case DATA_GEOMETRY:
+				if (mtype == DATA_BLOB) {
+					break;
+				}
+				/* Fall through */
+			default:
+				/* Column type mismatches */
+				DBUG_RETURN(false);
+			}
+		}
+
+		innodb_idx_fld++;
+	}
+
+	DBUG_RETURN(TRUE);
+}
+
+/** Build a template for a base column for a virtual column
+@param[in]	table		MySQL TABLE
+@param[in]	clust_index	InnoDB clustered index
+@param[in]	field		field in MySQL table
+@param[in]	col		InnoDB column
+@param[in,out]	templ		template to fill
+@param[in]	col_no		field index for virtual col
+*/
+static
+void
+innobase_vcol_build_templ(
+	const TABLE*		table,
+	dict_index_t*		clust_index,
+	Field*			field,
+	const dict_col_t*	col,
+	mysql_row_templ_t*	templ,
+	ulint			col_no)
+{
+	templ->col_no = col_no;
+	templ->is_virtual = col->is_virtual();
+
+	if (templ->is_virtual) {
+		templ->clust_rec_field_no = ULINT_UNDEFINED;
+		templ->rec_field_no = col->ind;
+	} else {
+		templ->clust_rec_field_no = dict_col_get_clust_pos(
+			col, clust_index);
+		ut_a(templ->clust_rec_field_no != ULINT_UNDEFINED);
+
+		templ->rec_field_no = templ->clust_rec_field_no;
+	}
+
+	if (field->real_maybe_null()) {
+		templ->mysql_null_byte_offset =
+			field->null_offset();
+
+		templ->mysql_null_bit_mask = (ulint) field->null_bit;
+	} else {
+		templ->mysql_null_bit_mask = 0;
+	}
+
+	templ->mysql_col_offset = static_cast<ulint>(
+		get_field_offset(table, field));
+	templ->mysql_col_len = static_cast<ulint>(field->pack_length());
+	templ->type = col->mtype;
+	templ->mysql_type = static_cast<ulint>(field->type());
+
+	if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) {
+		templ->mysql_length_bytes = static_cast<ulint>(
+			((Field_varstring*) field)->length_bytes);
+	}
+
+	templ->charset = dtype_get_charset_coll(col->prtype);
+	templ->mbminlen = dict_col_get_mbminlen(col);
+	templ->mbmaxlen = dict_col_get_mbmaxlen(col);
+	templ->is_unsigned = col->prtype & DATA_UNSIGNED;
+}
+
+/** Build a template for the virtual columns and their base columns.
+This is done when the table is first opened.
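+
+(Hedged sketch: for each virtual column, the template entry that
+innobase_vcol_build_templ() above fills in ends up with
+  templ->is_virtual == true,
+  templ->clust_rec_field_no == ULINT_UNDEFINED, because the value is
+  not materialized in the clustered index, and
+  templ->rec_field_no == col->ind.)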
+@param[in] table MySQL TABLE +@param[in] ib_table InnoDB dict_table_t +@param[in,out] s_templ InnoDB template structure +@param[in] add_v new virtual columns added along with + add index call +@param[in] locked true if dict_sys.latch is held */ +void +innobase_build_v_templ( + const TABLE* table, + const dict_table_t* ib_table, + dict_vcol_templ_t* s_templ, + const dict_add_v_col_t* add_v, + bool locked) +{ + ulint ncol = unsigned(ib_table->n_cols) - DATA_N_SYS_COLS; + ulint n_v_col = ib_table->n_v_cols; + bool marker[REC_MAX_N_FIELDS]; + + DBUG_ENTER("innobase_build_v_templ"); + ut_ad(ncol < REC_MAX_N_FIELDS); + + if (add_v != NULL) { + n_v_col += add_v->n_v_col; + } + + ut_ad(n_v_col > 0); + + if (!locked) { + dict_sys.lock(SRW_LOCK_CALL); + } + +#if 0 + /* This does not (need to) hold for ctx->new_table in + alter_rebuild_apply_log() */ + ut_ad(dict_sys.locked()); +#endif + + if (s_templ->vtempl) { + if (!locked) { + dict_sys.unlock(); + } + DBUG_VOID_RETURN; + } + + memset(marker, 0, sizeof(bool) * ncol); + + s_templ->vtempl = static_cast( + ut_zalloc_nokey((ncol + n_v_col) + * sizeof *s_templ->vtempl)); + s_templ->n_col = ncol; + s_templ->n_v_col = n_v_col; + s_templ->rec_len = table->s->reclength; + s_templ->default_rec = UT_NEW_ARRAY_NOKEY(uchar, s_templ->rec_len); + memcpy(s_templ->default_rec, table->s->default_values, s_templ->rec_len); + + /* Mark those columns could be base columns */ + for (ulint i = 0; i < ib_table->n_v_cols; i++) { + const dict_v_col_t* vcol = dict_table_get_nth_v_col( + ib_table, i); + + for (ulint j = vcol->num_base; j--; ) { + marker[vcol->base_col[j]->ind] = true; + } + } + + if (add_v) { + for (ulint i = 0; i < add_v->n_v_col; i++) { + const dict_v_col_t* vcol = &add_v->v_col[i]; + + for (ulint j = vcol->num_base; j--; ) { + marker[vcol->base_col[j]->ind] = true; + } + } + } + + ulint j = 0; + ulint z = 0; + + dict_index_t* clust_index = dict_table_get_first_index(ib_table); + + for (ulint i = 0; i < table->s->fields; i++) { + Field* field = table->field[i]; + + /* Build template for virtual columns */ + if (!field->stored_in_db()) { +#ifdef UNIV_DEBUG + const char* name; + + if (z >= ib_table->n_v_def) { + name = add_v->v_col_name[z - ib_table->n_v_def]; + } else { + name = dict_table_get_v_col_name(ib_table, z); + } + + ut_ad(!my_strcasecmp(system_charset_info, name, + field->field_name.str)); +#endif + const dict_v_col_t* vcol; + + if (z >= ib_table->n_v_def) { + vcol = &add_v->v_col[z - ib_table->n_v_def]; + } else { + vcol = dict_table_get_nth_v_col(ib_table, z); + } + + s_templ->vtempl[z + s_templ->n_col] + = static_cast( + ut_malloc_nokey( + sizeof *s_templ->vtempl[j])); + + innobase_vcol_build_templ( + table, clust_index, field, + &vcol->m_col, + s_templ->vtempl[z + s_templ->n_col], + z); + z++; + continue; + } + + ut_ad(j < ncol); + + /* Build template for base columns */ + if (marker[j]) { + dict_col_t* col = dict_table_get_nth_col( + ib_table, j); + + ut_ad(!my_strcasecmp(system_charset_info, + dict_table_get_col_name( + ib_table, j), + field->field_name.str)); + + s_templ->vtempl[j] = static_cast< + mysql_row_templ_t*>( + ut_malloc_nokey( + sizeof *s_templ->vtempl[j])); + + innobase_vcol_build_templ( + table, clust_index, field, col, + s_templ->vtempl[j], j); + } + + j++; + } + + if (!locked) { + dict_sys.unlock(); + } + + s_templ->db_name = table->s->db.str; + s_templ->tb_name = table->s->table_name.str; + DBUG_VOID_RETURN; +} + +/** Check consistency between .frm indexes and InnoDB indexes. 
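+
+A hedged outline of the checks below: the function returns false when
+InnoDB knows fewer indexes than the .frm does (UT_LIST_GET_LEN()
+versus table->s->keys), when a key name from the .frm cannot be found
+by dict_table_get_index_on_name(), or when
+innobase_match_index_columns() reports a column type mismatch for an
+index of the same name.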
+@param[in] table table object formed from .frm +@param[in] ib_table InnoDB table definition +@retval true if not errors were found */ +static bool +check_index_consistency(const TABLE* table, const dict_table_t* ib_table) +{ + ulint mysql_num_index = table->s->keys; + ulint ib_num_index = UT_LIST_GET_LEN(ib_table->indexes); + bool ret = true; + + /* If there exists inconsistency between MySQL and InnoDB dictionary + (metadata) information, the number of index defined in MySQL + could exceed that in InnoDB, return error */ + if (ib_num_index < mysql_num_index) { + ret = false; + goto func_exit; + } + + /* For each index in the mysql key_info array, fetch its + corresponding InnoDB index pointer into index_mapping + array. */ + for (ulint count = 0; count < mysql_num_index; count++) { + const dict_index_t* index = dict_table_get_index_on_name( + ib_table, table->key_info[count].name.str); + + if (index == NULL) { + sql_print_error("Cannot find index %s in InnoDB" + " index dictionary.", + table->key_info[count].name.str); + ret = false; + goto func_exit; + } + + /* Double check fetched index has the same + column info as those in mysql key_info. */ + if (!innobase_match_index_columns(&table->key_info[count], + index)) { + sql_print_error("Found index %s whose column info" + " does not match that of MariaDB.", + table->key_info[count].name.str); + ret = false; + goto func_exit; + } + } + +func_exit: + return ret; +} + +/********************************************************************//** +Get the upper limit of the MySQL integral and floating-point type. +@return maximum allowed value for the field */ +ulonglong innobase_get_int_col_max_value(const Field *field) +{ + ulonglong max_value = 0; + + switch (field->key_type()) { + /* TINY */ + case HA_KEYTYPE_BINARY: + max_value = 0xFFULL; + break; + case HA_KEYTYPE_INT8: + max_value = 0x7FULL; + break; + /* SHORT */ + case HA_KEYTYPE_USHORT_INT: + max_value = 0xFFFFULL; + break; + case HA_KEYTYPE_SHORT_INT: + max_value = 0x7FFFULL; + break; + /* MEDIUM */ + case HA_KEYTYPE_UINT24: + max_value = 0xFFFFFFULL; + break; + case HA_KEYTYPE_INT24: + max_value = 0x7FFFFFULL; + break; + /* LONG */ + case HA_KEYTYPE_ULONG_INT: + max_value = 0xFFFFFFFFULL; + break; + case HA_KEYTYPE_LONG_INT: + max_value = 0x7FFFFFFFULL; + break; + /* BIG */ + case HA_KEYTYPE_ULONGLONG: + max_value = 0xFFFFFFFFFFFFFFFFULL; + break; + case HA_KEYTYPE_LONGLONG: + max_value = 0x7FFFFFFFFFFFFFFFULL; + break; + case HA_KEYTYPE_FLOAT: + /* We use the maximum as per IEEE754-2008 standard, 2^24 */ + max_value = 0x1000000ULL; + break; + case HA_KEYTYPE_DOUBLE: + /* We use the maximum as per IEEE754-2008 standard, 2^53 */ + max_value = 0x20000000000000ULL; + break; + default: + ut_error; + } + + return(max_value); +} + +/** Initialize the AUTO_INCREMENT column metadata. + +Since a partial table definition for a persistent table can already be +present in the InnoDB dict_sys cache before it is accessed from SQL, +we have to initialize the AUTO_INCREMENT counter on the first +ha_innobase::open(). + +@param[in,out] table persistent table +@param[in] field the AUTO_INCREMENT column */ +static +void +initialize_auto_increment(dict_table_t* table, const Field* field) +{ + ut_ad(!table->is_temporary()); + + const unsigned col_no = innodb_col_no(field); + + table->autoinc_mutex.wr_lock(); + + table->persistent_autoinc = static_cast( + dict_table_get_nth_col_pos(table, col_no, NULL) + 1) + & dict_index_t::MAX_N_FIELDS; + + if (table->autoinc) { + /* Already initialized. 
Our caller checked + table->persistent_autoinc without + autoinc_mutex protection, and there might be multiple + ha_innobase::open() executing concurrently. */ + } else if (srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN) { + /* If the recovery level is set so high that writes + are disabled we force the AUTOINC counter to 0 + value effectively disabling writes to the table. + Secondly, we avoid reading the table in case the read + results in failure due to a corrupted table/index. + + We will not return an error to the client, so that the + tables can be dumped with minimal hassle. If an error + were returned in this case, the first attempt to read + the table would fail and subsequent SELECTs would succeed. */ + } else if (table->persistent_autoinc) { + table->autoinc = innobase_next_autoinc( + btr_read_autoinc_with_fallback(table, col_no), + 1 /* need */, + 1 /* auto_increment_increment */, + 0 /* auto_increment_offset */, + innobase_get_int_col_max_value(field)); + } + + table->autoinc_mutex.wr_unlock(); +} + +/** Open an InnoDB table +@param[in] name table name +@return error code +@retval 0 on success */ +int +ha_innobase::open(const char* name, int, uint) +{ + char norm_name[FN_REFLEN]; + + DBUG_ENTER("ha_innobase::open"); + + normalize_table_name(norm_name, name); + + m_user_thd = NULL; + + /* Will be allocated if it is needed in ::update_row() */ + m_upd_buf = NULL; + m_upd_buf_size = 0; + + char* is_part = is_partition(norm_name); + THD* thd = ha_thd(); + dict_table_t* ib_table = open_dict_table(name, norm_name, is_part, + DICT_ERR_IGNORE_FK_NOKEY); + + DEBUG_SYNC(thd, "ib_open_after_dict_open"); + + if (NULL == ib_table) { + + if (is_part) { + sql_print_error("Failed to open table %s.\n", + norm_name); + } + set_my_errno(ENOENT); + + DBUG_RETURN(HA_ERR_NO_SUCH_TABLE); + } + + size_t n_fields = omits_virtual_cols(*table_share) + ? table_share->stored_fields : table_share->fields; + size_t n_cols = dict_table_get_n_user_cols(ib_table) + + dict_table_get_n_v_cols(ib_table) + - !!DICT_TF2_FLAG_IS_SET(ib_table, DICT_TF2_FTS_HAS_DOC_ID); + + if (UNIV_UNLIKELY(n_cols != n_fields)) { + ib::warn() << "Table " << norm_name << " contains " + << n_cols << " user" + " defined columns in InnoDB, but " << n_fields + << " columns in MariaDB. Please check" + " INFORMATION_SCHEMA.INNODB_SYS_COLUMNS and" + " https://mariadb.com/kb/en/innodb-data-dictionary-troubleshooting/" + " for how to resolve the issue."; + + /* Mark this table as corrupted, so the drop table + or force recovery can still use it, but not others. */ + ib_table->file_unreadable = true; + ib_table->corrupted = true; + ib_table->release(); + set_my_errno(ENOENT); + DBUG_RETURN(HA_ERR_CRASHED_ON_USAGE); + } + + innobase_copy_frm_flags_from_table_share(ib_table, table->s); + + MONITOR_INC(MONITOR_TABLE_OPEN); + + if ((ib_table->flags2 & DICT_TF2_DISCARDED)) { + /* Allow an open because a proper DISCARD should have set + all the flags and index root page numbers to FIL_NULL that + should prevent any DML from running but it should allow DDL + operations. 
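+
+		(Hedged illustration: after ALTER TABLE ... DISCARD
+		TABLESPACE the .ibd file is gone, so DML on the table
+		fails with ER_TABLESPACE_DISCARDED, while ALTER TABLE
+		... IMPORT TABLESPACE and DROP TABLE must still be able
+		to open the handle; that is why the open is allowed
+		here.)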
*/ + } else if (!ib_table->is_readable()) { + const fil_space_t* space = ib_table->space; + if (!space) { + ib_senderrf( + thd, IB_LOG_LEVEL_WARN, + ER_TABLESPACE_MISSING, norm_name); + } + + if (!thd_tablespace_op(thd)) { + set_my_errno(ENOENT); + int ret_err = HA_ERR_TABLESPACE_MISSING; + + if (space && space->crypt_data + && space->crypt_data->is_encrypted()) { + push_warning_printf( + thd, + Sql_condition::WARN_LEVEL_WARN, + HA_ERR_DECRYPTION_FAILED, + "Table %s in file %s is encrypted" + " but encryption service or" + " used key_id %u is not available. " + " Can't continue reading table.", + table_share->table_name.str, + space->chain.start->name, + space->crypt_data->key_id); + ret_err = HA_ERR_DECRYPTION_FAILED; + } + + ib_table->release(); + DBUG_RETURN(ret_err); + } + } + + m_prebuilt = row_create_prebuilt(ib_table, table->s->reclength); + + m_prebuilt->default_rec = table->s->default_values; + ut_ad(m_prebuilt->default_rec); + + m_prebuilt->m_mysql_table = table; + + /* Looks like MySQL-3.23 sometimes has primary key number != 0 */ + m_primary_key = table->s->primary_key; + + key_used_on_scan = m_primary_key; + + if (ib_table->n_v_cols) { + dict_sys.lock(SRW_LOCK_CALL); + if (ib_table->vc_templ == NULL) { + ib_table->vc_templ = UT_NEW_NOKEY(dict_vcol_templ_t()); + innobase_build_v_templ( + table, ib_table, ib_table->vc_templ, NULL, + true); + } + + dict_sys.unlock(); + } + + if (!check_index_consistency(table, ib_table)) { + sql_print_error("InnoDB indexes are inconsistent with what " + "defined in .frm for table %s", + name); + } + + /* Allocate a buffer for a 'row reference'. A row reference is + a string of bytes of length ref_length which uniquely specifies + a row in our table. Note that MySQL may also compare two row + references for equality by doing a simple memcmp on the strings + of length ref_length! */ + if (!(m_prebuilt->clust_index_was_generated + = dict_index_is_auto_gen_clust(ib_table->indexes.start))) { + if (m_primary_key >= MAX_KEY) { + ib_table->dict_frm_mismatch = DICT_FRM_NO_PK; + + /* This mismatch could cause further problems + if not attended, bring this to the user's attention + by printing a warning in addition to log a message + in the errorlog */ + + ib_push_frm_error(thd, ib_table, table, 0, true); + + /* If m_primary_key >= MAX_KEY, its (m_primary_key) + value could be out of bound if continue to index + into key_info[] array. Find InnoDB primary index, + and assign its key_length to ref_length. + In addition, since MySQL indexes are sorted starting + with primary index, unique index etc., initialize + ref_length to the first index key length in + case we fail to find InnoDB cluster index. + + Please note, this will not resolve the primary + index mismatch problem, other side effects are + possible if users continue to use the table. + However, we allow this table to be opened so + that user can adopt necessary measures for the + mismatch while still being accessible to the table + date. */ + if (!table->key_info) { + ut_ad(!table->s->keys); + ref_length = 0; + } else { + ref_length = table->key_info[0].key_length; + } + + /* Find corresponding cluster index + key length in MySQL's key_info[] array */ + for (uint i = 0; i < table->s->keys; i++) { + dict_index_t* index; + index = innobase_get_index(i); + if (dict_index_is_clust(index)) { + ref_length = + table->key_info[i].key_length; + } + } + } else { + /* MySQL allocates the buffer for ref. + key_info->key_length includes space for all key + columns + one byte for each column that may be + NULL. 
ref_length must be as exact as possible to + save space, because all row reference buffers are + allocated based on ref_length. */ + + ref_length = table->key_info[m_primary_key].key_length; + } + } else { + if (m_primary_key != MAX_KEY) { + + ib_table->dict_frm_mismatch = DICT_NO_PK_FRM_HAS; + + /* This mismatch could cause further problems + if not attended, bring this to the user attention + by printing a warning in addition to log a message + in the errorlog */ + ib_push_frm_error(thd, ib_table, table, 0, true); + } + + ref_length = DATA_ROW_ID_LEN; + + /* If we automatically created the clustered index, then + MySQL does not know about it, and MySQL must NOT be aware + of the index used on scan, to make it avoid checking if we + update the column of the index. That is why we assert below + that key_used_on_scan is the undefined value MAX_KEY. + The column is the row id in the automatical generation case, + and it will never be updated anyway. */ + + if (key_used_on_scan != MAX_KEY) { + sql_print_warning( + "Table %s key_used_on_scan is %u even " + "though there is no primary key inside " + "InnoDB.", name, key_used_on_scan); + } + } + + /* Index block size in InnoDB: used by MySQL in query optimization */ + stats.block_size = static_cast(srv_page_size); + + const my_bool for_vc_purge = THDVAR(thd, background_thread); + + if (for_vc_purge || !m_prebuilt->table + || m_prebuilt->table->is_temporary() + || m_prebuilt->table->persistent_autoinc + || !m_prebuilt->table->is_readable()) { + } else if (const Field* ai = table->found_next_number_field) { + initialize_auto_increment(m_prebuilt->table, ai); + } + + /* Set plugin parser for fulltext index */ + for (uint i = 0; i < table->s->keys; i++) { + if (table->key_info[i].flags & HA_USES_PARSER) { + dict_index_t* index = innobase_get_index(i); + plugin_ref parser = table->key_info[i].parser; + + ut_ad(index->type & DICT_FTS); + index->parser = + static_cast( + plugin_decl(parser)->info); + + DBUG_EXECUTE_IF("fts_instrument_use_default_parser", + index->parser = &fts_default_parser;); + } + } + + ut_ad(!m_prebuilt->table + || table->versioned() == m_prebuilt->table->versioned()); + + if (!for_vc_purge) { + info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST + | HA_STATUS_OPEN); + } + + DBUG_RETURN(0); +} + +/** Convert MySQL column number to dict_table_t::cols[] offset. +@param[in] field non-virtual column +@return column number relative to dict_table_t::cols[] */ +unsigned +innodb_col_no(const Field* field) +{ + ut_ad(!innobase_is_s_fld(field)); + const TABLE* table = field->table; + unsigned col_no = 0; + ut_ad(field == table->field[field->field_index]); + for (unsigned i = 0; i < field->field_index; i++) { + if (table->field[i]->stored_in_db()) { + col_no++; + } + } + return(col_no); +} + +/** Opens dictionary table object using table name. For partition, we need to +try alternative lower/upper case names to support moving data files across +platforms. 
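+
+A hedged example of the mismatch handled below: a partition created on
+Windows is recorded in the data dictionary as "test/t1#p#p0" (lower
+case), while the name derived from the .frm on a case-preserving
+system is "test/t1#P#p0"; with lower_case_table_names=1 the lookup is
+retried using the lower-cased name.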
+@param[in]	table_name	name of the table/partition
+@param[in]	norm_name	normalized name of the table/partition
+@param[in]	is_partition	if this is a partition of a table
+@param[in]	ignore_err	error to ignore for loading dictionary object
+@return dictionary table object or NULL if not found */
+dict_table_t*
+ha_innobase::open_dict_table(
+	const char*
+#ifdef _WIN32
+			table_name
+#endif
+	,
+	const char*		norm_name,
+	bool			is_partition,
+	dict_err_ignore_t	ignore_err)
+{
+	DBUG_ENTER("ha_innobase::open_dict_table");
+	/* FIXME: try_drop_aborted */
+	dict_table_t*	ib_table = dict_table_open_on_name(norm_name, false,
+							   ignore_err);
+
+	if (NULL == ib_table && is_partition) {
+		/* The MySQL partition engine hard codes the file name
+		separator as "#P#". The text case is fixed even if
+		lower_case_table_names is set to 1 or 2. This is true
+		for sub-partition names as well. InnoDB always
+		normalises file names to lower case on Windows, which
+		can potentially cause problems when copying/moving
+		tables between platforms.
+
+		1) If we boot against an installation from a Windows
+		platform, the partition table names could be in lower
+		case in the system tables. So we will need to check
+		the lower case name when loading the table.
+
+		2) If we boot on Windows against an installation from
+		a case-sensitive platform, we might need to check for
+		a table name that has not been normalized to lower
+		case in the system tables. */
+		if (lower_case_table_names == 1) {
+			char	par_case_name[FN_REFLEN];
+
+#ifndef _WIN32
+			/* Check for the table using the lower
+			case name, including the partition
+			separator "P" */
+			strcpy(par_case_name, norm_name);
+			innobase_casedn_str(par_case_name);
+#else
+			/* On the Windows platform, check
+			whether the system tables contain a
+			table name that has not been
+			normalized to lower case */
+			normalize_table_name_c_low(
+				par_case_name, table_name, false);
+#endif
+			/* FIXME: try_drop_aborted */
+			ib_table = dict_table_open_on_name(
+				par_case_name, false, ignore_err);
+		}
+
+		if (ib_table != NULL) {
+#ifndef _WIN32
+			sql_print_warning("Partition table %s opened"
+					  " after converting to lower"
+					  " case. The table may have"
+					  " been moved from a"
+					  " case-insensitive file system."
+					  " Please recreate the table in"
+					  " the current file system\n",
+					  norm_name);
+#else
+			sql_print_warning("Partition table %s opened"
+					  " after skipping the step to"
+					  " lower case the table name."
+					  " The table may have been"
+					  " moved from a case-sensitive"
+					  " file system. Please"
+					  " recreate the table in the"
+					  " current file system\n",
+					  norm_name);
+#endif
+		}
+	}
+
+	DBUG_RETURN(ib_table);
+}
+
+handler*
+ha_innobase::clone(
+/*===============*/
+	const char*	name,		/*!< in: table name */
+	MEM_ROOT*	mem_root)	/*!< in: memory context */
+{
+	DBUG_ENTER("ha_innobase::clone");
+
+	ha_innobase*	new_handler = static_cast<ha_innobase*>(
+		handler::clone(m_prebuilt->table->name.m_name, mem_root));
+
+	if (new_handler != NULL) {
+		DBUG_ASSERT(new_handler->m_prebuilt != NULL);
+
+		new_handler->m_prebuilt->select_lock_type
+			= m_prebuilt->select_lock_type;
+	}
+
+	DBUG_RETURN(new_handler);
+}
+
+
+uint
+ha_innobase::max_supported_key_part_length() const
+/*==============================================*/
+{
+	/* A table format specific index column length check will be performed
+	at ha_innobase::add_index() and row_create_index_for_mysql() */
+	return(REC_VERSION_56_MAX_INDEX_COL_LEN);
+}
+
+/******************************************************************//**
+Closes a handle to an InnoDB table.
+@return 0 */
+
+int
+ha_innobase::close()
+/*================*/
+{
+	DBUG_ENTER("ha_innobase::close");
+
+	row_prebuilt_free(m_prebuilt);
+
+	if (m_upd_buf != NULL) {
+		ut_ad(m_upd_buf_size != 0);
+		my_free(m_upd_buf);
+		m_upd_buf = NULL;
+		m_upd_buf_size = 0;
+	}
+
+	DBUG_RETURN(0);
+}
+
+/* The following accessor functions should really be inside MySQL code! */
+
+#ifdef WITH_WSREP
+ulint
+wsrep_innobase_mysql_sort(
+	/* out: str contains sort string */
+	int		mysql_type,	/* in: MySQL type */
+	uint		charset_number,	/* in: number of the charset */
+	unsigned char*	str,		/* in: data field */
+	ulint		str_length,	/* in: data field length,
+					not UNIV_SQL_NULL */
+	ulint		buf_length)	/* in: total str buffer length */
+
+{
+	CHARSET_INFO*		charset;
+	enum_field_types	mysql_tp;
+	ulint			ret_length =	str_length;
+
+	DBUG_ASSERT(str_length != UNIV_SQL_NULL);
+
+	mysql_tp = (enum_field_types) mysql_type;
+
+	switch (mysql_tp) {
+
+	case MYSQL_TYPE_BIT:
+	case MYSQL_TYPE_STRING:
+	case MYSQL_TYPE_VAR_STRING:
+	case MYSQL_TYPE_TINY_BLOB:
+	case MYSQL_TYPE_MEDIUM_BLOB:
+	case MYSQL_TYPE_BLOB:
+	case MYSQL_TYPE_LONG_BLOB:
+	case MYSQL_TYPE_VARCHAR:
+	{
+		uchar tmp_str[REC_VERSION_56_MAX_INDEX_COL_LEN] = {'\0'};
+		ulint tmp_length = REC_VERSION_56_MAX_INDEX_COL_LEN;
+
+		/* Use the charset number to pick the right charset struct for
+		the comparison. Since the MySQL function get_charset may be
+		slow before Bar removes the mutex operation there, we first
+		look at 2 common charsets directly. */
+
+		if (charset_number == default_charset_info->number) {
+			charset = default_charset_info;
+		} else if (charset_number == my_charset_latin1.number) {
+			charset = &my_charset_latin1;
+		} else {
+			charset = get_charset(charset_number, MYF(MY_WME));
+
+			if (charset == NULL) {
+				sql_print_error("InnoDB needs charset %lu for doing "
+						"a comparison, but MariaDB cannot "
+						"find that charset.",
+						(ulong) charset_number);
+				ut_a(0);
+			}
+		}
+
+		ut_a(str_length <= tmp_length);
+		memcpy(tmp_str, str, str_length);
+
+		if (wsrep_protocol_version < 3) {
+			tmp_length = charset->strnxfrm(
+				str, str_length,
+				uint(str_length), tmp_str, tmp_length, 0);
+			DBUG_ASSERT(tmp_length <= str_length);
+		} else {
+			/* strnxfrm will expand the destination string;
+			   protocols < 3 truncated the sorted string,
+			   protocols >= 3 get the full sorted string
+			*/
+			tmp_length = charset->strnxfrm(
+				str, buf_length,
+				uint(str_length), tmp_str, str_length, 0);
+			DBUG_ASSERT(tmp_length <= buf_length);
+			ret_length = tmp_length;
+		}
+
+		break;
+	}
+	case MYSQL_TYPE_DECIMAL :
+	case MYSQL_TYPE_TINY :
+	case MYSQL_TYPE_SHORT :
+	case MYSQL_TYPE_LONG :
+	case MYSQL_TYPE_FLOAT :
+	case MYSQL_TYPE_DOUBLE :
+	case MYSQL_TYPE_NULL :
+	case MYSQL_TYPE_TIMESTAMP :
+	case MYSQL_TYPE_LONGLONG :
+	case MYSQL_TYPE_INT24 :
+	case MYSQL_TYPE_DATE :
+	case MYSQL_TYPE_TIME :
+	case MYSQL_TYPE_DATETIME :
+	case MYSQL_TYPE_YEAR :
+	case MYSQL_TYPE_NEWDATE :
+	case MYSQL_TYPE_NEWDECIMAL :
+	case MYSQL_TYPE_ENUM :
+	case MYSQL_TYPE_SET :
+	case MYSQL_TYPE_GEOMETRY :
+		break;
+	default:
+		break;
+	}
+
+	return ret_length;
+}
+#endif /* WITH_WSREP */
+
+/******************************************************************//**
+Compare two character strings according to their charset.
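+
+A minimal usage sketch (hedged; the literals, the field order shown
+and the latin1 collation are illustrative assumptions):
+
+  fts_string_t a = {(byte*) "apple", 5, 5};  // f_str, f_len, f_n_char
+  fts_string_t b = {(byte*) "apply", 5, 5};
+  int cmp = innobase_fts_text_cmp(cs, &a, &b);  // < 0: "apple" sorts first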
*/
+int
+innobase_fts_text_cmp(
+/*==================*/
+	const void*	cs,	/*!< in: Character set */
+	const void*	p1,	/*!< in: key */
+	const void*	p2)	/*!< in: node */
+{
+	const CHARSET_INFO*	charset = (const CHARSET_INFO*) cs;
+	const fts_string_t*	s1 = (const fts_string_t*) p1;
+	const fts_string_t*	s2 = (const fts_string_t*) p2;
+
+	return(ha_compare_word(charset,
+			       s1->f_str, static_cast<size_t>(s1->f_len),
+			       s2->f_str, static_cast<size_t>(s2->f_len)));
+}
+
+/******************************************************************//**
+Compare two character strings case-insensitively according to their
+charset. */
+int
+innobase_fts_text_case_cmp(
+/*=======================*/
+	const void*	cs,	/*!< in: Character set */
+	const void*	p1,	/*!< in: key */
+	const void*	p2)	/*!< in: node */
+{
+	const CHARSET_INFO*	charset = (const CHARSET_INFO*) cs;
+	const fts_string_t*	s1 = (const fts_string_t*) p1;
+	const fts_string_t*	s2 = (const fts_string_t*) p2;
+	ulint			newlen;
+
+	my_casedn_str(charset, (char*) s2->f_str);
+
+	newlen = strlen((const char*) s2->f_str);
+
+	return(ha_compare_word(charset,
+			       s1->f_str, static_cast<size_t>(s1->f_len),
+			       s2->f_str, static_cast<size_t>(newlen)));
+}
+
+/******************************************************************//**
+Get the first character's code position for FTS index partition. */
+ulint
+innobase_strnxfrm(
+/*==============*/
+	const CHARSET_INFO*
+			cs,	/*!< in: Character set */
+	const uchar*	str,	/*!< in: string */
+	const ulint	len)	/*!< in: string length */
+{
+	uchar	mystr[2];
+	ulint	value;
+
+	if (!str || len == 0) {
+		return(0);
+	}
+
+	cs->strnxfrm((uchar*) mystr, 2, str, len);
+
+	value = mach_read_from_2(mystr);
+
+	if (value > 255) {
+		value = value / 256;
+	}
+
+	return(value);
+}
+
+/******************************************************************//**
+Compare the prefix of two character strings according to their
+charset. */
+int
+innobase_fts_text_cmp_prefix(
+/*=========================*/
+	const void*	cs,	/*!< in: Character set */
+	const void*	p1,	/*!< in: prefix key */
+	const void*	p2)	/*!< in: value to compare */
+{
+	const CHARSET_INFO*	charset = (const CHARSET_INFO*) cs;
+	const fts_string_t*	s1 = (const fts_string_t*) p1;
+	const fts_string_t*	s2 = (const fts_string_t*) p2;
+	int			result;
+
+	result = ha_compare_word_prefix(charset,
+					s2->f_str, static_cast<size_t>(s2->f_len),
+					s1->f_str, static_cast<size_t>(s1->f_len));
+
+	/* We switched the s1, s2 positions in the above call, so we need
+	to negate the result. */
+	return(-result);
+}
+
+/******************************************************************//**
+Makes all characters in a string lower case. */
+size_t
+innobase_fts_casedn_str(
+/*====================*/
+	CHARSET_INFO*	cs,	/*!< in: Character set */
+	char*		src,	/*!< in: string to put in lower case */
+	size_t		src_len,/*!< in: input string length */
+	char*		dst,	/*!< in: buffer for result string */
+	size_t		dst_len)/*!< in: buffer size */
+{
+	if (cs->casedn_multiply() == 1) {
+		memcpy(dst, src, src_len);
+		dst[src_len] = 0;
+		my_casedn_str(cs, dst);
+
+		return(strlen(dst));
+	} else {
+		return(cs->casedn(src, src_len, dst, dst_len));
+	}
+}
+
+#define true_word_char(c, ch) ((c) & (_MY_U | _MY_L | _MY_NMR) || (ch) == '_')
+
+#define misc_word_char(X)	0
+
+/*************************************************************//**
+Get the next token from the given string and store it in *token.
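+
+A hedged illustration of the loop below with a latin1-style charset:
+given "o'clock", the apostrophe after the single leading character
+restarts the token just past it (the reset_token_str path), so the
+call returns the token "clock"; given "it's", the apostrophe ends the
+token instead, returning "it" (a subsequent call then picks up "s").
+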
+It is mostly copied from MyISAM's doc parsing function ft_simple_get_word() +@return length of string processed */ +ulint +innobase_mysql_fts_get_token( +/*=========================*/ + CHARSET_INFO* cs, /*!< in: Character set */ + const byte* start, /*!< in: start of text */ + const byte* end, /*!< in: one character past end of + text */ + fts_string_t* token) /*!< out: token's text */ +{ + int mbl; + const uchar* doc = start; + + ut_a(cs); + + token->f_n_char = token->f_len = 0; + token->f_str = NULL; + + for (;;) { + + if (doc >= end) { + return ulint(doc - start); + } + + int ctype; + + mbl = cs->ctype(&ctype, doc, (const uchar*) end); + + if (true_word_char(ctype, *doc)) { + break; + } + + doc += mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1); + } + + ulint mwc = 0; + ulint length = 0; + bool reset_token_str = false; +reset: + token->f_str = const_cast(doc); + + while (doc < end) { + + int ctype; + + mbl = cs->ctype(&ctype, (uchar*) doc, (uchar*) end); + if (true_word_char(ctype, *doc)) { + mwc = 0; + } else if (*doc == '\'' && length == 1) { + /* Could be apostrophe */ + reset_token_str = true; + } else if (!misc_word_char(*doc) || mwc) { + break; + } else { + ++mwc; + } + + ++length; + + doc += mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1); + if (reset_token_str) { + /* Reset the token if the single character + followed by apostrophe */ + mwc = 0; + length = 0; + reset_token_str = false; + goto reset; + } + } + + token->f_len = (uint) (doc - token->f_str) - mwc; + token->f_n_char = length; + + return ulint(doc - start); +} + +/** Converts a MySQL type to an InnoDB type. Note that this function returns +the 'mtype' of InnoDB. InnoDB differentiates between MySQL's old <= 4.1 +VARCHAR and the new true VARCHAR in >= 5.0.3 by the 'prtype'. +@param[out] unsigned_flag DATA_UNSIGNED if an 'unsigned type'; at least +ENUM and SET, and unsigned integer types are 'unsigned types' +@param[in] f MySQL Field +@return DATA_BINARY, DATA_VARCHAR, ... */ +uint8_t +get_innobase_type_from_mysql_type(unsigned *unsigned_flag, const Field *field) +{ + /* The following asserts try to check that the MySQL type code fits in + 8 bits: this is used in ibuf and also when DATA_NOT_NULL is ORed to + the type */ + + static_assert(MYSQL_TYPE_STRING < 256, "compatibility"); + static_assert(MYSQL_TYPE_VAR_STRING < 256, "compatibility"); + static_assert(MYSQL_TYPE_DOUBLE < 256, "compatibility"); + static_assert(MYSQL_TYPE_FLOAT < 256, "compatibility"); + static_assert(MYSQL_TYPE_DECIMAL < 256, "compatibility"); + + if (field->flags & UNSIGNED_FLAG) { + + *unsigned_flag = DATA_UNSIGNED; + } else { + *unsigned_flag = 0; + } + + if (field->real_type() == MYSQL_TYPE_ENUM + || field->real_type() == MYSQL_TYPE_SET) { + + /* MySQL has field->type() a string type for these, but the + data is actually internally stored as an unsigned integer + code! 
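+
+	(Hedged example: a column declared ENUM('red','green','blue')
+	stores the unsigned codes 1, 2 and 3 rather than the strings,
+	so the function returns DATA_INT with DATA_UNSIGNED set for
+	it.)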
*/ + + *unsigned_flag = DATA_UNSIGNED; /* MySQL has its own unsigned + flag set to zero, even though + internally this is an unsigned + integer type */ + return(DATA_INT); + } + + switch (field->type()) { + /* NOTE that we only allow string types in DATA_MYSQL and + DATA_VARMYSQL */ + case MYSQL_TYPE_VAR_STRING: /* old <= 4.1 VARCHAR */ + case MYSQL_TYPE_VARCHAR: /* new >= 5.0.3 true VARCHAR */ + if (field->binary()) { + return(DATA_BINARY); + } else if (field->charset() == &my_charset_latin1) { + return(DATA_VARCHAR); + } else { + return(DATA_VARMYSQL); + } + case MYSQL_TYPE_BIT: + case MYSQL_TYPE_STRING: + if (field->binary() || field->key_type() == HA_KEYTYPE_BINARY) { + return(DATA_FIXBINARY); + } else if (field->charset() == &my_charset_latin1) { + return(DATA_CHAR); + } else { + return(DATA_MYSQL); + } + case MYSQL_TYPE_NEWDECIMAL: + return(DATA_FIXBINARY); + case MYSQL_TYPE_LONG: + case MYSQL_TYPE_LONGLONG: + case MYSQL_TYPE_TINY: + case MYSQL_TYPE_SHORT: + case MYSQL_TYPE_INT24: + case MYSQL_TYPE_DATE: + case MYSQL_TYPE_YEAR: + case MYSQL_TYPE_NEWDATE: + return(DATA_INT); + case MYSQL_TYPE_TIME: + case MYSQL_TYPE_DATETIME: + case MYSQL_TYPE_TIMESTAMP: + if (field->key_type() == HA_KEYTYPE_BINARY) { + return(DATA_FIXBINARY); + } else { + return(DATA_INT); + } + case MYSQL_TYPE_FLOAT: + return(DATA_FLOAT); + case MYSQL_TYPE_DOUBLE: + return(DATA_DOUBLE); + case MYSQL_TYPE_DECIMAL: + return(DATA_DECIMAL); + case MYSQL_TYPE_GEOMETRY: + return(DATA_GEOMETRY); + case MYSQL_TYPE_TINY_BLOB: + case MYSQL_TYPE_MEDIUM_BLOB: + case MYSQL_TYPE_BLOB: + case MYSQL_TYPE_LONG_BLOB: + return(DATA_BLOB); + case MYSQL_TYPE_NULL: + /* MySQL currently accepts "NULL" datatype, but will + reject such datatype in the next release. We will cope + with it and not trigger assertion failure in 5.1 */ + break; + default: + ut_error; + } + + return(0); +} + +/*******************************************************************//** +Reads an unsigned integer value < 64k from 2 bytes, in the little-endian +storage format. +@return value */ +static inline +uint +innobase_read_from_2_little_endian( +/*===============================*/ + const uchar* buf) /*!< in: from where to read */ +{ + return((uint) ((ulint)(buf[0]) + 256 * ((ulint)(buf[1])))); +} + +#ifdef WITH_WSREP +/*******************************************************************//** +Stores a key value for a row to a buffer. 
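+
+A hedged sketch of the buffer layout produced below for a two-part key
+(nullable VARCHAR, INT NOT NULL): one 0/1 null-indicator byte for the
+nullable part, then the VARCHAR prefix transformed by
+wsrep_innobase_mysql_sort() so that a plain memcmp() respects the
+collation order, then the raw INT key bytes.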
+@return key value length as stored in buff */ +static +uint16_t +wsrep_store_key_val_for_row( +/*=========================*/ + THD* thd, + TABLE* table, + uint keynr, /*!< in: key number */ + char* buff, /*!< in/out: buffer for the key value (in MySQL + format) */ + uint buff_len,/*!< in: buffer length */ + const uchar* record, + bool* key_is_null)/*!< out: full key was null */ +{ + KEY* key_info = table->key_info + keynr; + KEY_PART_INFO* key_part = key_info->key_part; + KEY_PART_INFO* end = key_part + key_info->user_defined_key_parts; + char* buff_start = buff; + enum_field_types mysql_type; + Field* field; + ulint buff_space = buff_len; + + DBUG_ENTER("wsrep_store_key_val_for_row"); + + memset(buff, 0, buff_len); + *key_is_null = true; + + for (; key_part != end; key_part++) { + uchar sorted[REC_VERSION_56_MAX_INDEX_COL_LEN] = {'\0'}; + bool part_is_null = false; + + if (key_part->null_bit) { + if (buff_space > 0) { + if (record[key_part->null_offset] + & key_part->null_bit) { + *buff = 1; + part_is_null = true; + } else { + *buff = 0; + } + buff++; + buff_space--; + } else { + fprintf (stderr, "WSREP: key truncated: %s\n", + wsrep_thd_query(thd)); + } + } + if (!part_is_null) *key_is_null = false; + + field = key_part->field; + mysql_type = field->type(); + + if (mysql_type == MYSQL_TYPE_VARCHAR) { + /* >= 5.0.3 true VARCHAR */ + ulint lenlen; + ulint len; + const byte* data; + ulint key_len; + ulint true_len; + const CHARSET_INFO* cs; + int error=0; + + key_len = key_part->length; + + if (part_is_null) { + true_len = key_len + 2; + if (true_len > buff_space) { + fprintf (stderr, + "WSREP: key truncated: %s\n", + wsrep_thd_query(thd)); + true_len = buff_space; + } + buff += true_len; + buff_space -= true_len; + continue; + } + cs = field->charset(); + + lenlen = (ulint) + (((Field_varstring*)field)->length_bytes); + + data = row_mysql_read_true_varchar(&len, + (byte*) (record + + (ulint)get_field_offset(table, field)), + lenlen); + + true_len = len; + + /* For multi byte character sets we need to calculate + the true length of the key */ + + if (len > 0 && cs->mbmaxlen > 1) { + true_len = (ulint) my_well_formed_length(cs, + (const char *) data, + (const char *) data + len, + (uint) (key_len / + cs->mbmaxlen), + &error); + } + + /* In a column prefix index, we may need to truncate + the stored value: */ + if (true_len > key_len) { + true_len = key_len; + } + /* cannot exceed max column lenght either, we may need to truncate + the stored value: */ + if (true_len > sizeof(sorted)) { + true_len = sizeof(sorted); + } + + memcpy(sorted, data, true_len); + true_len = wsrep_innobase_mysql_sort( + mysql_type, cs->number, sorted, true_len, + REC_VERSION_56_MAX_INDEX_COL_LEN); + if (wsrep_protocol_version > 1) { + /* Note that we always reserve the maximum possible + length of the true VARCHAR in the key value, though + only len first bytes after the 2 length bytes contain + actual data. The rest of the space was reset to zero + in the bzero() call above. */ + if (true_len > buff_space) { + WSREP_DEBUG ( + "write set key truncated for: %s\n", + wsrep_thd_query(thd)); + true_len = buff_space; + } + memcpy(buff, sorted, true_len); + buff += true_len; + buff_space -= true_len; + } else { + buff += key_len; + } + } else if (mysql_type == MYSQL_TYPE_TINY_BLOB + || mysql_type == MYSQL_TYPE_MEDIUM_BLOB + || mysql_type == MYSQL_TYPE_BLOB + || mysql_type == MYSQL_TYPE_LONG_BLOB + /* MYSQL_TYPE_GEOMETRY data is treated + as BLOB data in innodb. 
*/ + || mysql_type == MYSQL_TYPE_GEOMETRY) { + + const CHARSET_INFO* cs; + ulint key_len; + ulint true_len; + int error=0; + ulint blob_len; + const byte* blob_data; + + ut_a(key_part->key_part_flag & HA_PART_KEY_SEG); + + key_len = key_part->length; + + if (part_is_null) { + true_len = key_len + 2; + if (true_len > buff_space) { + fprintf (stderr, + "WSREP: key truncated: %s\n", + wsrep_thd_query(thd)); + true_len = buff_space; + } + buff += true_len; + buff_space -= true_len; + + continue; + } + + cs = field->charset(); + + blob_data = row_mysql_read_blob_ref(&blob_len, + (byte*) (record + + (ulint)get_field_offset(table, field)), + (ulint) field->pack_length()); + + true_len = blob_len; + + ut_a(get_field_offset(table, field) + == key_part->offset); + + /* For multi byte character sets we need to calculate + the true length of the key */ + + if (blob_len > 0 && cs->mbmaxlen > 1) { + true_len = (ulint) my_well_formed_length(cs, + (const char *) blob_data, + (const char *) blob_data + + blob_len, + (uint) (key_len / + cs->mbmaxlen), + &error); + } + + /* All indexes on BLOB and TEXT are column prefix + indexes, and we may need to truncate the data to be + stored in the key value: */ + + if (true_len > key_len) { + true_len = key_len; + } + + memcpy(sorted, blob_data, true_len); + true_len = wsrep_innobase_mysql_sort( + mysql_type, cs->number, sorted, true_len, + REC_VERSION_56_MAX_INDEX_COL_LEN); + + + /* Note that we always reserve the maximum possible + length of the BLOB prefix in the key value. */ + if (wsrep_protocol_version > 1) { + if (true_len > buff_space) { + fprintf (stderr, + "WSREP: key truncated: %s\n", + wsrep_thd_query(thd)); + true_len = buff_space; + } + buff += true_len; + buff_space -= true_len; + } else { + buff += key_len; + } + memcpy(buff, sorted, true_len); + } else { + /* Here we handle all other data types except the + true VARCHAR, BLOB and TEXT. Note that the column + value we store may be also in a column prefix + index. */ + + const CHARSET_INFO* cs = NULL; + ulint true_len; + ulint key_len; + const uchar* src_start; + int error=0; + enum_field_types real_type; + + key_len = key_part->length; + + if (part_is_null) { + true_len = key_len; + if (true_len > buff_space) { + fprintf (stderr, + "WSREP: key truncated: %s\n", + wsrep_thd_query(thd)); + true_len = buff_space; + } + buff += true_len; + buff_space -= true_len; + + continue; + } + + src_start = record + key_part->offset; + real_type = field->real_type(); + true_len = key_len; + + /* Character set for the field is defined only + to fields whose type is string and real field + type is not enum or set. For these fields check + if character set is multi byte. 
*/ + + if (real_type != MYSQL_TYPE_ENUM + && real_type != MYSQL_TYPE_SET + && ( mysql_type == MYSQL_TYPE_VAR_STRING + || mysql_type == MYSQL_TYPE_STRING)) { + + cs = field->charset(); + + /* For multi byte character sets we need to + calculate the true length of the key */ + + if (key_len > 0 && cs->mbmaxlen > 1) { + + true_len = (ulint) + my_well_formed_length(cs, + (const char *)src_start, + (const char *)src_start + + key_len, + (uint) (key_len / + cs->mbmaxlen), + &error); + } + memcpy(sorted, src_start, true_len); + true_len = wsrep_innobase_mysql_sort( + mysql_type, cs->number, sorted, true_len, + REC_VERSION_56_MAX_INDEX_COL_LEN); + + if (true_len > buff_space) { + fprintf (stderr, + "WSREP: key truncated: %s\n", + wsrep_thd_query(thd)); + true_len = buff_space; + } + memcpy(buff, sorted, true_len); + } else { + memcpy(buff, src_start, true_len); + } + buff += true_len; + buff_space -= true_len; + } + } + + ut_a(buff <= buff_start + buff_len); + + DBUG_RETURN(static_cast(buff - buff_start)); +} +#endif /* WITH_WSREP */ +/**************************************************************//** +Determines if a field is needed in a m_prebuilt struct 'template'. +@return field to use, or NULL if the field is not needed */ +static +const Field* +build_template_needs_field( +/*=======================*/ + bool index_contains, /*!< in: + dict_index_t::contains_col_or_prefix( + i) */ + bool read_just_key, /*!< in: TRUE when MySQL calls + ha_innobase::extra with the + argument HA_EXTRA_KEYREAD; it is enough + to read just columns defined in + the index (i.e., no read of the + clustered index record necessary) */ + bool fetch_all_in_key, + /*!< in: true=fetch all fields in + the index */ + bool fetch_primary_key_cols, + /*!< in: true=fetch the + primary key columns */ + dict_index_t* index, /*!< in: InnoDB index to use */ + const TABLE* table, /*!< in: MySQL table object */ + ulint i, /*!< in: field index in InnoDB table */ + ulint num_v) /*!< in: num virtual column so far */ +{ + const Field* field = table->field[i]; + + if (!field->stored_in_db() + && ha_innobase::omits_virtual_cols(*table->s)) { + return NULL; + } + + if (!index_contains) { + if (read_just_key) { + /* If this is a 'key read', we do not need + columns that are not in the key */ + + return(NULL); + } + } else if (fetch_all_in_key) { + /* This field is needed in the query */ + + return(field); + } + + if (bitmap_is_set(table->read_set, static_cast(i)) + || bitmap_is_set(table->write_set, static_cast(i))) { + /* This field is needed in the query */ + + return(field); + } + + ut_ad(i >= num_v); + if (fetch_primary_key_cols + && dict_table_col_in_clustered_key(index->table, i - num_v)) { + /* This field is needed in the query */ + return(field); + } + + /* This field is not needed in the query, skip it */ + + return(NULL); +} + +/**************************************************************//** +Determines if a field is needed in a m_prebuilt struct 'template'. +@return whether the field is needed for index condition pushdown */ +inline +bool +build_template_needs_field_in_icp( +/*==============================*/ + const dict_index_t* index, /*!< in: InnoDB index */ + const row_prebuilt_t* prebuilt,/*!< in: row fetch template */ + bool contains,/*!< in: whether the index contains + column i */ + ulint i, /*!< in: column number */ + bool is_virtual) + /*!< in: a virtual column or not */ +{ + ut_ad(contains == index->contains_col_or_prefix(i, is_virtual)); + + return(index == prebuilt->index + ? 
contains + : prebuilt->index->contains_col_or_prefix(i, is_virtual)); +} + +/**************************************************************//** +Adds a field to a m_prebuilt struct 'template'. +@return the field template */ +static +mysql_row_templ_t* +build_template_field( +/*=================*/ + row_prebuilt_t* prebuilt, /*!< in/out: template */ + dict_index_t* clust_index, /*!< in: InnoDB clustered index */ + dict_index_t* index, /*!< in: InnoDB index to use */ + TABLE* table, /*!< in: MySQL table object */ + const Field* field, /*!< in: field in MySQL table */ + ulint i, /*!< in: field index in InnoDB table */ + ulint v_no) /*!< in: field index for virtual col */ +{ + mysql_row_templ_t* templ; + const dict_col_t* col; + + ut_ad(clust_index->table == index->table); + + templ = prebuilt->mysql_template + prebuilt->n_template++; + MEM_UNDEFINED(templ, sizeof *templ); + templ->rec_field_is_prefix = FALSE; + templ->rec_prefix_field_no = ULINT_UNDEFINED; + templ->is_virtual = !field->stored_in_db(); + + if (!templ->is_virtual) { + templ->col_no = i; + col = dict_table_get_nth_col(index->table, i); + templ->clust_rec_field_no = dict_col_get_clust_pos( + col, clust_index); + /* If clustered index record field is not found, lets print out + field names and all the rest to understand why field is not found. */ + if (templ->clust_rec_field_no == ULINT_UNDEFINED) { + const char* tb_col_name = dict_table_get_col_name(clust_index->table, i); + dict_field_t* field=NULL; + size_t size = 0; + + for(ulint j=0; j < clust_index->n_user_defined_cols; j++) { + dict_field_t* ifield = &(clust_index->fields[j]); + if (ifield && !memcmp(tb_col_name, ifield->name, + strlen(tb_col_name))) { + field = ifield; + break; + } + } + + ib::info() << "Looking for field " << i << " name " + << (tb_col_name ? tb_col_name : "NULL") + << " from table " << clust_index->table->name; + + + for(ulint j=0; j < clust_index->n_user_defined_cols; j++) { + dict_field_t* ifield = &(clust_index->fields[j]); + ib::info() << "InnoDB Table " + << clust_index->table->name + << "field " << j << " name " + << (ifield ? ifield->name() : "NULL"); + } + + for(ulint j=0; j < table->s->stored_fields; j++) { + ib::info() << "MySQL table " + << table->s->table_name.str + << " field " << j << " name " + << table->field[j]->field_name.str; + } + + ib::fatal() << "Clustered record field for column " << i + << " not found table n_user_defined " + << clust_index->n_user_defined_cols + << " index n_user_defined " + << clust_index->table->n_cols - DATA_N_SYS_COLS + << " InnoDB table " + << clust_index->table->name + << " field name " + << (field ? field->name() : "NULL") + << " MySQL table " + << table->s->table_name.str + << " field name " + << (tb_col_name ? 
tb_col_name : "NULL") + << " n_fields " + << table->s->stored_fields + << " query " + << innobase_get_stmt_unsafe(current_thd, &size); + } + + if (dict_index_is_clust(index)) { + templ->rec_field_no = templ->clust_rec_field_no; + } else { + /* If we're in a secondary index, keep track + * of the original index position even if this + * is just a prefix index; we will use this + * later to avoid a cluster index lookup in + * some cases.*/ + + templ->rec_field_no = dict_index_get_nth_col_pos(index, i, + &templ->rec_prefix_field_no); + } + } else { + DBUG_ASSERT(!ha_innobase::omits_virtual_cols(*table->s)); + col = &dict_table_get_nth_v_col(index->table, v_no)->m_col; + templ->clust_rec_field_no = v_no; + + if (dict_index_is_clust(index)) { + templ->rec_field_no = templ->clust_rec_field_no; + } else { + templ->rec_field_no + = dict_index_get_nth_col_or_prefix_pos( + index, v_no, FALSE, true, + &templ->rec_prefix_field_no); + } + templ->icp_rec_field_no = ULINT_UNDEFINED; + } + + if (field->real_maybe_null()) { + templ->mysql_null_byte_offset = + field->null_offset(); + + templ->mysql_null_bit_mask = (ulint) field->null_bit; + } else { + templ->mysql_null_bit_mask = 0; + } + + + templ->mysql_col_offset = (ulint) get_field_offset(table, field); + templ->mysql_col_len = (ulint) field->pack_length(); + templ->type = col->mtype; + templ->mysql_type = (ulint) field->type(); + + if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) { + templ->mysql_length_bytes = (ulint) + (((Field_varstring*) field)->length_bytes); + } else { + templ->mysql_length_bytes = 0; + } + + templ->charset = dtype_get_charset_coll(col->prtype); + templ->mbminlen = dict_col_get_mbminlen(col); + templ->mbmaxlen = dict_col_get_mbmaxlen(col); + templ->is_unsigned = col->prtype & DATA_UNSIGNED; + + if (!dict_index_is_clust(index) + && templ->rec_field_no == ULINT_UNDEFINED) { + prebuilt->need_to_access_clustered = TRUE; + + if (templ->rec_prefix_field_no != ULINT_UNDEFINED) { + dict_field_t* field = dict_index_get_nth_field( + index, + templ->rec_prefix_field_no); + templ->rec_field_is_prefix = (field->prefix_len != 0); + } + } + + /* For spatial index, we need to access cluster index. */ + if (dict_index_is_spatial(index)) { + prebuilt->need_to_access_clustered = TRUE; + } + + if (prebuilt->mysql_prefix_len < templ->mysql_col_offset + + templ->mysql_col_len) { + prebuilt->mysql_prefix_len = templ->mysql_col_offset + + templ->mysql_col_len; + } + + if (DATA_LARGE_MTYPE(templ->type)) { + prebuilt->templ_contains_blob = TRUE; + } + + return(templ); +} + +/**************************************************************//** +Builds a 'template' to the m_prebuilt struct. The template is used in fast +retrieval of just those column values MySQL needs in its processing. 
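+
+A hedged example of the effect: for SELECT a FROM t WHERE pk = 1,
+m_prebuilt->mysql_template[] receives entries only for column "a" and
+for any primary key or pushed-condition columns the plan needs, rather
+than one entry per column of t, so row_search_mvcc() has to convert
+only those fields to MySQL format.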
*/ + +void +ha_innobase::build_template( +/*========================*/ + bool whole_row) /*!< in: true=ROW_MYSQL_WHOLE_ROW, + false=ROW_MYSQL_REC_FIELDS */ +{ + dict_index_t* index; + dict_index_t* clust_index; + ibool fetch_all_in_key = FALSE; + ibool fetch_primary_key_cols = FALSE; + + if (m_prebuilt->select_lock_type == LOCK_X || m_prebuilt->table->no_rollback()) { + /* We always retrieve the whole clustered index record if we + use exclusive row level locks, for example, if the read is + done in an UPDATE statement or if we are using a no rollback + table */ + + whole_row = true; + } else if (!whole_row) { + if (m_prebuilt->hint_need_to_fetch_extra_cols + == ROW_RETRIEVE_ALL_COLS) { + + /* We know we must at least fetch all columns in the + key, or all columns in the table */ + + if (m_prebuilt->read_just_key) { + /* MySQL has instructed us that it is enough + to fetch the columns in the key; looks like + MySQL can set this flag also when there is + only a prefix of the column in the key: in + that case we retrieve the whole column from + the clustered index */ + + fetch_all_in_key = TRUE; + } else { + whole_row = true; + } + } else if (m_prebuilt->hint_need_to_fetch_extra_cols + == ROW_RETRIEVE_PRIMARY_KEY) { + /* We must at least fetch all primary key cols. Note + that if the clustered index was internally generated + by InnoDB on the row id (no primary key was + defined), then row_search_mvcc() will always + retrieve the row id to a special buffer in the + m_prebuilt struct. */ + + fetch_primary_key_cols = TRUE; + } + } + + clust_index = dict_table_get_first_index(m_prebuilt->table); + + index = whole_row ? clust_index : m_prebuilt->index; + + m_prebuilt->versioned_write = table->versioned_write(VERS_TRX_ID); + m_prebuilt->need_to_access_clustered = (index == clust_index); + + if (m_prebuilt->in_fts_query) { + /* Do clustered index lookup to fetch the FTS_DOC_ID */ + m_prebuilt->need_to_access_clustered = true; + } + + /* Either m_prebuilt->index should be a secondary index, or it + should be the clustered index. */ + ut_ad(dict_index_is_clust(index) == (index == clust_index)); + + /* Below we check column by column if we need to access + the clustered index. */ + + if (pushed_rowid_filter && rowid_filter_is_active) { + fetch_primary_key_cols = TRUE; + m_prebuilt->pk_filter = this; + } else { + m_prebuilt->pk_filter = NULL; + } + + const bool skip_virtual = omits_virtual_cols(*table_share); + const ulint n_fields = table_share->fields; + + if (!m_prebuilt->mysql_template) { + m_prebuilt->mysql_template = (mysql_row_templ_t*) + ut_malloc_nokey(n_fields * sizeof(mysql_row_templ_t)); + } + + m_prebuilt->template_type = whole_row + ? ROW_MYSQL_WHOLE_ROW : ROW_MYSQL_REC_FIELDS; + m_prebuilt->null_bitmap_len = table->s->null_bytes + & dict_index_t::MAX_N_FIELDS; + + /* Prepare to build m_prebuilt->mysql_template[]. */ + m_prebuilt->templ_contains_blob = FALSE; + m_prebuilt->mysql_prefix_len = 0; + m_prebuilt->n_template = 0; + m_prebuilt->idx_cond_n_cols = 0; + + /* Note that in InnoDB, i is the column number in the table. + MySQL calls columns 'fields'. */ + + ulint num_v = 0; + + if (active_index != MAX_KEY + && active_index == pushed_idx_cond_keyno) { + m_prebuilt->idx_cond = this; + goto icp; + } else if (pushed_rowid_filter && rowid_filter_is_active) { +icp: + /* Push down an index condition or an end_range check. 
*/ + for (ulint i = 0; i < n_fields; i++) { + const Field* field = table->field[i]; + const bool is_v = !field->stored_in_db(); + if (is_v && skip_virtual) { + num_v++; + continue; + } + bool index_contains = index->contains_col_or_prefix( + is_v ? num_v : i - num_v, is_v); + if (is_v && index_contains) { + m_prebuilt->n_template = 0; + num_v = 0; + goto no_icp; + } + + /* Test if an end_range or an index condition + refers to the field. Note that "index" and + "index_contains" may refer to the clustered index. + Index condition pushdown is relative to + m_prebuilt->index (the index that is being + looked up first). */ + + /* When join_read_always_key() invokes this + code via handler::ha_index_init() and + ha_innobase::index_init(), end_range is not + yet initialized. Because of that, we must + always check for index_contains, instead of + the subset + field->part_of_key.is_set(active_index) + which would be acceptable if end_range==NULL. */ + if (build_template_needs_field_in_icp( + index, m_prebuilt, index_contains, + is_v ? num_v : i - num_v, is_v)) { + if (!whole_row) { + field = build_template_needs_field( + index_contains, + m_prebuilt->read_just_key, + fetch_all_in_key, + fetch_primary_key_cols, + index, table, i, num_v); + if (!field) { + if (is_v) { + num_v++; + } + continue; + } + } + + ut_ad(!is_v); + + mysql_row_templ_t* templ= build_template_field( + m_prebuilt, clust_index, index, + table, field, i - num_v, 0); + + ut_ad(!templ->is_virtual); + + m_prebuilt->idx_cond_n_cols++; + ut_ad(m_prebuilt->idx_cond_n_cols + == m_prebuilt->n_template); + + if (index == m_prebuilt->index) { + templ->icp_rec_field_no + = templ->rec_field_no; + } else { + templ->icp_rec_field_no + = dict_index_get_nth_col_pos( + m_prebuilt->index, + i - num_v, + &templ->rec_prefix_field_no); + } + + if (dict_index_is_clust(m_prebuilt->index)) { + ut_ad(templ->icp_rec_field_no + != ULINT_UNDEFINED); + /* If the primary key includes + a column prefix, use it in + index condition pushdown, + because the condition is + evaluated before fetching any + off-page (externally stored) + columns. */ + if (templ->icp_rec_field_no + < m_prebuilt->index->n_uniq) { + /* This is a key column; + all set. */ + continue; + } + } else if (templ->icp_rec_field_no + != ULINT_UNDEFINED) { + continue; + } + + /* This is a column prefix index. + The column prefix can be used in + an end_range comparison. */ + + templ->icp_rec_field_no + = dict_index_get_nth_col_or_prefix_pos( + m_prebuilt->index, i - num_v, + true, false, + &templ->rec_prefix_field_no); + ut_ad(templ->icp_rec_field_no + != ULINT_UNDEFINED); + + /* Index condition pushdown can be used on + all columns of a secondary index, and on + the PRIMARY KEY columns. On the clustered + index, it must never be used on other than + PRIMARY KEY columns, because those columns + may be stored off-page, and we will not + fetch externally stored columns before + checking the index condition. */ + /* TODO: test the above with an assertion + like this. Note that index conditions are + currently pushed down as part of the + "optimizer phase" while end_range is done + as part of the execution phase. Therefore, + we were unable to use an accurate condition + for end_range in the "if" condition above, + and the following assertion would fail. 
+ ut_ad(!dict_index_is_clust(m_prebuilt->index) + || templ->rec_field_no + < m_prebuilt->index->n_uniq); + */ + } + + if (is_v) { + num_v++; + } + } + + ut_ad(m_prebuilt->idx_cond_n_cols > 0); + ut_ad(m_prebuilt->idx_cond_n_cols == m_prebuilt->n_template); + + num_v = 0; + + /* Include the fields that are not needed in index condition + pushdown. */ + for (ulint i = 0; i < n_fields; i++) { + const Field* field = table->field[i]; + const bool is_v = !field->stored_in_db(); + if (is_v && skip_virtual) { + num_v++; + continue; + } + + bool index_contains = index->contains_col_or_prefix( + is_v ? num_v : i - num_v, is_v); + + if (!build_template_needs_field_in_icp( + index, m_prebuilt, index_contains, + is_v ? num_v : i - num_v, is_v)) { + /* Not needed in ICP */ + if (!whole_row) { + field = build_template_needs_field( + index_contains, + m_prebuilt->read_just_key, + fetch_all_in_key, + fetch_primary_key_cols, + index, table, i, num_v); + if (!field) { + if (is_v) { + num_v++; + } + continue; + } + } + + ut_d(mysql_row_templ_t* templ =) + build_template_field( + m_prebuilt, clust_index, index, + table, field, i - num_v, num_v); + ut_ad(templ->is_virtual == (ulint)is_v); + + if (is_v) { + num_v++; + } + } + } + } else { +no_icp: + /* No index condition pushdown */ + m_prebuilt->idx_cond = NULL; + ut_ad(num_v == 0); + + for (ulint i = 0; i < n_fields; i++) { + const Field* field = table->field[i]; + const bool is_v = !field->stored_in_db(); + + if (whole_row) { + if (is_v && skip_virtual) { + num_v++; + continue; + } + /* Even if this is whole_row, if the search is + on a virtual column, read_just_key is + set, and the field is not in this index, we + will not try to fill in the value, since it + is stored neither in this index nor in the + clustered index. */ + if (is_v + && m_prebuilt->read_just_key + && !m_prebuilt->index->contains_col_or_prefix( + num_v, true)) + { + /* Turn off ROW_MYSQL_WHOLE_ROW */ + m_prebuilt->template_type = + ROW_MYSQL_REC_FIELDS; + num_v++; + continue; + } + } else { + if (is_v + && (skip_virtual || index->is_primary())) { + num_v++; + continue; + } + + bool contain = index->contains_col_or_prefix( + is_v ? num_v: i - num_v, is_v); + + field = build_template_needs_field( + contain, + m_prebuilt->read_just_key, + fetch_all_in_key, + fetch_primary_key_cols, + index, table, i, num_v); + if (!field) { + if (is_v) { + num_v++; + } + continue; + } + } + + ut_d(mysql_row_templ_t* templ =) + build_template_field( + m_prebuilt, clust_index, index, + table, field, i - num_v, num_v); + ut_ad(templ->is_virtual == (ulint)is_v); + if (is_v) { + num_v++; + } + } + } + + if (index != clust_index && m_prebuilt->need_to_access_clustered) { + /* Change rec_field_no's to correspond to the clustered index + record */ + for (ulint i = 0; i < m_prebuilt->n_template; i++) { + mysql_row_templ_t* templ + = &m_prebuilt->mysql_template[i]; + + templ->rec_field_no = templ->clust_rec_field_no; + } + } +} + +/********************************************************************//** +This special handling is really to overcome the limitations of MySQL's +binlogging. We need to eliminate the non-determinism that will arise in +INSERT ... SELECT type of statements, since MySQL binlog only stores the +min value of the autoinc interval. Once that is fixed we can get rid of +the special lock handling.
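As a rough aside, the decision made by the function below can be compressed into a few lines; the names here are invented for illustration and elide the real locking details.

enum class Mode { NO_LOCKING, NEW_STYLE, OLD_STYLE };
enum class Lock { MUTEX_ONLY, TABLE_AUTOINC };

static Lock choose_autoinc_lock(Mode m, bool simple_insert,
                                bool table_lock_already_held)
{
    if (m == Mode::NO_LOCKING)
        return Lock::MUTEX_ONLY;          // interleaved values, mutex only
    if (m == Mode::NEW_STYLE && simple_insert && !table_lock_already_held)
        return Lock::MUTEX_ONLY;          // fast path for plain INSERT
    return Lock::TABLE_AUTOINC;           // statement-scope AUTO-INC lock
}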
+@return DB_SUCCESS if all OK else error code */ + +dberr_t +ha_innobase::innobase_lock_autoinc(void) +/*====================================*/ +{ + DBUG_ENTER("ha_innobase::innobase_lock_autoinc"); + dberr_t error = DB_SUCCESS; + + ut_ad(!srv_read_only_mode); + + switch (innobase_autoinc_lock_mode) { + case AUTOINC_NO_LOCKING: + /* Acquire only the AUTOINC mutex. */ + m_prebuilt->table->autoinc_mutex.wr_lock(); + break; + + case AUTOINC_NEW_STYLE_LOCKING: + /* For simple (single/multi) row INSERTs/REPLACEs and RBR + events, we fallback to the old style only if another + transaction has already acquired the AUTOINC lock on + behalf of a LOAD FILE or INSERT ... SELECT etc. type of + statement. */ + switch (thd_sql_command(m_user_thd)) { + case SQLCOM_INSERT: + case SQLCOM_REPLACE: + case SQLCOM_END: // RBR event + /* Acquire the AUTOINC mutex. */ + m_prebuilt->table->autoinc_mutex.wr_lock(); + /* We need to check that another transaction isn't + already holding the AUTOINC lock on the table. */ + if (!m_prebuilt->table->n_waiting_or_granted_auto_inc_locks) { + /* Do not fall back to old style locking. */ + DBUG_RETURN(error); + } + m_prebuilt->table->autoinc_mutex.wr_unlock(); + } + /* Use old style locking. */ + /* fall through */ + case AUTOINC_OLD_STYLE_LOCKING: + DBUG_EXECUTE_IF("die_if_autoinc_old_lock_style_used", + ut_ad(0);); + error = row_lock_table_autoinc_for_mysql(m_prebuilt); + + if (error == DB_SUCCESS) { + + /* Acquire the AUTOINC mutex. */ + m_prebuilt->table->autoinc_mutex.wr_lock(); + } + break; + + default: + ut_error; + } + + DBUG_RETURN(error); +} + +/********************************************************************//** +Store the autoinc value in the table. The autoinc value is only set if +it's greater than the existing autoinc value in the table. +@return DB_SUCCESS if all went well else error code */ + +dberr_t +ha_innobase::innobase_set_max_autoinc( +/*==================================*/ + ulonglong auto_inc) /*!< in: value to store */ +{ + dberr_t error; + + error = innobase_lock_autoinc(); + + if (error == DB_SUCCESS) { + + dict_table_autoinc_update_if_greater(m_prebuilt->table, auto_inc); + m_prebuilt->table->autoinc_mutex.wr_unlock(); + } + + return(error); +} + +/** @return whether the table is read-only */ +bool ha_innobase::is_read_only(bool altering_to_supported) const +{ + ut_ad(m_prebuilt->trx == thd_to_trx(m_user_thd)); + + if (high_level_read_only) + { + ib_senderrf(m_user_thd, IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE); + return true; + } + + if (altering_to_supported) + return false; + + if (!DICT_TF_GET_ZIP_SSIZE(m_prebuilt->table->flags) || + !innodb_read_only_compressed) + return false; + + ib_senderrf(m_user_thd, IB_LOG_LEVEL_WARN, ER_UNSUPPORTED_COMPRESSED_TABLE); + return true; +} + +/********************************************************************//** +Stores a row in an InnoDB database, to the table specified in this +handle. +@return error code */ + +int +ha_innobase::write_row( +/*===================*/ + const uchar* record) /*!< in: a row in MySQL format */ +{ + dberr_t error; +#ifdef WITH_WSREP + bool wsrep_auto_inc_inserted= false; +#endif + int error_result = 0; + bool auto_inc_used = false; + mariadb_set_stats set_stats_temporary(handler_stats); + + DBUG_ENTER("ha_innobase::write_row"); + + trx_t* trx = thd_to_trx(m_user_thd); + + /* Validation checks before we commence write_row operation. 
*/ + if (is_read_only()) { + DBUG_RETURN(HA_ERR_TABLE_READONLY); + } + + if (!trx_is_started(trx)) { + trx->will_lock = true; + } + + ins_mode_t vers_set_fields; + /* Handling of Auto-Increment Columns. */ + if (table->next_number_field && record == table->record[0]) { + + /* Reset the error code before calling + innobase_get_auto_increment(). */ + m_prebuilt->autoinc_error = DB_SUCCESS; + +#ifdef WITH_WSREP + wsrep_auto_inc_inserted = trx->is_wsrep() + && wsrep_drupal_282555_workaround + && table->next_number_field->val_int() == 0; +#endif + + if ((error_result = update_auto_increment())) { + /* We don't want to mask autoinc overflow errors. */ + + /* Handle the case where the AUTOINC sub-system + failed during initialization. */ + if (m_prebuilt->autoinc_error == DB_UNSUPPORTED) { + error_result = ER_AUTOINC_READ_FAILED; + /* Set the error message to report too. */ + my_error(ER_AUTOINC_READ_FAILED, MYF(0)); + goto func_exit; + } else if (m_prebuilt->autoinc_error != DB_SUCCESS) { + error = m_prebuilt->autoinc_error; + goto report_error; + } + + /* MySQL errors are passed straight back. */ + goto func_exit; + } + + auto_inc_used = true; + } + + /* Prepare INSERT graph that will be executed for actual INSERT + (this is a one-time operation) */ + if (m_prebuilt->mysql_template == NULL + || m_prebuilt->template_type != ROW_MYSQL_WHOLE_ROW) { + + /* Build the template used in converting quickly between + the two database formats */ + + build_template(true); + } + + vers_set_fields = table->versioned_write(VERS_TRX_ID) ? + ROW_INS_VERSIONED : ROW_INS_NORMAL; + + /* Execute insert graph that will result in actual insert. */ + error = row_insert_for_mysql((byte*) record, m_prebuilt, vers_set_fields); + + DEBUG_SYNC(m_user_thd, "ib_after_row_insert"); + + /* Handling of errors related to auto-increment. */ + if (auto_inc_used) { + ulonglong auto_inc; + + /* Note the number of rows processed for this statement, used + by get_auto_increment() to determine the number of AUTO-INC + values to reserve. This is only useful for a multi-value INSERT + and is a statement-level counter. */ + if (trx->n_autoinc_rows > 0) { + --trx->n_autoinc_rows; + } + + /* Get the value that MySQL attempted to store in the table.*/ + auto_inc = table->next_number_field->val_uint(); + + switch (error) { + case DB_DUPLICATE_KEY: + + /* A REPLACE command and LOAD DATA INFILE REPLACE + handle a duplicate key error themselves, but we + must update the autoinc counter if we are performing + those statements.
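For orientation, here is a simplified stand-alone model of the computation done by innobase_next_autoinc() below, assuming need == 1 and ignoring overflow handling (the real function handles both): the value is advanced to the next point on the (offset + N * increment) grid, clamped to the column maximum.

static unsigned long long next_autoinc(unsigned long long current,
                                       unsigned long long increment,
                                       unsigned long long offset,
                                       unsigned long long col_max)
{
    unsigned long long next =
        current < offset
        ? offset
        : offset + ((current - offset) / increment + 1) * increment;
    return next > col_max ? col_max : next;
}

// e.g. current = 7, increment = 5, offset = 1  ->  11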
*/ + + switch (thd_sql_command(m_user_thd)) { + case SQLCOM_LOAD: + if (!trx->duplicates) { + break; + } + + case SQLCOM_REPLACE: + case SQLCOM_INSERT_SELECT: + case SQLCOM_REPLACE_SELECT: + goto set_max_autoinc; + +#ifdef WITH_WSREP + /* workaround for LP bug #355000, retrying the insert */ + case SQLCOM_INSERT: + + WSREP_DEBUG("DUPKEY error for autoinc\n" + "THD %ld, value %llu, off %llu inc %llu", + thd_get_thread_id(m_user_thd), + auto_inc, + m_prebuilt->autoinc_offset, + m_prebuilt->autoinc_increment); + + if (wsrep_auto_inc_inserted && + wsrep_thd_retry_counter(m_user_thd) == 0 && + !thd_test_options(m_user_thd, + OPTION_NOT_AUTOCOMMIT | + OPTION_BEGIN)) { + WSREP_DEBUG( + "retrying insert: %s", + wsrep_thd_query(m_user_thd)); + error= DB_SUCCESS; + wsrep_thd_self_abort(m_user_thd); + /* jump straight to func exit over + * later wsrep hooks */ + goto func_exit; + } + break; +#endif /* WITH_WSREP */ + + default: + break; + } + + break; + + case DB_SUCCESS: + /* If the actual value inserted is greater than + the upper limit of the interval, then we try and + update the table upper limit. Note: last_value + will be 0 if get_auto_increment() was not called. */ + + if (auto_inc >= m_prebuilt->autoinc_last_value) { +set_max_autoinc: + /* We need the upper limit of the col type to check for + whether we update the table autoinc counter or not. */ + ulonglong col_max_value = + table->next_number_field->get_max_int_value(); + + /* This should filter out the negative + values set explicitly by the user. */ + if (auto_inc <= col_max_value) { + ut_ad(m_prebuilt->autoinc_increment > 0); + + ulonglong offset; + ulonglong increment; + dberr_t err; + + offset = m_prebuilt->autoinc_offset; + increment = m_prebuilt->autoinc_increment; + + auto_inc = innobase_next_autoinc( + auto_inc, 1, increment, offset, + col_max_value); + + err = innobase_set_max_autoinc( + auto_inc); + + if (err != DB_SUCCESS) { + error = err; + } + } + } + break; + default: + break; + } + } + +report_error: + /* Cleanup and exit. */ + if (error == DB_TABLESPACE_DELETED) { + ib_senderrf( + trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_DISCARDED, + table->s->table_name.str); + } + + error_result = convert_error_code_to_mysql( + error, m_prebuilt->table->flags, m_user_thd); + +#ifdef WITH_WSREP + if (!error_result && trx->is_wsrep() + && !trx->is_bulk_insert() + && wsrep_thd_is_local(m_user_thd) + && !wsrep_thd_ignore_table(m_user_thd) + && !wsrep_consistency_check(m_user_thd) + && (thd_sql_command(m_user_thd) != SQLCOM_CREATE_TABLE) + && (thd_sql_command(m_user_thd) != SQLCOM_LOAD || + thd_binlog_format(m_user_thd) == BINLOG_FORMAT_ROW)) { + if (wsrep_append_keys(m_user_thd, WSREP_SERVICE_KEY_EXCLUSIVE, + record, + NULL)) { + DBUG_PRINT("wsrep", ("row key failed")); + error_result = HA_ERR_INTERNAL_ERROR; + goto func_exit; + } + } +#endif /* WITH_WSREP */ + + if (error_result == HA_FTS_INVALID_DOCID) { + my_error(HA_FTS_INVALID_DOCID, MYF(0)); + } + +func_exit: + DBUG_RETURN(error_result); +} + +/** Fill the update vector's "old_vrow" field for those non-updated, +but indexed columns. Such columns could still be present in the virtual +index rec fields even if they are not updated (when some other fields are), +so they need to be logged.
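A stand-alone illustration of why the old value matters (f, update_vcol_index and the multimap are inventions for this example, not InnoDB structures): the index entry keyed by the old virtual value cannot be located without it.

#include <map>

// Hypothetical generated column v = f(base).
static int f(int base) { return base % 10; }

// When base changes, the index on v must remove the entry keyed by the OLD
// value f(old_base); without the logged old value it cannot be found.
static void update_vcol_index(std::multimap<int, long>& v_index, long row_id,
                              int old_base, int new_base)
{
    auto range = v_index.equal_range(f(old_base));
    for (auto it = range.first; it != range.second; ++it)
        if (it->second == row_id) { v_index.erase(it); break; }
    v_index.emplace(f(new_base), row_id);
}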
+@param[in] prebuilt InnoDB prebuilt struct +@param[in,out] vfield field to be filled +@param[in] o_len actual column length +@param[in,out] col column to be filled +@param[in] old_mysql_row_col MySQL old field ptr +@param[in] col_pack_len MySQL field col length +@param[in,out] buf buffer for a converted integer value +@return used buffer ptr from row_mysql_store_col_in_innobase_format() */ +static +byte* +innodb_fill_old_vcol_val( + row_prebuilt_t* prebuilt, + dfield_t* vfield, + ulint o_len, + dict_col_t* col, + const byte* old_mysql_row_col, + ulint col_pack_len, + byte* buf) +{ + dict_col_copy_type( + col, dfield_get_type(vfield)); + if (o_len != UNIV_SQL_NULL) { + + buf = row_mysql_store_col_in_innobase_format( + vfield, + buf, + TRUE, + old_mysql_row_col, + col_pack_len, + dict_table_is_comp(prebuilt->table)); + } else { + dfield_set_null(vfield); + } + + return(buf); +} + +/** Calculate an update vector corresponding to the changes +between old_row and new_row. +@param[out] uvect update vector +@param[in] old_row current row in MySQL format +@param[in] new_row intended updated row in MySQL format +@param[in] table MySQL table handle +@param[in,out] upd_buff buffer to use for converted values +@param[in] buff_len length of upd_buff +@param[in,out] prebuilt InnoDB execution context +@param[out] auto_inc updated AUTO_INCREMENT value, or 0 if none +@return DB_SUCCESS or error code */ +static +dberr_t +calc_row_difference( + upd_t* uvect, + const uchar* old_row, + const uchar* new_row, + TABLE* table, + uchar* upd_buff, + ulint buff_len, + row_prebuilt_t* prebuilt, + ib_uint64_t& auto_inc) +{ + uchar* original_upd_buff = upd_buff; + Field* field; + enum_field_types field_mysql_type; + ulint o_len; + ulint n_len; + ulint col_pack_len; + const byte* new_mysql_row_col; + const byte* old_mysql_row_col; + const byte* o_ptr; + const byte* n_ptr; + byte* buf; + upd_field_t* ufield; + ulint col_type; + ulint n_changed = 0; + dfield_t dfield; + dict_index_t* clust_index; + ibool changes_fts_column = FALSE; + ibool changes_fts_doc_col = FALSE; + trx_t* const trx = prebuilt->trx; + doc_id_t doc_id = FTS_NULL_DOC_ID; + uint16_t num_v = 0; +#ifndef DBUG_OFF + uint vers_fields = 0; +#endif + prebuilt->versioned_write = table->versioned_write(VERS_TRX_ID); + const bool skip_virtual = ha_innobase::omits_virtual_cols(*table->s); + + ut_ad(!srv_read_only_mode); + + clust_index = dict_table_get_first_index(prebuilt->table); + auto_inc = 0; + + /* We use upd_buff to convert changed fields */ + buf = (byte*) upd_buff; + + for (uint i = 0; i < table->s->fields; i++) { + field = table->field[i]; + +#ifndef DBUG_OFF + if (!field->vers_sys_field() + && !field->vers_update_unversioned()) { + ++vers_fields; + } +#endif + + const bool is_virtual = !field->stored_in_db(); + if (is_virtual && skip_virtual) { + num_v++; + continue; + } + dict_col_t* col = is_virtual + ? &prebuilt->table->v_cols[num_v].m_col + : &prebuilt->table->cols[i - num_v]; + + o_ptr = (const byte*) old_row + get_field_offset(table, field); + n_ptr = (const byte*) new_row + get_field_offset(table, field); + + /* We use new_mysql_row_col and col_pack_len to save the values */ + + new_mysql_row_col = n_ptr; + old_mysql_row_col = o_ptr; + col_pack_len = field->pack_length(); + + o_len = col_pack_len; + n_len = col_pack_len; + + /* We use o_ptr and n_ptr to dig up the actual data for + comparison.
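As a side note, the true-VARCHAR decoding performed below by row_mysql_read_true_varchar() can be pictured with this simplified stand-in; that the 2-byte length form is little-endian is an assumption matching the MySQL row format.

#include <cstddef>

static const unsigned char* read_true_varchar(std::size_t* len,
                                              const unsigned char* field,
                                              unsigned length_bytes)
{
    *len = (length_bytes == 1)
        ? std::size_t(field[0])
        : std::size_t(field[0]) | (std::size_t(field[1]) << 8);
    return field + length_bytes;          // points at the actual payload
}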
*/ + + field_mysql_type = field->type(); + + col_type = col->mtype; + + switch (col_type) { + + case DATA_BLOB: + case DATA_GEOMETRY: + o_ptr = row_mysql_read_blob_ref(&o_len, o_ptr, o_len); + n_ptr = row_mysql_read_blob_ref(&n_len, n_ptr, n_len); + + break; + + case DATA_VARCHAR: + case DATA_BINARY: + case DATA_VARMYSQL: + if (field_mysql_type == MYSQL_TYPE_VARCHAR) { + /* This is a >= 5.0.3 type true VARCHAR where + the real payload data length is stored in + 1 or 2 bytes */ + + o_ptr = row_mysql_read_true_varchar( + &o_len, o_ptr, + (ulint) + (((Field_varstring*) field)->length_bytes)); + + n_ptr = row_mysql_read_true_varchar( + &n_len, n_ptr, + (ulint) + (((Field_varstring*) field)->length_bytes)); + } + + break; + default: + ; + } + + if (field_mysql_type == MYSQL_TYPE_LONGLONG + && prebuilt->table->fts + && innobase_strcasecmp( + field->field_name.str, FTS_DOC_ID_COL_NAME) == 0) { + doc_id = mach_read_uint64_little_endian(n_ptr); + if (doc_id == 0) { + return(DB_FTS_INVALID_DOCID); + } + } + + if (field->real_maybe_null()) { + if (field->is_null_in_record(old_row)) { + o_len = UNIV_SQL_NULL; + } + + if (field->is_null_in_record(new_row)) { + n_len = UNIV_SQL_NULL; + } + } + + if (is_virtual) { + /* If the virtual column is not indexed, + we shall ignore it for update */ + if (!col->ord_part) { + next: + num_v++; + continue; + } + + if (!uvect->old_vrow) { + uvect->old_vrow = dtuple_create_with_vcol( + uvect->heap, 0, prebuilt->table->n_v_cols); + } + + ulint max_field_len = DICT_MAX_FIELD_LEN_BY_FORMAT( + prebuilt->table); + + /* For virtual columns we only materialize + the index fields, and an index field length + never exceeds max_field_len. So continue if + the first max_field_len bytes match. */ + if (o_len != UNIV_SQL_NULL + && n_len != UNIV_SQL_NULL + && o_len >= max_field_len + && n_len >= max_field_len + && memcmp(o_ptr, n_ptr, max_field_len) == 0) { + dfield_t* vfield = dtuple_get_nth_v_field( + uvect->old_vrow, num_v); + buf = innodb_fill_old_vcol_val( + prebuilt, vfield, o_len, + col, old_mysql_row_col, + col_pack_len, buf); + goto next; + } + } + + if (o_len != n_len || (o_len != 0 && o_len != UNIV_SQL_NULL + && 0 != memcmp(o_ptr, n_ptr, o_len))) { + /* The field has changed */ + + ufield = uvect->fields + n_changed; + MEM_UNDEFINED(ufield, sizeof *ufield); + + /* Let us use a dummy dfield to make the conversion + from the MySQL column format to the InnoDB format */ + + + /* If the length of the new geometry object is 0, + the object is an invalid geometry object, and + we must reject it.
*/ + if (DATA_GEOMETRY_MTYPE(col_type) + && o_len != 0 && n_len == 0) { + return(DB_CANT_CREATE_GEOMETRY_OBJECT); + } + + if (n_len != UNIV_SQL_NULL) { + dict_col_copy_type( + col, dfield_get_type(&dfield)); + + buf = row_mysql_store_col_in_innobase_format( + &dfield, + (byte*) buf, + TRUE, + new_mysql_row_col, + col_pack_len, + dict_table_is_comp(prebuilt->table)); + dfield_copy(&ufield->new_val, &dfield); + } else { + dict_col_copy_type( + col, dfield_get_type(&ufield->new_val)); + dfield_set_null(&ufield->new_val); + } + + ufield->exp = NULL; + ufield->orig_len = 0; + if (is_virtual) { + dfield_t* vfield = dtuple_get_nth_v_field( + uvect->old_vrow, num_v); + upd_fld_set_virtual_col(ufield); + ufield->field_no = num_v; + + ut_ad(col->ord_part); + ufield->old_v_val = static_cast<dfield_t*>( + mem_heap_alloc( + uvect->heap, + sizeof *ufield->old_v_val)); + + if (!field->is_null_in_record(old_row)) { + if (n_len == UNIV_SQL_NULL) { + dict_col_copy_type( + col, dfield_get_type( + &dfield)); + } + + buf = row_mysql_store_col_in_innobase_format( + &dfield, + (byte*) buf, + TRUE, + old_mysql_row_col, + col_pack_len, + dict_table_is_comp( + prebuilt->table)); + dfield_copy(ufield->old_v_val, + &dfield); + dfield_copy(vfield, &dfield); + } else { + dict_col_copy_type( + col, dfield_get_type( + ufield->old_v_val)); + dfield_set_null(ufield->old_v_val); + dfield_set_null(vfield); + } + num_v++; + ut_ad(field != table->found_next_number_field); + } else { + ufield->field_no = static_cast<uint16_t>( + dict_col_get_clust_pos( + &prebuilt->table->cols + [i - num_v], + clust_index)); + ufield->old_v_val = NULL; + if (field != table->found_next_number_field + || dfield_is_null(&ufield->new_val)) { + } else { + auto_inc = field->val_uint(); + } + } + n_changed++; + + /* If an FTS indexed column was changed by this + UPDATE then we need to inform the FTS sub-system. + + NOTE: Currently we re-index all FTS indexed columns + even if only a subset of the FTS indexed columns + have been updated. That is the reason we are + checking only once here. Later we will need to + note which columns have been updated and do + selective processing. */ + if (prebuilt->table->fts != NULL && !is_virtual) { + ulint offset; + dict_table_t* innodb_table; + + innodb_table = prebuilt->table; + + if (!changes_fts_column) { + offset = row_upd_changes_fts_column( + innodb_table, ufield); + + if (offset != ULINT_UNDEFINED) { + changes_fts_column = TRUE; + } + } + + if (!changes_fts_doc_col) { + changes_fts_doc_col = + row_upd_changes_doc_id( + innodb_table, ufield); + } + } + } else if (is_virtual) { + dfield_t* vfield = dtuple_get_nth_v_field( + uvect->old_vrow, num_v); + buf = innodb_fill_old_vcol_val( + prebuilt, vfield, o_len, + col, old_mysql_row_col, + col_pack_len, buf); + ut_ad(col->ord_part); + num_v++; + } + } + + /* If the update changes a column with an FTS index on it, we + then add an update column node with a new document id to the + other changes. We piggy back our changes on the normal UPDATE + to reduce processing and IO overhead. */ + if (!prebuilt->table->fts) { + trx->fts_next_doc_id = 0; + } else if (changes_fts_column || changes_fts_doc_col) { + dict_table_t* innodb_table = prebuilt->table; + + ufield = uvect->fields + n_changed; + + if (!DICT_TF2_FLAG_IS_SET( + innodb_table, DICT_TF2_FTS_HAS_DOC_ID)) { + + /* If Doc ID is managed by user, and if any + FTS indexed column has been updated, its corresponding + Doc ID must also be updated.
Otherwise, return + error */ + if (changes_fts_column && !changes_fts_doc_col) { + ib::warn() << "A new Doc ID must be supplied" + " while updating FTS indexed columns."; + return(DB_FTS_INVALID_DOCID); + } + + /* Doc ID must monotonically increase */ + ut_ad(innodb_table->fts->cache); + if (doc_id < prebuilt->table->fts->cache->next_doc_id) { + + ib::warn() << "FTS Doc ID must be larger than " + << innodb_table->fts->cache->next_doc_id + - 1 << " for table " + << innodb_table->name; + + return(DB_FTS_INVALID_DOCID); + } + + + trx->fts_next_doc_id = doc_id; + } else { + /* If the Doc ID is a hidden column, it can't be + changed by user */ + ut_ad(!changes_fts_doc_col); + + /* Doc ID column is hidden, a new Doc ID will be + generated by following fts_update_doc_id() call */ + trx->fts_next_doc_id = 0; + } + + fts_update_doc_id( + innodb_table, ufield, &trx->fts_next_doc_id); + + ++n_changed; + } else { + /* We have a Doc ID column, but none of FTS indexed + columns are touched, nor the Doc ID column, so set + fts_next_doc_id to UINT64_UNDEFINED, which means do not + update the Doc ID column */ + trx->fts_next_doc_id = UINT64_UNDEFINED; + } + + uvect->n_fields = n_changed; + uvect->info_bits = 0; + + ut_a(buf <= (byte*) original_upd_buff + buff_len); + + const TABLE_LIST *tl= table->pos_in_table_list; + const uint8 op_map= tl->trg_event_map | tl->slave_fk_event_map; + /* Used to avoid reading history in FK check on DELETE (see MDEV-16210). */ + prebuilt->upd_node->is_delete = + (op_map & trg2bit(TRG_EVENT_DELETE) + && table->versioned(VERS_TIMESTAMP)) + ? VERSIONED_DELETE : NO_DELETE; + + if (prebuilt->versioned_write) { + /* Guaranteed by CREATE TABLE, but anyway we make sure we + generate history only when there are versioned fields. */ + DBUG_ASSERT(vers_fields); + prebuilt->upd_node->vers_make_update(trx); + } + + ut_ad(uvect->validate()); + return(DB_SUCCESS); +} + +#ifdef WITH_WSREP +static +int +wsrep_calc_row_hash( +/*================*/ + byte* digest, /*!< in/out: md5 sum */ + const uchar* row, /*!< in: row in MySQL format */ + TABLE* table, /*!< in: table in MySQL data + dictionary */ + row_prebuilt_t* prebuilt) /*!< in: InnoDB prebuilt struct */ +{ + void *ctx = alloca(my_md5_context_size()); + my_md5_init(ctx); + + for (uint i = 0; i < table->s->fields; i++) { + byte null_byte=0; + byte true_byte=1; + unsigned is_unsigned; + + const Field* field = table->field[i]; + if (!field->stored_in_db()) { + continue; + } + + auto ptr = row + get_field_offset(table, field); + ulint len = field->pack_length(); + + switch (get_innobase_type_from_mysql_type(&is_unsigned, + field)) { + case DATA_BLOB: + ptr = row_mysql_read_blob_ref(&len, ptr, len); + + break; + + case DATA_VARCHAR: + case DATA_BINARY: + case DATA_VARMYSQL: + if (field->type() == MYSQL_TYPE_VARCHAR) { + /* This is a >= 5.0.3 type true VARCHAR where + the real payload data length is stored in + 1 or 2 bytes */ + + ptr = row_mysql_read_true_varchar( + &len, ptr, + (ulint) + (((Field_varstring*)field)->length_bytes)); + + } + + break; + default: + ; + } + /* + if (field->null_ptr && + field_in_record_is_null(table, field, (char*) row)) { + */ + + if (field->is_null_in_record(row)) { + my_md5_input(ctx, &null_byte, 1); + } else { + my_md5_input(ctx, &true_byte, 1); + my_md5_input(ctx, ptr, len); + } + } + + my_md5_result(ctx, digest); + + return(0); +} + +/** Append table-level exclusive key. 
+@param thd MySQL thread handle +@param table table +@retval false on success +@retval true on failure */ +ATTRIBUTE_COLD bool wsrep_append_table_key(MYSQL_THD thd, const dict_table_t &table) +{ + char db_buf[NAME_LEN + 1]; + char tbl_buf[NAME_LEN + 1]; + ulint db_buf_len, tbl_buf_len; + + if (!table.parse_name(db_buf, tbl_buf, &db_buf_len, &tbl_buf_len)) + { + WSREP_ERROR("Parse_name for table key append failed: %s", + wsrep_thd_query(thd)); + return true; + } + + /* Append table-level exclusive key */ + const int rcode = wsrep_thd_append_table_key(thd, db_buf, + tbl_buf, WSREP_SERVICE_KEY_EXCLUSIVE); + if (rcode) + { + WSREP_ERROR("Appending table key failed: %s, %d", + wsrep_thd_query(thd), rcode); + return true; + } + + return false; +} +#endif /* WITH_WSREP */ + +/** +Updates a row given as a parameter to a new value. Note that we are given +whole rows, not just the fields which are updated: this incurs some +overhead for CPU when we check which fields are actually updated. +TODO: currently InnoDB does not prevent the 'Halloween problem': +in a searched update a single row can get updated several times +if its index columns are updated! +@param[in] old_row Old row contents in MySQL format +@param[out] new_row Updated row contents in MySQL format +@return error number or 0 */ + +int +ha_innobase::update_row( + const uchar* old_row, + const uchar* new_row) +{ + int err; + + dberr_t error; + trx_t* trx = thd_to_trx(m_user_thd); + mariadb_set_stats set_stats_temporary(handler_stats); + + DBUG_ENTER("ha_innobase::update_row"); + + if (is_read_only()) { + DBUG_RETURN(HA_ERR_TABLE_READONLY); + } else if (!trx_is_started(trx)) { + trx->will_lock = true; + } + + if (m_upd_buf == NULL) { + ut_ad(m_upd_buf_size == 0); + + /* Create a buffer for packing the fields of a record. Why + did table->reclength not work here? Because CHAR fields, when + packed, actually become 1 byte longer when we also store the + string length as the first byte. */ + + m_upd_buf_size = table->s->reclength + table->s->max_key_length + + MAX_REF_PARTS * 3; + + m_upd_buf = reinterpret_cast<uchar*>( + my_malloc(PSI_INSTRUMENT_ME, + m_upd_buf_size, + MYF(MY_WME))); + + if (m_upd_buf == NULL) { + m_upd_buf_size = 0; + DBUG_RETURN(HA_ERR_OUT_OF_MEM); + } + } + + upd_t* uvect = row_get_prebuilt_update_vector(m_prebuilt); + ib_uint64_t autoinc; + + /* Build an update vector from the modified fields in the rows + (uses m_upd_buf of the handle) */ + + error = calc_row_difference( + uvect, old_row, new_row, table, m_upd_buf, m_upd_buf_size, + m_prebuilt, autoinc); + + if (error != DB_SUCCESS) { + goto func_exit; + } + + if (!uvect->n_fields) { + /* This is the same as success, but instructs + MySQL that the row is not really updated and it + should not increase the count of updated rows. + This is the fix for http://bugs.mysql.com/29157 */ + DBUG_RETURN(HA_ERR_RECORD_IS_THE_SAME); + } else { + if (m_prebuilt->upd_node->is_delete) { + trx->fts_next_doc_id = 0; + } + + /* row_start was updated by vers_make_update() + in calc_row_difference() */ + error = row_update_for_mysql(m_prebuilt); + + if (error == DB_SUCCESS && m_prebuilt->versioned_write + /* Multiple UPDATEs of the same row in a single transaction + create a historical row only once. */ + && trx->id != table->vers_start_id()) { + /* UPDATE is not used by ALTER TABLE. This is just a precaution, + as we don't need history generation for ALTER TABLE.
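For intuition, here is a simplified stand-alone model of the two-step versioned UPDATE (types invented for the sketch, not InnoDB structures): the pre-image is closed and kept as history, then the current row is updated, matching the extra history-row insert performed just below.

#include <vector>

struct VersRow { int pk; int val; unsigned long long row_start, row_end; };

static void versioned_update(std::vector<VersRow>& history, VersRow& current,
                             int new_val, unsigned long long trx_id)
{
    VersRow old_version = current;        // pre-image of the row
    old_version.row_end = trx_id;         // close its validity period
    history.push_back(old_version);       // the extra history-row insert
    current.val = new_val;
    current.row_start = trx_id;           // set by vers_make_update()
}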
*/ + ut_ad(thd_sql_command(m_user_thd) != SQLCOM_ALTER_TABLE); + error = row_insert_for_mysql((byte*) old_row, + m_prebuilt, + ROW_INS_HISTORICAL); + } + } + + if (error == DB_SUCCESS && autoinc) { + /* A value for an AUTO_INCREMENT column + was specified in the UPDATE statement. */ + + /* We need the upper limit of the col type to check for + whether we update the table autoinc counter or not. */ + ulonglong col_max_value = + table->found_next_number_field->get_max_int_value(); + + /* This should filter out the negative + values set explicitly by the user. */ + if (autoinc <= col_max_value) { + ulonglong offset; + ulonglong increment; + + offset = m_prebuilt->autoinc_offset; + increment = m_prebuilt->autoinc_increment; + + autoinc = innobase_next_autoinc( + autoinc, 1, increment, offset, + col_max_value); + + error = innobase_set_max_autoinc(autoinc); + + if (m_prebuilt->table->persistent_autoinc) { + /* Update the PAGE_ROOT_AUTO_INC. Yes, we do + this even if dict_table_t::autoinc already was + greater than autoinc, because we cannot know + if any INSERT actually used (and wrote to + PAGE_ROOT_AUTO_INC) a value bigger than our + autoinc. */ + btr_write_autoinc(dict_table_get_first_index( + m_prebuilt->table), + autoinc); + } + } + } + +func_exit: + if (error == DB_FTS_INVALID_DOCID) { + err = HA_FTS_INVALID_DOCID; + my_error(HA_FTS_INVALID_DOCID, MYF(0)); + } else { + err = convert_error_code_to_mysql( + error, m_prebuilt->table->flags, m_user_thd); + } + +#ifdef WITH_WSREP + if (error == DB_SUCCESS && trx->is_wsrep() + && wsrep_thd_is_local(m_user_thd) + && !wsrep_thd_ignore_table(m_user_thd)) { + DBUG_PRINT("wsrep", ("update row key")); + + /* We use table-level exclusive key for SEQUENCES + and normal key append for others. */ + if (table->s->table_type == TABLE_TYPE_SEQUENCE) { + if (wsrep_append_table_key(m_user_thd, *m_prebuilt->table)) + DBUG_RETURN(HA_ERR_INTERNAL_ERROR); + } else if (wsrep_append_keys(m_user_thd, + wsrep_protocol_version >= 4 + ? WSREP_SERVICE_KEY_UPDATE + : WSREP_SERVICE_KEY_EXCLUSIVE, + old_row, new_row)) { + WSREP_DEBUG("WSREP: UPDATE_ROW_KEY FAILED"); + DBUG_PRINT("wsrep", ("row key failed")); + DBUG_RETURN(HA_ERR_INTERNAL_ERROR); + } + } +#endif /* WITH_WSREP */ + + DBUG_RETURN(err); +} + +/**********************************************************************//** +Deletes a row given as the parameter. +@return error number or 0 */ + +int +ha_innobase::delete_row( +/*====================*/ + const uchar* record) /*!< in: a row in MySQL format */ +{ + dberr_t error; + trx_t* trx = thd_to_trx(m_user_thd); + mariadb_set_stats set_stats_temporary(handler_stats); + + DBUG_ENTER("ha_innobase::delete_row"); + + if (is_read_only()) { + DBUG_RETURN(HA_ERR_TABLE_READONLY); + } else if (!trx_is_started(trx)) { + trx->will_lock = true; + } + + if (!m_prebuilt->upd_node) { + row_get_prebuilt_update_vector(m_prebuilt); + } + + /* This is a delete */ + m_prebuilt->upd_node->is_delete = table->versioned_write(VERS_TRX_ID) + && table->vers_end_field()->is_max() + && trx->id != table->vers_start_id() + ? 
VERSIONED_DELETE + : PLAIN_DELETE; + trx->fts_next_doc_id = 0; + + error = row_update_for_mysql(m_prebuilt); + +#ifdef WITH_WSREP + if (error == DB_SUCCESS && trx->is_wsrep() + && wsrep_thd_is_local(m_user_thd) + && !wsrep_thd_ignore_table(m_user_thd)) { + if (wsrep_append_keys(m_user_thd, WSREP_SERVICE_KEY_EXCLUSIVE, + record, + NULL)) { + DBUG_PRINT("wsrep", ("delete fail")); + DBUG_RETURN(HA_ERR_INTERNAL_ERROR); + } + } +#endif /* WITH_WSREP */ + DBUG_RETURN(convert_error_code_to_mysql( + error, m_prebuilt->table->flags, m_user_thd)); +} + +/**********************************************************************//** +Removes a new lock set on a row, if it was not read optimistically. This can +be called after a row has been read in the processing of an UPDATE or a DELETE +query. */ + +void +ha_innobase::unlock_row(void) +/*=========================*/ +{ + DBUG_ENTER("ha_innobase::unlock_row"); + + if (m_prebuilt->select_lock_type == LOCK_NONE) { + DBUG_VOID_RETURN; + } + + ut_ad(trx_state_eq(m_prebuilt->trx, TRX_STATE_ACTIVE, true)); + + switch (m_prebuilt->row_read_type) { + case ROW_READ_WITH_LOCKS: + if (m_prebuilt->trx->isolation_level > TRX_ISO_READ_COMMITTED) + break; + /* fall through */ + case ROW_READ_TRY_SEMI_CONSISTENT: + row_unlock_for_mysql(m_prebuilt, FALSE); + break; + case ROW_READ_DID_SEMI_CONSISTENT: + m_prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT; + break; + } + + DBUG_VOID_RETURN; +} + +/* See handler.h and row0mysql.h for docs on this function. */ + +bool +ha_innobase::was_semi_consistent_read(void) +/*=======================================*/ +{ + return(m_prebuilt->row_read_type == ROW_READ_DID_SEMI_CONSISTENT); +} + +/* See handler.h and row0mysql.h for docs on this function. */ +void ha_innobase::try_semi_consistent_read(bool yes) +{ + ut_ad(m_prebuilt->trx == thd_to_trx(ha_thd())); + /* Row read type is set to semi consistent read if this was + requested by the SQL layer and the transaction isolation level is + READ UNCOMMITTED or READ COMMITTED. */ + m_prebuilt->row_read_type = yes + && m_prebuilt->trx->isolation_level <= TRX_ISO_READ_COMMITTED + ? ROW_READ_TRY_SEMI_CONSISTENT + : ROW_READ_WITH_LOCKS; +} + +/******************************************************************//** +Initializes a handle to use an index. +@return 0 or error number */ + +int +ha_innobase::index_init( +/*====================*/ + uint keynr, /*!< in: key (index) number */ + bool) +{ + DBUG_ENTER("index_init"); + + DBUG_RETURN(change_active_index(keynr)); +} + +/******************************************************************//** +Currently does nothing. +@return 0 */ + +int +ha_innobase::index_end(void) +/*========================*/ +{ + DBUG_ENTER("index_end"); + + active_index = MAX_KEY; + + in_range_check_pushed_down = FALSE; + + m_ds_mrr.dsmrr_close(); + + DBUG_RETURN(0); +} + +/*********************************************************************//** +Converts a search mode flag understood by MySQL to a flag understood +by InnoDB. 
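As a toy illustration of the idea (the enums are invented for this sketch, not the real types): a comparison on the first key part picks the initial B-tree positioning mode, and match_mode afterwards decides which positioned rows count as hits.

enum class Cmp { GT, GE, EQ, LT, LE };
enum class PageCur { G, GE, L, LE };

static PageCur position_for(Cmp c)
{
    switch (c) {
    case Cmp::GT: return PageCur::G;      // WHERE a > 5  : after key (5)
    case Cmp::EQ:                         // exact match also positions at GE;
    case Cmp::GE: return PageCur::GE;     // match_mode filters afterwards
    case Cmp::LT: return PageCur::L;
    case Cmp::LE: return PageCur::LE;
    }
    return PageCur::GE;                   // not reached
}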
*/ +page_cur_mode_t +convert_search_mode_to_innobase( +/*============================*/ + ha_rkey_function find_flag) +{ + switch (find_flag) { + case HA_READ_KEY_EXACT: + /* this does not require the index to be UNIQUE */ + case HA_READ_KEY_OR_NEXT: + return(PAGE_CUR_GE); + case HA_READ_AFTER_KEY: + return(PAGE_CUR_G); + case HA_READ_BEFORE_KEY: + return(PAGE_CUR_L); + case HA_READ_KEY_OR_PREV: + case HA_READ_PREFIX_LAST: + case HA_READ_PREFIX_LAST_OR_PREV: + return(PAGE_CUR_LE); + case HA_READ_MBR_CONTAIN: + return(PAGE_CUR_CONTAIN); + case HA_READ_MBR_INTERSECT: + return(PAGE_CUR_INTERSECT); + case HA_READ_MBR_WITHIN: + return(PAGE_CUR_WITHIN); + case HA_READ_MBR_DISJOINT: + return(PAGE_CUR_DISJOINT); + case HA_READ_MBR_EQUAL: + return(PAGE_CUR_MBR_EQUAL); + case HA_READ_PREFIX: + return(PAGE_CUR_UNSUPP); + /* do not use "default:" in order to produce a gcc warning: + enumeration value '...' not handled in switch + (if -Wswitch or -Wall is used) */ + } + + my_error(ER_CHECK_NOT_IMPLEMENTED, MYF(0), "this functionality"); + + return(PAGE_CUR_UNSUPP); +} + +/* + BACKGROUND INFO: HOW A SELECT SQL QUERY IS EXECUTED + --------------------------------------------------- +The following does not cover all the details, but explains how we determine +the start of a new SQL statement, and what is associated with it. + +For each table in the database the MySQL interpreter may have several +table handle instances in use, also in a single SQL query. For each table +handle instance there is an InnoDB 'm_prebuilt' struct which contains most +of the InnoDB data associated with this table handle instance. + + A) if the user has not explicitly set any MySQL table level locks: + + 1) MySQL calls ::external_lock to set an 'intention' table level lock on +the table of the handle instance. There we set +m_prebuilt->sql_stat_start = TRUE. The flag sql_stat_start should be set +true if we are taking this table handle instance to use in a new SQL +statement issued by the user. We also increment trx->n_mysql_tables_in_use. + + 2) If m_prebuilt->sql_stat_start == TRUE we 'pre-compile' the MySQL search +instructions to m_prebuilt->template of the table handle instance in +::index_read. The template is used to save CPU time in large joins. + + 3) In row_search_mvcc(), if m_prebuilt->sql_stat_start is true, we +allocate a new consistent read view for the trx if it does not yet have one, +or in the case of a locking read, set an InnoDB 'intention' table level +lock on the table. + + 4) We do the SELECT. MySQL may repeatedly call ::index_read for the +same table handle instance, if it is a join. + + 5) When the SELECT ends, MySQL removes its intention table level locks +in ::external_lock. When trx->n_mysql_tables_in_use drops to zero, + (a) we execute a COMMIT there if the autocommit is on, + (b) we also release possible 'SQL statement level resources' InnoDB may +have for this SQL statement. The MySQL interpreter does NOT execute +autocommit for pure read transactions, though it should. That is why the +table handler in that case has to execute the COMMIT in ::external_lock. + + B) If the user has explicitly set MySQL table level locks, then MySQL +does NOT call ::external_lock at the start of the statement. To determine +when we are at the start of a new SQL statement we at the start of +::index_read also compare the query id to the latest query id where the +table handle instance was used. If it has changed, we know we are at the +start of a new SQL statement. 
Since the query id can theoretically +wrap around, we use this test only as a secondary way of determining the +start of a new SQL statement. */ + + +/**********************************************************************//** +Positions an index cursor to the index specified in the handle. Fetches the +row if any. +@return 0, HA_ERR_KEY_NOT_FOUND, or error number */ + +int +ha_innobase::index_read( +/*====================*/ + uchar* buf, /*!< in/out: buffer for the returned + row */ + const uchar* key_ptr, /*!< in: key value; if this is NULL + we position the cursor at the + start or end of index; this can + also contain an InnoDB row id, in + which case key_len is the InnoDB + row id length; the key value can + also be a prefix of a full key value, + and the last column can be a prefix + of a full column */ + uint key_len,/*!< in: key value length */ + enum ha_rkey_function find_flag)/*!< in: search flags from my_base.h */ +{ + DBUG_ENTER("index_read"); + mariadb_set_stats set_stats_temporary(handler_stats); + DEBUG_SYNC_C("ha_innobase_index_read_begin"); + + ut_a(m_prebuilt->trx == thd_to_trx(m_user_thd)); + ut_ad(key_len != 0 || find_flag != HA_READ_KEY_EXACT); + + dict_index_t* index = m_prebuilt->index; + + if (index == NULL || index->is_corrupted()) { + m_prebuilt->index_usable = FALSE; + DBUG_RETURN(HA_ERR_CRASHED); + } + + if (!m_prebuilt->index_usable) { + DBUG_RETURN(index->is_corrupted() + ? HA_ERR_INDEX_CORRUPT + : HA_ERR_TABLE_DEF_CHANGED); + } + + if (index->type & DICT_FTS) { + DBUG_RETURN(HA_ERR_KEY_NOT_FOUND); + } + + /* For an R-Tree index, we will always place the page lock on + the pages being searched */ + if (index->is_spatial() && !m_prebuilt->trx->will_lock) { + if (trx_is_started(m_prebuilt->trx)) { + DBUG_RETURN(HA_ERR_READ_ONLY_TRANSACTION); + } else { + m_prebuilt->trx->will_lock = true; + } + } + + /* Note that the index for which the search template is built is + not necessarily m_prebuilt->index; it can also be the clustered + index */ + + if (m_prebuilt->sql_stat_start) { + build_template(false); + } + + if (key_ptr != NULL) { + /* Convert the search key value to InnoDB format into + m_prebuilt->search_tuple */ + + row_sel_convert_mysql_key_to_innobase( + m_prebuilt->search_tuple, + m_prebuilt->srch_key_val1, + m_prebuilt->srch_key_val_len, + index, + (byte*) key_ptr, + (ulint) key_len); + + DBUG_ASSERT(m_prebuilt->search_tuple->n_fields > 0); + } else { + /* We position the cursor to the last or the first entry + in the index */ + + dtuple_set_n_fields(m_prebuilt->search_tuple, 0); + } + + page_cur_mode_t mode = convert_search_mode_to_innobase(find_flag); + + ulint match_mode = 0; + + if (find_flag == HA_READ_KEY_EXACT) { + + match_mode = ROW_SEL_EXACT; + + } else if (find_flag == HA_READ_PREFIX_LAST) { + + match_mode = ROW_SEL_EXACT_PREFIX; + } + + m_last_match_mode = (uint) match_mode; + + dberr_t ret = mode == PAGE_CUR_UNSUPP ?
DB_UNSUPPORTED + : row_search_mvcc(buf, mode, m_prebuilt, match_mode, 0); + + DBUG_EXECUTE_IF("ib_select_query_failure", ret = DB_ERROR;); + + int error; + + switch (ret) { + case DB_SUCCESS: + error = 0; + table->status = 0; + break; + + case DB_RECORD_NOT_FOUND: + error = HA_ERR_KEY_NOT_FOUND; + table->status = STATUS_NOT_FOUND; + break; + + case DB_END_OF_INDEX: + error = HA_ERR_KEY_NOT_FOUND; + table->status = STATUS_NOT_FOUND; + break; + + case DB_TABLESPACE_DELETED: + ib_senderrf( + m_prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_DISCARDED, + table->s->table_name.str); + + table->status = STATUS_NOT_FOUND; + error = HA_ERR_TABLESPACE_MISSING; + break; + + case DB_TABLESPACE_NOT_FOUND: + + ib_senderrf( + m_prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_MISSING, + table->s->table_name.str); + + table->status = STATUS_NOT_FOUND; + error = HA_ERR_TABLESPACE_MISSING; + break; + + default: + error = convert_error_code_to_mysql( + ret, m_prebuilt->table->flags, m_user_thd); + + table->status = STATUS_NOT_FOUND; + break; + } + + DBUG_RETURN(error); +} + +/*******************************************************************//** +The following function works like index_read, but it finds the last +row with the current key value or prefix. +@return 0, HA_ERR_KEY_NOT_FOUND, or an error code */ + +int +ha_innobase::index_read_last( +/*=========================*/ + uchar* buf, /*!< out: fetched row */ + const uchar* key_ptr,/*!< in: key value, or a prefix of a full + key value */ + uint key_len)/*!< in: length of the key val or prefix + in bytes */ +{ + return(index_read(buf, key_ptr, key_len, HA_READ_PREFIX_LAST)); +} + +/********************************************************************//** +Get the index for a handle. Does not change active index. +@return NULL or index instance. */ + +dict_index_t* +ha_innobase::innobase_get_index( +/*============================*/ + uint keynr) /*!< in: use this index; MAX_KEY means always + clustered index, even if it was internally + generated by InnoDB */ +{ + KEY* key = NULL; + dict_table_t* ib_table = m_prebuilt->table; + dict_index_t* index; + + DBUG_ENTER("innobase_get_index"); + + if (keynr != MAX_KEY && table->s->keys > 0) { + key = &table->key_info[keynr]; + index = dict_table_get_index_on_name(ib_table, key->name.str); + } else { + index = dict_table_get_first_index(ib_table); + } + + if (index == NULL) { + sql_print_error( + "InnoDB could not find key no %u with name %s" + " from dict cache for table %s", + keynr, key ? key->name.str : "NULL", + ib_table->name.m_name); + } + + DBUG_RETURN(index); +} + +/********************************************************************//** +Changes the active index of a handle.
+@return 0 or error code */ + +int +ha_innobase::change_active_index( +/*=============================*/ + uint keynr) /*!< in: use this index; MAX_KEY means always clustered + index, even if it was internally generated by + InnoDB */ +{ + DBUG_ENTER("change_active_index"); + + ut_ad(m_user_thd == ha_thd()); + ut_a(m_prebuilt->trx == thd_to_trx(m_user_thd)); + + active_index = keynr; + + m_prebuilt->index = innobase_get_index(keynr); + + if (m_prebuilt->index == NULL) { + sql_print_warning("InnoDB: change_active_index(%u) failed", + keynr); + m_prebuilt->index_usable = FALSE; + DBUG_RETURN(1); + } + + m_prebuilt->index_usable = row_merge_is_index_usable( + m_prebuilt->trx, m_prebuilt->index); + + if (!m_prebuilt->index_usable) { + if (m_prebuilt->index->is_corrupted()) { + char table_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name( + table_name, sizeof table_name, + m_prebuilt->index->table->name.m_name); + + if (m_prebuilt->index->is_primary()) { + ut_ad(m_prebuilt->index->table->corrupted); + push_warning_printf( + m_user_thd, Sql_condition::WARN_LEVEL_WARN, + ER_TABLE_CORRUPT, + "InnoDB: Table %s is corrupted.", + table_name); + DBUG_RETURN(ER_TABLE_CORRUPT); + } else { + push_warning_printf( + m_user_thd, Sql_condition::WARN_LEVEL_WARN, + HA_ERR_INDEX_CORRUPT, + "InnoDB: Index %s for table %s is" + " marked as corrupted", + m_prebuilt->index->name(), + table_name); + DBUG_RETURN(HA_ERR_INDEX_CORRUPT); + } + } else { + push_warning_printf( + m_user_thd, Sql_condition::WARN_LEVEL_WARN, + HA_ERR_TABLE_DEF_CHANGED, + "InnoDB: insufficient history for index %u", + keynr); + } + + /* The caller seems to ignore this. Thus, we must check + this again in row_search_mvcc(). */ + DBUG_RETURN(convert_error_code_to_mysql(DB_MISSING_HISTORY, + 0, NULL)); + } + + ut_a(m_prebuilt->search_tuple != 0); + + /* Initialization of search_tuple is not needed for FT index + since FT search returns rank only. In addition engine should + be able to retrieve FTS_DOC_ID column value if necessary. */ + if (m_prebuilt->index->type & DICT_FTS) { + for (uint i = 0; i < table->s->fields; i++) { + if (m_prebuilt->read_just_key + && bitmap_is_set(table->read_set, i) + && !strcmp(table->s->field[i]->field_name.str, + FTS_DOC_ID_COL_NAME)) { + m_prebuilt->fts_doc_id_in_read_set = true; + break; + } + } + } else { + ulint n_fields = dict_index_get_n_unique_in_tree( + m_prebuilt->index); + + dtuple_set_n_fields(m_prebuilt->search_tuple, n_fields); + + dict_index_copy_types( + m_prebuilt->search_tuple, m_prebuilt->index, + n_fields); + + /* If it's FTS query and FTS_DOC_ID exists FTS_DOC_ID field is + always added to read_set. */ + m_prebuilt->fts_doc_id_in_read_set = m_prebuilt->in_fts_query + && m_prebuilt->read_just_key + && m_prebuilt->index->contains_col_or_prefix( + m_prebuilt->table->fts->doc_col, false); + } + + /* MySQL changes the active index for a handle also during some + queries, for example SELECT MAX(a), SUM(a) first retrieves the MAX() + and then calculates the sum. Previously we played safe and used + the flag ROW_MYSQL_WHOLE_ROW below, but that caused unnecessary + copying. Starting from MySQL-4.1 we use a more efficient flag here. 
*/ + + build_template(false); + + DBUG_RETURN(0); +} + +/* @return true if it's necessary to switch current statement log format from +STATEMENT to ROW if binary log format is MIXED and autoincrement values +are changed in the statement */ +bool ha_innobase::autoinc_lock_mode_stmt_unsafe() const +{ + return innobase_autoinc_lock_mode == AUTOINC_NO_LOCKING; +} + +/***********************************************************************//** +Reads the next or previous row from a cursor, which must have previously been +positioned using index_read. +@return 0, HA_ERR_END_OF_FILE, or error number */ + +int +ha_innobase::general_fetch( +/*=======================*/ + uchar* buf, /*!< in/out: buffer for next row in MySQL + format */ + uint direction, /*!< in: ROW_SEL_NEXT or ROW_SEL_PREV */ + uint match_mode) /*!< in: 0, ROW_SEL_EXACT, or + ROW_SEL_EXACT_PREFIX */ +{ + DBUG_ENTER("general_fetch"); + + mariadb_set_stats set_stats_temporary(handler_stats); + const trx_t* trx = m_prebuilt->trx; + + ut_ad(trx == thd_to_trx(m_user_thd)); + + if (m_prebuilt->table->is_readable()) { + } else if (m_prebuilt->table->corrupted) { + DBUG_RETURN(HA_ERR_CRASHED); + } else { + DBUG_RETURN(m_prebuilt->table->space + ? HA_ERR_DECRYPTION_FAILED + : HA_ERR_NO_SUCH_TABLE); + } + + int error; + + switch (dberr_t ret = row_search_mvcc(buf, PAGE_CUR_UNSUPP, m_prebuilt, + match_mode, direction)) { + case DB_SUCCESS: + error = 0; + table->status = 0; + break; + case DB_RECORD_NOT_FOUND: + error = HA_ERR_END_OF_FILE; + table->status = STATUS_NOT_FOUND; + break; + case DB_END_OF_INDEX: + error = HA_ERR_END_OF_FILE; + table->status = STATUS_NOT_FOUND; + break; + case DB_TABLESPACE_DELETED: + ib_senderrf( + trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_DISCARDED, + table->s->table_name.str); + + table->status = STATUS_NOT_FOUND; + error = HA_ERR_TABLESPACE_MISSING; + break; + case DB_TABLESPACE_NOT_FOUND: + + ib_senderrf( + trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_MISSING, + table->s->table_name.str); + + table->status = STATUS_NOT_FOUND; + error = HA_ERR_TABLESPACE_MISSING; + break; + default: + error = convert_error_code_to_mysql( + ret, m_prebuilt->table->flags, m_user_thd); + + table->status = STATUS_NOT_FOUND; + break; + } + + DBUG_RETURN(error); +} + +/***********************************************************************//** +Reads the next row from a cursor, which must have previously been +positioned using index_read. +@return 0, HA_ERR_END_OF_FILE, or error number */ + +int +ha_innobase::index_next( +/*====================*/ + uchar* buf) /*!< in/out: buffer for next row in MySQL + format */ +{ + return(general_fetch(buf, ROW_SEL_NEXT, 0)); +} + +/*******************************************************************//** +Reads the next row matching to the key value given as the parameter. +@return 0, HA_ERR_END_OF_FILE, or error number */ + +int +ha_innobase::index_next_same( +/*=========================*/ + uchar* buf, /*!< in/out: buffer for the row */ + const uchar*, uint) +{ + return(general_fetch(buf, ROW_SEL_NEXT, m_last_match_mode)); +} + +/***********************************************************************//** +Reads the previous row from a cursor, which must have previously been +positioned using index_read. 
+@return 0, HA_ERR_END_OF_FILE, or error number */ + +int +ha_innobase::index_prev( +/*====================*/ + uchar* buf) /*!< in/out: buffer for previous row in MySQL format */ +{ + return(general_fetch(buf, ROW_SEL_PREV, 0)); +} + +/********************************************************************//** +Positions a cursor on the first record in an index and reads the +corresponding row to buf. +@return 0, HA_ERR_END_OF_FILE, or error code */ + +int +ha_innobase::index_first( +/*=====================*/ + uchar* buf) /*!< in/out: buffer for the row */ +{ + DBUG_ENTER("index_first"); + + int error = index_read(buf, NULL, 0, HA_READ_AFTER_KEY); + + /* MySQL does not seem to allow this to return HA_ERR_KEY_NOT_FOUND */ + + if (error == HA_ERR_KEY_NOT_FOUND) { + error = HA_ERR_END_OF_FILE; + } + + DBUG_RETURN(error); +} + +/********************************************************************//** +Positions a cursor on the last record in an index and reads the +corresponding row to buf. +@return 0, HA_ERR_END_OF_FILE, or error code */ + +int +ha_innobase::index_last( +/*====================*/ + uchar* buf) /*!< in/out: buffer for the row */ +{ + DBUG_ENTER("index_last"); + + int error = index_read(buf, NULL, 0, HA_READ_BEFORE_KEY); + + /* MySQL does not seem to allow this to return HA_ERR_KEY_NOT_FOUND */ + + if (error == HA_ERR_KEY_NOT_FOUND) { + error = HA_ERR_END_OF_FILE; + } + + DBUG_RETURN(error); +} + +/****************************************************************//** +Initialize a table scan. +@return 0 or error number */ + +int +ha_innobase::rnd_init( +/*==================*/ + bool scan) /*!< in: true if table/index scan FALSE otherwise */ +{ + int err; + + /* Store the active index value so that we can restore the original + value after a scan */ + + if (m_prebuilt->clust_index_was_generated) { + err = change_active_index(MAX_KEY); + } else { + err = change_active_index(m_primary_key); + } + + /* Don't use semi-consistent read in random row reads (by position). + This means we must disable semi_consistent_read if scan is false */ + + if (!scan) { + try_semi_consistent_read(0); + } + + m_start_of_scan = true; + + return(err); +} + +/*****************************************************************//** +Ends a table scan. +@return 0 or error number */ + +int +ha_innobase::rnd_end(void) +/*======================*/ +{ + return(index_end()); +} + +/*****************************************************************//** +Reads the next row in a table scan (also used to read the FIRST row +in a table scan). +@return 0, HA_ERR_END_OF_FILE, or error number */ + +int +ha_innobase::rnd_next( +/*==================*/ + uchar* buf) /*!< in/out: returns the row in this buffer, + in MySQL format */ +{ + int error; + DBUG_ENTER("rnd_next"); + + if (m_start_of_scan) { + error = index_first(buf); + + if (error == HA_ERR_KEY_NOT_FOUND) { + error = HA_ERR_END_OF_FILE; + } + + m_start_of_scan = false; + } else { + error = general_fetch(buf, ROW_SEL_NEXT, 0); + } + + DBUG_RETURN(error); +} + +/**********************************************************************//** +Fetches a row from the table based on a row reference. 
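For orientation, here is a stand-alone model of the position()/rnd_pos() contract assumed by the function below (RowRef and the helpers are invented for this sketch): a fixed-length reference, holding primary key bytes or the internal row id when no primary key exists, must refetch exactly one row via an exact index lookup.

#include <array>
#include <cstring>

using RowRef = std::array<unsigned char, 8>;   // stand-in for ref_length

static RowRef save_position(unsigned long long pk)          // position()
{
    RowRef r{};
    std::memcpy(r.data(), &pk, sizeof pk);
    return r;
}

static unsigned long long restore_position(const RowRef& r) // rnd_pos()
{
    unsigned long long pk;
    std::memcpy(&pk, r.data(), sizeof pk);
    return pk;   // then index_read(buf, ref, ref_length, HA_READ_KEY_EXACT)
}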
+@return 0, HA_ERR_KEY_NOT_FOUND, or error code */ + +int +ha_innobase::rnd_pos( +/*=================*/ + uchar* buf, /*!< in/out: buffer for the row */ + uchar* pos) /*!< in: primary key value of the row in the + MySQL format, or the row id if the clustered + index was internally generated by InnoDB; the + length of data in pos has to be ref_length */ +{ + DBUG_ENTER("rnd_pos"); + DBUG_DUMP("key", pos, ref_length); + + ut_a(m_prebuilt->trx == thd_to_trx(ha_thd())); + + /* Note that we assume the length of the row reference is fixed + for the table, and it is == ref_length */ + + int error = index_read(buf, pos, (uint)ref_length, HA_READ_KEY_EXACT); + + if (error != 0) { + DBUG_PRINT("error", ("Got error: %d", error)); + } + + DBUG_RETURN(error); +} + +/**********************************************************************//** +Initialize FT index scan +@return 0 or error number */ + +int +ha_innobase::ft_init() +/*==================*/ +{ + DBUG_ENTER("ft_init"); + + trx_t* trx = check_trx_exists(ha_thd()); + + /* FTS queries are not treated as autocommit non-locking selects. + This is because the FTS implementation can acquire locks behind + the scenes. This has not been verified but it is safer to treat + them as regular read only transactions for now. */ + + if (!trx_is_started(trx)) { + trx->will_lock = true; + } + + DBUG_RETURN(rnd_init(false)); +} + +/**********************************************************************//** +Initialize FT index scan +@return FT_INFO structure if successful or NULL */ + +FT_INFO* +ha_innobase::ft_init_ext( +/*=====================*/ + uint flags, /* in: */ + uint keynr, /* in: */ + String* key) /* in: */ +{ + NEW_FT_INFO* fts_hdl = NULL; + dict_index_t* index; + fts_result_t* result; + char buf_tmp[8192]; + ulint buf_tmp_used; + uint num_errors; + ulint query_len = key->length(); + const CHARSET_INFO* char_set = key->charset(); + const char* query = key->ptr(); + + if (UNIV_UNLIKELY(fts_enable_diag_print)) { + { + ib::info out; + out << "keynr=" << keynr << ", '"; + out.write(key->ptr(), key->length()); + } + + if (flags & FT_BOOL) { + ib::info() << "BOOL search"; + } else { + ib::info() << "NL search"; + } + } + + /* Multi-byte character sets like utf32 and utf16 are not + compatible with some of the string functions used, so we convert + the query to utf8 before we proceed. */ + if (char_set->mbminlen != 1) { + buf_tmp_used = innobase_convert_string( + buf_tmp, sizeof(buf_tmp) - 1, + &my_charset_utf8mb3_general_ci, + query, query_len, (CHARSET_INFO*) char_set, + &num_errors); + + buf_tmp[buf_tmp_used] = 0; + query = buf_tmp; + query_len = buf_tmp_used; + } + + trx_t* trx = m_prebuilt->trx; + + /* FTS queries are not treated as autocommit non-locking selects. + This is because the FTS implementation can acquire locks behind + the scenes. This has not been verified but it is safer to treat + them as regular read only transactions for now.
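A rough stand-alone picture of the result handling that follows (types simplified for the sketch; not the real fts_result_t layout): fts_query() produces ranked doc ids, and ft_read() walks them in rank order, refetching each document through FTS_DOC_ID_INDEX.

#include <functional>
#include <map>
#include <vector>

static std::vector<unsigned long long> ranked_doc_ids(
    const std::multimap<double, unsigned long long,
                        std::greater<double>>& by_rank)
{
    std::vector<unsigned long long> out;
    for (const auto& entry : by_rank)
        out.push_back(entry.second);      // each id drives one index lookup
    return out;
}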
*/ + + if (!trx_is_started(trx)) { + trx->will_lock = true; + } + + dict_table_t* ft_table = m_prebuilt->table; + + /* Table does not have an FTS index */ + if (!ft_table->fts || ib_vector_is_empty(ft_table->fts->indexes)) { + my_error(ER_TABLE_HAS_NO_FT, MYF(0)); + return(NULL); + } + + /* If the tablespace is discarded, we should return here */ + if (!ft_table->space) { + my_error(ER_TABLESPACE_MISSING, MYF(0), table->s->db.str, + table->s->table_name.str); + return(NULL); + } + + if (keynr == NO_SUCH_KEY) { + /* FIXME: Investigate the NO_SUCH_KEY usage */ + index = reinterpret_cast<dict_index_t*> + (ib_vector_getp(ft_table->fts->indexes, 0)); + } else { + index = innobase_get_index(keynr); + } + + if (index == NULL || index->type != DICT_FTS) { + my_error(ER_TABLE_HAS_NO_FT, MYF(0)); + return(NULL); + } + + if (!(ft_table->fts->added_synced)) { + fts_init_index(ft_table, FALSE); + + ft_table->fts->added_synced = true; + } + + const byte* q = reinterpret_cast<const byte*>( + const_cast<char*>(query)); + + // FIXME: support ft_init_ext_with_hints(), pass LIMIT + dberr_t error = fts_query(trx, index, flags, q, query_len, &result); + + if (error != DB_SUCCESS) { + my_error(convert_error_code_to_mysql(error, 0, NULL), MYF(0)); + return(NULL); + } + + /* Allocate FTS handler, and instantiate it before return */ + fts_hdl = reinterpret_cast<NEW_FT_INFO*>( + my_malloc(PSI_INSTRUMENT_ME, sizeof(NEW_FT_INFO), MYF(0))); + + fts_hdl->please = const_cast<_ft_vft*>(&ft_vft_result); + fts_hdl->could_you = const_cast<_ft_vft_ext*>(&ft_vft_ext_result); + fts_hdl->ft_prebuilt = m_prebuilt; + fts_hdl->ft_result = result; + + /* FIXME: Re-evaluate the condition when Bug 14469540 is resolved */ + m_prebuilt->in_fts_query = true; + + return(reinterpret_cast<FT_INFO*>(fts_hdl)); +} + +/*****************************************************************//** +Set up search tuple for a query through FTS_DOC_ID_INDEX on +supplied Doc ID. This is used by MySQL to retrieve the documents +once the search result (Doc IDs) is available + +@return DB_SUCCESS or DB_INDEX_CORRUPT +*/ +static +dberr_t +innobase_fts_create_doc_id_key( +/*===========================*/ + dtuple_t* tuple, /* in/out: m_prebuilt->search_tuple */ + const dict_index_t* + index, /* in: index (FTS_DOC_ID_INDEX) */ + doc_id_t* doc_id) /* in/out: doc id to search, value + could be changed to storage format + used for search.
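A stand-alone illustration of that conversion (a simplified analogue of what mach_write_to_8() achieves, not the real implementation): InnoDB key bytes compare as an unsigned big-endian string, so the most significant byte is written first regardless of the host byte order.

#include <cstdint>

static void write_be64(unsigned char* dst, std::uint64_t v)
{
    for (int i = 7; i >= 0; i--) {
        dst[i] = static_cast<unsigned char>(v & 0xff);
        v >>= 8;
    }
}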
*/ +{ + doc_id_t temp_doc_id; + dfield_t* dfield = dtuple_get_nth_field(tuple, 0); + const ulint n_uniq = index->table->fts_n_uniq(); + + if (dict_index_get_n_unique(index) != n_uniq) + return DB_INDEX_CORRUPT; + + dtuple_set_n_fields(tuple, index->n_fields); + dict_index_copy_types(tuple, index, index->n_fields); + +#ifdef UNIV_DEBUG + /* The unique Doc ID field should be an eight-bytes integer */ + dict_field_t* field = dict_index_get_nth_field(index, 0); + ut_a(field->col->mtype == DATA_INT); + ut_ad(sizeof(*doc_id) == field->fixed_len); + ut_ad(!strcmp(index->name, FTS_DOC_ID_INDEX_NAME)); +#endif /* UNIV_DEBUG */ + + /* Convert to storage byte order */ + mach_write_to_8(reinterpret_cast<byte*>(&temp_doc_id), *doc_id); + *doc_id = temp_doc_id; + dfield_set_data(dfield, doc_id, sizeof(*doc_id)); + + if (n_uniq == 2) { + ut_ad(index->table->versioned()); + dfield = dtuple_get_nth_field(tuple, 1); + if (index->table->versioned_by_id()) { + dfield_set_data(dfield, trx_id_max_bytes, + sizeof(trx_id_max_bytes)); + } else { + dfield_set_data(dfield, timestamp_max_bytes, + sizeof(timestamp_max_bytes)); + } + } + + dtuple_set_n_fields_cmp(tuple, n_uniq); + + for (ulint i = n_uniq; i < index->n_fields; i++) { + dfield = dtuple_get_nth_field(tuple, i); + dfield_set_null(dfield); + } + return DB_SUCCESS; +} + +/**********************************************************************//** +Fetch next result from the FT result set +@return error code */ + +int +ha_innobase::ft_read( +/*=================*/ + uchar* buf) /*!< in/out: buf contain result row */ +{ + row_prebuilt_t* ft_prebuilt; + mariadb_set_stats set_stats_temporary(handler_stats); + + ft_prebuilt = reinterpret_cast<NEW_FT_INFO*>(ft_handler)->ft_prebuilt; + + ut_a(ft_prebuilt == m_prebuilt); + + fts_result_t* result; + + result = reinterpret_cast<NEW_FT_INFO*>(ft_handler)->ft_result; + + if (result->current == NULL) { + /* This is the case where the FTS query did not + contain any matching documents. */ + if (result->rankings_by_id != NULL) { + /* Now that we have the complete result, we + need to sort the document ids on their rank + calculation. */ + + fts_query_sort_result_on_rank(result); + + result->current = const_cast<ib_rbt_node_t*>( + rbt_first(result->rankings_by_rank)); + } else { + ut_a(result->current == NULL); + } + } else { + result->current = const_cast<ib_rbt_node_t*>( + rbt_next(result->rankings_by_rank, result->current)); + } + +next_record: + + if (result->current != NULL) { + doc_id_t search_doc_id; + dtuple_t* tuple = m_prebuilt->search_tuple; + + /* If we only need information from result we can return + without fetching the table row */ + if (ft_prebuilt->read_just_key) { +#ifdef MYSQL_STORE_FTS_DOC_ID + if (m_prebuilt->fts_doc_id_in_read_set) { + fts_ranking_t* ranking; + ranking = rbt_value(fts_ranking_t, + result->current); + innobase_fts_store_docid( + table, ranking->doc_id); + } +#endif + table->status= 0; + return(0); + } + + dict_index_t* index; + + index = m_prebuilt->table->fts_doc_id_index; + + /* Must find the index */ + ut_a(index != NULL); + + /* Switch to the FTS doc id index */ + m_prebuilt->index = index; + + fts_ranking_t* ranking = rbt_value( + fts_ranking_t, result->current); + + search_doc_id = ranking->doc_id; + + /* We pass a pointer of search_doc_id because it will be + converted to storage byte order used in the search + tuple.
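(For illustration, the enclosing fetch loop visits matches in rank
order via rbt_first()/rbt_next(); a minimal standalone sketch of that
access pattern, with std::multimap standing in for the rb-tree of
fts_ranking_t nodes:

    #include <cstdint>
    #include <cstdio>
    #include <functional>
    #include <map>

    int main()
    {
        // rank -> doc id, highest rank first
        std::multimap<double, uint64_t, std::greater<double>> by_rank{
            {0.31, 7}, {0.90, 3}, {0.31, 12}};

        for (const auto& r : by_rank)
            std::printf("doc %llu rank %.2f\n",
                        (unsigned long long) r.second, r.first);
    }
)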
*/ + dberr_t ret = innobase_fts_create_doc_id_key( + tuple, index, &search_doc_id); + + if (ret == DB_SUCCESS) { + ret = row_search_mvcc( + buf, PAGE_CUR_GE, m_prebuilt, + ROW_SEL_EXACT, 0); + } + + int error; + + switch (ret) { + case DB_SUCCESS: + error = 0; + table->status = 0; + break; + case DB_RECORD_NOT_FOUND: + result->current = const_cast<ib_rbt_node_t*>( + rbt_next(result->rankings_by_rank, + result->current)); + + if (!result->current) { + /* exhausted the result set; should return + HA_ERR_END_OF_FILE just like + ha_innobase::general_fetch() and/or + ha_innobase::index_first() etc. */ + error = HA_ERR_END_OF_FILE; + table->status = STATUS_NOT_FOUND; + } else { + goto next_record; + } + break; + case DB_END_OF_INDEX: + error = HA_ERR_END_OF_FILE; + table->status = STATUS_NOT_FOUND; + break; + case DB_TABLESPACE_DELETED: + + ib_senderrf( + m_prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_DISCARDED, + table->s->table_name.str); + + table->status = STATUS_NOT_FOUND; + error = HA_ERR_TABLESPACE_MISSING; + break; + case DB_TABLESPACE_NOT_FOUND: + + ib_senderrf( + m_prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_MISSING, + table->s->table_name.str); + + table->status = STATUS_NOT_FOUND; + error = HA_ERR_TABLESPACE_MISSING; + break; + default: + error = convert_error_code_to_mysql( + ret, 0, m_user_thd); + + table->status = STATUS_NOT_FOUND; + break; + } + + return(error); + } + + return(HA_ERR_END_OF_FILE); +} + +#ifdef WITH_WSREP +inline +const char* +wsrep_key_type_to_str(Wsrep_service_key_type type) +{ + switch (type) { + case WSREP_SERVICE_KEY_SHARED: + return "shared"; + case WSREP_SERVICE_KEY_REFERENCE: + return "reference"; + case WSREP_SERVICE_KEY_UPDATE: + return "update"; + case WSREP_SERVICE_KEY_EXCLUSIVE: + return "exclusive"; + }; + return "unknown"; +} + +extern dberr_t +wsrep_append_foreign_key( +/*===========================*/ + trx_t* trx, /*!< in: trx */ + dict_foreign_t* foreign, /*!< in: foreign key constraint */ + const rec_t* rec, /*!< in: clustered index record */ + dict_index_t* index, /*!< in: clustered index */ + bool referenced, /*!< in: is check for referenced table */ + upd_node_t* upd_node, /*!< in: update node */ + bool pa_disable, /*!< in: disable parallel apply ? */ + Wsrep_service_key_type key_type) /*!< in: access type of this key + (shared, exclusive, semi...) */ +{ + ut_a(trx->is_wsrep()); + + if (!wsrep_thd_is_local(trx->mysql_thd)) + return DB_SUCCESS; + + if (upd_node && wsrep_protocol_version < 4) { + key_type = WSREP_SERVICE_KEY_SHARED; + } + + THD* thd = trx->mysql_thd; + + if (!foreign || + (!foreign->referenced_table && !foreign->foreign_table)) { + WSREP_INFO("FK: %s missing in: %s", + (!foreign ? "constraint" : + (!foreign->referenced_table ? + "referenced table" : "foreign table")), + wsrep_thd_query(thd)); + return DB_ERROR; + } + + ulint rcode = DB_SUCCESS; + char cache_key[513] = {'\0'}; + size_t cache_key_len = 0; + + if ( !((referenced) ? + foreign->referenced_table : foreign->foreign_table)) { + WSREP_DEBUG("pulling %s table into cache", + (referenced) ?
"referenced" : "foreign"); + dict_sys.lock(SRW_LOCK_CALL); + + if (referenced) { + foreign->referenced_table = + dict_sys.load_table( + {foreign->referenced_table_name_lookup, + strlen(foreign-> + referenced_table_name_lookup) + }); + if (foreign->referenced_table) { + foreign->referenced_index = + dict_foreign_find_index( + foreign->referenced_table, NULL, + foreign->referenced_col_names, + foreign->n_fields, + foreign->foreign_index, + TRUE, FALSE); + } + } else { + foreign->foreign_table = + dict_sys.load_table( + {foreign->foreign_table_name_lookup, + strlen(foreign-> + foreign_table_name_lookup)}); + + if (foreign->foreign_table) { + foreign->foreign_index = + dict_foreign_find_index( + foreign->foreign_table, NULL, + foreign->foreign_col_names, + foreign->n_fields, + foreign->referenced_index, + TRUE, FALSE); + } + } + dict_sys.unlock(); + } + + if ( !((referenced) ? + foreign->referenced_table : foreign->foreign_table)) { + WSREP_WARN("FK: %s missing in query: %s", + (!foreign->referenced_table) ? + "referenced table" : "foreign table", + wsrep_thd_query(thd)); + return DB_ERROR; + } + + byte key[WSREP_MAX_SUPPORTED_KEY_LENGTH+1] = {'\0'}; + ulint len = WSREP_MAX_SUPPORTED_KEY_LENGTH; + + dict_index_t *idx_target = (referenced) ? + foreign->referenced_index : index; + dict_index_t *idx = (referenced) ? + UT_LIST_GET_FIRST(foreign->referenced_table->indexes) : + UT_LIST_GET_FIRST(foreign->foreign_table->indexes); + int i = 0; + + while (idx != NULL && idx != idx_target) { + if (innobase_strcasecmp (idx->name, innobase_index_reserve_name) != 0) { + i++; + } + idx = UT_LIST_GET_NEXT(indexes, idx); + } + + ut_a(idx); + key[0] = byte(i); + + rcode = wsrep_rec_get_foreign_key( + &key[1], &len, rec, index, idx, + wsrep_protocol_version > 1); + + if (rcode != DB_SUCCESS) { + WSREP_ERROR( + "FK key set failed: " ULINTPF + " (" ULINTPF "%s), index: %s %s, %s", + rcode, referenced, wsrep_key_type_to_str(key_type), + (index) ? index->name() : "void index", + (index && index->table) ? index->table->name.m_name : + "void table", + wsrep_thd_query(thd)); + return DB_ERROR; + } + + strncpy(cache_key, + (wsrep_protocol_version > 1) ? + ((referenced) ? + foreign->referenced_table->name.m_name : + foreign->foreign_table->name.m_name) : + foreign->foreign_table->name.m_name, sizeof(cache_key) - 1); + cache_key_len = strlen(cache_key); + +#ifdef WSREP_DEBUG_PRINT + ulint j; + fprintf(stderr, "FK parent key, table: %s %s len: %lu ", + cache_key, wsrep_key_type_to_str(key_type), len+1); + for (j=0; jreferenced_table->name.m_name, + foreign->foreign_table->name.m_name); + } + + wsrep_buf_t wkey_part[3]; + wsrep_key_t wkey = {wkey_part, 3}; + + if (!wsrep_prepare_key_for_innodb( + thd, + (const uchar*)cache_key, + cache_key_len + 1, + (const uchar*)key, len+1, + wkey_part, + (size_t*)&wkey.key_parts_num)) { + WSREP_WARN("key prepare failed for cascaded FK: %s", + wsrep_thd_query(thd)); + return DB_ERROR; + } + + rcode = wsrep_thd_append_key(thd, &wkey, 1, key_type); + + if (rcode) { + WSREP_ERROR("Appending cascaded fk row key failed: %s, " + ULINTPF, + wsrep_thd_query(thd), + rcode); + return DB_ERROR; + } + + if (pa_disable) { + wsrep_thd_set_PA_unsafe(trx->mysql_thd); + } + + return DB_SUCCESS; +} + +static int +wsrep_append_key( +/*=============*/ + THD *thd, + trx_t *trx, + TABLE_SHARE *table_share, + const char* key, + uint16_t key_len, + Wsrep_service_key_type key_type /*!< in: access type of this key + (shared, exclusive, semi...) 
*/ +) +{ + ut_ad(!trx->is_bulk_insert()); + + DBUG_ENTER("wsrep_append_key"); + DBUG_PRINT("enter", + ("thd: %lu trx: %lld", thd_get_thread_id(thd), + (long long)trx->id)); +#ifdef WSREP_DEBUG_PRINT + fprintf(stderr, "%s conn %lu, trx " TRX_ID_FMT ", keylen %d, key %s.%s\n", + wsrep_key_type_to_str(key_type), + thd_get_thread_id(thd), trx->id, key_len, + table_share->table_name.str, key); + for (int i=0; i<key_len; i++) { + fprintf(stderr, "%hhX, ", key[i]); + } + fprintf(stderr, "\n"); +#endif + wsrep_buf_t wkey_part[3]; + wsrep_key_t wkey = {wkey_part, 3}; + + if (!wsrep_prepare_key_for_innodb( + thd, + (const uchar*)table_share->table_cache_key.str, + table_share->table_cache_key.length, + (const uchar*)key, key_len, + wkey_part, + (size_t*)&wkey.key_parts_num)) { + WSREP_WARN("key prepare failed for: %s", + (wsrep_thd_query(thd)) ? + wsrep_thd_query(thd) : "void"); + DBUG_RETURN(HA_ERR_INTERNAL_ERROR); + } + + int rcode = wsrep_thd_append_key(thd, &wkey, 1, key_type); + if (rcode) { + DBUG_PRINT("wsrep", ("row key failed: %d", rcode)); + WSREP_WARN("Appending row key failed: %s, %d", + (wsrep_thd_query(thd)) ? + wsrep_thd_query(thd) : "void", rcode); + DBUG_RETURN(HA_ERR_INTERNAL_ERROR); + } + + DBUG_RETURN(0); +} + +static bool +referenced_by_foreign_key2( +/*=======================*/ + dict_table_t* table, + dict_index_t* index) +{ + ut_ad(table != NULL); + ut_ad(index != NULL); + + const dict_foreign_set* fks = &table->referenced_set; + + for (dict_foreign_set::const_iterator it = fks->begin(); + it != fks->end(); + ++it) { + dict_foreign_t* foreign = *it; + + if (foreign->referenced_index != index) { + continue; + } + ut_ad(table == foreign->referenced_table); + return true; + } + return false; +} + +int +ha_innobase::wsrep_append_keys( +/*===========================*/ + THD *thd, + Wsrep_service_key_type key_type, /*!< in: access type of this row + operation: + (shared, exclusive, reference...) */ + const uchar* record0, /* in: row in MySQL format */ + const uchar* record1) /* in: row in MySQL format */ +{ + /* Sanity check: newly inserted records should always be passed with + EXCLUSIVE key type, all the rest are expected to carry a pre-image + */ + ut_a(record1 != NULL || key_type == WSREP_SERVICE_KEY_EXCLUSIVE); + + int rcode; + DBUG_ENTER("wsrep_append_keys"); + + bool key_appended = false; + trx_t *trx = thd_to_trx(thd); + +#ifdef WSREP_DEBUG_PRINT + fprintf(stderr, "%s conn %lu, trx " TRX_ID_FMT ", table %s\nSQL: %s\n", + wsrep_key_type_to_str(key_type), + thd_get_thread_id(thd), trx->id, + table_share->table_name.str, wsrep_thd_query(thd)); +#endif + + if (table_share && table_share->tmp_table != NO_TMP_TABLE) { + WSREP_DEBUG("skipping tmp table DML: THD: %lu tmp: %d SQL: %s", + thd_get_thread_id(thd), + table_share->tmp_table, + (wsrep_thd_query(thd)) ? + wsrep_thd_query(thd) : "void"); + DBUG_RETURN(0); + } + + if (wsrep_protocol_version == 0) { + char keyval[WSREP_MAX_SUPPORTED_KEY_LENGTH+1] = {'\0'}; + char *key = &keyval[0]; + bool is_null; + + auto len = wsrep_store_key_val_for_row( + thd, table, 0, key, WSREP_MAX_SUPPORTED_KEY_LENGTH, + record0, &is_null); + + if (!is_null) { + rcode = wsrep_append_key( + thd, trx, table_share, keyval, + len, key_type); + + if (rcode) { + DBUG_RETURN(rcode); + } + } else { + WSREP_DEBUG("NULL key skipped (proto 0): %s", + wsrep_thd_query(thd)); + } + } else { + ut_a(table->s->keys <= 256); + uint i; + bool hasPK= false; + + for (i=0; i<table->s->keys; ++i) { + KEY* key_info = table->key_info + i; + if (key_info->flags & HA_NOSAME) { + hasPK = true; + break; + } + } + + for (i=0; i<table->s->keys; ++i) { + KEY* key_info = table->key_info + i; + + dict_index_t* idx = innobase_get_index(i); + dict_table_t* tab = (idx) ?
idx->table : NULL; + + /* keyval[] shall contain an ordinal number at byte 0 + and the actual key data shall be written at byte 1. + Hence the total data length is the key length + 1 */ + char keyval0[WSREP_MAX_SUPPORTED_KEY_LENGTH+1]= {'\0'}; + char keyval1[WSREP_MAX_SUPPORTED_KEY_LENGTH+1]= {'\0'}; + keyval0[0] = (char)i; + keyval1[0] = (char)i; + char* key0 = &keyval0[1]; + char* key1 = &keyval1[1]; + + if (!tab) { + WSREP_WARN("MariaDB-InnoDB key mismatch %s %s", + table->s->table_name.str, + key_info->name.str); + } + /* !hasPK == table with no PK, + must append all non-unique keys */ + if (!hasPK || key_info->flags & HA_NOSAME || + ((tab && + referenced_by_foreign_key2(tab, idx)) || + (!tab && referenced_by_foreign_key()))) { + + bool is_null0; + auto len0 = wsrep_store_key_val_for_row( + thd, table, i, key0, + WSREP_MAX_SUPPORTED_KEY_LENGTH, + record0, &is_null0); + + if (record1) { + bool is_null1; + auto len1= wsrep_store_key_val_for_row( + thd, table, i, key1, + WSREP_MAX_SUPPORTED_KEY_LENGTH, + record1, &is_null1); + + if (is_null0 != is_null1 || + len0 != len1 || + memcmp(key0, key1, len0)) { + /* This key has changed. If it + is unique, this is an exclusive + operation -> upgrade key type */ + if (key_info->flags & HA_NOSAME) { + key_type = WSREP_SERVICE_KEY_EXCLUSIVE; + } + + if (!is_null1) { + rcode = wsrep_append_key( + thd, trx, table_share, + keyval1, + /* for len1+1 see keyval1 + initialization comment */ + uint16_t(len1+1), + key_type); + if (rcode) + DBUG_RETURN(rcode); + } + } + } + + if (!is_null0) { + rcode = wsrep_append_key( + thd, trx, table_share, + /* for len0+1 see keyval0 + initialization comment */ + keyval0, uint16_t(len0+1), + key_type); + if (rcode) + DBUG_RETURN(rcode); + + if (key_info->flags & HA_NOSAME || + key_type == WSREP_SERVICE_KEY_SHARED|| + key_type == WSREP_SERVICE_KEY_REFERENCE) + key_appended = true; + } else { + WSREP_DEBUG("NULL key skipped: %s", + wsrep_thd_query(thd)); + } + } + } + } + + /* if no PK, calculate hash of full row, to be the key value */ + if (!key_appended && wsrep_certify_nonPK) { + uchar digest[16]; + + wsrep_calc_row_hash(digest, record0, table, m_prebuilt); + + if (int rcode = wsrep_append_key(thd, trx, table_share, + reinterpret_cast<char*> + (digest), 16, key_type)) { + DBUG_RETURN(rcode); + } + + if (record1) { + wsrep_calc_row_hash( + digest, record1, table, m_prebuilt); + if (int rcode = wsrep_append_key( + thd, trx, table_share, + reinterpret_cast<char*>(digest), 16, + key_type)) { + DBUG_RETURN(rcode); + } + } + DBUG_RETURN(0); + } + + DBUG_RETURN(0); +} +#endif /* WITH_WSREP */ + +/*********************************************************************//** +Stores a reference to the current row to 'ref' field of the handle. Note +that in the case where we have generated the clustered index for the +table, the function parameter is illogical: we MUST ASSUME that 'record' +is the current 'position' of the handle, because if row ref is actually +the row id internally generated in InnoDB, then 'record' does not contain +it. We just guess that the row id must be for the record where the handle +was positioned the last time.
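For illustration, when no usable key was appended, wsrep_append_keys()
above falls back to a 16-byte digest of the whole row image. A minimal
standalone sketch of that idea follows (a toy FNV-1a-based digest; the
digest the server actually computes is different): */

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Two FNV-1a passes with different basis values yield 16 bytes.
static void row_digest(const unsigned char* row, std::size_t len,
                       unsigned char out[16])
{
    static const uint64_t basis[2] =
        {0xcbf29ce484222325ULL, 0x84222325cbf29ce4ULL};
    for (int h = 0; h < 2; h++) {
        uint64_t v = basis[h];
        for (std::size_t i = 0; i < len; i++) {
            v ^= row[i];
            v *= 0x100000001b3ULL;
        }
        for (int b = 0; b < 8; b++)
            out[h * 8 + b] = (unsigned char)(v >> (b * 8));
    }
}

int main()
{
    const unsigned char row[] = "r1|alice|42";
    unsigned char d[16];
    row_digest(row, sizeof row - 1, d);
    for (int i = 0; i < 16; i++) std::printf("%02x", d[i]);
    std::printf("\n");
}

/*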
*/ + +void +ha_innobase::position( +/*==================*/ + const uchar* record) /*!< in: row in MySQL format */ +{ + uint len; + + ut_a(m_prebuilt->trx == thd_to_trx(ha_thd())); + + if (m_prebuilt->clust_index_was_generated) { + /* No primary key was defined for the table and we + generated the clustered index from row id: the + row reference will be the row id, not any key value + that MySQL knows of */ + + len = DATA_ROW_ID_LEN; + + memcpy(ref, m_prebuilt->row_id, len); + } else { + + /* Copy primary key as the row reference */ + KEY* key_info = table->key_info + m_primary_key; + key_copy(ref, (uchar*)record, key_info, key_info->key_length); + len = key_info->key_length; + } + + ut_ad(len == ref_length); +} + +/*****************************************************************//** +Check whether there exist a column named as "FTS_DOC_ID", which is +reserved for InnoDB FTS Doc ID +@return true if there exist a "FTS_DOC_ID" column */ +static +bool +create_table_check_doc_id_col( +/*==========================*/ + trx_t* trx, /*!< in: InnoDB transaction handle */ + const TABLE* form, /*!< in: information on table + columns and indexes */ + ulint* doc_id_col) /*!< out: Doc ID column number if + there exist a FTS_DOC_ID column, + ULINT_UNDEFINED if column is of the + wrong type/name/size */ +{ + for (ulint i = 0; i < form->s->fields; i++) { + const Field* field = form->field[i]; + if (!field->stored_in_db()) { + continue; + } + + unsigned unsigned_type; + + auto col_type = get_innobase_type_from_mysql_type( + &unsigned_type, field); + + auto col_len = field->pack_length(); + + if (innobase_strcasecmp(field->field_name.str, + FTS_DOC_ID_COL_NAME) == 0) { + + /* Note the name is case sensitive due to + our internal query parser */ + if (col_type == DATA_INT + && !field->real_maybe_null() + && col_len == sizeof(doc_id_t) + && (strcmp(field->field_name.str, + FTS_DOC_ID_COL_NAME) == 0)) { + *doc_id_col = i; + } else { + push_warning_printf( + trx->mysql_thd, + Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: FTS_DOC_ID column must be" + " of BIGINT NOT NULL type, and named" + " in all capitalized characters"); + my_error(ER_WRONG_COLUMN_NAME, MYF(0), + field->field_name.str); + *doc_id_col = ULINT_UNDEFINED; + } + + return(true); + } + } + + return(false); +} + + +/** Finds all base columns needed to compute a given generated column. +This is returned as a bitmap, in field->table->tmp_set. +Works for both dict_v_col_t and dict_s_col_t columns. 
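For illustration, a minimal standalone sketch of the bitmap walk this
function performs, with std::bitset standing in for MY_BITMAP/tmp_set
and an explicit dependency list standing in for the expression walk: */

#include <bitset>
#include <cstddef>
#include <cstdio>
#include <vector>

int main()
{
    const std::size_t MAX_FIELDS = 64;
    std::bitset<MAX_FIELDS> read_set;          // cleared, like tmp_set

    const int vcol_reads[] = {0, 3};           // e.g. v = a + d
    for (int f : vcol_reads) read_set.set(f);  // mark base columns read

    std::vector<int> base_cols;                // sized from the bit count
    base_cols.reserve(read_set.count());
    for (std::size_t i = 0; i < MAX_FIELDS; i++)
        if (read_set.test(i)) base_cols.push_back((int) i);

    std::printf("num_base=%zu\n", base_cols.size());  // prints: num_base=2
}

/*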
+@param[in] table InnoDB table +@param[in] field MySQL field +@param[in,out] col virtual or stored column */ +template <typename T> +void +prepare_vcol_for_base_setup( +/*========================*/ + const dict_table_t* table, + const Field* field, + T* col) +{ + ut_ad(col->num_base == 0); + ut_ad(col->base_col == NULL); + + MY_BITMAP *old_read_set = field->table->read_set; + + field->table->read_set = &field->table->tmp_set; + + bitmap_clear_all(&field->table->tmp_set); + field->vcol_info->expr->walk( + &Item::register_field_in_read_map, 1, field->table); + col->num_base= bitmap_bits_set(&field->table->tmp_set) + & dict_index_t::MAX_N_FIELDS; + if (col->num_base != 0) { + col->base_col = static_cast<dict_col_t**>(mem_heap_zalloc( + table->heap, col->num_base * sizeof( + * col->base_col))); + } + field->table->read_set= old_read_set; +} + + +/** Set up base columns for virtual column +@param[in] table InnoDB table +@param[in] field MySQL field +@param[in,out] v_col virtual column */ +void +innodb_base_col_setup( + dict_table_t* table, + const Field* field, + dict_v_col_t* v_col) +{ + uint16_t n = 0; + + prepare_vcol_for_base_setup(table, field, v_col); + + for (uint i= 0; i < field->table->s->fields; ++i) { + const Field* base_field = field->table->field[i]; + if (base_field->stored_in_db() + && bitmap_is_set(&field->table->tmp_set, i)) { + ulint z; + + for (z = 0; z < table->n_cols; z++) { + const char* name = dict_table_get_col_name(table, z); + if (!innobase_strcasecmp(name, + base_field->field_name.str)) { + break; + } + } + + ut_ad(z != table->n_cols); + + v_col->base_col[n] = dict_table_get_nth_col(table, z); + ut_ad(v_col->base_col[n]->ind == z); + n++; + } + } + v_col->num_base= n & dict_index_t::MAX_N_FIELDS; +} + +/** Set up base columns for stored column +@param[in] table InnoDB table +@param[in] field MySQL field +@param[in,out] s_col stored column */ +void +innodb_base_col_setup_for_stored( + const dict_table_t* table, + const Field* field, + dict_s_col_t* s_col) +{ + ulint n = 0; + + prepare_vcol_for_base_setup(table, field, s_col); + + for (uint i= 0; i < field->table->s->fields; ++i) { + const Field* base_field = field->table->field[i]; + + if (base_field->stored_in_db() + && bitmap_is_set(&field->table->tmp_set, i)) { + ulint z; + for (z = 0; z < table->n_cols; z++) { + const char* name = dict_table_get_col_name( + table, z); + if (!innobase_strcasecmp( + name, base_field->field_name.str)) { + break; + } + } + + ut_ad(z != table->n_cols); + + s_col->base_col[n] = dict_table_get_nth_col(table, z); + n++; + + if (n == s_col->num_base) { + break; + } + } + } + s_col->num_base= n; +} + +/** Create a table definition in an InnoDB database. +@return ER_* level error */ +inline MY_ATTRIBUTE((warn_unused_result)) +int +create_table_info_t::create_table_def() +{ + dict_table_t* table; + ulint nulls_allowed; + unsigned unsigned_type; + ulint binary_type; + ulint long_true_varchar; + ulint charset_no; + ulint doc_id_col = 0; + ibool has_doc_id_col = FALSE; + mem_heap_t* heap; + ha_table_option_struct *options= m_form->s->option_struct; + dberr_t err = DB_SUCCESS; + + DBUG_ENTER("create_table_def"); + DBUG_PRINT("enter", ("table_name: %s", m_table_name)); + + DBUG_ASSERT(m_trx->mysql_thd == m_thd); + + /* MySQL does the name length check.
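(For illustration, a minimal standalone sketch of the two checks made
here, with a hypothetical limit in place of MAX_FULL_NAME_LEN:

    #include <cstddef>
    #include <cstdio>
    #include <cstring>

    static const char* check_name(const char* name, std::size_t max_len)
    {
        std::size_t len = std::strlen(name);
        if (len > max_len) return "name too long";
        if (len == 0 || name[len - 1] == '/') return "empty table name";
        return nullptr;  // OK
    }

    int main()
    {
        const char* err = check_name("test/", 512);
        std::printf("%s\n", err ? err : "ok");  // prints: empty table name
    }
)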
But we do additional check + on the name length here */ + const size_t table_name_len = strlen(m_table_name); + if (table_name_len > MAX_FULL_NAME_LEN) { + push_warning_printf( + m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_TABLE_NAME, + "InnoDB: Table Name or Database Name is too long"); + + DBUG_RETURN(ER_TABLE_NAME); + } + + if (m_table_name[table_name_len - 1] == '/') { + push_warning_printf( + m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_TABLE_NAME, + "InnoDB: Table name is empty"); + + DBUG_RETURN(ER_WRONG_TABLE_NAME); + } + + /* Find out the number of virtual columns. */ + ulint num_v = 0; + const bool omit_virtual = ha_innobase::omits_virtual_cols(*m_form->s); + const ulint n_cols = omit_virtual + ? m_form->s->stored_fields : m_form->s->fields; + + if (!omit_virtual) { + for (ulint i = 0; i < n_cols; i++) { + num_v += !m_form->field[i]->stored_in_db(); + } + } + + /* Check whether there already exists a FTS_DOC_ID column */ + if (create_table_check_doc_id_col(m_trx, m_form, &doc_id_col)){ + + /* Raise error if the Doc ID column is of wrong type or name */ + if (doc_id_col == ULINT_UNDEFINED) { + DBUG_RETURN(HA_ERR_GENERIC); + } else { + has_doc_id_col = TRUE; + } + } + + /* Adjust the number of columns for the FTS hidden field */ + const ulint actual_n_cols = n_cols + + (m_flags2 & DICT_TF2_FTS && !has_doc_id_col); + + table = dict_table_t::create({m_table_name,table_name_len}, nullptr, + actual_n_cols, num_v, m_flags, m_flags2); + + /* Set the hidden doc_id column. */ + if (m_flags2 & DICT_TF2_FTS) { + table->fts->doc_col = has_doc_id_col + ? doc_id_col : n_cols - num_v; + } + + if (DICT_TF_HAS_DATA_DIR(m_flags)) { + ut_a(strlen(m_remote_path)); + + table->data_dir_path = mem_heap_strdup( + table->heap, m_remote_path); + + } else { + table->data_dir_path = NULL; + } + + heap = mem_heap_create(1000); + auto _ = make_scope_exit([heap]() { mem_heap_free(heap); }); + + ut_d(bool have_vers_start = false); + ut_d(bool have_vers_end = false); + + for (ulint i = 0, j = 0; j < n_cols; i++) { + Field* field = m_form->field[i]; + ulint vers_row = 0; + + if (m_form->versioned()) { + if (i == m_form->s->vers.start_fieldno) { + vers_row = DATA_VERS_START; + ut_d(have_vers_start = true); + } else if (i == m_form->s->vers.end_fieldno) { + vers_row = DATA_VERS_END; + ut_d(have_vers_end = true); + } else if (!(field->flags + & VERS_UPDATE_UNVERSIONED_FLAG)) { + vers_row = DATA_VERSIONED; + } + } + + auto col_type = get_innobase_type_from_mysql_type( + &unsigned_type, field); + + if (!col_type) { + push_warning_printf( + m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_CANT_CREATE_TABLE, + "Error creating table '%s' with" + " column '%s'. Please check its" + " column type and try to re-create" + " the table with an appropriate" + " column type.", + table->name.m_name, field->field_name.str); +err_col: + dict_mem_table_free(table); + DBUG_RETURN(HA_ERR_GENERIC); + } + + nulls_allowed = field->real_maybe_null() ? 0 : DATA_NOT_NULL; + binary_type = field->binary() ? DATA_BINARY_TYPE : 0; + + charset_no = 0; + + if (dtype_is_string_type(col_type)) { + + charset_no = (ulint) field->charset()->number; + + DBUG_EXECUTE_IF("simulate_max_char_col", + charset_no = MAX_CHAR_COLL_NUM + 1; + ); + + if (charset_no > MAX_CHAR_COLL_NUM) { + /* in data0type.h we assume that the + number fits in one byte in prtype */ + push_warning_printf( + m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_CANT_CREATE_TABLE, + "In InnoDB, charset-collation codes" + " must be below 256." 
+ " Unsupported code " ULINTPF ".", + charset_no); + dict_mem_table_free(table); + + DBUG_RETURN(ER_CANT_CREATE_TABLE); + } + } + + auto col_len = field->pack_length(); + + /* The MySQL pack length contains 1 or 2 bytes length field + for a true VARCHAR. Let us subtract that, so that the InnoDB + column length in the InnoDB data dictionary is the real + maximum byte length of the actual data. */ + + long_true_varchar = 0; + + if (field->type() == MYSQL_TYPE_VARCHAR) { + col_len -= ((Field_varstring*) field)->length_bytes; + + if (((Field_varstring*) field)->length_bytes == 2) { + long_true_varchar = DATA_LONG_TRUE_VARCHAR; + } + } + + /* First check whether the column to be added has a + system reserved name. */ + if (dict_col_name_is_reserved(field->field_name.str)){ + my_error(ER_WRONG_COLUMN_NAME, MYF(0), + field->field_name.str); + goto err_col; + } + + ulint is_virtual = !field->stored_in_db() ? DATA_VIRTUAL : 0; + + if (!is_virtual) { + dict_mem_table_add_col(table, heap, + field->field_name.str, col_type, + dtype_form_prtype( + (ulint) field->type() + | nulls_allowed | unsigned_type + | binary_type | long_true_varchar + | vers_row, + charset_no), + col_len); + } else if (!omit_virtual) { + dict_mem_table_add_v_col(table, heap, + field->field_name.str, col_type, + dtype_form_prtype( + (ulint) field->type() + | nulls_allowed | unsigned_type + | binary_type | long_true_varchar + | vers_row + | is_virtual, + charset_no), + col_len, i, 0); + } + + if (innobase_is_s_fld(field)) { + ut_ad(!is_virtual); + /* Added stored column in m_s_cols list. */ + dict_mem_table_add_s_col( + table, 0); + } + + if (is_virtual && omit_virtual) { + continue; + } + + j++; + } + + ut_ad(have_vers_start == have_vers_end); + ut_ad(table->versioned() == have_vers_start); + ut_ad(!table->versioned() || table->vers_start != table->vers_end); + + if (num_v) { + for (ulint i = 0, j = 0; i < n_cols; i++) { + dict_v_col_t* v_col; + + const Field* field = m_form->field[i]; + + if (field->stored_in_db()) { + continue; + } + + v_col = dict_table_get_nth_v_col(table, j); + + j++; + + innodb_base_col_setup(table, field, v_col); + } + } + + /** Fill base columns for the stored column present in the list. */ + if (table->s_cols && !table->s_cols->empty()) { + for (ulint i = 0; i < n_cols; i++) { + Field* field = m_form->field[i]; + + if (!innobase_is_s_fld(field)) { + continue; + } + + dict_s_col_list::iterator it; + for (it = table->s_cols->begin(); + it != table->s_cols->end(); ++it) { + dict_s_col_t s_col = *it; + + if (s_col.s_pos == i) { + innodb_base_col_setup_for_stored( + table, field, &s_col); + break; + } + } + } + } + + /* Add the FTS doc_id hidden column. 
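(For illustration, the VARCHAR adjustment made earlier in this function
as a standalone sketch; pack_length includes the 1- or 2-byte length
prefix that InnoDB strips off, and a 2-byte prefix marks a long true
VARCHAR:

    #include <cstdio>

    struct varchar_spec { unsigned pack_length; unsigned length_bytes; };

    static unsigned innodb_data_len(const varchar_spec& v)
    {
        return v.pack_length - v.length_bytes;
    }

    int main()
    {
        varchar_spec v{258, 2};  // VARCHAR(256): 256 data + 2 prefix bytes
        std::printf("len=%u long_true_varchar=%d\n",
                    innodb_data_len(v), v.length_bytes == 2);
    }
)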
*/ + if (m_flags2 & DICT_TF2_FTS && !has_doc_id_col) { + fts_add_doc_id_column(table, heap); + } + + dict_table_add_system_columns(table, heap); + + if (table->is_temporary()) { + if ((options->encryption == 1 + && !innodb_encrypt_temporary_tables) + || (options->encryption == 2 + && innodb_encrypt_temporary_tables)) { + push_warning_printf(m_thd, + Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "Ignoring encryption parameter during " + "temporary table creation."); + } + + table->id = dict_sys.acquire_temporary_table_id(); + ut_ad(dict_tf_get_rec_format(table->flags) + != REC_FORMAT_COMPRESSED); + table->space_id = SRV_TMP_SPACE_ID; + table->space = fil_system.temp_space; + table->add_to_cache(); + } else { + ut_ad(dict_sys.sys_tables_exist()); + + err = row_create_table_for_mysql(table, m_trx); + } + + switch (err) { + case DB_SUCCESS: + ut_ad(table); + m_table = table; + DBUG_RETURN(0); + default: + break; + case DB_DUPLICATE_KEY: + char display_name[FN_REFLEN]; + char* buf_end = innobase_convert_identifier( + display_name, sizeof(display_name) - 1, + m_table_name, strlen(m_table_name), + m_thd); + + *buf_end = '\0'; + + my_error(ER_TABLE_EXISTS_ERROR, MYF(0), display_name); + } + + DBUG_RETURN(convert_error_code_to_mysql(err, m_flags, m_thd)); +} + +/*****************************************************************//** +Creates an index in an InnoDB database. */ +inline +int +create_index( +/*=========*/ + trx_t* trx, /*!< in: InnoDB transaction handle */ + const TABLE* form, /*!< in: information on table + columns and indexes */ + dict_table_t* table, /*!< in,out: table */ + uint key_num) /*!< in: index number */ +{ + dict_index_t* index; + int error; + const KEY* key; + ulint* field_lengths; + + DBUG_ENTER("create_index"); + + key = form->key_info + key_num; + + /* Assert that "GEN_CLUST_INDEX" cannot be used as non-primary index */ + ut_a(innobase_strcasecmp(key->name.str, innobase_index_reserve_name) != 0); + const ha_table_option_struct& o = *form->s->option_struct; + + if (key->flags & (HA_SPATIAL | HA_FULLTEXT)) { + /* Only one of these can be specified at a time. */ + ut_ad(~key->flags & (HA_SPATIAL | HA_FULLTEXT)); + ut_ad(!(key->flags & HA_NOSAME)); + index = dict_mem_index_create(table, key->name.str, + (key->flags & HA_SPATIAL) + ? 
DICT_SPATIAL : DICT_FTS, + key->user_defined_key_parts); + + for (ulint i = 0; i < key->user_defined_key_parts; i++) { + const Field* field = key->key_part[i].field; + + /* We do not support special (Fulltext or Spatial) + index on virtual columns */ + if (!field->stored_in_db()) { + ut_ad(0); + DBUG_RETURN(HA_ERR_UNSUPPORTED); + } + + dict_mem_index_add_field(index, field->field_name.str, + 0, + key->key_part->key_part_flag + & HA_REVERSE_SORT); + } + + DBUG_RETURN(convert_error_code_to_mysql( + row_create_index_for_mysql( + index, trx, NULL, + fil_encryption_t(o.encryption), + uint32_t(o.encryption_key_id)), + table->flags, NULL)); + } + + ulint ind_type = 0; + + if (key_num == form->s->primary_key) { + ind_type |= DICT_CLUSTERED; + } + + if (key->flags & HA_NOSAME) { + ind_type |= DICT_UNIQUE; + } + + field_lengths = (ulint*) my_malloc(PSI_INSTRUMENT_ME, + key->user_defined_key_parts * sizeof * + field_lengths, MYF(MY_FAE)); + + /* We pass 0 as the space id, and determine at a lower level the space + id where to store the table */ + + index = dict_mem_index_create(table, key->name.str, + ind_type, key->user_defined_key_parts); + + for (ulint i = 0; i < key->user_defined_key_parts; i++) { + KEY_PART_INFO* key_part = key->key_part + i; + ulint prefix_len; + unsigned is_unsigned; + + + /* (The flag HA_PART_KEY_SEG denotes in MySQL a + column prefix field in an index: we only store a + specified number of first bytes of the column to + the index field.) The flag does not seem to be + properly set by MySQL. Let us fall back on testing + the length of the key part versus the column. + We first reach to the table's column; if the index is on a + prefix, key_part->field is not the table's column (it's a + "fake" field forged in open_table_from_share() with length + equal to the length of the prefix); so we have to go to + form->field. */ + Field* field= form->field[key_part->field->field_index]; + if (field == NULL) + ut_error; + + const char* field_name = key_part->field->field_name.str; + + auto col_type = get_innobase_type_from_mysql_type( + &is_unsigned, key_part->field); + + if (DATA_LARGE_MTYPE(col_type) + || (key_part->length < field->pack_length() + && field->type() != MYSQL_TYPE_VARCHAR) + || (field->type() == MYSQL_TYPE_VARCHAR + && key_part->length < field->pack_length() + - ((Field_varstring*) field)->length_bytes)) { + + switch (col_type) { + default: + prefix_len = key_part->length; + break; + case DATA_INT: + case DATA_FLOAT: + case DATA_DOUBLE: + case DATA_DECIMAL: + sql_print_error( + "MariaDB is trying to create a column" + " prefix index field, on an" + " inappropriate data type. Table" + " name %s, column name %s.", + form->s->table_name.str, + key_part->field->field_name.str); + + prefix_len = 0; + } + } else { + prefix_len = 0; + } + + ut_ad(prefix_len % field->charset()->mbmaxlen == 0); + + field_lengths[i] = key_part->length; + + if (!key_part->field->stored_in_db()) { + index->type |= DICT_VIRTUAL; + } + + dict_mem_index_add_field(index, field_name, prefix_len, + key_part->key_part_flag + & HA_REVERSE_SORT); + } + + ut_ad(key->flags & HA_FULLTEXT || !(index->type & DICT_FTS)); + + /* Even though we've defined max_supported_key_part_length, we + still do our own checking using field_lengths to be absolutely + sure we don't create too long indexes.
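(For illustration, the prefix decision above reduced to a standalone
sketch; col_data_len is hypothetical shorthand for the column's maximum
byte length net of any VARCHAR length prefix:

    #include <cstdio>

    static unsigned prefix_len_for(unsigned key_part_len,
                                   unsigned col_data_len)
    {
        return key_part_len < col_data_len ? key_part_len : 0;
    }

    int main()
    {
        std::printf("%u\n", prefix_len_for(10, 255));   // INDEX(c(10)) -> 10
        std::printf("%u\n", prefix_len_for(255, 255));  // whole column -> 0
    }
)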
*/ + ulint flags = table->flags; + + error = convert_error_code_to_mysql( + row_create_index_for_mysql(index, trx, field_lengths, + fil_encryption_t(o.encryption), + uint32_t(o.encryption_key_id)), + flags, NULL); + + my_free(field_lengths); + + DBUG_RETURN(error); +} + +/** Return a display name for the row format +@param[in] row_format Row Format +@return row format name */ +static +const char* +get_row_format_name( + enum row_type row_format) +{ + switch (row_format) { + case ROW_TYPE_COMPACT: + return("COMPACT"); + case ROW_TYPE_COMPRESSED: + return("COMPRESSED"); + case ROW_TYPE_DYNAMIC: + return("DYNAMIC"); + case ROW_TYPE_REDUNDANT: + return("REDUNDANT"); + case ROW_TYPE_DEFAULT: + return("DEFAULT"); + case ROW_TYPE_FIXED: + return("FIXED"); + case ROW_TYPE_PAGE: + case ROW_TYPE_NOT_USED: + break; + } + return("NOT USED"); +} + +/** Validate DATA DIRECTORY option. +@return true if valid, false if not. */ +bool +create_table_info_t::create_option_data_directory_is_valid() +{ + bool is_valid = true; + + ut_ad(m_create_info->data_file_name + && m_create_info->data_file_name[0] != '\0'); + + /* Use DATA DIRECTORY only with file-per-table. */ + if (!m_allow_file_per_table) { + push_warning( + m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: DATA DIRECTORY requires" + " innodb_file_per_table."); + is_valid = false; + } + + /* Do not use DATA DIRECTORY with TEMPORARY TABLE. */ + if (m_create_info->tmp_table()) { + push_warning( + m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: DATA DIRECTORY cannot be used" + " for TEMPORARY tables."); + is_valid = false; + } + + /* We check for a DATA DIRECTORY mixed with TABLESPACE in + create_option_tablespace_is_valid(), no need to here. */ + + return(is_valid); +} + +/** Validate the create options. Check that the options KEY_BLOCK_SIZE, +ROW_FORMAT, DATA DIRECTORY, TEMPORARY are compatible with +each other and other settings. These CREATE OPTIONS are not validated +here unless innodb_strict_mode is on. With strict mode, this function +will report each problem it finds using a custom message with error +code ER_ILLEGAL_HA_CREATE_OPTION, not its built-in message. +@return NULL if valid, string name of bad option if not. */ +const char* +create_table_info_t::create_options_are_invalid() +{ + bool has_key_block_size = (m_create_info->key_block_size != 0); + + const char* ret = NULL; + enum row_type row_format = m_create_info->row_type; + const bool is_temp = m_create_info->tmp_table(); + + ut_ad(m_thd != NULL); + + /* If innodb_strict_mode is not set don't do any more validation. */ + if (!THDVAR(m_thd, strict_mode)) { + return(NULL); + } + + /* Check if a non-zero KEY_BLOCK_SIZE was specified. */ + if (has_key_block_size) { + if (is_temp || innodb_read_only_compressed) { + my_error(ER_UNSUPPORTED_COMPRESSED_TABLE, MYF(0)); + return("KEY_BLOCK_SIZE"); + } + + switch (m_create_info->key_block_size) { + ulint kbs_max; + case 1: + case 2: + case 4: + case 8: + case 16: + /* The maximum KEY_BLOCK_SIZE (KBS) is + UNIV_PAGE_SIZE_MAX. But if srv_page_size is + smaller than UNIV_PAGE_SIZE_MAX, the maximum + KBS is also smaller. 
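(For illustration, the cap computed below as a standalone sketch, with
stand-in shift values; a shift counts log2 steps from 512 bytes:

    #include <algorithm>
    #include <cstdio>

    int main()
    {
        const unsigned PAGE_SSIZE_MAX = 6;  // stand-in: 16K pages
        const unsigned ZIP_SSIZE_MAX = 5;   // stand-in: largest zip page
        unsigned kbs_max = std::min(1U << (PAGE_SSIZE_MAX - 1),
                                    1U << (ZIP_SSIZE_MAX - 1));
        std::printf("max KEY_BLOCK_SIZE = %u\n", kbs_max);  // prints: 16
    }
)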
*/ + kbs_max = ut_min( + 1U << (UNIV_PAGE_SSIZE_MAX - 1), + 1U << (PAGE_ZIP_SSIZE_MAX - 1)); + if (m_create_info->key_block_size > kbs_max) { + push_warning_printf( + m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: KEY_BLOCK_SIZE=%ld" + " cannot be larger than %ld.", + m_create_info->key_block_size, + kbs_max); + ret = "KEY_BLOCK_SIZE"; + } + + /* Valid KEY_BLOCK_SIZE, check its dependencies. */ + if (!m_allow_file_per_table) { + push_warning( + m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: KEY_BLOCK_SIZE requires" + " innodb_file_per_table."); + ret = "KEY_BLOCK_SIZE"; + } + break; + default: + push_warning_printf( + m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: invalid KEY_BLOCK_SIZE = %u." + " Valid values are [1, 2, 4, 8, 16]", + (uint) m_create_info->key_block_size); + ret = "KEY_BLOCK_SIZE"; + break; + } + } + + /* Check for a valid InnoDB ROW_FORMAT specifier and + other incompatibilities. */ + switch (row_format) { + case ROW_TYPE_COMPRESSED: + if (is_temp || innodb_read_only_compressed) { + my_error(ER_UNSUPPORTED_COMPRESSED_TABLE, MYF(0)); + return("ROW_FORMAT"); + } + if (!m_allow_file_per_table) { + push_warning_printf( + m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: ROW_FORMAT=%s requires" + " innodb_file_per_table.", + get_row_format_name(row_format)); + ret = "ROW_FORMAT"; + } + break; + case ROW_TYPE_DYNAMIC: + case ROW_TYPE_COMPACT: + case ROW_TYPE_REDUNDANT: + if (has_key_block_size) { + push_warning_printf( + m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: cannot specify ROW_FORMAT = %s" + " with KEY_BLOCK_SIZE.", + get_row_format_name(row_format)); + ret = "KEY_BLOCK_SIZE"; + } + break; + case ROW_TYPE_DEFAULT: + break; + case ROW_TYPE_FIXED: + case ROW_TYPE_PAGE: + case ROW_TYPE_NOT_USED: + push_warning( + m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: invalid ROW_FORMAT specifier."); + ret = "ROW_TYPE"; + break; + } + + if (!m_create_info->data_file_name + || !m_create_info->data_file_name[0]) { + } else if (!my_use_symdir) { + my_error(WARN_OPTION_IGNORED, MYF(ME_WARNING), + "DATA DIRECTORY"); + } else if (!create_option_data_directory_is_valid()) { + ret = "DATA DIRECTORY"; + } + + /* Do not allow INDEX_DIRECTORY */ + if (m_create_info->index_file_name) { + push_warning_printf( + m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: INDEX DIRECTORY is not supported"); + ret = "INDEX DIRECTORY"; + } + + /* Don't support compressed table when page size > 16k. */ + if ((has_key_block_size || row_format == ROW_TYPE_COMPRESSED) + && srv_page_size > UNIV_PAGE_SIZE_DEF) { + push_warning(m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: Cannot create a COMPRESSED table" + " when innodb_page_size > 16k."); + + if (has_key_block_size) { + ret = "KEY_BLOCK_SIZE"; + } else { + ret = "ROW_TYPE"; + } + } + + return(ret); +} + +/*****************************************************************//** +Check engine specific table options not handled by SQL-parser. 
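Both validators here share one contract: return NULL when the options
are consistent, else the name of the first offending option. For
illustration, a minimal standalone sketch of that shape, with a
hypothetical option struct and two of the rules enforced below: */

#include <cstdio>

struct table_opts {
    bool     page_compressed;
    unsigned page_compression_level;  // 0 = unset
    unsigned key_block_size;          // 0 = unset
};

static const char* check_opts(const table_opts& o)
{
    if (o.page_compression_level && !o.page_compressed)
        return "PAGE_COMPRESSION_LEVEL";  // requires PAGE_COMPRESSED
    if (o.page_compressed && o.key_block_size)
        return "PAGE_COMPRESSED";         // incompatible with KEY_BLOCK_SIZE
    return nullptr;
}

int main()
{
    const char* bad = check_opts({false, 6, 0});
    std::printf("%s\n", bad ? bad : "ok");  // prints: PAGE_COMPRESSION_LEVEL
}

/*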
+@return NULL if valid, string if not */ +const char* +create_table_info_t::check_table_options() +{ + enum row_type row_format = m_create_info->row_type; + const ha_table_option_struct *options= m_form->s->option_struct; + + switch (options->encryption) { + case FIL_ENCRYPTION_OFF: + if (options->encryption_key_id != FIL_DEFAULT_ENCRYPTION_KEY) { + push_warning( + m_thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: ENCRYPTED=NO implies" + " ENCRYPTION_KEY_ID=1"); + compile_time_assert(FIL_DEFAULT_ENCRYPTION_KEY == 1); + } + if (srv_encrypt_tables != 2) { + break; + } + push_warning( + m_thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: ENCRYPTED=NO cannot be used with" + " innodb_encrypt_tables=FORCE"); + return "ENCRYPTED"; + case FIL_ENCRYPTION_DEFAULT: + if (!srv_encrypt_tables) { + break; + } + /* fall through */ + case FIL_ENCRYPTION_ON: + const uint32_t key_id = uint32_t(options->encryption_key_id); + if (!encryption_key_id_exists(key_id)) { + push_warning_printf( + m_thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: ENCRYPTION_KEY_ID %u not available", + key_id); + return "ENCRYPTION_KEY_ID"; + } + + /* We do not support encryption for spatial indexes, + except if innodb_checksum_algorithm=full_crc32. + Do not allow ENCRYPTED=YES if any SPATIAL INDEX exists. */ + if (options->encryption != FIL_ENCRYPTION_ON + || srv_checksum_algorithm + >= SRV_CHECKSUM_ALGORITHM_FULL_CRC32) { + break; + } + for (ulint i = 0; i < m_form->s->keys; i++) { + if (m_form->key_info[i].flags & HA_SPATIAL) { + push_warning(m_thd, + Sql_condition::WARN_LEVEL_WARN, + HA_ERR_UNSUPPORTED, + "InnoDB: ENCRYPTED=YES is not" + " supported for SPATIAL INDEX"); + return "ENCRYPTED"; + } + } + } + + if (!m_allow_file_per_table + && options->encryption != FIL_ENCRYPTION_DEFAULT) { + push_warning( + m_thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: ENCRYPTED requires innodb_file_per_table"); + return "ENCRYPTED"; + } + + /* Check page compression requirements */ + if (options->page_compressed) { + + if (row_format == ROW_TYPE_COMPRESSED) { + push_warning( + m_thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSED table can't have" + " ROW_TYPE=COMPRESSED"); + return "PAGE_COMPRESSED"; + } + + switch (row_format) { + default: + break; + case ROW_TYPE_DEFAULT: + if (m_default_row_format + != DEFAULT_ROW_FORMAT_REDUNDANT) { + break; + } + /* fall through */ + case ROW_TYPE_REDUNDANT: + push_warning( + m_thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSED table can't have" + " ROW_TYPE=REDUNDANT"); + return "PAGE_COMPRESSED"; + } + + if (!m_allow_file_per_table) { + push_warning( + m_thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSED requires" + " innodb_file_per_table."); + return "PAGE_COMPRESSED"; + } + + if (m_create_info->key_block_size) { + push_warning( + m_thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSED table can't have" + " key_block_size"); + return "PAGE_COMPRESSED"; + } + } + + /* Check page compression level requirements, some of them are + already checked above */ + if (options->page_compression_level != 0) { + if (options->page_compressed == false) { + push_warning( + m_thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSION_LEVEL requires" + " PAGE_COMPRESSED"); + return "PAGE_COMPRESSION_LEVEL"; + } + + if 
(options->page_compression_level < 1 || options->page_compression_level > 9) { + push_warning_printf( + m_thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: invalid PAGE_COMPRESSION_LEVEL = %lu." + " Valid values are [1, 2, 3, 4, 5, 6, 7, 8, 9]", + options->page_compression_level); + return "PAGE_COMPRESSION_LEVEL"; + } + } + + return NULL; +} + +/*****************************************************************//** +Update create_info. Used in SHOW CREATE TABLE et al. */ + +void +ha_innobase::update_create_info( +/*============================*/ + HA_CREATE_INFO* create_info) /*!< in/out: create info */ +{ + if (!(create_info->used_fields & HA_CREATE_USED_AUTO)) { + info(HA_STATUS_AUTO); + create_info->auto_increment_value = stats.auto_increment_value; + } + + if (m_prebuilt->table->is_temporary()) { + return; + } + + dict_get_and_save_data_dir_path(m_prebuilt->table); + + if (m_prebuilt->table->data_dir_path) { + create_info->data_file_name = m_prebuilt->table->data_dir_path; + } +} + +/*****************************************************************//** +Initialize the table FTS stopword list +@return TRUE if success */ +ibool +innobase_fts_load_stopword( +/*=======================*/ + dict_table_t* table, /*!< in: Table has the FTS */ + trx_t* trx, /*!< in: transaction */ + THD* thd) /*!< in: current thread */ +{ + ut_ad(dict_sys.locked()); + + const char *stopword_table= THDVAR(thd, ft_user_stopword_table); + if (!stopword_table) + { + mysql_mutex_lock(&LOCK_global_system_variables); + if (innobase_server_stopword_table) + stopword_table= thd_strdup(thd, innobase_server_stopword_table); + mysql_mutex_unlock(&LOCK_global_system_variables); + } + + table->fts->dict_locked= true; + bool success= fts_load_stopword(table, trx, stopword_table, + THDVAR(thd, ft_enable_stopword), false); + table->fts->dict_locked= false; + return success; +} + +/** Parse the table name into normal name and remote path if needed. +@param[in] name Table name (db/table or full path). +@return 0 if successful, otherwise, error number */ +int +create_table_info_t::parse_table_name( + const char* +#ifdef _WIN32 + name +#endif + ) +{ + DBUG_ENTER("parse_table_name"); + +#ifdef _WIN32 + /* Names passed in from server are in two formats: + 1. /: for normal table creation + 2. full path: for temp table creation, or DATA DIRECTORY. + + When srv_file_per_table is on and mysqld_embedded is off, + check for full path pattern, i.e. + X:\dir\..., X is a driver letter, or + \\dir1\dir2\..., UNC path + returns error if it is in full path format, but not creating a temp. + table. Currently InnoDB does not support symbolic link on Windows. */ + + if (m_innodb_file_per_table + && !mysqld_embedded + && !m_create_info->tmp_table()) { + + if ((name[1] == ':') + || (name[0] == '\\' && name[1] == '\\')) { + sql_print_error("Cannot create table %s\n", name); + DBUG_RETURN(HA_ERR_GENERIC); + } + } +#endif + + m_remote_path[0] = '\0'; + + /* Make sure DATA DIRECTORY is compatible with other options + and set the remote path. In the case of either; + CREATE TEMPORARY TABLE ... DATA DIRECTORY={path} ... ; + CREATE TABLE ... DATA DIRECTORY={path} TABLESPACE={name}... ; + we ignore the DATA DIRECTORY. 
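(For illustration, the Windows full-path test described above as a
standalone sketch with a hypothetical helper:

    #include <cstdio>

    // "X:\..." (drive letter) or "\\host\share\..." (UNC path)
    static bool is_full_path(const char* name)
    {
        return (name[0] != '\0' && name[1] == ':')
            || (name[0] == '\\' && name[1] == '\\');
    }

    int main()
    {
        std::printf("%d %d %d\n",
                    is_full_path("C:\\tmp\\t1"),     // 1
                    is_full_path("\\\\srv\\share"),  // 1
                    is_full_path("test/t1"));        // 0
    }
)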
*/ + if (m_create_info->data_file_name + && m_create_info->data_file_name[0] + && my_use_symdir) { + if (!create_option_data_directory_is_valid()) { + push_warning_printf( + m_thd, Sql_condition::WARN_LEVEL_WARN, + WARN_OPTION_IGNORED, + ER_DEFAULT(WARN_OPTION_IGNORED), + "DATA DIRECTORY"); + + m_flags &= ~DICT_TF_MASK_DATA_DIR; + } else { + strncpy(m_remote_path, + m_create_info->data_file_name, + FN_REFLEN - 1); + } + } + + if (m_create_info->index_file_name) { + my_error(WARN_OPTION_IGNORED, ME_WARNING, + "INDEX DIRECTORY"); + } + + DBUG_RETURN(0); +} + +/** @return whether innodb_strict_mode is active */ +bool ha_innobase::is_innodb_strict_mode(THD *thd) +{ + return THDVAR(thd, strict_mode); +} + +/** Determine InnoDB table flags. +If strict_mode=OFF, this will adjust the flags to what should be assumed. +@retval true on success +@retval false on error */ +bool create_table_info_t::innobase_table_flags() +{ + DBUG_ENTER("innobase_table_flags"); + + const char* fts_doc_id_index_bad = NULL; + ulint zip_ssize = 0; + enum row_type row_type; + rec_format_t innodb_row_format = + get_row_format(m_default_row_format); + const bool is_temp = m_create_info->tmp_table(); + bool zip_allowed = !is_temp; + + const ulint zip_ssize_max = + ut_min(static_cast<ulint>(UNIV_PAGE_SSIZE_MAX), + static_cast<ulint>(PAGE_ZIP_SSIZE_MAX)); + + ha_table_option_struct *options= m_form->s->option_struct; + + m_flags = 0; + m_flags2 = 0; + + /* Check if there are any FTS indexes defined on this table. */ + const uint fts_n_uniq= m_form->versioned() ? 2 : 1; + for (uint i = 0; i < m_form->s->keys; i++) { + const KEY* key = &m_form->key_info[i]; + + if (key->flags & HA_FULLTEXT) { + m_flags2 |= DICT_TF2_FTS; + + /* We don't support FTS indexes in temporary + tables. */ + if (is_temp) { + my_error(ER_INNODB_NO_FT_TEMP_TABLE, MYF(0)); + DBUG_RETURN(false); + } + + if (fts_doc_id_index_bad) { + goto index_bad; + } + } + + if (innobase_strcasecmp(key->name.str, FTS_DOC_ID_INDEX_NAME)) { + continue; + } + + /* Do a pre-check on FTS DOC ID index */ + if (!(key->flags & HA_NOSAME) + || key->user_defined_key_parts != fts_n_uniq + || (key->key_part[0].key_part_flag & HA_REVERSE_SORT) + || strcmp(key->name.str, FTS_DOC_ID_INDEX_NAME) + || strcmp(key->key_part[0].field->field_name.str, + FTS_DOC_ID_COL_NAME)) { + fts_doc_id_index_bad = key->name.str; + } + + if (fts_doc_id_index_bad && (m_flags2 & DICT_TF2_FTS)) { +index_bad: + my_error(ER_INNODB_FT_WRONG_DOCID_INDEX, MYF(0), + fts_doc_id_index_bad); + DBUG_RETURN(false); + } + } + + if (m_create_info->key_block_size > 0) { + /* The requested compressed page size (key_block_size) + is given in kilobytes. If it is a valid number, store + that value as the number of log2 shifts from 512 in + zip_ssize. Zero means it is not compressed. */ + ulint zssize; /* Zip Shift Size */ + ulint kbsize; /* Key Block Size */ + for (zssize = kbsize = 1; + zssize <= zip_ssize_max; + zssize++, kbsize <<= 1) { + if (kbsize == m_create_info->key_block_size) { + zip_ssize = zssize; + break; + } + } + + /* Make sure compressed row format is allowed.
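(For illustration, the loop above maps KEY_BLOCK_SIZE in KiB to a shift
size: 1 -> 1, 2 -> 2, 4 -> 3, 8 -> 4, 16 -> 5, anything else -> 0,
meaning not compressed. A standalone sketch:

    #include <cstdio>

    static unsigned zip_ssize_of(unsigned kbs, unsigned ssize_max)
    {
        unsigned kbsize = 1;
        for (unsigned zssize = 1; zssize <= ssize_max;
             zssize++, kbsize <<= 1)
            if (kbsize == kbs) return zssize;
        return 0;
    }

    int main()
    {
        std::printf("%u %u %u\n", zip_ssize_of(4, 5),
                    zip_ssize_of(16, 5), zip_ssize_of(3, 5));  // 3 5 0
    }
)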
*/ + if (is_temp) { + push_warning( + m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: KEY_BLOCK_SIZE is ignored" + " for TEMPORARY TABLE."); + zip_allowed = false; + } else if (!m_allow_file_per_table) { + push_warning( + m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: KEY_BLOCK_SIZE requires" + " innodb_file_per_table."); + zip_allowed = false; + } + + if (!zip_allowed + || zssize > zip_ssize_max) { + push_warning_printf( + m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: ignoring KEY_BLOCK_SIZE=%u.", + (uint) m_create_info->key_block_size); + } + } + + row_type = m_create_info->row_type; + + if (zip_ssize && zip_allowed) { + /* if ROW_FORMAT is set to default, + automatically change it to COMPRESSED. */ + if (row_type == ROW_TYPE_DEFAULT) { + row_type = ROW_TYPE_COMPRESSED; + } else if (row_type != ROW_TYPE_COMPRESSED) { + /* ROW_FORMAT other than COMPRESSED + ignores KEY_BLOCK_SIZE. It does not + make sense to reject conflicting + KEY_BLOCK_SIZE and ROW_FORMAT, because + such combinations can be obtained + with ALTER TABLE anyway. */ + push_warning_printf( + m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: ignoring KEY_BLOCK_SIZE=%u" + " unless ROW_FORMAT=COMPRESSED.", + (uint) m_create_info->key_block_size); + zip_allowed = false; + } + } else { + /* zip_ssize == 0 means no KEY_BLOCK_SIZE. */ + if (row_type == ROW_TYPE_COMPRESSED && zip_allowed) { + /* ROW_FORMAT=COMPRESSED without KEY_BLOCK_SIZE + implies half the maximum KEY_BLOCK_SIZE(*1k) or + srv_page_size, whichever is less. */ + zip_ssize = zip_ssize_max - 1; + } + } + + /* Validate the row format. Correct it if necessary */ + + switch (row_type) { + case ROW_TYPE_REDUNDANT: + innodb_row_format = REC_FORMAT_REDUNDANT; + break; + case ROW_TYPE_COMPACT: + innodb_row_format = REC_FORMAT_COMPACT; + break; + case ROW_TYPE_COMPRESSED: + if (is_temp) { + push_warning_printf( + m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: ROW_FORMAT=%s is ignored for" + " TEMPORARY TABLE.", + get_row_format_name(row_type)); + } else if (!m_allow_file_per_table) { + push_warning_printf( + m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: ROW_FORMAT=COMPRESSED requires" + " innodb_file_per_table."); + } else { + innodb_row_format = REC_FORMAT_COMPRESSED; + break; + } + zip_allowed = false; + /* Set ROW_FORMAT = COMPACT */ + /* fall through */ + case ROW_TYPE_NOT_USED: + case ROW_TYPE_FIXED: + case ROW_TYPE_PAGE: + push_warning( + m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: assuming ROW_FORMAT=DYNAMIC."); + /* fall through */ + case ROW_TYPE_DYNAMIC: + innodb_row_format = REC_FORMAT_DYNAMIC; + break; + case ROW_TYPE_DEFAULT: + ; + } + + /* Don't support compressed table when page size > 16k. */ + if (zip_allowed && zip_ssize && srv_page_size > UNIV_PAGE_SIZE_DEF) { + push_warning(m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: Cannot create a COMPRESSED table" + " when innodb_page_size > 16k." 
+ " Assuming ROW_FORMAT=DYNAMIC."); + zip_allowed = false; + } + + ut_ad(!is_temp || !zip_allowed); + ut_ad(!is_temp || innodb_row_format != REC_FORMAT_COMPRESSED); + + /* Set the table flags */ + if (!zip_allowed) { + zip_ssize = 0; + } + + ulint level = 0; + + if (is_temp) { + m_flags2 |= DICT_TF2_TEMPORARY; + } else { + if (m_use_file_per_table) { + m_flags2 |= DICT_TF2_USE_FILE_PER_TABLE; + } + + level = ulint(options->page_compression_level); + if (!level) { + level = page_zip_level; + if (!level && options->page_compressed) { + push_warning_printf( + m_thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSED requires" + " PAGE_COMPRESSION_LEVEL or" + " innodb_compression_level > 0"); + DBUG_RETURN(false); + } + } + } + + /* Set the table flags */ + dict_tf_set(&m_flags, innodb_row_format, zip_ssize, + m_use_data_dir, level && options->page_compressed, level); + + if (m_form->s->table_type == TABLE_TYPE_SEQUENCE) { + m_flags |= DICT_TF_MASK_NO_ROLLBACK; + } + + /* Set the flags2 when create table or alter tables */ + m_flags2 |= DICT_TF2_FTS_AUX_HEX_NAME; + DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name", + m_flags2 &= ~DICT_TF2_FTS_AUX_HEX_NAME;); + + DBUG_RETURN(true); +} + +/** Parse MERGE_THRESHOLD value from the string. +@param[in] thd connection +@param[in] str string which might include 'MERGE_THRESHOLD=' +@return value parsed. 0 means not found or invalid value. */ +static +unsigned +innobase_parse_merge_threshold( + THD* thd, + const char* str) +{ + static const char* label = "MERGE_THRESHOLD="; + static const size_t label_len = strlen(label); + const char* pos = str; + + pos = strstr(str, label); + + if (pos == NULL) { + return(0); + } + + pos += label_len; + + lint ret = atoi(pos); + + if (ret > 0 && ret <= 50) { + return(static_cast(ret)); + } + + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_ILLEGAL_HA_CREATE_OPTION, + "InnoDB: Invalid value for MERGE_THRESHOLD in the CREATE TABLE" + " statement. The value is ignored."); + + return(0); +} + +/** Parse hint for table and its indexes, and update the information +in dictionary. 
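For illustration, a standalone sketch of the comment parsing this
function relies on, following the same rule as
innobase_parse_merge_threshold() above (find the label, read an
integer, accept only 1..50): */

#include <cstdio>
#include <cstdlib>
#include <cstring>

static unsigned parse_merge_threshold(const char* comment)
{
    static const char label[] = "MERGE_THRESHOLD=";
    const char* pos = std::strstr(comment, label);
    if (!pos) return 0;  // label not present
    long v = std::atol(pos + sizeof(label) - 1);
    return (v > 0 && v <= 50) ? (unsigned) v : 0;
}

int main()
{
    std::printf("%u\n", parse_merge_threshold("MERGE_THRESHOLD=40"));  // 40
    std::printf("%u\n", parse_merge_threshold("MERGE_THRESHOLD=90"));  // 0
}

/*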
+@param[in] thd connection +@param[in,out] table target table +@param[in] table_share table definition */ +void +innobase_parse_hint_from_comment( + THD* thd, + dict_table_t* table, + const TABLE_SHARE* table_share) +{ + unsigned merge_threshold_table; + unsigned merge_threshold_index[MAX_KEY]; + bool is_found[MAX_KEY]; + + if (table_share->comment.str != NULL) { + merge_threshold_table + = innobase_parse_merge_threshold( + thd, table_share->comment.str); + } else { + merge_threshold_table = DICT_INDEX_MERGE_THRESHOLD_DEFAULT; + } + + if (merge_threshold_table == 0) { + merge_threshold_table = DICT_INDEX_MERGE_THRESHOLD_DEFAULT; + } + + for (uint i = 0; i < table_share->keys; i++) { + KEY* key_info = &table_share->key_info[i]; + + ut_ad(i < sizeof(merge_threshold_index) + / sizeof(merge_threshold_index[0])); + + if (key_info->flags & HA_USES_COMMENT + && key_info->comment.str != NULL) { + merge_threshold_index[i] + = innobase_parse_merge_threshold( + thd, key_info->comment.str); + } else { + merge_threshold_index[i] = merge_threshold_table; + } + + if (merge_threshold_index[i] == 0) { + merge_threshold_index[i] = merge_threshold_table; + } + } + + /* update SYS_INDEX table */ + if (!table->is_temporary()) { + for (uint i = 0; i < table_share->keys; i++) { + is_found[i] = false; + } + + for (dict_index_t* index = UT_LIST_GET_FIRST(table->indexes); + index != NULL; + index = UT_LIST_GET_NEXT(indexes, index)) { + + if (dict_index_is_auto_gen_clust(index)) { + + /* GEN_CLUST_INDEX should use + merge_threshold_table */ + dict_index_set_merge_threshold( + index, merge_threshold_table); + continue; + } + + for (uint i = 0; i < table_share->keys; i++) { + if (is_found[i]) { + continue; + } + + KEY* key_info = &table_share->key_info[i]; + + if (innobase_strcasecmp( + index->name, key_info->name.str) == 0) { + + dict_index_set_merge_threshold( + index, + merge_threshold_index[i]); + is_found[i] = true; + break; + } + } + } + } + + for (uint i = 0; i < table_share->keys; i++) { + is_found[i] = false; + } + + /* update in memory */ + for (dict_index_t* index = UT_LIST_GET_FIRST(table->indexes); + index != NULL; + index = UT_LIST_GET_NEXT(indexes, index)) { + + if (dict_index_is_auto_gen_clust(index)) { + + /* GEN_CLUST_INDEX should use merge_threshold_table */ + + /* x-lock index is needed to exclude concurrent + pessimistic tree operations */ + index->lock.x_lock(SRW_LOCK_CALL); + index->merge_threshold = merge_threshold_table + & ((1U << 6) - 1); + index->lock.x_unlock(); + + continue; + } + + for (uint i = 0; i < table_share->keys; i++) { + if (is_found[i]) { + continue; + } + + KEY* key_info = &table_share->key_info[i]; + + if (innobase_strcasecmp( + index->name, key_info->name.str) == 0) { + + /* x-lock index is needed to exclude concurrent + pessimistic tree operations */ + index->lock.x_lock(SRW_LOCK_CALL); + index->merge_threshold + = merge_threshold_index[i] + & ((1U << 6) - 1); + index->lock.x_unlock(); + is_found[i] = true; + + break; + } + } + } +} + +/** Set m_use_* flags. */ +void +create_table_info_t::set_tablespace_type( + bool table_being_altered_is_file_per_table) +{ + /** Allow file_per_table for this table either because: + 1) the setting innodb_file_per_table=on, + 2) the table being altered is currently file_per_table */ + m_allow_file_per_table = + m_innodb_file_per_table + || table_being_altered_is_file_per_table; + + /* Ignore the current innodb-file-per-table setting if we are + creating a temporary table. 
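(For illustration, the flag derivation below as a pure function; a
standalone sketch that omits the my_use_symdir condition also required
for the data-directory flag:

    #include <cstdio>

    struct ts_flags { bool allow_fpt, use_fpt, use_data_dir; };

    static ts_flags derive(bool file_per_table, bool altering_fpt,
                           bool tmp_table, bool has_data_dir)
    {
        ts_flags f;
        f.allow_fpt = file_per_table || altering_fpt;
        f.use_fpt = f.allow_fpt && !tmp_table;
        f.use_data_dir = f.use_fpt && has_data_dir;
        return f;
    }

    int main()
    {
        ts_flags f = derive(true, false, true, true);  // TEMPORARY table
        std::printf("%d %d %d\n", f.allow_fpt, f.use_fpt, f.use_data_dir);
    }
)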
*/ + m_use_file_per_table = m_allow_file_per_table + && !m_create_info->tmp_table(); + + /* DATA DIRECTORY must have m_use_file_per_table but cannot be + used with TEMPORARY tables. */ + m_use_data_dir = + m_use_file_per_table + && m_create_info->data_file_name + && m_create_info->data_file_name[0] + && my_use_symdir; +} + +/** Initialize the create_table_info_t object. +@return error number */ +int +create_table_info_t::initialize() +{ + DBUG_ENTER("create_table_info_t::initialize"); + + ut_ad(m_thd != NULL); + ut_ad(m_create_info != NULL); + + if (m_form->s->fields > REC_MAX_N_USER_FIELDS) { + DBUG_RETURN(HA_ERR_TOO_MANY_FIELDS); + } + + /* Check for name conflicts (with reserved name) for + any user indices to be created. */ + if (innobase_index_name_is_reserved(m_thd, m_form->key_info, + m_form->s->keys)) { + DBUG_RETURN(HA_ERR_WRONG_INDEX); + } + + /* Get the transaction associated with the current thd, or create one + if not yet created */ + + check_trx_exists(m_thd); + + DBUG_RETURN(0); +} + + +/** Check if a virtual column is part of a fulltext or spatial index. */ +bool +create_table_info_t::gcols_in_fulltext_or_spatial() +{ + for (ulint i = 0; i < m_form->s->keys; i++) { + const KEY* key = m_form->key_info + i; + if (!(key->flags & (HA_SPATIAL | HA_FULLTEXT))) { + continue; + } + for (ulint j = 0; j < key->user_defined_key_parts; j++) { + /* We do not support special (Fulltext or + Spatial) index on virtual columns */ + if (!key->key_part[j].field->stored_in_db()) { + my_error(ER_UNSUPPORTED_ACTION_ON_GENERATED_COLUMN, MYF(0)); + return true; + } + } + } + return false; +} + + +/** Prepare to create a new table in an InnoDB database. +@param[in] name Table name +@param[in] strict whether to validate the create options in strict mode +@return error number */ +int create_table_info_t::prepare_create_table(const char* name, bool strict) +{ + DBUG_ENTER("prepare_create_table"); + + ut_ad(m_thd != NULL); + ut_ad(m_create_info != NULL); + + set_tablespace_type(false); + + normalize_table_name(m_table_name, name); + + /* Validate table options not handled by the SQL-parser */ + if (check_table_options()) { + DBUG_RETURN(HA_WRONG_CREATE_OPTION); + } + + /* Validate the create options if innodb_strict_mode is set. + Do not use the regular message for ER_ILLEGAL_HA_CREATE_OPTION + because InnoDB might actually support the option, but not under + the current conditions. The messages revealing the specific + problems are reported inside this function. */ + if (strict && create_options_are_invalid()) { + DBUG_RETURN(HA_WRONG_CREATE_OPTION); + } + + /* Create the table flags and flags2 */ + if (!innobase_table_flags()) { + DBUG_RETURN(HA_WRONG_CREATE_OPTION); + } + + if (high_level_read_only) { + DBUG_RETURN(HA_ERR_TABLE_READONLY); + } + + if (gcols_in_fulltext_or_spatial()) { + DBUG_RETURN(HA_ERR_UNSUPPORTED); + } + + for (uint i = 0; i < m_form->s->keys; i++) { + const size_t max_field_len + = DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(m_flags); + const KEY& key = m_form->key_info[i]; + + if (key.algorithm == HA_KEY_ALG_FULLTEXT) { + continue; + } + + if (too_big_key_part_length(max_field_len, key)) { + DBUG_RETURN(convert_error_code_to_mysql( + DB_TOO_BIG_INDEX_COL, m_flags, NULL)); + } + } + + DBUG_RETURN(parse_table_name(name)); +} + +/** Push warning message to the SQL layer based on foreign key constraint index +match error.
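+For example, FK_INDEX_NOT_FOUND means that the referenced table has no index starting with the referenced columns in the given order.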
+@param[in] trx Current transaction +@param[in] operation Operation ("Create" or "Alter") +@param[in] create_name Table name as specified in SQL +@param[in] fk_text Foreign key name or column list text for the message +@param[in] columns Foreign key column names array +@param[in] index_error Index error code +@param[in] err_col Column where error happened +@param[in] err_index Index where error happened +@param[in] table Table object */ +static void +foreign_push_index_error(trx_t* trx, const char* operation, + const char* create_name, const char* fk_text, + const char** columns, fkerr_t index_error, + ulint err_col, dict_index_t* err_index, + dict_table_t* table) +{ + switch (index_error) { + case FK_SUCCESS: + break; + case FK_INDEX_NOT_FOUND: + ib_foreign_warn(trx, DB_CANNOT_ADD_CONSTRAINT, create_name, + "%s table %s with foreign key %s constraint" + " failed. There is no index in the referenced" + " table where the referenced columns appear" + " as the first columns.", + operation, create_name, fk_text); + return; + case FK_IS_PREFIX_INDEX: + ib_foreign_warn( + trx, DB_CANNOT_ADD_CONSTRAINT, create_name, + "%s table %s with foreign key %s constraint" + " failed. There is only a prefix index in the referenced" + " table where the referenced columns appear" + " as the first columns.", + operation, create_name, fk_text); + return; + case FK_COL_NOT_NULL: + ib_foreign_warn( + trx, DB_CANNOT_ADD_CONSTRAINT, create_name, + "%s table %s with foreign key %s constraint" + " failed. You have defined a SET NULL condition but " + "column '%s' on index is defined as NOT NULL.", + operation, create_name, fk_text, columns[err_col]); + return; + case FK_COLS_NOT_EQUAL: + dict_field_t* field; + const char* col_name; + field = dict_index_get_nth_field(err_index, err_col); + + col_name = field->col->is_virtual() + ? "(null)" + : dict_table_get_col_name( + table, dict_col_get_no(field->col)); + ib_foreign_warn( + trx, DB_CANNOT_ADD_CONSTRAINT, create_name, + "%s table %s with foreign key %s constraint" + " failed. Field type or character set for column '%s' " + "does not match referenced column '%s'.", + operation, create_name, fk_text, columns[err_col], + col_name); + return; + } + DBUG_ASSERT("unknown error" == 0); +} + +/** Find column or virtual column in table by its name. +@param[in] table Table where column is searched +@param[in,out] name Name to search for; on success, overwritten with the column name in the dictionary's letter case +@retval true if found +@retval false if not found */ +static bool +find_col(dict_table_t* table, const char** name) +{ + ulint i; + for (i = 0; i < dict_table_get_n_cols(table); i++) { + + const char* col_name = dict_table_get_col_name(table, i); + + if (0 == innobase_strcasecmp(col_name, *name)) { + /* Found */ + strcpy((char*)*name, col_name); + return true; + } + } + + for (i = 0; i < dict_table_get_n_v_cols(table); i++) { + + const char* col_name = dict_table_get_v_col_name(table, i); + + if (0 == innobase_strcasecmp(col_name, *name)) { + /* Found */ + strcpy((char*)*name, col_name); + return true; + } + } + return false; +} + +/** Foreign key printer for error messages. Prints the FK name if it exists or +the key part list in the form (col1, col2, col3, ...)
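+For example: a named constraint prints as `fk_name`; an unnamed one prints its columns as (col1, col2, ...); output longer than MAX_TEXT (48) characters is truncated with "...".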
*/ +class key_text +{ + static const size_t MAX_TEXT = 48; + char buf[MAX_TEXT + 1]; + +public: + key_text(Key* key) + { + char* ptr = buf; + if (key->name.str) { + size_t len = std::min(key->name.length, MAX_TEXT - 2); + *(ptr++) = '`'; + memcpy(ptr, key->name.str, len); + ptr += len; + *(ptr++) = '`'; + *ptr = '\0'; + return; + } + *(ptr++) = '('; + List_iterator_fast<Key_part_spec> it(key->columns); + while (Key_part_spec* k = it++) { + /* 3 is the continuation ellipsis ("..."); + 2 is the comma separator (", ") when a next column exists; + 1 is the terminating ')' */ + if (MAX_TEXT - (size_t)(ptr - buf) + >= (it.peek() ? 3 + 2 + 1 : 3 + 1) + + k->field_name.length) { + memcpy(ptr, k->field_name.str, + k->field_name.length); + ptr += k->field_name.length; + if (it.peek()) { + *(ptr++) = ','; + *(ptr++) = ' '; + } + } else { + ut_ad((size_t)(ptr - buf) <= MAX_TEXT - 4); + memcpy(ptr, "...", 3); + ptr += 3; + break; + } + } + *(ptr++) = ')'; + *ptr = '\0'; + } + const char* str() { return buf; } +}; + +/** Create InnoDB foreign keys from MySQL alter_info. Collect all +dict_foreign_t items into local_fk_set and then add into system table. +@return DB_SUCCESS or specific error code */ +dberr_t +create_table_info_t::create_foreign_keys() +{ + dict_foreign_set local_fk_set; + dict_foreign_set_free local_fk_set_free(local_fk_set); + dberr_t error; + ulint number = 1; + static const unsigned MAX_COLS_PER_FK = 500; + const char* column_names[MAX_COLS_PER_FK]; + const char* ref_column_names[MAX_COLS_PER_FK]; + char create_name[MAX_DATABASE_NAME_LEN + 1 + + MAX_TABLE_NAME_LEN + 1]; + dict_index_t* index = NULL; + fkerr_t index_error = FK_SUCCESS; + dict_index_t* err_index = NULL; + ulint err_col; + const bool tmp_table = m_flags2 & DICT_TF2_TEMPORARY; + const CHARSET_INFO* cs = thd_charset(m_thd); + const char* operation = "Create "; + const char* name = m_table_name; + + enum_sql_command sqlcom = enum_sql_command(thd_sql_command(m_thd)); + + if (sqlcom == SQLCOM_ALTER_TABLE) { + dict_table_t* table_to_alter; + mem_heap_t* heap = mem_heap_create(10000); + ulint highest_id_so_far; + char* n = dict_get_referenced_table( + name, LEX_STRING_WITH_LEN(m_form->s->db), + LEX_STRING_WITH_LEN(m_form->s->table_name), + &table_to_alter, heap, cs); + + /* Starting from 4.0.18 and 4.1.2, we generate foreign key id's + in the format databasename/tablename_ibfk_[number], where + [number] is local to the table; look for the highest [number] + for table_to_alter, so that we can assign to new constraints + higher numbers. */ + + /* If we are altering a temporary table, the table name after + ALTER TABLE does not correspond to the internal table name, and + table_to_alter is NULL. TODO: should we fix this somehow?
*/ + + if (table_to_alter) { + n = table_to_alter->name.m_name; + highest_id_so_far = dict_table_get_highest_foreign_id( + table_to_alter); + } else { + highest_id_so_far = 0; + } + + char* bufend = innobase_convert_name( + create_name, sizeof create_name, n, strlen(n), m_thd); + create_name[bufend - create_name] = '\0'; + number = highest_id_so_far + 1; + mem_heap_free(heap); + operation = "Alter "; + } else if (strstr(name, "#P#") || strstr(name, "#p#")) { + /* Partitioned table */ + create_name[0] = '\0'; + } else { + char* bufend = innobase_convert_name(create_name, + sizeof create_name, + name, + strlen(name), m_thd); + create_name[bufend - create_name] = '\0'; + } + + Alter_info* alter_info = m_create_info->alter_info; + ut_ad(alter_info); + List_iterator_fast<Key> key_it(alter_info->key_list); + + dict_table_t* table = dict_sys.find_table({name, strlen(name)}); + if (!table) { + ib_foreign_warn(m_trx, DB_CANNOT_ADD_CONSTRAINT, create_name, + "%s table %s foreign key constraint" + " failed. Table not found.", + operation, create_name); + + return (DB_CANNOT_ADD_CONSTRAINT); + } + + while (Key* key = key_it++) { + if (key->type != Key::FOREIGN_KEY || key->old) + continue; + + if (tmp_table) { + ib_foreign_warn(m_trx, DB_CANNOT_ADD_CONSTRAINT, + create_name, + "%s table `%s`.`%s` with foreign key " + "constraint failed. " + "Temporary tables can't have " + "foreign key constraints.", + operation, m_form->s->db.str, + m_form->s->table_name.str); + + return (DB_CANNOT_ADD_CONSTRAINT); + } else if (!*create_name) { + ut_ad("should be unreachable" == 0); + return DB_CANNOT_ADD_CONSTRAINT; + } + + Foreign_key* fk = static_cast<Foreign_key*>(key); + Key_part_spec* col; + bool success; + + dict_foreign_t* foreign = dict_mem_foreign_create(); + if (!foreign) { + return (DB_OUT_OF_MEMORY); + } + + List_iterator_fast<Key_part_spec> col_it(fk->columns); + unsigned i = 0, j = 0; + while ((col = col_it++)) { + column_names[i] = mem_heap_strdupl( + foreign->heap, col->field_name.str, + col->field_name.length); + success = find_col(table, column_names + i); + if (!success) { + key_text k(fk); + ib_foreign_warn( + m_trx, DB_CANNOT_ADD_CONSTRAINT, + create_name, + "%s table %s foreign key %s constraint" + " failed. Column %s was not found.", + operation, create_name, k.str(), + column_names[i]); + dict_foreign_free(foreign); + return (DB_CANNOT_ADD_CONSTRAINT); + } + ++i; + if (i >= MAX_COLS_PER_FK) { + key_text k(fk); + ib_foreign_warn( + m_trx, DB_CANNOT_ADD_CONSTRAINT, + create_name, + "%s table %s foreign key %s constraint" + " failed. Too many columns: %u (%u " + "allowed).", + operation, create_name, k.str(), i, + MAX_COLS_PER_FK); + dict_foreign_free(foreign); + return (DB_CANNOT_ADD_CONSTRAINT); + } + } + + index = dict_foreign_find_index( + table, NULL, column_names, i, NULL, TRUE, FALSE, + &index_error, &err_col, &err_index); + + if (!index) { + key_text k(fk); + foreign_push_index_error(m_trx, operation, create_name, + k.str(), column_names, + index_error, err_col, + err_index, table); + dict_foreign_free(foreign); + return (DB_CANNOT_ADD_CONSTRAINT); + } + + if (fk->constraint_name.str) { + ulint db_len; + + /* Catenate 'databasename/' to the constraint name + specified by the user: we conceive the constraint as + belonging to the same MySQL 'database' as the table + itself. We store the name to foreign->id.
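+For example, a constraint named fk1 on a table in database test is stored with the identifier "test/fk1".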
*/ + + db_len = dict_get_db_name_len(table->name.m_name); + + foreign->id = static_cast<char*>(mem_heap_alloc( + foreign->heap, + db_len + fk->constraint_name.length + 2)); + + memcpy(foreign->id, table->name.m_name, db_len); + foreign->id[db_len] = '/'; + strcpy(foreign->id + db_len + 1, + fk->constraint_name.str); + } + + if (foreign->id == NULL) { + error = dict_create_add_foreign_id( + &number, table->name.m_name, foreign); + if (error != DB_SUCCESS) { + dict_foreign_free(foreign); + return (error); + } + } + + std::pair<dict_foreign_set::iterator, bool> ret + = local_fk_set.insert(foreign); + + if (!ret.second) { + /* A duplicate foreign key name has been found */ + dict_foreign_free(foreign); + return (DB_CANNOT_ADD_CONSTRAINT); + } + + foreign->foreign_table = table; + foreign->foreign_table_name + = mem_heap_strdup(foreign->heap, table->name.m_name); + if (!foreign->foreign_table_name) { + return (DB_OUT_OF_MEMORY); + } + + dict_mem_foreign_table_name_lookup_set(foreign, TRUE); + + foreign->foreign_index = index; + foreign->n_fields = i & dict_index_t::MAX_N_FIELDS; + + foreign->foreign_col_names = static_cast<const char**>( + mem_heap_alloc(foreign->heap, i * sizeof(void*))); + if (!foreign->foreign_col_names) { + return (DB_OUT_OF_MEMORY); + } + + memcpy(foreign->foreign_col_names, column_names, + i * sizeof(void*)); + + foreign->referenced_table_name = dict_get_referenced_table( + name, LEX_STRING_WITH_LEN(fk->ref_db), + LEX_STRING_WITH_LEN(fk->ref_table), + &foreign->referenced_table, foreign->heap, cs); + + if (!foreign->referenced_table_name) { + return (DB_OUT_OF_MEMORY); + } + + if (!foreign->referenced_table && m_trx->check_foreigns) { + char buf[MAX_TABLE_NAME_LEN + 1] = ""; + char* bufend; + + bufend = innobase_convert_name( + buf, MAX_TABLE_NAME_LEN, + foreign->referenced_table_name, + strlen(foreign->referenced_table_name), m_thd); + buf[bufend - buf] = '\0'; + key_text k(fk); + ib_foreign_warn(m_trx, DB_CANNOT_ADD_CONSTRAINT, + create_name, + "%s table %s with foreign key %s " + "constraint failed. Referenced table " + "%s not found in the data dictionary.", + operation, create_name, k.str(), buf); + return (DB_CANNOT_ADD_CONSTRAINT); + } + + /* Don't allow foreign keys on partitioned tables yet. */ + if (foreign->referenced_table + && dict_table_is_partition(foreign->referenced_table)) { + /* How could a referenced table be a partition? */ + ut_ad(0); + my_error(ER_FEATURE_NOT_SUPPORTED_WITH_PARTITIONING, + MYF(0), "FOREIGN KEY"); + return (DB_CANNOT_ADD_CONSTRAINT); + } + + col_it.init(fk->ref_columns); + while ((col = col_it++)) { + ref_column_names[j] = mem_heap_strdupl( + foreign->heap, col->field_name.str, + col->field_name.length); + if (foreign->referenced_table) { + success = find_col(foreign->referenced_table, + ref_column_names + j); + if (!success) { + key_text k(fk); + ib_foreign_warn( + m_trx, + DB_CANNOT_ADD_CONSTRAINT, + create_name, + "%s table %s foreign key %s " + "constraint failed.
" + "Column %s was not found.", + operation, create_name, + k.str(), ref_column_names[j]); + + return (DB_CANNOT_ADD_CONSTRAINT); + } + } + ++j; + } + /* See ER_WRONG_FK_DEF in mysql_prepare_create_table() */ + ut_ad(i == j); + + /* Try to find an index which contains the columns as the first + fields and in the right order, and the types are the same as in + foreign->foreign_index */ + + if (foreign->referenced_table) { + index = dict_foreign_find_index( + foreign->referenced_table, NULL, + ref_column_names, i, foreign->foreign_index, + TRUE, FALSE, &index_error, &err_col, + &err_index); + + if (!index) { + key_text k(fk); + foreign_push_index_error( + m_trx, operation, create_name, k.str(), + column_names, index_error, err_col, + err_index, foreign->referenced_table); + + return (DB_CANNOT_ADD_CONSTRAINT); + } + } else { + ut_a(m_trx->check_foreigns == FALSE); + index = NULL; + } + + foreign->referenced_index = index; + dict_mem_referenced_table_name_lookup_set(foreign, TRUE); + + foreign->referenced_col_names = static_cast( + mem_heap_alloc(foreign->heap, i * sizeof(void*))); + if (!foreign->referenced_col_names) { + return (DB_OUT_OF_MEMORY); + } + + memcpy(foreign->referenced_col_names, ref_column_names, + i * sizeof(void*)); + + if (fk->delete_opt == FK_OPTION_SET_NULL + || fk->update_opt == FK_OPTION_SET_NULL) { + for (j = 0; j < foreign->n_fields; j++) { + if ((dict_index_get_nth_col( + foreign->foreign_index, j) + ->prtype) + & DATA_NOT_NULL) { + const dict_col_t* col + = dict_index_get_nth_col( + foreign->foreign_index, + j); + const char* col_name + = dict_table_get_col_name( + foreign->foreign_index + ->table, + dict_col_get_no(col)); + + /* It is not sensible to define SET + NULL + if the column is not allowed to be + NULL! */ + key_text k(fk); + ib_foreign_warn( + m_trx, + DB_CANNOT_ADD_CONSTRAINT, + create_name, + "%s table %s with foreign key " + "%s constraint failed. 
You have" + " defined a SET NULL condition " + "but column '%s' is defined as " + "NOT NULL.", + operation, create_name, + k.str(), col_name); + + return (DB_CANNOT_ADD_CONSTRAINT); + } + } + } + + switch (fk->delete_opt) { + case FK_OPTION_UNDEF: + case FK_OPTION_RESTRICT: + break; + case FK_OPTION_CASCADE: + foreign->type |= DICT_FOREIGN_ON_DELETE_CASCADE; + break; + case FK_OPTION_SET_NULL: + foreign->type |= DICT_FOREIGN_ON_DELETE_SET_NULL; + break; + case FK_OPTION_NO_ACTION: + foreign->type |= DICT_FOREIGN_ON_DELETE_NO_ACTION; + break; + case FK_OPTION_SET_DEFAULT: + // TODO: MDEV-10393 Foreign keys SET DEFAULT action + break; + default: + ut_ad(0); + break; + } + + switch (fk->update_opt) { + case FK_OPTION_UNDEF: + case FK_OPTION_RESTRICT: + break; + case FK_OPTION_CASCADE: + foreign->type |= DICT_FOREIGN_ON_UPDATE_CASCADE; + break; + case FK_OPTION_SET_NULL: + foreign->type |= DICT_FOREIGN_ON_UPDATE_SET_NULL; + break; + case FK_OPTION_NO_ACTION: + foreign->type |= DICT_FOREIGN_ON_UPDATE_NO_ACTION; + break; + case FK_OPTION_SET_DEFAULT: + // TODO: MDEV-10393 Foreign keys SET DEFAULT action + break; + default: + ut_ad(0); + break; + } + } + + if (dict_foreigns_has_s_base_col(local_fk_set, table)) { + return (DB_NO_FK_ON_S_BASE_COL); + } + + /**********************************************************/ + /* The following call adds the foreign key constraints + to the data dictionary system tables on disk */ + m_trx->op_info = "adding foreign keys"; + + trx_start_if_not_started_xa(m_trx, true); + + m_trx->dict_operation = true; + + error = dict_create_add_foreigns_to_dictionary(local_fk_set, table, + m_trx); + + if (error == DB_SUCCESS) { + + table->foreign_set.insert(local_fk_set.begin(), + local_fk_set.end()); + std::for_each(local_fk_set.begin(), local_fk_set.end(), + dict_foreign_add_to_referenced_table()); + local_fk_set.clear(); + + dict_mem_table_fill_foreign_vcol_set(table); + } + return (error); +} + +/** Create the internal innodb table. +@param create_fk whether to add FOREIGN KEY constraints */ +int create_table_info_t::create_table(bool create_fk) +{ + int error; + int primary_key_no; + uint i; + + DBUG_ENTER("create_table"); + + /* Look for a primary key */ + primary_key_no = (m_form->s->primary_key != MAX_KEY ? + (int) m_form->s->primary_key : -1); + + /* Our function innobase_get_mysql_key_number_for_index assumes + the primary key is always number 0, if it exists */ + ut_a(primary_key_no == -1 || primary_key_no == 0); + + error = create_table_def(); + + if (error) { + DBUG_RETURN(error); + } + + /* Create the keys */ + + if (m_form->s->keys == 0 || primary_key_no == -1) { + /* Create an index which is used as the clustered index; + order the rows by their row id which is internally generated + by InnoDB */ + ulint flags = m_table->flags; + dict_index_t* index = dict_mem_index_create( + m_table, innobase_index_reserve_name, + DICT_CLUSTERED, 0); + const ha_table_option_struct& o = *m_form->s->option_struct; + error = convert_error_code_to_mysql( + row_create_index_for_mysql( + index, m_trx, NULL, + fil_encryption_t(o.encryption), + uint32_t(o.encryption_key_id)), + flags, m_thd); + if (error) { + DBUG_RETURN(error); + } + } + + if (primary_key_no != -1) { + /* In InnoDB the clustered index must always be created + first */ + if ((error = create_index(m_trx, m_form, m_table, + (uint) primary_key_no))) { + DBUG_RETURN(error); + } + } + + /* Create the ancillary tables that are common to all FTS indexes on + this table. 
*/ + if (m_flags2 & DICT_TF2_FTS) { + fts_doc_id_index_enum ret; + + /* Check whether there already exists FTS_DOC_ID_INDEX */ + ret = innobase_fts_check_doc_id_index_in_def( + m_form->s->keys, m_form->key_info); + + switch (ret) { + case FTS_INCORRECT_DOC_ID_INDEX: + push_warning_printf(m_thd, + Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_NAME_FOR_INDEX, + " InnoDB: Index name %s is reserved" + " for the unique index on" + " FTS_DOC_ID column for FTS" + " Document ID indexing" + " on table %s. Please check" + " the index definition to" + " make sure it is of the correct" + " type\n", + FTS_DOC_ID_INDEX_NAME, + m_table->name.m_name); + + if (m_table->fts) { + m_table->fts->~fts_t(); + m_table->fts = nullptr; + } + + my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0), + FTS_DOC_ID_INDEX_NAME); + DBUG_RETURN(-1); + case FTS_EXIST_DOC_ID_INDEX: + case FTS_NOT_EXIST_DOC_ID_INDEX: + break; + } + + dberr_t err = fts_create_common_tables( + m_trx, m_table, + (ret == FTS_EXIST_DOC_ID_INDEX)); + + error = convert_error_code_to_mysql(err, 0, NULL); + + if (error) { + DBUG_RETURN(error); + } + } + + for (i = 0; i < m_form->s->keys; i++) { + if (i != uint(primary_key_no) + && (error = create_index(m_trx, m_form, m_table, i))) { + DBUG_RETURN(error); + } + } + + /* Cache all the FTS indexes on this table in the FTS specific + structure. They are used for FTS indexed column update handling. */ + if (m_flags2 & DICT_TF2_FTS) { + fts_t* fts = m_table->fts; + + ut_a(fts != NULL); + + dict_table_get_all_fts_indexes(m_table, fts->indexes); + } + + dberr_t err = create_fk ? create_foreign_keys() : DB_SUCCESS; + + if (err == DB_SUCCESS) { + const dict_err_ignore_t ignore_err = m_trx->check_foreigns + ? DICT_ERR_IGNORE_NONE : DICT_ERR_IGNORE_FK_NOKEY; + + /* Check that the referencing constraints are also OK */ + dict_names_t fk_tables; + err = dict_load_foreigns(m_table_name, nullptr, + m_trx->id, true, + ignore_err, fk_tables); + while (err == DB_SUCCESS && !fk_tables.empty()) { + dict_sys.load_table( + {fk_tables.front(), strlen(fk_tables.front())}, + ignore_err); + fk_tables.pop_front(); + } + } + + switch (err) { + case DB_PARENT_NO_INDEX: + push_warning_printf( + m_thd, Sql_condition::WARN_LEVEL_WARN, + HA_ERR_CANNOT_ADD_FOREIGN, + "Create table '%s' with foreign key constraint" + " failed. There is no index in the referenced" + " table where the referenced columns appear" + " as the first columns.\n", m_table_name); + break; + + case DB_CHILD_NO_INDEX: + push_warning_printf( + m_thd, Sql_condition::WARN_LEVEL_WARN, + HA_ERR_CANNOT_ADD_FOREIGN, + "Create table '%s' with foreign key constraint" + " failed. There is no index in the referencing" + " table where referencing columns appear" + " as the first columns.\n", m_table_name); + break; + case DB_NO_FK_ON_S_BASE_COL: + push_warning_printf( + m_thd, Sql_condition::WARN_LEVEL_WARN, + HA_ERR_CANNOT_ADD_FOREIGN, + "Create table '%s' with foreign key constraint" + " failed. Cannot add foreign key constraint" + " placed on the base column of a stored" + " column.\n", + m_table_name); + default: + break; + } + + if (err != DB_SUCCESS) { + DBUG_RETURN(convert_error_code_to_mysql( + err, m_flags, NULL)); + } + + /* In TRUNCATE TABLE, we will merely warn about the maximum + row size being too large.
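+(TRUNCATE recreates the table through this code with create_fk=false, so the strict variant of the check below is disabled and an oversized row definition only raises a warning.)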
*/ + if (!row_size_is_acceptable(*m_table, create_fk)) { + DBUG_RETURN(convert_error_code_to_mysql( + DB_TOO_BIG_RECORD, m_flags, NULL)); + } + + DBUG_RETURN(0); +} + +bool create_table_info_t::row_size_is_acceptable( + const dict_table_t &table, bool strict) const +{ + for (dict_index_t *index= dict_table_get_first_index(&table); index; + index= dict_table_get_next_index(index)) + if (!row_size_is_acceptable(*index, strict)) + return false; + return true; +} + +dict_index_t::record_size_info_t dict_index_t::record_size_info() const +{ + ut_ad(!(type & DICT_FTS)); + + /* maximum allowed size of a node pointer record */ + ulint page_ptr_max; + const bool comp= table->not_redundant(); + /* table->space == NULL after DISCARD TABLESPACE */ + const ulint zip_size= dict_tf_get_zip_size(table->flags); + record_size_info_t result; + + if (zip_size && zip_size < srv_page_size) + { + /* On a ROW_FORMAT=COMPRESSED page, two records must fit in the + uncompressed page modification log. On compressed pages + with size.physical() == univ_page_size.physical(), + this limit will never be reached. */ + ut_ad(comp); + /* The maximum allowed record size is the size of + an empty page, minus a byte for recording the heap + number in the page modification log. The maximum + allowed node pointer size is half that. */ + result.max_leaf_size= page_zip_empty_size(n_fields, zip_size); + if (result.max_leaf_size) + { + result.max_leaf_size--; + } + page_ptr_max= result.max_leaf_size / 2; + /* On a compressed page, there is a two-byte entry in + the dense page directory for every record. But there + is no record header. */ + result.shortest_size= 2; + } + else + { + /* The maximum allowed record size is half a B-tree + page (or 16KiB for the 64KiB page size). No additional sparse + page directory entry will be generated for the first + few user records. */ + result.max_leaf_size= (comp || srv_page_size < UNIV_PAGE_SIZE_MAX) + ? page_get_free_space_of_empty(comp) / 2 + : REDUNDANT_REC_MAX_DATA_SIZE; + + page_ptr_max= result.max_leaf_size; + /* Each record has a header. */ + result.shortest_size= comp ? REC_N_NEW_EXTRA_BYTES : REC_N_OLD_EXTRA_BYTES; + } + + if (comp) + { + /* Include the "null" flags in the + maximum possible record size. */ + result.shortest_size+= UT_BITS_IN_BYTES(n_nullable); + } + else + { + /* For each column, include a 2-byte offset and a + "null" flag. The 1-byte format is only used in short + records that do not contain externally stored columns. + Such records could never exceed the page limit, even + when using the 2-byte format. */ + result.shortest_size+= 2 * n_fields; + } + + const ulint max_local_len= table->get_overflow_field_local_len(); + + /* Compute the maximum possible record size. */ + for (unsigned i= 0; i < n_fields; i++) + { + const dict_field_t &f= fields[i]; + const dict_col_t &col= *f.col; + + /* In dtuple_convert_big_rec(), variable-length columns + that are longer than BTR_EXTERN_LOCAL_STORED_MAX_SIZE + may be chosen for external storage. + + Fixed-length columns, and all columns of secondary + index records are always stored inline. */ + + /* Determine the maximum length of the index field. + The field_ext_max_size should be computed as the worst + case in rec_get_converted_size_comp() for + REC_STATUS_ORDINARY records.
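+For reference, with the default 16KiB page size the leaf-page limit computed above works out to 8126 bytes, and on a 4KiB page to 1982 bytes; these are the figures quoted in "Row size too large" errors.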
*/ + + size_t field_max_size= dict_col_get_fixed_size(&col, comp); + if (field_max_size && f.fixed_len != 0) + { + /* dict_index_add_col() should guarantee this */ + ut_ad(!f.prefix_len || f.fixed_len == f.prefix_len); + if (f.prefix_len) + field_max_size= f.prefix_len; + /* Fixed lengths are not encoded + in ROW_FORMAT=COMPACT. */ + goto add_field_size; + } + + field_max_size= dict_col_get_max_size(&col); + + if (f.prefix_len) + { + if (f.prefix_len < field_max_size) + { + field_max_size= f.prefix_len; + } + + /* These conditions were copied from dtuple_convert_big_rec(). */ + } + else if (field_max_size > max_local_len && + field_max_size > BTR_EXTERN_LOCAL_STORED_MAX_SIZE && + DATA_BIG_COL(&col) && dict_index_is_clust(this)) + { + + /* In the worst case, we have a locally stored + column of BTR_EXTERN_LOCAL_STORED_MAX_SIZE bytes. + The length can be stored in one byte. If the + column were stored externally, the lengths in + the clustered index page would be + BTR_EXTERN_FIELD_REF_SIZE and 2. */ + field_max_size= max_local_len; + } + + if (comp) + { + /* Add the extra size for ROW_FORMAT=COMPACT. + For ROW_FORMAT=REDUNDANT, these bytes were + added to result.shortest_size before this loop. */ + result.shortest_size+= field_max_size < 256 ? 1 : 2; + } + add_field_size: + result.shortest_size+= field_max_size; + + /* Check the size limit on leaf pages. */ + if (result.shortest_size >= result.max_leaf_size) + { + result.set_too_big(i); + } + + /* Check the size limit on non-leaf pages. Records + stored in non-leaf B-tree pages consist of the unique + columns of the record (the key columns of the B-tree) + and a node pointer field. When we have processed the + unique columns, result.shortest_size equals the size of the + node pointer record minus the node pointer column. */ + if (i + 1 == dict_index_get_n_unique_in_tree(this) && + result.shortest_size + REC_NODE_PTR_SIZE + (comp ? 0 : 2) >= + page_ptr_max) + { + result.set_too_big(i); + } + } + + return result; +} + +/** Issue a warning that the row is too big. */ +static void ib_warn_row_too_big(THD *thd, const dict_table_t *table) +{ + /* FIXME: this row size check should be improved */ + /* If prefix is true then a 768-byte prefix is stored + locally for BLOB fields. Refer to dict_table_get_format() */ + const bool prefix= !dict_table_has_atomic_blobs(table); + + const ulint free_space= + page_get_free_space_of_empty(table->flags & DICT_TF_COMPACT) / 2; + + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, HA_ERR_TO_BIG_ROW, + "Row size too large (> " ULINTPF "). Changing some columns to TEXT" + " or BLOB %smay help. In current row format, BLOB prefix of" + " %d bytes is stored inline.", + free_space, + prefix ? "or using ROW_FORMAT=DYNAMIC or ROW_FORMAT=COMPRESSED " : "", + prefix ? DICT_MAX_FIXED_COL_LEN : 0); +} + +bool create_table_info_t::row_size_is_acceptable( + const dict_index_t &index, bool strict) const +{ + if ((index.type & DICT_FTS) || index.table->is_system_db) + { + /* Skip the check for system tables, because the maximum + row size of innodb_table_stats cannot fit on a 4KiB page.
*/ + return true; + } + + const bool innodb_strict_mode= THDVAR(m_thd, strict_mode); + dict_index_t::record_size_info_t info= index.record_size_info(); + + if (info.row_is_too_big()) + { + ut_ad(info.get_overrun_size() != 0); + + const size_t idx= info.get_first_overrun_field_index(); + const dict_field_t *field= dict_index_get_nth_field(&index, idx); + + ut_ad((!field->name) == field->col->is_dropped()); + if (innodb_strict_mode || global_system_variables.log_warnings > 2) + { + ib::error_or_warn eow(strict && innodb_strict_mode); + if (field->name) + eow << "Cannot add field " << field->name << " in table "; + else + eow << "Cannot add an instantly dropped column in table "; + eow << "`" << m_form->s->db.str << "`.`" << m_form->s->table_name.str + << "`" " because after adding it, the row size is " + << info.get_overrun_size() + << " which is greater than maximum allowed size (" + << info.max_leaf_size << " bytes) for a record on index leaf page."; + } + + if (strict && innodb_strict_mode) + return false; + + ib_warn_row_too_big(m_thd, index.table); + } + + return true; +} + +void create_table_info_t::create_table_update_dict(dict_table_t *table, + THD *thd, + const HA_CREATE_INFO &info, + const TABLE &t) +{ + ut_ad(dict_sys.locked()); + + DBUG_ASSERT(table->get_ref_count()); + if (table->fts) + { + if (!table->fts_doc_id_index) + table->fts_doc_id_index= + dict_table_get_index_on_name(table, FTS_DOC_ID_INDEX_NAME); + else + DBUG_ASSERT(table->fts_doc_id_index == + dict_table_get_index_on_name(table, FTS_DOC_ID_INDEX_NAME)); + } + + DBUG_ASSERT(!table->fts == !table->fts_doc_id_index); + + innobase_copy_frm_flags_from_create_info(table, &info); + + /* Load the server stopword list into the FTS cache */ + if (table->flags2 & DICT_TF2_FTS && + innobase_fts_load_stopword(table, nullptr, thd)) + fts_optimize_add_table(table); + + if (const Field *ai = t.found_next_number_field) + { + ut_ad(ai->stored_in_db()); + ib_uint64_t autoinc= info.auto_increment_value; + if (autoinc == 0) + autoinc= 1; + + table->autoinc_mutex.wr_lock(); + dict_table_autoinc_initialize(table, autoinc); + + if (!table->is_temporary()) + { + const unsigned col_no= innodb_col_no(ai); + table->persistent_autoinc= static_cast<uint16_t> + (dict_table_get_nth_col_pos(table, col_no, nullptr) + 1) & + dict_index_t::MAX_N_FIELDS; + /* Persist the "last used" value, which typically is AUTO_INCREMENT - 1. + In btr_create(), the value 0 was already written. */ + if (--autoinc) + btr_write_autoinc(dict_table_get_first_index(table), autoinc); + } + + table->autoinc_mutex.wr_unlock(); + } + + innobase_parse_hint_from_comment(thd, table, t.s); +} + +/** Allocate a new trx. */ +void +create_table_info_t::allocate_trx() +{ + m_trx = innobase_trx_allocate(m_thd); + m_trx->will_lock = true; +} + +/** Create a new table in an InnoDB database. +@param[in] name Table name, format: "db/table_name". +@param[in] form Table format; columns and index information. +@param[in] create_info Create info (including create statement string).
+@param[in] file_per_table whether to create .ibd file +@param[in,out] trx dictionary transaction, or NULL to create new +@return error code +@retval 0 on success */ +int +ha_innobase::create(const char *name, TABLE *form, HA_CREATE_INFO *create_info, + bool file_per_table, trx_t *trx= nullptr) +{ + char norm_name[FN_REFLEN]; /* {database}/{tablename} */ + char remote_path[FN_REFLEN]; /* Absolute path of table */ + + DBUG_ENTER("ha_innobase::create"); + DBUG_ASSERT(form->s == table_share); + DBUG_ASSERT(table_share->table_type == TABLE_TYPE_SEQUENCE || + table_share->table_type == TABLE_TYPE_NORMAL); + + create_table_info_t info(ha_thd(), form, create_info, norm_name, + remote_path, file_per_table, trx); + + int error= info.initialize(); + if (!error) + error= info.prepare_create_table(name, !trx); + if (error) + DBUG_RETURN(error); + + const bool own_trx= !trx; + if (own_trx) + { + info.allocate_trx(); + trx= info.trx(); + DBUG_ASSERT(trx_state_eq(trx, TRX_STATE_NOT_STARTED)); + + if (!(info.flags2() & DICT_TF2_TEMPORARY)) + { + trx_start_for_ddl(trx); + if (dberr_t err= lock_sys_tables(trx)) + error= convert_error_code_to_mysql(err, 0, nullptr); + } + row_mysql_lock_data_dictionary(trx); + } + + if (!error) + error= info.create_table(own_trx); + + if (own_trx || (info.flags2() & DICT_TF2_TEMPORARY)) + { + if (error) + trx_rollback_for_mysql(trx); + else + { + std::vector<pfs_os_file_t> deleted; + trx->commit(deleted); + ut_ad(deleted.empty()); + info.table()->acquire(); + info.create_table_update_dict(info.table(), info.thd(), + *create_info, *form); + } + + if (own_trx) + { + row_mysql_unlock_data_dictionary(trx); + + if (!error) + { + dict_stats_update(info.table(), DICT_STATS_EMPTY_TABLE); + if (!info.table()->is_temporary()) + log_write_up_to(trx->commit_lsn, true); + info.table()->release(); + } + trx->free(); + } + } + else if (!error && m_prebuilt) + m_prebuilt->table= info.table(); + + DBUG_RETURN(error); +} + +/** Create a new table in an InnoDB database. +@param[in] name Table name, format: "db/table_name". +@param[in] form Table format; columns and index information. +@param[in] create_info Create info (including create statement string). +@return 0 if success else error number. */ +int ha_innobase::create(const char *name, TABLE *form, + HA_CREATE_INFO *create_info) +{ + return create(name, form, create_info, srv_file_per_table); +} + +/*****************************************************************//** +Discards or imports an InnoDB tablespace. +@return 0 == success, -1 == error */ + +int +ha_innobase::discard_or_import_tablespace( +/*======================================*/ + my_bool discard) /*!< in: TRUE if discard, else import */ +{ + + DBUG_ENTER("ha_innobase::discard_or_import_tablespace"); + + ut_a(m_prebuilt->trx != NULL); + ut_a(m_prebuilt->trx->magic_n == TRX_MAGIC_N); + ut_a(m_prebuilt->trx == thd_to_trx(ha_thd())); + + if (is_read_only()) { + DBUG_RETURN(HA_ERR_TABLE_READONLY); + } + + if (m_prebuilt->table->is_temporary()) { + ib_senderrf( + m_prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_CANNOT_DISCARD_TEMPORARY_TABLE); + + DBUG_RETURN(HA_ERR_TABLE_NEEDS_UPGRADE); + } + + if (m_prebuilt->table->space == fil_system.sys_space) { + ib_senderrf( + m_prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLE_IN_SYSTEM_TABLESPACE, + m_prebuilt->table->name.m_name); + + DBUG_RETURN(HA_ERR_TABLE_NEEDS_UPGRADE); + } + + trx_start_if_not_started(m_prebuilt->trx, true); + m_prebuilt->trx->dict_operation = true; + + /* Obtain an exclusive lock on the table.
*/ + dberr_t err = lock_table_for_trx(m_prebuilt->table, + m_prebuilt->trx, LOCK_X); + if (err == DB_SUCCESS) { + err = lock_sys_tables(m_prebuilt->trx); + } + + if (err != DB_SUCCESS) { + /* unable to lock the table: do nothing */ + m_prebuilt->trx->commit(); + } else if (discard) { + + /* Discarding an already discarded tablespace should be an + idempotent operation. Also, if the .ibd file is missing the + user may want to set the DISCARD flag in order to IMPORT + a new tablespace. */ + + if (!m_prebuilt->table->is_readable()) { + ib_senderrf( + m_prebuilt->trx->mysql_thd, + IB_LOG_LEVEL_WARN, ER_TABLESPACE_MISSING, + m_prebuilt->table->name.m_name); + } + + err = row_discard_tablespace_for_mysql( + m_prebuilt->table, m_prebuilt->trx); + } else if (m_prebuilt->table->is_readable()) { + /* Commit the transaction in order to + release the table lock. */ + trx_commit_for_mysql(m_prebuilt->trx); + + ib::error() << "Unable to import tablespace " + << m_prebuilt->table->name << " because it already" + " exists. Please DISCARD the tablespace" + " before IMPORT."; + ib_senderrf( + m_prebuilt->trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_EXISTS, m_prebuilt->table->name.m_name); + + DBUG_RETURN(HA_ERR_TABLE_EXIST); + } else { + err = row_import_for_mysql(m_prebuilt->table, m_prebuilt); + + if (err == DB_SUCCESS) { + + info(HA_STATUS_TIME + | HA_STATUS_CONST + | HA_STATUS_VARIABLE + | HA_STATUS_AUTO); + + fil_crypt_set_encrypt_tables(srv_encrypt_tables); + } + } + + ut_ad(m_prebuilt->trx->state == TRX_STATE_NOT_STARTED); + + if (discard || err != DB_SUCCESS) { + DBUG_RETURN(convert_error_code_to_mysql( + err, m_prebuilt->table->flags, NULL)); + } + + if (dict_stats_is_persistent_enabled(m_prebuilt->table)) { + dberr_t ret; + + /* Adjust the persistent statistics. 
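+(The freshly imported data invalidates any statistics collected before the tablespace was discarded, so a full persistent recalculation is requested.)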
*/ + ret = dict_stats_update(m_prebuilt->table, + DICT_STATS_RECALC_PERSISTENT); + + if (ret != DB_SUCCESS) { + push_warning_printf( + ha_thd(), + Sql_condition::WARN_LEVEL_WARN, + ER_ALTER_INFO, + "Error updating stats for table '%s'" + " after table rebuild: %s", + m_prebuilt->table->name.m_name, + ut_strerr(ret)); + } + } + + DBUG_RETURN(0); +} + + +/** DROP TABLE (possibly as part of DROP DATABASE, CREATE/ALTER TABLE) +@param name table name +@return error number */ +int ha_innobase::delete_table(const char *name) +{ + DBUG_ENTER("ha_innobase::delete_table"); + if (high_level_read_only) + DBUG_RETURN(HA_ERR_TABLE_READONLY); + + THD *thd= ha_thd(); + + DBUG_EXECUTE_IF("test_normalize_table_name_low", + test_normalize_table_name_low();); + DBUG_EXECUTE_IF("test_ut_format_name", test_ut_format_name();); + + trx_t *parent_trx= check_trx_exists(thd); + dict_table_t *table; + + { + char norm_name[FN_REFLEN]; + normalize_table_name(norm_name, name); + span<const char> n{norm_name, strlen(norm_name)}; + + dict_sys.lock(SRW_LOCK_CALL); + table= dict_sys.load_table(n, DICT_ERR_IGNORE_DROP); +#ifdef WITH_PARTITION_STORAGE_ENGINE + if (!table && lower_case_table_names == 1 && is_partition(norm_name)) + { + IF_WIN(normalize_table_name_c_low(norm_name, name, false), + innobase_casedn_str(norm_name)); + table= dict_sys.load_table(n, DICT_ERR_IGNORE_DROP); + } +#endif + if (!table) + { + dict_sys.unlock(); + DBUG_RETURN(HA_ERR_NO_SUCH_TABLE); + } + } + + if (table->is_temporary()) + { + dict_sys.unlock(); + parent_trx->mod_tables.erase(table); /* CREATE...SELECT error handling */ + btr_drop_temporary_table(*table); + dict_sys.lock(SRW_LOCK_CALL); + dict_sys.remove(table); + dict_sys.unlock(); + DBUG_RETURN(0); + } + + table->acquire(); + dict_sys.unlock(); + + trx_t *trx= parent_trx; + dberr_t err= DB_SUCCESS; + if (!trx->lock.table_locks.empty() && + thd_ddl_options(trx->mysql_thd)->is_create_select()) + { + /* CREATE TABLE...PRIMARY KEY...SELECT ought to be dropping the + table because a duplicate key was detected or a timeout occurred. + + We shall hijack the existing transaction to drop the table and + commit the transaction. If this is a partitioned table, one + partition will use this hijacked transaction; others will use a + separate transaction, one per partition. */ + ut_ad(!trx->dict_operation_lock_mode); + ut_ad(trx->will_lock); + ut_ad(trx->state == TRX_STATE_ACTIVE); + trx->dict_operation= true; + } + else + { + trx= innobase_trx_allocate(thd); + trx_start_for_ddl(trx); + + if (table->name.is_temporary()) + /* There is no need to lock any FOREIGN KEY child tables. */; +#ifdef WITH_PARTITION_STORAGE_ENGINE + else if (table->name.part()) + /* FOREIGN KEY constraints cannot exist on partitioned tables.
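+(Hence only the plain, non-partitioned and non-temporary case below needs to X-lock the referencing child tables.)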
*/; +#endif + else + { + dict_sys.freeze(SRW_LOCK_CALL); + for (const dict_foreign_t* f : table->referenced_set) + if (dict_table_t* child= f->foreign_table) + if ((err= lock_table_for_trx(child, trx, LOCK_X)) != DB_SUCCESS) + break; + dict_sys.unfreeze(); + } + } + + dict_table_t *table_stats= nullptr, *index_stats= nullptr; + MDL_ticket *mdl_table= nullptr, *mdl_index= nullptr; + if (err == DB_SUCCESS) + err= lock_table_for_trx(table, trx, LOCK_X); + + const bool fts= err == DB_SUCCESS && + (table->flags2 & (DICT_TF2_FTS_HAS_DOC_ID | DICT_TF2_FTS)); + const enum_sql_command sqlcom= enum_sql_command(thd_sql_command(thd)); + + if (fts) + { + fts_optimize_remove_table(table); + purge_sys.stop_FTS(*table); + err= fts_lock_tables(trx, *table); + } + +#ifdef WITH_PARTITION_STORAGE_ENGINE + const bool rollback_add_partition= + (sqlcom == SQLCOM_ALTER_TABLE && table->name.part()); + + if (rollback_add_partition) + { + if (!fts) + purge_sys.stop_FTS(); + /* This looks like the rollback of ALTER TABLE...ADD PARTITION + that was caused by MDL timeout. We could have written undo log + for inserting the data into the new partitions. */ + if (table->stat_persistent != DICT_STATS_PERSISTENT_OFF) + { + /* We do not really know if we are holding MDL_EXCLUSIVE. Even + though this code is handling the case that we are not holding + it, we might actually hold it. We want to avoid a deadlock + with dict_stats_process_entry_from_recalc_pool(). */ + dict_stats_recalc_pool_del(table->id, true); + /* If statistics calculation is still using this table, we will + catch it below while waiting for purge to stop using this table. */ + } + } +#endif + + DEBUG_SYNC(thd, "before_delete_table_stats"); + + if (err == DB_SUCCESS && dict_stats_is_persistent_enabled(table) && + !table->is_stats_table()) + { + table_stats= dict_table_open_on_name(TABLE_STATS_NAME, false, + DICT_ERR_IGNORE_NONE); + if (table_stats) + { + dict_sys.freeze(SRW_LOCK_CALL); + table_stats= dict_acquire_mdl_shared<false>(table_stats, + thd, &mdl_table); + dict_sys.unfreeze(); + } + + index_stats= dict_table_open_on_name(INDEX_STATS_NAME, false, + DICT_ERR_IGNORE_NONE); + if (index_stats) + { + dict_sys.freeze(SRW_LOCK_CALL); + index_stats= dict_acquire_mdl_shared<false>(index_stats, + thd, &mdl_index); + dict_sys.unfreeze(); + } + + const bool skip_wait{table->name.is_temporary()}; + + if (table_stats && index_stats && + !strcmp(table_stats->name.m_name, TABLE_STATS_NAME) && + !strcmp(index_stats->name.m_name, INDEX_STATS_NAME) && + !(err= lock_table_for_trx(table_stats, trx, LOCK_X, skip_wait))) + err= lock_table_for_trx(index_stats, trx, LOCK_X, skip_wait); + + if (err != DB_SUCCESS && skip_wait) + { + /* We may skip deleting statistics if we cannot lock the tables, + when the table carries a temporary name. */ + ut_ad(err == DB_LOCK_WAIT); + ut_ad(trx->error_state == DB_SUCCESS); + err= DB_SUCCESS; + dict_table_close(table_stats, false, thd, mdl_table); + dict_table_close(index_stats, false, thd, mdl_index); + table_stats= nullptr; + index_stats= nullptr; + } + } + + if (err == DB_SUCCESS) + { + if (!table->space) + { + const char *data_dir_path= DICT_TF_HAS_DATA_DIR(table->flags) + ?
table->data_dir_path : nullptr; + char *path= fil_make_filepath(data_dir_path, table->name, CFG, + data_dir_path != nullptr); + os_file_delete_if_exists(innodb_data_file_key, path, nullptr); + ut_free(path); + path= fil_make_filepath(data_dir_path, table->name, IBD, + data_dir_path != nullptr); + os_file_delete_if_exists(innodb_data_file_key, path, nullptr); + ut_free(path); + if (data_dir_path) + { + path= fil_make_filepath(nullptr, table->name, ISL, false); + os_file_delete_if_exists(innodb_data_file_key, path, nullptr); + ut_free(path); + } + } + err= lock_sys_tables(trx); + } + + dict_sys.lock(SRW_LOCK_CALL); + + if (!table->release() && err == DB_SUCCESS) + { + /* Wait for purge threads to stop using the table. */ + for (uint n= 15;;) + { + dict_sys.unlock(); + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + dict_sys.lock(SRW_LOCK_CALL); + + if (!--n) + { + err= DB_LOCK_WAIT_TIMEOUT; + break; + } + if (!table->get_ref_count()) + break; + } + } + + trx->dict_operation_lock_mode= true; + + if (err != DB_SUCCESS) + { +err_exit: + trx->dict_operation_lock_mode= false; + trx->rollback(); + switch (err) { + case DB_CANNOT_DROP_CONSTRAINT: + case DB_LOCK_WAIT_TIMEOUT: + break; + default: + ib::error() << "DROP TABLE " << table->name << ": " << err; + } + if (fts) + { + fts_optimize_add_table(table); + purge_sys.resume_FTS(); + } +#ifdef WITH_PARTITION_STORAGE_ENGINE + else if (rollback_add_partition) + purge_sys.resume_FTS(); +#endif + if (table_stats) + dict_table_close(table_stats, true, thd, mdl_table); + if (index_stats) + dict_table_close(index_stats, true, thd, mdl_index); + dict_sys.unlock(); + if (trx != parent_trx) + trx->free(); + DBUG_RETURN(convert_error_code_to_mysql(err, 0, NULL)); + } + + if (!table->no_rollback() && trx->check_foreigns) + { + const bool drop_db= sqlcom == SQLCOM_DROP_DB; + for (auto foreign : table->referenced_set) + { + /* We should allow dropping a referenced table if creating + that referenced table has failed for some reason. For example, + the referenced table may have been created, but the column + types that are referenced do not match.
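+(Without this allowance, such a half-created parent table would be undroppable, because the FOREIGN KEY metadata already names it as referenced.)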
*/ + if (foreign->foreign_table == table || + (drop_db && + dict_tables_have_same_db(table->name.m_name, + foreign->foreign_table_name_lookup))) + continue; + mysql_mutex_lock(&dict_foreign_err_mutex); + rewind(dict_foreign_err_file); + ut_print_timestamp(dict_foreign_err_file); + fputs(" Cannot drop table ", dict_foreign_err_file); + ut_print_name(dict_foreign_err_file, trx, table->name.m_name); + fputs("\nbecause it is referenced by ", dict_foreign_err_file); + ut_print_name(dict_foreign_err_file, trx, foreign->foreign_table_name); + putc('\n', dict_foreign_err_file); + mysql_mutex_unlock(&dict_foreign_err_mutex); + err= DB_CANNOT_DROP_CONSTRAINT; + goto err_exit; + } + } + + if (!table->no_rollback()) + err= trx->drop_table_foreign(table->name); + + if (err == DB_SUCCESS && table_stats && index_stats) + err= trx->drop_table_statistics(table->name); + if (err != DB_SUCCESS) + goto err_exit; + + err= trx->drop_table(*table); + if (err != DB_SUCCESS) + goto err_exit; + + std::vector<pfs_os_file_t> deleted; + trx->commit(deleted); + if (table_stats) + dict_table_close(table_stats, true, thd, mdl_table); + if (index_stats) + dict_table_close(index_stats, true, thd, mdl_index); + row_mysql_unlock_data_dictionary(trx); + for (pfs_os_file_t d : deleted) + os_file_close(d); + log_write_up_to(trx->commit_lsn, true); + if (trx != parent_trx) + trx->free(); + if (!fts) +#ifdef WITH_PARTITION_STORAGE_ENGINE + if (!rollback_add_partition) +#endif + DBUG_RETURN(0); + purge_sys.resume_FTS(); + DBUG_RETURN(0); +} + +/** Rename an InnoDB table. +@param[in,out] trx InnoDB data dictionary transaction +@param[in] from old table name +@param[in] to new table name +@param[in] use_fk whether to enforce FOREIGN KEY +@return DB_SUCCESS or error code */ +static dberr_t innobase_rename_table(trx_t *trx, const char *from, + const char *to, bool use_fk) +{ + dberr_t error; + char norm_to[FN_REFLEN]; + char norm_from[FN_REFLEN]; + + DBUG_ENTER("innobase_rename_table"); + DBUG_ASSERT(trx->dict_operation); + + ut_ad(!srv_read_only_mode); + + normalize_table_name(norm_to, to); + normalize_table_name(norm_from, from); + + DEBUG_SYNC_C("innodb_rename_table_ready"); + + ut_ad(trx->will_lock); + + error = row_rename_table_for_mysql(norm_from, norm_to, trx, use_fk); + + if (error != DB_SUCCESS) { + if (error == DB_TABLE_NOT_FOUND + && lower_case_table_names == 1) { + char* is_part = is_partition(norm_from); + + if (is_part) { + char par_case_name[FN_REFLEN]; +#ifndef _WIN32 + /* Check for the table using lower + case name, including the partition + separator "P" */ + strcpy(par_case_name, norm_from); + innobase_casedn_str(par_case_name); +#else + /* On Windows, check whether there + exists a table name in the system + tables whose name was not normalized + to lower case */ + normalize_table_name_c_low( + par_case_name, from, false); +#endif /* _WIN32 */ + trx_start_if_not_started(trx, true); + error = row_rename_table_for_mysql( + par_case_name, norm_to, trx, false); + } + } + + if (error == DB_SUCCESS) { +#ifndef _WIN32 + sql_print_warning("Rename partition table %s" + " succeeds after converting to lower" + " case. The table may have" + " been moved from a case" + " insensitive file system.\n", + norm_from); +#else + sql_print_warning("Rename partition table %s" + " succeeds after skipping the step to" + " lower case the table name."
+ " The table may have been" + " moved from a case sensitive" + " file system.\n", + norm_from); +#endif /* _WIN32 */ + } + } + + DBUG_RETURN(error); +} + +/** TRUNCATE TABLE +@return error code +@retval 0 on success */ +int ha_innobase::truncate() +{ + mariadb_set_stats set_stats_temporary(handler_stats); + DBUG_ENTER("ha_innobase::truncate"); + + update_thd(); + + if (is_read_only()) + DBUG_RETURN(HA_ERR_TABLE_READONLY); + + HA_CREATE_INFO info; + dict_table_t *ib_table= m_prebuilt->table; + info.init(); + update_create_info_from_table(&info, table); + switch (dict_tf_get_rec_format(ib_table->flags)) { + case REC_FORMAT_REDUNDANT: + info.row_type= ROW_TYPE_REDUNDANT; + break; + case REC_FORMAT_COMPACT: + info.row_type= ROW_TYPE_COMPACT; + break; + case REC_FORMAT_COMPRESSED: + info.row_type= ROW_TYPE_COMPRESSED; + break; + case REC_FORMAT_DYNAMIC: + info.row_type= ROW_TYPE_DYNAMIC; + break; + } + + const auto stored_lock= m_prebuilt->stored_select_lock_type; + trx_t *trx= innobase_trx_allocate(m_user_thd); + trx_start_for_ddl(trx); + + if (ib_table->is_temporary()) + { + info.options|= HA_LEX_CREATE_TMP_TABLE; + btr_drop_temporary_table(*ib_table); + m_prebuilt->table= nullptr; + row_prebuilt_free(m_prebuilt); + m_prebuilt= nullptr; + my_free(m_upd_buf); + m_upd_buf= nullptr; + m_upd_buf_size= 0; + + row_mysql_lock_data_dictionary(trx); + ib_table->release(); + dict_sys.remove(ib_table, false, true); + int err= create(ib_table->name.m_name, table, &info, true, trx); + row_mysql_unlock_data_dictionary(trx); + + ut_ad(!err); + if (!err) + { + err= open(ib_table->name.m_name, 0, 0); + m_prebuilt->table->release(); + m_prebuilt->stored_select_lock_type= stored_lock; + } + + trx->free(); + +#ifdef BTR_CUR_HASH_ADAPT + if (UT_LIST_GET_LEN(ib_table->freed_indexes)) + { + ib_table->vc_templ= nullptr; + ib_table->id= 0; + } + else +#endif /* BTR_CUR_HASH_ADAPT */ + dict_mem_table_free(ib_table); + + DBUG_RETURN(err); + } + + mem_heap_t *heap= mem_heap_create(1000); + + if (!ib_table->space) + ib_senderrf(m_user_thd, IB_LOG_LEVEL_WARN, ER_TABLESPACE_DISCARDED, + table->s->table_name.str); + + dict_get_and_save_data_dir_path(ib_table); + info.data_file_name= ib_table->data_dir_path; + const char *temp_name= + dict_mem_create_temporary_tablename(heap, + ib_table->name.m_name, ib_table->id); + const char *name= mem_heap_strdup(heap, ib_table->name.m_name); + + dict_table_t *table_stats = nullptr, *index_stats = nullptr; + MDL_ticket *mdl_table = nullptr, *mdl_index = nullptr; + + dberr_t error= DB_SUCCESS; + + dict_sys.freeze(SRW_LOCK_CALL); + for (const dict_foreign_t *f : ib_table->referenced_set) + if (dict_table_t *child= f->foreign_table) + if ((error= lock_table_for_trx(child, trx, LOCK_X)) != DB_SUCCESS) + break; + dict_sys.unfreeze(); + + if (error == DB_SUCCESS) + error= lock_table_for_trx(ib_table, trx, LOCK_X); + + const bool fts= error == DB_SUCCESS && + ib_table->flags2 & (DICT_TF2_FTS_HAS_DOC_ID | DICT_TF2_FTS); + + if (fts) + { + fts_optimize_remove_table(ib_table); + purge_sys.stop_FTS(*ib_table); + error= fts_lock_tables(trx, *ib_table); + } + + /* Wait for purge threads to stop using the table. 
*/ + for (uint n = 15; ib_table->get_ref_count() > 1; ) + { + if (!--n) + { + error= DB_LOCK_WAIT_TIMEOUT; + break; + } + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + } + + if (error == DB_SUCCESS && dict_stats_is_persistent_enabled(ib_table) && + !ib_table->is_stats_table()) + { + table_stats= dict_table_open_on_name(TABLE_STATS_NAME, false, + DICT_ERR_IGNORE_NONE); + if (table_stats) + { + dict_sys.freeze(SRW_LOCK_CALL); + table_stats= dict_acquire_mdl_shared<false>(table_stats, m_user_thd, + &mdl_table); + dict_sys.unfreeze(); + } + index_stats= dict_table_open_on_name(INDEX_STATS_NAME, false, + DICT_ERR_IGNORE_NONE); + if (index_stats) + { + dict_sys.freeze(SRW_LOCK_CALL); + index_stats= dict_acquire_mdl_shared<false>(index_stats, m_user_thd, + &mdl_index); + dict_sys.unfreeze(); + } + + if (table_stats && index_stats && + !strcmp(table_stats->name.m_name, TABLE_STATS_NAME) && + !strcmp(index_stats->name.m_name, INDEX_STATS_NAME) && + !(error= lock_table_for_trx(table_stats, trx, LOCK_X))) + error= lock_table_for_trx(index_stats, trx, LOCK_X); + } + + if (error == DB_SUCCESS) + error= lock_sys_tables(trx); + + std::vector<pfs_os_file_t> deleted; + + row_mysql_lock_data_dictionary(trx); + + if (error == DB_SUCCESS) + { + error= innobase_rename_table(trx, ib_table->name.m_name, temp_name, false); + if (error == DB_SUCCESS) + error= trx->drop_table(*ib_table); + } + + int err = convert_error_code_to_mysql(error, ib_table->flags, m_user_thd); + const auto update_time = ib_table->update_time; + + if (err) + { + trx_rollback_for_mysql(trx); + if (fts) + fts_optimize_add_table(ib_table); + } + else + { + const auto def_trx_id= ib_table->def_trx_id; + ib_table->release(); + m_prebuilt->table= nullptr; + + err= create(name, table, &info, dict_table_is_file_per_table(ib_table), + trx); + if (!err) + { + m_prebuilt->table->acquire(); + create_table_info_t::create_table_update_dict(m_prebuilt->table, + m_user_thd, info, *table); + trx->commit(deleted); + } + else + { + trx_rollback_for_mysql(trx); + m_prebuilt->table= dict_table_open_on_name(name, true, + DICT_ERR_IGNORE_FK_NOKEY); + m_prebuilt->table->def_trx_id= def_trx_id; + } + dict_names_t fk_tables; + dict_load_foreigns(m_prebuilt->table->name.m_name, nullptr, 1, true, + DICT_ERR_IGNORE_FK_NOKEY, fk_tables); + for (const char *f : fk_tables) + dict_sys.load_table({f, strlen(f)}); + } + + if (fts) + purge_sys.resume_FTS(); + + row_mysql_unlock_data_dictionary(trx); + for (pfs_os_file_t d : deleted) os_file_close(d); + + if (!err) + { + dict_stats_update(m_prebuilt->table, DICT_STATS_EMPTY_TABLE); + log_write_up_to(trx->commit_lsn, true); + row_prebuilt_t *prebuilt= m_prebuilt; + uchar *upd_buf= m_upd_buf; + ulint upd_buf_size= m_upd_buf_size; + /* Mimic ha_innobase::close(). */ + m_prebuilt= nullptr; + m_upd_buf= nullptr; + m_upd_buf_size= 0; + + err= open(name, 0, 0); + if (!err) + { + m_prebuilt->stored_select_lock_type= stored_lock; + m_prebuilt->table->update_time= update_time; + row_prebuilt_free(prebuilt); + my_free(upd_buf); + } + else + { + /* Revert to the old table. */ + m_prebuilt= prebuilt; + m_upd_buf= upd_buf; + m_upd_buf_size= upd_buf_size; + } + } + + trx->free(); + + mem_heap_free(heap); + + if (table_stats) + dict_table_close(table_stats, false, m_user_thd, mdl_table); + if (index_stats) + dict_table_close(index_stats, false, m_user_thd, mdl_index); + + DBUG_RETURN(err); +} + +/*********************************************************************//** +Renames an InnoDB table.
+@return 0 or error code */ + +int +ha_innobase::rename_table( +/*======================*/ + const char* from, /*!< in: old name of the table */ + const char* to) /*!< in: new name of the table */ +{ + THD* thd = ha_thd(); + + DBUG_ENTER("ha_innobase::rename_table"); + + if (high_level_read_only) { + ib_senderrf(thd, IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE); + DBUG_RETURN(HA_ERR_TABLE_READONLY); + } + + trx_t* trx = innobase_trx_allocate(thd); + trx_start_for_ddl(trx); + + dict_table_t *table_stats = nullptr, *index_stats = nullptr; + MDL_ticket *mdl_table = nullptr, *mdl_index = nullptr; + char norm_from[MAX_FULL_NAME_LEN]; + char norm_to[MAX_FULL_NAME_LEN]; + + normalize_table_name(norm_from, from); + normalize_table_name(norm_to, to); + + dberr_t error = DB_SUCCESS; + const bool from_temp = dict_table_t::is_temporary_name(norm_from); + + if (from_temp) { + /* There is no need to lock any FOREIGN KEY child tables. */ + } else if (dict_table_t *table = dict_table_open_on_name( + norm_from, false, DICT_ERR_IGNORE_FK_NOKEY)) { + dict_sys.freeze(SRW_LOCK_CALL); + for (const dict_foreign_t* f : table->referenced_set) { + if (dict_table_t* child = f->foreign_table) { + error = lock_table_for_trx(child, trx, LOCK_X); + if (error != DB_SUCCESS) { + break; + } + } + } + dict_sys.unfreeze(); + if (error == DB_SUCCESS) { + error = lock_table_for_trx(table, trx, LOCK_X); + } + table->release(); + } + + if (strcmp(norm_from, TABLE_STATS_NAME) + && strcmp(norm_from, INDEX_STATS_NAME) + && strcmp(norm_to, TABLE_STATS_NAME) + && strcmp(norm_to, INDEX_STATS_NAME)) { + table_stats = dict_table_open_on_name(TABLE_STATS_NAME, false, + DICT_ERR_IGNORE_NONE); + if (table_stats) { + dict_sys.freeze(SRW_LOCK_CALL); + table_stats = dict_acquire_mdl_shared<false>( + table_stats, thd, &mdl_table); + dict_sys.unfreeze(); + } + index_stats = dict_table_open_on_name(INDEX_STATS_NAME, false, + DICT_ERR_IGNORE_NONE); + if (index_stats) { + dict_sys.freeze(SRW_LOCK_CALL); + index_stats = dict_acquire_mdl_shared<false>( + index_stats, thd, &mdl_index); + dict_sys.unfreeze(); + } + + if (error == DB_SUCCESS && table_stats && index_stats + && !strcmp(table_stats->name.m_name, TABLE_STATS_NAME) + && !strcmp(index_stats->name.m_name, INDEX_STATS_NAME)) { + error = lock_table_for_trx(table_stats, trx, LOCK_X, + from_temp); + if (error == DB_SUCCESS) { + error = lock_table_for_trx(index_stats, trx, + LOCK_X, from_temp); + } + if (error != DB_SUCCESS && from_temp) { + ut_ad(error == DB_LOCK_WAIT); + ut_ad(trx->error_state == DB_SUCCESS); + error = DB_SUCCESS; + /* We may skip renaming statistics if + we cannot lock the tables, when the + table is being renamed from a + temporary name.
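+(Such #sql- prefixed intermediate names are used by ALTER TABLE while swapping the old and new versions of a table.)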
+                dict_table_close(table_stats, false, thd,
+                                 mdl_table);
+                dict_table_close(index_stats, false, thd,
+                                 mdl_index);
+                table_stats = nullptr;
+                index_stats = nullptr;
+            }
+        }
+    }
+
+    if (error == DB_SUCCESS) {
+        error = lock_table_for_trx(dict_sys.sys_tables, trx, LOCK_X);
+        if (error == DB_SUCCESS) {
+            error = lock_table_for_trx(dict_sys.sys_foreign, trx,
+                                       LOCK_X);
+            if (error == DB_SUCCESS) {
+                error = lock_table_for_trx(
+                    dict_sys.sys_foreign_cols,
+                    trx, LOCK_X);
+            }
+        }
+    }
+
+    row_mysql_lock_data_dictionary(trx);
+
+    if (error == DB_SUCCESS) {
+        error = innobase_rename_table(trx, from, to, true);
+    }
+
+    DEBUG_SYNC(thd, "after_innobase_rename_table");
+
+    if (error == DB_SUCCESS && table_stats && index_stats) {
+        error = dict_stats_rename_table(norm_from, norm_to, trx);
+        if (error == DB_DUPLICATE_KEY) {
+            /* The duplicate may also occur in
+            mysql.innodb_index_stats. */
+            my_error(ER_DUP_KEY, MYF(0),
+                     "mysql.innodb_table_stats");
+            error = DB_ERROR;
+        }
+    }
+
+    if (error == DB_SUCCESS) {
+        trx->flush_log_later = true;
+        innobase_commit_low(trx);
+    } else {
+        trx->rollback();
+    }
+
+    if (table_stats) {
+        dict_table_close(table_stats, true, thd, mdl_table);
+    }
+    if (index_stats) {
+        dict_table_close(index_stats, true, thd, mdl_index);
+    }
+    row_mysql_unlock_data_dictionary(trx);
+    if (error == DB_SUCCESS) {
+        log_write_up_to(trx->commit_lsn, true);
+    }
+    trx->flush_log_later = false;
+    trx->free();
+
+    if (error == DB_DUPLICATE_KEY) {
+        /* We are not able to deal with handler::get_dup_key()
+        during DDL operations, because the duplicate key would
+        exist in metadata tables, not in the user table. */
+        my_error(ER_TABLE_EXISTS_ERROR, MYF(0), to);
+        error = DB_ERROR;
+    } else if (error == DB_LOCK_WAIT_TIMEOUT) {
+        my_error(ER_LOCK_WAIT_TIMEOUT, MYF(0), to);
+        error = DB_LOCK_WAIT;
+    }
+
+    DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
+}
+
+/*********************************************************************//**
+Estimates the number of index records in a range.
+@return estimated number of rows */
+
+ha_rows
+ha_innobase::records_in_range(
+/*==========================*/
+    uint            keynr,      /*!< in: index number */
+    const key_range *min_key,   /*!< in: start key value of the
+                                range, may also be 0 */
+    const key_range *max_key,   /*!< in: range end key value, may
+                                also be 0 */
+    page_range      *pages)
+{
+    KEY*            key;
+    dict_index_t*   index;
+    dtuple_t*       range_start;
+    dtuple_t*       range_end;
+    ha_rows         n_rows;
+    page_cur_mode_t mode1;
+    page_cur_mode_t mode2;
+    mem_heap_t*     heap;
+
+    DBUG_ENTER("records_in_range");
+
+    ut_a(m_prebuilt->trx == thd_to_trx(ha_thd()));
+
+    m_prebuilt->trx->op_info = "estimating records in index range";
+
+    active_index = keynr;
+
+    key = table->key_info + active_index;
+
+    index = innobase_get_index(keynr);
+
+    /* It is possible that we cannot find the requested
+    index due to an inconsistency between the MySQL and InnoDB dictionary info.
+ Necessary message should have been printed in innobase_get_index() */ + if (!m_prebuilt->table->space) { + n_rows = HA_POS_ERROR; + goto func_exit; + } + if (!index) { + n_rows = HA_POS_ERROR; + goto func_exit; + } + if (index->is_corrupted()) { + n_rows = HA_ERR_INDEX_CORRUPT; + goto func_exit; + } + if (!row_merge_is_index_usable(m_prebuilt->trx, index)) { + n_rows = HA_ERR_TABLE_DEF_CHANGED; + goto func_exit; + } + + heap = mem_heap_create(2 * (key->ext_key_parts * sizeof(dfield_t) + + sizeof(dtuple_t))); + + range_start = dtuple_create(heap, key->ext_key_parts); + dict_index_copy_types(range_start, index, key->ext_key_parts); + + range_end = dtuple_create(heap, key->ext_key_parts); + dict_index_copy_types(range_end, index, key->ext_key_parts); + + row_sel_convert_mysql_key_to_innobase( + range_start, + m_prebuilt->srch_key_val1, + m_prebuilt->srch_key_val_len, + index, + (byte*) (min_key ? min_key->key : (const uchar*) 0), + (ulint) (min_key ? min_key->length : 0)); + + DBUG_ASSERT(min_key + ? range_start->n_fields > 0 + : range_start->n_fields == 0); + + row_sel_convert_mysql_key_to_innobase( + range_end, + m_prebuilt->srch_key_val2, + m_prebuilt->srch_key_val_len, + index, + (byte*) (max_key ? max_key->key : (const uchar*) 0), + (ulint) (max_key ? max_key->length : 0)); + + DBUG_ASSERT(max_key + ? range_end->n_fields > 0 + : range_end->n_fields == 0); + + mode1 = convert_search_mode_to_innobase( + min_key ? min_key->flag : HA_READ_KEY_EXACT); + + mode2 = convert_search_mode_to_innobase( + max_key ? max_key->flag : HA_READ_KEY_EXACT); + + if (mode1 != PAGE_CUR_UNSUPP && mode2 != PAGE_CUR_UNSUPP) { + + if (dict_index_is_spatial(index)) { + /*Only min_key used in spatial index. */ + n_rows = rtr_estimate_n_rows_in_range( + index, range_start, mode1); + } else { + btr_pos_t tuple1(range_start, mode1, pages->first_page); + btr_pos_t tuple2(range_end, mode2, pages->last_page); + n_rows = btr_estimate_n_rows_in_range( + index, &tuple1, &tuple2); + pages->first_page= tuple1.page_id.raw(); + pages->last_page= tuple2.page_id.raw(); + } + } else { + + n_rows = HA_POS_ERROR; + } + + mem_heap_free(heap); + + DBUG_EXECUTE_IF( + "print_btr_estimate_n_rows_in_range_return_value", + push_warning_printf( + ha_thd(), Sql_condition::WARN_LEVEL_WARN, + ER_NO_DEFAULT, + "btr_estimate_n_rows_in_range(): %lld", + (longlong) n_rows); + ); + +func_exit: + + m_prebuilt->trx->op_info = (char*)""; + + /* The MySQL optimizer seems to believe an estimate of 0 rows is + always accurate and may return the result 'Empty set' based on that. + The accuracy is not guaranteed, and even if it were, for a locking + read we should anyway perform the search to set the next-key lock. + Add 1 to the value to make sure MySQL does not make the assumption! */ + + if (n_rows == 0) { + n_rows = 1; + } + + DBUG_RETURN((ha_rows) n_rows); +} + +/*********************************************************************//** +Gives an UPPER BOUND to the number of rows in a table. This is used in +filesort.cc. +@return upper bound of rows */ + +ha_rows +ha_innobase::estimate_rows_upper_bound() +/*====================================*/ +{ + const dict_index_t* index; + ulonglong estimate; + ulonglong local_data_file_length; + mariadb_set_stats set_stats_temporary(handler_stats); + DBUG_ENTER("estimate_rows_upper_bound"); + + /* We do not know if MySQL can call this function before calling + external_lock(). To be safe, update the thd of the current table + handle. 
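+    (As a hypothetical illustration of the estimate computed below:
+    1000 leaf pages of 16KiB give a data file length of 16384000 bytes;
+    with a minimum clustered index record length of 64 bytes, the bound
+    is 2 * 16384000 / 64 = 512000 rows.)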
+    */
+
+    update_thd(ha_thd());
+
+    m_prebuilt->trx->op_info = "calculating upper bound for table rows";
+
+    index = dict_table_get_first_index(m_prebuilt->table);
+
+    ulint stat_n_leaf_pages = index->stat_n_leaf_pages;
+
+    ut_a(stat_n_leaf_pages > 0);
+
+    local_data_file_length = ulonglong(stat_n_leaf_pages)
+        << srv_page_size_shift;
+
+    /* Calculate a minimum length for a clustered index record and from
+    that an upper bound for the number of rows. Since we only calculate
+    new statistics in row0mysql.cc when a table has grown by a threshold
+    factor, we must add a safety factor 2 in front of the formula below. */
+
+    estimate = 2 * local_data_file_length
+        / dict_index_calc_min_rec_len(index);
+
+    m_prebuilt->trx->op_info = "";
+
+    /* Set num_rows less than MERGEBUFF to simulate the case where we do
+    not have enough space to merge the externally sorted file blocks. */
+    DBUG_EXECUTE_IF("set_num_rows_lt_MERGEBUFF",
+                    estimate = 2;
+                    DBUG_SET("-d,set_num_rows_lt_MERGEBUFF");
+    );
+
+    DBUG_RETURN((ha_rows) estimate);
+}
+
+/*********************************************************************//**
+How many seeks it will take to read through the table. This is to be
+comparable to the number returned by records_in_range so that we can
+decide if we should scan the table or use keys.
+@return estimated time measured in disk seeks */
+
+double
+ha_innobase::scan_time()
+/*====================*/
+{
+    /* Since MySQL seems to favor table scans too much over index
+    searches, we pretend that a sequential read takes the same time
+    as a random disk read, that is, we do not divide the following
+    by 10, which would be physically realistic. */
+
+    /* The locking below is disabled for performance reasons. Without
+    it we could end up returning an uninitialized value to the caller,
+    which in the worst case could make some query plan go bogus or
+    issue a Valgrind warning. */
+    if (m_prebuilt == NULL) {
+        /* In the case of a derived table, the optimizer may try to
+        fetch statistics for the table even before it has been
+        created or opened. In such cases, return a default estimate.
+        TODO: This can be improved to return a more accurate
+        estimate, but that would also need pre-population of the
+        stats structure. As of now, the approach is in sync with
+        MyISAM. */
+        return(ulonglong2double(stats.data_file_length) / IO_SIZE + 2);
+    }
+
+    ulint   stat_clustered_index_size;
+
+    ut_a(m_prebuilt->table->stat_initialized);
+
+    stat_clustered_index_size =
+        m_prebuilt->table->stat_clustered_index_size;
+
+    return((double) stat_clustered_index_size);
+}
+
+/******************************************************************//**
+Calculate the time it takes to read a set of ranges through an index.
+This enables us to optimise reads for clustered indexes.
+@return estimated time measured in disk seeks */
+
+double
+ha_innobase::read_time(
+/*===================*/
+    uint    index,  /*!< in: key number */
+    uint    ranges, /*!< in: how many ranges */
+    ha_rows rows)   /*!< in: estimated number of rows in the ranges */
+{
+    ha_rows total_rows;
+
+    if (index != table->s->primary_key) {
+        /* Not clustered */
+        return(handler::read_time(index, ranges, rows));
+    }
+
+    /* Assume that the read time is proportional to the scan time for all
+    rows + at most one seek per range.
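+    (Illustrative numbers: with scan_time() = 50.0, total_rows = 1000,
+    ranges = 1 and rows = 100, the return value below is
+    1 + (100/1000) * 50.0 = 6.0.)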
*/ + + double time_for_scan = scan_time(); + + if ((total_rows = estimate_rows_upper_bound()) < rows) { + + return(time_for_scan); + } + + return(ranges + (double) rows / (double) total_rows * time_for_scan); +} + +/*********************************************************************//** +Calculates the key number used inside MySQL for an Innobase index. +@return the key number used inside MySQL */ +static +unsigned +innobase_get_mysql_key_number_for_index( +/*====================================*/ + const TABLE* table, /*!< in: table in MySQL data + dictionary */ + dict_table_t* ib_table,/*!< in: table in InnoDB data + dictionary */ + const dict_index_t* index) /*!< in: index */ +{ + const dict_index_t* ind; + unsigned int i; + + /* If index does not belong to the table object of share structure + (ib_table comes from the share structure) search the index->table + object instead */ + if (index->table != ib_table) { + i = 0; + ind = dict_table_get_first_index(index->table); + + while (index != ind) { + ind = dict_table_get_next_index(ind); + i++; + } + + if (dict_index_is_auto_gen_clust(index)) { + ut_a(i > 0); + i--; + } + + return(i); + } + + /* Directly find matching index with information from mysql TABLE + structure and InnoDB dict_index_t list */ + for (i = 0; i < table->s->keys; i++) { + ind = dict_table_get_index_on_name( + ib_table, table->key_info[i].name.str); + + if (index == ind) { + return(i); + } + } + + /* Loop through each index of the table and lock them */ + for (ind = dict_table_get_first_index(ib_table); + ind != NULL; + ind = dict_table_get_next_index(ind)) { + if (index == ind) { + /* Temp index is internal to InnoDB, that is + not present in the MySQL index list, so no + need to print such mismatch warning. */ + if (index->is_committed()) { + sql_print_warning( + "Found index %s in InnoDB index list" + " but not its MariaDB index number." + " It could be an InnoDB internal" + " index.", + index->name()); + } + return(~0U); + } + } + + ut_error; + + return(~0U); +} + +/*********************************************************************//** +Calculate Record Per Key value. Need to exclude the NULL value if +innodb_stats_method is set to "nulls_ignored" +@return estimated record per key value */ +rec_per_key_t +innodb_rec_per_key( +/*===============*/ + dict_index_t* index, /*!< in: dict_index_t structure */ + ulint i, /*!< in: the column we are + calculating rec per key */ + ha_rows records) /*!< in: estimated total records */ +{ + rec_per_key_t rec_per_key; + ib_uint64_t n_diff; + + ut_a(index->table->stat_initialized); + + ut_ad(i < dict_index_get_n_unique(index)); + ut_ad(!dict_index_is_spatial(index)); + + if (records == 0) { + /* "Records per key" is meaningless for empty tables. + Return 1.0 because that is most convenient to the Optimizer. */ + return(1.0); + } + + n_diff = index->stat_n_diff_key_vals[i]; + + if (n_diff == 0) { + + rec_per_key = static_cast(records); + } else if (srv_innodb_stats_method == SRV_STATS_NULLS_IGNORED) { + ib_uint64_t n_null; + ib_uint64_t n_non_null; + + n_non_null = index->stat_n_non_null_key_vals[i]; + + /* In theory, index->stat_n_non_null_key_vals[i] + should always be less than the number of records. + Since this is statistics value, the value could + have slight discrepancy. But we will make sure + the number of null values is not a negative number. 
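+        (Illustrative numbers: records = 100 and n_non_null = 90
+        give n_null = 10; with n_diff = 30 distinct non-NULL values,
+        the estimate below is (100 - 10) / (30 - 10) = 4.5 rows per
+        key value.)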
+        */
+        if (records < n_non_null) {
+            n_null = 0;
+        } else {
+            n_null = records - n_non_null;
+        }
+
+        /* If the number of NULL values is the same as or
+        larger than that of the distinct values, we could
+        consider that the table consists mostly of NULL
+        values. Set rec_per_key to 1. */
+        if (n_diff <= n_null) {
+            rec_per_key = 1.0;
+        } else {
+            /* Need to exclude rows with NULL values from
+            the rec_per_key calculation */
+            rec_per_key
+                = static_cast<rec_per_key_t>(records - n_null)
+                / static_cast<rec_per_key_t>(n_diff - n_null);
+        }
+    } else {
+        DEBUG_SYNC_C("after_checking_for_0");
+        rec_per_key = static_cast<rec_per_key_t>(records)
+            / static_cast<rec_per_key_t>(n_diff);
+    }
+
+    if (rec_per_key < 1.0) {
+        /* Values below 1.0 are meaningless and must be due to the
+        stats being imprecise. */
+        rec_per_key = 1.0;
+    }
+
+    return(rec_per_key);
+}
+
+/** Calculate how many KiB of new data we will be able to insert into the
+tablespace without running out of space. Start with a space object that has
+been acquired by the caller, who holds it for the duration of the calculation.
+@param[in]  space   tablespace object from fil_space_acquire()
+@return available space in KiB */
+static uintmax_t
+fsp_get_available_space_in_free_extents(const fil_space_t& space)
+{
+    ulint size_in_header = space.size_in_header;
+    if (size_in_header < FSP_EXTENT_SIZE) {
+        return 0;       /* TODO: count free frag pages and
+                        return a value based on that */
+    }
+
+    /* Below we play safe when counting free extents above the free limit:
+    some of them will contain extent descriptor pages, and therefore
+    will not be free extents */
+    ut_ad(size_in_header >= space.free_limit);
+    ulint n_free_up =
+        (size_in_header - space.free_limit) / FSP_EXTENT_SIZE;
+
+    const ulint size = space.physical_size();
+    if (n_free_up > 0) {
+        n_free_up--;
+        n_free_up -= n_free_up / (size / FSP_EXTENT_SIZE);
+    }
+
+    /* We reserve 1 extent + 0.5 % of the space size to undo logs
+    and 1 extent + 0.5 % to cleaning operations; NOTE: this source
+    code is duplicated in the function above! */
+
+    ulint reserve = 2 + ((size_in_header / FSP_EXTENT_SIZE) * 2) / 200;
+    ulint n_free = space.free_len + n_free_up;
+
+    if (reserve > n_free) {
+        return(0);
+    }
+
+    return(static_cast<uintmax_t>(n_free - reserve)
+           * FSP_EXTENT_SIZE * (size / 1024));
+}
+
+/*********************************************************************//**
+Returns statistics information of the table to the MySQL interpreter,
+in various fields of the handle object.
+@return HA_ERR_* error code or 0 */
+
+int
+ha_innobase::info_low(
+/*==================*/
+    uint    flag,   /*!< in: what information is requested */
+    bool    is_analyze)
+{
+    dict_table_t*   ib_table;
+    ib_uint64_t     n_rows;
+    char            path[FN_REFLEN];
+    os_file_stat_t  stat_info;
+
+    DBUG_ENTER("info");
+
+    DEBUG_SYNC_C("ha_innobase_info_low");
+
+    /* If we are forcing recovery at a high level, we will suppress
+    statistics calculation on tables, because that may crash the
+    server if an index is badly corrupted. */
+
+    /* We do not know if MySQL can call this function before calling
+    external_lock(). To be safe, update the thd of the current table
+    handle.
*/ + + update_thd(ha_thd()); + + m_prebuilt->trx->op_info = "returning various info to MariaDB"; + + ib_table = m_prebuilt->table; + DBUG_ASSERT(ib_table->get_ref_count() > 0); + + if (!ib_table->is_readable()) { + ib_table->stats_mutex_lock(); + ib_table->stat_initialized = true; + ib_table->stat_n_rows = 0; + ib_table->stat_clustered_index_size = 0; + ib_table->stat_sum_of_other_index_sizes = 0; + ib_table->stats_mutex_unlock(); + } + + if (flag & HA_STATUS_TIME) { + if (is_analyze || innobase_stats_on_metadata) { + + dict_stats_upd_option_t opt; + dberr_t ret; + + m_prebuilt->trx->op_info = "updating table statistics"; + + if (dict_stats_is_persistent_enabled(ib_table)) { + if (is_analyze) { + if (!srv_read_only_mode) { + dict_stats_recalc_pool_del( + ib_table->id, false); + } + opt = DICT_STATS_RECALC_PERSISTENT; + } else { + /* This is e.g. 'SHOW INDEXES', fetch + the persistent stats from disk. */ + opt = DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY; + } + } else { + opt = DICT_STATS_RECALC_TRANSIENT; + } + + ret = dict_stats_update(ib_table, opt); + + if (ret != DB_SUCCESS) { + m_prebuilt->trx->op_info = ""; + DBUG_RETURN(HA_ERR_GENERIC); + } + + m_prebuilt->trx->op_info = + "returning various info to MariaDB"; + } + + + stats.update_time = (ulong) ib_table->update_time; + } + + dict_stats_init(ib_table); + + if (flag & HA_STATUS_VARIABLE) { + + ulint stat_clustered_index_size; + ulint stat_sum_of_other_index_sizes; + + ib_table->stats_mutex_lock(); + + ut_a(ib_table->stat_initialized); + + n_rows = ib_table->stat_n_rows; + + stat_clustered_index_size + = ib_table->stat_clustered_index_size; + + stat_sum_of_other_index_sizes + = ib_table->stat_sum_of_other_index_sizes; + + ib_table->stats_mutex_unlock(); + + /* + The MySQL optimizer seems to assume in a left join that n_rows + is an accurate estimate if it is zero. Of course, it is not, + since we do not have any locks on the rows yet at this phase. + Since SHOW TABLE STATUS seems to call this function with the + HA_STATUS_TIME flag set, while the left join optimizer does not + set that flag, we add one to a zero value if the flag is not + set. That way SHOW TABLE STATUS will show the best estimate, + while the optimizer never sees the table empty. */ + + if (n_rows == 0 && !(flag & (HA_STATUS_TIME | HA_STATUS_OPEN))) { + n_rows++; + } + + /* Fix bug#40386: Not flushing query cache after truncate. + n_rows can not be 0 unless the table is empty, set to 1 + instead. The original problem of bug#29507 is actually + fixed in the server code. */ + if (thd_sql_command(m_user_thd) == SQLCOM_TRUNCATE) { + + n_rows = 1; + + /* We need to reset the m_prebuilt value too, otherwise + checks for values greater than the last value written + to the table will fail and the autoinc counter will + not be updated. This will force write_row() into + attempting an update of the table's AUTOINC counter. 
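+        (For example, after TRUNCATE TABLE t the next INSERT must be
+        able to produce the first AUTO_INCREMENT value again; a stale
+        cached autoinc_last_value would defeat that check.)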
*/ + + m_prebuilt->autoinc_last_value = 0; + } + + stats.records = (ha_rows) n_rows; + stats.deleted = 0; + if (fil_space_t* space = ib_table->space) { + const ulint size = space->physical_size(); + stats.data_file_length + = ulonglong(stat_clustered_index_size) + * size; + stats.index_file_length + = ulonglong(stat_sum_of_other_index_sizes) + * size; + space->s_lock(); + stats.delete_length = 1024 + * fsp_get_available_space_in_free_extents( + *space); + space->s_unlock(); + } + stats.check_time = 0; + stats.mrr_length_per_rec= (uint)ref_length + 8; // 8 = max(sizeof(void *)); + + if (stats.records == 0) { + stats.mean_rec_length = 0; + } else { + stats.mean_rec_length = (ulong) + (stats.data_file_length / stats.records); + } + } + + if (flag & HA_STATUS_CONST) { + /* Verify the number of index in InnoDB and MySQL + matches up. If m_prebuilt->clust_index_was_generated + holds, InnoDB defines GEN_CLUST_INDEX internally */ + ulint num_innodb_index = UT_LIST_GET_LEN(ib_table->indexes) + - m_prebuilt->clust_index_was_generated; + if (table->s->keys < num_innodb_index) { + /* If there are too many indexes defined + inside InnoDB, ignore those that are being + created, because MySQL will only consider + the fully built indexes here. */ + + for (const dict_index_t* index + = UT_LIST_GET_FIRST(ib_table->indexes); + index != NULL; + index = UT_LIST_GET_NEXT(indexes, index)) { + + /* First, online index creation is + completed inside InnoDB, and then + MySQL attempts to upgrade the + meta-data lock so that it can rebuild + the .frm file. If we get here in that + time frame, dict_index_is_online_ddl() + would not hold and the index would + still not be included in TABLE_SHARE. */ + if (!index->is_committed()) { + num_innodb_index--; + } + } + + if (table->s->keys < num_innodb_index + && innobase_fts_check_doc_id_index( + ib_table, NULL, NULL) + == FTS_EXIST_DOC_ID_INDEX) { + num_innodb_index--; + } + } + + if (table->s->keys != num_innodb_index) { + ib_table->dict_frm_mismatch = DICT_FRM_INCONSISTENT_KEYS; + ib_push_frm_error(m_user_thd, ib_table, table, num_innodb_index, true); + } + + snprintf(path, sizeof(path), "%s/%s%s", + mysql_data_home, table->s->normalized_path.str, + reg_ext); + + unpack_filename(path,path); + + /* Note that we do not know the access time of the table, + nor the CHECK TABLE time, nor the UPDATE or INSERT time. */ + + if (os_file_get_status( + path, &stat_info, false, + srv_read_only_mode) == DB_SUCCESS) { + stats.create_time = (ulong) stat_info.ctime; + } + + ib_table->stats_mutex_lock(); + auto _ = make_scope_exit([ib_table]() { + ib_table->stats_mutex_unlock(); }); + + ut_a(ib_table->stat_initialized); + + for (uint i = 0; i < table->s->keys; i++) { + ulong j; + + dict_index_t* index = innobase_get_index(i); + + if (index == NULL) { + ib_table->dict_frm_mismatch = DICT_FRM_INCONSISTENT_KEYS; + ib_push_frm_error(m_user_thd, ib_table, table, num_innodb_index, true); + break; + } + + KEY* key = &table->key_info[i]; + + for (j = 0; j < key->ext_key_parts; j++) { + + if ((key->flags & HA_FULLTEXT) + || (key->flags & HA_SPATIAL)) { + + /* The record per key does not apply to + FTS or Spatial indexes. */ + /* + key->rec_per_key[j] = 1; + key->set_records_per_key(j, 1.0); + */ + continue; + } + + if (j + 1 > index->n_uniq) { + sql_print_error( + "Index %s of %s has %u columns" + " unique inside InnoDB, but " + "server is asking statistics for" + " %lu columns. Have you mixed " + "up .frm files from different " + " installations? 
%s",
+                    index->name(),
+                    ib_table->name.m_name,
+                    index->n_uniq, j + 1,
+                    TROUBLESHOOTING_MSG);
+                break;
+            }
+
+            /* innodb_rec_per_key() will use
+            index->stat_n_diff_key_vals[] and the value we
+            pass index->table->stat_n_rows. Both are
+            calculated by ANALYZE and by the background
+            stats gathering thread (which kicks in when too
+            much of the table has been changed). In
+            addition table->stat_n_rows is adjusted with
+            each DML (e.g. ++ on row insert). Those
+            adjustments are not MVCC'ed and not even
+            reversed on rollback. So,
+            index->stat_n_diff_key_vals[] and
+            index->table->stat_n_rows could have been
+            calculated at different times. This is
+            acceptable. */
+
+            ulong rec_per_key_int = static_cast<ulong>(
+                innodb_rec_per_key(index, j,
+                                   stats.records));
+
+            /* Since MySQL seems to favor table scans
+            too much over index searches, we pretend
+            index selectivity is 2 times better than
+            our estimate: */
+
+            rec_per_key_int = rec_per_key_int / 2;
+
+            if (rec_per_key_int == 0) {
+                rec_per_key_int = 1;
+            }
+
+            key->rec_per_key[j] = rec_per_key_int;
+        }
+    }
+    }
+
+    if (srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN) {
+
+        goto func_exit;
+
+    } else if (flag & HA_STATUS_ERRKEY) {
+        const dict_index_t* err_index;
+
+        ut_a(m_prebuilt->trx);
+        ut_a(m_prebuilt->trx->magic_n == TRX_MAGIC_N);
+
+        err_index = trx_get_error_info(m_prebuilt->trx);
+
+        if (err_index) {
+            errkey = innobase_get_mysql_key_number_for_index(
+                table, ib_table, err_index);
+        } else {
+            errkey = (unsigned int) (
+                (m_prebuilt->trx->error_key_num
+                 == ULINT_UNDEFINED)
+                ? ~0U
+                : m_prebuilt->trx->error_key_num);
+        }
+    }
+
+    if ((flag & HA_STATUS_AUTO) && table->found_next_number_field) {
+        stats.auto_increment_value = innobase_peek_autoinc();
+    }
+
+func_exit:
+    m_prebuilt->trx->op_info = (char*)"";
+
+    DBUG_RETURN(0);
+}
+
+/*********************************************************************//**
+Returns statistics information of the table to the MySQL interpreter,
+in various fields of the handle object.
+@return HA_ERR_* error code or 0 */
+
+int
+ha_innobase::info(
+/*==============*/
+    uint    flag)   /*!< in: what information is requested */
+{
+    return(info_low(flag, false /* not ANALYZE */));
+}
+
+/*
+Updates index cardinalities of the table, based on random dives into
+each index tree. This does NOT calculate exact statistics on the table.
+@return HA_ADMIN_* error code or HA_ADMIN_OK */
+
+int
+ha_innobase::analyze(THD*, HA_CHECK_OPT*)
+{
+    /* Simply call info_low() with all the flags
+    and request recalculation of the statistics */
+    int ret = info_low(
+        HA_STATUS_TIME | HA_STATUS_CONST | HA_STATUS_VARIABLE,
+        true /* this is ANALYZE */);
+
+    if (ret != 0) {
+        return(HA_ADMIN_FAILED);
+    }
+
+    return(HA_ADMIN_OK);
+}
+
+/*****************************************************************//**
+Defragment table.
+@return error number */
+inline int ha_innobase::defragment_table()
+{
+  for (dict_index_t *index= dict_table_get_first_index(m_prebuilt->table);
+       index; index= dict_table_get_next_index(index))
+  {
+    if (!index->is_btree())
+      continue;
+
+    if (btr_defragment_find_index(index))
+    {
+      // We borrow this error code. When the same index is already in
+      // the defragmentation queue, issuing another defragmentation
+      // only introduces overhead. We return an error here to let the
+      // user know this is not necessary. Note that this will fail a
+      // query that is trying to defragment a full table if one of the
+      // indices in that table is already being defragmented. We
+      // choose this behavior so that the user is aware of it rather
+      // than silently defragmenting the other indices of that table.
+      return ER_SP_ALREADY_EXISTS;
+    }
+
+    btr_pcur_t pcur;
+
+    mtr_t mtr;
+    mtr.start();
+    if (dberr_t err= pcur.open_leaf(true, index, BTR_SEARCH_LEAF, &mtr))
+    {
+      mtr.commit();
+      return convert_error_code_to_mysql(err, 0, m_user_thd);
+    }
+    else if (btr_pcur_get_block(&pcur)->page.id().page_no() == index->page)
+    {
+      mtr.commit();
+      continue;
+    }
+
+    btr_pcur_move_to_next(&pcur, &mtr);
+    btr_pcur_store_position(&pcur, &mtr);
+    mtr.commit();
+    ut_ad(pcur.index() == index);
+    const bool interrupted= btr_defragment_add_index(&pcur, m_user_thd);
+    ut_free(pcur.old_rec_buf);
+    if (interrupted)
+      return ER_QUERY_INTERRUPTED;
+  }
+
+  return 0;
+}
+
+/**********************************************************************//**
+This is mapped to "ALTER TABLE tablename ENGINE=InnoDB", which rebuilds
+the table in MySQL. */
+
+int
+ha_innobase::optimize(
+/*==================*/
+    THD*        thd,    /*!< in: connection thread handle */
+    HA_CHECK_OPT*)
+{
+
+    /* FTS-FIXME: Since MySQL doesn't support engine-specific commands,
+    we have to hijack some existing command in order to be able to test
+    the new admin commands added in InnoDB's FTS support. For now, we
+    use MySQL's OPTIMIZE command, normally mapped to ALTER TABLE in
+    InnoDB (so it recreates the table anew), and map it to OPTIMIZE.
+
+    This works OK otherwise, but MySQL locks the entire table during
+    calls to OPTIMIZE, which is undesirable. */
+    bool try_alter = true;
+
+    if (!m_prebuilt->table->is_temporary()
+        && m_prebuilt->table->is_readable()
+        && srv_defragment) {
+        int err = defragment_table();
+
+        if (err == 0) {
+            try_alter = false;
+        } else {
+            push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+                                uint(err),
+                                "InnoDB: Cannot defragment table %s: returned error code %d\n",
+                                m_prebuilt->table->name.m_name, err);
+
+            if (err == ER_SP_ALREADY_EXISTS) {
+                try_alter = false;
+            }
+        }
+    }
+
+    if (innodb_optimize_fulltext_only) {
+        if (m_prebuilt->table->fts && m_prebuilt->table->fts->cache
+            && m_prebuilt->table->space) {
+            fts_sync_table(m_prebuilt->table);
+            fts_optimize_table(m_prebuilt->table);
+        }
+        try_alter = false;
+    }
+
+    return try_alter ? HA_ADMIN_TRY_ALTER : HA_ADMIN_OK;
+}
+
+/*******************************************************************//**
+Tries to check that an InnoDB table is not corrupted. If corruption is
+noticed, prints information about it to stderr. In case of corruption
+it may also assert a failure and crash the server.
+@return HA_ADMIN_CORRUPT or HA_ADMIN_OK */ + +int +ha_innobase::check( +/*===============*/ + THD* thd, /*!< in: user thread handle */ + HA_CHECK_OPT* check_opt) /*!< in: check options */ +{ + ulint n_rows; + ulint n_rows_in_table = ULINT_UNDEFINED; + bool is_ok = true; + dberr_t ret; + + DBUG_ENTER("ha_innobase::check"); + DBUG_ASSERT(thd == ha_thd()); + DBUG_ASSERT(thd == m_user_thd); + ut_a(m_prebuilt->trx->magic_n == TRX_MAGIC_N); + ut_a(m_prebuilt->trx == thd_to_trx(thd)); + ut_ad(m_prebuilt->trx->mysql_thd == thd); + + if (m_prebuilt->mysql_template == NULL) { + /* Build the template; we will use a dummy template + in index scans done in checking */ + + build_template(true); + } + + if (!m_prebuilt->table->space) { + ib_senderrf( + thd, + IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_DISCARDED, + table->s->table_name.str); + + DBUG_RETURN(HA_ADMIN_CORRUPT); + } else if (!m_prebuilt->table->is_readable()) { + ib_senderrf( + thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_MISSING, + table->s->table_name.str); + + DBUG_RETURN(HA_ADMIN_CORRUPT); + } + + m_prebuilt->trx->op_info = "checking table"; + + uint old_isolation_level = m_prebuilt->trx->isolation_level; + + /* We must run the index record counts at an isolation level + >= READ COMMITTED, because a dirty read can see a wrong number + of records in some index; to play safe, we normally use + REPEATABLE READ here */ + m_prebuilt->trx->isolation_level = high_level_read_only + && !m_prebuilt->table->is_temporary() + ? TRX_ISO_READ_UNCOMMITTED + : TRX_ISO_REPEATABLE_READ; + + trx_start_if_not_started(m_prebuilt->trx, false); + m_prebuilt->trx->read_view.open(m_prebuilt->trx); + + for (dict_index_t* index + = dict_table_get_first_index(m_prebuilt->table); + index; + index = dict_table_get_next_index(index)) { + /* If this is an index being created or dropped, skip */ + if (!index->is_committed()) { + continue; + } + if (index->type & DICT_FTS) { + /* We do not check any FULLTEXT INDEX. */ + continue; + } + + if ((check_opt->flags & T_QUICK) || index->is_corrupted()) { + } else if (trx_id_t bulk_trx_id = + m_prebuilt->table->bulk_trx_id) { + if (!m_prebuilt->trx->read_view.changes_visible( + bulk_trx_id)) { + is_ok = true; + goto func_exit; + } + + if (btr_validate_index(index, m_prebuilt->trx) + != DB_SUCCESS) { + is_ok = false; + push_warning_printf( + thd, + Sql_condition::WARN_LEVEL_WARN, + ER_NOT_KEYFILE, + "InnoDB: The B-tree of" + " index %s is corrupted.", + index->name()); + continue; + } + } + + /* Instead of invoking change_active_index(), set up + a dummy template for non-locking reads, disabling + access to the clustered index. 
*/ + m_prebuilt->index = index; + + m_prebuilt->index_usable = row_merge_is_index_usable( + m_prebuilt->trx, m_prebuilt->index); + + DBUG_EXECUTE_IF( + "dict_set_index_corrupted", + if (!index->is_primary()) { + m_prebuilt->index_usable = FALSE; + dict_set_corrupted(index, + "dict_set_index_corrupted"); + }); + + if (UNIV_UNLIKELY(!m_prebuilt->index_usable)) { + if (index->is_corrupted()) { + push_warning_printf( + thd, + Sql_condition::WARN_LEVEL_WARN, + HA_ERR_INDEX_CORRUPT, + "InnoDB: Index %s is marked as" + " corrupted", + index->name()); + is_ok = false; + } else { + push_warning_printf( + thd, + Sql_condition::WARN_LEVEL_WARN, + HA_ERR_TABLE_DEF_CHANGED, + "InnoDB: Insufficient history for" + " index %s", + index->name()); + } + continue; + } + + m_prebuilt->sql_stat_start = TRUE; + m_prebuilt->template_type = ROW_MYSQL_DUMMY_TEMPLATE; + m_prebuilt->n_template = 0; + m_prebuilt->read_just_key = 0; + m_prebuilt->autoinc_error = DB_SUCCESS; + m_prebuilt->need_to_access_clustered = + !!(check_opt->flags & T_EXTEND); + + dtuple_set_n_fields(m_prebuilt->search_tuple, 0); + + m_prebuilt->select_lock_type = LOCK_NONE; + + /* Scan this index. */ + if (index->is_spatial()) { + ret = row_count_rtree_recs(m_prebuilt, &n_rows); + } else if (index->type & DICT_FTS) { + ret = DB_SUCCESS; + } else { + ret = row_check_index(m_prebuilt, &n_rows); + } + + DBUG_EXECUTE_IF( + "dict_set_index_corrupted", + if (!index->is_primary()) { + ret = DB_CORRUPTION; + }); + + if (ret == DB_INTERRUPTED || thd_killed(thd)) { + /* Do not report error since this could happen + during shutdown */ + break; + } + + if (ret == DB_SUCCESS + && m_prebuilt->autoinc_error != DB_MISSING_HISTORY) { + /* See if any non-fatal errors were reported. */ + ret = m_prebuilt->autoinc_error; + } + + if (ret != DB_SUCCESS) { + /* Assume some kind of corruption. */ + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_NOT_KEYFILE, + "InnoDB: The B-tree of" + " index %s is corrupted.", + index->name()); + is_ok = false; + dict_set_corrupted(index, "CHECK TABLE-check index"); + } + + + if (index == dict_table_get_first_index(m_prebuilt->table)) { + n_rows_in_table = n_rows; + } else if (!(index->type & DICT_FTS) + && (n_rows != n_rows_in_table)) { + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_NOT_KEYFILE, + "InnoDB: Index '%-.200s' contains " ULINTPF + " entries, should be " ULINTPF ".", + index->name(), n_rows, n_rows_in_table); + is_ok = false; + dict_set_corrupted(index, "CHECK TABLE; Wrong count"); + } + } + + /* Restore the original isolation level */ + m_prebuilt->trx->isolation_level = old_isolation_level; +#ifdef BTR_CUR_HASH_ADAPT +# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + /* We validate the whole adaptive hash index for all tables + at every CHECK TABLE only when QUICK flag is not present. */ + + if (!(check_opt->flags & T_QUICK) + && !btr_search_validate(m_prebuilt->trx->mysql_thd)) { + push_warning(thd, Sql_condition::WARN_LEVEL_WARN, + ER_NOT_KEYFILE, + "InnoDB: The adaptive hash index is corrupted."); + is_ok = false; + } +# endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */ +#endif /* BTR_CUR_HASH_ADAPT */ +func_exit: + m_prebuilt->trx->op_info = ""; + + DBUG_RETURN(is_ok ? HA_ADMIN_OK : HA_ADMIN_CORRUPT); +} + +/*******************************************************************//** +Gets the foreign key create info for a table stored in InnoDB. 
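+This is the text that the server appends to SHOW CREATE TABLE output.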
+@return own: character string in the form that can be inserted into the
+CREATE TABLE statement, MUST be freed with
+ha_innobase::free_foreign_key_create_info */
+
+char*
+ha_innobase::get_foreign_key_create_info(void)
+/*==========================================*/
+{
+    ut_a(m_prebuilt != NULL);
+
+    /* We do not know if MySQL can call this function before calling
+    external_lock(). To be safe, update the thd of the current table
+    handle. */
+
+    update_thd(ha_thd());
+
+    m_prebuilt->trx->op_info = "getting info on foreign keys";
+
+    /* Output the data to a temporary string */
+    std::string str = dict_print_info_on_foreign_keys(
+        TRUE, m_prebuilt->trx,
+        m_prebuilt->table);
+
+    m_prebuilt->trx->op_info = "";
+
+    /* Allocate buffer for the string */
+    char* fk_str = reinterpret_cast<char*>(
+        my_malloc(PSI_INSTRUMENT_ME, str.length() + 1, MYF(0)));
+
+    if (fk_str) {
+        memcpy(fk_str, str.c_str(), str.length());
+        fk_str[str.length()]='\0';
+    }
+
+    return(fk_str);
+}
+
+
+/***********************************************************************//**
+Maps an InnoDB foreign key constraint to an equivalent MySQL foreign key info.
+@return pointer to foreign key info */
+static
+FOREIGN_KEY_INFO*
+get_foreign_key_info(
+/*=================*/
+    THD*            thd,    /*!< in: user thread handle */
+    dict_foreign_t* foreign)/*!< in: foreign key constraint */
+{
+    FOREIGN_KEY_INFO    f_key_info;
+    FOREIGN_KEY_INFO*   pf_key_info;
+    uint                i = 0;
+    size_t              len;
+    char                tmp_buff[NAME_LEN+1];
+    char                name_buff[NAME_LEN+1];
+    const char*         ptr;
+    LEX_CSTRING*        referenced_key_name;
+    LEX_CSTRING*        name = NULL;
+
+    if (dict_table_t::is_temporary_name(foreign->foreign_table_name)) {
+        return NULL;
+    }
+
+    ptr = dict_remove_db_name(foreign->id);
+    f_key_info.foreign_id = thd_make_lex_string(
+        thd, 0, ptr, strlen(ptr), 1);
+
+    /* Name format: database name, '/', table name, '\0' */
+
+    /* Referenced (parent) database name */
+    len = dict_get_db_name_len(foreign->referenced_table_name);
+    ut_a(len < sizeof(tmp_buff));
+    memcpy(tmp_buff, foreign->referenced_table_name, len);
+    tmp_buff[len] = 0;
+
+    len = filename_to_tablename(tmp_buff, name_buff, sizeof(name_buff));
+    f_key_info.referenced_db = thd_make_lex_string(
+        thd, 0, name_buff, len, 1);
+
+    /* Referenced (parent) table name */
+    ptr = dict_remove_db_name(foreign->referenced_table_name);
+    len = filename_to_tablename(ptr, name_buff, sizeof(name_buff), 1);
+    f_key_info.referenced_table = thd_make_lex_string(
+        thd, 0, name_buff, len, 1);
+
+    /* Dependent (child) database name */
+    len = dict_get_db_name_len(foreign->foreign_table_name);
+    ut_a(len < sizeof(tmp_buff));
+    memcpy(tmp_buff, foreign->foreign_table_name, len);
+    tmp_buff[len] = 0;
+
+    len = filename_to_tablename(tmp_buff, name_buff, sizeof(name_buff));
+    f_key_info.foreign_db = thd_make_lex_string(
+        thd, 0, name_buff, len, 1);
+
+    /* Dependent (child) table name */
+    ptr = dict_remove_db_name(foreign->foreign_table_name);
+    len = filename_to_tablename(ptr, name_buff, sizeof(name_buff), 1);
+    f_key_info.foreign_table = thd_make_lex_string(
+        thd, 0, name_buff, len, 1);
+
+    do {
+        ptr = foreign->foreign_col_names[i];
+        name = thd_make_lex_string(thd, name, ptr,
+                                   strlen(ptr), 1);
+        f_key_info.foreign_fields.push_back(name);
+        ptr = foreign->referenced_col_names[i];
+        name = thd_make_lex_string(thd, name, ptr,
+                                   strlen(ptr), 1);
+        f_key_info.referenced_fields.push_back(name);
+    } while (++i < foreign->n_fields);
+
+    if (foreign->type & DICT_FOREIGN_ON_DELETE_CASCADE) {
+        f_key_info.delete_method = FK_OPTION_CASCADE;
+    } else if
(foreign->type & DICT_FOREIGN_ON_DELETE_SET_NULL) { + f_key_info.delete_method = FK_OPTION_SET_NULL; + } else if (foreign->type & DICT_FOREIGN_ON_DELETE_NO_ACTION) { + f_key_info.delete_method = FK_OPTION_NO_ACTION; + } else { + f_key_info.delete_method = FK_OPTION_RESTRICT; + } + + + if (foreign->type & DICT_FOREIGN_ON_UPDATE_CASCADE) { + f_key_info.update_method = FK_OPTION_CASCADE; + } else if (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL) { + f_key_info.update_method = FK_OPTION_SET_NULL; + } else if (foreign->type & DICT_FOREIGN_ON_UPDATE_NO_ACTION) { + f_key_info.update_method = FK_OPTION_NO_ACTION; + } else { + f_key_info.update_method = FK_OPTION_RESTRICT; + } + + /* Load referenced table to update FK referenced key name. */ + if (foreign->referenced_table == NULL) { + + dict_table_t* ref_table = dict_table_open_on_name( + foreign->referenced_table_name_lookup, + true, DICT_ERR_IGNORE_NONE); + + if (ref_table == NULL) { + + if (!thd_test_options( + thd, OPTION_NO_FOREIGN_KEY_CHECKS)) { + ib::info() + << "Foreign Key referenced table " + << foreign->referenced_table_name + << " not found for foreign table " + << foreign->foreign_table_name; + } + } else { + dict_table_close(ref_table, true); + } + } + + if (foreign->referenced_index + && foreign->referenced_index->name != NULL) { + referenced_key_name = thd_make_lex_string( + thd, + f_key_info.referenced_key_name, + foreign->referenced_index->name, + strlen(foreign->referenced_index->name), + 1); + } else { + referenced_key_name = NULL; + } + + f_key_info.referenced_key_name = referenced_key_name; + + pf_key_info = (FOREIGN_KEY_INFO*) thd_memdup(thd, &f_key_info, + sizeof(FOREIGN_KEY_INFO)); + + return(pf_key_info); +} + +/*******************************************************************//** +Gets the list of foreign keys in this table. +@return always 0, that is, always succeeds */ + +int +ha_innobase::get_foreign_key_list( +/*==============================*/ + THD* thd, /*!< in: user thread handle */ + List* f_key_list) /*!< out: foreign key list */ +{ + update_thd(ha_thd()); + + m_prebuilt->trx->op_info = "getting list of foreign keys"; + + dict_sys.lock(SRW_LOCK_CALL); + + for (dict_foreign_set::iterator it + = m_prebuilt->table->foreign_set.begin(); + it != m_prebuilt->table->foreign_set.end(); + ++it) { + + FOREIGN_KEY_INFO* pf_key_info; + dict_foreign_t* foreign = *it; + + pf_key_info = get_foreign_key_info(thd, foreign); + + if (pf_key_info != NULL) { + f_key_list->push_back(pf_key_info); + } + } + + dict_sys.unlock(); + + m_prebuilt->trx->op_info = ""; + + return(0); +} + +/*******************************************************************//** +Gets the set of foreign keys where this table is the referenced table. 
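+In other words, lists the FOREIGN KEY constraints of child tables that
+point to this table.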
+@return always 0, that is, always succeeds */ + +int +ha_innobase::get_parent_foreign_key_list( +/*=====================================*/ + THD* thd, /*!< in: user thread handle */ + List* f_key_list) /*!< out: foreign key list */ +{ + update_thd(ha_thd()); + + m_prebuilt->trx->op_info = "getting list of referencing foreign keys"; + + dict_sys.freeze(SRW_LOCK_CALL); + + for (dict_foreign_set::iterator it + = m_prebuilt->table->referenced_set.begin(); + it != m_prebuilt->table->referenced_set.end(); + ++it) { + + FOREIGN_KEY_INFO* pf_key_info; + dict_foreign_t* foreign = *it; + + pf_key_info = get_foreign_key_info(thd, foreign); + + if (pf_key_info != NULL) { + f_key_list->push_back(pf_key_info); + } + } + + dict_sys.unfreeze(); + + m_prebuilt->trx->op_info = ""; + + return(0); +} + +/** Table list item structure is used to store only the table +and name. It is used by get_cascade_foreign_key_table_list to store +the intermediate result for fetching the table set. */ +struct table_list_item { + /** InnoDB table object */ + const dict_table_t* table; + /** Table name */ + const char* name; +}; + +/** @return whether ALTER TABLE may change the storage engine of the table */ +bool ha_innobase::can_switch_engines() +{ + DBUG_ENTER("ha_innobase::can_switch_engines"); + update_thd(); + DBUG_RETURN(m_prebuilt->table->foreign_set.empty() && + m_prebuilt->table->referenced_set.empty()); +} + +/*******************************************************************//** +Checks if a table is referenced by a foreign key. The MySQL manual states that +a REPLACE is either equivalent to an INSERT, or DELETE(s) + INSERT. Only a +delete is then allowed internally to resolve a duplicate key conflict in +REPLACE, not an update. +@return > 0 if referenced by a FOREIGN KEY */ + +uint ha_innobase::referenced_by_foreign_key() +{ + dict_sys.freeze(SRW_LOCK_CALL); + const bool empty= m_prebuilt->table->referenced_set.empty(); + dict_sys.unfreeze(); + return !empty; +} + +/*******************************************************************//** +Tells something additional to the handler about how to do things. +@return 0 or error number */ + +int +ha_innobase::extra( +/*===============*/ + enum ha_extra_function operation) + /*!< in: HA_EXTRA_FLUSH or some other flag */ +{ + /* Warning: since it is not sure that MariaDB calls external_lock() + before calling this function, m_prebuilt->trx can be obsolete! */ + trx_t* trx = check_trx_exists(ha_thd()); + + switch (operation) { + case HA_EXTRA_FLUSH: + if (m_prebuilt->blob_heap) { + row_mysql_prebuilt_free_blob_heap(m_prebuilt); + } + break; + case HA_EXTRA_RESET_STATE: + reset_template(); + trx->duplicates = 0; + stmt_boundary: + trx->bulk_insert_apply(); + trx->end_bulk_insert(*m_prebuilt->table); + trx->bulk_insert = false; + break; + case HA_EXTRA_NO_KEYREAD: + m_prebuilt->read_just_key = 0; + break; + case HA_EXTRA_KEYREAD: + m_prebuilt->read_just_key = 1; + break; + case HA_EXTRA_KEYREAD_PRESERVE_FIELDS: + m_prebuilt->keep_other_fields_on_keyread = 1; + break; + case HA_EXTRA_INSERT_WITH_UPDATE: + trx->duplicates |= TRX_DUP_IGNORE; + goto stmt_boundary; + case HA_EXTRA_NO_IGNORE_DUP_KEY: + trx->duplicates &= ~TRX_DUP_IGNORE; + if (trx->is_bulk_insert()) { + /* Allow a subsequent INSERT into an empty table + if !unique_checks && !foreign_key_checks. 
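+        (That is, the bulk insert state created by SET
+        unique_checks=0, foreign_key_checks=0 before inserting into
+        an empty table.)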
*/ + if (dberr_t err = trx->bulk_insert_apply()) { + return err; + } + break; + } + goto stmt_boundary; + case HA_EXTRA_WRITE_CAN_REPLACE: + trx->duplicates |= TRX_DUP_REPLACE; + goto stmt_boundary; + case HA_EXTRA_WRITE_CANNOT_REPLACE: + trx->duplicates &= ~TRX_DUP_REPLACE; + if (trx->is_bulk_insert()) { + /* Allow a subsequent INSERT into an empty table + if !unique_checks && !foreign_key_checks. */ + break; + } + goto stmt_boundary; + case HA_EXTRA_BEGIN_ALTER_COPY: + m_prebuilt->table->skip_alter_undo = 1; + if (m_prebuilt->table->is_temporary() + || !m_prebuilt->table->versioned_by_id()) { + break; + } + ut_ad(trx == m_prebuilt->trx); + trx_start_if_not_started(trx, true); + trx->mod_tables.emplace( + const_cast(m_prebuilt->table), 0) + .first->second.set_versioned(0); + break; + case HA_EXTRA_END_ALTER_COPY: + m_prebuilt->table->skip_alter_undo = 0; + if (!m_prebuilt->table->is_temporary()) { + log_buffer_flush_to_disk(); + } + break; + default:/* Do nothing */ + ; + } + + return(0); +} + +/** +MySQL calls this method at the end of each statement */ +int +ha_innobase::reset() +{ + if (m_prebuilt->blob_heap) { + row_mysql_prebuilt_free_blob_heap(m_prebuilt); + } + + reset_template(); + + m_ds_mrr.dsmrr_close(); + + /* TODO: This should really be reset in reset_template() but for now + it's safer to do it explicitly here. */ + + /* This is a statement level counter. */ + m_prebuilt->autoinc_last_value = 0; + + m_prebuilt->skip_locked = false; + return(0); +} + +/******************************************************************//** +MySQL calls this function at the start of each SQL statement inside LOCK +TABLES. Inside LOCK TABLES the ::external_lock method does not work to +mark SQL statement borders. Note also a special case: if a temporary table +is created inside LOCK TABLES, MySQL has not called external_lock() at all +on that table. +MySQL-5.0 also calls this before each statement in an execution of a stored +procedure. To make the execution more deterministic for binlogging, MySQL-5.0 +locks all tables involved in a stored procedure with full explicit table +locks (thd_in_lock_tables(thd) holds in store_lock()) before executing the +procedure. +@return 0 or error code */ + +int +ha_innobase::start_stmt( +/*====================*/ + THD* thd, /*!< in: handle to the user thread */ + thr_lock_type lock_type) +{ + trx_t* trx = m_prebuilt->trx; + + DBUG_ENTER("ha_innobase::start_stmt"); + + update_thd(thd); + + ut_ad(m_prebuilt->table != NULL); + + trx = m_prebuilt->trx; + + /* Reset the AUTOINC statement level counter for multi-row INSERTs. */ + trx->n_autoinc_rows = 0; + + const auto sql_command = thd_sql_command(thd); + + m_prebuilt->hint_need_to_fetch_extra_cols = 0; + reset_template(); + + switch (sql_command) { + case SQLCOM_INSERT: + case SQLCOM_INSERT_SELECT: + if (trx->is_bulk_insert()) { + /* Allow a subsequent INSERT into an empty table + if !unique_checks && !foreign_key_checks. */ + break; + } + /* fall through */ + default: + trx->end_bulk_insert(*m_prebuilt->table); + if (!trx->bulk_insert) { + break; + } + + /* Trigger could've initiated another stmt. 
+ So apply all bulk operation and mark as + end bulk insert for all tables */ + trx->bulk_insert_apply(); + trx->end_bulk_insert(); + trx->bulk_insert = false; + trx->last_sql_stat_start.least_undo_no = trx->undo_no; + } + + m_prebuilt->sql_stat_start = TRUE; + + if (m_prebuilt->table->is_temporary() + && m_mysql_has_locked + && m_prebuilt->select_lock_type == LOCK_NONE) { + switch (sql_command) { + case SQLCOM_INSERT: + case SQLCOM_UPDATE: + case SQLCOM_DELETE: + case SQLCOM_REPLACE: + init_table_handle_for_HANDLER(); + m_prebuilt->select_lock_type = LOCK_X; + m_prebuilt->stored_select_lock_type = LOCK_X; + if (dberr_t error = row_lock_table(m_prebuilt)) { + DBUG_RETURN(convert_error_code_to_mysql( + error, 0, thd)); + } + break; + } + } + + if (!m_mysql_has_locked) { + /* This handle is for a temporary table created inside + this same LOCK TABLES; since MySQL does NOT call external_lock + in this case, we must use x-row locks inside InnoDB to be + prepared for an update of a row */ + + m_prebuilt->select_lock_type = LOCK_X; + + } else if (sql_command == SQLCOM_SELECT + && lock_type == TL_READ + && trx->isolation_level != TRX_ISO_SERIALIZABLE) { + + /* For other than temporary tables, we obtain + no lock for consistent read (plain SELECT). */ + + m_prebuilt->select_lock_type = LOCK_NONE; + } else { + /* Not a consistent read: restore the + select_lock_type value. The value of + stored_select_lock_type was decided in: + 1) ::store_lock(), + 2) ::external_lock(), + 3) ::init_table_handle_for_HANDLER(). */ + + ut_a(m_prebuilt->stored_select_lock_type != LOCK_NONE_UNSET); + + m_prebuilt->select_lock_type = + m_prebuilt->stored_select_lock_type; + } + + *trx->detailed_error = 0; + + innobase_register_trx(ht, thd, trx); + + if (!trx_is_started(trx)) { + trx->will_lock = true; + } + + DBUG_RETURN(0); +} + +/******************************************************************//** +Maps a MySQL trx isolation level code to the InnoDB isolation level code +@return InnoDB isolation level */ +static inline +uint +innobase_map_isolation_level( +/*=========================*/ + enum_tx_isolation iso) /*!< in: MySQL isolation level code */ +{ + if (UNIV_UNLIKELY(srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN) + || UNIV_UNLIKELY(srv_read_only_mode)) { + return TRX_ISO_READ_UNCOMMITTED; + } + switch (iso) { + case ISO_REPEATABLE_READ: return(TRX_ISO_REPEATABLE_READ); + case ISO_READ_COMMITTED: return(TRX_ISO_READ_COMMITTED); + case ISO_SERIALIZABLE: return(TRX_ISO_SERIALIZABLE); + case ISO_READ_UNCOMMITTED: return(TRX_ISO_READ_UNCOMMITTED); + } + + ut_error; + + return(0); +} + +/******************************************************************//** +As MySQL will execute an external lock for every new table it uses when it +starts to process an SQL statement (an exception is when MySQL calls +start_stmt for the handle) we can use this function to store the pointer to +the THD in the handle. We will also use this function to communicate +to InnoDB that a new SQL statement has started and that we must store a +savepoint to our transaction handle, so that we are able to roll back +the SQL statement in case of an error. 
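+For example, for a single-table statement the SQL layer calls
+external_lock(thd, F_WRLCK or F_RDLCK) before any rows are accessed and
+external_lock(thd, F_UNLCK) when the statement ends; outside of LOCK TABLES,
+the F_UNLCK call is how InnoDB detects the statement boundary.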
+@return 0 */ + +int +ha_innobase::external_lock( +/*=======================*/ + THD* thd, /*!< in: handle to the user thread */ + int lock_type) /*!< in: lock type */ +{ + DBUG_ENTER("ha_innobase::external_lock"); + DBUG_PRINT("enter",("lock_type: %d", lock_type)); + + update_thd(thd); + trx_t* trx = m_prebuilt->trx; + ut_ad(m_prebuilt->table); + + /* Statement based binlogging does not work in isolation level + READ UNCOMMITTED and READ COMMITTED since the necessary + locks cannot be taken. In this case, we print an + informative error message and return with an error. + Note: decide_logging_format would give the same error message, + except it cannot give the extra details. */ + + if (lock_type == F_WRLCK + && !(table_flags() & HA_BINLOG_STMT_CAPABLE) + && thd_binlog_format(thd) == BINLOG_FORMAT_STMT + && thd_binlog_filter_ok(thd) + && thd_sqlcom_can_generate_row_events(thd)) { + bool skip = false; +#ifdef WITH_WSREP + skip = trx->is_wsrep() && !wsrep_thd_is_local(thd); +#endif /* WITH_WSREP */ + /* used by test case */ + DBUG_EXECUTE_IF("no_innodb_binlog_errors", skip = true;); + + if (!skip) { + my_error(ER_BINLOG_STMT_MODE_AND_ROW_ENGINE, MYF(0), + " InnoDB is limited to row-logging when" + " transaction isolation level is" + " READ COMMITTED or READ UNCOMMITTED."); + + DBUG_RETURN(HA_ERR_LOGGING_IMPOSSIBLE); + } + } + + const auto sql_command = thd_sql_command(thd); + + /* Check for UPDATEs in read-only mode. */ + if (srv_read_only_mode) { + switch (sql_command) { + case SQLCOM_CREATE_TABLE: + if (lock_type != F_WRLCK) { + break; + } + /* fall through */ + case SQLCOM_UPDATE: + case SQLCOM_INSERT: + case SQLCOM_REPLACE: + case SQLCOM_DROP_TABLE: + case SQLCOM_ALTER_TABLE: + case SQLCOM_OPTIMIZE: + case SQLCOM_CREATE_INDEX: + case SQLCOM_DROP_INDEX: + case SQLCOM_CREATE_SEQUENCE: + case SQLCOM_DROP_SEQUENCE: + case SQLCOM_DELETE: + ib_senderrf(thd, IB_LOG_LEVEL_WARN, + ER_READ_ONLY_MODE); + DBUG_RETURN(HA_ERR_TABLE_READONLY); + } + } + + m_prebuilt->sql_stat_start = TRUE; + m_prebuilt->hint_need_to_fetch_extra_cols = 0; + + reset_template(); + switch (sql_command) { + case SQLCOM_INSERT: + case SQLCOM_INSERT_SELECT: + if (trx->is_bulk_insert()) { + /* Allow a subsequent INSERT into an empty table + if !unique_checks && !foreign_key_checks. */ + break; + } + /* fall through */ + default: + trx->end_bulk_insert(*m_prebuilt->table); + if (!trx->bulk_insert) { + break; + } + trx->bulk_insert = false; + trx->last_sql_stat_start.least_undo_no = trx->undo_no; + } + + switch (m_prebuilt->table->quiesce) { + case QUIESCE_START: + /* Check for FLUSH TABLE t WITH READ LOCK; */ + if (!srv_read_only_mode + && sql_command == SQLCOM_FLUSH + && lock_type == F_RDLCK) { + + if (!m_prebuilt->table->space) { + ib_senderrf(trx->mysql_thd, IB_LOG_LEVEL_ERROR, + ER_TABLESPACE_DISCARDED, + table->s->table_name.str); + + DBUG_RETURN(HA_ERR_TABLESPACE_MISSING); + } + + row_quiesce_table_start(m_prebuilt->table, trx); + + /* Use the transaction instance to track UNLOCK + TABLES. It can be done via START TRANSACTION; too + implicitly. */ + + ++trx->flush_tables; + } + break; + + case QUIESCE_COMPLETE: + /* Check for UNLOCK TABLES; implicit or explicit + or trx interruption. */ + if (trx->flush_tables > 0 + && (lock_type == F_UNLCK || trx_is_interrupted(trx))) { + + row_quiesce_table_complete(m_prebuilt->table, trx); + + ut_a(trx->flush_tables > 0); + --trx->flush_tables; + } + + break; + + case QUIESCE_NONE: + break; + } + + if (lock_type == F_WRLCK) { + + /* If this is a SELECT, then it is in UPDATE TABLE ... 
+ or SELECT ... FOR UPDATE */ + m_prebuilt->select_lock_type = LOCK_X; + m_prebuilt->stored_select_lock_type = LOCK_X; + } + + if (lock_type != F_UNLCK) { + /* MySQL is setting a new table lock */ + + *trx->detailed_error = 0; + + innobase_register_trx(ht, thd, trx); + + if (trx->isolation_level == TRX_ISO_SERIALIZABLE + && m_prebuilt->select_lock_type == LOCK_NONE + && thd_test_options( + thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) { + + /* To get serializable execution, we let InnoDB + conceptually add 'LOCK IN SHARE MODE' to all SELECTs + which otherwise would have been consistent reads. An + exception is consistent reads in the AUTOCOMMIT=1 mode: + we know that they are read-only transactions, and they + can be serialized also if performed as consistent + reads. */ + + m_prebuilt->select_lock_type = LOCK_S; + m_prebuilt->stored_select_lock_type = LOCK_S; + } + + /* Starting from 4.1.9, no InnoDB table lock is taken in LOCK + TABLES if AUTOCOMMIT=1. It does not make much sense to acquire + an InnoDB table lock if it is released immediately at the end + of LOCK TABLES, and InnoDB's table locks in that case cause + VERY easily deadlocks. + + We do not set InnoDB table locks if user has not explicitly + requested a table lock. Note that thd_in_lock_tables(thd) + can hold in some cases, e.g., at the start of a stored + procedure call (SQLCOM_CALL). */ + + if (m_prebuilt->select_lock_type != LOCK_NONE) { + + if (sql_command == SQLCOM_LOCK_TABLES + && THDVAR(thd, table_locks) + && thd_test_options(thd, OPTION_NOT_AUTOCOMMIT) + && thd_in_lock_tables(thd)) { + + dberr_t error = row_lock_table(m_prebuilt); + + if (error != DB_SUCCESS) { + + DBUG_RETURN( + convert_error_code_to_mysql( + error, 0, thd)); + } + } + + trx->mysql_n_tables_locked++; + } + + trx->n_mysql_tables_in_use++; + m_mysql_has_locked = true; + + if (!trx_is_started(trx) + && (m_prebuilt->select_lock_type != LOCK_NONE + || m_prebuilt->stored_select_lock_type != LOCK_NONE)) { + + trx->will_lock = true; + } + + DBUG_RETURN(0); + } else { + DEBUG_SYNC_C("ha_innobase_end_statement"); + } + + /* MySQL is releasing a table lock */ + + trx->n_mysql_tables_in_use--; + m_mysql_has_locked = false; + + /* If the MySQL lock count drops to zero we know that the current SQL + statement has ended */ + + if (trx->n_mysql_tables_in_use == 0) { + + trx->mysql_n_tables_locked = 0; + m_prebuilt->used_in_HANDLER = FALSE; + + if (!thd_test_options( + thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) { + + if (trx_is_started(trx)) { + + innobase_commit(ht, thd, TRUE); + } + + } else if (trx->isolation_level <= TRX_ISO_READ_COMMITTED) { + trx->read_view.close(); + } + } + + if (!trx_is_started(trx) + && lock_type != F_UNLCK + && (m_prebuilt->select_lock_type != LOCK_NONE + || m_prebuilt->stored_select_lock_type != LOCK_NONE)) { + + trx->will_lock = true; + } + + DBUG_RETURN(0); +} + +/************************************************************************//** +Here we export InnoDB status variables to MySQL. */ +static +void +innodb_export_status() +/*==================*/ +{ + if (srv_was_started) { + srv_export_innodb_status(); + } +} + +/************************************************************************//** +Implements the SHOW ENGINE INNODB STATUS command. Sends the output of the +InnoDB Monitor to the client. 
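+At most MAX_STATUS_SIZE (1MiB) of monitor output is returned; longer output
+is truncated, preferably by omitting the beginning of the list of active
+transactions rather than the end of the report.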
+@return 0 on success */ +static +int +innodb_show_status( +/*===============*/ + handlerton* hton, /*!< in: the innodb handlerton */ + THD* thd, /*!< in: the MySQL query thread of the caller */ + stat_print_fn* stat_print) +{ + static const char truncated_msg[] = "... truncated...\n"; + const long MAX_STATUS_SIZE = 1048576; + ulint trx_list_start = ULINT_UNDEFINED; + ulint trx_list_end = ULINT_UNDEFINED; + bool ret_val; + + DBUG_ENTER("innodb_show_status"); + DBUG_ASSERT(hton == innodb_hton_ptr); + + /* We don't create the temp files or associated + mutexes in read-only-mode */ + + if (srv_read_only_mode) { + DBUG_RETURN(0); + } + + purge_sys.wake_if_not_active(); + + /* We let the InnoDB Monitor to output at most MAX_STATUS_SIZE + bytes of text. */ + + char* str; + size_t flen; + + mysql_mutex_lock(&srv_monitor_file_mutex); + rewind(srv_monitor_file); + + srv_printf_innodb_monitor(srv_monitor_file, FALSE, + &trx_list_start, &trx_list_end); + + os_file_set_eof(srv_monitor_file); + + flen = size_t(ftell(srv_monitor_file)); + if (ssize_t(flen) < 0) { + flen = 0; + } + + size_t usable_len; + + if (flen > MAX_STATUS_SIZE) { + usable_len = MAX_STATUS_SIZE; + truncated_status_writes++; + } else { + usable_len = flen; + } + + /* allocate buffer for the string, and + read the contents of the temporary file */ + + if (!(str = (char*) my_malloc(PSI_INSTRUMENT_ME, + usable_len + 1, MYF(0)))) { + mysql_mutex_unlock(&srv_monitor_file_mutex); + DBUG_RETURN(1); + } + + rewind(srv_monitor_file); + + if (flen < MAX_STATUS_SIZE) { + /* Display the entire output. */ + flen = fread(str, 1, flen, srv_monitor_file); + } else if (trx_list_end < flen + && trx_list_start < trx_list_end + && trx_list_start + flen - trx_list_end + < MAX_STATUS_SIZE - sizeof truncated_msg - 1) { + + /* Omit the beginning of the list of active transactions. */ + size_t len = fread(str, 1, trx_list_start, srv_monitor_file); + + memcpy(str + len, truncated_msg, sizeof truncated_msg - 1); + len += sizeof truncated_msg - 1; + usable_len = (MAX_STATUS_SIZE - 1) - len; + fseek(srv_monitor_file, long(flen - usable_len), SEEK_SET); + len += fread(str + len, 1, usable_len, srv_monitor_file); + flen = len; + } else { + /* Omit the end of the output. */ + flen = fread(str, 1, MAX_STATUS_SIZE - 1, srv_monitor_file); + } + + mysql_mutex_unlock(&srv_monitor_file_mutex); + + ret_val= stat_print( + thd, innobase_hton_name, + static_cast(strlen(innobase_hton_name)), + STRING_WITH_LEN(""), str, static_cast(flen)); + + my_free(str); + + DBUG_RETURN(ret_val); +} + +/************************************************************************//** +Return 0 on success and non-zero on failure. Note: the bool return type +seems to be abused here, should be an int. */ +static +bool +innobase_show_status( +/*=================*/ + handlerton* hton, /*!< in: the innodb handlerton */ + THD* thd, /*!< in: the MySQL query thread + of the caller */ + stat_print_fn* stat_print, + enum ha_stat_type stat_type) +{ + DBUG_ASSERT(hton == innodb_hton_ptr); + + switch (stat_type) { + case HA_ENGINE_STATUS: + /* Non-zero return value means there was an error. */ + return(innodb_show_status(hton, thd, stat_print) != 0); + + case HA_ENGINE_MUTEX: + case HA_ENGINE_LOGS: + /* Not handled */ + break; + } + + /* Success */ + return(false); +} + +/*********************************************************************//** +Returns number of THR_LOCK locks used for one instance of InnoDB table. +InnoDB no longer relies on THR_LOCK locks so 0 value is returned. 
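+(So, for example, LOCK TABLES t1 WRITE, t2 READ on two illustrative
+InnoDB tables allocates no THR_LOCK slots; the locking work happens
+in ::store_lock() and ::external_lock() instead.)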
+Instead of THR_LOCK locks InnoDB relies on combination of metadata locks +(e.g. for LOCK TABLES and DDL) and its own locking subsystem. +Note that even though this method returns 0, SQL-layer still calls +::store_lock(), ::start_stmt() and ::external_lock() methods for InnoDB +tables. */ + +uint +ha_innobase::lock_count(void) const +/*===============================*/ +{ + return 0; +} + +/*****************************************************************//** +Supposed to convert a MySQL table lock stored in the 'lock' field of the +handle to a proper type before storing pointer to the lock into an array +of pointers. +In practice, since InnoDB no longer relies on THR_LOCK locks and its +lock_count() method returns 0 it just informs storage engine about type +of THR_LOCK which SQL-layer would have acquired for this specific statement +on this specific table. +MySQL also calls this if it wants to reset some table locks to a not-locked +state during the processing of an SQL query. An example is that during a +SELECT the read lock is released early on the 'const' tables where we only +fetch one row. MySQL does not call this when it releases all locks at the +end of an SQL statement. +@return pointer to the current element in the 'to' array. */ + +THR_LOCK_DATA** +ha_innobase::store_lock( +/*====================*/ + THD* thd, /*!< in: user thread handle */ + THR_LOCK_DATA** to, /*!< in: pointer to the current + element in an array of pointers + to lock structs; + only used as return value */ + thr_lock_type lock_type) /*!< in: lock type to store in + 'lock'; this may also be + TL_IGNORE */ +{ + /* Note that trx in this function is NOT necessarily m_prebuilt->trx + because we call update_thd() later, in ::external_lock()! Failure to + understand this caused a serious memory corruption bug in 5.1.11. */ + + trx_t* trx = check_trx_exists(thd); + + /* NOTE: MySQL can call this function with lock 'type' TL_IGNORE! + Be careful to ignore TL_IGNORE if we are going to do something with + only 'real' locks! */ + + /* If no MySQL table is in use, we need to set the isolation level + of the transaction. */ + + if (lock_type != TL_IGNORE + && trx->n_mysql_tables_in_use == 0) { + trx->isolation_level = innobase_map_isolation_level( + (enum_tx_isolation) thd_tx_isolation(thd)); + + if (trx->isolation_level <= TRX_ISO_READ_COMMITTED) { + + /* At low transaction isolation levels we let + each consistent read set its own snapshot */ + trx->read_view.close(); + } + } + + DBUG_ASSERT(EQ_CURRENT_THD(thd)); + const bool in_lock_tables = thd_in_lock_tables(thd); + const int sql_command = thd_sql_command(thd); + + if (srv_read_only_mode + && (sql_command == SQLCOM_UPDATE + || sql_command == SQLCOM_INSERT + || sql_command == SQLCOM_REPLACE + || sql_command == SQLCOM_DROP_TABLE + || sql_command == SQLCOM_ALTER_TABLE + || sql_command == SQLCOM_OPTIMIZE + || (sql_command == SQLCOM_CREATE_TABLE + && (lock_type >= TL_WRITE_CONCURRENT_INSERT + && lock_type <= TL_WRITE)) + || sql_command == SQLCOM_CREATE_INDEX + || sql_command == SQLCOM_DROP_INDEX + || sql_command == SQLCOM_CREATE_SEQUENCE + || sql_command == SQLCOM_DROP_SEQUENCE + || sql_command == SQLCOM_DELETE)) { + + ib_senderrf(trx->mysql_thd, + IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE); + + } else if (sql_command == SQLCOM_FLUSH + && lock_type == TL_READ_NO_INSERT) { + + /* Check for FLUSH TABLES ... WITH READ LOCK */ + + /* Note: This call can fail, but there is no way to return + the error to the caller. 
We simply ignore it for now here + and push the error code to the caller where the error is + detected in the function. */ + + dberr_t err = row_quiesce_set_state( + m_prebuilt->table, QUIESCE_START, trx); + + ut_a(err == DB_SUCCESS || err == DB_UNSUPPORTED); + + if (trx->isolation_level == TRX_ISO_SERIALIZABLE) { + m_prebuilt->select_lock_type = LOCK_S; + m_prebuilt->stored_select_lock_type = LOCK_S; + } else { + m_prebuilt->select_lock_type = LOCK_NONE; + m_prebuilt->stored_select_lock_type = LOCK_NONE; + } + + /* Check for DROP TABLE */ + } else if (sql_command == SQLCOM_DROP_TABLE || + sql_command == SQLCOM_DROP_SEQUENCE) { + + /* MySQL calls this function in DROP TABLE though this table + handle may belong to another thd that is running a query. Let + us in that case skip any changes to the m_prebuilt struct. */ + + /* Check for LOCK TABLE t1,...,tn WITH SHARED LOCKS */ + } else if ((lock_type == TL_READ && in_lock_tables) + || (lock_type == TL_READ_HIGH_PRIORITY && in_lock_tables) + || lock_type == TL_READ_WITH_SHARED_LOCKS + || lock_type == TL_READ_SKIP_LOCKED + || lock_type == TL_READ_NO_INSERT + || (lock_type != TL_IGNORE + && sql_command != SQLCOM_SELECT)) { + + /* The OR cases above are in this order: + 1) MySQL is doing LOCK TABLES ... READ LOCAL, or we + are processing a stored procedure or function, or + 2) (we do not know when TL_READ_HIGH_PRIORITY is used), or + 3) this is a SELECT ... IN SHARE MODE, or + 4) this is a SELECT ... IN SHARE MODE SKIP LOCKED, or + 5) we are doing a complex SQL statement like + INSERT INTO ... SELECT ... and the logical logging (MySQL + binlog) requires the use of a locking read, or + MySQL is doing LOCK TABLES ... READ. + 6) we let InnoDB do locking reads for all SQL statements that + are not simple SELECTs; note that select_lock_type in this + case may get strengthened in ::external_lock() to LOCK_X. + Note that we MUST use a locking read in all data modifying + SQL statements, because otherwise the execution would not be + serializable, and also the results from the update could be + unexpected if an obsolete consistent read view would be + used. */ + + /* Use consistent read for checksum table */ + + if (sql_command == SQLCOM_CHECKSUM + || sql_command == SQLCOM_CREATE_SEQUENCE + || (sql_command == SQLCOM_ANALYZE && lock_type == TL_READ) + || (trx->isolation_level <= TRX_ISO_READ_COMMITTED + && (lock_type == TL_READ + || lock_type == TL_READ_NO_INSERT) + && (sql_command == SQLCOM_INSERT_SELECT + || sql_command == SQLCOM_REPLACE_SELECT + || sql_command == SQLCOM_UPDATE + || sql_command == SQLCOM_CREATE_SEQUENCE + || sql_command == SQLCOM_CREATE_TABLE))) { + + /* If the transaction isolation level is + READ UNCOMMITTED or READ COMMITTED and we are executing + INSERT INTO...SELECT or REPLACE INTO...SELECT + or UPDATE ... = (SELECT ...) or CREATE ... + SELECT... without FOR UPDATE or IN SHARE + MODE in select, then we use consistent read + for select. */ + + m_prebuilt->select_lock_type = LOCK_NONE; + m_prebuilt->stored_select_lock_type = LOCK_NONE; + } else { + m_prebuilt->select_lock_type = LOCK_S; + m_prebuilt->stored_select_lock_type = LOCK_S; + } + + } else if (lock_type != TL_IGNORE) { + + /* We set possible LOCK_X value in external_lock, not yet + here even if this would be SELECT ... 
FOR UPDATE */ + + m_prebuilt->select_lock_type = LOCK_NONE; + m_prebuilt->stored_select_lock_type = LOCK_NONE; + } + m_prebuilt->skip_locked= (lock_type == TL_WRITE_SKIP_LOCKED || + lock_type == TL_READ_SKIP_LOCKED); + + if (!trx_is_started(trx) + && (m_prebuilt->select_lock_type != LOCK_NONE + || m_prebuilt->stored_select_lock_type != LOCK_NONE)) { + + trx->will_lock = true; + } + + return(to); +} + +/*********************************************************************//** +Read the next autoinc value. Acquire the relevant locks before reading +the AUTOINC value. If SUCCESS then the table AUTOINC mutex will be locked +on return and all relevant locks acquired. +@return DB_SUCCESS or error code */ + +dberr_t +ha_innobase::innobase_get_autoinc( +/*==============================*/ + ulonglong* value) /*!< out: autoinc value */ +{ + *value = 0; + + m_prebuilt->autoinc_error = innobase_lock_autoinc(); + + if (m_prebuilt->autoinc_error == DB_SUCCESS) { + + /* Determine the first value of the interval */ + *value = dict_table_autoinc_read(m_prebuilt->table); + + /* It should have been initialized during open. */ + if (*value == 0) { + m_prebuilt->autoinc_error = DB_UNSUPPORTED; + m_prebuilt->table->autoinc_mutex.wr_unlock(); + } + } + + return(m_prebuilt->autoinc_error); +} + +/*******************************************************************//** +This function reads the global auto-inc counter. It doesn't use the +AUTOINC lock even if the lock mode is set to TRADITIONAL. +@return the autoinc value */ + +ulonglong +ha_innobase::innobase_peek_autoinc(void) +/*====================================*/ +{ + ulonglong auto_inc; + dict_table_t* innodb_table; + + ut_a(m_prebuilt != NULL); + ut_a(m_prebuilt->table != NULL); + + innodb_table = m_prebuilt->table; + + innodb_table->autoinc_mutex.wr_lock(); + + auto_inc = dict_table_autoinc_read(innodb_table); + + if (auto_inc == 0) { + ib::info() << "AUTOINC next value generation is disabled for" + " '" << innodb_table->name << "'"; + } + + innodb_table->autoinc_mutex.wr_unlock(); + + return(auto_inc); +} + +/*********************************************************************//** +Returns the value of the auto-inc counter in *first_value and ~0 on failure. */ + +void +ha_innobase::get_auto_increment( +/*============================*/ + ulonglong offset, /*!< in: table autoinc offset */ + ulonglong increment, /*!< in: table autoinc + increment */ + ulonglong nb_desired_values, /*!< in: number of values + reqd */ + ulonglong* first_value, /*!< out: the autoinc value */ + ulonglong* nb_reserved_values) /*!< out: count of reserved + values */ +{ + trx_t* trx; + dberr_t error; + ulonglong autoinc = 0; + mariadb_set_stats set_stats_temporary(handler_stats); + + /* Prepare m_prebuilt->trx in the table handle */ + update_thd(ha_thd()); + + error = innobase_get_autoinc(&autoinc); + + if (error != DB_SUCCESS) { + *first_value = (~(ulonglong) 0); + return; + } + + /* This is a hack, since nb_desired_values seems to be accurate only + for the first call to get_auto_increment() for multi-row INSERT and + meaningless for other statements e.g, LOAD etc. Subsequent calls to + this method for the same statement results in different values which + don't make sense. Therefore we store the value the first time we are + called and count down from that as rows are written (see write_row()). 
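+For example, on an illustrative table t1, a multi-row
+  INSERT INTO t1(c1) VALUES (NULL), (NULL), (NULL);
+arrives here with nb_desired_values == 3 on the first call, so three
+values are reserved at once and write_row() consumes one per row.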
+ */ + + trx = m_prebuilt->trx; + + /* Note: We can't rely on *first_value since some MySQL engines, + in particular the partition engine, don't initialize it to 0 when + invoking this method. So we are not sure if it's guaranteed to + be 0 or not. */ + + /* We need the upper limit of the col type to check for + whether we update the table autoinc counter or not. */ + ulonglong col_max_value = + table->next_number_field->get_max_int_value(); + + /** The following logic is needed to avoid duplicate key error + for autoincrement column. + + (1) InnoDB gives the current autoincrement value with respect + to increment and offset value. + + (2) Basically it does compute_next_insert_id() logic inside InnoDB + to avoid the current auto increment value changed by handler layer. + + (3) It is restricted only for insert operations. */ + + if (increment > 1 && increment <= ~autoinc && autoinc < col_max_value + && thd_sql_command(m_user_thd) != SQLCOM_ALTER_TABLE) { + + ulonglong prev_auto_inc = autoinc; + + autoinc = ((autoinc - 1) + increment - offset)/ increment; + + autoinc = autoinc * increment + offset; + + /* If autoinc exceeds the col_max_value then reset + to old autoinc value. Because in case of non-strict + sql mode, boundary value is not considered as error. */ + + if (autoinc >= col_max_value) { + autoinc = prev_auto_inc; + } + + ut_ad(autoinc > 0); + } + + /* Called for the first time ? */ + if (trx->n_autoinc_rows == 0) { + + trx->n_autoinc_rows = (ulint) nb_desired_values; + + /* It's possible for nb_desired_values to be 0: + e.g., INSERT INTO T1(C) SELECT C FROM T2; */ + if (nb_desired_values == 0) { + + trx->n_autoinc_rows = 1; + } + + set_if_bigger(*first_value, autoinc); + /* Not in the middle of a mult-row INSERT. */ + } else if (m_prebuilt->autoinc_last_value == 0) { + set_if_bigger(*first_value, autoinc); + } + + if (*first_value > col_max_value) { + /* Out of range number. Let handler::update_auto_increment() + take care of this */ + m_prebuilt->autoinc_last_value = 0; + m_prebuilt->table->autoinc_mutex.wr_unlock(); + *nb_reserved_values= 0; + return; + } + + *nb_reserved_values = trx->n_autoinc_rows; + + /* With old style AUTOINC locking we only update the table's + AUTOINC counter after attempting to insert the row. */ + if (innobase_autoinc_lock_mode != AUTOINC_OLD_STYLE_LOCKING) { + ulonglong current; + ulonglong next_value; + + current = *first_value; + + /* Compute the last value in the interval */ + next_value = innobase_next_autoinc( + current, *nb_reserved_values, increment, offset, + col_max_value); + + m_prebuilt->autoinc_last_value = next_value; + + if (m_prebuilt->autoinc_last_value < *first_value) { + *first_value = (~(ulonglong) 0); + } else { + /* Update the table autoinc variable */ + dict_table_autoinc_update_if_greater( + m_prebuilt->table, + m_prebuilt->autoinc_last_value); + } + } else { + /* This will force write_row() into attempting an update + of the table's AUTOINC counter. */ + m_prebuilt->autoinc_last_value = 0; + } + + /* The increment to be used to increase the AUTOINC value, we use + this in write_row() and update_row() to increase the autoinc counter + for columns that are filled by the user. We need the offset and + the increment. 
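+For example, with @@auto_increment_increment = 10 and
+@@auto_increment_offset = 5 the generated sequence is 5, 15, 25, ...;
+a counter value of 7 is rounded up to 15 by the interval arithmetic
+above: ((7 - 1) + 10 - 5) / 10 * 10 + 5 = 15.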
*/ + m_prebuilt->autoinc_offset = offset; + m_prebuilt->autoinc_increment = increment; + + m_prebuilt->table->autoinc_mutex.wr_unlock(); +} + +/*******************************************************************//** +See comment in handler.cc */ + +bool +ha_innobase::get_error_message( +/*===========================*/ + int error, + String* buf) +{ + trx_t* trx = check_trx_exists(ha_thd()); + + if (error == HA_ERR_DECRYPTION_FAILED) { + const char *msg = "Table encrypted but decryption failed. This could be because correct encryption management plugin is not loaded, used encryption key is not available or encryption method does not match."; + buf->copy(msg, (uint)strlen(msg), system_charset_info); + } else { + buf->copy(trx->detailed_error, (uint) strlen(trx->detailed_error), + system_charset_info); + } + + return(FALSE); +} + +/** Retrieves the names of the table and the key for which there was a +duplicate entry in the case of HA_ERR_FOREIGN_DUPLICATE_KEY. + +If any of the names is not available, then this method will return +false and will not change any of child_table_name or child_key_name. + +@param[out] child_table_name Table name +@param[in] child_table_name_len Table name buffer size +@param[out] child_key_name Key name +@param[in] child_key_name_len Key name buffer size + +@retval true table and key names were available and were written into the +corresponding out parameters. +@retval false table and key names were not available, the out parameters +were not touched. */ +bool +ha_innobase::get_foreign_dup_key( +/*=============================*/ + char* child_table_name, + uint child_table_name_len, + char* child_key_name, + uint child_key_name_len) +{ + const dict_index_t* err_index; + + ut_a(m_prebuilt->trx != NULL); + ut_a(m_prebuilt->trx->magic_n == TRX_MAGIC_N); + + err_index = trx_get_error_info(m_prebuilt->trx); + + if (err_index == NULL) { + return(false); + } + /* else */ + + /* copy table name (and convert from filename-safe encoding to + system_charset_info) */ + char* p = strchr(err_index->table->name.m_name, '/'); + + /* strip ".../" prefix if any */ + if (p != NULL) { + p++; + } else { + p = err_index->table->name.m_name; + } + + size_t len; + + len = filename_to_tablename(p, child_table_name, child_table_name_len); + + child_table_name[len] = '\0'; + + /* copy index name */ + snprintf(child_key_name, child_key_name_len, "%s", + err_index->name()); + + return(true); +} + +/*******************************************************************//** +Compares two 'refs'. A 'ref' is the (internal) primary key value of the row. +If there is no explicitly declared non-null unique key or a primary key, then +InnoDB internally uses the row id as the primary key. +@return < 0 if ref1 < ref2, 0 if equal, else > 0 */ + +int +ha_innobase::cmp_ref( +/*=================*/ + const uchar* ref1, /*!< in: an (internal) primary key value in the + MySQL key value format */ + const uchar* ref2) /*!< in: an (internal) primary key value in the + MySQL key value format */ +{ + enum_field_types mysql_type; + Field* field; + KEY_PART_INFO* key_part; + KEY_PART_INFO* key_part_end; + uint len1; + uint len2; + int result; + + if (m_prebuilt->clust_index_was_generated) { + /* The 'ref' is an InnoDB row id */ + + return(memcmp(ref1, ref2, DATA_ROW_ID_LEN)); + } + + /* Do a type-aware comparison of primary key fields. PK fields + are always NOT NULL, so no checks for NULL are performed. 
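+For example, with PRIMARY KEY(a, b) the loop below compares the a
+fields first and looks at b only when the a fields are equal; a DESC
+key part (HA_REVERSE_SORT) negates that part's comparison result.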
*/ + + key_part = table->key_info[table->s->primary_key].key_part; + + key_part_end = key_part + + table->key_info[table->s->primary_key].user_defined_key_parts; + + for (; key_part != key_part_end; ++key_part) { + field = key_part->field; + mysql_type = field->type(); + + if (mysql_type == MYSQL_TYPE_TINY_BLOB + || mysql_type == MYSQL_TYPE_MEDIUM_BLOB + || mysql_type == MYSQL_TYPE_BLOB + || mysql_type == MYSQL_TYPE_LONG_BLOB) { + + /* In the MySQL key value format, a column prefix of + a BLOB is preceded by a 2-byte length field */ + + len1 = innobase_read_from_2_little_endian(ref1); + len2 = innobase_read_from_2_little_endian(ref2); + + result = ((Field_blob*) field)->cmp( + ref1 + 2, len1, ref2 + 2, len2); + } else { + result = field->key_cmp(ref1, ref2); + } + + if (result) { + if (key_part->key_part_flag & HA_REVERSE_SORT) + result = -result; + return(result); + } + + ref1 += key_part->store_length; + ref2 += key_part->store_length; + } + + return(0); +} + +/*******************************************************************//** +Ask InnoDB if a query to a table can be cached. +@return TRUE if query caching of the table is permitted */ + +my_bool +ha_innobase::register_query_cache_table( +/*====================================*/ + THD* thd, /*!< in: user thread handle */ + const char* table_key, /*!< in: normalized path to the + table */ + uint key_length, /*!< in: length of the normalized + path to the table */ + qc_engine_callback* + call_back, /*!< out: pointer to function for + checking if query caching + is permitted */ + ulonglong *engine_data) /*!< in/out: data to call_back */ +{ + *engine_data = 0; + *call_back = innobase_query_caching_of_table_permitted; + + return(innobase_query_caching_of_table_permitted( + thd, table_key, + static_cast(key_length), + engine_data)); +} + +/******************************************************************//** +This function is used to find the storage length in bytes of the first n +characters for prefix indexes using a multibyte character set. The function +finds charset information and returns length of prefix_len characters in the +index field in bytes. +@return number of bytes occupied by the first n characters */ +ulint +innobase_get_at_most_n_mbchars( +/*===========================*/ + ulint charset_id, /*!< in: character set id */ + ulint prefix_len, /*!< in: prefix length in bytes of the index + (this has to be divided by mbmaxlen to get the + number of CHARACTERS n in the prefix) */ + ulint data_len, /*!< in: length of the string in bytes */ + const char* str) /*!< in: character string */ +{ + ulint char_length; /*!< character length in bytes */ + ulint n_chars; /*!< number of characters in prefix */ + CHARSET_INFO* charset; /*!< charset used in the field */ + + charset = get_charset((uint) charset_id, MYF(MY_WME)); + + ut_ad(charset); + ut_ad(charset->mbmaxlen); + + /* Calculate how many characters at most the prefix index contains */ + + n_chars = prefix_len / charset->mbmaxlen; + + /* If the charset is multi-byte, then we must find the length of the + first at most n chars in the string. If the string contains less + characters than n, then we return the length to the end of the last + character. */ + + if (charset->mbmaxlen > 1) { + /* charpos() returns the byte length of the first n_chars + characters, or a value bigger than the length of str, if + there were not enough full characters in str. + + Why does the code below work: + Suppose that we are looking for n UTF-8 characters. 
+ + 1) If the string is long enough, then the prefix contains at + least n complete UTF-8 characters + maybe some extra + characters + an incomplete UTF-8 character. No problem in + this case. The function returns the pointer to the + end of the nth character. + + 2) If the string is not long enough, then the string contains + the complete value of a column, that is, only complete UTF-8 + characters, and we can store in the column prefix index the + whole string. */ + + char_length= charset->charpos(str, str + data_len, n_chars); + if (char_length > data_len) { + char_length = data_len; + } + } else if (data_len < prefix_len) { + + char_length = data_len; + + } else { + + char_length = prefix_len; + } + + return(char_length); +} + +/*******************************************************************//** +This function is used to prepare an X/Open XA distributed transaction. +@return 0 or error number */ +static +int +innobase_xa_prepare( +/*================*/ + handlerton* hton, /*!< in: InnoDB handlerton */ + THD* thd, /*!< in: handle to the MySQL thread of + the user whose XA transaction should + be prepared */ + bool prepare_trx) /*!< in: true - prepare transaction + false - the current SQL statement + ended */ +{ + trx_t* trx = check_trx_exists(thd); + + DBUG_ASSERT(hton == innodb_hton_ptr); + + thd_get_xid(thd, &reinterpret_cast(trx->xid)); + + if (!trx_is_registered_for_2pc(trx) && trx_is_started(trx)) { + + sql_print_error("Transaction not registered for MariaDB 2PC," + " but transaction is active"); + } + + if (prepare_trx + || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) { + + /* We were instructed to prepare the whole transaction, or + this is an SQL statement end and autocommit is on */ + + ut_ad(trx_is_registered_for_2pc(trx)); + + trx_prepare_for_mysql(trx); + } else { + /* We just mark the SQL statement ended and do not do a + transaction prepare */ + + /* If we had reserved the auto-inc lock for some + table in this SQL statement we release it now */ + + lock_unlock_table_autoinc(trx); + + /* Store the current undo_no of the transaction so that we + know where to roll back if we have to roll back the next + SQL statement */ + if (UNIV_UNLIKELY(end_of_statement(trx))) { + return 1; + } + } + + if (thd_sql_command(thd) != SQLCOM_XA_PREPARE + && (prepare_trx + || !thd_test_options( + thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) { + + /* For mysqlbackup to work the order of transactions in binlog + and InnoDB must be the same. Consider the situation + + thread1> prepare; write to binlog; ... + + thread2> prepare; write to binlog; commit + thread1> ... commit + + The server guarantees that writes to the binary log + and commits are in the same order, so we do not have + to handle this case. */ + } + + return(0); +} + +/*******************************************************************//** +This function is used to recover X/Open XA distributed transactions. 
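+For example, an external transaction coordinator would drive:
+  XA START 'trx1'; ... ; XA END 'trx1'; XA PREPARE 'trx1';
+  -- server restart --
+  XA RECOVER;        -- lists 'trx1' via this function
+  XA COMMIT 'trx1';  -- handled by innobase_commit_by_xid() below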
+@return number of prepared transactions stored in xid_list */ +static +int +innobase_xa_recover( +/*================*/ + handlerton* hton, /*!< in: InnoDB handlerton */ + XID* xid_list,/*!< in/out: prepared transactions */ + uint len) /*!< in: number of slots in xid_list */ +{ + DBUG_ASSERT(hton == innodb_hton_ptr); + + if (len == 0 || xid_list == NULL) { + + return(0); + } + + return(trx_recover_for_mysql(xid_list, len)); +} + +/*******************************************************************//** +This function is used to commit one X/Open XA distributed transaction +which is in the prepared state +@return 0 or error number */ +static +int +innobase_commit_by_xid( +/*===================*/ + handlerton* hton, + XID* xid) /*!< in: X/Open XA transaction identification */ +{ + DBUG_ASSERT(hton == innodb_hton_ptr); + + DBUG_EXECUTE_IF("innobase_xa_fail", + return XAER_RMFAIL;); + + if (high_level_read_only) { + return(XAER_RMFAIL); + } + + if (trx_t* trx = trx_get_trx_by_xid(xid)) { + /* use cases are: disconnected xa, slave xa, recovery */ + innobase_commit_low(trx); + ut_ad(trx->mysql_thd == NULL); + trx_deregister_from_2pc(trx); + ut_ad(!trx->will_lock); /* trx cache requirement */ + trx->free(); + + return(XA_OK); + } else { + return(XAER_NOTA); + } +} + +/** This function is used to rollback one X/Open XA distributed transaction +which is in the prepared state + +@param[in] hton InnoDB handlerton +@param[in] xid X/Open XA transaction identification + +@return 0 or error number */ +int innobase_rollback_by_xid(handlerton* hton, XID* xid) +{ + DBUG_ASSERT(hton == innodb_hton_ptr); + + DBUG_EXECUTE_IF("innobase_xa_fail", + return XAER_RMFAIL;); + + if (high_level_read_only) { + return(XAER_RMFAIL); + } + + if (trx_t* trx = trx_get_trx_by_xid(xid)) { +#ifdef WITH_WSREP + /* If a wsrep transaction is being rolled back during + the recovery, we must clear the xid in order to avoid + writing serialisation history for rolled back transaction. */ + if (wsrep_is_wsrep_xid(&trx->xid)) { + trx->xid.null(); + } +#endif /* WITH_WSREP */ + int ret = innobase_rollback_trx(trx); + ut_ad(!trx->will_lock); + trx->free(); + + return(ret); + } else { + return(XAER_NOTA); + } +} + +bool +ha_innobase::check_if_incompatible_data( +/*====================================*/ + HA_CREATE_INFO* info, + uint table_changes) +{ + ha_table_option_struct *param_old, *param_new; + + /* Cache engine specific options */ + param_new = info->option_struct; + param_old = table->s->option_struct; + + innobase_copy_frm_flags_from_create_info(m_prebuilt->table, info); + + if (table_changes != IS_EQUAL_YES) { + + return(COMPATIBLE_DATA_NO); + } + + /* Check that auto_increment value was not changed */ + if ((info->used_fields & HA_CREATE_USED_AUTO) + && info->auto_increment_value != 0) { + + return(COMPATIBLE_DATA_NO); + } + + /* Check that row format didn't change */ + if ((info->used_fields & HA_CREATE_USED_ROW_FORMAT) + && info->row_type != get_row_type()) { + + return(COMPATIBLE_DATA_NO); + } + + /* Specifying KEY_BLOCK_SIZE requests a rebuild of the table. */ + if (info->used_fields & HA_CREATE_USED_KEY_BLOCK_SIZE) { + return(COMPATIBLE_DATA_NO); + } + + /* Changes on engine specific table options requests a rebuild of the table. 
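+For example, on an illustrative table t1,
+  ALTER TABLE t1 PAGE_COMPRESSED = 1;
+changes param_new->page_compressed below and therefore reports
+COMPATIBLE_DATA_NO, forcing the table to be rebuilt.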
*/ + if (param_new->page_compressed != param_old->page_compressed || + param_new->page_compression_level != param_old->page_compression_level) + { + return(COMPATIBLE_DATA_NO); + } + + return(COMPATIBLE_DATA_YES); +} + +/****************************************************************//** +Update the system variable innodb_io_capacity_max using the "saved" +value. This function is registered as a callback with MySQL. */ +static +void +innodb_io_capacity_max_update( +/*===========================*/ + THD* thd, /*!< in: thread handle */ + st_mysql_sys_var*, void*, + const void* save) /*!< in: immediate result + from check function */ +{ + ulong in_val = *static_cast(save); + + if (in_val < srv_io_capacity) { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "Setting innodb_io_capacity_max %lu" + " lower than innodb_io_capacity %lu.", + in_val, srv_io_capacity); + + srv_io_capacity = in_val; + + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "Setting innodb_io_capacity to %lu", + srv_io_capacity); + } + + srv_max_io_capacity = in_val; +} + +/****************************************************************//** +Update the system variable innodb_io_capacity using the "saved" +value. This function is registered as a callback with MySQL. */ +static +void +innodb_io_capacity_update( +/*======================*/ + THD* thd, /*!< in: thread handle */ + st_mysql_sys_var*, void*, + const void* save) /*!< in: immediate result + from check function */ +{ + ulong in_val = *static_cast(save); + + if (in_val > srv_max_io_capacity) { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "Setting innodb_io_capacity to %lu" + " higher than innodb_io_capacity_max %lu", + in_val, srv_max_io_capacity); + + srv_max_io_capacity = (in_val & ~(~0UL >> 1)) + ? in_val : in_val * 2; + + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "Setting innodb_max_io_capacity to %lu", + srv_max_io_capacity); + } + + srv_io_capacity = in_val; +} + +/****************************************************************//** +Update the system variable innodb_max_dirty_pages_pct using the "saved" +value. This function is registered as a callback with MySQL. */ +static +void +innodb_max_dirty_pages_pct_update( +/*==============================*/ + THD* thd, /*!< in: thread handle */ + st_mysql_sys_var*, void*, + const void* save) /*!< in: immediate result + from check function */ +{ + double in_val = *static_cast(save); + if (in_val < srv_max_dirty_pages_pct_lwm) { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "innodb_max_dirty_pages_pct cannot be" + " set lower than" + " innodb_max_dirty_pages_pct_lwm."); + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_WRONG_ARGUMENTS, + "Lowering" + " innodb_max_dirty_page_pct_lwm to %lf", + in_val); + + srv_max_dirty_pages_pct_lwm = in_val; + } + + srv_max_buf_pool_modified_pct = in_val; + + mysql_mutex_unlock(&LOCK_global_system_variables); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_pool.page_cleaner_wakeup(); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + mysql_mutex_lock(&LOCK_global_system_variables); +} + +/****************************************************************//** +Update the system variable innodb_max_dirty_pages_pct_lwm using the +"saved" value. This function is registered as a callback with MySQL. 
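+For example:
+  SET GLOBAL innodb_max_dirty_pages_pct = 40;
+  SET GLOBAL innodb_max_dirty_pages_pct_lwm = 50;
+clamps the low-water mark to 40 with a warning, because it may not
+exceed innodb_max_dirty_pages_pct.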
 */
+static
+void
+innodb_max_dirty_pages_pct_lwm_update(
+/*==================================*/
+ THD* thd, /*!< in: thread handle */
+ st_mysql_sys_var*, void*,
+ const void* save) /*!< in: immediate result
+ from check function */
+{
+ double in_val = *static_cast<const double*>(save);
+ if (in_val > srv_max_buf_pool_modified_pct) {
+ in_val = srv_max_buf_pool_modified_pct;
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_WRONG_ARGUMENTS,
+ "innodb_max_dirty_pages_pct_lwm"
+ " cannot be set higher than"
+ " innodb_max_dirty_pages_pct.");
+ push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+ ER_WRONG_ARGUMENTS,
+ "Setting innodb_max_dirty_page_pct_lwm"
+ " to %lf",
+ in_val);
+ }
+
+ srv_max_dirty_pages_pct_lwm = in_val;
+
+ mysql_mutex_unlock(&LOCK_global_system_variables);
+ mysql_mutex_lock(&buf_pool.flush_list_mutex);
+ buf_pool.page_cleaner_wakeup();
+ mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+ mysql_mutex_lock(&LOCK_global_system_variables);
+}
+
+/*************************************************************//**
+Do not allow setting innodb_fast_shutdown=0 if the purge threads are
+already down.
+@return 0 if innodb_fast_shutdown can be set */
+static
+int
+fast_shutdown_validate(
+/*=============================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to system
+ variable */
+ void* save, /*!< out: immediate result
+ for update function */
+ struct st_mysql_value* value) /*!< in: incoming string */
+{
+ if (check_sysvar_int(thd, var, save, value)) {
+ return(1);
+ }
+
+ uint new_val = *reinterpret_cast<uint*>(save);
+
+ if (srv_fast_shutdown && !new_val
+ && !srv_read_only_mode && abort_loop) {
+ return(1);
+ }
+
+ return(0);
+}
+
+/*************************************************************//**
+Check whether a valid argument was given to innobase_*_stopword_table.
+This function is registered as a callback with MySQL.
+@return 0 for valid stopword table */
+static
+int
+innodb_stopword_table_validate(
+/*===========================*/
+ THD* thd, /*!< in: thread handle */
+ st_mysql_sys_var*,
+ void* save, /*!< out: immediate result
+ for update function */
+ struct st_mysql_value* value) /*!< in: incoming string */
+{
+ const char* stopword_table_name;
+ char buff[STRING_BUFFER_USUAL_SIZE];
+ int len = sizeof(buff);
+ trx_t* trx;
+
+ ut_a(save != NULL);
+ ut_a(value != NULL);
+
+ stopword_table_name = value->val_str(value, buff, &len);
+
+ trx = check_trx_exists(thd);
+
+ row_mysql_lock_data_dictionary(trx);
+
+ /* Validate the existence and format of the stopword table,
+ if one was supplied */
+ int ret = stopword_table_name && !fts_valid_stopword_table(
+ stopword_table_name, NULL);
+
+ row_mysql_unlock_data_dictionary(trx);
+
+ if (!ret) {
+ if (stopword_table_name == buff) {
+ ut_ad(static_cast<size_t>(len) < sizeof buff);
+ stopword_table_name = thd_strmake(thd,
+ stopword_table_name,
+ len);
+ }
+
+ *static_cast<const char**>(save) = stopword_table_name;
+ }
+
+ return(ret);
+}
+
+extern void buf_resize_start();
+
+/** Update the system variable innodb_buffer_pool_size using the "saved"
+value. This function is registered as a callback with MySQL.
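+For example,
+  SET GLOBAL innodb_buffer_pool_size = 268435456;
+only requests the resize here; progress is reported through the
+Innodb_buffer_pool_resize_status status variable and the actual work
+is done asynchronously after buf_resize_start().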
+@param[in] save immediate result from check function */
+static
+void
+innodb_buffer_pool_size_update(THD*, st_mysql_sys_var*, void*,
+ const void* save)
+{
+ snprintf(export_vars.innodb_buffer_pool_resize_status,
+ sizeof(export_vars.innodb_buffer_pool_resize_status),
+ "Buffer pool resize requested");
+
+ buf_resize_start();
+}
+
+/** The latest assigned innodb_ft_aux_table name */
+static char* innodb_ft_aux_table;
+
+/** Update innodb_ft_aux_table_id on SET GLOBAL innodb_ft_aux_table.
+@param[in,out] thd connection
+@param[out] save new value of innodb_ft_aux_table
+@param[in] value user-specified value */
+static int innodb_ft_aux_table_validate(THD *thd, st_mysql_sys_var*,
+ void* save, st_mysql_value* value)
+{
+ char buf[STRING_BUFFER_USUAL_SIZE];
+ int len = sizeof buf;
+
+ if (const char* table_name = value->val_str(value, buf, &len)) {
+ if (dict_table_t* table = dict_table_open_on_name(
+ table_name, false, DICT_ERR_IGNORE_NONE)) {
+ const table_id_t id = dict_table_has_fts_index(table)
+ ? table->id : 0;
+ dict_table_close(table);
+ if (id) {
+ innodb_ft_aux_table_id = id;
+ if (table_name == buf) {
+ ut_ad(static_cast<size_t>(len)
+ < sizeof buf);
+ table_name = thd_strmake(thd,
+ table_name,
+ len);
+ }
+
+ *static_cast<const char**>(save) = table_name;
+ return 0;
+ }
+ }
+
+ return 1;
+ } else {
+ *static_cast<const char**>(save) = NULL;
+ innodb_ft_aux_table_id = 0;
+ return 0;
+ }
+}
+
+#ifdef BTR_CUR_HASH_ADAPT
+/****************************************************************//**
+Update the system variable innodb_adaptive_hash_index using the "saved"
+value. This function is registered as a callback with MySQL. */
+static
+void
+innodb_adaptive_hash_index_update(THD*, st_mysql_sys_var*, void*,
+ const void* save)
+{
+ mysql_mutex_unlock(&LOCK_global_system_variables);
+ if (*(my_bool*) save) {
+ btr_search_enable();
+ } else {
+ btr_search_disable();
+ }
+ mysql_mutex_lock(&LOCK_global_system_variables);
+}
+#endif /* BTR_CUR_HASH_ADAPT */
+
+/****************************************************************//**
+Update the system variable innodb_cmp_per_index using the "saved"
+value. This function is registered as a callback with MySQL. */
+static
+void
+innodb_cmp_per_index_update(THD*, st_mysql_sys_var*, void*, const void* save)
+{
+ /* Reset the stats whenever we enable the table
+ INFORMATION_SCHEMA.innodb_cmp_per_index. */
+ if (!srv_cmp_per_index_enabled && *(my_bool*) save) {
+ mysql_mutex_unlock(&LOCK_global_system_variables);
+ page_zip_reset_stat_per_index();
+ mysql_mutex_lock(&LOCK_global_system_variables);
+ }
+
+ srv_cmp_per_index_enabled = !!(*(my_bool*) save);
+}
+
+/****************************************************************//**
+Update the system variable innodb_old_blocks_pct using the "saved"
+value. This function is registered as a callback with MySQL. */
+static
+void
+innodb_old_blocks_pct_update(THD*, st_mysql_sys_var*, void*, const void* save)
+{
+ mysql_mutex_unlock(&LOCK_global_system_variables);
+ uint ratio = buf_LRU_old_ratio_update(*static_cast<const uint*>(save),
+ true);
+ mysql_mutex_lock(&LOCK_global_system_variables);
+ innobase_old_blocks_pct = ratio;
+}
+
+/****************************************************************//**
+Update the system variable innodb_change_buffer_max_size using the
+"saved" value. This function is registered as a callback with MySQL.
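+For example,
+  SET GLOBAL innodb_change_buffer_max_size = 10;
+limits the change buffer to 10% of the buffer pool size;
+ibuf_max_size_update() recomputes the limit with
+LOCK_global_system_variables temporarily released.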
*/ +static +void +innodb_change_buffer_max_size_update(THD*, st_mysql_sys_var*, void*, + const void* save) +{ + srv_change_buffer_max_size = *static_cast(save); + mysql_mutex_unlock(&LOCK_global_system_variables); + ibuf_max_size_update(srv_change_buffer_max_size); + mysql_mutex_lock(&LOCK_global_system_variables); +} + +#ifdef UNIV_DEBUG +static uint srv_fil_make_page_dirty_debug = 0; +static uint srv_saved_page_number_debug; + +/****************************************************************//** +Make the first page of given user tablespace dirty. */ +static +void +innodb_make_page_dirty(THD*, st_mysql_sys_var*, void*, const void* save) +{ + mtr_t mtr; + uint space_id = *static_cast(save); + mysql_mutex_unlock(&LOCK_global_system_variables); + fil_space_t* space = fil_space_t::get(space_id); + + if (space == NULL) { +func_exit_no_space: + mysql_mutex_lock(&LOCK_global_system_variables); + return; + } + + if (srv_saved_page_number_debug >= space->size) { +func_exit: + space->release(); + goto func_exit_no_space; + } + + mtr.start(); + mtr.set_named_space(space); + + buf_block_t* block = buf_page_get( + page_id_t(space_id, srv_saved_page_number_debug), + space->zip_size(), RW_X_LATCH, &mtr); + + if (block != NULL) { + ib::info() << "Dirtying page: " << block->page.id(); + mtr.write<1,mtr_t::FORCED>(*block, + block->page.frame + + FIL_PAGE_SPACE_ID, + block->page.frame + [FIL_PAGE_SPACE_ID]); + } + mtr.commit(); + log_write_up_to(mtr.commit_lsn(), true); + goto func_exit; +} +#endif // UNIV_DEBUG + +/****************************************************************//** +Update the monitor counter according to the "set_option", turn +on/off or reset specified monitor counter. */ +static +void +innodb_monitor_set_option( +/*======================*/ + const monitor_info_t* monitor_info,/*!< in: monitor info for the monitor + to set */ + mon_option_t set_option) /*!< in: Turn on/off reset the + counter */ +{ + monitor_id_t monitor_id = monitor_info->monitor_id; + + /* If module type is MONITOR_GROUP_MODULE, it cannot be + turned on/off individually. It should never use this + function to set options */ + ut_a(!(monitor_info->monitor_type & MONITOR_GROUP_MODULE)); + + switch (set_option) { + case MONITOR_TURN_ON: + MONITOR_ON(monitor_id); + MONITOR_INIT(monitor_id); + MONITOR_SET_START(monitor_id); + + /* If the monitor to be turned on uses + exisitng monitor counter (status variable), + make special processing to remember existing + counter value. */ + if (monitor_info->monitor_type & MONITOR_EXISTING) { + srv_mon_process_existing_counter( + monitor_id, MONITOR_TURN_ON); + } + break; + + case MONITOR_TURN_OFF: + if (monitor_info->monitor_type & MONITOR_EXISTING) { + srv_mon_process_existing_counter( + monitor_id, MONITOR_TURN_OFF); + } + + MONITOR_OFF(monitor_id); + MONITOR_SET_OFF(monitor_id); + break; + + case MONITOR_RESET_VALUE: + srv_mon_reset(monitor_id); + break; + + case MONITOR_RESET_ALL_VALUE: + srv_mon_reset_all(monitor_id); + break; + + default: + ut_error; + } +} + +/****************************************************************//** +Find matching InnoDB monitor counters and update their status +according to the "set_option", turn on/off or reset specified +monitor counter. 
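+For example,
+  SET GLOBAL innodb_monitor_enable = 'buffer%';
+turns on every counter whose name matches the pattern, while
+  SET GLOBAL innodb_monitor_enable = 'module_buf_page';
+turns on the buf_page counters as a group (MONITOR_GROUP_MODULE).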
*/ +static +void +innodb_monitor_update_wildcard( +/*===========================*/ + const char* name, /*!< in: monitor name to match */ + mon_option_t set_option) /*!< in: the set option, whether + to turn on/off or reset the counter */ +{ + ut_a(name); + + for (ulint use = 0; use < NUM_MONITOR; use++) { + ulint type; + monitor_id_t monitor_id = static_cast(use); + monitor_info_t* monitor_info; + + if (!innobase_wildcasecmp( + srv_mon_get_name(monitor_id), name)) { + monitor_info = srv_mon_get_info(monitor_id); + + type = monitor_info->monitor_type; + + /* If the monitor counter is of MONITOR_MODULE + type, skip it. Except for those also marked with + MONITOR_GROUP_MODULE flag, which can be turned + on only as a module. */ + if (!(type & MONITOR_MODULE) + && !(type & MONITOR_GROUP_MODULE)) { + innodb_monitor_set_option(monitor_info, + set_option); + } + + /* Need to special handle counters marked with + MONITOR_GROUP_MODULE, turn on the whole module if + any one of it comes here. Currently, only + "module_buf_page" is marked with MONITOR_GROUP_MODULE */ + if (type & MONITOR_GROUP_MODULE) { + if ((monitor_id >= MONITOR_MODULE_BUF_PAGE) + && (monitor_id < MONITOR_MODULE_OS)) { + if (set_option == MONITOR_TURN_ON + && MONITOR_IS_ON( + MONITOR_MODULE_BUF_PAGE)) { + continue; + } + + srv_mon_set_module_control( + MONITOR_MODULE_BUF_PAGE, + set_option); + } else { + /* If new monitor is added with + MONITOR_GROUP_MODULE, it needs + to be added here. */ + ut_ad(0); + } + } + } + } +} + +/*************************************************************//** +Given a configuration variable name, find corresponding monitor counter +and return its monitor ID if found. +@return monitor ID if found, MONITOR_NO_MATCH if there is no match */ +static +ulint +innodb_monitor_id_by_name_get( +/*==========================*/ + const char* name) /*!< in: monitor counter namer */ +{ + ut_a(name); + + /* Search for wild character '%' in the name, if + found, we treat it as a wildcard match. We do not search for + single character wildcard '_' since our monitor names already contain + such character. To avoid confusion, we request user must include + at least one '%' character to activate the wildcard search. */ + if (strchr(name, '%')) { + return(MONITOR_WILDCARD_MATCH); + } + + /* Not wildcard match, check for an exact match */ + for (ulint i = 0; i < NUM_MONITOR; i++) { + if (!innobase_strcasecmp( + name, srv_mon_get_name(static_cast(i)))) { + return(i); + } + } + + return(MONITOR_NO_MATCH); +} +/*************************************************************//** +Validate that the passed in monitor name matches at least one +monitor counter name with wildcard compare. +@return TRUE if at least one monitor name matches */ +static +ibool +innodb_monitor_validate_wildcard_name( +/*==================================*/ + const char* name) /*!< in: monitor counter namer */ +{ + for (ulint i = 0; i < NUM_MONITOR; i++) { + if (!innobase_wildcasecmp( + srv_mon_get_name(static_cast(i)), name)) { + return(TRUE); + } + } + + return(FALSE); +} +/*************************************************************//** +Validate the passed in monitor name, find and save the +corresponding monitor name in the function parameter "save". 
+@return 0 if monitor name is valid */ +static int innodb_monitor_valid_byname(const char *name) +{ + ulint use; + monitor_info_t* monitor_info; + + if (!name) { + return(1); + } + + use = innodb_monitor_id_by_name_get(name); + + /* No monitor name matches, nor it is wildcard match */ + if (use == MONITOR_NO_MATCH) { + return(1); + } + + if (use < NUM_MONITOR) { + monitor_info = srv_mon_get_info((monitor_id_t) use); + + /* If the monitor counter is marked with + MONITOR_GROUP_MODULE flag, then this counter + cannot be turned on/off individually, instead + it shall be turned on/off as a group using + its module name */ + if ((monitor_info->monitor_type & MONITOR_GROUP_MODULE) + && (!(monitor_info->monitor_type & MONITOR_MODULE))) { + sql_print_warning( + "Monitor counter '%s' cannot" + " be turned on/off individually." + " Please use its module name" + " to turn on/off the counters" + " in the module as a group.\n", + name); + + return(1); + } + + } else { + ut_a(use == MONITOR_WILDCARD_MATCH); + + /* For wildcard match, if there is not a single monitor + counter name that matches, treat it as an invalid + value for the system configuration variables */ + if (!innodb_monitor_validate_wildcard_name(name)) { + return(1); + } + } + + return(0); +} +/*************************************************************//** +Validate passed-in "value" is a valid monitor counter name. +This function is registered as a callback with MySQL. +@return 0 for valid name */ +static +int +innodb_monitor_validate( +/*====================*/ + THD*, st_mysql_sys_var*, + void* save, /*!< out: immediate result + for update function */ + struct st_mysql_value* value) /*!< in: incoming string */ +{ + int ret= 0; + + if (const char *name= value->val_str(value, nullptr, &ret)) + { + ret= innodb_monitor_valid_byname(name); + if (!ret) + *static_cast(save)= name; + } + else + ret= 1; + + return ret; +} + +/****************************************************************//** +Update the system variable innodb_enable(disable/reset/reset_all)_monitor +according to the "set_option" and turn on/off or reset specified monitor +counter. */ +static +void +innodb_monitor_update( +/*==================*/ + THD* thd, /*!< in: thread handle */ + void* var_ptr, /*!< out: where the + formal string goes */ + const void* save, /*!< in: immediate result + from check function */ + mon_option_t set_option) /*!< in: the set option, + whether to turn on/off or + reset the counter */ +{ + monitor_info_t* monitor_info; + ulint monitor_id; + ulint err_monitor = 0; + const char* name; + + ut_a(save != NULL); + + name = *static_cast(save); + + if (!name) { + monitor_id = MONITOR_DEFAULT_START; + } else { + monitor_id = innodb_monitor_id_by_name_get(name); + + /* Double check we have a valid monitor ID */ + if (monitor_id == MONITOR_NO_MATCH) { + return; + } + } + + if (monitor_id == MONITOR_DEFAULT_START) { + /* If user set the variable to "default", we will + print a message and make this set operation a "noop". + The check is being made here is because "set default" + does not go through validation function */ + if (thd) { + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + ER_NO_DEFAULT, + "Default value is not defined for" + " this set option. Please specify" + " correct counter or module name."); + } else { + sql_print_error( + "Default value is not defined for" + " this set option. 
Please specify" + " correct counter or module name.\n"); + } + + if (var_ptr) { + *(const char**) var_ptr = NULL; + } + } else if (monitor_id == MONITOR_WILDCARD_MATCH) { + innodb_monitor_update_wildcard(name, set_option); + } else { + monitor_info = srv_mon_get_info( + static_cast(monitor_id)); + + ut_a(monitor_info); + + /* If monitor is already truned on, someone could already + collect monitor data, exit and ask user to turn off the + monitor before turn it on again. */ + if (set_option == MONITOR_TURN_ON + && MONITOR_IS_ON(monitor_id)) { + err_monitor = monitor_id; + goto exit; + } + + if (var_ptr) { + *(const char**) var_ptr = monitor_info->monitor_name; + } + + /* Depending on the monitor name is for a module or + a counter, process counters in the whole module or + individual counter. */ + if (monitor_info->monitor_type & MONITOR_MODULE) { + srv_mon_set_module_control( + static_cast(monitor_id), + set_option); + } else { + innodb_monitor_set_option(monitor_info, set_option); + } + } +exit: + /* Only if we are trying to turn on a monitor that already + been turned on, we will set err_monitor. Print related + information */ + if (err_monitor) { + sql_print_warning("InnoDB: Monitor %s is already enabled.", + srv_mon_get_name((monitor_id_t) err_monitor)); + } +} + +#ifdef UNIV_DEBUG +static char* srv_buffer_pool_evict; + +/****************************************************************//** +Evict all uncompressed pages of compressed tables from the buffer pool. +Keep the compressed pages in the buffer pool. +@return whether all uncompressed pages were evicted */ +static bool innodb_buffer_pool_evict_uncompressed() +{ + bool all_evicted = true; + + mysql_mutex_lock(&buf_pool.mutex); + + for (buf_block_t* block = UT_LIST_GET_LAST(buf_pool.unzip_LRU); + block != NULL; ) { + buf_block_t* prev_block = UT_LIST_GET_PREV(unzip_LRU, block); + ut_ad(block->page.in_file()); + ut_ad(block->page.belongs_to_unzip_LRU()); + ut_ad(block->in_unzip_LRU_list); + ut_ad(block->page.in_LRU_list); + + if (!buf_LRU_free_page(&block->page, false)) { + all_evicted = false; + block = prev_block; + } else { + /* Because buf_LRU_free_page() may release + and reacquire buf_pool.mutex, prev_block + may be invalid. */ + block = UT_LIST_GET_LAST(buf_pool.unzip_LRU); + } + } + + mysql_mutex_unlock(&buf_pool.mutex); + return(all_evicted); +} + +/****************************************************************//** +Called on SET GLOBAL innodb_buffer_pool_evict=... +Handles some values specially, to evict pages from the buffer pool. +SET GLOBAL innodb_buffer_pool_evict='uncompressed' +evicts all uncompressed page frames of compressed tablespaces. */ +static +void +innodb_buffer_pool_evict_update(THD*, st_mysql_sys_var*, void*, + const void* save) +{ + if (const char* op = *static_cast(save)) { + if (!strcmp(op, "uncompressed")) { + mysql_mutex_unlock(&LOCK_global_system_variables); + for (uint tries = 0; tries < 10000; tries++) { + if (innodb_buffer_pool_evict_uncompressed()) { + mysql_mutex_lock( + &LOCK_global_system_variables); + return; + } + + std::this_thread::sleep_for( + std::chrono::milliseconds(10)); + } + + /* We failed to evict all uncompressed pages. */ + ut_ad(0); + } + } +} +#endif /* UNIV_DEBUG */ + +/****************************************************************//** +Update the system variable innodb_monitor_enable and enable +specified monitor counter. +This function is registered as a callback with MySQL. 
*/ +static +void +innodb_enable_monitor_update( +/*=========================*/ + THD* thd, /*!< in: thread handle */ + st_mysql_sys_var*, + void* var_ptr,/*!< out: where the + formal string goes */ + const void* save) /*!< in: immediate result + from check function */ +{ + innodb_monitor_update(thd, var_ptr, save, MONITOR_TURN_ON); +} + +/****************************************************************//** +Update the system variable innodb_monitor_disable and turn +off specified monitor counter. */ +static +void +innodb_disable_monitor_update( +/*==========================*/ + THD* thd, /*!< in: thread handle */ + st_mysql_sys_var*, + void* var_ptr,/*!< out: where the + formal string goes */ + const void* save) /*!< in: immediate result + from check function */ +{ + innodb_monitor_update(thd, var_ptr, save, MONITOR_TURN_OFF); +} + +/****************************************************************//** +Update the system variable innodb_monitor_reset and reset +specified monitor counter(s). +This function is registered as a callback with MySQL. */ +static +void +innodb_reset_monitor_update( +/*========================*/ + THD* thd, /*!< in: thread handle */ + st_mysql_sys_var*, + void* var_ptr,/*!< out: where the + formal string goes */ + const void* save) /*!< in: immediate result + from check function */ +{ + innodb_monitor_update(thd, var_ptr, save, MONITOR_RESET_VALUE); +} + +/****************************************************************//** +Update the system variable innodb_monitor_reset_all and reset +all value related monitor counter. +This function is registered as a callback with MySQL. */ +static +void +innodb_reset_all_monitor_update( +/*============================*/ + THD* thd, /*!< in: thread handle */ + st_mysql_sys_var*, + void* var_ptr,/*!< out: where the + formal string goes */ + const void* save) /*!< in: immediate result + from check function */ +{ + innodb_monitor_update(thd, var_ptr, save, MONITOR_RESET_ALL_VALUE); +} + +static +void +innodb_defragment_frequency_update(THD*, st_mysql_sys_var*, void*, + const void* save) +{ + srv_defragment_frequency = (*static_cast(save)); + srv_defragment_interval = 1000000000ULL / srv_defragment_frequency; +} + +static inline char *my_strtok_r(char *str, const char *delim, char **saveptr) +{ +#if defined _WIN32 + return strtok_s(str, delim, saveptr); +#else + return strtok_r(str, delim, saveptr); +#endif +} + +/****************************************************************//** +Parse and enable InnoDB monitor counters during server startup. +User can list the monitor counters/groups to be enable by specifying +"loose-innodb_monitor_enable=monitor_name1;monitor_name2..." +in server configuration file or at the command line. The string +separate could be ";", "," or empty space. */ +static +void +innodb_enable_monitor_at_startup( +/*=============================*/ + char* str) /*!< in/out: monitor counter enable list */ +{ + static const char* sep = " ;,"; + char* last; + + ut_a(str); + + /* Walk through the string, and separate each monitor counter + and/or counter group name, and calling innodb_monitor_update() + if successfully updated. Please note that the "str" would be + changed by strtok_r() as it walks through it. 
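+For example, starting the server with
+  --loose-innodb_monitor_enable="module_dml;lock_deadlocks"
+yields two iterations of the loop below, enabling the DML module
+counters and the lock_deadlocks counter.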
 */
+ for (char* option = my_strtok_r(str, sep, &last);
+ option;
+ option = my_strtok_r(NULL, sep, &last)) {
+ if (!innodb_monitor_valid_byname(option)) {
+ innodb_monitor_update(NULL, NULL, &option,
+ MONITOR_TURN_ON);
+ } else {
+ sql_print_warning("Invalid monitor counter"
+ " name: '%s'", option);
+ }
+ }
+}
+
+/****************************************************************//**
+Callback function for accessing the InnoDB variables from MySQL:
+SHOW VARIABLES. */
+static int show_innodb_vars(THD*, SHOW_VAR* var, void *,
+ struct system_status_var *status_var,
+ enum enum_var_type var_type)
+{
+ innodb_export_status();
+ var->type = SHOW_ARRAY;
+ var->value = (char*) &innodb_status_variables;
+ //var->scope = SHOW_SCOPE_GLOBAL;
+
+ return(0);
+}
+
+/****************************************************************//**
+This function checks each index name for a table against the reserved
+system default primary index name 'GEN_CLUST_INDEX'. If a name
+matches, this function pushes a warning message to the client,
+and returns true.
+@return true if the index name matches the reserved name */
+bool
+innobase_index_name_is_reserved(
+/*============================*/
+ THD* thd, /*!< in/out: MySQL connection */
+ const KEY* key_info, /*!< in: Indexes to be created */
+ ulint num_of_keys) /*!< in: Number of indexes to
+ be created. */
+{
+ const KEY* key;
+ uint key_num; /* index number */
+
+ for (key_num = 0; key_num < num_of_keys; key_num++) {
+ key = &key_info[key_num];
+
+ if (innobase_strcasecmp(key->name.str,
+ innobase_index_reserve_name) == 0) {
+ /* Push warning to mysql */
+ push_warning_printf(thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ ER_WRONG_NAME_FOR_INDEX,
+ "Cannot Create Index with name"
+ " '%s'. The name is reserved"
+ " for the system default primary"
+ " index.",
+ innobase_index_reserve_name);
+
+ my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0),
+ innobase_index_reserve_name);
+
+ return(true);
+ }
+ }
+
+ return(false);
+}
+
+/** Retrieve the FTS Relevance Ranking result for doc with doc_id
+of m_prebuilt->fts_doc_id
+@param[in,out] fts_hdl FTS handler
+@return the relevance ranking value */
+static
+float
+innobase_fts_retrieve_ranking(
+ FT_INFO* fts_hdl)
+{
+ fts_result_t* result;
+ row_prebuilt_t* ft_prebuilt;
+
+ result = reinterpret_cast<NEW_FT_INFO*>(fts_hdl)->ft_result;
+
+ ft_prebuilt = reinterpret_cast<NEW_FT_INFO*>(fts_hdl)->ft_prebuilt;
+
+ fts_ranking_t* ranking = rbt_value(fts_ranking_t, result->current);
+ ft_prebuilt->fts_doc_id= ranking->doc_id;
+
+ return(ranking->rank);
+}
+
+/** Free the memory for the FTS handler
+@param[in,out] fts_hdl FTS handler */
+static
+void
+innobase_fts_close_ranking(
+ FT_INFO* fts_hdl)
+{
+ fts_result_t* result;
+
+ result = reinterpret_cast<NEW_FT_INFO*>(fts_hdl)->ft_result;
+
+ fts_query_free_result(result);
+
+ my_free((uchar*) fts_hdl);
+}
+
+/** Find and Retrieve the FTS Relevance Ranking result for doc with doc_id
+of m_prebuilt->fts_doc_id
+@param[in,out] fts_hdl FTS handler
+@return the relevance ranking value */
+static
+float
+innobase_fts_find_ranking(FT_INFO* fts_hdl, uchar*, uint)
+{
+ fts_result_t* result;
+ row_prebuilt_t* ft_prebuilt;
+
+ ft_prebuilt = reinterpret_cast<NEW_FT_INFO*>(fts_hdl)->ft_prebuilt;
+ result = reinterpret_cast<NEW_FT_INFO*>(fts_hdl)->ft_result;
+
+ /* Retrieve the ranking value for doc_id with value of
+ m_prebuilt->fts_doc_id */
+ return(fts_retrieve_ranking(result, ft_prebuilt->fts_doc_id));
+}
+
+#ifdef UNIV_DEBUG
+static my_bool innodb_log_checkpoint_now = TRUE;
+static my_bool innodb_buf_flush_list_now = TRUE;
+static uint
+
+#ifdef UNIV_DEBUG
+static my_bool	innodb_log_checkpoint_now = TRUE;
+static my_bool	innodb_buf_flush_list_now = TRUE;
+static uint	innodb_merge_threshold_set_all_debug
+	= DICT_INDEX_MERGE_THRESHOLD_DEFAULT;
+
+/** Force an InnoDB log checkpoint. */
+static
+void
+checkpoint_now_set(THD*, st_mysql_sys_var*, void*, const void *save)
+{
+  if (!*static_cast<const my_bool*>(save))
+    return;
+  const auto size= log_sys.is_encrypted()
+    ? SIZE_OF_FILE_CHECKPOINT + 8 : SIZE_OF_FILE_CHECKPOINT;
+  mysql_mutex_unlock(&LOCK_global_system_variables);
+  lsn_t lsn;
+  while (log_sys.last_checkpoint_lsn.load(std::memory_order_acquire) + size <
+         (lsn= log_sys.get_lsn(std::memory_order_acquire)))
+    log_make_checkpoint();
+
+  mysql_mutex_lock(&LOCK_global_system_variables);
+}
+
+/****************************************************************//**
+Force a dirty pages flush now. */
+static
+void
+buf_flush_list_now_set(THD*, st_mysql_sys_var*, void*, const void* save)
+{
+	if (*(my_bool*) save) {
+		mysql_mutex_unlock(&LOCK_global_system_variables);
+		buf_flush_sync();
+		mysql_mutex_lock(&LOCK_global_system_variables);
+	}
+}
+
+/** Override the current MERGE_THRESHOLD setting for all indexes in the
+dictionary cache now.
+@param[in]	save	immediate result from check function */
+static
+void
+innodb_merge_threshold_set_all_debug_update(THD*, st_mysql_sys_var*, void*,
+					    const void* save)
+{
+	innodb_merge_threshold_set_all_debug
+		= (*static_cast<const uint*>(save));
+	dict_set_merge_threshold_all_debug(
+		innodb_merge_threshold_set_all_debug);
+}
+#endif /* UNIV_DEBUG */
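+
+/* In UNIV_DEBUG builds the checkpoint and flush-list callbacks above are
+reachable through the innodb_log_checkpoint_now and
+innodb_buf_flush_list_now debug variables (declared further below), e.g.:
+
+  SET GLOBAL innodb_log_checkpoint_now = ON;
+  SET GLOBAL innodb_buf_flush_list_now = ON;
+*/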
+
+/** Find and Retrieve the FTS doc_id for the current result row
+@param[in,out]	fts_hdl	FTS handler
+@return the document ID */
+static
+ulonglong
+innobase_fts_retrieve_docid(
+	FT_INFO_EXT*	fts_hdl)
+{
+	fts_result_t*	result;
+	row_prebuilt_t*	ft_prebuilt;
+
+	ft_prebuilt = reinterpret_cast<NEW_FT_INFO*>(fts_hdl)->ft_prebuilt;
+	result = reinterpret_cast<NEW_FT_INFO*>(fts_hdl)->ft_result;
+
+	if (ft_prebuilt->read_just_key) {
+
+		fts_ranking_t*	ranking =
+			rbt_value(fts_ranking_t, result->current);
+
+		return(ranking->doc_id);
+	}
+
+	return(ft_prebuilt->fts_doc_id);
+}
+
+/* These variables are never read by InnoDB or changed. They are dummies
+needed by the MySQL infrastructure so that the user can invoke
+buffer_pool_dump_now(), buffer_pool_load_now() and buffer_pool_load_abort()
+by doing:
+ SET GLOBAL innodb_buffer_pool_dump_now=ON;
+ SET GLOBAL innodb_buffer_pool_load_now=ON;
+ SET GLOBAL innodb_buffer_pool_load_abort=ON;
+Their values are read by MySQL and displayed to the user when the variables
+are queried, e.g.:
+ SELECT @@innodb_buffer_pool_dump_now;
+ SELECT @@innodb_buffer_pool_load_now;
+ SELECT @@innodb_buffer_pool_load_abort; */
+static my_bool	innodb_buffer_pool_dump_now = FALSE;
+static my_bool	innodb_buffer_pool_load_now = FALSE;
+static my_bool	innodb_buffer_pool_load_abort = FALSE;
+
+/****************************************************************//**
+Trigger a dump of the buffer pool if innodb_buffer_pool_dump_now is set
+to ON. This function is registered as a callback with MySQL. */
+static
+void
+buffer_pool_dump_now(
+/*=================*/
+	THD*	thd	/*!< in: thread handle */
+	MY_ATTRIBUTE((unused)),
+	struct st_mysql_sys_var*	var	/*!< in: pointer to system
+						variable */
+	MY_ATTRIBUTE((unused)),
+	void*	var_ptr	/*!< out: where the formal
+			string goes */
+	MY_ATTRIBUTE((unused)),
+	const void*	save)	/*!< in: immediate result from
+				check function */
+{
+	if (*(my_bool*) save && !srv_read_only_mode) {
+		mysql_mutex_unlock(&LOCK_global_system_variables);
+		buf_dump_start();
+		mysql_mutex_lock(&LOCK_global_system_variables);
+	}
+}
+
+/****************************************************************//**
+Trigger a load of the buffer pool if innodb_buffer_pool_load_now is set
+to ON. This function is registered as a callback with MySQL. */
+static
+void
+buffer_pool_load_now(
+/*=================*/
+	THD*	thd	/*!< in: thread handle */
+	MY_ATTRIBUTE((unused)),
+	struct st_mysql_sys_var*	var	/*!< in: pointer to system
+						variable */
+	MY_ATTRIBUTE((unused)),
+	void*	var_ptr	/*!< out: where the formal
+			string goes */
+	MY_ATTRIBUTE((unused)),
+	const void*	save)	/*!< in: immediate result from
+				check function */
+{
+	if (*(my_bool*) save && !srv_read_only_mode) {
+		mysql_mutex_unlock(&LOCK_global_system_variables);
+		buf_load_start();
+		mysql_mutex_lock(&LOCK_global_system_variables);
+	}
+}
+
+/****************************************************************//**
+Abort a load of the buffer pool if innodb_buffer_pool_load_abort
+is set to ON. This function is registered as a callback with MySQL. */
+static
+void
+buffer_pool_load_abort(
+/*===================*/
+	THD*	thd	/*!< in: thread handle */
+	MY_ATTRIBUTE((unused)),
+	struct st_mysql_sys_var*	var	/*!< in: pointer to system
+						variable */
+	MY_ATTRIBUTE((unused)),
+	void*	var_ptr	/*!< out: where the formal
+			string goes */
+	MY_ATTRIBUTE((unused)),
+	const void*	save)	/*!< in: immediate result from
+				check function */
+{
+	if (*(my_bool*) save && !srv_read_only_mode) {
+		mysql_mutex_unlock(&LOCK_global_system_variables);
+		buf_load_abort();
+		mysql_mutex_lock(&LOCK_global_system_variables);
+	}
+}
+
+#if defined __linux__ || defined _WIN32
+static void innodb_log_file_buffering_update(THD *thd, st_mysql_sys_var*,
+                                             void *, const void *save)
+{
+  mysql_mutex_unlock(&LOCK_global_system_variables);
+  log_sys.set_buffered(*static_cast<const my_bool*>(save));
+  mysql_mutex_lock(&LOCK_global_system_variables);
+}
+#endif
+
+static void innodb_log_file_size_update(THD *thd, st_mysql_sys_var*,
+                                        void *var, const void *save)
+{
+  ut_ad(var == &srv_log_file_size);
+  mysql_mutex_unlock(&LOCK_global_system_variables);
+
+  if (high_level_read_only)
+    ib_senderrf(thd, IB_LOG_LEVEL_ERROR, ER_READ_ONLY_MODE);
+  else if (!log_sys.is_pmem() &&
+           *static_cast<const ulonglong*>(save) < log_sys.buf_size)
+    my_printf_error(ER_WRONG_ARGUMENTS,
+                    "innodb_log_file_size must be at least"
+                    " innodb_log_buffer_size=%zu", MYF(0), log_sys.buf_size);
+  else
+  {
+    switch (log_sys.resize_start(*static_cast<const ulonglong*>(save))) {
+    case log_t::RESIZE_NO_CHANGE:
+      break;
+    case log_t::RESIZE_IN_PROGRESS:
+      my_printf_error(ER_WRONG_USAGE,
+                      "innodb_log_file_size change is already in progress",
+                      MYF(0));
+      break;
+    case log_t::RESIZE_FAILED:
+      ib_senderrf(thd, IB_LOG_LEVEL_ERROR, ER_CANT_CREATE_HANDLER_FILE);
+      break;
+    case log_t::RESIZE_STARTED:
+      for (timespec abstime;;)
+      {
+        if (thd_kill_level(thd))
+        {
+          log_sys.resize_abort();
+          break;
+        }
+
+        set_timespec(abstime, 5);
+        mysql_mutex_lock(&buf_pool.flush_list_mutex);
+        const bool in_progress(buf_pool.get_oldest_modification(LSN_MAX) <
+                               log_sys.resize_in_progress());
+        if (in_progress)
+          my_cond_timedwait(&buf_pool.do_flush_list,
+                            &buf_pool.flush_list_mutex.m_mutex, &abstime);
+        mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+        if (!log_sys.resize_in_progress())
+          break;
+      }
+    }
+  }
+  mysql_mutex_lock(&LOCK_global_system_variables);
+}
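+
+/* Illustrative online resize through the update callback above; per the
+code, the statement waits until the resize completes (or the client is
+killed, which aborts it), and on a non-PMEM log it fails if the requested
+size is below innodb_log_buffer_size:
+
+  SET GLOBAL innodb_log_file_size = 512*1024*1024;
+*/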
+
+/** Update innodb_status_output or innodb_status_output_locks,
+which control InnoDB "status monitor" output to the error log.
+@param[out]	var	current value
+@param[in]	save	to-be-assigned value */
+static
+void
+innodb_status_output_update(THD*, st_mysql_sys_var*, void* var,
+                            const void* save)
+{
+  if (srv_monitor_timer)
+  {
+    *static_cast<my_bool*>(var)= *static_cast<const my_bool*>(save);
+    mysql_mutex_unlock(&LOCK_global_system_variables);
+    /* Wake up the server monitor. */
+    srv_monitor_timer_schedule_now();
+    mysql_mutex_lock(&LOCK_global_system_variables);
+  }
+}
+
+/** Update the system variable innodb_encryption_threads.
+@param[in]	save	to-be-assigned value */
+static
+void
+innodb_encryption_threads_update(THD*, st_mysql_sys_var*, void*,
+                                 const void* save)
+{
+  mysql_mutex_unlock(&LOCK_global_system_variables);
+  fil_crypt_set_thread_cnt(*static_cast<const uint*>(save));
+  mysql_mutex_lock(&LOCK_global_system_variables);
+}
+
+/** Update the system variable innodb_encryption_rotate_key_age.
+@param[in]	save	to-be-assigned value */
+static
+void
+innodb_encryption_rotate_key_age_update(THD*, st_mysql_sys_var*, void*,
+					const void* save)
+{
+  mysql_mutex_unlock(&LOCK_global_system_variables);
+  fil_crypt_set_rotate_key_age(*static_cast<const uint*>(save));
+  mysql_mutex_lock(&LOCK_global_system_variables);
+}
+
+/** Update the system variable innodb_encryption_rotation_iops.
+@param[in]	save	to-be-assigned value */
+static
+void
+innodb_encryption_rotation_iops_update(THD*, st_mysql_sys_var*, void*,
+				       const void* save)
+{
+  mysql_mutex_unlock(&LOCK_global_system_variables);
+  fil_crypt_set_rotation_iops(*static_cast<const uint*>(save));
+  mysql_mutex_lock(&LOCK_global_system_variables);
+}
+
+/** Update the system variable innodb_encrypt_tables.
+@param[in]	save	to-be-assigned value */
+static
+void
+innodb_encrypt_tables_update(THD*, st_mysql_sys_var*, void*, const void* save)
+{
+  mysql_mutex_unlock(&LOCK_global_system_variables);
+  fil_crypt_set_encrypt_tables(*static_cast<const ulong*>(save));
+  mysql_mutex_lock(&LOCK_global_system_variables);
+}
+
+static SHOW_VAR innodb_status_variables_export[]= {
+  SHOW_FUNC_ENTRY("Innodb", &show_innodb_vars),
+  {NullS, NullS, SHOW_LONG}
+};
+
+static struct st_mysql_storage_engine innobase_storage_engine=
+{ MYSQL_HANDLERTON_INTERFACE_VERSION };
+
+#ifdef WITH_WSREP
+/** Request a transaction to be killed that holds a conflicting lock.
+@param bf_trx brute force applier transaction +@param thd_id thd_get_thread_id(victim_trx->mysql_htd) +@param trx_id victim_trx->id */ +void lock_wait_wsrep_kill(trx_t *bf_trx, ulong thd_id, trx_id_t trx_id) +{ + THD *bf_thd= bf_trx->mysql_thd; + + if (THD *vthd= find_thread_by_id(thd_id)) + { + bool aborting= false; + wsrep_thd_LOCK(vthd); + trx_t *vtrx= thd_to_trx(vthd); + if (vtrx) + { + /* Do not bother with lock elision using transactional memory here; + this is rather complex code */ + LockMutexGuard g{SRW_LOCK_CALL}; + mysql_mutex_lock(&lock_sys.wait_mutex); + vtrx->mutex_lock(); + /* victim transaction is either active or prepared, if it has already + proceeded to replication phase */ + if (vtrx->id == trx_id) + { + switch (vtrx->state) { + default: + break; + case TRX_STATE_PREPARED: + if (!wsrep_is_wsrep_xid(&vtrx->xid)) + break; + /* fall through */ + case TRX_STATE_ACTIVE: + WSREP_LOG_CONFLICT(bf_thd, vthd, TRUE); + WSREP_DEBUG("Aborter BF trx_id: " TRX_ID_FMT " thread: %ld " + "seqno: %lld client_state: %s " + "client_mode: %s transaction_mode: %s query: %s", + bf_trx->id, + thd_get_thread_id(bf_thd), + wsrep_thd_trx_seqno(bf_thd), + wsrep_thd_client_state_str(bf_thd), + wsrep_thd_client_mode_str(bf_thd), + wsrep_thd_transaction_state_str(bf_thd), + wsrep_thd_query(bf_thd)); + WSREP_DEBUG("Victim %s trx_id: " TRX_ID_FMT " thread: %ld " + "seqno: %lld client_state: %s " + "client_mode: %s transaction_mode: %s query: %s", + wsrep_thd_is_BF(vthd, false) ? "BF" : "normal", + vtrx->id, + thd_get_thread_id(vthd), + wsrep_thd_trx_seqno(vthd), + wsrep_thd_client_state_str(vthd), + wsrep_thd_client_mode_str(vthd), + wsrep_thd_transaction_state_str(vthd), + wsrep_thd_query(vthd)); + aborting= true; + } + } + mysql_mutex_unlock(&lock_sys.wait_mutex); + vtrx->mutex_unlock(); + } + + DEBUG_SYNC(bf_thd, "before_wsrep_thd_abort"); + if (aborting && wsrep_thd_bf_abort(bf_thd, vthd, true)) + { + /* Need to grab mutexes again to ensure that the trx is still in + right state. */ + lock_sys.wr_lock(SRW_LOCK_CALL); + mysql_mutex_lock(&lock_sys.wait_mutex); + vtrx->mutex_lock(); + + /* if victim is waiting for some other lock, we have to cancel + that waiting + */ + if (vtrx->id == trx_id) + { + switch (vtrx->state) { + default: + break; + case TRX_STATE_ACTIVE: + case TRX_STATE_PREPARED: + lock_sys.cancel_lock_wait_for_wsrep_bf_abort(vtrx); + } + } + lock_sys.wr_unlock(); + mysql_mutex_unlock(&lock_sys.wait_mutex); + vtrx->mutex_unlock(); + } + else + { + WSREP_DEBUG("wsrep_thd_bf_abort has failed, victim %lu will survive", + thd_get_thread_id(vthd)); + } + wsrep_thd_UNLOCK(vthd); + wsrep_thd_kill_UNLOCK(vthd); + } +} + +/** This function forces the victim transaction to abort. Aborting the + transaction does NOT end it, it still has to be rolled back. + + The caller must lock LOCK_thd_kill and LOCK_thd_data. 
+ + @param bf_thd brute force THD asking for the abort + @param victim_thd victim THD to be aborted +*/ +static void wsrep_abort_transaction(handlerton *, THD *bf_thd, THD *victim_thd, + my_bool signal) +{ + DBUG_ENTER("wsrep_abort_transaction"); + ut_ad(bf_thd); + ut_ad(victim_thd); + + trx_t *victim_trx= thd_to_trx(victim_thd); + + WSREP_DEBUG("abort transaction: BF: %s victim: %s victim conf: %s", + wsrep_thd_query(bf_thd), wsrep_thd_query(victim_thd), + wsrep_thd_transaction_state_str(victim_thd)); + + if (!victim_trx) + { + WSREP_DEBUG("abort transaction: victim did not exist"); + DBUG_VOID_RETURN; + } + + lock_sys.wr_lock(SRW_LOCK_CALL); + mysql_mutex_lock(&lock_sys.wait_mutex); + victim_trx->mutex_lock(); + + switch (victim_trx->state) { + default: + break; + case TRX_STATE_ACTIVE: + case TRX_STATE_PREPARED: + /* Cancel lock wait if the victim is waiting for a lock in InnoDB. + The transaction which is blocked somewhere else (e.g. waiting + for next command or MDL) has been interrupted by THD::awake_no_mutex() + on server level before calling this function. */ + lock_sys.cancel_lock_wait_for_wsrep_bf_abort(victim_trx); + } + lock_sys.wr_unlock(); + mysql_mutex_unlock(&lock_sys.wait_mutex); + victim_trx->mutex_unlock(); + + DBUG_VOID_RETURN; +} + +static +int +innobase_wsrep_set_checkpoint( +/*==========================*/ + handlerton* hton, + const XID* xid) +{ + DBUG_ASSERT(hton == innodb_hton_ptr); + + if (wsrep_is_wsrep_xid(xid)) { + + trx_rseg_update_wsrep_checkpoint(xid); + log_buffer_flush_to_disk(srv_flush_log_at_trx_commit == 1); + return 0; + } else { + return 1; + } +} + +static +int +innobase_wsrep_get_checkpoint( +/*==========================*/ + handlerton* hton, + XID* xid) +{ + DBUG_ASSERT(hton == innodb_hton_ptr); + trx_rseg_read_wsrep_checkpoint(*xid); + return 0; +} +#endif /* WITH_WSREP */ + +/* plugin options */ + +static MYSQL_SYSVAR_ENUM(checksum_algorithm, srv_checksum_algorithm, + PLUGIN_VAR_RQCMDARG, + "The algorithm InnoDB uses for page checksumming. Possible values are" + " FULL_CRC32" + " for new files, always use CRC-32C; for old, see CRC32 below;" + " STRICT_FULL_CRC32" + " for new files, always use CRC-32C; for old, see STRICT_CRC32 below;" + " CRC32" + " write crc32, allow previously used algorithms to match when reading;" + " STRICT_CRC32" + " write crc32, do not allow other algorithms to match when reading;" + " New files created with full_crc32 are readable by MariaDB 10.4.3+", + NULL, NULL, SRV_CHECKSUM_ALGORITHM_FULL_CRC32, + &innodb_checksum_algorithm_typelib); + +static MYSQL_SYSVAR_STR(data_home_dir, innobase_data_home_dir, + PLUGIN_VAR_READONLY, + "The common part for InnoDB table spaces.", + NULL, NULL, NULL); + +static MYSQL_SYSVAR_BOOL(doublewrite, srv_use_doublewrite_buf, + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, + "Enable InnoDB doublewrite buffer (enabled by default)." 
+ " Disable with --skip-innodb-doublewrite.", + NULL, NULL, TRUE); + +static MYSQL_SYSVAR_BOOL(use_atomic_writes, srv_use_atomic_writes, + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, + "Enable atomic writes, instead of using the doublewrite buffer, for files " + "on devices that supports atomic writes.", + NULL, NULL, TRUE); + +static MYSQL_SYSVAR_BOOL(stats_include_delete_marked, + srv_stats_include_delete_marked, + PLUGIN_VAR_OPCMDARG, + "Include delete marked records when calculating persistent statistics", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_ENUM(instant_alter_column_allowed, + innodb_instant_alter_column_allowed, + PLUGIN_VAR_RQCMDARG, + "File format constraint for ALTER TABLE", NULL, NULL, 2/*add_drop_reorder*/, + &innodb_instant_alter_column_allowed_typelib); + +static MYSQL_SYSVAR_ULONG(io_capacity, srv_io_capacity, + PLUGIN_VAR_RQCMDARG, + "Number of IOPs the server can do. Tunes the background IO rate", + NULL, innodb_io_capacity_update, 200, 100, ~0UL, 0); + +static MYSQL_SYSVAR_ULONG(io_capacity_max, srv_max_io_capacity, + PLUGIN_VAR_RQCMDARG, + "Limit to which innodb_io_capacity can be inflated.", + NULL, innodb_io_capacity_max_update, + SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT, 100, + SRV_MAX_IO_CAPACITY_LIMIT, 0); + +#ifdef UNIV_DEBUG +static MYSQL_SYSVAR_BOOL(log_checkpoint_now, innodb_log_checkpoint_now, + PLUGIN_VAR_OPCMDARG, + "Force checkpoint now", + NULL, checkpoint_now_set, FALSE); + +static MYSQL_SYSVAR_BOOL(buf_flush_list_now, innodb_buf_flush_list_now, + PLUGIN_VAR_OPCMDARG, + "Force dirty page flush now", + NULL, buf_flush_list_now_set, FALSE); + +static MYSQL_SYSVAR_UINT(merge_threshold_set_all_debug, + innodb_merge_threshold_set_all_debug, + PLUGIN_VAR_RQCMDARG, + "Override current MERGE_THRESHOLD setting for all indexes at dictionary" + " cache by the specified value dynamically, at the time.", + NULL, innodb_merge_threshold_set_all_debug_update, + DICT_INDEX_MERGE_THRESHOLD_DEFAULT, 1, 50, 0); +#endif /* UNIV_DEBUG */ + +static MYSQL_SYSVAR_ULONG(purge_batch_size, srv_purge_batch_size, + PLUGIN_VAR_OPCMDARG, + "Number of UNDO log pages to purge in one batch from the history list.", + NULL, NULL, + 1000, /* Default setting */ + 1, /* Minimum value */ + innodb_purge_batch_size_MAX, 0); + +extern void srv_update_purge_thread_count(uint n); + +static +void +innodb_purge_threads_update(THD*, struct st_mysql_sys_var*, void*, const void*save ) +{ + srv_update_purge_thread_count(*static_cast(save)); +} + +static MYSQL_SYSVAR_UINT(purge_threads, srv_n_purge_threads, + PLUGIN_VAR_OPCMDARG, + "Number of tasks for purging transaction history", + NULL, innodb_purge_threads_update, + 4, /* Default setting */ + 1, /* Minimum value */ + innodb_purge_threads_MAX, /* Maximum value */ + 0); + +static MYSQL_SYSVAR_UINT(fast_shutdown, srv_fast_shutdown, + PLUGIN_VAR_OPCMDARG, + "Speeds up the shutdown process of the InnoDB storage engine. 
Possible" + " values are 0, 1 (faster), 2 (crash-like), 3 (fastest clean).", + fast_shutdown_validate, NULL, 1, 0, 3, 0); + +static MYSQL_SYSVAR_BOOL(file_per_table, srv_file_per_table, + PLUGIN_VAR_NOCMDARG, + "Stores each InnoDB table to an .ibd file in the database dir.", + NULL, NULL, TRUE); + +static MYSQL_SYSVAR_STR(ft_server_stopword_table, innobase_server_stopword_table, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_MEMALLOC, + "The user supplied stopword table name.", + innodb_stopword_table_validate, + NULL, + NULL); + +static MYSQL_SYSVAR_UINT(flush_log_at_timeout, srv_flush_log_at_timeout, + PLUGIN_VAR_OPCMDARG, + "Write and flush logs every (n) second.", + NULL, NULL, 1, 0, 2700, 0); + +static MYSQL_SYSVAR_ULONG(flush_log_at_trx_commit, srv_flush_log_at_trx_commit, + PLUGIN_VAR_OPCMDARG, + "Controls the durability/speed trade-off for commits." + " Set to 0 (write and flush redo log to disk only once per second)," + " 1 (flush to disk at each commit)," + " 2 (write to log at commit but flush to disk only once per second)" + " or 3 (flush to disk at prepare and at commit, slower and usually redundant)." + " 1 and 3 guarantees that after a crash, committed transactions will" + " not be lost and will be consistent with the binlog and other transactional" + " engines. 2 can get inconsistent and lose transactions if there is a" + " power failure or kernel crash but not if mysqld crashes. 0 has no" + " guarantees in case of crash. 0 and 2 can be faster than 1 or 3.", + NULL, NULL, 1, 0, 3, 0); + +static MYSQL_SYSVAR_ENUM(flush_method, srv_file_flush_method, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "With which method to flush data.", + NULL, NULL, IF_WIN(SRV_ALL_O_DIRECT_FSYNC, SRV_O_DIRECT), + &innodb_flush_method_typelib); + +static MYSQL_SYSVAR_STR(log_group_home_dir, srv_log_group_home_dir, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Path to ib_logfile0", NULL, NULL, NULL); + +static MYSQL_SYSVAR_DOUBLE(max_dirty_pages_pct, srv_max_buf_pool_modified_pct, + PLUGIN_VAR_RQCMDARG, + "Percentage of dirty pages allowed in bufferpool.", + NULL, innodb_max_dirty_pages_pct_update, 90.0, 0, 99.999, 0); + +static MYSQL_SYSVAR_DOUBLE(max_dirty_pages_pct_lwm, + srv_max_dirty_pages_pct_lwm, + PLUGIN_VAR_RQCMDARG, + "Percentage of dirty pages at which flushing kicks in. 
" + "The value 0 (default) means 'refer to innodb_max_dirty_pages_pct'.", + NULL, innodb_max_dirty_pages_pct_lwm_update, 0, 0, 99.999, 0); + +static MYSQL_SYSVAR_DOUBLE(adaptive_flushing_lwm, + srv_adaptive_flushing_lwm, + PLUGIN_VAR_RQCMDARG, + "Percentage of log capacity below which no adaptive flushing happens.", + NULL, NULL, 10.0, 0.0, 70.0, 0); + +static MYSQL_SYSVAR_BOOL(adaptive_flushing, srv_adaptive_flushing, + PLUGIN_VAR_NOCMDARG, + "Attempt flushing dirty pages to avoid IO bursts at checkpoints.", + NULL, NULL, TRUE); + +static MYSQL_SYSVAR_BOOL(flush_sync, srv_flush_sync, + PLUGIN_VAR_NOCMDARG, + "Allow IO bursts at the checkpoints ignoring io_capacity setting.", + NULL, NULL, TRUE); + +static MYSQL_SYSVAR_ULONG(flushing_avg_loops, + srv_flushing_avg_loops, + PLUGIN_VAR_RQCMDARG, + "Number of iterations over which the background flushing is averaged.", + NULL, NULL, 30, 1, 1000, 0); + +static MYSQL_SYSVAR_ULONG(max_purge_lag, srv_max_purge_lag, + PLUGIN_VAR_RQCMDARG, + "Desired maximum length of the purge queue (0 = no limit)", + NULL, NULL, 0, 0, ~0UL, 0); + +static MYSQL_SYSVAR_ULONG(max_purge_lag_delay, srv_max_purge_lag_delay, + PLUGIN_VAR_RQCMDARG, + "Maximum delay of user threads in micro-seconds", + NULL, NULL, + 0L, /* Default seting */ + 0L, /* Minimum value */ + 10000000UL, 0); /* Maximum value */ + +static MYSQL_SYSVAR_UINT(max_purge_lag_wait, innodb_max_purge_lag_wait, + PLUGIN_VAR_RQCMDARG, + "Wait until History list length is below the specified limit", + NULL, innodb_max_purge_lag_wait_update, UINT_MAX, 0, UINT_MAX, 0); + +static MYSQL_SYSVAR_BOOL(rollback_on_timeout, innobase_rollback_on_timeout, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, + "Roll back the complete transaction on lock wait timeout, for 4.x compatibility (disabled by default)", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_BOOL(status_file, innobase_create_status_file, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_NOSYSVAR, + "Enable SHOW ENGINE INNODB STATUS output in the innodb_status. 
file", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_BOOL(stats_on_metadata, innobase_stats_on_metadata, + PLUGIN_VAR_OPCMDARG, + "Enable statistics gathering for metadata commands such as" + " SHOW TABLE STATUS for tables that use transient statistics (off by default)", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_ULONGLONG(stats_transient_sample_pages, + srv_stats_transient_sample_pages, + PLUGIN_VAR_RQCMDARG, + "The number of leaf index pages to sample when calculating transient" + " statistics (if persistent statistics are not used, default 8)", + NULL, NULL, 8, 1, ~0ULL, 0); + +static MYSQL_SYSVAR_BOOL(stats_persistent, srv_stats_persistent, + PLUGIN_VAR_OPCMDARG, + "InnoDB persistent statistics enabled for all tables unless overridden" + " at table level", + NULL, NULL, TRUE); + +static MYSQL_SYSVAR_BOOL(stats_auto_recalc, srv_stats_auto_recalc, + PLUGIN_VAR_OPCMDARG, + "InnoDB automatic recalculation of persistent statistics enabled for all" + " tables unless overridden at table level (automatic recalculation is only" + " done when InnoDB decides that the table has changed too much and needs a" + " new statistics)", + NULL, NULL, TRUE); + +static MYSQL_SYSVAR_ULONGLONG(stats_persistent_sample_pages, + srv_stats_persistent_sample_pages, + PLUGIN_VAR_RQCMDARG, + "The number of leaf index pages to sample when calculating persistent" + " statistics (by ANALYZE, default 20)", + NULL, NULL, 20, 1, ~0ULL, 0); + +static MYSQL_SYSVAR_ULONGLONG(stats_modified_counter, srv_stats_modified_counter, + PLUGIN_VAR_RQCMDARG, + "The number of rows modified before we calculate new statistics (default 0 = current limits)", + NULL, NULL, 0, 0, ~0ULL, 0); + +static MYSQL_SYSVAR_BOOL(stats_traditional, srv_stats_sample_traditional, + PLUGIN_VAR_RQCMDARG, + "Enable traditional statistic calculation based on number of configured pages (default true)", + NULL, NULL, TRUE); + +#ifdef BTR_CUR_HASH_ADAPT +static MYSQL_SYSVAR_BOOL(adaptive_hash_index, btr_search_enabled, + PLUGIN_VAR_OPCMDARG, + "Enable InnoDB adaptive hash index (disabled by default).", + NULL, innodb_adaptive_hash_index_update, false); + +/** Number of distinct partitions of AHI. +Each partition is protected by its own latch and so we have parts number +of latches protecting complete search system. */ +static MYSQL_SYSVAR_ULONG(adaptive_hash_index_parts, btr_ahi_parts, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, + "Number of InnoDB Adaptive Hash Index Partitions (default 8)", + NULL, NULL, 8, 1, 512, 0); +#endif /* BTR_CUR_HASH_ADAPT */ + +static MYSQL_SYSVAR_UINT(compression_level, page_zip_level, + PLUGIN_VAR_RQCMDARG, + "Compression level used for zlib compression. 0 is no compression" + ", 1 is fastest, 9 is best compression and default is 6.", + NULL, NULL, DEFAULT_COMPRESSION_LEVEL, 0, 9, 0); + +static MYSQL_SYSVAR_UINT(autoextend_increment, + sys_tablespace_auto_extend_increment, + PLUGIN_VAR_RQCMDARG, + "Data file autoextend increment in megabytes", + NULL, NULL, 64, 1, 1000, 0); + +static MYSQL_SYSVAR_SIZE_T(buffer_pool_chunk_size, srv_buf_pool_chunk_unit, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Size of a single memory chunk" + " for resizing buffer pool. Online buffer pool resizing happens at this" + " granularity. 
0 means autosize this variable based on buffer pool size.", + NULL, NULL, + 0, 0, SIZE_T_MAX, 1024 * 1024); + +static MYSQL_SYSVAR_STR(buffer_pool_filename, srv_buf_dump_filename, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Filename to/from which to dump/load the InnoDB buffer pool", + NULL, NULL, SRV_BUF_DUMP_FILENAME_DEFAULT); + +static MYSQL_SYSVAR_BOOL(buffer_pool_dump_now, innodb_buffer_pool_dump_now, + PLUGIN_VAR_RQCMDARG, + "Trigger an immediate dump of the buffer pool into a file named @@innodb_buffer_pool_filename", + NULL, buffer_pool_dump_now, FALSE); + +static MYSQL_SYSVAR_BOOL(buffer_pool_dump_at_shutdown, srv_buffer_pool_dump_at_shutdown, + PLUGIN_VAR_RQCMDARG, + "Dump the buffer pool into a file named @@innodb_buffer_pool_filename", + NULL, NULL, TRUE); + +static MYSQL_SYSVAR_ULONG(buffer_pool_dump_pct, srv_buf_pool_dump_pct, + PLUGIN_VAR_RQCMDARG, + "Dump only the hottest N% of each buffer pool, defaults to 25", + NULL, NULL, 25, 1, 100, 0); + +#ifdef UNIV_DEBUG +/* Added to test the innodb_buffer_pool_load_incomplete status variable. */ +static MYSQL_SYSVAR_ULONG(buffer_pool_load_pages_abort, srv_buf_pool_load_pages_abort, + PLUGIN_VAR_RQCMDARG, + "Number of pages during a buffer pool load to process before signaling innodb_buffer_pool_load_abort=1", + NULL, NULL, LONG_MAX, 1, LONG_MAX, 0); + +static MYSQL_SYSVAR_STR(buffer_pool_evict, srv_buffer_pool_evict, + PLUGIN_VAR_RQCMDARG, + "Evict pages from the buffer pool", + NULL, innodb_buffer_pool_evict_update, ""); +#endif /* UNIV_DEBUG */ + +static MYSQL_SYSVAR_BOOL(buffer_pool_load_now, innodb_buffer_pool_load_now, + PLUGIN_VAR_RQCMDARG, + "Trigger an immediate load of the buffer pool from a file named @@innodb_buffer_pool_filename", + NULL, buffer_pool_load_now, FALSE); + +static MYSQL_SYSVAR_BOOL(buffer_pool_load_abort, innodb_buffer_pool_load_abort, + PLUGIN_VAR_RQCMDARG, + "Abort a currently running load of the buffer pool", + NULL, buffer_pool_load_abort, FALSE); + +/* there is no point in changing this during runtime, thus readonly */ +static MYSQL_SYSVAR_BOOL(buffer_pool_load_at_startup, srv_buffer_pool_load_at_startup, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Load the buffer pool from a file named @@innodb_buffer_pool_filename", + NULL, NULL, TRUE); + +static MYSQL_SYSVAR_BOOL(defragment, srv_defragment, + PLUGIN_VAR_RQCMDARG, + "Enable/disable InnoDB defragmentation (default FALSE). When set to FALSE, all existing " + "defragmentation will be paused. And new defragmentation command will fail." + "Paused defragmentation commands will resume when this variable is set to " + "true again.", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_UINT(defragment_n_pages, srv_defragment_n_pages, + PLUGIN_VAR_RQCMDARG, + "Number of pages considered at once when merging multiple pages to " + "defragment", + NULL, NULL, 7, 2, 32, 0); + +static MYSQL_SYSVAR_UINT(defragment_stats_accuracy, + srv_defragment_stats_accuracy, + PLUGIN_VAR_RQCMDARG, + "How many defragment stats changes there are before the stats " + "are written to persistent storage. Set to 0 meaning disable " + "defragment stats tracking.", + NULL, NULL, 0, 0, ~0U, 0); + +static MYSQL_SYSVAR_UINT(defragment_fill_factor_n_recs, + srv_defragment_fill_factor_n_recs, + PLUGIN_VAR_RQCMDARG, + "How many records of space defragmentation should leave on the page. " + "This variable, together with innodb_defragment_fill_factor, is introduced " + "so defragmentation won't pack the page too full and cause page split on " + "the next insert on every page. 
The variable indicating more defragmentation" + " gain is the one effective.", + NULL, NULL, 20, 1, 100, 0); + +static MYSQL_SYSVAR_DOUBLE(defragment_fill_factor, srv_defragment_fill_factor, + PLUGIN_VAR_RQCMDARG, + "A number between [0.7, 1] that tells defragmentation how full it should " + "fill a page. Default is 0.9. Number below 0.7 won't make much sense." + "This variable, together with innodb_defragment_fill_factor_n_recs, is " + "introduced so defragmentation won't pack the page too full and cause " + "page split on the next insert on every page. The variable indicating more " + "defragmentation gain is the one effective.", + NULL, NULL, 0.9, 0.7, 1, 0); + +static MYSQL_SYSVAR_UINT(defragment_frequency, srv_defragment_frequency, + PLUGIN_VAR_RQCMDARG, + "Do not defragment a single index more than this number of time per second." + "This controls the number of time defragmentation thread can request X_LOCK " + "on an index. Defragmentation thread will check whether " + "1/defragment_frequency (s) has passed since it worked on this index last " + "time, and put the index back to the queue if not enough time has passed. " + "The actual frequency can only be lower than this given number.", + NULL, innodb_defragment_frequency_update, + SRV_DEFRAGMENT_FREQUENCY_DEFAULT, 1, 1000, 0); + + +static MYSQL_SYSVAR_ULONG(lru_scan_depth, srv_LRU_scan_depth, + PLUGIN_VAR_RQCMDARG, + "How deep to scan LRU to keep it clean", + NULL, NULL, 1536, 100, ~0UL, 0); + +static MYSQL_SYSVAR_SIZE_T(lru_flush_size, innodb_lru_flush_size, + PLUGIN_VAR_RQCMDARG, + "How many pages to flush on LRU eviction", + NULL, NULL, 32, 1, SIZE_T_MAX, 0); + +static MYSQL_SYSVAR_ULONG(flush_neighbors, srv_flush_neighbors, + PLUGIN_VAR_OPCMDARG, + "Set to 0 (don't flush neighbors from buffer pool)," + " 1 (flush contiguous neighbors from buffer pool)" + " or 2 (flush neighbors from buffer pool)," + " when flushing a block", + NULL, NULL, 1, 0, 2, 0); + +static MYSQL_SYSVAR_BOOL(deadlock_detect, innodb_deadlock_detect, + PLUGIN_VAR_NOCMDARG, + "Enable/disable InnoDB deadlock detector (default ON)." 
+ " if set to OFF, deadlock detection is skipped," + " and we rely on innodb_lock_wait_timeout in case of deadlock.", + NULL, NULL, TRUE); + +static MYSQL_SYSVAR_ENUM(deadlock_report, innodb_deadlock_report, + PLUGIN_VAR_RQCMDARG, + "How to report deadlocks (if innodb_deadlock_detect=ON).", + NULL, NULL, Deadlock::REPORT_FULL, &innodb_deadlock_report_typelib); + +static MYSQL_SYSVAR_UINT(fill_factor, innobase_fill_factor, + PLUGIN_VAR_RQCMDARG, + "Percentage of B-tree page filled during bulk insert", + NULL, NULL, 100, 10, 100, 0); + +static MYSQL_SYSVAR_BOOL(ft_enable_diag_print, fts_enable_diag_print, + PLUGIN_VAR_OPCMDARG, + "Whether to enable additional FTS diagnostic printout ", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_BOOL(disable_sort_file_cache, srv_disable_sort_file_cache, + PLUGIN_VAR_OPCMDARG, + "Whether to disable OS system file cache for sort I/O", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_STR(ft_aux_table, innodb_ft_aux_table, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC, + "FTS internal auxiliary table to be checked", + innodb_ft_aux_table_validate, NULL, NULL); + +#if UNIV_WORD_SIZE == 4 + +static MYSQL_SYSVAR_SIZE_T(ft_cache_size, + *reinterpret_cast(&fts_max_cache_size), + PLUGIN_VAR_RQCMDARG, + "InnoDB Fulltext search cache size in bytes", + NULL, innodb_ft_cache_size_update, 8000000, 1600000, 1U << 29, 0); + +static MYSQL_SYSVAR_SIZE_T(ft_total_cache_size, + *reinterpret_cast(&fts_max_total_cache_size), + PLUGIN_VAR_RQCMDARG, + "Total memory allocated for InnoDB Fulltext Search cache", + NULL, innodb_ft_total_cache_size_update, 640000000, 32000000, 1600000000, 0); + +#else + +static MYSQL_SYSVAR_SIZE_T(ft_cache_size, + *reinterpret_cast(&fts_max_cache_size), + PLUGIN_VAR_RQCMDARG, + "InnoDB Fulltext search cache size in bytes", + NULL, innodb_ft_cache_size_update, 8000000, 1600000, 1ULL << 40, 0); + +static MYSQL_SYSVAR_SIZE_T(ft_total_cache_size, + *reinterpret_cast(&fts_max_total_cache_size), + PLUGIN_VAR_RQCMDARG, + "Total memory allocated for InnoDB Fulltext Search cache", + NULL, innodb_ft_total_cache_size_update, 640000000, 32000000, 1ULL << 40, 0); + +#endif + +static MYSQL_SYSVAR_SIZE_T(ft_result_cache_limit, fts_result_cache_limit, + PLUGIN_VAR_RQCMDARG, + "InnoDB Fulltext search query result cache limit in bytes", + NULL, NULL, 2000000000L, 1000000L, SIZE_T_MAX, 0); + +static MYSQL_SYSVAR_ULONG(ft_min_token_size, fts_min_token_size, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "InnoDB Fulltext search minimum token size in characters", + NULL, NULL, 3, 0, 16, 0); + +static MYSQL_SYSVAR_ULONG(ft_max_token_size, fts_max_token_size, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "InnoDB Fulltext search maximum token size in characters", + NULL, NULL, FTS_MAX_WORD_LEN_IN_CHAR, 10, FTS_MAX_WORD_LEN_IN_CHAR, 0); + +static MYSQL_SYSVAR_ULONG(ft_num_word_optimize, fts_num_word_optimize, + PLUGIN_VAR_OPCMDARG, + "InnoDB Fulltext search number of words to optimize for each optimize table call ", + NULL, NULL, 2000, 1000, 10000, 0); + +static MYSQL_SYSVAR_ULONG(ft_sort_pll_degree, fts_sort_pll_degree, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "InnoDB Fulltext search parallel sort degree, will round up to nearest power of 2 number", + NULL, NULL, 2, 1, 16, 0); + +static MYSQL_SYSVAR_ULONG(sort_buffer_size, srv_sort_buf_size, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Memory buffer size for index creation", + NULL, NULL, 1048576, 65536, 64<<20, 0); + +static MYSQL_SYSVAR_ULONGLONG(online_alter_log_max_size, srv_online_max_size, + PLUGIN_VAR_RQCMDARG, + 
"Maximum modification log file size for online index creation", + NULL, NULL, 128<<20, 65536, ~0ULL, 0); + +static MYSQL_SYSVAR_BOOL(optimize_fulltext_only, innodb_optimize_fulltext_only, + PLUGIN_VAR_NOCMDARG, + "Only optimize the Fulltext index of the table", + NULL, NULL, FALSE); + +extern int os_aio_resize(ulint n_reader_threads, ulint n_writer_threads); +static void innodb_update_io_thread_count(THD *thd,ulint n_read, ulint n_write) +{ + int res = os_aio_resize(n_read, n_write); + if (res) + { +#ifndef __linux__ + ut_ad(0); +#else + ut_a(srv_use_native_aio); + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_UNKNOWN_ERROR, + "Could not reserve max. number of concurrent ios." + "Increase the /proc/sys/fs/aio-max-nr to fix."); +#endif + } +} + +static void innodb_read_io_threads_update(THD* thd, struct st_mysql_sys_var*, void*, const void* save) +{ + srv_n_read_io_threads = *static_cast(save); + innodb_update_io_thread_count(thd, srv_n_read_io_threads, srv_n_write_io_threads); +} +static void innodb_write_io_threads_update(THD* thd, struct st_mysql_sys_var*, void*, const void* save) +{ + srv_n_write_io_threads = *static_cast(save); + innodb_update_io_thread_count(thd, srv_n_read_io_threads, srv_n_write_io_threads); +} + +static MYSQL_SYSVAR_UINT(read_io_threads, srv_n_read_io_threads, + PLUGIN_VAR_RQCMDARG, + "Number of background read I/O threads in InnoDB.", + NULL, innodb_read_io_threads_update , 4, 1, 64, 0); + +static MYSQL_SYSVAR_UINT(write_io_threads, srv_n_write_io_threads, + PLUGIN_VAR_RQCMDARG, + "Number of background write I/O threads in InnoDB.", + NULL, innodb_write_io_threads_update, 4, 2, 64, 0); + +static MYSQL_SYSVAR_ULONG(force_recovery, srv_force_recovery, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Helps to save your data in case the disk image of the database becomes corrupt. Value 5 can return bogus data, and 6 can permanently corrupt data.", + NULL, NULL, 0, 0, 6, 0); + +static MYSQL_SYSVAR_ULONG(page_size, srv_page_size, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, + "Page size to use for all InnoDB tablespaces.", + NULL, NULL, UNIV_PAGE_SIZE_DEF, + UNIV_PAGE_SIZE_MIN, UNIV_PAGE_SIZE_MAX, 0); + +static MYSQL_SYSVAR_SIZE_T(log_buffer_size, log_sys.buf_size, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Redo log buffer size in bytes.", + NULL, NULL, 16U << 20, 2U << 20, SIZE_T_MAX, 4096); + +#if defined __linux__ || defined _WIN32 +static MYSQL_SYSVAR_BOOL(log_file_buffering, log_sys.log_buffered, + PLUGIN_VAR_OPCMDARG, + "Whether the file system cache for ib_logfile0 is enabled", + nullptr, innodb_log_file_buffering_update, FALSE); +#endif + +static MYSQL_SYSVAR_ULONGLONG(log_file_size, srv_log_file_size, + PLUGIN_VAR_RQCMDARG, + "Redo log size in bytes.", + nullptr, innodb_log_file_size_update, + 96 << 20, 4 << 20, std::numeric_limits::max(), 4096); + +static MYSQL_SYSVAR_UINT(old_blocks_pct, innobase_old_blocks_pct, + PLUGIN_VAR_RQCMDARG, + "Percentage of the buffer pool to reserve for 'old' blocks.", + NULL, innodb_old_blocks_pct_update, 100 * 3 / 8, 5, 95, 0); + +static MYSQL_SYSVAR_UINT(old_blocks_time, buf_LRU_old_threshold_ms, + PLUGIN_VAR_RQCMDARG, + "Move blocks to the 'new' end of the buffer pool if the first access" + " was at least this many milliseconds ago." 
+ " The timeout is disabled if 0.", + NULL, NULL, 1000, 0, UINT_MAX32, 0); + +static MYSQL_SYSVAR_ULONG(open_files, innobase_open_files, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "How many files at the maximum InnoDB keeps open at the same time.", + NULL, NULL, 0, 0, LONG_MAX, 0); + +static MYSQL_SYSVAR_ULONG(sync_spin_loops, srv_n_spin_wait_rounds, + PLUGIN_VAR_RQCMDARG, + "Count of spin-loop rounds in InnoDB mutexes (30 by default)", + NULL, NULL, 30L, 0L, ~0UL, 0); + +static MYSQL_SYSVAR_UINT(spin_wait_delay, srv_spin_wait_delay, + PLUGIN_VAR_OPCMDARG, + "Maximum delay between polling for a spin lock (4 by default)", + NULL, NULL, 4, 0, 6000, 0); + +static my_bool innodb_prefix_index_cluster_optimization; + +static MYSQL_SYSVAR_BOOL(prefix_index_cluster_optimization, + innodb_prefix_index_cluster_optimization, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_DEPRECATED, + "Deprecated parameter with no effect", + nullptr, nullptr, TRUE); + +static MYSQL_SYSVAR_STR(data_file_path, innobase_data_file_path, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Path to individual files and their sizes.", + NULL, NULL, "ibdata1:12M:autoextend"); + +static MYSQL_SYSVAR_STR(temp_data_file_path, innobase_temp_data_file_path, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Path to files and their sizes making temp-tablespace.", + NULL, NULL, "ibtmp1:12M:autoextend"); + +static MYSQL_SYSVAR_STR(undo_directory, srv_undo_dir, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Directory where undo tablespace files live, this path can be absolute.", + NULL, NULL, NULL); + +static MYSQL_SYSVAR_UINT(undo_tablespaces, srv_undo_tablespaces, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Number of undo tablespaces to use.", + NULL, NULL, + 0L, /* Default seting */ + 0L, /* Minimum value */ + TRX_SYS_MAX_UNDO_SPACES, 0); /* Maximum value */ + +static MYSQL_SYSVAR_ULONGLONG(max_undo_log_size, srv_max_undo_log_size, + PLUGIN_VAR_OPCMDARG, + "Desired maximum UNDO tablespace size in bytes", + NULL, NULL, + 10 << 20, 10 << 20, + 1ULL << (32 + UNIV_PAGE_SIZE_SHIFT_MAX), 0); + +static MYSQL_SYSVAR_ULONG(purge_rseg_truncate_frequency, + srv_purge_rseg_truncate_frequency, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_DEPRECATED, + "Deprecated parameter with no effect", + NULL, NULL, 128, 1, 128, 0); + +static void innodb_undo_log_truncate_update(THD *thd, struct st_mysql_sys_var*, + void*, const void *save) +{ + if ((srv_undo_log_truncate= *static_cast(save))) + purge_sys.wake_if_not_active(); +} + +static MYSQL_SYSVAR_BOOL(undo_log_truncate, srv_undo_log_truncate, + PLUGIN_VAR_OPCMDARG, + "Enable or Disable Truncate of UNDO tablespace.", + NULL, innodb_undo_log_truncate_update, FALSE); + +static MYSQL_SYSVAR_LONG(autoinc_lock_mode, innobase_autoinc_lock_mode, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "The AUTOINC lock modes supported by InnoDB:" + " 0 => Old style AUTOINC locking (for backward compatibility);" + " 1 => New style AUTOINC locking;" + " 2 => No AUTOINC locking (unsafe for SBR)", + NULL, NULL, + AUTOINC_NEW_STYLE_LOCKING, /* Default setting */ + AUTOINC_OLD_STYLE_LOCKING, /* Minimum value */ + AUTOINC_NO_LOCKING, 0); /* Maximum value */ + +#ifdef HAVE_URING +# include +static utsname uname_for_io_uring; +#else +static +#endif +bool innodb_use_native_aio_default() +{ +#ifdef HAVE_URING + utsname &u= uname_for_io_uring; + if (!uname(&u) && u.release[0] == '5' && u.release[1] == '.' 
&& + u.release[2] == '1' && u.release[3] >= '1' && u.release[3] <= '5' && + u.release[4] == '.') + { + if (u.release[3] == '5') { + const char *s= strstr(u.version, "5.15."); + if (s || (s= strstr(u.release, "5.15."))) + if ((s[5] >= '3' || s[6] >= '0')) + return true; /* 5.15.3 and later should be fine */ + } + io_uring_may_be_unsafe= u.release; + return false; /* working around io_uring hangs (MDEV-26674) */ + } +#endif + return true; +} + +static MYSQL_SYSVAR_BOOL(use_native_aio, srv_use_native_aio, + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, + "Use native AIO if supported on this platform.", + NULL, NULL, innodb_use_native_aio_default()); + +#ifdef HAVE_LIBNUMA +static MYSQL_SYSVAR_BOOL(numa_interleave, srv_numa_interleave, + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, + "Use NUMA interleave memory policy to allocate InnoDB buffer pool.", + NULL, NULL, FALSE); +#endif /* HAVE_LIBNUMA */ + +static void innodb_change_buffering_update(THD *thd, struct st_mysql_sys_var*, + void*, const void *save) +{ + ulong i= *static_cast(save); + if (i != IBUF_USE_NONE && !ibuf.index) + push_warning(thd, Sql_condition::WARN_LEVEL_WARN, ER_NOT_KEYFILE, + "InnoDB: The change buffer is corrupted."); + else + innodb_change_buffering= i; +} + +static MYSQL_SYSVAR_ENUM(change_buffering, innodb_change_buffering, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_DEPRECATED, + "Buffer changes to secondary indexes.", + nullptr, innodb_change_buffering_update, + IBUF_USE_NONE, &innodb_change_buffering_typelib); + +static MYSQL_SYSVAR_UINT(change_buffer_max_size, + srv_change_buffer_max_size, + PLUGIN_VAR_RQCMDARG, + "Maximum on-disk size of change buffer in terms of percentage" + " of the buffer pool.", + NULL, innodb_change_buffer_max_size_update, + CHANGE_BUFFER_DEFAULT_SIZE, 0, 50, 0); + +static MYSQL_SYSVAR_ENUM(stats_method, srv_innodb_stats_method, + PLUGIN_VAR_RQCMDARG, + "Specifies how InnoDB index statistics collection code should" + " treat NULLs. Possible values are NULLS_EQUAL (default)," + " NULLS_UNEQUAL and NULLS_IGNORED", + NULL, NULL, SRV_STATS_NULLS_EQUAL, &innodb_stats_method_typelib); + +#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG +static MYSQL_SYSVAR_BOOL(change_buffer_dump, ibuf_dump, + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, + "Dump the change buffer at startup.", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_UINT(change_buffering_debug, ibuf_debug, + PLUGIN_VAR_RQCMDARG, + "Debug flags for InnoDB change buffering (0=none, 1=try to buffer)", + NULL, NULL, 0, 0, 1, 0); +#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ + +static MYSQL_SYSVAR_ULONG(buf_dump_status_frequency, srv_buf_dump_status_frequency, + PLUGIN_VAR_RQCMDARG, + "A number between [0, 100] that tells how oftern buffer pool dump status " + "in percentages should be printed. E.g. 10 means that buffer pool dump " + "status is printed when every 10% of number of buffer pool pages are " + "dumped. 
Default is 0 (only start and end status is printed).", + NULL, NULL, 0, 0, 100, 0); + +static MYSQL_SYSVAR_BOOL(random_read_ahead, srv_random_read_ahead, + PLUGIN_VAR_NOCMDARG, + "Whether to use read ahead for random access within an extent.", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_ULONG(read_ahead_threshold, srv_read_ahead_threshold, + PLUGIN_VAR_RQCMDARG, + "Number of pages that must be accessed sequentially for InnoDB to" + " trigger a readahead.", + NULL, NULL, 56, 0, 64, 0); + +static MYSQL_SYSVAR_STR(monitor_enable, innobase_enable_monitor_counter, + PLUGIN_VAR_RQCMDARG, + "Turn on a monitor counter", + innodb_monitor_validate, + innodb_enable_monitor_update, NULL); + +static MYSQL_SYSVAR_STR(monitor_disable, innobase_disable_monitor_counter, + PLUGIN_VAR_RQCMDARG, + "Turn off a monitor counter", + innodb_monitor_validate, + innodb_disable_monitor_update, NULL); + +static MYSQL_SYSVAR_STR(monitor_reset, innobase_reset_monitor_counter, + PLUGIN_VAR_RQCMDARG, + "Reset a monitor counter", + innodb_monitor_validate, + innodb_reset_monitor_update, NULL); + +static MYSQL_SYSVAR_STR(monitor_reset_all, innobase_reset_all_monitor_counter, + PLUGIN_VAR_RQCMDARG, + "Reset all values for a monitor counter", + innodb_monitor_validate, + innodb_reset_all_monitor_update, NULL); + +static MYSQL_SYSVAR_BOOL(status_output, srv_print_innodb_monitor, + PLUGIN_VAR_OPCMDARG, "Enable InnoDB monitor output to the error log.", + NULL, innodb_status_output_update, FALSE); + +static MYSQL_SYSVAR_BOOL(status_output_locks, srv_print_innodb_lock_monitor, + PLUGIN_VAR_OPCMDARG, "Enable InnoDB lock monitor output to the error log." + " Requires innodb_status_output=ON.", + NULL, innodb_status_output_update, FALSE); + +static MYSQL_SYSVAR_BOOL(print_all_deadlocks, srv_print_all_deadlocks, + PLUGIN_VAR_OPCMDARG, + "Print all deadlocks to MariaDB error log (off by default)", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_ULONG(compression_failure_threshold_pct, + zip_failure_threshold_pct, PLUGIN_VAR_OPCMDARG, + "If the compression failure rate of a table is greater than this number" + " more padding is added to the pages to reduce the failures. A value of" + " zero implies no padding", + NULL, NULL, 5, 0, 100, 0); + +static MYSQL_SYSVAR_ULONG(compression_pad_pct_max, + zip_pad_max, PLUGIN_VAR_OPCMDARG, + "Percentage of empty space on a data page that can be reserved" + " to make the page compressible.", + NULL, NULL, 50, 0, 75, 0); + +static MYSQL_SYSVAR_BOOL(read_only, srv_read_only_mode, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, + "Start InnoDB in read only mode (off by default)", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_BOOL(read_only_compressed, innodb_read_only_compressed, + PLUGIN_VAR_OPCMDARG, + "Make ROW_FORMAT=COMPRESSED tables read-only", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_BOOL(cmp_per_index_enabled, srv_cmp_per_index_enabled, + PLUGIN_VAR_OPCMDARG, + "Enable INFORMATION_SCHEMA.innodb_cmp_per_index," + " may have negative impact on performance (off by default)", + NULL, innodb_cmp_per_index_update, FALSE); + +static MYSQL_SYSVAR_ENUM(default_row_format, innodb_default_row_format, + PLUGIN_VAR_RQCMDARG, + "The default ROW FORMAT for all innodb tables created without explicit" + " ROW_FORMAT. Possible values are REDUNDANT, COMPACT, and DYNAMIC." 
+ " The ROW_FORMAT value COMPRESSED is not allowed", + NULL, NULL, DEFAULT_ROW_FORMAT_DYNAMIC, + &innodb_default_row_format_typelib); + +#ifdef UNIV_DEBUG +static MYSQL_SYSVAR_UINT(trx_rseg_n_slots_debug, trx_rseg_n_slots_debug, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_NOCMDOPT, + "Debug flags for InnoDB to limit TRX_RSEG_N_SLOTS for trx_rsegf_undo_find_free()", + NULL, NULL, 0, 0, 1024, 0); + +static MYSQL_SYSVAR_UINT(limit_optimistic_insert_debug, + btr_cur_limit_optimistic_insert_debug, PLUGIN_VAR_RQCMDARG, + "Artificially limit the number of records per B-tree page (0=unlimited).", + NULL, NULL, 0, 0, UINT_MAX32, 0); + +static MYSQL_SYSVAR_BOOL(trx_purge_view_update_only_debug, + srv_purge_view_update_only_debug, PLUGIN_VAR_NOCMDOPT, + "Pause actual purging any delete-marked records, but merely update the purge view." + " It is to create artificially the situation the purge view have been updated" + " but the each purges were not done yet.", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_BOOL(evict_tables_on_commit_debug, + innodb_evict_tables_on_commit_debug, PLUGIN_VAR_OPCMDARG, + "On transaction commit, try to evict tables from the data dictionary cache.", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_UINT(data_file_size_debug, + srv_sys_space_size_debug, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "InnoDB system tablespace size to be set in recovery.", + NULL, NULL, 0, 0, 256U << 20, 0); + +static MYSQL_SYSVAR_UINT(fil_make_page_dirty_debug, + srv_fil_make_page_dirty_debug, PLUGIN_VAR_OPCMDARG, + "Make the first page of the given tablespace dirty.", + NULL, innodb_make_page_dirty, 0, 0, UINT_MAX32, 0); + +static MYSQL_SYSVAR_UINT(saved_page_number_debug, + srv_saved_page_number_debug, PLUGIN_VAR_OPCMDARG, + "An InnoDB page number.", + NULL, NULL, 0, 0, UINT_MAX32, 0); +#endif /* UNIV_DEBUG */ + +static MYSQL_SYSVAR_BOOL(force_primary_key, + srv_force_primary_key, + PLUGIN_VAR_OPCMDARG, + "Do not allow creating a table without primary key (off by default)", + NULL, NULL, FALSE); + +const char *page_compression_algorithms[]= { "none", "zlib", "lz4", "lzo", "lzma", "bzip2", "snappy", 0 }; +static TYPELIB page_compression_algorithms_typelib= +{ + array_elements(page_compression_algorithms) - 1, 0, + page_compression_algorithms, 0 +}; +static MYSQL_SYSVAR_ENUM(compression_algorithm, innodb_compression_algorithm, + PLUGIN_VAR_OPCMDARG, + "Compression algorithm used on page compression. One of: none, zlib, lz4, lzo, lzma, bzip2, or snappy", + innodb_compression_algorithm_validate, NULL, + /* We use here the largest number of supported compression method to + enable all those methods that are available. Availability of compression + method is verified on innodb_compression_algorithm_validate function. */ + PAGE_ZLIB_ALGORITHM, + &page_compression_algorithms_typelib); + +static MYSQL_SYSVAR_ULONG(fatal_semaphore_wait_threshold, srv_fatal_semaphore_wait_threshold, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Maximum number of seconds that semaphore times out in InnoDB.", + NULL, NULL, + DEFAULT_SRV_FATAL_SEMAPHORE_TIMEOUT, /* Default setting */ + 1, /* Minimum setting */ + UINT_MAX32, /* Maximum setting */ + 0); + +static const char* srv_encrypt_tables_names[] = { "OFF", "ON", "FORCE", 0 }; +static TYPELIB srv_encrypt_tables_typelib = { + array_elements(srv_encrypt_tables_names)-1, 0, srv_encrypt_tables_names, + NULL +}; +static MYSQL_SYSVAR_ENUM(encrypt_tables, srv_encrypt_tables, + PLUGIN_VAR_OPCMDARG, + "Enable encryption for tables. 
" + "Don't forget to enable --innodb-encrypt-log too", + innodb_encrypt_tables_validate, + innodb_encrypt_tables_update, + 0, + &srv_encrypt_tables_typelib); + +static MYSQL_SYSVAR_UINT(encryption_threads, srv_n_fil_crypt_threads, + PLUGIN_VAR_RQCMDARG, + "Number of threads performing background key rotation ", + NULL, + innodb_encryption_threads_update, + 0, 0, 255, 0); + +static MYSQL_SYSVAR_UINT(encryption_rotate_key_age, + srv_fil_crypt_rotate_key_age, + PLUGIN_VAR_RQCMDARG, + "Key rotation - re-encrypt in background " + "all pages that were encrypted with a key that " + "many (or more) versions behind. Value 0 indicates " + "that key rotation is disabled.", + NULL, + innodb_encryption_rotate_key_age_update, + 1, 0, UINT_MAX32, 0); + +static MYSQL_SYSVAR_UINT(encryption_rotation_iops, srv_n_fil_crypt_iops, + PLUGIN_VAR_RQCMDARG, + "Use this many iops for background key rotation", + NULL, + innodb_encryption_rotation_iops_update, + 100, 0, UINT_MAX32, 0); + +static MYSQL_SYSVAR_BOOL(encrypt_log, srv_encrypt_log, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, + "Enable redo log encryption", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_BOOL(immediate_scrub_data_uncompressed, + srv_immediate_scrub_data_uncompressed, + 0, + "Enable scrubbing of data", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_BOOL(encrypt_temporary_tables, innodb_encrypt_temporary_tables, + PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, + "Enrypt the temporary table data.", + NULL, NULL, false); + +static struct st_mysql_sys_var* innobase_system_variables[]= { + MYSQL_SYSVAR(autoextend_increment), + MYSQL_SYSVAR(buffer_pool_size), + MYSQL_SYSVAR(buffer_pool_chunk_size), + MYSQL_SYSVAR(buffer_pool_filename), + MYSQL_SYSVAR(buffer_pool_dump_now), + MYSQL_SYSVAR(buffer_pool_dump_at_shutdown), + MYSQL_SYSVAR(buffer_pool_dump_pct), +#ifdef UNIV_DEBUG + MYSQL_SYSVAR(buffer_pool_evict), +#endif /* UNIV_DEBUG */ + MYSQL_SYSVAR(buffer_pool_load_now), + MYSQL_SYSVAR(buffer_pool_load_abort), +#ifdef UNIV_DEBUG + MYSQL_SYSVAR(buffer_pool_load_pages_abort), +#endif /* UNIV_DEBUG */ + MYSQL_SYSVAR(buffer_pool_load_at_startup), + MYSQL_SYSVAR(defragment), + MYSQL_SYSVAR(defragment_n_pages), + MYSQL_SYSVAR(defragment_stats_accuracy), + MYSQL_SYSVAR(defragment_fill_factor), + MYSQL_SYSVAR(defragment_fill_factor_n_recs), + MYSQL_SYSVAR(defragment_frequency), + MYSQL_SYSVAR(lru_scan_depth), + MYSQL_SYSVAR(lru_flush_size), + MYSQL_SYSVAR(flush_neighbors), + MYSQL_SYSVAR(checksum_algorithm), + MYSQL_SYSVAR(compression_level), + MYSQL_SYSVAR(data_file_path), + MYSQL_SYSVAR(temp_data_file_path), + MYSQL_SYSVAR(data_home_dir), + MYSQL_SYSVAR(doublewrite), + MYSQL_SYSVAR(stats_include_delete_marked), + MYSQL_SYSVAR(use_atomic_writes), + MYSQL_SYSVAR(fast_shutdown), + MYSQL_SYSVAR(read_io_threads), + MYSQL_SYSVAR(write_io_threads), + MYSQL_SYSVAR(file_per_table), + MYSQL_SYSVAR(flush_log_at_timeout), + MYSQL_SYSVAR(flush_log_at_trx_commit), + MYSQL_SYSVAR(flush_method), + MYSQL_SYSVAR(force_recovery), + MYSQL_SYSVAR(fill_factor), + MYSQL_SYSVAR(ft_cache_size), + MYSQL_SYSVAR(ft_total_cache_size), + MYSQL_SYSVAR(ft_result_cache_limit), + MYSQL_SYSVAR(ft_enable_stopword), + MYSQL_SYSVAR(ft_max_token_size), + MYSQL_SYSVAR(ft_min_token_size), + MYSQL_SYSVAR(ft_num_word_optimize), + MYSQL_SYSVAR(ft_sort_pll_degree), + MYSQL_SYSVAR(lock_wait_timeout), + MYSQL_SYSVAR(deadlock_detect), + MYSQL_SYSVAR(deadlock_report), + MYSQL_SYSVAR(page_size), + MYSQL_SYSVAR(log_buffer_size), +#if defined __linux__ || defined _WIN32 + MYSQL_SYSVAR(log_file_buffering), +#endif 
+ MYSQL_SYSVAR(log_file_size), + MYSQL_SYSVAR(log_group_home_dir), + MYSQL_SYSVAR(max_dirty_pages_pct), + MYSQL_SYSVAR(max_dirty_pages_pct_lwm), + MYSQL_SYSVAR(adaptive_flushing_lwm), + MYSQL_SYSVAR(adaptive_flushing), + MYSQL_SYSVAR(flush_sync), + MYSQL_SYSVAR(flushing_avg_loops), + MYSQL_SYSVAR(max_purge_lag), + MYSQL_SYSVAR(max_purge_lag_delay), + MYSQL_SYSVAR(max_purge_lag_wait), + MYSQL_SYSVAR(old_blocks_pct), + MYSQL_SYSVAR(old_blocks_time), + MYSQL_SYSVAR(open_files), + MYSQL_SYSVAR(optimize_fulltext_only), + MYSQL_SYSVAR(rollback_on_timeout), + MYSQL_SYSVAR(ft_aux_table), + MYSQL_SYSVAR(ft_enable_diag_print), + MYSQL_SYSVAR(ft_server_stopword_table), + MYSQL_SYSVAR(ft_user_stopword_table), + MYSQL_SYSVAR(disable_sort_file_cache), + MYSQL_SYSVAR(stats_on_metadata), + MYSQL_SYSVAR(stats_transient_sample_pages), + MYSQL_SYSVAR(stats_persistent), + MYSQL_SYSVAR(stats_persistent_sample_pages), + MYSQL_SYSVAR(stats_auto_recalc), + MYSQL_SYSVAR(stats_modified_counter), + MYSQL_SYSVAR(stats_traditional), +#ifdef BTR_CUR_HASH_ADAPT + MYSQL_SYSVAR(adaptive_hash_index), + MYSQL_SYSVAR(adaptive_hash_index_parts), +#endif /* BTR_CUR_HASH_ADAPT */ + MYSQL_SYSVAR(stats_method), + MYSQL_SYSVAR(status_file), + MYSQL_SYSVAR(strict_mode), + MYSQL_SYSVAR(sort_buffer_size), + MYSQL_SYSVAR(online_alter_log_max_size), + MYSQL_SYSVAR(sync_spin_loops), + MYSQL_SYSVAR(spin_wait_delay), + MYSQL_SYSVAR(table_locks), + MYSQL_SYSVAR(prefix_index_cluster_optimization), + MYSQL_SYSVAR(tmpdir), + MYSQL_SYSVAR(autoinc_lock_mode), + MYSQL_SYSVAR(use_native_aio), +#ifdef HAVE_LIBNUMA + MYSQL_SYSVAR(numa_interleave), +#endif /* HAVE_LIBNUMA */ + MYSQL_SYSVAR(change_buffering), + MYSQL_SYSVAR(change_buffer_max_size), +#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG + MYSQL_SYSVAR(change_buffer_dump), + MYSQL_SYSVAR(change_buffering_debug), +#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ + MYSQL_SYSVAR(random_read_ahead), + MYSQL_SYSVAR(read_ahead_threshold), + MYSQL_SYSVAR(read_only), + MYSQL_SYSVAR(read_only_compressed), + MYSQL_SYSVAR(instant_alter_column_allowed), + MYSQL_SYSVAR(io_capacity), + MYSQL_SYSVAR(io_capacity_max), + MYSQL_SYSVAR(monitor_enable), + MYSQL_SYSVAR(monitor_disable), + MYSQL_SYSVAR(monitor_reset), + MYSQL_SYSVAR(monitor_reset_all), + MYSQL_SYSVAR(purge_threads), + MYSQL_SYSVAR(purge_batch_size), +#ifdef UNIV_DEBUG + MYSQL_SYSVAR(log_checkpoint_now), + MYSQL_SYSVAR(buf_flush_list_now), + MYSQL_SYSVAR(merge_threshold_set_all_debug), +#endif /* UNIV_DEBUG */ + MYSQL_SYSVAR(status_output), + MYSQL_SYSVAR(status_output_locks), + MYSQL_SYSVAR(print_all_deadlocks), + MYSQL_SYSVAR(cmp_per_index_enabled), + MYSQL_SYSVAR(max_undo_log_size), + MYSQL_SYSVAR(purge_rseg_truncate_frequency), + MYSQL_SYSVAR(undo_log_truncate), + MYSQL_SYSVAR(undo_directory), + MYSQL_SYSVAR(undo_tablespaces), + MYSQL_SYSVAR(compression_failure_threshold_pct), + MYSQL_SYSVAR(compression_pad_pct_max), + MYSQL_SYSVAR(default_row_format), +#ifdef UNIV_DEBUG + MYSQL_SYSVAR(trx_rseg_n_slots_debug), + MYSQL_SYSVAR(limit_optimistic_insert_debug), + MYSQL_SYSVAR(trx_purge_view_update_only_debug), + MYSQL_SYSVAR(evict_tables_on_commit_debug), + MYSQL_SYSVAR(data_file_size_debug), + MYSQL_SYSVAR(fil_make_page_dirty_debug), + MYSQL_SYSVAR(saved_page_number_debug), +#endif /* UNIV_DEBUG */ + MYSQL_SYSVAR(force_primary_key), + MYSQL_SYSVAR(fatal_semaphore_wait_threshold), + /* Table page compression feature */ + MYSQL_SYSVAR(compression_default), + MYSQL_SYSVAR(compression_algorithm), + /* Encryption feature */ + MYSQL_SYSVAR(encrypt_tables), 
+ MYSQL_SYSVAR(encryption_threads), + MYSQL_SYSVAR(encryption_rotate_key_age), + MYSQL_SYSVAR(encryption_rotation_iops), + MYSQL_SYSVAR(encrypt_log), + MYSQL_SYSVAR(default_encryption_key_id), + MYSQL_SYSVAR(immediate_scrub_data_uncompressed), + MYSQL_SYSVAR(buf_dump_status_frequency), + MYSQL_SYSVAR(background_thread), + MYSQL_SYSVAR(encrypt_temporary_tables), + + NULL +}; + +maria_declare_plugin(innobase) +{ + MYSQL_STORAGE_ENGINE_PLUGIN, + &innobase_storage_engine, + innobase_hton_name, + plugin_author, + "Supports transactions, row-level locking, foreign keys and encryption for tables", + PLUGIN_LICENSE_GPL, + innodb_init, /* Plugin Init */ + NULL, /* Plugin Deinit */ + MYSQL_VERSION_MAJOR << 8 | MYSQL_VERSION_MINOR, + innodb_status_variables_export,/* status variables */ + innobase_system_variables, /* system variables */ + PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE /* maturity */ +}, +i_s_innodb_trx, +i_s_innodb_locks, +i_s_innodb_lock_waits, +i_s_innodb_cmp, +i_s_innodb_cmp_reset, +i_s_innodb_cmpmem, +i_s_innodb_cmpmem_reset, +i_s_innodb_cmp_per_index, +i_s_innodb_cmp_per_index_reset, +i_s_innodb_buffer_page, +i_s_innodb_buffer_page_lru, +i_s_innodb_buffer_stats, +i_s_innodb_metrics, +i_s_innodb_ft_default_stopword, +i_s_innodb_ft_deleted, +i_s_innodb_ft_being_deleted, +i_s_innodb_ft_config, +i_s_innodb_ft_index_cache, +i_s_innodb_ft_index_table, +i_s_innodb_sys_tables, +i_s_innodb_sys_tablestats, +i_s_innodb_sys_indexes, +i_s_innodb_sys_columns, +i_s_innodb_sys_fields, +i_s_innodb_sys_foreign, +i_s_innodb_sys_foreign_cols, +i_s_innodb_sys_tablespaces, +i_s_innodb_sys_virtual, +i_s_innodb_tablespaces_encryption +maria_declare_plugin_end; + +/** @brief Adjust some InnoDB startup parameters based on file contents +or innodb_page_size. */ +static +void +innodb_params_adjust() +{ + MYSQL_SYSVAR_NAME(max_undo_log_size).max_val + = 1ULL << (32U + srv_page_size_shift); + MYSQL_SYSVAR_NAME(max_undo_log_size).min_val + = MYSQL_SYSVAR_NAME(max_undo_log_size).def_val + = ulonglong(SRV_UNDO_TABLESPACE_SIZE_IN_PAGES) + << srv_page_size_shift; + MYSQL_SYSVAR_NAME(max_undo_log_size).max_val + = 1ULL << (32U + srv_page_size_shift); +} + +/**************************************************************************** + * DS-MRR implementation + ***************************************************************************/ + +/** +Multi Range Read interface, DS-MRR calls */ +int +ha_innobase::multi_range_read_init( + RANGE_SEQ_IF* seq, + void* seq_init_param, + uint n_ranges, + uint mode, + HANDLER_BUFFER* buf) +{ + return(m_ds_mrr.dsmrr_init(this, seq, seq_init_param, + n_ranges, mode, buf)); +} + +int +ha_innobase::multi_range_read_next( + range_id_t* range_info) +{ + return(m_ds_mrr.dsmrr_next(range_info)); +} + +ha_rows +ha_innobase::multi_range_read_info_const( + uint keyno, + RANGE_SEQ_IF* seq, + void* seq_init_param, + uint n_ranges, + uint* bufsz, + uint* flags, + Cost_estimate* cost) +{ + /* See comments in ha_myisam::multi_range_read_info_const */ + m_ds_mrr.init(this, table); + + if (m_prebuilt->select_lock_type != LOCK_NONE) { + *flags |= HA_MRR_USE_DEFAULT_IMPL; + } + + ha_rows res= m_ds_mrr.dsmrr_info_const(keyno, seq, seq_init_param, n_ranges, + bufsz, flags, cost); + return res; +} + +ha_rows +ha_innobase::multi_range_read_info( + uint keyno, + uint n_ranges, + uint keys, + uint key_parts, + uint* bufsz, + uint* flags, + Cost_estimate* cost) +{ + m_ds_mrr.init(this, table); + ha_rows res= m_ds_mrr.dsmrr_info(keyno, n_ranges, keys, key_parts, bufsz, + flags, cost); + return res; +} 
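+/* The four methods above simply delegate to the DS-MRR implementation object
+owned by the handler. A hypothetical driver loop on the server side (a
+minimal sketch for illustration; seq, seq_init_param, n_ranges and mode are
+assumed to come from the optimizer, and this is not code from this file):
+@code
+	HANDLER_BUFFER	buf;		// buffer space assigned by the server
+	range_id_t	range_info;
+
+	h->multi_range_read_init(seq, seq_init_param, n_ranges, mode, &buf);
+
+	while (h->multi_range_read_next(&range_info) == 0) {
+		// one matching row is now in h->table->record[0]
+	}
+@endcode */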
+
+int
+ha_innobase::multi_range_read_explain_info(
+    uint mrr_mode,
+    char *str,
+    size_t size)
+{
+    return m_ds_mrr.dsmrr_explain_info(mrr_mode, str, size);
+}
+
+/** Find or open a table handle for the virtual column template
+@param[in]	thd	thread handle
+@param[in,out]	table	InnoDB table whose virtual column template
+			is to be updated
+@return table handle
+@retval NULL if the table is dropped, inaccessible or corrupted
+for the purge thread */
+static TABLE* innodb_find_table_for_vc(THD* thd, dict_table_t* table)
+{
+    TABLE *mysql_table;
+    const bool bg_thread = THDVAR(thd, background_thread);
+
+    if (bg_thread) {
+        if ((mysql_table = get_purge_table(thd))) {
+            return mysql_table;
+        }
+    } else {
+        if (table->vc_templ->mysql_table_query_id
+            == thd_get_query_id(thd)) {
+            return table->vc_templ->mysql_table;
+        }
+    }
+
+    char db_buf[NAME_LEN + 1];
+    char tbl_buf[NAME_LEN + 1];
+    ulint db_buf_len, tbl_buf_len;
+
+    if (!table->parse_name(db_buf, tbl_buf, &db_buf_len, &tbl_buf_len)) {
+        return NULL;
+    }
+
+    if (bg_thread) {
+        return open_purge_table(thd, db_buf, db_buf_len,
+                                tbl_buf, tbl_buf_len);
+    }
+
+    mysql_table = find_fk_open_table(thd, db_buf, db_buf_len,
+                                     tbl_buf, tbl_buf_len);
+    table->vc_templ->mysql_table = mysql_table;
+    table->vc_templ->mysql_table_query_id = thd_get_query_id(thd);
+    return mysql_table;
+}
+
+/** Change dbname and table name in table->vc_templ.
+@param[in,out]	table	the table whose virtual column template
+dbname and tbname are to be renamed. */
+void
+innobase_rename_vc_templ(
+    dict_table_t*	table)
+{
+    char	dbname[MAX_DATABASE_NAME_LEN + 1];
+    char	tbname[MAX_DATABASE_NAME_LEN + 1];
+    char*	name = table->name.m_name;
+    ulint	dbnamelen = dict_get_db_name_len(name);
+    ulint	tbnamelen = strlen(name) - dbnamelen - 1;
+    char	t_dbname[MAX_DATABASE_NAME_LEN + 1];
+    char	t_tbname[MAX_TABLE_NAME_LEN + 1];
+
+    strncpy(dbname, name, dbnamelen);
+    dbname[dbnamelen] = 0;
+    strncpy(tbname, name + dbnamelen + 1, tbnamelen);
+    tbname[tbnamelen] = 0;
+
+    /* For a partitioned table, remove the partition name and use the
+    "main" table name to build the template */
+    char*	is_part = is_partition(tbname);
+
+    if (is_part != NULL) {
+        *is_part = '\0';
+        tbnamelen = ulint(is_part - tbname);
+    }
+
+    dbnamelen = filename_to_tablename(dbname, t_dbname,
+                                      MAX_DATABASE_NAME_LEN + 1);
+    tbnamelen = filename_to_tablename(tbname, t_tbname,
+                                      MAX_TABLE_NAME_LEN + 1);
+
+    table->vc_templ->db_name = t_dbname;
+    table->vc_templ->tb_name = t_tbname;
+}
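+/* Example (hypothetical names, for illustration only): for a partition
+"t1#P#p0" of table "test.t1", is_partition() locates the "#P#" separator,
+the partition suffix is cut off, and after filename_to_tablename() the
+template ends up with db_name="test" and tb_name="t1". */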
+
+
+/**
+  Allocate a heap and record for calculating virtual fields
+  Used mainly for virtual fields in indexes
+
+@param[in]	thd		MariaDB THD
+@param[in]	index		Index in use
+@param[out]	heap		Heap that holds temporary row
+@param[in,out]	table		MariaDB table
+@param[out]	record		Pointer to allocated MariaDB record
+@param[out]	storage		Internal storage for blobs etc
+
+@retval	true on success
+@retval	false on malloc failure or failure to open the MariaDB table
+	for the purge thread.
+*/
+
+bool innobase_allocate_row_for_vcol(THD *thd, const dict_index_t *index,
+                                    mem_heap_t **heap, TABLE **table,
+                                    VCOL_STORAGE *storage)
+{
+  TABLE *maria_table;
+  String *blob_value_storage;
+  if (!*table)
+    *table = innodb_find_table_for_vc(thd, index->table);
+
+  /* For the purge thread, there is a possibility that the table could
+  have been dropped, corrupted or become inaccessible. */
+  if (!*table)
+    return false;
+  maria_table = *table;
+  if (!*heap && !(*heap = mem_heap_create(srv_page_size)))
+    return false;
+
+  uchar *record = static_cast<uchar*>(mem_heap_alloc(*heap,
+                                      maria_table->s->reclength));
+
+  size_t len = maria_table->s->virtual_not_stored_blob_fields * sizeof(String);
+  blob_value_storage = static_cast<String*>(mem_heap_alloc(*heap, len));
+
+  if (!record || !blob_value_storage)
+    return false;
+
+  storage->maria_table = maria_table;
+  storage->innobase_record = record;
+  storage->maria_record = maria_table->field[0]->record_ptr();
+  storage->blob_value_storage = blob_value_storage;
+
+  maria_table->move_fields(maria_table->field, record, storage->maria_record);
+  maria_table->remember_blob_values(blob_value_storage);
+
+  return true;
+}
+
+
+/** Free memory allocated by innobase_allocate_row_for_vcol() */
+
+void innobase_free_row_for_vcol(VCOL_STORAGE *storage)
+{
+  TABLE *maria_table= storage->maria_table;
+  maria_table->move_fields(maria_table->field, storage->maria_record,
+                           storage->innobase_record);
+  maria_table->restore_blob_values(storage->blob_value_storage);
+}
+
+
+void innobase_report_computed_value_failed(dtuple_t *row)
+{
+  ib::error() << "Compute virtual column values failed for "
+              << rec_printer(row).str();
+}
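+/* Typical call pattern for the allocate/free pair above (a hypothetical
+caller, shown only for illustration):
+@code
+	VCOL_STORAGE	storage;
+	mem_heap_t*	heap = NULL;
+	TABLE*		mysql_table = NULL;
+
+	if (innobase_allocate_row_for_vcol(thd, index, &heap,
+					   &mysql_table, &storage)) {
+		// ... evaluate virtual columns into storage.innobase_record ...
+		innobase_free_row_for_vcol(&storage);
+	}
+
+	if (heap) {
+		mem_heap_free(heap);
+	}
+@endcode */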
+
+
+/** Get the computed value by supplying the base column values.
+@param[in,out]	row		the data row
+@param[in]	col		virtual column
+@param[in]	index		index
+@param[in,out]	local_heap	heap memory for processing large data etc.
+@param[in,out]	heap		memory heap that copies the actual index row
+@param[in]	ifield		index field
+@param[in]	thd		MySQL thread handle
+@param[in,out]	mysql_table	mysql table object
+@param[in,out]	mysql_rec	MariaDB record buffer
+@param[in]	old_table	during ALTER TABLE, this is the old table
+				or NULL.
+@param[in]	update		update vector for the row, if any
+@param[in]	ignore_warnings	whether to suppress warnings raised while
+				evaluating the virtual column
+@return the field filled with the computed value, or NULL if the
+computation fails */
+dfield_t*
+innobase_get_computed_value(
+    dtuple_t*		row,
+    const dict_v_col_t*	col,
+    const dict_index_t*	index,
+    mem_heap_t**	local_heap,
+    mem_heap_t*		heap,
+    const dict_field_t*	ifield,
+    THD*		thd,
+    TABLE*		mysql_table,
+    byte*		mysql_rec,
+    const dict_table_t*	old_table,
+    const upd_t*	update,
+    bool		ignore_warnings)
+{
+    byte		rec_buf2[REC_VERSION_56_MAX_INDEX_COL_LEN];
+    byte*		buf;
+    dfield_t*		field;
+    ulint		len;
+
+    const ulint zip_size = old_table
+        ? old_table->space->zip_size()
+        : dict_tf_get_zip_size(index->table->flags);
+
+    ulint	ret = 0;
+
+    dict_index_t *clust_index= dict_table_get_first_index(index->table);
+
+    ut_ad(index->table->vc_templ);
+    ut_ad(thd != NULL);
+    ut_ad(mysql_table);
+
+    DBUG_ENTER("innobase_get_computed_value");
+    const mysql_row_templ_t*
+        vctempl = index->table->vc_templ->vtempl[
+            index->table->vc_templ->n_col + col->v_pos];
+
+    if (!heap || index->table->vc_templ->rec_len
+                 >= REC_VERSION_56_MAX_INDEX_COL_LEN) {
+        if (*local_heap == NULL) {
+            *local_heap = mem_heap_create(srv_page_size);
+        }
+
+        buf = static_cast<byte*>(mem_heap_alloc(
+                *local_heap, index->table->vc_templ->rec_len));
+    } else {
+        buf = rec_buf2;
+    }
+
+    for (ulint i = 0; i < unsigned{col->num_base}; i++) {
+        dict_col_t*			base_col = col->base_col[i];
+        const dfield_t*			row_field = NULL;
+        ulint				col_no = base_col->ind;
+        const mysql_row_templ_t*	templ
+            = index->table->vc_templ->vtempl[col_no];
+        const byte*			data;
+
+        if (update) {
+            ulint clust_no = dict_col_get_clust_pos(base_col,
+                                                    clust_index);
+            ut_ad(clust_no != ULINT_UNDEFINED);
+            if (const upd_field_t *uf = upd_get_field_by_field_no(
+                    update, uint16_t(clust_no), false)) {
+                row_field = &uf->new_val;
+            }
+        }
+
+        if (!row_field) {
+            row_field = dtuple_get_nth_field(row, col_no);
+        }
+
+        data = static_cast<const byte*>(row_field->data);
+        len = row_field->len;
+
+        if (row_field->ext) {
+            if (*local_heap == NULL) {
+                *local_heap = mem_heap_create(srv_page_size);
+            }
+
+            data = btr_copy_externally_stored_field(
+                &len, data, zip_size,
+                dfield_get_len(row_field), *local_heap);
+        }
+
+        if (len == UNIV_SQL_NULL) {
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wconversion" /* GCC 5 may need this here */
+#endif
+            mysql_rec[templ->mysql_null_byte_offset]
+                |= (byte) templ->mysql_null_bit_mask;
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic pop
+#endif
+            memcpy(mysql_rec + templ->mysql_col_offset,
+                   static_cast<const byte*>(
+                       index->table->vc_templ->default_rec
+                       + templ->mysql_col_offset),
+                   templ->mysql_col_len);
+        } else {
+
+            row_sel_field_store_in_mysql_format(
+                mysql_rec + templ->mysql_col_offset,
+                templ, index, templ->clust_rec_field_no,
+                (const byte*)data, len);
+
+            if (templ->mysql_null_bit_mask) {
+                /* It is a nullable column with a
+                non-NULL value */
+                mysql_rec[templ->mysql_null_byte_offset]
+                    &= static_cast<byte>(
+                        ~templ->mysql_null_bit_mask);
+            }
+        }
+    }
+
+    field = dtuple_get_nth_v_field(row, col->v_pos);
+
+    MY_BITMAP *old_write_set = dbug_tmp_use_all_columns(mysql_table, &mysql_table->write_set);
+    MY_BITMAP *old_read_set = dbug_tmp_use_all_columns(mysql_table, &mysql_table->read_set);
+    ret = mysql_table->update_virtual_field(
+        mysql_table->field[col->m_col.ind],
+        ignore_warnings);
+    dbug_tmp_restore_column_map(&mysql_table->read_set, old_read_set);
+    dbug_tmp_restore_column_map(&mysql_table->write_set, old_write_set);
+
+    if (ret != 0) {
+        DBUG_RETURN(NULL);
+    }
+
+    if (vctempl->mysql_null_bit_mask
+        && (mysql_rec[vctempl->mysql_null_byte_offset]
+            & vctempl->mysql_null_bit_mask)) {
+        dfield_set_null(field);
+        field->type.prtype |= DATA_VIRTUAL;
+        DBUG_RETURN(field);
+    }
+
+    row_mysql_store_col_in_innobase_format(
+        field, buf,
+        TRUE, mysql_rec + vctempl->mysql_col_offset,
+        vctempl->mysql_col_len, dict_table_is_comp(index->table));
+    field->type.prtype |= DATA_VIRTUAL;
+
+    ulint	max_prefix = col->m_col.max_prefix;
+
+    if (max_prefix && ifield
+        && (ifield->prefix_len == 0
+            || ifield->prefix_len > col->m_col.max_prefix)) {
+        max_prefix = ifield->prefix_len;
+    }
+
+    /* If this is a prefix index, we only need a portion of the field */
+    if (max_prefix) {
+        len = dtype_get_at_most_n_mbchars(
+            col->m_col.prtype,
+            col->m_col.mbminlen, col->m_col.mbmaxlen,
+            max_prefix,
+            field->len,
+            static_cast<char*>(dfield_get_data(field)));
+        dfield_set_len(field, len);
+    }
+
+    if (heap) {
+        dfield_dup(field, heap);
+    }
+
+    DBUG_RETURN(field);
+}
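+/* The null-bitmap handling used above, in isolation (a minimal sketch;
+rec, offset and mask stand for the record buffer and the
+templ->mysql_null_byte_offset / templ->mysql_null_bit_mask fields):
+@code
+	rec[offset] |= mask;				// set column to SQL NULL
+	rec[offset] &= static_cast<byte>(~mask);	// clear: non-NULL value
+	bool	is_null = (rec[offset] & mask) != 0;	// test the flag
+@endcode */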
+
+
+/** Attempt to push down an index condition.
+@param[in]	keyno		MySQL key number
+@param[in]	idx_cond	Index condition to be checked
+@return Part of idx_cond which the handler will not evaluate */
+
+class Item*
+ha_innobase::idx_cond_push(
+    uint		keyno,
+    class Item*		idx_cond)
+{
+    DBUG_ENTER("ha_innobase::idx_cond_push");
+    DBUG_ASSERT(keyno != MAX_KEY);
+    DBUG_ASSERT(idx_cond != NULL);
+
+    /* We can only evaluate the condition if all columns are stored. */
+    dict_index_t* idx = innobase_get_index(keyno);
+    if (idx && dict_index_has_virtual(idx)) {
+        DBUG_RETURN(idx_cond);
+    }
+
+    pushed_idx_cond = idx_cond;
+    pushed_idx_cond_keyno = keyno;
+    in_range_check_pushed_down = TRUE;
+    /* We will evaluate the condition entirely */
+    DBUG_RETURN(NULL);
+}
+
+
+/** Push a primary key filter.
+@param[in]	pk_filter	filter against which primary keys
+				are to be checked
+@retval false if pushed (always) */
+bool ha_innobase::rowid_filter_push(Rowid_filter* pk_filter)
+{
+    DBUG_ENTER("ha_innobase::rowid_filter_push");
+    DBUG_ASSERT(pk_filter != NULL);
+    pushed_rowid_filter= pk_filter;
+    DBUG_RETURN(false);
+}
+
+static bool is_part_of_a_key_prefix(const Field_longstr *field)
+{
+  const TABLE_SHARE *s= field->table->s;
+
+  for (uint i= 0; i < s->keys; i++)
+  {
+    const KEY &key= s->key_info[i];
+    for (uint j= 0; j < key.user_defined_key_parts; j++)
+    {
+      const KEY_PART_INFO &info= key.key_part[j];
+      // When a field is part of some key, the key part and the field have
+      // the same length. Their lengths differ when only some prefix of the
+      // field is used as a key part. That's what we're looking for here.
+      if (info.field->field_index == field->field_index &&
+          info.length != field->field_length)
+      {
+        DBUG_ASSERT(info.length < field->field_length);
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+static bool
+is_part_of_a_primary_key(const Field* field)
+{
+  const TABLE_SHARE* s = field->table->s;
+
+  return s->primary_key != MAX_KEY
+         && field->part_of_key.is_set(s->primary_key);
+}
+
+bool ha_innobase::can_convert_string(const Field_string *field,
+                                     const Column_definition &new_type) const
+{
+  DBUG_ASSERT(!field->compression_method());
+  if (new_type.type_handler() != field->type_handler())
+    return false;
+
+  if (new_type.char_length != field->char_length())
+    return false;
+
+  const Charset field_cs(field->charset());
+
+  if (new_type.length != field->max_display_length() &&
+      (!m_prebuilt->table->not_redundant() ||
+       field_cs.mbminlen() == field_cs.mbmaxlen()))
+    return false;
+
+  if (new_type.charset != field->charset())
+  {
+    if (!field_cs.encoding_allows_reinterpret_as(new_type.charset))
+      return false;
+
+    if (!field_cs.eq_collation_specific_names(new_type.charset))
+      return !is_part_of_a_primary_key(field);
+
+    // Fully indexed case works instantly like
+    // Compare_keys::EqualButKeyPartLength. But prefix case isn't implemented.
+ if (is_part_of_a_key_prefix(field)) + return false; + + return true; + } + + return true; +} + +static bool +supports_enlarging(const dict_table_t* table, const Field_varstring* field, + const Column_definition& new_type) +{ + return field->field_length <= 127 || new_type.length <= 255 + || field->field_length > 255 || !table->not_redundant(); +} + +bool ha_innobase::can_convert_varstring( + const Field_varstring *field, const Column_definition &new_type) const +{ + if (new_type.length < field->field_length) + return false; + + if (new_type.char_length < field->char_length()) + return false; + + if (!new_type.compression_method() != !field->compression_method()) + return false; + + if (new_type.type_handler() != field->type_handler()) + return false; + + if (new_type.charset != field->charset()) + { + if (!supports_enlarging(m_prebuilt->table, field, new_type)) + return false; + + Charset field_cs(field->charset()); + if (!field_cs.encoding_allows_reinterpret_as(new_type.charset)) + return false; + + if (!field_cs.eq_collation_specific_names(new_type.charset)) + return !is_part_of_a_primary_key(field); + + // Fully indexed case works instantly like + // Compare_keys::EqualButKeyPartLength. But prefix case isn't implemented. + if (is_part_of_a_key_prefix(field)) + return false; + + return true; + } + + if (new_type.length != field->field_length) + { + if (!supports_enlarging(m_prebuilt->table, field, new_type)) + return false; + + return true; + } + + return true; +} + +static bool is_part_of_a_key(const Field_blob *field) +{ + const TABLE_SHARE *s= field->table->s; + + for (uint i= 0; i < s->keys; i++) + { + const KEY &key= s->key_info[i]; + for (uint j= 0; j < key.user_defined_key_parts; j++) + { + const KEY_PART_INFO &info= key.key_part[j]; + if (info.field->field_index == field->field_index) + return true; + } + } + + return false; +} + +bool ha_innobase::can_convert_blob(const Field_blob *field, + const Column_definition &new_type) const +{ + if (new_type.type_handler() != field->type_handler()) + return false; + + if (!new_type.compression_method() != !field->compression_method()) + return false; + + if (new_type.pack_length != field->pack_length()) + return false; + + if (new_type.charset != field->charset()) + { + Charset field_cs(field->charset()); + if (!field_cs.encoding_allows_reinterpret_as(new_type.charset)) + return false; + + if (!field_cs.eq_collation_specific_names(new_type.charset)) + return !is_part_of_a_key(field); + + // Fully indexed case works instantly like + // Compare_keys::EqualButKeyPartLength. But prefix case isn't implemented. 
+    if (is_part_of_a_key_prefix(field))
+      return false;
+
+    return true;
+  }
+
+  return true;
+}
+
+
+bool ha_innobase::can_convert_nocopy(const Field &field,
+                                     const Column_definition &new_type) const
+{
+  if (const Field_string *tf= dynamic_cast<const Field_string*>(&field))
+    return can_convert_string(tf, new_type);
+
+  if (const Field_varstring *tf= dynamic_cast<const Field_varstring*>(&field))
+    return can_convert_varstring(tf, new_type);
+
+  if (dynamic_cast<const Field_geom*>(&field))
+    return false;
+
+  if (const Field_blob *tf= dynamic_cast<const Field_blob*>(&field))
+    return can_convert_blob(tf, new_type);
+
+  return false;
+}
+
+
+Compare_keys ha_innobase::compare_key_parts(
+    const Field &old_field, const Column_definition &new_field,
+    const KEY_PART_INFO &old_part, const KEY_PART_INFO &new_part) const
+{
+  const bool is_equal= old_field.is_equal(new_field);
+  const CHARSET_INFO *old_cs= old_field.charset();
+  const CHARSET_INFO *new_cs= new_field.charset;
+
+  if (!is_equal)
+  {
+    if (!old_field.table->file->can_convert_nocopy(old_field, new_field))
+      return Compare_keys::NotEqual;
+
+    if (!Charset(old_cs).eq_collation_specific_names(new_cs))
+      return Compare_keys::NotEqual;
+  }
+
+  if (old_part.length / old_cs->mbmaxlen != new_part.length / new_cs->mbmaxlen)
+  {
+    if (old_part.length != old_field.field_length)
+      return Compare_keys::NotEqual;
+
+    if (old_part.length >= new_part.length)
+      return Compare_keys::NotEqual;
+
+    return Compare_keys::EqualButKeyPartLength;
+  }
+
+  return Compare_keys::Equal;
+}
+
+/******************************************************************//**
+Use this when the args are passed to the format string from
+errmsg-utf8.txt directly as is.
+
+Push a warning message to the client; it is a wrapper around:
+
+void push_warning_printf(
+    THD *thd, Sql_condition::enum_condition_level level,
+    uint code, const char *format, ...);
+*/
+void
+ib_senderrf(
+/*========*/
+    THD*		thd,	/*!< in/out: session */
+    ib_log_level_t	level,	/*!< in: warning level */
+    ib_uint32_t		code,	/*!< MySQL error code */
+    ...)			/*!< Args */
+{
+    va_list		args;
+    const char*		format = my_get_err_msg(code);
+
+    /* If the caller wants to push a message to the client then
+    the caller must pass a valid session handle. */
+
+    ut_a(thd != 0);
+
+    /* The error code must exist in the errmsg-utf8.txt file. */
+    ut_a(format != 0);
+
+    va_start(args, code);
+
+    myf	l;
+
+    switch (level) {
+    case IB_LOG_LEVEL_INFO:
+        l = ME_NOTE;
+        break;
+    case IB_LOG_LEVEL_WARN:
+        l = ME_WARNING;
+        break;
+    default:
+        l = 0;
+        break;
+    }
+
+    my_printv_error(code, format, MYF(l), args);
+
+    va_end(args);
+
+    if (level == IB_LOG_LEVEL_FATAL) {
+        ut_error;
+    }
+}
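+/* Example call (illustrative only; the format string for the given code
+comes from errmsg-utf8.txt, so the trailing arguments must match it):
+@code
+	ib_senderrf(thd, IB_LOG_LEVEL_WARN,
+		    ER_TABLESPACE_DISCARDED, table->name.m_name);
+@endcode */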
+
+/******************************************************************//**
+Use this when the args are first converted to a formatted string and then
+passed to the format string from errmsg-utf8.txt. The error message format
+must be: "Some string ... %s".
+
+Push a warning message to the client; it is a wrapper around:
+
+void push_warning_printf(
+    THD *thd, Sql_condition::enum_condition_level level,
+    uint code, const char *format, ...);
+*/
+void
+ib_errf(
+/*====*/
+    THD*		thd,	/*!< in/out: session */
+    ib_log_level_t	level,	/*!< in: warning level */
+    ib_uint32_t		code,	/*!< MySQL error code */
+    const char*		format,	/*!< printf format */
+    ...)			/*!< Args */
+{
+    char*	str = NULL;
+    va_list	args;
+
+    /* If the caller wants to push a message to the client then
+    the caller must pass a valid session handle. */
+
+    ut_a(thd != 0);
+    ut_a(format != 0);
+
+    va_start(args, format);
+
+#ifdef _WIN32
+    int	size = _vscprintf(format, args) + 1;
+    if (size > 0) {
+        str = static_cast<char*>(malloc(size));
+    }
+    if (str == NULL) {
+        va_end(args);
+        return;	/* Watch for Out-Of-Memory */
+    }
+    str[size - 1] = 0x0;
+    vsnprintf(str, size, format, args);
+#elif HAVE_VASPRINTF
+    if (vasprintf(&str, format, args) == -1) {
+        /* In case of failure use a fixed length string */
+        str = static_cast<char*>(malloc(BUFSIZ));
+        vsnprintf(str, BUFSIZ, format, args);
+    }
+#else
+    /* Use a fixed length string. */
+    str = static_cast<char*>(malloc(BUFSIZ));
+    if (str == NULL) {
+        va_end(args);
+        return;	/* Watch for Out-Of-Memory */
+    }
+    vsnprintf(str, BUFSIZ, format, args);
+#endif /* _WIN32 */
+
+    ib_senderrf(thd, level, code, str);
+
+    va_end(args);
+    free(str);
+}
+
+/* Keep the first 16 characters as-is, since the url is sometimes used
+as an offset from this. */
+const char*	TROUBLESHOOTING_MSG =
+	"Please refer to https://mariadb.com/kb/en/innodb-troubleshooting/"
+	" for how to resolve the issue.";
+
+const char*	TROUBLESHOOT_DATADICT_MSG =
+	"Please refer to https://mariadb.com/kb/en/innodb-data-dictionary-troubleshooting/"
+	" for how to resolve the issue.";
+
+const char*	BUG_REPORT_MSG =
+	"Submit a detailed bug report to https://jira.mariadb.org/";
+
+const char*	FORCE_RECOVERY_MSG =
+	"Please refer to "
+	"https://mariadb.com/kb/en/library/innodb-recovery-modes/"
+	" for information about forcing recovery.";
+
+const char*	OPERATING_SYSTEM_ERROR_MSG =
+	"Some operating system error numbers are described at"
+	" https://mariadb.com/kb/en/library/operating-system-error-codes/";
+
+const char*	FOREIGN_KEY_CONSTRAINTS_MSG =
+	"Please refer to https://mariadb.com/kb/en/library/foreign-keys/"
+	" for correct foreign key definition.";
+
+const char*	SET_TRANSACTION_MSG =
+	"Please refer to https://mariadb.com/kb/en/library/set-transaction/";
+
+const char*	INNODB_PARAMETERS_MSG =
+	"Please refer to https://mariadb.com/kb/en/library/innodb-system-variables/";
+
+/**********************************************************************
+Converts an identifier from the UTF-8 system charset to my_charset_filename.
+@return result string length, as returned by strconvert() */
+uint
+innobase_convert_to_filename_charset(
+/*=================================*/
+    char*		to,	/* out: converted identifier */
+    const char*		from,	/* in: identifier to convert */
+    ulint		len)	/* in: length of 'to', in bytes */
+{
+    uint		errors;
+    CHARSET_INFO*	cs_to = &my_charset_filename;
+    CHARSET_INFO*	cs_from = system_charset_info;
+
+    return(static_cast<uint>(strconvert(
+        cs_from, from, uint(strlen(from)),
+        cs_to, to, static_cast<uint>(len), &errors)));
+}
+
+/**********************************************************************
+Converts an identifier from my_charset_filename to the UTF-8 system charset.
+@return result string length, as returned by strconvert() */
+uint
+innobase_convert_to_system_charset(
+/*===============================*/
+    char*		to,	/* out: converted identifier */
+    const char*		from,	/* in: identifier to convert */
+    ulint		len,	/* in: length of 'to', in bytes */
+    uint*		errors)	/* out: error return */
+{
+    CHARSET_INFO*	cs1 = &my_charset_filename;
+    CHARSET_INFO*	cs2 = system_charset_info;
+
+    return(static_cast<uint>(strconvert(
+        cs1, from, static_cast<uint>(strlen(from)),
+        cs2, to, static_cast<uint>(len), errors)));
+}
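+/* These conversions map between the filesystem-safe on-disk encoding and
+the UTF-8 system charset. A rough sketch (hypothetical buffers; a name
+"t@b" in the system charset is stored on disk as "t@0040b"):
+@code
+	char	displayed[FN_REFLEN];
+	char	on_disk[FN_REFLEN];
+	uint	errors;
+
+	innobase_convert_to_system_charset(displayed, "t@0040b",
+					   sizeof displayed, &errors);
+	innobase_convert_to_filename_charset(on_disk, displayed,
+					     sizeof on_disk);
+@endcode */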
+
+/** Validate the requested buffer pool size. Also, reserve the necessary
+memory needed for a buffer pool resize.
+@param[in]	thd	thread handle
+@param[out]	save	immediate result for update function
+@param[in]	value	incoming string
+@return 0 on success, 1 on failure.
+*/
+static
+int
+innodb_buffer_pool_size_validate(
+    THD*			thd,
+    st_mysql_sys_var*,
+    void*			save,
+    struct st_mysql_value*	value)
+{
+    longlong	intbuf;
+
+    value->val_int(value, &intbuf);
+
+    if (static_cast<ulonglong>(intbuf)
+        < MYSQL_SYSVAR_NAME(buffer_pool_size).min_val) {
+        push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+                            ER_WRONG_ARGUMENTS,
+                            "innodb_buffer_pool_size must be at least"
+                            " %lld for innodb_page_size=%lu",
+                            MYSQL_SYSVAR_NAME(buffer_pool_size).min_val,
+                            srv_page_size);
+        return(1);
+    }
+
+    if (!srv_was_started) {
+        push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+                            ER_WRONG_ARGUMENTS,
+                            "Cannot update innodb_buffer_pool_size,"
+                            " because InnoDB is not started.");
+        return(1);
+    }
+
+    mysql_mutex_lock(&buf_pool.mutex);
+
+    if (srv_buf_pool_old_size != srv_buf_pool_size) {
+        mysql_mutex_unlock(&buf_pool.mutex);
+        my_printf_error(ER_WRONG_ARGUMENTS,
+                        "Another buffer pool resize is already in progress.", MYF(0));
+        return(1);
+    }
+
+    ulint	requested_buf_pool_size = buf_pool_size_align(ulint(intbuf));
+
+    *static_cast<ulonglong*>(save) = requested_buf_pool_size;
+
+    if (srv_buf_pool_size == ulint(intbuf)) {
+        mysql_mutex_unlock(&buf_pool.mutex);
+        /* nothing to do */
+        return(0);
+    }
+
+    if (srv_buf_pool_size == requested_buf_pool_size) {
+        mysql_mutex_unlock(&buf_pool.mutex);
+        push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+                            ER_WRONG_ARGUMENTS,
+                            "innodb_buffer_pool_size must be at least"
+                            " innodb_buffer_pool_chunk_size=%zu",
+                            srv_buf_pool_chunk_unit);
+        /* nothing to do */
+        return(0);
+    }
+
+    srv_buf_pool_size = requested_buf_pool_size;
+    mysql_mutex_unlock(&buf_pool.mutex);
+
+    if (intbuf != static_cast<longlong>(requested_buf_pool_size)) {
+        char	buf[64];
+        int	len = 64;
+        value->val_str(value, buf, &len);
+        push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+                            ER_TRUNCATED_WRONG_VALUE,
+                            "Truncated incorrect %-.32s value: '%-.128s'",
+                            mysql_sysvar_buffer_pool_size.name,
+                            value->val_str(value, buf, &len));
+    }
+
+    return(0);
+}
+
+/*************************************************************//**
+Check for a valid value of innodb_compression_algorithm.
+@return	0 for a valid innodb_compression_algorithm.
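+Valid values are the names in the innodb_compression_algorithm enumeration
+(none, zlib, lz4, lzo, lzma, bzip2, snappy); a name whose compression
+library was not built in or could not be loaded is rejected with a warning.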
+*/
+static
+int
+innodb_compression_algorithm_validate(
+/*==================================*/
+    THD*			thd,	/*!< in: thread handle */
+    struct st_mysql_sys_var*	var,	/*!< in: pointer to system
+					variable */
+    void*			save,	/*!< out: immediate result
+					for update function */
+    struct st_mysql_value*	value)	/*!< in: incoming string */
+{
+    DBUG_ENTER("innodb_compression_algorithm_validate");
+
+    if (check_sysvar_enum(thd, var, save, value)) {
+        DBUG_RETURN(1);
+    }
+
+    if (compression_algorithm_is_not_loaded(*(ulong*)save, ME_WARNING))
+        DBUG_RETURN(1);
+    DBUG_RETURN(0);
+}
+
+static
+int
+innodb_encrypt_tables_validate(
+/*=================================*/
+    THD*			thd,	/*!< in: thread handle */
+    struct st_mysql_sys_var*	var,	/*!< in: pointer to system
+					variable */
+    void*			save,	/*!< out: immediate result
+					for update function */
+    struct st_mysql_value*	value)	/*!< in: incoming string */
+{
+    if (check_sysvar_enum(thd, var, save, value)) {
+        return 1;
+    }
+
+    ulong encrypt_tables = *(ulong*)save;
+
+    if (encrypt_tables
+        && !encryption_key_id_exists(FIL_DEFAULT_ENCRYPTION_KEY)) {
+        push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+                            HA_ERR_UNSUPPORTED,
+                            "InnoDB: cannot enable encryption, "
+                            "encryption plugin is not available");
+        return 1;
+    }
+
+    return 0;
+}
+
+static void innodb_remember_check_sysvar_funcs()
+{
+    /* remember the built-in sysvar check functions */
+    ut_ad((MYSQL_SYSVAR_NAME(checksum_algorithm).flags & 0x1FF) == PLUGIN_VAR_ENUM);
+    check_sysvar_enum = MYSQL_SYSVAR_NAME(checksum_algorithm).check;
+
+    ut_ad((MYSQL_SYSVAR_NAME(flush_log_at_timeout).flags & 15) == PLUGIN_VAR_INT);
+    check_sysvar_int = MYSQL_SYSVAR_NAME(flush_log_at_timeout).check;
+}
+
+static const size_t MAX_BUF_SIZE = 4 * 1024;
+
+/********************************************************************//**
+Helper function to push warnings from InnoDB internals to the SQL layer. */
+void
+ib_push_warning(
+    trx_t*	trx,	/*!< in: trx */
+    dberr_t	error,	/*!< in: error code to push as warning */
+    const char	*format,/*!< in: warning message */
+    ...)
+{
+    if (trx && trx->mysql_thd) {
+        THD *thd = (THD *)trx->mysql_thd;
+        va_list args;
+        char *buf;
+
+        va_start(args, format);
+        buf = (char *)my_malloc(PSI_INSTRUMENT_ME, MAX_BUF_SIZE, MYF(MY_WME));
+        buf[MAX_BUF_SIZE - 1] = 0;
+        vsnprintf(buf, MAX_BUF_SIZE - 1, format, args);
+
+        push_warning_printf(
+            thd, Sql_condition::WARN_LEVEL_WARN,
+            uint(convert_error_code_to_mysql(error, 0, thd)), buf);
+        my_free(buf);
+        va_end(args);
+    }
+}
+
+/********************************************************************//**
+Helper function to push warnings from InnoDB internals to the SQL layer. */
+void
+ib_push_warning(
+    void*	ithd,	/*!< in: thd */
+    dberr_t	error,	/*!< in: error code to push as warning */
+    const char	*format,/*!< in: warning message */
+    ...)
+{
+    va_list args;
+    THD *thd = (THD *)ithd;
+    char *buf;
+
+    if (ithd == NULL) {
+        thd = current_thd;
+    }
+
+    if (thd) {
+        va_start(args, format);
+        buf = (char *)my_malloc(PSI_INSTRUMENT_ME, MAX_BUF_SIZE, MYF(MY_WME));
+        buf[MAX_BUF_SIZE - 1] = 0;
+        vsnprintf(buf, MAX_BUF_SIZE - 1, format, args);
+
+        push_warning_printf(
+            thd, Sql_condition::WARN_LEVEL_WARN,
+            uint(convert_error_code_to_mysql(error, 0, thd)), buf);
+        my_free(buf);
+        va_end(args);
+    }
+}
+
+/** Helper function to push warnings from InnoDB internals to the SQL layer.
+@param[in]	trx
+@param[in]	error		Error code to push as warning
+@param[in]	table_name	Table name
+@param[in]	format		Warning message
+@param[in]	...
Message arguments */ +void +ib_foreign_warn(trx_t* trx, /*!< in: trx */ + dberr_t error, /*!< in: error code to push as warning */ + const char* table_name, + const char* format, /*!< in: warning message */ + ...) +{ + va_list args; + char* buf; + static FILE* ef = dict_foreign_err_file; + static const size_t MAX_BUF_SIZE = 4 * 1024; + buf = (char*)my_malloc(PSI_INSTRUMENT_ME, MAX_BUF_SIZE, MYF(MY_WME)); + if (!buf) { + return; + } + + va_start(args, format); + vsprintf(buf, format, args); + va_end(args); + + mysql_mutex_lock(&dict_foreign_err_mutex); + rewind(ef); + ut_print_timestamp(ef); + fprintf(ef, " Error in foreign key constraint of table %s:\n", + table_name); + fputs(buf, ef); + mysql_mutex_unlock(&dict_foreign_err_mutex); + + if (trx && trx->mysql_thd) { + THD* thd = (THD*)trx->mysql_thd; + + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + uint(convert_error_code_to_mysql(error, 0, thd)), buf); + } + + my_free(buf); +} + +/********************************************************************//** +Helper function to push frm mismatch error to error log and +if needed to sql-layer. */ +void +ib_push_frm_error( + THD* thd, /*!< in: MySQL thd */ + dict_table_t* ib_table, /*!< in: InnoDB table */ + TABLE* table, /*!< in: MySQL table */ + ulint n_keys, /*!< in: InnoDB #keys */ + bool push_warning) /*!< in: print warning ? */ +{ + switch (ib_table->dict_frm_mismatch) { + case DICT_FRM_NO_PK: + sql_print_error("Table %s has a primary key in " + "InnoDB data dictionary, but not " + "in MariaDB!" + " Have you mixed up " + ".frm files from different " + "installations? See " + "https://mariadb.com/kb/en/innodb-troubleshooting/\n", + ib_table->name.m_name); + + if (push_warning) { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_NO_SUCH_INDEX, + "InnoDB: Table %s has a " + "primary key in InnoDB data " + "dictionary, but not in " + "MariaDB!", ib_table->name.m_name); + } + break; + case DICT_NO_PK_FRM_HAS: + sql_print_error( + "Table %s has no primary key in InnoDB data " + "dictionary, but has one in MariaDB! If you " + "created the table with a MariaDB version < " + "3.23.54 and did not define a primary key, " + "but defined a unique key with all non-NULL " + "columns, then MariaDB internally treats that " + "key as the primary key. You can fix this " + "error by dump + DROP + CREATE + reimport " + "of the table.", ib_table->name.m_name); + + if (push_warning) { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_NO_SUCH_INDEX, + "InnoDB: Table %s has no " + "primary key in InnoDB data " + "dictionary, but has one in " + "MariaDB!", + ib_table->name.m_name); + } + break; + + case DICT_FRM_INCONSISTENT_KEYS: + sql_print_error("InnoDB: Table %s contains " ULINTPF " " + "indexes inside InnoDB, which " + "is different from the number of " + "indexes %u defined in the .frm file. 
See " + "https://mariadb.com/kb/en/innodb-troubleshooting/\n", + ib_table->name.m_name, n_keys, + table->s->keys); + + if (push_warning) { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_NO_SUCH_INDEX, + "InnoDB: Table %s contains " ULINTPF " " + "indexes inside InnoDB, which " + "is different from the number of " + "indexes %u defined in the MariaDB ", + ib_table->name.m_name, n_keys, + table->s->keys); + } + break; + + case DICT_FRM_CONSISTENT: + default: + sql_print_error("InnoDB: Table %s is consistent " + "on InnoDB data dictionary and MariaDB " + " FRM file.", + ib_table->name.m_name); + ut_error; + break; + } +} + +/** Writes 8 bytes to nth tuple field +@param[in] tuple where to write +@param[in] nth index in tuple +@param[in] data what to write +@param[in] buf field data buffer */ +static void set_tuple_col_8(dtuple_t *tuple, int col, uint64_t data, byte *buf) +{ + dfield_t *dfield= dtuple_get_nth_field(tuple, col); + ut_ad(dfield->type.len == 8); + if (dfield->len == UNIV_SQL_NULL) + { + dfield_set_data(dfield, buf, 8); + } + ut_ad(dfield->len == dfield->type.len && dfield->data); + mach_write_to_8(dfield->data, data); +} + +void ins_node_t::vers_update_end(row_prebuilt_t *prebuilt, bool history_row) +{ + ut_ad(prebuilt->ins_node == this); + trx_t *trx= prebuilt->trx; +#ifndef DBUG_OFF + ut_ad(table->vers_start != table->vers_end); + const mysql_row_templ_t *t= prebuilt->get_template_by_col(table->vers_end); + ut_ad(t); + ut_ad(t->mysql_col_len == 8); +#endif + + if (history_row) + { + set_tuple_col_8(row, table->vers_end, trx->id, vers_end_buf); + } + else /* ROW_INS_VERSIONED */ + { + set_tuple_col_8(row, table->vers_end, TRX_ID_MAX, vers_end_buf); +#ifndef DBUG_OFF + t= prebuilt->get_template_by_col(table->vers_start); + ut_ad(t); + ut_ad(t->mysql_col_len == 8); +#endif + set_tuple_col_8(row, table->vers_start, trx->id, vers_start_buf); + } + dict_index_t *clust_index= dict_table_get_first_index(table); + THD *thd= trx->mysql_thd; + TABLE *mysql_table= prebuilt->m_mysql_table; + mem_heap_t *local_heap= NULL; + for (ulint col_no= 0; col_no < dict_table_get_n_v_cols(table); col_no++) + { + const dict_v_col_t *v_col= dict_table_get_nth_v_col(table, col_no); + for (ulint i= 0; i < unsigned(v_col->num_base); i++) + if (v_col->base_col[i]->ind == table->vers_end) + innobase_get_computed_value(row, v_col, clust_index, &local_heap, + table->heap, NULL, thd, mysql_table, + mysql_table->record[0], NULL, NULL); + } + if (UNIV_LIKELY_NULL(local_heap)) + mem_heap_free(local_heap); +} + +/** Calculate aligned buffer pool size based on srv_buf_pool_chunk_unit, +if needed. +@param[in] size size in bytes +@return aligned size */ +ulint +buf_pool_size_align( + ulint size) +{ + const size_t m = srv_buf_pool_chunk_unit; + size = ut_max(size, (size_t) MYSQL_SYSVAR_NAME(buffer_pool_size).min_val); + + if (size % m == 0) { + return(size); + } else { + return (size / m + 1) * m; + } +} diff --git a/storage/innobase/handler/ha_innodb.h b/storage/innobase/handler/ha_innodb.h new file mode 100644 index 00000000..1f42bf18 --- /dev/null +++ b/storage/innobase/handler/ha_innodb.h @@ -0,0 +1,937 @@ +/***************************************************************************** + +Copyright (c) 2000, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2022, MariaDB Corporation. 
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+#ifdef WITH_WSREP
+#include "wsrep_api.h"
+#include <mysql/service_wsrep.h>
+#endif /* WITH_WSREP */
+
+#include "table.h"
+
+/* The InnoDB handler: the interface between MySQL and InnoDB. */
+
+/** "GEN_CLUST_INDEX" is the name reserved for the InnoDB default
+system clustered index when there is no primary key. */
+extern const char innobase_index_reserve_name[];
+
+/** Prebuilt structures in an InnoDB table handle used within MySQL */
+struct row_prebuilt_t;
+
+/** InnoDB transaction */
+struct trx_t;
+
+/** Engine specific table options are defined using this struct */
+struct ha_table_option_struct
+{
+    bool		page_compressed;	/*!< Table is using page compression
+						if this option is true. */
+    ulonglong	page_compression_level;	/*!< Table page compression level
+						0-9. */
+    uint		atomic_writes;		/*!< Use atomic writes for this
+						table if this option is ON, or
+						when it is DEFAULT and
+						innodb_use_atomic_writes is set.
+						Atomic writes are not used if
+						the value is OFF. */
+    uint		encryption;		/*!< DEFAULT, ON, OFF */
+    ulonglong	encryption_key_id;	/*!< encryption key id */
+};
+
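+/* Sketch of how these options surface at runtime (hypothetical check, for
+illustration only; the parsed values are attached to the TABLE_SHARE as
+option_struct):
+@code
+	const ha_table_option_struct*	opts = table_share->option_struct;
+
+	if (opts && opts->page_compressed) {
+		// CREATE TABLE ... PAGE_COMPRESSED=1 was given; the engine
+		// then honours opts->page_compression_level (0-9).
+	}
+@endcode */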
+/** The class defining a handle to an InnoDB table */
+class ha_innobase final : public handler
+{
+public:
+	ha_innobase(handlerton* hton, TABLE_SHARE* table_arg);
+	~ha_innobase() override;
+
+	/** @return the transaction that last modified the table definition
+	@see dict_table_t::def_trx_id */
+	ulonglong table_version() const override;
+
+	/** Get the row type from the storage engine. If this method returns
+	ROW_TYPE_NOT_USED, the information in HA_CREATE_INFO should be used. */
+	enum row_type get_row_type() const override;
+
+	const char* table_type() const override;
+
+	const char* index_type(uint key_number) override;
+
+	Table_flags table_flags() const override;
+
+	ulong index_flags(uint idx, uint part, bool all_parts) const override;
+
+	uint max_supported_keys() const override;
+
+	uint max_supported_key_length() const override;
+
+	uint max_supported_key_part_length() const override;
+
+	const key_map* keys_to_use_for_scanning() override;
+
+	void column_bitmaps_signal() override;
+
+	/** Opens dictionary table object using table name. For partition, we need to
+	try alternative lower/upper case names to support moving data files across
+	platforms.
+	@param[in]	table_name	name of the table/partition
+	@param[in]	norm_name	normalized name of the table/partition
+	@param[in]	is_partition	if this is a partition of a table
+	@param[in]	ignore_err	error to ignore for loading dictionary object
+	@return dictionary table object or NULL if not found */
+	static dict_table_t* open_dict_table(
+		const char*		table_name,
+		const char*		norm_name,
+		bool			is_partition,
+		dict_err_ignore_t	ignore_err);
+
+	int open(const char *name, int mode, uint test_if_locked) override;
+
+	handler* clone(const char *name, MEM_ROOT *mem_root) override;
+
+	int close(void) override;
+
+	double scan_time() override;
+
+	double read_time(uint index, uint ranges, ha_rows rows) override;
+
+	int write_row(const uchar * buf) override;
+
+	int update_row(const uchar * old_data, const uchar * new_data) override;
+
+	int delete_row(const uchar * buf) override;
+
+	bool was_semi_consistent_read() override;
+
+	void try_semi_consistent_read(bool yes) override;
+
+	void unlock_row() override;
+
+	int index_init(uint index, bool sorted) override;
+
+	int index_end() override;
+
+	int index_read(
+		uchar*			buf,
+		const uchar*		key,
+		uint			key_len,
+		ha_rkey_function	find_flag) override;
+
+	int index_read_last(uchar * buf, const uchar * key,
+			    uint key_len) override;
+
+	int index_next(uchar * buf) override;
+
+	int index_next_same(uchar * buf, const uchar * key,
+			    uint keylen) override;
+
+	int index_prev(uchar * buf) override;
+
+	int index_first(uchar * buf) override;
+
+	int index_last(uchar * buf) override;
+
+	/* Copy a cached MySQL row. If requested, also avoids
+	overwriting non-read columns. */
+	void copy_cached_row(uchar *to_rec, const uchar *from_rec,
+			     uint rec_length);
+	int rnd_init(bool scan) override;
+
+	int rnd_end() override;
+
+	int rnd_next(uchar *buf) override;
+
+	int rnd_pos(uchar * buf, uchar *pos) override;
+
+	int ft_init() override;
+	void ft_end() override { rnd_end(); }
+	FT_INFO *ft_init_ext(uint flags, uint inx, String* key) override;
+	int ft_read(uchar* buf) override;
+
+	void position(const uchar *record) override;
+
+	int info(uint) override;
+
+	int analyze(THD* thd, HA_CHECK_OPT* check_opt) override;
+
+	int optimize(THD* thd, HA_CHECK_OPT* check_opt) override;
+
+	int discard_or_import_tablespace(my_bool discard) override;
+
+	int extra(ha_extra_function operation) override;
+
+	int reset() override;
+
+	int external_lock(THD *thd, int lock_type) override;
+
+	int start_stmt(THD *thd, thr_lock_type lock_type) override;
+
+	ha_rows records_in_range(
+		uint			inx,
+		const key_range*	min_key,
+		const key_range*	max_key,
+		page_range*		pages) override;
+
+	ha_rows estimate_rows_upper_bound() override;
+
+	void update_create_info(HA_CREATE_INFO* create_info) override;
+
+	int create(
+		const char*		name,
+		TABLE*			form,
+		HA_CREATE_INFO*		create_info,
+		bool			file_per_table,
+		trx_t*			trx);
+
+	int create(
+		const char*		name,
+		TABLE*			form,
+		HA_CREATE_INFO*		create_info) override;
+
+	int truncate() override;
+
+	int delete_table(const char *name) override;
+
+	int rename_table(const char* from, const char* to) override;
+	inline int defragment_table();
+	int check(THD* thd, HA_CHECK_OPT* check_opt) override;
+
+	inline void reload_statistics();
+
+	char* get_foreign_key_create_info() override;
+
+	int get_foreign_key_list(THD *thd,
+				 List<FOREIGN_KEY_INFO> *f_key_list) override;
+
+	int get_parent_foreign_key_list(
+		THD*			thd,
+		List<FOREIGN_KEY_INFO>*	f_key_list) override;
+
+	bool can_switch_engines() override;
+
+	uint referenced_by_foreign_key() override;
+
+	void
free_foreign_key_create_info(char* str) override { my_free(str); } + + uint lock_count(void) const override; + + THR_LOCK_DATA** store_lock( + THD* thd, + THR_LOCK_DATA** to, + thr_lock_type lock_type) override; + + void init_table_handle_for_HANDLER() override; + + void get_auto_increment( + ulonglong offset, + ulonglong increment, + ulonglong nb_desired_values, + ulonglong* first_value, + ulonglong* nb_reserved_values) override; + + bool get_error_message(int error, String *buf) override; + + bool get_foreign_dup_key(char*, uint, char*, uint) override; + + uint8 table_cache_type() override; + + /** + Ask handler about permission to cache table during query registration + */ + my_bool register_query_cache_table( + THD* thd, + const char* table_key, + uint key_length, + qc_engine_callback* call_back, + ulonglong* engine_data) override; + + int cmp_ref(const uchar* ref1, const uchar* ref2) override; + + /** On-line ALTER TABLE interface @see handler0alter.cc @{ */ + + /** Check if InnoDB supports a particular alter table in-place + @param altered_table TABLE object for new version of table. + @param ha_alter_info Structure describing changes to be done + by ALTER TABLE and holding data used during in-place alter. + + @retval HA_ALTER_INPLACE_NOT_SUPPORTED Not supported + @retval HA_ALTER_INPLACE_INSTANT + MDL_EXCLUSIVE is needed for executing prepare_inplace_alter_table() + and commit_inplace_alter_table(). inplace_alter_table() + will not be called. + @retval HA_ALTER_INPLACE_COPY_NO_LOCK + MDL_EXCLUSIVE in prepare_inplace_alter_table(), which can be downgraded + to LOCK=NONE for rebuilding the table in inplace_alter_table() + @retval HA_ALTER_INPLACE_COPY_LOCK + MDL_EXCLUSIVE in prepare_inplace_alter_table(), which can be downgraded + to LOCK=SHARED for rebuilding the table in inplace_alter_table() + @retval HA_ALTER_INPLACE_NOCOPY_NO_LOCK + MDL_EXCLUSIVE in prepare_inplace_alter_table(), which can be downgraded + to LOCK=NONE for inplace_alter_table() which will not rebuild the table + @retval HA_ALTER_INPLACE_NOCOPY_LOCK + MDL_EXCLUSIVE in prepare_inplace_alter_table(), which can be downgraded + to LOCK=SHARED for inplace_alter_table() which will not rebuild + the table. */ + + enum_alter_inplace_result check_if_supported_inplace_alter( + TABLE* altered_table, + Alter_inplace_info* ha_alter_info) override; + + /** Allows InnoDB to update internal structures with concurrent + writes blocked (provided that check_if_supported_inplace_alter() + did not return HA_ALTER_INPLACE_NO_LOCK). + This will be invoked before inplace_alter_table(). + + @param altered_table TABLE object for new version of table. + @param ha_alter_info Structure describing changes to be done + by ALTER TABLE and holding data used during in-place alter. + + @retval true Failure + @retval false Success + */ + bool prepare_inplace_alter_table( + TABLE* altered_table, + Alter_inplace_info* ha_alter_info) override; + + /** Alter the table structure in-place with operations + specified using HA_ALTER_FLAGS and Alter_inplace_information. + The level of concurrency allowed during this operation depends + on the return value from check_if_supported_inplace_alter(). + + @param altered_table TABLE object for new version of table. + @param ha_alter_info Structure describing changes to be done + by ALTER TABLE and holding data used during in-place alter. 
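+	(When check_if_supported_inplace_alter() allowed LOCK=NONE, this
+	phase may run concurrently with DML against the table.)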
+ + @retval true Failure + @retval false Success + */ + bool inplace_alter_table( + TABLE* altered_table, + Alter_inplace_info* ha_alter_info) override; + + /** Commit or rollback the changes made during + prepare_inplace_alter_table() and inplace_alter_table() inside + the storage engine. Note that the allowed level of concurrency + during this operation will be the same as for + inplace_alter_table() and thus might be higher than during + prepare_inplace_alter_table(). (E.g concurrent writes were + blocked during prepare, but might not be during commit). + @param altered_table TABLE object for new version of table. + @param ha_alter_info Structure describing changes to be done + by ALTER TABLE and holding data used during in-place alter. + @param commit true => Commit, false => Rollback. + @retval true Failure + @retval false Success + */ + bool commit_inplace_alter_table( + TABLE* altered_table, + Alter_inplace_info* ha_alter_info, + bool commit) override; + /** @} */ + + bool check_if_incompatible_data( + HA_CREATE_INFO* info, + uint table_changes) override; + + /** @name Multi Range Read interface @{ */ + + /** Initialize multi range read @see DsMrr_impl::dsmrr_init + @param seq + @param seq_init_param + @param n_ranges + @param mode + @param buf */ + int multi_range_read_init( + RANGE_SEQ_IF* seq, + void* seq_init_param, + uint n_ranges, + uint mode, + HANDLER_BUFFER* buf) override; + + /** Process next multi range read @see DsMrr_impl::dsmrr_next + @param range_info */ + int multi_range_read_next(range_id_t *range_info) override; + + /** Initialize multi range read and get information. + @see ha_myisam::multi_range_read_info_const + @see DsMrr_impl::dsmrr_info_const + @param keyno + @param seq + @param seq_init_param + @param n_ranges + @param bufsz + @param flags + @param cost */ + ha_rows multi_range_read_info_const( + uint keyno, + RANGE_SEQ_IF* seq, + void* seq_init_param, + uint n_ranges, + uint* bufsz, + uint* flags, + Cost_estimate* cost) override; + + /** Initialize multi range read and get information. + @see DsMrr_impl::dsmrr_info + @param keyno + @param seq + @param seq_init_param + @param n_ranges + @param bufsz + @param flags + @param cost */ + ha_rows multi_range_read_info(uint keyno, uint n_ranges, uint keys, + uint key_parts, uint* bufsz, uint* flags, + Cost_estimate* cost) override; + + int multi_range_read_explain_info(uint mrr_mode, + char *str, size_t size) override; + + /** Attempt to push down an index condition. + @param[in] keyno MySQL key number + @param[in] idx_cond Index condition to be checked + @return idx_cond if pushed; NULL if not pushed */ + Item* idx_cond_push(uint keyno, Item* idx_cond) override; + /* @} */ + + /** Check if InnoDB is not storing virtual column metadata for a table. 
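+	This is the case when the .frm predates support for virtual column
+	expressions (frm_version < FRM_VER_EXPRESSSIONS) although the table
+	has virtual (generated) columns.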
+	@param	s	table definition (based on .frm file)
+	@return whether InnoDB will omit virtual column metadata */
+	static bool omits_virtual_cols(const TABLE_SHARE& s)
+	{
+		return s.frm_version < FRM_VER_EXPRESSSIONS && s.virtual_fields;
+	}
+
+protected:
+	/** The multi range read session object */
+	DsMrr_impl	m_ds_mrr;
+
+	/** Save CPU time with prebuilt/cached data structures */
+	row_prebuilt_t*	m_prebuilt;
+
+	/** index of the primary key within the server's key_info{} array */
+	uint	m_primary_key;
+
+	/** this is set to 1 when we are starting a table scan but have
+	not yet fetched any row, else false */
+	bool	m_start_of_scan;
+
+	/*!< match mode of the latest search: ROW_SEL_EXACT,
+	ROW_SEL_EXACT_PREFIX, or undefined */
+	uint	m_last_match_mode;
+
+	/** If mysql has locked with external_lock() */
+	bool	m_mysql_has_locked;
+};
+
+
+/* Some accessor functions which the InnoDB plugin needs, but which
+can not be added to mysql/plugin.h as part of the public interface;
+the definitions are bracketed with #ifdef INNODB_COMPATIBILITY_HOOKS */
+
+#ifndef INNODB_COMPATIBILITY_HOOKS
+#error InnoDB needs MySQL to be built with #define INNODB_COMPATIBILITY_HOOKS
+#endif
+
+extern "C" {
+
+/** Check if a user thread is running a non-transactional update
+@param thd user thread
+@retval 0 the user thread is not running a non-transactional update
+@retval 1 the user thread is running a non-transactional update */
+int thd_non_transactional_update(const MYSQL_THD thd);
+
+/** Get the user thread's binary logging format
+@param thd user thread
+@return Value to be used as index into the binlog_format_names array */
+int thd_binlog_format(const MYSQL_THD thd);
+
+/** Check if binary logging is filtered for the thread's current db.
+@param thd Thread handle
+@retval 1 the query is not filtered, 0 otherwise. */
+bool thd_binlog_filter_ok(const MYSQL_THD thd);
+
+/** Check if the query may generate row changes which may end up in the
+binary log.
+@param thd Thread handle
+@retval 1 the query may generate row changes, 0 otherwise.
+*/
+bool thd_sqlcom_can_generate_row_events(const MYSQL_THD thd);
+
+/** Is strict sql_mode set.
+@param thd Thread object
+@return True if sql_mode has strict mode (all or trans), false otherwise. */
+bool thd_is_strict_mode(const MYSQL_THD thd);
+
+} /* extern "C" */
+
+/** Get the file name and position of the MySQL binlog corresponding to the
+current commit. */
+extern void mysql_bin_log_commit_pos(THD *thd, ulonglong *out_pos, const char **out_file);
+
+struct trx_t;
+
+extern const struct _ft_vft ft_vft_result;
+
+/** Structure returned by ha_innobase::ft_init_ext() */
+typedef struct new_ft_info
+{
+	struct _ft_vft		*please;
+	struct _ft_vft_ext	*could_you;
+	row_prebuilt_t*		ft_prebuilt;
+	fts_result_t*		ft_result;
+} NEW_FT_INFO;
+
+/**
+Allocates an InnoDB transaction for a MySQL handler object.
+@return InnoDB transaction handle */
+trx_t*
+innobase_trx_allocate(
+	MYSQL_THD	thd);	/*!< in: user thread handle */
+
+/*********************************************************************//**
+This function checks each index name for a table against the reserved
+system default primary index name 'GEN_CLUST_INDEX'. If a name
+matches, this function pushes a warning message to the client,
+and returns true.
+@return true if the index name matches the reserved name */
+bool
+innobase_index_name_is_reserved(
+	THD*		thd,		/*!< in/out: MySQL connection */
+	const KEY*	key_info,	/*!< in: Indexes to be created */
+	ulint		num_of_keys)	/*!< in: Number of indexes to
+					be created. */
+	MY_ATTRIBUTE((nonnull(1), warn_unused_result));
+
+/** Parse hint for table and its indexes, and update the information
+in dictionary.
+@param[in] thd Connection thread +@param[in,out] table Target table +@param[in] table_share Table definition */ +void +innobase_parse_hint_from_comment( + THD* thd, + dict_table_t* table, + const TABLE_SHARE* table_share); + +/** Class for handling create table information. */ +class create_table_info_t +{ +public: + /** Constructor. + Used in two ways: + - all but file_per_table is used, when creating the table. + - all but name/path is used, when validating options and using flags. */ + create_table_info_t( + THD* thd, + const TABLE* form, + HA_CREATE_INFO* create_info, + char* table_name, + char* remote_path, + bool file_per_table, + trx_t* trx = NULL); + + /** Initialize the object. */ + int initialize(); + + /** Set m_tablespace_type. */ + void set_tablespace_type(bool table_being_altered_is_file_per_table); + + /** Create InnoDB foreign keys from MySQL alter_info. */ + dberr_t create_foreign_keys(); + + /** Create the internal innodb table. + @param create_fk whether to add FOREIGN KEY constraints */ + int create_table(bool create_fk = true); + + static void create_table_update_dict(dict_table_t* table, THD* thd, + const HA_CREATE_INFO& info, + const TABLE& t); + + /** Validates the create options. Checks that the options + KEY_BLOCK_SIZE, ROW_FORMAT, DATA DIRECTORY, TEMPORARY & TABLESPACE + are compatible with each other and other settings. + These CREATE OPTIONS are not validated here unless innodb_strict_mode + is on. With strict mode, this function will report each problem it + finds using a custom message with error code + ER_ILLEGAL_HA_CREATE_OPTION, not its built-in message. + @return NULL if valid, string name of bad option if not. */ + const char* create_options_are_invalid(); + + bool gcols_in_fulltext_or_spatial(); + + /** Validates engine specific table options not handled by + SQL-parser. + @return NULL if valid, string name of bad option if not. */ + const char* check_table_options(); + + /** Validate DATA DIRECTORY option. */ + bool create_option_data_directory_is_valid(); + + /** Validate TABLESPACE option. */ + bool create_option_tablespace_is_valid(); + + /** Prepare to create a table. */ + int prepare_create_table(const char* name, bool strict = true); + + void allocate_trx(); + + /** Checks that every index have sane size. Depends on strict mode */ + bool row_size_is_acceptable(const dict_table_t& table, + bool strict) const; + /** Checks that given index have sane size. Depends on strict mode */ + bool row_size_is_acceptable(const dict_index_t& index, + bool strict) const; + + /** Determines InnoDB table flags. + If strict_mode=OFF, this will adjust the flags to what should be assumed. + @retval true if successful, false if error */ + bool innobase_table_flags(); + + /** Set flags and append '/' to remote path if necessary. */ + void set_remote_path_flags(); + + /** Get table flags. */ + ulint flags() const + { return(m_flags); } + + /** Update table flags. */ + void flags_set(ulint flags) { m_flags |= flags; } + + /** Get table flags2. */ + ulint flags2() const + { return(m_flags2); } + + /** Get trx. */ + trx_t* trx() const + { return(m_trx); } + + /** @return table name */ + const char* table_name() const { return(m_table_name); } + + /** @return the created table */ + dict_table_t *table() const { return m_table; } + + THD* thd() const { return(m_thd); } + +private: + /** Parses the table name into normal name and either temp path or + remote path if needed.*/ + int + parse_table_name( + const char* name); + + /** Create the internal innodb table definition. 
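+	Builds the dict_table_t metadata for m_form according to
+	m_create_info.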
+ int create_table_def();
+
+ /** Connection thread handle. */
+ THD* m_thd;
+
+ /** InnoDB transaction handle. */
+ trx_t* m_trx;
+
+ /** Information on table columns and indexes. */
+ const TABLE* m_form;
+
+ /** Value of innodb_default_row_format */
+ const ulong m_default_row_format;
+
+ /** Create options. */
+ HA_CREATE_INFO* m_create_info;
+
+ /** Table name */
+ char* m_table_name;
+ /** Table */
+ dict_table_t* m_table;
+
+ /** Remote path (DATA DIRECTORY) or zero-length string */
+ char* m_remote_path;
+
+ /** Local copy of srv_file_per_table. */
+ bool m_innodb_file_per_table;
+
+ /** Allow file_per_table for this table either because:
+ 1) the setting innodb_file_per_table=on,
+ 2) it was explicitly requested by tablespace=innodb_file_per_table, or
+ 3) the table being altered is currently file_per_table. */
+ bool m_allow_file_per_table;
+
+ /** After all considerations, this shows whether we will actually
+ create a table and tablespace using file-per-table. */
+ bool m_use_file_per_table;
+
+ /** Using DATA DIRECTORY */
+ bool m_use_data_dir;
+
+ /** Table flags */
+ ulint m_flags;
+
+ /** Table flags2 */
+ ulint m_flags2;
+};
+
+/**
+Initialize the table FTS stopword list
+@return TRUE on success */
+ibool
+innobase_fts_load_stopword(
+/*=======================*/
+ dict_table_t* table, /*!< in: table with FTS */
+ trx_t* trx, /*!< in: transaction */
+ THD* thd) /*!< in: current thread */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Some defines for innobase_fts_check_doc_id_index() return value */
+enum fts_doc_id_index_enum {
+ FTS_INCORRECT_DOC_ID_INDEX,
+ FTS_EXIST_DOC_ID_INDEX,
+ FTS_NOT_EXIST_DOC_ID_INDEX
+};
+
+/**
+Check whether the table has a unique index with FTS_DOC_ID_INDEX_NAME
+on the Doc ID column.
+@return the status of the FTS_DOC_ID index */
+fts_doc_id_index_enum
+innobase_fts_check_doc_id_index(
+ const dict_table_t* table, /*!< in: table definition */
+ const TABLE* altered_table, /*!< in: MySQL table
+ that is being altered */
+ ulint* fts_doc_col_no) /*!< out: The column number for
+ Doc ID */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/**
+Check whether the table has a unique index with FTS_DOC_ID_INDEX_NAME
+on the Doc ID column in MySQL create index definition.
+@return FTS_EXIST_DOC_ID_INDEX if there exists the FTS_DOC_ID index,
+FTS_INCORRECT_DOC_ID_INDEX if the FTS_DOC_ID index is of wrong format */
+fts_doc_id_index_enum
+innobase_fts_check_doc_id_index_in_def(
+ ulint n_key, /*!< in: Number of keys */
+ const KEY* key_info) /*!< in: Key definitions */
+ MY_ATTRIBUTE((warn_unused_result));
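+
+/* For illustration (based on the InnoDB full-text Doc ID convention, not
+part of this API declaration): the layout that the checks above report as
+FTS_EXIST_DOC_ID_INDEX is a dedicated unique index on the hidden Doc ID
+column, e.g.:
+   CREATE TABLE t (FTS_DOC_ID BIGINT UNSIGNED NOT NULL, ...,
+                   UNIQUE INDEX FTS_DOC_ID_INDEX(FTS_DOC_ID)) ENGINE=InnoDB;
+any other definition of FTS_DOC_ID_INDEX yields
+FTS_INCORRECT_DOC_ID_INDEX. */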
+
+/**
+Copy table flags from MySQL's TABLE_SHARE into an InnoDB table object.
+Those flags are stored in the .frm file and end up in the MySQL table object,
+but are frequently used inside InnoDB, so we keep copies of them in the
+InnoDB table object. */
+void
+innobase_copy_frm_flags_from_table_share(
+ dict_table_t* innodb_table, /*!< in/out: InnoDB table */
+ const TABLE_SHARE* table_share); /*!< in: table share */
+
+/** Set up base columns for virtual column
+@param[in] table the InnoDB table
+@param[in] field MySQL field
+@param[in,out] v_col virtual column to be set up */
+void
+innodb_base_col_setup(
+ dict_table_t* table,
+ const Field* field,
+ dict_v_col_t* v_col);
+
+/** Set up base columns for stored column
+@param[in] table InnoDB table
+@param[in] field MySQL field
+@param[in,out] s_col stored column */
+void
+innodb_base_col_setup_for_stored(
+ const dict_table_t* table,
+ const Field* field,
+ dict_s_col_t* s_col);
+
+/** whether this is a stored generated column */
+#define innobase_is_s_fld(field) ((field)->vcol_info && (field)->stored_in_db())
+
+/** Converts a search mode flag understood by MySQL to a flag understood
+by InnoDB.
+@param[in] find_flag MySQL search mode flag.
+@return InnoDB search mode flag. */
+page_cur_mode_t
+convert_search_mode_to_innobase(
+ enum ha_rkey_function find_flag);
+
+/** Commits a transaction in an InnoDB database.
+@param[in] trx Transaction handle. */
+void
+innobase_commit_low(
+ trx_t* trx);
+
+extern my_bool innobase_stats_on_metadata;
+
+/** Calculate the records-per-key value.
+NULL values must be excluded if innodb_stats_method is set to "nulls_ignored".
+@param[in] index InnoDB index.
+@param[in] i The column for which we are calculating rec per key.
+@param[in] records Estimated total records.
+@return estimated record per key value */
+/* JAN: TODO: MySQL 5.7 */
+typedef float rec_per_key_t;
+rec_per_key_t
+innodb_rec_per_key(
+ dict_index_t* index,
+ ulint i,
+ ha_rows records);
+
+/** Build template for the virtual columns and their base columns
+@param[in] table MySQL TABLE
+@param[in] ib_table InnoDB dict_table_t
+@param[in,out] s_templ InnoDB template structure
+@param[in] add_v new virtual columns added along with
+ add index call
+@param[in] locked true if innobase_share_mutex is held */
+void
+innobase_build_v_templ(
+ const TABLE* table,
+ const dict_table_t* ib_table,
+ dict_vcol_templ_t* s_templ,
+ const dict_add_v_col_t* add_v,
+ bool locked);
+
+/** Callback used by the MySQL server layer to initialize
+the table virtual columns' template
+@param[in] table MySQL TABLE
+@param[in,out] ib_table InnoDB dict_table_t */
+void
+innobase_build_v_templ_callback(
+ const TABLE* table,
+ void* ib_table);
+
+/** Callback function definition, used by the MySQL server layer to
+initialize the table virtual columns' template */
+typedef void (*my_gcolumn_templatecallback_t)(const TABLE*, void*);
+
+/** Convert MySQL column number to dict_table_t::cols[] offset.
+@param[in] field non-virtual column
+@return column number relative to dict_table_t::cols[] */
+unsigned
+innodb_col_no(const Field* field)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/********************************************************************//**
+Helper function to push an .frm mismatch error to the error log and,
+if needed, to the SQL layer. */
+void
+ib_push_frm_error(
+ THD* thd, /*!< in: MySQL thd */
+ dict_table_t* ib_table, /*!< in: InnoDB table */
+ TABLE* table, /*!< in: MySQL table */
+ ulint n_keys, /*!< in: InnoDB #keys */
+ bool push_warning); /*!< in: print warning? */
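+
+/* Illustration for too_big_key_part_length() below (example numbers, not
+normative): with a 3072-byte key part limit, a KEY over a utf8mb4
+VARCHAR(1024) column (up to 4096 bytes) exceeds the limit, while
+VARCHAR(768) (at most 3072 bytes) does not. */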
+
+/** Check whether any index key part length exceeds the maximum allowed limit
+@param[in] max_field_len maximum allowed key part length
+@param[in] key MariaDB key definition
+@return true if index column length exceeds limit */
+MY_ATTRIBUTE((warn_unused_result))
+bool too_big_key_part_length(size_t max_field_len, const KEY& key);
+
+/** This function is used to roll back one X/Open XA distributed transaction
+which is in the prepared state
+
+@param[in] hton InnoDB handlerton
+@param[in] xid X/Open XA transaction identification
+
+@return 0 or error number */
+int innobase_rollback_by_xid(handlerton* hton, XID* xid);
diff --git a/storage/innobase/handler/handler0alter.cc b/storage/innobase/handler/handler0alter.cc
new file mode 100644
index 00000000..40370ac5
--- /dev/null
+++ b/storage/innobase/handler/handler0alter.cc
@@ -0,0 +1,11843 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2019, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file handler/handler0alter.cc
+Smart ALTER TABLE
+*******************************************************/
+
+/* Include necessary SQL headers */
+#include "univ.i"
+#include
+#include
+#include
+#include
+#include
+#include
+
+/* Include necessary InnoDB headers */
+#include "btr0sea.h"
+#include "dict0crea.h"
+#include "dict0dict.h"
+#include "dict0load.h"
+#include "dict0stats.h"
+#include "dict0stats_bg.h"
+#include "log0log.h"
+#include "rem0types.h"
+#include "row0log.h"
+#include "row0merge.h"
+#include "row0ins.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "trx0trx.h"
+#include "trx0purge.h"
+#include "handler0alter.h"
+#include "srv0mon.h"
+#include "srv0srv.h"
+#include "fts0priv.h"
+#include "fts0plugin.h"
+#include "pars0pars.h"
+#include "row0sel.h"
+#include "ha_innodb.h"
+#include "ut0stage.h"
+#include
+#include
+
+/** File format constraint for ALTER TABLE */
+extern ulong innodb_instant_alter_column_allowed;
+
+static const char *MSG_UNSUPPORTED_ALTER_ONLINE_ON_VIRTUAL_COLUMN=
+ "INPLACE ADD or DROP of virtual columns cannot be "
+ "combined with other ALTER TABLE actions";
+
+/** Operations for creating secondary indexes (no rebuild needed) */
+static const alter_table_operations INNOBASE_ONLINE_CREATE
+ = ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX
+ | ALTER_ADD_UNIQUE_INDEX;
+
+/** Operations that require filling in default values for columns */
+static const alter_table_operations INNOBASE_DEFAULTS
+ = ALTER_COLUMN_NOT_NULLABLE
+ | ALTER_ADD_STORED_BASE_COLUMN;
+
+
+/** Operations that require knowledge about row_start, row_end values */
+static const alter_table_operations INNOBASE_ALTER_VERSIONED_REBUILD
+ = ALTER_ADD_SYSTEM_VERSIONING
+ | ALTER_DROP_SYSTEM_VERSIONING;
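+
+/* These masks classify Alter_inplace_info::handler_flags: a request whose
+flags all fall within INNOBASE_INPLACE_IGNORE needs no storage engine work,
+while any flag in INNOBASE_ALTER_REBUILD forces a table rebuild unless the
+operation can be executed instantly. (A summary of how the constants are
+consumed later in this file, not an additional contract.) */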
+
+/** Operations for rebuilding a table in place */
+static const alter_table_operations INNOBASE_ALTER_REBUILD
+ = ALTER_ADD_PK_INDEX
+ | ALTER_DROP_PK_INDEX
+ | ALTER_OPTIONS
+ /* ALTER_OPTIONS needs to check alter_options_need_rebuild() */
+ | ALTER_COLUMN_NULLABLE
+ | INNOBASE_DEFAULTS
+ | ALTER_STORED_COLUMN_ORDER
+ | ALTER_DROP_STORED_COLUMN
+ | ALTER_RECREATE_TABLE
+ /*
+ | ALTER_STORED_COLUMN_TYPE
+ */
+ | INNOBASE_ALTER_VERSIONED_REBUILD
+ ;
+
+/** Operations that require changes to data */
+static const alter_table_operations INNOBASE_ALTER_DATA
+ = INNOBASE_ONLINE_CREATE | INNOBASE_ALTER_REBUILD;
+
+/** Operations for altering a table that InnoDB does not care about */
+static const alter_table_operations INNOBASE_INPLACE_IGNORE
+ = ALTER_COLUMN_DEFAULT
+ | ALTER_PARTITIONED
+ | ALTER_COLUMN_COLUMN_FORMAT
+ | ALTER_COLUMN_STORAGE_TYPE
+ | ALTER_CONVERT_TO
+ | ALTER_VIRTUAL_GCOL_EXPR
+ | ALTER_DROP_CHECK_CONSTRAINT
+ | ALTER_RENAME
+ | ALTER_INDEX_ORDER
+ | ALTER_COLUMN_INDEX_LENGTH
+ | ALTER_CHANGE_INDEX_COMMENT
+ | ALTER_INDEX_IGNORABILITY;
+
+/** Operations on foreign key definitions (changing the schema only) */
+static const alter_table_operations INNOBASE_FOREIGN_OPERATIONS
+ = ALTER_DROP_FOREIGN_KEY
+ | ALTER_ADD_FOREIGN_KEY;
+
+/** Operations that InnoDB cares about and can perform without creating data */
+static const alter_table_operations INNOBASE_ALTER_NOCREATE
+ = ALTER_DROP_NON_UNIQUE_NON_PRIM_INDEX
+ | ALTER_DROP_UNIQUE_INDEX;
+
+/** Operations that InnoDB cares about and can perform without validation */
+static const alter_table_operations INNOBASE_ALTER_NOVALIDATE
+ = INNOBASE_ALTER_NOCREATE
+ | ALTER_VIRTUAL_COLUMN_ORDER
+ | ALTER_COLUMN_NAME
+ | INNOBASE_FOREIGN_OPERATIONS
+ | ALTER_COLUMN_UNVERSIONED
+ | ALTER_DROP_VIRTUAL_COLUMN;
+
+/** Operations that InnoDB cares about and can perform without rebuild */
+static const alter_table_operations INNOBASE_ALTER_NOREBUILD
+ = INNOBASE_ONLINE_CREATE
+ | INNOBASE_ALTER_NOCREATE;
+
+/** Operations that can be performed instantly, without inplace_alter_table() */
+static const alter_table_operations INNOBASE_ALTER_INSTANT
+ = ALTER_VIRTUAL_COLUMN_ORDER
+ | ALTER_COLUMN_NAME
+ | ALTER_ADD_VIRTUAL_COLUMN
+ | INNOBASE_FOREIGN_OPERATIONS
+ | ALTER_COLUMN_TYPE_CHANGE_BY_ENGINE
+ | ALTER_COLUMN_UNVERSIONED
+ | ALTER_RENAME_INDEX
+ | ALTER_DROP_VIRTUAL_COLUMN;
+
+/** Initialize instant->field_map.
+@param[in] table table definition to copy from */
+inline void dict_table_t::init_instant(const dict_table_t& table)
+{
+ const dict_index_t& oindex __attribute__((unused))= *table.indexes.start;
+ dict_index_t& index = *indexes.start;
+ const unsigned u = index.first_user_field();
+ DBUG_ASSERT(u == oindex.first_user_field());
+ DBUG_ASSERT(index.n_fields >= oindex.n_fields);
+
+ field_map_element_t* field_map_it = static_cast<field_map_element_t*>(
+ mem_heap_zalloc(heap, (index.n_fields - u)
+ * sizeof *field_map_it));
+ instant->field_map = field_map_it;
+
+ ut_d(unsigned n_drop = 0);
+ ut_d(unsigned n_nullable = 0);
+ for (unsigned i = u; i < index.n_fields; i++) {
+ auto& f = index.fields[i];
+ ut_d(n_nullable += f.col->is_nullable());
+
+ if (!f.col->is_dropped()) {
+ (*field_map_it++).set_ind(f.col->ind);
+ continue;
+ }
+
+ auto fixed_len = dict_col_get_fixed_size(
+ f.col, not_redundant());
+ field_map_it->set_dropped();
+ if (!f.col->is_nullable()) {
+ field_map_it->set_not_null();
+ }
+ field_map_it->set_ind(fixed_len
+ ? uint16_t(fixed_len + 1)
+ : DATA_BIG_COL(f.col));
+ field_map_it++;
+ ut_ad(f.col >= table.instant->dropped);
+ ut_ad(f.col < table.instant->dropped
+ + table.instant->n_dropped);
+ ut_d(n_drop++);
+ size_t d = f.col - table.instant->dropped;
+ ut_ad(f.col == &table.instant->dropped[d]);
+ ut_ad(d <= instant->n_dropped);
+ f.col = &instant->dropped[d];
+ }
+ ut_ad(n_drop == n_dropped());
+ ut_ad(field_map_it == &instant->field_map[index.n_fields - u]);
+ ut_ad(index.n_nullable == n_nullable);
+}
+
+/** Set is_instant() before instant_column().
+@param[in] old previous table definition
+@param[in] col_map map from old.cols[] and old.v_cols[] to this
+@param[out] first_alter_pos 0, or 1 + first changed column position */
+inline void dict_table_t::prepare_instant(const dict_table_t& old,
+ const ulint* col_map,
+ unsigned& first_alter_pos)
+{
+ DBUG_ASSERT(!is_instant());
+ DBUG_ASSERT(n_dropped() == 0);
+ DBUG_ASSERT(old.n_cols == old.n_def);
+ DBUG_ASSERT(n_cols == n_def);
+ DBUG_ASSERT(old.supports_instant());
+ DBUG_ASSERT(not_redundant() == old.not_redundant());
+ DBUG_ASSERT(DICT_TF_HAS_ATOMIC_BLOBS(flags)
+ == DICT_TF_HAS_ATOMIC_BLOBS(old.flags));
+ DBUG_ASSERT(!persistent_autoinc
+ || persistent_autoinc == old.persistent_autoinc);
+ /* supports_instant() does not necessarily hold here,
+ in case ROW_FORMAT=COMPRESSED according to the
+ MariaDB data dictionary, and ALTER_OPTIONS was not set.
+ If that is the case, the instant ALTER TABLE would keep
+ the InnoDB table in its current format. */
+
+ dict_index_t& oindex = *old.indexes.start;
+ dict_index_t& index = *indexes.start;
+ first_alter_pos = 0;
+
+ for (unsigned i = 0; i + DATA_N_SYS_COLS < old.n_cols; i++) {
+ if (col_map[i] != i) {
+ first_alter_pos = 1 + i;
+ goto add_metadata;
+ }
+ }
+
+ if (!old.instant) {
+ /* Columns were not dropped or reordered.
+ Therefore columns must have been added at the end,
+ or modified instantly in place. */
+ DBUG_ASSERT(index.n_fields >= oindex.n_fields);
+ DBUG_ASSERT(index.n_fields > oindex.n_fields
+ || !not_redundant());
+#ifdef UNIV_DEBUG
+ if (index.n_fields == oindex.n_fields) {
+ ut_ad(!not_redundant());
+ for (unsigned i = index.n_fields; i--; ) {
+ ut_ad(index.fields[i].col->same_format(
+ *oindex.fields[i].col));
+ }
+ }
+#endif
+set_core_fields:
+ index.n_core_fields = oindex.n_core_fields;
+ index.n_core_null_bytes = oindex.n_core_null_bytes;
+ } else {
+add_metadata:
+ const unsigned n_old_drop = old.n_dropped();
+ unsigned n_drop = n_old_drop;
+ for (unsigned i = old.n_cols; i--; ) {
+ if (col_map[i] == ULINT_UNDEFINED) {
+ DBUG_ASSERT(i + DATA_N_SYS_COLS
+ < uint(old.n_cols));
+ n_drop++;
+ }
+ }
+
+ instant = new (mem_heap_alloc(heap, sizeof(dict_instant_t)))
+ dict_instant_t();
+ instant->n_dropped = n_drop;
+ if (n_drop) {
+ instant->dropped
+ = static_cast<dict_col_t*>(
+ mem_heap_alloc(heap, n_drop
+ * sizeof(dict_col_t)));
+ if (n_old_drop) {
+ memcpy(instant->dropped, old.instant->dropped,
+ n_old_drop * sizeof(dict_col_t));
+ }
+ } else {
+ instant->dropped = NULL;
+ }
+
+ for (unsigned i = 0, d = n_old_drop; i < old.n_cols; i++) {
+ if (col_map[i] == ULINT_UNDEFINED) {
+ (new (&instant->dropped[d++])
+ dict_col_t(old.cols[i]))->set_dropped();
+ }
+ }
+#ifndef DBUG_OFF
+ for (unsigned i = 0; i < n_drop; i++) {
+ DBUG_ASSERT(instant->dropped[i].is_dropped());
+ }
+#endif
+ const unsigned n_fields = index.n_fields + n_dropped();
+
+ DBUG_ASSERT(n_fields >= oindex.n_fields);
+ dict_field_t* fields = static_cast<dict_field_t*>(
+ mem_heap_zalloc(heap, n_fields * sizeof *fields));
+ unsigned i = 0, j = 0, n_nullable = 0;
+ ut_d(uint core_null = 0);
+ for (; i < oindex.n_fields; i++) {
+ DBUG_ASSERT(j <= i);
+ dict_field_t& f = fields[i] = oindex.fields[i];
+ if (f.col->is_dropped()) {
+ /* The column has been instantly
+ dropped earlier. */
+ DBUG_ASSERT(f.col >= old.instant->dropped);
+ {
+ size_t d = f.col
+ - old.instant->dropped;
+ DBUG_ASSERT(d < n_old_drop);
+ DBUG_ASSERT(&old.instant->dropped[d]
+ == f.col);
+ DBUG_ASSERT(!f.name);
+ f.col = instant->dropped + d;
+ }
+ if (f.col->is_nullable()) {
+found_nullable:
+ n_nullable++;
+ ut_d(core_null
+ += i < oindex.n_core_fields);
+ }
+ continue;
+ }
+
+ const ulint col_ind = col_map[f.col->ind];
+ if (col_ind != ULINT_UNDEFINED) {
+ if (index.fields[j].col->ind != col_ind) {
+ /* The fields for instantly
+ added columns must be placed
+ last in the clustered index.
+ Keep pre-existing fields in
+ the same position. */
+ uint k;
+ for (k = j + 1; k < index.n_fields;
+ k++) {
+ if (index.fields[k].col->ind
+ == col_ind) {
+ goto found_j;
+ }
+ }
+ DBUG_ASSERT("no such col" == 0);
+found_j:
+ std::swap(index.fields[j],
+ index.fields[k]);
+ }
+ DBUG_ASSERT(index.fields[j].col->ind
+ == col_ind);
+ fields[i] = index.fields[j++];
+ DBUG_ASSERT(!fields[i].col->is_dropped());
+ DBUG_ASSERT(fields[i].name
+ == fields[i].col->name(*this));
+ if (fields[i].col->is_nullable()) {
+ goto found_nullable;
+ }
+ continue;
+ }
+
+ /* This column is being dropped. */
+ unsigned d = n_old_drop;
+ for (unsigned c = 0; c < f.col->ind; c++) {
+ d += col_map[c] == ULINT_UNDEFINED;
+ }
+ DBUG_ASSERT(d < n_drop);
+ f.col = &instant->dropped[d];
+ f.name = NULL;
+ if (f.col->is_nullable()) {
+ goto found_nullable;
+ }
+ }
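+
+ /* At this point (a restatement of the loop above, for
+ orientation): fields[0..i) mirrors the old clustered index,
+ with dropped columns remapped into instant->dropped[], while
+ index.fields[j..] holds only instantly added columns, which
+ are sorted by column number and appended below. */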
+
+ /* In case of discarded tablespace, InnoDB can't
+ read the root page. So assign the null bytes based
+ on nullable fields */
+ if (!oindex.table->space) {
+ oindex.n_core_null_bytes = static_cast<uint8_t>(
+ UT_BITS_IN_BYTES(unsigned(oindex.n_nullable)));
+ }
+
+ /* The n_core_null_bytes only matters for
+ ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC tables. */
+ ut_ad(UT_BITS_IN_BYTES(core_null) == oindex.n_core_null_bytes
+ || !not_redundant());
+ DBUG_ASSERT(i >= oindex.n_core_fields);
+ DBUG_ASSERT(j <= i);
+ DBUG_ASSERT(n_fields - (i - j) == index.n_fields);
+ std::sort(index.fields + j, index.fields + index.n_fields,
+ [](const dict_field_t& a, const dict_field_t& b)
+ { return a.col->ind < b.col->ind; });
+ for (; i < n_fields; i++) {
+ fields[i] = index.fields[j++];
+ n_nullable += fields[i].col->is_nullable();
+ DBUG_ASSERT(!fields[i].col->is_dropped());
+ DBUG_ASSERT(fields[i].name
+ == fields[i].col->name(*this));
+ }
+ DBUG_ASSERT(j == index.n_fields);
+ index.n_fields = index.n_def = n_fields
+ & dict_index_t::MAX_N_FIELDS;
+ index.fields = fields;
+ DBUG_ASSERT(n_nullable >= index.n_nullable);
+ DBUG_ASSERT(n_nullable >= oindex.n_nullable);
+ index.n_nullable = n_nullable & dict_index_t::MAX_N_FIELDS;
+ goto set_core_fields;
+ }
+
+ DBUG_ASSERT(n_cols + n_dropped() >= old.n_cols + old.n_dropped());
+ DBUG_ASSERT(n_dropped() >= old.n_dropped());
+ DBUG_ASSERT(index.n_core_fields == oindex.n_core_fields);
+ DBUG_ASSERT(index.n_core_null_bytes == oindex.n_core_null_bytes);
+}
+
+/** Adjust index metadata for instant ADD/DROP/reorder COLUMN.
+@param[in] instant clustered index definition after instant ALTER TABLE */
+inline void dict_index_t::instant_add_field(const dict_index_t& instant)
+{
+ DBUG_ASSERT(is_primary());
+ DBUG_ASSERT(instant.is_primary());
+ DBUG_ASSERT(!has_virtual());
+ DBUG_ASSERT(!instant.has_virtual());
+ DBUG_ASSERT(instant.n_core_fields <= instant.n_fields);
+ DBUG_ASSERT(n_def == n_fields);
+ DBUG_ASSERT(instant.n_def == instant.n_fields);
+ DBUG_ASSERT(type == instant.type);
+ DBUG_ASSERT(trx_id_offset == instant.trx_id_offset);
+ DBUG_ASSERT(n_user_defined_cols == instant.n_user_defined_cols);
+ DBUG_ASSERT(n_uniq == instant.n_uniq);
+ DBUG_ASSERT(instant.n_fields >= n_fields);
+ DBUG_ASSERT(instant.n_nullable >= n_nullable);
+ DBUG_ASSERT(instant.n_core_fields == n_core_fields);
+ DBUG_ASSERT(instant.n_core_null_bytes == n_core_null_bytes);
+
+ /* instant will have all fields (including ones for columns
+ that have been or are being instantly dropped) in the same position
+ as this index. Fields for any added columns are appended at the end. */
+#ifndef DBUG_OFF
+ for (unsigned i = 0; i < n_fields; i++) {
+ DBUG_ASSERT(fields[i].same(instant.fields[i]));
+ DBUG_ASSERT(instant.fields[i].col->same_format(*fields[i]
+ .col));
+ /* Instant conversion from NULL to NOT NULL is not allowed. */
+ DBUG_ASSERT(!fields[i].col->is_nullable()
+ || instant.fields[i].col->is_nullable());
+ DBUG_ASSERT(fields[i].col->is_nullable()
+ == instant.fields[i].col->is_nullable()
+ || !table->not_redundant());
+ }
+#endif
+ n_fields = instant.n_fields;
+ n_def = instant.n_def;
+ n_nullable = instant.n_nullable;
+ fields = static_cast<dict_field_t*>(
+ mem_heap_dup(heap, instant.fields, n_fields * sizeof *fields));
+
+ ut_d(unsigned n_null = 0);
+ ut_d(unsigned n_dropped = 0);
+
+ for (unsigned i = 0; i < n_fields; i++) {
+ const dict_col_t* icol = instant.fields[i].col;
+ dict_field_t& f = fields[i];
+ ut_d(n_null += icol->is_nullable());
+ DBUG_ASSERT(!icol->is_virtual());
+ if (icol->is_dropped()) {
+ ut_d(n_dropped++);
+ f.col->set_dropped();
+ f.name = NULL;
+ } else {
+ f.col = &table->cols[icol - instant.table->cols];
+ f.name = f.col->name(*table);
+ }
+ }
+
+ ut_ad(n_null == n_nullable);
+ ut_ad(n_dropped == instant.table->n_dropped());
+}
+
+/** Adjust table metadata for instant ADD/DROP/reorder COLUMN.
+@param[in] table altered table (with dropped columns)
+@param[in] col_map mapping from cols[] and v_cols[] to table
+@return whether the metadata record must be updated */
+inline bool dict_table_t::instant_column(const dict_table_t& table,
+ const ulint* col_map)
+{
+ DBUG_ASSERT(!table.cached);
+ DBUG_ASSERT(table.n_def == table.n_cols);
+ DBUG_ASSERT(table.n_t_def == table.n_t_cols);
+ DBUG_ASSERT(n_def == n_cols);
+ DBUG_ASSERT(n_t_def == n_t_cols);
+ DBUG_ASSERT(n_v_def == n_v_cols);
+ DBUG_ASSERT(table.n_v_def == table.n_v_cols);
+ DBUG_ASSERT(table.n_cols + table.n_dropped() >= n_cols + n_dropped());
+ DBUG_ASSERT(!table.persistent_autoinc
+ || persistent_autoinc == table.persistent_autoinc);
+ ut_ad(dict_sys.locked());
+
+ {
+ const char* end = table.col_names;
+ for (unsigned i = table.n_cols; i--; ) end += strlen(end) + 1;
+
+ col_names = static_cast<char*>(
+ mem_heap_dup(heap, table.col_names,
+ ulint(end - table.col_names)));
+ }
+ const dict_col_t* const old_cols = cols;
+ cols = static_cast<dict_col_t*>(mem_heap_dup(heap, table.cols,
+ table.n_cols
+ * sizeof *cols));
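+
+ /* Reminder: col_map is indexed by the old column position and
+ holds the new position of that column, or ULINT_UNDEFINED for a
+ column that is being dropped (see prepare_instant()). */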
+
+ /* Preserve the default values of previously instantly added
+ columns, or copy the new default values to this->heap. */
+ for (uint16_t i = 0; i < table.n_cols; i++) {
+ dict_col_t& c = cols[i];
+
+ if (const dict_col_t* o = find(old_cols, col_map, n_cols, i)) {
+ c.def_val = o->def_val;
+ DBUG_ASSERT(!((c.prtype ^ o->prtype)
+ & ~(DATA_NOT_NULL | DATA_VERSIONED
+ | CHAR_COLL_MASK << 16
+ | DATA_LONG_TRUE_VARCHAR)));
+ DBUG_ASSERT(c.same_type(*o));
+ DBUG_ASSERT(c.len >= o->len);
+
+ if (o->vers_sys_start()) {
+ ut_ad(o->ind == vers_start);
+ vers_start = i & dict_index_t::MAX_N_FIELDS;
+ } else if (o->vers_sys_end()) {
+ ut_ad(o->ind == vers_end);
+ vers_end = i & dict_index_t::MAX_N_FIELDS;
+ }
+ continue;
+ }
+
+ DBUG_ASSERT(c.is_added());
+ if (c.def_val.len <= UNIV_PAGE_SIZE_MAX
+ && (!c.def_val.len
+ || !memcmp(c.def_val.data, field_ref_zero,
+ c.def_val.len))) {
+ c.def_val.data = field_ref_zero;
+ } else if (const void*& d = c.def_val.data) {
+ d = mem_heap_dup(heap, d, c.def_val.len);
+ } else {
+ DBUG_ASSERT(c.def_val.len == UNIV_SQL_NULL);
+ }
+ }
+
+ n_t_def = (n_t_def + (table.n_cols - n_cols))
+ & dict_index_t::MAX_N_FIELDS;
+ n_t_cols = (n_t_cols + (table.n_cols - n_cols))
+ & dict_index_t::MAX_N_FIELDS;
+ n_def = table.n_cols;
+
+ const dict_v_col_t* const old_v_cols = v_cols;
+
+ if (const char* end = table.v_col_names) {
+ for (unsigned i = table.n_v_cols; i--; ) {
+ end += strlen(end) + 1;
+ }
+
+ v_col_names = static_cast<char*>(
+ mem_heap_dup(heap, table.v_col_names,
+ ulint(end - table.v_col_names)));
+ v_cols = static_cast<dict_v_col_t*>(
+ mem_heap_alloc(heap, table.n_v_cols * sizeof(*v_cols)));
+ for (ulint i = table.n_v_cols; i--; ) {
+ new (&v_cols[i]) dict_v_col_t(table.v_cols[i]);
+ v_cols[i].v_indexes.clear();
+ }
+ } else {
+ ut_ad(table.n_v_cols == 0);
+ v_col_names = NULL;
+ v_cols = NULL;
+ }
+
+ n_t_def = (n_t_def + (table.n_v_cols - n_v_cols))
+ & dict_index_t::MAX_N_FIELDS;
+ n_t_cols = (n_t_cols + (table.n_v_cols - n_v_cols))
+ & dict_index_t::MAX_N_FIELDS;
+ n_v_def = table.n_v_cols;
+
+ for (unsigned i = 0; i < n_v_def; i++) {
+ dict_v_col_t& v = v_cols[i];
+ DBUG_ASSERT(v.v_indexes.empty());
+ v.base_col = static_cast<dict_col_t**>(
+ mem_heap_dup(heap, v.base_col,
+ v.num_base * sizeof *v.base_col));
+
+ for (ulint n = v.num_base; n--; ) {
+ dict_col_t*& base = v.base_col[n];
+ if (base->is_virtual()) {
+ } else if (base >= table.cols
+ && base < table.cols + table.n_cols) {
+ /* The base column was instantly added. */
+ size_t c = base - table.cols;
+ DBUG_ASSERT(base == &table.cols[c]);
+ base = &cols[c];
+ } else {
+ DBUG_ASSERT(base >= old_cols);
+ size_t c = base - old_cols;
+ DBUG_ASSERT(c + DATA_N_SYS_COLS < n_cols);
+ DBUG_ASSERT(base == &old_cols[c]);
+ DBUG_ASSERT(col_map[c] + DATA_N_SYS_COLS
+ < n_cols);
+ base = &cols[col_map[c]];
+ }
+ }
+ }
+
+ dict_index_t* index = dict_table_get_first_index(this);
+ bool metadata_changed;
+ {
+ const dict_index_t& i = *dict_table_get_first_index(&table);
+ metadata_changed = i.n_fields > index->n_fields;
+ ut_ad(i.n_fields >= index->n_fields);
+ index->instant_add_field(i);
+ }
+
+ if (instant || table.instant) {
+ const auto old_instant = instant;
+ /* FIXME: add instant->heap, and transfer ownership here */
+ if (!instant) {
+ instant = new (mem_heap_zalloc(heap, sizeof *instant))
+ dict_instant_t();
+ goto dup_dropped;
+ } else if (n_dropped() < table.n_dropped()) {
+dup_dropped:
+ instant->dropped = static_cast<dict_col_t*>(
+ mem_heap_dup(heap, table.instant->dropped,
+ table.instant->n_dropped
+ * sizeof *instant->dropped));
+ instant->n_dropped = table.instant->n_dropped;
+ } else if (table.instant->n_dropped) {
+ memcpy(instant->dropped, table.instant->dropped,
+ table.instant->n_dropped
+ * sizeof *instant->dropped);
+ }
+
+ const field_map_element_t* field_map = old_instant
+ ? old_instant->field_map : NULL;
+
+ init_instant(table);
+
+ if (!metadata_changed) {
+ metadata_changed = !field_map
+ || memcmp(field_map,
+ instant->field_map,
+ (index->n_fields
+ - index->first_user_field())
+ * sizeof *field_map);
+ }
+ }
+
+ while ((index = dict_table_get_next_index(index)) != NULL) {
+ if (index->to_be_dropped) {
+ continue;
+ }
+ for (unsigned i = 0; i < index->n_fields; i++) {
+ dict_field_t& f = index->fields[i];
+ if (f.col >= table.cols
+ && f.col < table.cols + table.n_cols) {
+ /* This is an instantly added column
+ in a newly added index. */
+ DBUG_ASSERT(!f.col->is_virtual());
+ size_t c = f.col - table.cols;
+ DBUG_ASSERT(f.col == &table.cols[c]);
+ f.col = &cols[c];
+ } else if (f.col >= &table.v_cols->m_col
+ && f.col < &table.v_cols[n_v_cols].m_col) {
+ /* This is an instantly added virtual column
+ in a newly added index. */
+ DBUG_ASSERT(f.col->is_virtual());
+ size_t c = reinterpret_cast<dict_v_col_t*>(
+ f.col) - table.v_cols;
+ DBUG_ASSERT(f.col == &table.v_cols[c].m_col);
+ f.col = &v_cols[c].m_col;
+ } else if (f.col < old_cols
+ || f.col >= old_cols + n_cols) {
+ DBUG_ASSERT(f.col->is_virtual());
+ f.col = &v_cols[col_map[
+ reinterpret_cast<dict_v_col_t*>(
+ f.col)
+ - old_v_cols + n_cols]].m_col;
+ } else {
+ f.col = &cols[col_map[f.col - old_cols]];
+ DBUG_ASSERT(!f.col->is_virtual());
+ }
+ f.name = f.col->name(*this);
+ if (f.col->is_virtual()) {
+ dict_v_col_t* v_col = reinterpret_cast
+ <dict_v_col_t*>(f.col);
+ v_col->v_indexes.push_front(
+ dict_v_idx_t(index, i));
+ }
+ }
+ }
+
+ n_cols = table.n_cols;
+ n_v_cols = table.n_v_cols;
+ return metadata_changed;
+}
+
+/** Find the old column number for the given new column position.
+@param[in] col_map column map from old column to new column
+@param[in] pos new column position
+@param[in] n number of columns present in the column map
+@return old column position for the given new column position. */
+static ulint find_old_col_no(const ulint* col_map, ulint pos, ulint n)
+{
+ do {
+ ut_ad(n);
+ } while (col_map[--n] != pos);
+ return n;
+}
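+
+/* Example: with col_map = {2, 0, 1} (old column 1 moved to position 0),
+find_old_col_no(col_map, 0, 3) returns 1. */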
+
+/** Roll back instant_column().
+@param[in] old_n_cols original n_cols
+@param[in] old_cols original cols
+@param[in] old_col_names original col_names
+@param[in] old_instant original instant structure
+@param[in] old_fields original fields
+@param[in] old_n_fields original number of fields
+@param[in] old_n_core_fields original number of core fields
+@param[in] old_n_v_cols original n_v_cols
+@param[in] old_v_cols original v_cols
+@param[in] old_v_col_names original v_col_names
+@param[in] col_map column map */
+inline void dict_table_t::rollback_instant(
+ unsigned old_n_cols,
+ dict_col_t* old_cols,
+ const char* old_col_names,
+ dict_instant_t* old_instant,
+ dict_field_t* old_fields,
+ unsigned old_n_fields,
+ unsigned old_n_core_fields,
+ unsigned old_n_v_cols,
+ dict_v_col_t* old_v_cols,
+ const char* old_v_col_names,
+ const ulint* col_map)
+{
+ ut_ad(dict_sys.locked());
+
+ if (cols == old_cols) {
+ /* The ALTER TABLE failed before the instant operation
+ took place, so there is nothing to roll back. */
+ return;
+ }
+
+ dict_index_t* index = indexes.start;
+ /* index->is_instant() does not necessarily hold here, because
+ the table may have been emptied */
+ DBUG_ASSERT(old_n_cols >= DATA_N_SYS_COLS);
+ DBUG_ASSERT(n_cols == n_def);
+ DBUG_ASSERT(index->n_def == index->n_fields);
+ DBUG_ASSERT(index->n_core_fields <= index->n_fields);
+ DBUG_ASSERT(old_n_core_fields <= old_n_fields);
+ DBUG_ASSERT(instant || !old_instant);
+
+ instant = old_instant;
+
+ index->n_nullable = 0;
+
+ for (unsigned i = old_n_fields; i--; ) {
+ if (old_fields[i].col->is_nullable()) {
+ index->n_nullable++;
+ }
+ }
+
+ for (unsigned i = n_v_cols; i--; ) {
+ v_cols[i].~dict_v_col_t();
+ }
+
+ index->n_core_fields = ((index->n_fields == index->n_core_fields)
+ ? old_n_fields
+ : old_n_core_fields)
+ & dict_index_t::MAX_N_FIELDS;
+ index->n_def = index->n_fields = old_n_fields
+ & dict_index_t::MAX_N_FIELDS;
+ index->n_core_null_bytes = static_cast<uint8_t>(
+ UT_BITS_IN_BYTES(index->get_n_nullable(index->n_core_fields)));
+
+ const dict_col_t* const new_cols = cols;
+ const dict_col_t* const new_cols_end __attribute__((unused)) = cols + n_cols;
+ const dict_v_col_t* const new_v_cols = v_cols;
+ const dict_v_col_t* const new_v_cols_end __attribute__((unused))= v_cols + n_v_cols;
+
+ cols = old_cols;
+ col_names = old_col_names;
+ v_cols = old_v_cols;
+ v_col_names = old_v_col_names;
+ n_def = n_cols = old_n_cols & dict_index_t::MAX_N_FIELDS;
+ n_v_def = n_v_cols = old_n_v_cols & dict_index_t::MAX_N_FIELDS;
+ n_t_def = n_t_cols = (n_cols + n_v_cols) & dict_index_t::MAX_N_FIELDS;
+
+ if (versioned()) {
+ for (unsigned i = 0; i < n_cols; ++i) {
+ if (cols[i].vers_sys_start()) {
+ vers_start = i & dict_index_t::MAX_N_FIELDS;
+ } else if (cols[i].vers_sys_end()) {
+ vers_end = i & dict_index_t::MAX_N_FIELDS;
+ }
+ }
+ }
+
+ index->fields = old_fields;
+
+ while ((index = dict_table_get_next_index(index)) != NULL) {
+ if (index->to_be_dropped) {
+ /* instant_column() did not adjust these indexes. */
+ continue;
+ }
+
+ for (unsigned i = 0; i < index->n_fields; i++) {
+ dict_field_t& f = index->fields[i];
+ if (f.col->is_virtual()) {
+ DBUG_ASSERT(f.col >= &new_v_cols->m_col);
+ DBUG_ASSERT(f.col < &new_v_cols_end->m_col);
+ size_t n = size_t(
+ reinterpret_cast<dict_v_col_t*>(f.col)
+ - new_v_cols);
+ DBUG_ASSERT(n <= n_v_cols);
+
+ ulint old_col_no = find_old_col_no(
+ col_map + n_cols, n, n_v_cols);
+ DBUG_ASSERT(old_col_no <= n_v_cols);
+ f.col = &v_cols[old_col_no].m_col;
+ DBUG_ASSERT(f.col->is_virtual());
+ } else {
+ DBUG_ASSERT(f.col >= new_cols);
+ DBUG_ASSERT(f.col < new_cols_end);
+ size_t n = size_t(f.col - new_cols);
+ DBUG_ASSERT(n <= n_cols);
+
+ ulint old_col_no = find_old_col_no(col_map,
+ n, n_cols);
+ DBUG_ASSERT(old_col_no < n_cols);
+ f.col = &cols[old_col_no];
+ DBUG_ASSERT(!f.col->is_virtual());
+ }
+ f.name = f.col->name(*this);
+ }
+ }
+}
+
+/* Report an InnoDB error to the client by invoking my_error(). */
+static ATTRIBUTE_COLD __attribute__((nonnull))
+void
+my_error_innodb(
+/*============*/
+ dberr_t error, /*!< in: InnoDB error code */
+ const char* table, /*!< in: table name */
+ ulint flags) /*!< in: table flags */
+{
+ switch (error) {
+ case DB_MISSING_HISTORY:
+ my_error(ER_TABLE_DEF_CHANGED, MYF(0));
+ break;
+ case DB_RECORD_NOT_FOUND:
+ my_error(ER_KEY_NOT_FOUND, MYF(0), table);
+ break;
+ case DB_DEADLOCK:
+ my_error(ER_LOCK_DEADLOCK, MYF(0));
+ break;
+ case DB_LOCK_WAIT_TIMEOUT:
+ my_error(ER_LOCK_WAIT_TIMEOUT, MYF(0));
+ break;
+ case DB_INTERRUPTED:
+ my_error(ER_QUERY_INTERRUPTED, MYF(0));
+ break;
+ case DB_OUT_OF_MEMORY:
+ my_error(ER_OUT_OF_RESOURCES, MYF(0));
+ break;
+ case DB_OUT_OF_FILE_SPACE:
+ my_error(ER_RECORD_FILE_FULL, MYF(0), table);
+ break;
+ case DB_TEMP_FILE_WRITE_FAIL:
+ my_error(ER_TEMP_FILE_WRITE_FAILURE, MYF(0));
+ break;
+ case DB_TOO_BIG_INDEX_COL:
+ my_error(ER_INDEX_COLUMN_TOO_LONG, MYF(0),
+ (ulong) DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags));
+ break;
+ case DB_TOO_MANY_CONCURRENT_TRXS:
+ my_error(ER_TOO_MANY_CONCURRENT_TRXS, MYF(0));
+ break;
+ case DB_LOCK_TABLE_FULL:
+ my_error(ER_LOCK_TABLE_FULL, MYF(0));
+ break;
+ case DB_UNDO_RECORD_TOO_BIG:
+ my_error(ER_UNDO_RECORD_TOO_BIG, MYF(0));
+ break;
+ case DB_CORRUPTION:
+ my_error(ER_NOT_KEYFILE, MYF(0), table);
+ break;
+ case DB_TOO_BIG_RECORD: {
+ /* Note that in page0zip.ic page_zip_rec_needs_ext() rec_size
+ is limited to COMPRESSED_REC_MAX_DATA_SIZE (16K) or
+ REDUNDANT_REC_MAX_DATA_SIZE (16K-1). */
+ bool comp = !!(flags & DICT_TF_COMPACT);
+ ulint free_space = page_get_free_space_of_empty(comp) / 2;
+
+ if (free_space >= ulint(comp ? COMPRESSED_REC_MAX_DATA_SIZE :
+ REDUNDANT_REC_MAX_DATA_SIZE)) {
+ free_space = (comp ? COMPRESSED_REC_MAX_DATA_SIZE :
+ REDUNDANT_REC_MAX_DATA_SIZE) - 1;
+ }
+
+ my_error(ER_TOO_BIG_ROWSIZE, MYF(0), free_space);
+ break;
+ }
+ case DB_INVALID_NULL:
+ /* TODO: report the row, as we do for DB_DUPLICATE_KEY */
+ my_error(ER_INVALID_USE_OF_NULL, MYF(0));
+ break;
+ case DB_CANT_CREATE_GEOMETRY_OBJECT:
+ my_error(ER_CANT_CREATE_GEOMETRY_OBJECT, MYF(0));
+ break;
+ case DB_TABLESPACE_EXISTS:
+ my_error(ER_TABLESPACE_EXISTS, MYF(0), table);
+ break;
+
+#ifdef UNIV_DEBUG
+ case DB_SUCCESS:
+ case DB_DUPLICATE_KEY:
+ case DB_ONLINE_LOG_TOO_BIG:
+ /* These codes should not be passed here. */
+ ut_error;
+#endif /* UNIV_DEBUG */
+ default:
+ my_error(ER_GET_ERRNO, MYF(0), error, "InnoDB");
+ break;
+ }
+}
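+
+/* DB_SUCCESS, DB_DUPLICATE_KEY and DB_ONLINE_LOG_TOO_BIG are not mapped
+above (debug builds assert on them); callers report them together with the
+name of the offending key, see ha_innobase_inplace_ctx::log_failure()
+below. */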
+
+/** Get the name of an erroneous key.
+@param[in] error_key_num InnoDB number of the erroneous key
+@param[in] ha_alter_info changes that were being performed
+@param[in] table InnoDB table
+@return the name of the erroneous key */
+static
+const char*
+get_error_key_name(
+ ulint error_key_num,
+ const Alter_inplace_info* ha_alter_info,
+ const dict_table_t* table)
+{
+ if (error_key_num == ULINT_UNDEFINED) {
+ return(FTS_DOC_ID_INDEX_NAME);
+ } else if (ha_alter_info->key_count == 0) {
+ return(dict_table_get_first_index(table)->name);
+ } else {
+ return(ha_alter_info->key_info_buffer[error_key_num].name.str);
+ }
+}
+
+/** Convert field type and length to InnoDB format */
+static void get_type(const Field &f, uint &prtype, uint8_t &mtype,
+ uint16_t &len)
+{
+ mtype= get_innobase_type_from_mysql_type(&prtype, &f);
+ len= static_cast<uint16_t>(f.pack_length());
+ prtype|= f.type();
+ if (f.type() == MYSQL_TYPE_VARCHAR)
+ {
+ auto l= static_cast<const Field_varstring&>(f).length_bytes;
+ len= static_cast<uint16_t>(len - l);
+ if (l == 2)
+ prtype|= DATA_LONG_TRUE_VARCHAR;
+ }
+ if (!f.real_maybe_null())
+ prtype |= DATA_NOT_NULL;
+ if (f.binary())
+ prtype |= DATA_BINARY_TYPE;
+ if (f.table->versioned())
+ {
+ if (&f == f.table->field[f.table->s->vers.start_fieldno])
+ prtype|= DATA_VERS_START;
+ else if (&f == f.table->field[f.table->s->vers.end_fieldno])
+ prtype|= DATA_VERS_END;
+ else if (!(f.flags & VERS_UPDATE_UNVERSIONED_FLAG))
+ prtype|= DATA_VERSIONED;
+ }
+
+ if (!f.stored_in_db())
+ prtype|= DATA_VIRTUAL;
+
+ if (dtype_is_string_type(mtype))
+ prtype|= f.charset()->number << 16;
+}
+
+struct ha_innobase_inplace_ctx : public inplace_alter_handler_ctx
+{
+ /** Dummy query graph */
+ que_thr_t*const thr;
+ /** The prebuilt struct of the creating instance */
+ row_prebuilt_t*& prebuilt;
+ /** InnoDB indexes being created */
+ dict_index_t** add_index;
+ /** MySQL key numbers for the InnoDB indexes that are being created */
+ const ulint* add_key_numbers;
+ /** number of InnoDB indexes being created */
+ ulint num_to_add_index;
+ /** InnoDB indexes being dropped */
+ dict_index_t** drop_index;
+ /** number of InnoDB indexes being dropped */
+ const ulint num_to_drop_index;
+ /** InnoDB foreign key constraints being dropped */
+ dict_foreign_t** drop_fk;
+ /** number of InnoDB foreign key constraints being dropped */
+ const ulint num_to_drop_fk;
+ /** InnoDB foreign key constraints being added */
+ dict_foreign_t** add_fk;
+ /** number of InnoDB foreign key constraints being added */
+ const ulint num_to_add_fk;
+ /** whether to create the indexes online */
+ const bool online;
+ /** memory heap */
+ mem_heap_t* const heap;
+ /** dictionary transaction */
+ trx_t* trx;
+ /** original table (if rebuilt, differs from indexed_table) */
+ dict_table_t* old_table;
+ /** table where the indexes are being created or dropped */
+ dict_table_t* new_table;
+ /** table definition for instant ADD/DROP/reorder COLUMN */
+ dict_table_t* instant_table;
+ /** mapping of old column numbers to new ones, or NULL */
+ const ulint* col_map;
+ /** new column names, or NULL if nothing was renamed */
+ const char** col_names;
+ /** added AUTO_INCREMENT column position, or ULINT_UNDEFINED */
+ const ulint add_autoinc;
+ /** default values of ADD and CHANGE COLUMN, or NULL */
+ const dtuple_t* defaults;
+ /** autoinc sequence to use */
+ ib_sequence_t sequence;
+ /** temporary table name to use for old table when renaming tables */
+ const char* tmp_name;
+ /** whether the order of the clustered index is unchanged */
+ bool skip_pk_sort;
+ /** number of virtual columns to be added */
+ unsigned num_to_add_vcol;
+ /** virtual columns to be added */
+ dict_v_col_t* add_vcol;
+ const char** add_vcol_name;
+ /** number of virtual columns to be dropped */
+ unsigned num_to_drop_vcol;
+ /** virtual columns to be dropped */
+ dict_v_col_t* drop_vcol;
+ const char** drop_vcol_name;
+ /** ALTER TABLE stage progress recorder */
+ ut_stage_alter_t* m_stage;
+ /** original number of user columns in the table */
+ const unsigned old_n_cols;
+ /** original columns of the table */
+ dict_col_t* const old_cols;
+ /** original column names of the table */
+ const char* const old_col_names;
+ /** original instantly dropped or reordered columns */
+ dict_instant_t* const old_instant;
+ /** original index fields */
+ dict_field_t* const old_fields;
+ /** size of old_fields */
+ const unsigned old_n_fields;
+ /** original old_table->n_core_fields */
+ const unsigned old_n_core_fields;
+ /** original number of virtual columns in the table */
+ const unsigned old_n_v_cols;
+ /** original virtual columns of the table */
+ dict_v_col_t* const old_v_cols;
+ /** original virtual column names of the table */
+ const char* const old_v_col_names;
+ /** 0, or 1 + first column whose position changes in instant ALTER */
+ unsigned first_alter_pos;
+ /** Allow non-null conversion.
+ (1) ALTER IGNORE should allow the conversion
+ irrespective of the SQL mode.
+ (2) Don't allow the conversion in strict mode.
+ (3) Allow the conversion only in non-strict mode. */
+ const bool allow_not_null;
+
+ /** The page_compression_level attribute, or 0 */
+ const uint page_compression_level;
+
+ /** Indexed columns whose charset-collation is changing
+ in a way that does not require the table to be rebuilt */
+ col_collations change_col_collate;
+
+ ha_innobase_inplace_ctx(row_prebuilt_t*& prebuilt_arg,
+ dict_index_t** drop_arg,
+ ulint num_to_drop_arg,
+ dict_foreign_t** drop_fk_arg,
+ ulint num_to_drop_fk_arg,
+ dict_foreign_t** add_fk_arg,
+ ulint num_to_add_fk_arg,
+ bool online_arg,
+ mem_heap_t* heap_arg,
+ dict_table_t* new_table_arg,
+ const char** col_names_arg,
+ ulint add_autoinc_arg,
+ ulonglong autoinc_col_min_value_arg,
+ ulonglong autoinc_col_max_value_arg,
+ bool allow_not_null_flag,
+ bool page_compressed,
+ ulonglong page_compression_level_arg) :
+ inplace_alter_handler_ctx(),
+ thr (pars_complete_graph_for_exec(nullptr, prebuilt_arg->trx,
+ heap_arg, prebuilt_arg)),
+ prebuilt (prebuilt_arg),
+ add_index (0), add_key_numbers (0), num_to_add_index (0),
+ drop_index (drop_arg), num_to_drop_index (num_to_drop_arg),
+ drop_fk (drop_fk_arg), num_to_drop_fk (num_to_drop_fk_arg),
+ add_fk (add_fk_arg), num_to_add_fk (num_to_add_fk_arg),
+ online (online_arg), heap (heap_arg),
+ trx (innobase_trx_allocate(prebuilt_arg->trx->mysql_thd)),
+ old_table (prebuilt_arg->table),
+ new_table (new_table_arg), instant_table (0),
+ col_map (0), col_names (col_names_arg),
+ add_autoinc (add_autoinc_arg),
+ defaults (0),
+ sequence(prebuilt->trx->mysql_thd,
+ autoinc_col_min_value_arg, autoinc_col_max_value_arg),
+ tmp_name (0),
+ skip_pk_sort(false),
+ num_to_add_vcol(0),
+ add_vcol(0),
+ add_vcol_name(0),
+ num_to_drop_vcol(0),
+ drop_vcol(0),
+ drop_vcol_name(0),
+ m_stage(NULL),
+ old_n_cols(prebuilt_arg->table->n_cols),
+ old_cols(prebuilt_arg->table->cols),
+ old_col_names(prebuilt_arg->table->col_names),
+ old_instant(prebuilt_arg->table->instant),
+ old_fields(prebuilt_arg->table->indexes.start->fields),
+ old_n_fields(prebuilt_arg->table->indexes.start->n_fields),
+ old_n_core_fields(prebuilt_arg->table->indexes.start
+ ->n_core_fields),
+ old_n_v_cols(prebuilt_arg->table->n_v_cols),
+ old_v_cols(prebuilt_arg->table->v_cols),
+ old_v_col_names(prebuilt_arg->table->v_col_names),
+ first_alter_pos(0),
+ allow_not_null(allow_not_null_flag),
+ page_compression_level(page_compressed
+ ? (page_compression_level_arg
+ ? uint(page_compression_level_arg)
+ : page_zip_level)
+ : 0)
+ {
+ ut_ad(old_n_cols >= DATA_N_SYS_COLS);
+ ut_ad(page_compression_level <= 9);
+#ifdef UNIV_DEBUG
+ for (ulint i = 0; i < num_to_add_index; i++) {
+ ut_ad(!add_index[i]->to_be_dropped);
+ }
+ for (ulint i = 0; i < num_to_drop_index; i++) {
+ ut_ad(drop_index[i]->to_be_dropped);
+ }
+#endif /* UNIV_DEBUG */
+
+ trx_start_for_ddl(trx);
+ }
+
+ ~ha_innobase_inplace_ctx()
+ {
+ UT_DELETE(m_stage);
+ if (instant_table) {
+ ut_ad(!instant_table->id);
+ while (dict_index_t* index
+ = UT_LIST_GET_LAST(instant_table->indexes)) {
+ UT_LIST_REMOVE(instant_table->indexes, index);
+ index->lock.free();
+ dict_mem_index_free(index);
+ }
+ for (unsigned i = old_n_v_cols; i--; ) {
+ old_v_cols[i].~dict_v_col_t();
+ }
+ if (instant_table->fts) {
+ instant_table->fts->~fts_t();
+ instant_table->fts = nullptr;
+ }
+ dict_mem_table_free(instant_table);
+ }
+ mem_heap_free(heap);
+ }
+
+ /** Determine if the table will be rebuilt.
+ @return whether the table will be rebuilt */
+ bool need_rebuild () const { return(old_table != new_table); }
+
+ /** Convert table-rebuilding ALTER to instant ALTER. */
+ void prepare_instant()
+ {
+ DBUG_ASSERT(need_rebuild());
+ DBUG_ASSERT(!is_instant());
+ DBUG_ASSERT(old_table->n_cols == old_n_cols);
+
+ instant_table = new_table;
+ new_table = old_table;
+ export_vars.innodb_instant_alter_column++;
+
+ instant_table->prepare_instant(*old_table, col_map,
+ first_alter_pos);
+ }
+
+ /** Adjust table metadata for instant ADD/DROP/reorder COLUMN.
+ @return whether the metadata record must be updated */
+ bool instant_column()
+ {
+ DBUG_ASSERT(is_instant());
+ DBUG_ASSERT(old_n_fields
+ == old_table->indexes.start->n_fields);
+ return old_table->instant_column(*instant_table, col_map);
+ }
+
+ /** Revert prepare_instant() if the transaction is rolled back. */
+ void rollback_instant()
+ {
+ if (!is_instant()) return;
+ old_table->rollback_instant(old_n_cols,
+ old_cols, old_col_names,
+ old_instant,
+ old_fields, old_n_fields,
+ old_n_core_fields,
+ old_n_v_cols, old_v_cols,
+ old_v_col_names,
+ col_map);
+ }
+
+ /** @return whether this is instant ALTER TABLE */
+ bool is_instant() const
+ {
+ DBUG_ASSERT(!instant_table || !instant_table->can_be_evicted);
+ return instant_table;
+ }
+
+ /** Create an index table where indexes are ordered as follows:
+
+ IF a new primary key is defined for the table THEN
+
+ 1) New primary key
+ 2) The remaining keys in key_info
+
+ ELSE
+
+ 1) All new indexes in the order they arrive from MySQL
+
+ ENDIF
+
+ @return key definitions */
+ MY_ATTRIBUTE((nonnull, warn_unused_result, malloc))
+ inline index_def_t*
+ create_key_defs(
+ const Alter_inplace_info* ha_alter_info,
+ /*!< in: alter operation */
+ const TABLE* altered_table,
+ /*!< in: MySQL table that is being altered */
+ ulint& n_fts_add,
+ /*!< out: number of FTS indexes to be created */
+ ulint& fts_doc_id_col,
+ /*!< in: The column number for Doc ID */
+ bool& add_fts_doc_id,
+ /*!< in: whether we need to add new DOC ID
+ column for FTS index */
+ bool& add_fts_doc_idx,
+ /*!< in: whether we need to add new DOC ID
+ index for FTS index */
+ const TABLE* table);
+ /*!< in: MySQL table that is being altered */
+
+ /** Share context between partitions.
+ @param[in] ctx context from another partition of the table */
+ void set_shared_data(const inplace_alter_handler_ctx& ctx)
+ {
+ if (add_autoinc != ULINT_UNDEFINED) {
+ const ha_innobase_inplace_ctx& ha_ctx =
+ static_cast<const ha_innobase_inplace_ctx&>
+ (ctx);
+ /* When adding an AUTO_INCREMENT column to a
+ partitioned InnoDB table, we must share the
+ sequence for all partitions. */
+ ut_ad(ha_ctx.add_autoinc == add_autoinc);
+ ut_ad(ha_ctx.sequence.last());
+ sequence = ha_ctx.sequence;
+ }
+ }
+
+ /** @return whether the given column is being added */
+ bool is_new_vcol(const dict_v_col_t &v_col) const
+ {
+ for (ulint i= 0; i < num_to_add_vcol; i++)
+ if (&add_vcol[i] == &v_col)
+ return true;
+ return false;
+ }
+
+ /** During rollback, make newly added indexes point to
+ newly added virtual columns. */
+ void clean_new_vcol_index()
+ {
+ ut_ad(old_table == new_table);
+ const dict_index_t *index= dict_table_get_first_index(old_table);
+ while ((index= dict_table_get_next_index(index)) != NULL)
+ {
+ if (!index->has_virtual() || index->is_committed())
+ continue;
+ ulint n_drop_new_vcol= index->get_new_n_vcol();
+ for (ulint i= 0; n_drop_new_vcol && i < index->n_fields; i++)
+ {
+ dict_col_t *col= index->fields[i].col;
+ /* Skip the non-virtual and old virtual columns */
+ if (!col->is_virtual())
+ continue;
+ dict_v_col_t *vcol= reinterpret_cast<dict_v_col_t*>(col);
+ if (!is_new_vcol(*vcol))
+ continue;
+
+ index->fields[i].col= &index->new_vcol_info->
+ add_drop_v_col(index->heap, vcol, --n_drop_new_vcol)->m_col;
+ }
+ }
+ }
+
+ /** @return whether a FULLTEXT INDEX is being added */
+ bool adding_fulltext_index() const
+ {
+ for (ulint a= 0; a < num_to_add_index; a++)
+ if (add_index[a]->type & DICT_FTS)
+ return true;
+ return false;
+ }
+
+ /** Handle the apply log failure for online DDL operation.
+ @param ha_alter_info handler alter inplace info
+ @param altered_table MySQL table that is being altered
+ @param error error code
+ @retval false if the error value is DB_SUCCESS
+ @retval true in case of error */
+ bool log_failure(Alter_inplace_info *ha_alter_info,
+ TABLE *altered_table, dberr_t error)
+ {
+ ulint err_key= thr_get_trx(thr)->error_key_num;
+ switch (error) {
+ KEY *dup_key;
+ case DB_SUCCESS:
+ return false;
+ case DB_DUPLICATE_KEY:
+ if (err_key == ULINT_UNDEFINED)
+ /* This should be the hidden index on FTS_DOC_ID */
+ dup_key= nullptr;
+ else
+ {
+ DBUG_ASSERT(err_key < ha_alter_info->key_count);
+ dup_key= &ha_alter_info->key_info_buffer[err_key];
+ }
+ print_keydup_error(altered_table, dup_key, MYF(0));
+ break;
+ case DB_ONLINE_LOG_TOO_BIG:
+ my_error(ER_INNODB_ONLINE_LOG_TOO_BIG, MYF(0),
+ get_error_key_name(err_key, ha_alter_info, new_table));
+ break;
+ case DB_INDEX_CORRUPT:
+ my_error(ER_INDEX_CORRUPT, MYF(0),
+ get_error_key_name(err_key, ha_alter_info, new_table));
+ break;
+ default:
+ my_error_innodb(error, old_table->name.m_name, old_table->flags);
+ }
+ return true;
+ }
+
+ /** Check whether the column has any change in collation type.
+ If so, store the column information in the heap.
+ @param index index being added (or rebuilt)
+ @param altered_table altered table definition */
+ void change_col_collation(dict_index_t *index, const TABLE &altered_table)
+ {
+ ut_ad(!need_rebuild());
+ ut_ad(!index->is_primary());
+ ut_ad(!index->is_committed());
+
+ unsigned n_cols= 0;
+ for (unsigned i= 0; i < index->n_fields; i++)
+ {
+ const char *field_name= index->fields[i].name();
+ if (!field_name || !dtype_is_string_type(index->fields[i].col->mtype) ||
+ index->fields[i].col->is_virtual())
+ continue;
+ for (uint j= 0; j < altered_table.s->fields; j++)
+ {
+ const Field *altered_field= altered_table.field[j];
+
+ if (my_strcasecmp(system_charset_info, field_name,
+ altered_field->field_name.str))
+ continue;
+
+ unsigned prtype;
+ uint8_t mtype;
+ uint16_t len;
+ get_type(*altered_field, prtype, mtype, len);
+
+ if (prtype == index->fields[i].col->prtype)
+ continue;
+ auto it= change_col_collate.find(index->fields[i].col->ind);
+ if (it != change_col_collate.end())
+ {
+ n_cols++;
+ index->fields[i].col= it->second;
+ continue;
+ }
+
+ const CHARSET_INFO *cs= altered_field->charset();
+
+ dict_col_t *col=
+ static_cast<dict_col_t*>(mem_heap_alloc(heap, sizeof *col));
+ *col= *index->fields[i].col;
+ col->prtype= prtype;
+ col->mtype= mtype;
+ col->mbminlen= cs->mbminlen & 7;
+ col->mbmaxlen= cs->mbmaxlen & 7;
+ col->len= len;
+ index->fields[i].col= col;
+ n_cols++;
+ change_col_collate[col->ind]= col;
+ }
+ }
+
+ index->init_change_cols(n_cols);
+ }
+
+ void cleanup_col_collation()
+ {
+ ut_ad(old_table == new_table);
+ if (change_col_collate.empty())
+ return;
+ const dict_index_t *index= dict_table_get_first_index(old_table);
+ while ((index= dict_table_get_next_index(index)) != nullptr)
+ {
+ if (index->is_committed())
+ continue;
+ auto collate_end= change_col_collate.end();
+ for (unsigned i= 0, j= 0; i < index->n_fields; i++)
+ {
+ const dict_col_t *col= index->fields[i].col;
+ auto it= change_col_collate.find(col->ind);
+ if (it != collate_end)
+ {
+ ut_ad(it->second == col);
+ index->fields[i].col=
+ index->change_col_info->add(index->heap, *col, j++);
+ }
+ }
+ }
+ }
+};
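+
+/* Lifecycle sketch (per the generic in-place ALTER handler API, not a
+contract specific to this file): the context above is created in
+prepare_inplace_alter_table() and stored in ha_alter_info->handler_ctx;
+the server hands it back to inplace_alter_table() and
+commit_inplace_alter_table(), and for partitioned tables set_shared_data()
+propagates shared state such as the AUTO_INCREMENT sequence between the
+per-partition contexts. */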
+
+/********************************************************************//**
+Get the upper limit of the MySQL integral and floating-point type.
+@return maximum allowed value for the field */
+ulonglong innobase_get_int_col_max_value(const Field *field);
+
+/** Determine if fulltext indexes exist in a given table.
+@param table MySQL table
+@return number of fulltext indexes */
+static uint innobase_fulltext_exist(const TABLE* table)
+{
+ uint count = 0;
+
+ for (uint i = 0; i < table->s->keys; i++) {
+ if (table->key_info[i].flags & HA_FULLTEXT) {
+ count++;
+ }
+ }
+
+ return count;
+}
+
+/** Determine whether indexed virtual columns exist in a table.
+@param[in] table table definition
+@return whether indexes exist on virtual columns */
+static bool innobase_indexed_virtual_exist(const TABLE* table)
+{
+ const KEY* const end = &table->key_info[table->s->keys];
+
+ for (const KEY* key = table->key_info; key < end; key++) {
+ const KEY_PART_INFO* const key_part_end = key->key_part
+ + key->user_defined_key_parts;
+ for (const KEY_PART_INFO* key_part = key->key_part;
+ key_part < key_part_end; key_part++) {
+ if (!key_part->field->stored_in_db())
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/** Determine if spatial indexes exist in a given table.
+@param table MySQL table
+@return whether spatial indexes exist on the table */
+static
+bool
+innobase_spatial_exist(
+/*===================*/
+ const TABLE* table)
+{
+ for (uint i = 0; i < table->s->keys; i++) {
+ if (table->key_info[i].flags & HA_SPATIAL) {
+ return(true);
+ }
+ }
+
+ return(false);
+}
+
+/** Determine if ALTER_OPTIONS requires rebuilding the table.
+@param[in] ha_alter_info the ALTER TABLE operation
+@param[in] table metadata before ALTER TABLE
+@return whether it is mandatory to rebuild the table */
+static bool alter_options_need_rebuild(
+ const Alter_inplace_info* ha_alter_info,
+ const TABLE* table)
+{
+ DBUG_ASSERT(ha_alter_info->handler_flags & ALTER_OPTIONS);
+
+ if (ha_alter_info->create_info->used_fields
+ & (HA_CREATE_USED_ROW_FORMAT
+ | HA_CREATE_USED_KEY_BLOCK_SIZE)) {
+ /* Specifying ROW_FORMAT or KEY_BLOCK_SIZE requires
+ rebuilding the table. (These attributes in the .frm
+ file may disagree with the InnoDB data dictionary, and
+ the interpretation of these attributes depends on
+ InnoDB parameters. That is why we for now always
+ require a rebuild when these attributes are specified.) */
+ return true;
+ }
+
+ const ha_table_option_struct& alt_opt=
+ *ha_alter_info->create_info->option_struct;
+ const ha_table_option_struct& opt= *table->s->option_struct;
+
+ /* Allow an instant change to enable page_compressed,
+ and any change of page_compression_level. */
+ if ((!alt_opt.page_compressed && opt.page_compressed)
+ || alt_opt.encryption != opt.encryption
+ || alt_opt.encryption_key_id != opt.encryption_key_id) {
+ return(true);
+ }
+
+ return false;
+}
+
+/** Determine if ALTER TABLE needs to rebuild the table
+(or perform instant operation).
+@param[in] ha_alter_info the ALTER TABLE operation
+@param[in] table metadata before ALTER TABLE
+@return whether it is necessary to rebuild the table or to alter columns */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+bool
+innobase_need_rebuild(
+ const Alter_inplace_info* ha_alter_info,
+ const TABLE* table)
+{
+ if ((ha_alter_info->handler_flags & ~(INNOBASE_INPLACE_IGNORE
+ | INNOBASE_ALTER_NOREBUILD
+ | INNOBASE_ALTER_INSTANT))
+ == ALTER_OPTIONS) {
+ return alter_options_need_rebuild(ha_alter_info, table);
+ }
+
+ return !!(ha_alter_info->handler_flags & INNOBASE_ALTER_REBUILD);
+}
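+
+/* Example for the check below: if the old table has virtual columns
+(v1, v2) and they remain in that relative order in the new table, the
+function returns true even when ALTER_VIRTUAL_COLUMN_ORDER was set by an
+unrelated DROP COLUMN; a genuine reorder such as (v2, v1) returns false. */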
+
+/** Check if the virtual columns in the old and new table are in the same
+order, excluding dropped columns. This is needed because when we drop a
+virtual column, ALTER_VIRTUAL_COLUMN_ORDER is also turned on, so we cannot
+decide if this is a real ORDER change or just DROP COLUMN
+@param[in] table old TABLE
+@param[in] altered_table new TABLE
+@param[in] ha_alter_info Structure describing changes to be done
+by ALTER TABLE and holding data used during in-place alter.
+@return true if all columns are in order, false otherwise. */
+static
+bool
+check_v_col_in_order(
+ const TABLE* table,
+ const TABLE* altered_table,
+ Alter_inplace_info* ha_alter_info)
+{
+ ulint j = 0;
+
+ /* We do not support adding a new virtual column before an
+ existing virtual column. */
+ if (ha_alter_info->handler_flags
+ & ALTER_ADD_VIRTUAL_COLUMN) {
+ bool has_new = false;
+
+ for (const Create_field& new_field :
+ ha_alter_info->alter_info->create_list) {
+ if (new_field.stored_in_db()) {
+ continue;
+ }
+
+ /* Found a new added virtual column. */
+ if (!new_field.field) {
+ has_new = true;
+ continue;
+ }
+
+ /* If there's any old virtual column
+ after the new added virtual column,
+ order must be changed. */
+ if (has_new) {
+ return(false);
+ }
+ }
+ }
+
+ /* Directly return true if ALTER_VIRTUAL_COLUMN_ORDER is not on */
+ if (!(ha_alter_info->handler_flags
+ & ALTER_VIRTUAL_COLUMN_ORDER)) {
+ return(true);
+ }
+
+ for (ulint i = 0; i < table->s->fields; i++) {
+ Field* field = table->field[i];
+
+ if (field->stored_in_db()) {
+ continue;
+ }
+
+ if (field->flags & FIELD_IS_DROPPED) {
+ continue;
+ }
+
+ /* Now check if the next virtual column in altered table
+ matches this column */
+ while (j < altered_table->s->fields) {
+ Field* new_field = altered_table->s->field[j];
+
+ if (new_field->stored_in_db()) {
+ j++;
+ continue;
+ }
+
+ if (my_strcasecmp(system_charset_info,
+ field->field_name.str,
+ new_field->field_name.str) != 0) {
+ /* different column */
+ return(false);
+ } else {
+ j++;
+ break;
+ }
+ }
+
+ if (j > altered_table->s->fields) {
+ /* There should not be fewer columns in the new
+ table unless they are in the drop list. */
+ ut_ad(0);
+ return(false);
+ }
+ }
+
+ return(true);
+}
+
+/** Determine if an instant operation is possible for altering columns.
+@param[in] ib_table InnoDB table definition
+@param[in] ha_alter_info the ALTER TABLE operation
+@param[in] table table definition before ALTER TABLE
+@param[in] altered_table table definition after ALTER TABLE
+@param[in] strict whether to ensure that user records fit */
+static
+bool
+instant_alter_column_possible(
+ const dict_table_t& ib_table,
+ const Alter_inplace_info* ha_alter_info,
+ const TABLE* table,
+ const TABLE* altered_table,
+ bool strict)
+{
+ const dict_index_t* const pk = ib_table.indexes.start;
+ ut_ad(pk->is_primary());
+ ut_ad(!pk->has_virtual());
+
+ if (ha_alter_info->handler_flags
+ & (ALTER_STORED_COLUMN_ORDER | ALTER_DROP_STORED_COLUMN
+ | ALTER_ADD_STORED_BASE_COLUMN)) {
+#if 1 // MDEV-17459: adjust fts_fetch_doc_from_rec() and friends; remove this
+ if (ib_table.fts || innobase_fulltext_exist(altered_table))
+ return false;
+#endif
+#if 1 // MDEV-17468: fix bugs with indexed virtual columns & remove this
+ for (const dict_index_t* index = ib_table.indexes.start;
+ index; index = index->indexes.next) {
+ if (index->has_virtual()) {
+ ut_ad(ib_table.n_v_cols
+ || index->is_corrupted());
+ return false;
+ }
+ }
+#endif
+ uint n_add = 0, n_nullable = 0, lenlen = 0;
+ const uint blob_prefix = dict_table_has_atomic_blobs(&ib_table)
+ ? 0
+ : REC_ANTELOPE_MAX_INDEX_COL_LEN;
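+ /* Rationale for min_local_len, as implied by the constants
+ used: without atomic BLOBs, an externally stored column keeps
+ a REC_ANTELOPE_MAX_INDEX_COL_LEN (768-byte) prefix plus a
+ FIELD_REF_SIZE (20-byte) reference in the record; with atomic
+ BLOBs only the reference is stored locally, and a value is
+ moved off-page once its local length would reach
+ 2 * FIELD_REF_SIZE. */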
+			? blob_prefix + FIELD_REF_SIZE
+			: 2 * FIELD_REF_SIZE;
+		size_t min_size = 0, max_size = 0;
+		Field** af = altered_table->field;
+		Field** const end = altered_table->field
+			+ altered_table->s->fields;
+		List_iterator_fast<Create_field> cf_it(
+			ha_alter_info->alter_info->create_list);
+
+		for (; af < end; af++) {
+			const Create_field* cf = cf_it++;
+			if (!(*af)->stored_in_db() || cf->field) {
+				/* Virtual or pre-existing column */
+				continue;
+			}
+			const bool nullable = (*af)->real_maybe_null();
+			const bool is_null = (*af)->is_real_null();
+			ut_ad(!is_null || nullable);
+			n_nullable += nullable;
+			n_add++;
+			uint l;
+			switch ((*af)->type()) {
+			case MYSQL_TYPE_VARCHAR:
+				l = reinterpret_cast<const Field_varstring*>
+					(*af)->get_length();
+variable_length:
+				if (l >= min_local_len) {
+					max_size += blob_prefix
+						+ FIELD_REF_SIZE;
+					if (!is_null) {
+						min_size += blob_prefix
+							+ FIELD_REF_SIZE;
+					}
+					lenlen += 2;
+				} else {
+					if (!is_null) {
+						min_size += l;
+					}
+					l = (*af)->pack_length();
+					max_size += l;
+					lenlen += l > 255 ? 2 : 1;
+				}
+				break;
+			case MYSQL_TYPE_GEOMETRY:
+			case MYSQL_TYPE_TINY_BLOB:
+			case MYSQL_TYPE_MEDIUM_BLOB:
+			case MYSQL_TYPE_BLOB:
+			case MYSQL_TYPE_LONG_BLOB:
+				l = reinterpret_cast<const Field_blob*>
+					((*af))->get_length();
+				goto variable_length;
+			default:
+				l = (*af)->pack_length();
+				if (l > 255 && ib_table.not_redundant()) {
+					goto variable_length;
+				}
+				max_size += l;
+				if (!is_null) {
+					min_size += l;
+				}
+			}
+		}
+
+		ulint n_fields = pk->n_fields + n_add;
+
+		if (n_fields >= REC_MAX_N_USER_FIELDS + DATA_N_SYS_COLS) {
+			return false;
+		}
+
+		if (pk->is_gen_clust()) {
+			min_size += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN
+				+ DATA_ROW_ID_LEN;
+			max_size += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN
+				+ DATA_ROW_ID_LEN;
+		} else {
+			min_size += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+			max_size += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+		}
+
+		uint i = pk->n_fields;
+		while (i-- > pk->n_core_fields) {
+			const dict_field_t& f = pk->fields[i];
+			if (f.col->is_nullable()) {
+				n_nullable++;
+				if (!f.col->is_dropped()
+				    && f.col->def_val.data) {
+					goto instantly_added_column;
+				}
+			} else if (f.fixed_len
+				   && (f.fixed_len <= 255
+				       || !ib_table.not_redundant())) {
+				if (ib_table.not_redundant()
+				    || !f.col->is_dropped()) {
+					min_size += f.fixed_len;
+					max_size += f.fixed_len;
+				}
+			} else if (f.col->is_dropped() || !f.col->is_added()) {
+				lenlen++;
+				goto set_max_size;
+			} else {
+instantly_added_column:
+				ut_ad(f.col->is_added());
+				if (f.col->def_val.len >= min_local_len) {
+					min_size += blob_prefix
+						+ FIELD_REF_SIZE;
+					lenlen += 2;
+				} else {
+					min_size += f.col->def_val.len;
+					lenlen += f.col->def_val.len
+						> 255 ? 2 : 1;
+				}
+set_max_size:
+				if (f.fixed_len
+				    && (f.fixed_len <= 255
+					|| !ib_table.not_redundant())) {
+					max_size += f.fixed_len;
+				} else if (f.col->len >= min_local_len) {
+					max_size += blob_prefix
+						+ FIELD_REF_SIZE;
+				} else {
+					max_size += f.col->len;
+				}
+			}
+		}
+
+		do {
+			const dict_field_t& f = pk->fields[i];
+			if (f.col->is_nullable()) {
+				n_nullable++;
+			} else if (f.fixed_len) {
+				min_size += f.fixed_len;
+			} else {
+				lenlen++;
+			}
+		} while (i--);
+
+		if (ib_table.instant
+		    || (ha_alter_info->handler_flags
+			& (ALTER_STORED_COLUMN_ORDER
+			   | ALTER_DROP_STORED_COLUMN))) {
+			n_fields++;
+			lenlen += 2;
+			min_size += FIELD_REF_SIZE;
+		}
+
+		if (ib_table.not_redundant()) {
+			min_size += REC_N_NEW_EXTRA_BYTES
+				+ UT_BITS_IN_BYTES(n_nullable)
+				+ lenlen;
+		} else {
+			min_size += (n_fields > 255 || min_size > 255)
+				? n_fields * 2 : n_fields;
+			min_size += REC_N_OLD_EXTRA_BYTES;
+		}
+
+		if (page_zip_rec_needs_ext(min_size, ib_table.not_redundant(),
+					   0, 0)) {
+			return false;
+		}
+
+		if (strict && page_zip_rec_needs_ext(max_size,
+						     ib_table.not_redundant(),
+						     0, 0)) {
+			return false;
+		}
+	}
+	// Making table system-versioned instantly is not implemented yet.
+	if (ha_alter_info->handler_flags & ALTER_ADD_SYSTEM_VERSIONING) {
+		return false;
+	}
+
+	static constexpr alter_table_operations avoid_rebuild
+		= ALTER_ADD_STORED_BASE_COLUMN
+		| ALTER_DROP_STORED_COLUMN
+		| ALTER_STORED_COLUMN_ORDER
+		| ALTER_COLUMN_NULLABLE;
+
+	if (!(ha_alter_info->handler_flags & avoid_rebuild)) {
+		alter_table_operations flags = ha_alter_info->handler_flags
+			& ~avoid_rebuild;
+		/* None of the flags are set that we can handle
+		specially to avoid rebuild. In this case, we can
+		allow ALGORITHM=INSTANT, except if some requested
+		operation requires that the table be rebuilt. */
+		if (flags & INNOBASE_ALTER_REBUILD) {
+			return false;
+		}
+		if ((flags & ALTER_OPTIONS)
+		    && alter_options_need_rebuild(ha_alter_info, table)) {
+			return false;
+		}
+	} else if (!ib_table.supports_instant()) {
+		return false;
+	}
+
+	/* At the moment, we disallow ADD [UNIQUE] INDEX together with
+	instant ADD COLUMN.
+
+	The main reason is that the work of instant ADD must be done
+	in commit_inplace_alter_table(). For the rollback_instant()
+	to work, we must add the columns to dict_table_t beforehand,
+	and roll back those changes in case the transaction is rolled
+	back.
+
+	If we added the columns to the dictionary cache already in the
+	prepare_inplace_alter_table(), we would have to deal with
+	column number mismatch in ha_innobase::open(), write_row() and
+	other functions. */
+
+	/* FIXME: allow instant ADD COLUMN together with
+	INNOBASE_ONLINE_CREATE (ADD [UNIQUE] INDEX) on pre-existing
+	columns. */
+	if (ha_alter_info->handler_flags
+	    & ((INNOBASE_ALTER_REBUILD | INNOBASE_ONLINE_CREATE)
+	       & ~ALTER_DROP_STORED_COLUMN
+	       & ~ALTER_STORED_COLUMN_ORDER
+	       & ~ALTER_ADD_STORED_BASE_COLUMN
+	       & ~ALTER_COLUMN_NULLABLE
+	       & ~ALTER_OPTIONS)) {
+		return false;
+	}
+
+	if ((ha_alter_info->handler_flags & ALTER_OPTIONS)
+	    && alter_options_need_rebuild(ha_alter_info, table)) {
+		return false;
+	}
+
+	if (ha_alter_info->handler_flags & ALTER_COLUMN_NULLABLE) {
+		if (ib_table.not_redundant()) {
+			/* Instantaneous removal of NOT NULL is
+			only supported for ROW_FORMAT=REDUNDANT. */
+			return false;
+		}
+		if (ib_table.fts_doc_id_index
+		    && !innobase_fulltext_exist(altered_table)) {
+			/* Removing hidden FTS_DOC_ID_INDEX(FTS_DOC_ID)
+			requires that the table be rebuilt. */
+			return false;
+		}
+
+		Field** af = altered_table->field;
+		Field** const end = altered_table->field
+			+ altered_table->s->fields;
+		List_iterator_fast<Create_field> cf_it(
+			ha_alter_info->alter_info->create_list);
+		for (unsigned c = 0; af < end; af++) {
+			const Create_field* cf = cf_it++;
+			if (!cf->field || !(*af)->stored_in_db()) {
+				/* Ignore virtual or newly created
+				column */
+				continue;
+			}
+
+			const dict_col_t* col = dict_table_get_nth_col(
+				&ib_table, c++);
+
+			if (!col->ord_part || col->is_nullable()
+			    || !(*af)->real_maybe_null()) {
+				continue;
+			}
+
+			/* The column would be changed from NOT NULL.
+			Ensure that it is not a clustered index key. */
+			for (auto i = pk->n_uniq; i--; ) {
+				if (pk->fields[i].col == col) {
+					return false;
+				}
+			}
+		}
+	}
+
+	return true;
+}
+
+/** Check whether the field has a non-constant DEFAULT expression.
+@param[in]	field	field which could be added or changed
+@return true if a non-constant DEFAULT is present. */
+static bool is_non_const_value(Field* field)
+{
+	return field->default_value
+		&& field->default_value->flags
+		& uint(~(VCOL_SESSION_FUNC | VCOL_TIME_FUNC));
+}
+
+/** Set the default value for the field.
+@param[in]	field	field which could be added or changed
+@return true if the default value is set. */
+static bool set_default_value(Field* field)
+{
+	/* The added/changed NOT NULL column lacks a DEFAULT value,
+	or the DEFAULT is the same for all rows.
+	(Time functions, such as CURRENT_TIMESTAMP(),
+	are evaluated from a timestamp that is assigned
+	at the start of the statement. Session
+	functions, such as USER(), always evaluate the
+	same within a statement.) */
+
+	ut_ad(!is_non_const_value(field));
+
+	/* Compute the DEFAULT values of non-constant columns
+	(VCOL_SESSION_FUNC | VCOL_TIME_FUNC). */
+	switch (field->set_default()) {
+	case 0: /* OK */
+	case 3: /* DATETIME to TIME or DATE conversion */
+		return true;
+	case -1: /* OOM, or GEOMETRY type mismatch */
+	case 1: /* A number adjusted to the min/max value */
+	case 2: /* String truncation, or conversion problem */
+		break;
+	}
+
+	return false;
+}
+
+/** Check whether the table has the FTS_DOC_ID column
+@param[in]	table		InnoDB table with fulltext index
+@param[in]	altered_table	MySQL table with fulltext index
+@param[out]	fts_doc_col_no	The column number for Doc ID,
+				or ULINT_UNDEFINED if it is of wrong type
+@param[out]	num_v		Number of virtual columns
+@param[in]	check_only	check only whether an fts doc id exists
+@return whether there exists an FTS_DOC_ID column */
+static
+bool
+innobase_fts_check_doc_id_col(
+	const dict_table_t*	table,
+	const TABLE*		altered_table,
+	ulint*			fts_doc_col_no,
+	ulint*			num_v,
+	bool			check_only=false)
+{
+	*fts_doc_col_no = ULINT_UNDEFINED;
+
+	const uint n_cols = altered_table->s->fields;
+	ulint	i;
+	int	err = 0;
+	*num_v = 0;
+
+	for (i = 0; i < n_cols; i++) {
+		const Field* field = altered_table->field[i];
+
+		if (!field->stored_in_db()) {
+			(*num_v)++;
+		}
+
+		if (my_strcasecmp(system_charset_info,
+				  field->field_name.str,
+				  FTS_DOC_ID_COL_NAME)) {
+			continue;
+		}
+
+		if (strcmp(field->field_name.str, FTS_DOC_ID_COL_NAME)) {
+			err = ER_WRONG_COLUMN_NAME;
+		} else if (field->type() != MYSQL_TYPE_LONGLONG
+			   || field->pack_length() != 8
+			   || field->real_maybe_null()
+			   || !(field->flags & UNSIGNED_FLAG)
+			   || !field->stored_in_db()) {
+			err = ER_INNODB_FT_WRONG_DOCID_COLUMN;
+		} else {
+			*fts_doc_col_no = i - *num_v;
+		}
+
+		if (err && !check_only) {
+			my_error(err, MYF(0), field->field_name.str);
+		}
+
+		return(true);
+	}
+
+	if (!table) {
+		return(false);
+	}
+
+	/* Not to count the virtual columns */
+	i -= *num_v;
+
+	for (; i + DATA_N_SYS_COLS < (uint) table->n_cols; i++) {
+		const char*	name = dict_table_get_col_name(table, i);
+
+		if (strcmp(name, FTS_DOC_ID_COL_NAME) == 0) {
+#ifdef UNIV_DEBUG
+			const dict_col_t*	col;
+
+			col = dict_table_get_nth_col(table, i);
+
+			/* Because the FTS_DOC_ID does not exist in
+			the .frm file or TABLE_SHARE, this must be the
+			internally created FTS_DOC_ID column.
*/ + ut_ad(col->mtype == DATA_INT); + ut_ad(col->len == 8); + ut_ad(col->prtype & DATA_NOT_NULL); + ut_ad(col->prtype & DATA_UNSIGNED); +#endif /* UNIV_DEBUG */ + *fts_doc_col_no = i; + return(true); + } + } + + return(false); +} + +/** Check whether the table is empty. +@param[in] table table to be checked +@param[in] ignore_delete_marked Ignore the delete marked + flag record +@return true if table is empty */ +static bool innobase_table_is_empty(const dict_table_t *table, + bool ignore_delete_marked=true) +{ + if (!table->space) + return false; + dict_index_t *clust_index= dict_table_get_first_index(table); + mtr_t mtr; + btr_pcur_t pcur; + buf_block_t *block; + page_cur_t *cur; + rec_t *rec; + bool next_page= false; + + mtr.start(); + if (pcur.open_leaf(true, clust_index, BTR_SEARCH_LEAF, &mtr) != DB_SUCCESS) + { +non_empty: + mtr.commit(); + return false; + } + rec= page_rec_get_next(btr_pcur_get_rec(&pcur)); + if (UNIV_UNLIKELY(!rec)) + goto non_empty; + if (rec_is_metadata(rec, *clust_index)) + btr_pcur_get_page_cur(&pcur)->rec= rec; +scan_leaf: + cur= btr_pcur_get_page_cur(&pcur); + if (UNIV_UNLIKELY(!page_cur_move_to_next(cur))) + goto non_empty; +next_page: + if (next_page) + { + uint32_t next_page_no= btr_page_get_next(page_cur_get_page(cur)); + if (next_page_no == FIL_NULL) + { + mtr.commit(); + return true; + } + + next_page= false; + block= btr_block_get(*clust_index, next_page_no, RW_S_LATCH, false, &mtr); + if (!block) + goto non_empty; + page_cur_set_before_first(block, cur); + if (UNIV_UNLIKELY(!page_cur_move_to_next(cur))) + goto non_empty; + const auto s= mtr.get_savepoint(); + mtr.rollback_to_savepoint(s - 2, s - 1); + } + + rec= page_cur_get_rec(cur); + if (rec_get_deleted_flag(rec, dict_table_is_comp(table))) + { + if (ignore_delete_marked) + goto scan_leaf; + goto non_empty; + } + else if (!page_rec_is_supremum(rec)) + goto non_empty; + else + { + next_page= true; + goto next_page; + } + goto scan_leaf; +} + +/** Check if InnoDB supports a particular alter table in-place +@param altered_table TABLE object for new version of table. +@param ha_alter_info Structure describing changes to be done +by ALTER TABLE and holding data used during in-place alter. + +@retval HA_ALTER_INPLACE_NOT_SUPPORTED Not supported +@retval HA_ALTER_INPLACE_INSTANT +MDL_EXCLUSIVE is needed for executing prepare_inplace_alter_table() +and commit_inplace_alter_table(). inplace_alter_table() will not be called. 
+@retval HA_ALTER_INPLACE_COPY_NO_LOCK +MDL_EXCLUSIVE in prepare_inplace_alter_table(), which can be downgraded to +LOCK=NONE for rebuilding the table in inplace_alter_table() +@retval HA_ALTER_INPLACE_COPY_LOCK +MDL_EXCLUSIVE in prepare_inplace_alter_table(), which can be downgraded to +LOCK=SHARED for rebuilding the table in inplace_alter_table() +@retval HA_ALTER_INPLACE_NOCOPY_NO_LOCK +MDL_EXCLUSIVE in prepare_inplace_alter_table(), which can be downgraded to +LOCK=NONE for inplace_alter_table() which will not rebuild the table +@retval HA_ALTER_INPLACE_NOCOPY_LOCK +MDL_EXCLUSIVE in prepare_inplace_alter_table(), which can be downgraded to +LOCK=SHARED for inplace_alter_table() which will not rebuild the table +*/ + +enum_alter_inplace_result +ha_innobase::check_if_supported_inplace_alter( + TABLE* altered_table, + Alter_inplace_info* ha_alter_info) +{ + DBUG_ENTER("check_if_supported_inplace_alter"); + + if ((ha_alter_info->handler_flags + & INNOBASE_ALTER_VERSIONED_REBUILD) + && altered_table->versioned(VERS_TIMESTAMP)) { + ha_alter_info->unsupported_reason = + "Not implemented for system-versioned timestamp tables"; + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + + /* Before 10.2.2 information about virtual columns was not stored in + system tables. We need to do a full alter to rebuild proper 10.2.2+ + metadata with the information about virtual columns */ + if (omits_virtual_cols(*table_share)) { + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + + if (altered_table->s->fields > REC_MAX_N_USER_FIELDS) { + /* Deny the inplace ALTER TABLE. MySQL will try to + re-create the table and ha_innobase::create() will + return an error too. This is how we effectively + deny adding too many columns to a table. */ + ha_alter_info->unsupported_reason = + my_get_err_msg(ER_TOO_MANY_FIELDS); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + + update_thd(); + + if (!m_prebuilt->table->space) { + ib_senderrf(m_user_thd, IB_LOG_LEVEL_WARN, + ER_TABLESPACE_DISCARDED, + table->s->table_name.str); + } + + if (is_read_only(!high_level_read_only + && (ha_alter_info->handler_flags & ALTER_OPTIONS) + && ha_alter_info->create_info->key_block_size == 0 + && ha_alter_info->create_info->row_type + != ROW_TYPE_COMPRESSED)) { + ha_alter_info->unsupported_reason = + my_get_err_msg(ER_READ_ONLY_MODE); + + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + + if (ha_alter_info->handler_flags + & ~(INNOBASE_INPLACE_IGNORE + | INNOBASE_ALTER_INSTANT + | INNOBASE_ALTER_NOREBUILD + | INNOBASE_ALTER_REBUILD + | ALTER_INDEX_IGNORABILITY)) { + + if (ha_alter_info->handler_flags + & ALTER_STORED_COLUMN_TYPE) { + ha_alter_info->unsupported_reason = my_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_COLUMN_TYPE); + } + + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + + ut_ad(dict_sys.sys_tables_exist()); + + /* Only support online add foreign key constraint when + check_foreigns is turned off */ + if ((ha_alter_info->handler_flags & ALTER_ADD_FOREIGN_KEY) + && m_prebuilt->trx->check_foreigns) { + ha_alter_info->unsupported_reason = my_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_FK_CHECK); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + + const char* reason_rebuild = NULL; + + switch (innodb_instant_alter_column_allowed) { + case 0: /* never */ + if ((ha_alter_info->handler_flags + & (ALTER_ADD_STORED_BASE_COLUMN + | ALTER_STORED_COLUMN_ORDER + | ALTER_DROP_STORED_COLUMN)) + || m_prebuilt->table->is_instant()) { + reason_rebuild = + "innodb_instant_alter_column_allowed=never"; 
+innodb_instant_alter_column_allowed_reason:
+			if (ha_alter_info->handler_flags
+			    & ALTER_RECREATE_TABLE) {
+				reason_rebuild = NULL;
+			} else {
+				ha_alter_info->handler_flags
+					|= ALTER_RECREATE_TABLE;
+				ha_alter_info->unsupported_reason
+					= reason_rebuild;
+			}
+		}
+		break;
+	case 1: /* add_last */
+		if ((ha_alter_info->handler_flags
+		     & (ALTER_STORED_COLUMN_ORDER | ALTER_DROP_STORED_COLUMN))
+		    || m_prebuilt->table->instant) {
+			reason_rebuild = "innodb_instant_alter_column_allowed="
+				"add_last";
+			goto innodb_instant_alter_column_allowed_reason;
+		}
+	}
+
+	switch (ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE) {
+	case ALTER_OPTIONS:
+		if (alter_options_need_rebuild(ha_alter_info, table)) {
+			reason_rebuild = my_get_err_msg(
+				ER_ALTER_OPERATION_TABLE_OPTIONS_NEED_REBUILD);
+			ha_alter_info->unsupported_reason = reason_rebuild;
+			break;
+		}
+		/* fall through */
+	case 0:
+		DBUG_RETURN(HA_ALTER_INPLACE_INSTANT);
+	}
+
+	/* InnoDB cannot IGNORE when creating unique indexes. IGNORE
+	should silently delete some duplicate rows. Our inplace_alter
+	code will not delete anything from existing indexes. */
+	if (ha_alter_info->ignore
+	    && (ha_alter_info->handler_flags
+		& (ALTER_ADD_PK_INDEX | ALTER_ADD_UNIQUE_INDEX))) {
+		ha_alter_info->unsupported_reason = my_get_err_msg(
+			ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_IGNORE);
+		DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+	}
+
+	/* DROP PRIMARY KEY is only allowed in combination with ADD
+	PRIMARY KEY. */
+	if ((ha_alter_info->handler_flags
+	     & (ALTER_ADD_PK_INDEX | ALTER_DROP_PK_INDEX))
+	    == ALTER_DROP_PK_INDEX) {
+		ha_alter_info->unsupported_reason = my_get_err_msg(
+			ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_NOPK);
+		DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+	}
+
+	if (ha_alter_info->handler_flags & ALTER_COLUMN_NULLABLE) {
+		/* If a NOT NULL attribute is going to be removed and
+		a UNIQUE INDEX on the column had been promoted to an
+		implicit PRIMARY KEY, the table should be rebuilt by
+		ALGORITHM=COPY. (Theoretically, we could support
+		rebuilding by ALGORITHM=INPLACE if a PRIMARY KEY is
+		going to be added, either explicitly or by promoting
+		another UNIQUE KEY.) */
+		const uint my_primary_key = altered_table->s->primary_key;
+
+		if (UNIV_UNLIKELY(my_primary_key >= MAX_KEY)
+		    && !dict_index_is_auto_gen_clust(
+			    dict_table_get_first_index(m_prebuilt->table))) {
+			ha_alter_info->unsupported_reason = my_get_err_msg(
+				ER_PRIMARY_CANT_HAVE_NULL);
+			DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+		}
+	}
+
+	/*
+	InnoDB in different MariaDB versions was generating different mtype
+	codes for certain types. In some cases the signed/unsigned bit was
+	generated differently too.
+
+	Inplace ALTER would change the mtype/unsigned_flag (to what the
+	current code generates) without changing the underlying data
+	representation, and it might result in data corruption.
+
+	Don't do inplace ALTER if mtype/unsigned_flag are wrong.
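+
+	As an illustrative sketch (not part of this function), the
+	per-column consistency test performed by the loop below looks
+	like this for a single stored column "field" that is mapped to
+	the InnoDB dictionary column "col":
+
+		unsigned	unsigned_flag;
+		auto	mtype = get_innobase_type_from_mysql_type(
+			&unsigned_flag, field);
+		if (col->mtype != mtype
+		    || (col->prtype & DATA_UNSIGNED) != unsigned_flag) {
+			// Stale type code from an older version:
+			// refuse inplace ALTER and fall back to copy.
+		}
+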
+	*/
+	for (ulint i = 0, icol= 0; i < table->s->fields; i++) {
+		const Field*		field = table->field[i];
+		const dict_col_t*	col = dict_table_get_nth_col(
+			m_prebuilt->table, icol);
+		unsigned		unsigned_flag;
+
+		if (!field->stored_in_db()) {
+			continue;
+		}
+
+		icol++;
+
+		if (col->mtype != get_innobase_type_from_mysql_type(
+			    &unsigned_flag, field)) {
+
+			DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+		}
+
+		if ((col->prtype & DATA_UNSIGNED) != unsigned_flag) {
+
+			DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+		}
+	}
+
+	ulint n_indexes = UT_LIST_GET_LEN((m_prebuilt->table)->indexes);
+
+	/* If the InnoDB dictionary and the MySQL frm file are not
+	consistent, use the "Copy" method. */
+	if (m_prebuilt->table->dict_frm_mismatch) {
+
+		ha_alter_info->unsupported_reason = my_get_err_msg(
+			ER_NO_SUCH_INDEX);
+		ib_push_frm_error(m_user_thd, m_prebuilt->table, altered_table,
+				  n_indexes, true);
+
+		DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+	}
+
+	/* The '0000-00-00' value is not allowed for the datetime datatype
+	of a newly added column when the table is not empty */
+	if (ha_alter_info->error_if_not_empty
+	    && m_prebuilt->table->space
+	    && !innobase_table_is_empty(m_prebuilt->table)) {
+		DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+	}
+
+	const bool add_drop_v_cols = !!(ha_alter_info->handler_flags
+					& (ALTER_ADD_VIRTUAL_COLUMN
+					   | ALTER_DROP_VIRTUAL_COLUMN
+					   | ALTER_VIRTUAL_COLUMN_ORDER));
+
+	/* We should be able to do the operation in-place.
+	See if we can do it online (LOCK=NONE) or without rebuild. */
+	bool online = true, need_rebuild = false;
+	const uint fulltext_indexes = innobase_fulltext_exist(altered_table);
+
+	/* Fix the key parts. */
+	for (KEY* new_key = ha_alter_info->key_info_buffer;
+	     new_key < ha_alter_info->key_info_buffer
+		     + ha_alter_info->key_count;
+	     new_key++) {
+
+		/* Do not support adding/dropping a virtual column while
+		there is a table rebuild caused by adding a new FTS_DOC_ID */
+		if ((new_key->flags & HA_FULLTEXT) && add_drop_v_cols
+		    && !DICT_TF2_FLAG_IS_SET(m_prebuilt->table,
+					     DICT_TF2_FTS_HAS_DOC_ID)) {
+			ha_alter_info->unsupported_reason =
+				MSG_UNSUPPORTED_ALTER_ONLINE_ON_VIRTUAL_COLUMN;
+			DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+		}
+
+		for (KEY_PART_INFO* key_part = new_key->key_part;
+		     key_part < (new_key->key_part
+				 + new_key->user_defined_key_parts);
+		     key_part++) {
+			DBUG_ASSERT(key_part->fieldnr
+				    < altered_table->s->fields);
+
+			const Create_field* new_field
+				= ha_alter_info->alter_info->create_list.elem(
+					key_part->fieldnr);
+
+			DBUG_ASSERT(new_field);
+
+			key_part->field = altered_table->field[
+				key_part->fieldnr];
+
+			/* In some special cases InnoDB emits "false"
+			duplicate key errors with NULL key values. Let
+			us play safe and ensure that we can correctly
+			print key values even in such cases. */
+			key_part->null_offset = key_part->field->null_offset();
+			key_part->null_bit = key_part->field->null_bit;
+
+			if (new_field->field) {
+				/* This is an existing column. */
+				continue;
+			}
+
+			/* This is an added column. */
+			DBUG_ASSERT(ha_alter_info->handler_flags
+				    & ALTER_ADD_COLUMN);
+
+			/* We cannot replace a hidden FTS_DOC_ID
+			with a user-visible FTS_DOC_ID.
*/ + if (fulltext_indexes && m_prebuilt->table->fts + && !my_strcasecmp( + system_charset_info, + key_part->field->field_name.str, + FTS_DOC_ID_COL_NAME)) { + ha_alter_info->unsupported_reason = my_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_HIDDEN_FTS); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + + DBUG_ASSERT((key_part->field->unireg_check + == Field::NEXT_NUMBER) + == !!(key_part->field->flags + & AUTO_INCREMENT_FLAG)); + + if (key_part->field->flags & AUTO_INCREMENT_FLAG) { + /* We cannot assign AUTO_INCREMENT values + during online or instant ALTER. */ + DBUG_ASSERT(key_part->field == altered_table + -> found_next_number_field); + + if (ha_alter_info->online) { + ha_alter_info->unsupported_reason = my_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_AUTOINC); + } + + online = false; + need_rebuild = true; + } + + if (!key_part->field->stored_in_db()) { + /* Do not support adding index on newly added + virtual column, while there is also a drop + virtual column in the same clause */ + if (ha_alter_info->handler_flags + & ALTER_DROP_VIRTUAL_COLUMN) { + ha_alter_info->unsupported_reason = + MSG_UNSUPPORTED_ALTER_ONLINE_ON_VIRTUAL_COLUMN; + + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + + if (ha_alter_info->online + && !ha_alter_info->unsupported_reason) { + ha_alter_info->unsupported_reason = + MSG_UNSUPPORTED_ALTER_ONLINE_ON_VIRTUAL_COLUMN; + } + + online = false; + } + } + } + + DBUG_ASSERT(!m_prebuilt->table->fts + || (m_prebuilt->table->fts->doc_col <= table->s->fields)); + + DBUG_ASSERT(!m_prebuilt->table->fts + || (m_prebuilt->table->fts->doc_col + < dict_table_get_n_user_cols(m_prebuilt->table))); + + if (fulltext_indexes && m_prebuilt->table->fts) { + /* FTS index of versioned table has row_end, need rebuild */ + if (table->versioned() != altered_table->versioned()) { + need_rebuild= true; + } + + /* FULLTEXT indexes are supposed to remain. */ + /* Disallow DROP INDEX FTS_DOC_ID_INDEX */ + + for (uint i = 0; i < ha_alter_info->index_drop_count; i++) { + if (!my_strcasecmp( + system_charset_info, + ha_alter_info->index_drop_buffer[i]->name.str, + FTS_DOC_ID_INDEX_NAME)) { + ha_alter_info->unsupported_reason = my_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_CHANGE_FTS); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + } + + /* InnoDB can have a hidden FTS_DOC_ID_INDEX on a + visible FTS_DOC_ID column as well. Prevent dropping or + renaming the FTS_DOC_ID. */ + + for (Field** fp = table->field; *fp; fp++) { + if (!((*fp)->flags + & (FIELD_IS_RENAMED | FIELD_IS_DROPPED))) { + continue; + } + + if (!my_strcasecmp( + system_charset_info, + (*fp)->field_name.str, + FTS_DOC_ID_COL_NAME)) { + ha_alter_info->unsupported_reason = my_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_CHANGE_FTS); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + } + } + + m_prebuilt->trx->will_lock = true; + + /* When changing a NULL column to NOT NULL and specifying a + DEFAULT value, ensure that the DEFAULT expression is a constant. + Also, in ADD COLUMN, for now we only support a + constant DEFAULT expression. 
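+
+			For illustration (a sketch, not a server API):
+			the classification used by is_non_const_value()
+			above treats a DEFAULT as acceptable when it is
+			absent or carries only per-statement-constant
+			flags:
+
+				static bool sketch_default_is_acceptable(
+					const Field* f)
+				{
+					if (!f->default_value) {
+						return true; // constant
+					}
+					// CURRENT_TIMESTAMP and session
+					// functions such as USER() evaluate
+					// once per statement, so they are
+					// allowed; anything else is not.
+					return !(f->default_value->flags
+						 & ~(VCOL_SESSION_FUNC
+						     | VCOL_TIME_FUNC));
+				}
+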
*/ + Field **af = altered_table->field; + bool fts_need_rebuild = false; + need_rebuild = need_rebuild + || innobase_need_rebuild(ha_alter_info, table); + + for (Create_field& cf : ha_alter_info->alter_info->create_list) { + DBUG_ASSERT(cf.field + || (ha_alter_info->handler_flags + & ALTER_ADD_COLUMN)); + + if (const Field* f = cf.field) { + /* An AUTO_INCREMENT attribute can only + be added to an existing column by ALGORITHM=COPY, + but we can remove the attribute. */ + ut_ad((*af)->unireg_check != Field::NEXT_NUMBER + || f->unireg_check == Field::NEXT_NUMBER); + if (!f->real_maybe_null() || (*af)->real_maybe_null()) + goto next_column; + /* We are changing an existing column + from NULL to NOT NULL. */ + DBUG_ASSERT(ha_alter_info->handler_flags + & ALTER_COLUMN_NOT_NULLABLE); + /* Virtual columns are never NOT NULL. */ + DBUG_ASSERT(f->stored_in_db()); + switch ((*af)->type()) { + case MYSQL_TYPE_TIMESTAMP: + case MYSQL_TYPE_TIMESTAMP2: + /* Inserting NULL into a TIMESTAMP column + would cause the DEFAULT value to be + replaced. Ensure that the DEFAULT + expression is not changing during + ALTER TABLE. */ + if (!(*af)->default_value + && (*af)->is_real_null()) { + /* No DEFAULT value is + specified. We can report + errors for any NULL values for + the TIMESTAMP. */ + goto next_column; + } + break; + default: + /* For any other data type, NULL + values are not converted. */ + goto next_column; + } + + ha_alter_info->unsupported_reason = my_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_NOT_NULL); + } else if (!is_non_const_value(*af) + && set_default_value(*af)) { + if (fulltext_indexes > 1 + && !my_strcasecmp(system_charset_info, + (*af)->field_name.str, + FTS_DOC_ID_COL_NAME)) { + /* If a hidden FTS_DOC_ID column exists + (because of FULLTEXT INDEX), it cannot + be replaced with a user-created one + except when using ALGORITHM=COPY. 
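+
+			For reference (an illustrative summary mirroring
+			the checks in innobase_fts_check_doc_id_col()
+			above), a user-supplied FTS_DOC_ID must be
+			declared exactly like the hidden column would be:
+
+				field->type() == MYSQL_TYPE_LONGLONG // BIGINT
+				&& field->pack_length() == 8
+				&& !field->real_maybe_null()	// NOT NULL
+				&& (field->flags & UNSIGNED_FLAG) // UNSIGNED
+				&& field->stored_in_db()	// not virtual
+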
*/ + ha_alter_info->unsupported_reason = + my_get_err_msg(ER_INNODB_FT_LIMIT); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + goto next_column; + } + + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + +next_column: + af++; + } + + const bool supports_instant = instant_alter_column_possible( + *m_prebuilt->table, ha_alter_info, table, altered_table, + is_innodb_strict_mode()); + if (add_drop_v_cols) { + ulonglong flags = ha_alter_info->handler_flags; + + /* TODO: uncomment the flags below, once we start to + support them */ + + flags &= ~(ALTER_ADD_VIRTUAL_COLUMN + | ALTER_DROP_VIRTUAL_COLUMN + | ALTER_VIRTUAL_COLUMN_ORDER + | ALTER_VIRTUAL_GCOL_EXPR + | ALTER_COLUMN_VCOL + /* + | ALTER_ADD_STORED_BASE_COLUMN + | ALTER_DROP_STORED_COLUMN + | ALTER_STORED_COLUMN_ORDER + | ALTER_ADD_UNIQUE_INDEX + */ + | ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX + | ALTER_DROP_NON_UNIQUE_NON_PRIM_INDEX + | ALTER_INDEX_ORDER); + if (supports_instant) { + flags &= ~(ALTER_DROP_STORED_COLUMN +#if 0 /* MDEV-17468: remove check_v_col_in_order() and fix the code */ + | ALTER_ADD_STORED_BASE_COLUMN +#endif + | ALTER_STORED_COLUMN_ORDER); + } + if (flags != 0 + || IF_PARTITIONING((altered_table->s->partition_info_str + && altered_table->s->partition_info_str_len), 0) + || (!check_v_col_in_order( + this->table, altered_table, ha_alter_info))) { + ha_alter_info->unsupported_reason = + MSG_UNSUPPORTED_ALTER_ONLINE_ON_VIRTUAL_COLUMN; + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + } + + if (supports_instant && !(ha_alter_info->handler_flags + & INNOBASE_ALTER_NOREBUILD)) { + DBUG_RETURN(HA_ALTER_INPLACE_INSTANT); + } + + if (need_rebuild + && (fulltext_indexes + || innobase_spatial_exist(altered_table) + || innobase_indexed_virtual_exist(altered_table))) { + /* If the table already contains fulltext indexes, + refuse to rebuild the table natively altogether. */ + if (fulltext_indexes > 1) { +cannot_create_many_fulltext_index: + ha_alter_info->unsupported_reason = + my_get_err_msg(ER_INNODB_FT_LIMIT); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + + if (!online || !ha_alter_info->online + || ha_alter_info->unsupported_reason != reason_rebuild) { + /* Either LOCK=NONE was not requested, or we already + gave specific reason to refuse it. */ + } else if (fulltext_indexes) { + ha_alter_info->unsupported_reason = my_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_FTS); + } else if (innobase_spatial_exist(altered_table)) { + ha_alter_info->unsupported_reason = my_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_GIS); + } else { + /* MDEV-14341 FIXME: Remove this limitation. */ + ha_alter_info->unsupported_reason = + "online rebuild with indexed virtual columns"; + } + + online = false; + } + + if (ha_alter_info->handler_flags + & ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX) { + /* ADD FULLTEXT|SPATIAL INDEX requires a lock. + + We could do ADD FULLTEXT INDEX without a lock if the + table already contains an FTS_DOC_ID column, but in + that case we would have to apply the modification log + to the full-text indexes. + + We could also do ADD SPATIAL INDEX by implementing + row_log_apply() for it. 
*/ + bool add_fulltext = false; + + for (uint i = 0; i < ha_alter_info->index_add_count; i++) { + const KEY* key = + &ha_alter_info->key_info_buffer[ + ha_alter_info->index_add_buffer[i]]; + if (key->flags & HA_FULLTEXT) { + DBUG_ASSERT(!(key->flags & HA_KEYFLAG_MASK + & ~(HA_FULLTEXT + | HA_PACK_KEY + | HA_GENERATED_KEY + | HA_BINARY_PACK_KEY))); + if (add_fulltext) { + goto cannot_create_many_fulltext_index; + } + + add_fulltext = true; + if (ha_alter_info->online + && !ha_alter_info->unsupported_reason) { + ha_alter_info->unsupported_reason = my_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_FTS); + } + + online = false; + + /* Full text search index exists, check + whether the table already has DOC ID column. + If not, InnoDB have to rebuild the table to + add a Doc ID hidden column and change + primary index. */ + ulint fts_doc_col_no; + ulint num_v = 0; + + fts_need_rebuild = + !innobase_fts_check_doc_id_col( + m_prebuilt->table, + altered_table, + &fts_doc_col_no, &num_v, true); + } + + if (online && (key->flags & HA_SPATIAL)) { + + if (ha_alter_info->online) { + ha_alter_info->unsupported_reason = my_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON_GIS); + } + + online = false; + } + } + } + + // FIXME: implement Online DDL for system-versioned operations + if (ha_alter_info->handler_flags & INNOBASE_ALTER_VERSIONED_REBUILD) { + + if (ha_alter_info->online) { + ha_alter_info->unsupported_reason = + "Not implemented for system-versioned operations"; + } + + online = false; + } + + if ((need_rebuild && !supports_instant) || fts_need_rebuild) { + ha_alter_info->handler_flags |= ALTER_RECREATE_TABLE; + DBUG_RETURN(online + ? HA_ALTER_INPLACE_COPY_NO_LOCK + : HA_ALTER_INPLACE_COPY_LOCK); + } + + if (ha_alter_info->unsupported_reason) { + } else if (ha_alter_info->handler_flags & INNOBASE_ONLINE_CREATE) { + ha_alter_info->unsupported_reason = "ADD INDEX"; + } else { + ha_alter_info->unsupported_reason = "DROP INDEX"; + } + + DBUG_RETURN(online + ? HA_ALTER_INPLACE_NOCOPY_NO_LOCK + : HA_ALTER_INPLACE_NOCOPY_LOCK); +} + +/*************************************************************//** +Initialize the dict_foreign_t structure with supplied info +@return true if added, false if duplicate foreign->id */ +static MY_ATTRIBUTE((nonnull(1,3,5,7))) +bool +innobase_init_foreign( +/*==================*/ + dict_foreign_t* foreign, /*!< in/out: structure to + initialize */ + const char* constraint_name, /*!< in/out: constraint name if + exists */ + dict_table_t* table, /*!< in: foreign table */ + dict_index_t* index, /*!< in: foreign key index */ + const char** column_names, /*!< in: foreign key column + names */ + ulint num_field, /*!< in: number of columns */ + const char* referenced_table_name, /*!< in: referenced table + name */ + dict_table_t* referenced_table, /*!< in: referenced table */ + dict_index_t* referenced_index, /*!< in: referenced index */ + const char** referenced_column_names,/*!< in: referenced column + names */ + ulint referenced_num_field) /*!< in: number of referenced + columns */ +{ + ut_ad(dict_sys.locked()); + + if (constraint_name) { + ulint db_len; + + /* Catenate 'databasename/' to the constraint name specified + by the user: we conceive the constraint as belonging to the + same MySQL 'database' as the table itself. We store the name + to foreign->id. 
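+
+	For example (an illustrative sketch only): with a table named
+	"test/child" and a user-supplied constraint name "fk1", the code
+	below produces the identifier "test/fk1":
+
+		db_len = 4;			// length of "test"
+		memcpy(id, "test/child", 4);	// id = "test"
+		id[4] = '/';			// id = "test/"
+		strcpy(id + 5, "fk1");		// id = "test/fk1"
+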
+	*/
+
+		db_len = dict_get_db_name_len(table->name.m_name);
+
+		foreign->id = static_cast<char*>(mem_heap_alloc(
+			foreign->heap, db_len + strlen(constraint_name) + 2));
+
+		memcpy(foreign->id, table->name.m_name, db_len);
+		foreign->id[db_len] = '/';
+		strcpy(foreign->id + db_len + 1, constraint_name);
+
+		/* Check if any existing foreign key has the same id;
+		this is needed only if the user supplies the constraint name */
+
+		if (table->foreign_set.find(foreign)
+		    != table->foreign_set.end()) {
+			return(false);
+		}
+	}
+
+	foreign->foreign_table = table;
+	foreign->foreign_table_name = mem_heap_strdup(
+		foreign->heap, table->name.m_name);
+	dict_mem_foreign_table_name_lookup_set(foreign, TRUE);
+
+	foreign->foreign_index = index;
+	foreign->n_fields = static_cast<unsigned>(num_field)
+		& dict_index_t::MAX_N_FIELDS;
+
+	foreign->foreign_col_names = static_cast<const char**>(
+		mem_heap_alloc(foreign->heap, num_field * sizeof(void*)));
+
+	for (ulint i = 0; i < foreign->n_fields; i++) {
+		foreign->foreign_col_names[i] = mem_heap_strdup(
+			foreign->heap, column_names[i]);
+	}
+
+	foreign->referenced_index = referenced_index;
+	foreign->referenced_table = referenced_table;
+
+	foreign->referenced_table_name = mem_heap_strdup(
+		foreign->heap, referenced_table_name);
+	dict_mem_referenced_table_name_lookup_set(foreign, TRUE);
+
+	foreign->referenced_col_names = static_cast<const char**>(
+		mem_heap_alloc(foreign->heap,
+			       referenced_num_field * sizeof(void*)));
+
+	for (ulint i = 0; i < foreign->n_fields; i++) {
+		foreign->referenced_col_names[i]
+			= mem_heap_strdup(foreign->heap,
+					  referenced_column_names[i]);
+	}
+
+	return(true);
+}
+
+/*************************************************************//**
+Check whether the foreign key options are legitimate
+@return true if they are */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+bool
+innobase_check_fk_option(
+/*=====================*/
+	const dict_foreign_t*	foreign)	/*!< in: foreign key */
+{
+	if (!foreign->foreign_index) {
+		return(true);
+	}
+
+	if (foreign->type & (DICT_FOREIGN_ON_UPDATE_SET_NULL
+			     | DICT_FOREIGN_ON_DELETE_SET_NULL)) {
+
+		for (ulint j = 0; j < foreign->n_fields; j++) {
+			if ((dict_index_get_nth_col(
+				     foreign->foreign_index, j)->prtype)
+			    & DATA_NOT_NULL) {
+
+				/* It is not sensible to define
+				SET NULL if the column is not
+				allowed to be NULL!
+				*/
+				return(false);
+			}
+		}
+	}
+
+	return(true);
+}
+
+/*************************************************************//**
+Set foreign key options
+@return true if successfully set */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+bool
+innobase_set_foreign_key_option(
+/*============================*/
+	dict_foreign_t*	foreign,	/*!< in: InnoDB foreign key */
+	Foreign_key*	fk_key)		/*!< in: Foreign key info from
+					MySQL */
+{
+	ut_ad(!foreign->type);
+
+	switch (fk_key->delete_opt) {
+	case FK_OPTION_NO_ACTION:
+	case FK_OPTION_RESTRICT:
+	case FK_OPTION_SET_DEFAULT:
+		foreign->type = DICT_FOREIGN_ON_DELETE_NO_ACTION;
+		break;
+	case FK_OPTION_CASCADE:
+		foreign->type = DICT_FOREIGN_ON_DELETE_CASCADE;
+		break;
+	case FK_OPTION_SET_NULL:
+		foreign->type = DICT_FOREIGN_ON_DELETE_SET_NULL;
+		break;
+	case FK_OPTION_UNDEF:
+		break;
+	}
+
+	switch (fk_key->update_opt) {
+	case FK_OPTION_NO_ACTION:
+	case FK_OPTION_RESTRICT:
+	case FK_OPTION_SET_DEFAULT:
+		foreign->type |= DICT_FOREIGN_ON_UPDATE_NO_ACTION;
+		break;
+	case FK_OPTION_CASCADE:
+		foreign->type |= DICT_FOREIGN_ON_UPDATE_CASCADE;
+		break;
+	case FK_OPTION_SET_NULL:
+		foreign->type |= DICT_FOREIGN_ON_UPDATE_SET_NULL;
+		break;
+	case FK_OPTION_UNDEF:
+		break;
+	}
+
+	return(innobase_check_fk_option(foreign));
+}
+
+/*******************************************************************//**
+Check if a foreign key constraint can make use of an index
+that is being created.
+@param[in]	col_names	column names
+@param[in]	n_cols		number of columns
+@param[in]	keys		index information
+@param[in]	add		indexes being created
+@return usable index, or NULL if none found */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+const KEY*
+innobase_find_equiv_index(
+	const char*const*	col_names,
+	uint			n_cols,
+	const KEY*		keys,
+	span<uint>		add)
+{
+	for (span<uint>::iterator it = add.begin(), end = add.end();
+	     it != end; ++it) {
+		const KEY*	key = &keys[*it];
+
+		if (key->user_defined_key_parts < n_cols
+		    || key->flags & HA_SPATIAL) {
+no_match:
+			continue;
+		}
+
+		for (uint j = 0; j < n_cols; j++) {
+			const KEY_PART_INFO&	key_part = key->key_part[j];
+			uint32			col_len
+				= key_part.field->pack_length();
+
+			/* Any index on virtual columns cannot be used
+			for a reference constraint */
+			if (!key_part.field->stored_in_db()) {
+				goto no_match;
+			}
+
+			/* The MySQL pack length contains 1 or 2 bytes
+			length field for a true VARCHAR. */
+
+			if (key_part.field->type() == MYSQL_TYPE_VARCHAR) {
+				col_len -= static_cast<const Field_varstring*>(
+					key_part.field)->length_bytes;
+			}
+
+			if (key_part.length < col_len) {
+
+				/* Column prefix indexes cannot be
+				used for FOREIGN KEY constraints.
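+
+				A worked example (for illustration only):
+				for a VARCHAR(100) column in a single-byte
+				character set, pack_length() is 101 and
+				length_bytes is 1, so the comparable data
+				length is 100 bytes; a prefix index such
+				as KEY(col(10)) has key_part.length == 10:
+
+					uint32 col_len = 101; // pack_length()
+					col_len -= 1;	// VARCHAR length header
+					// 10 < 100: a prefix index, so this
+					// index cannot back the constraint.
+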
+				*/
+				goto no_match;
+			}
+
+			if (innobase_strcasecmp(col_names[j],
+						key_part.field->field_name.str)) {
+				/* Name mismatch */
+				goto no_match;
+			}
+		}
+
+		return(key);
+	}
+
+	return(NULL);
+}
+
+/*************************************************************//**
+Find an index whose first fields are the columns in the array
+in the same order and is not marked for deletion
+@return matching index, NULL if not found */
+static MY_ATTRIBUTE((nonnull(1,4), warn_unused_result))
+dict_index_t*
+innobase_find_fk_index(
+/*===================*/
+	dict_table_t*		table,	/*!< in: table */
+	const char**		col_names,
+					/*!< in: column names, or NULL
+					to use table->col_names */
+	span<dict_index_t*>	drop_index,
+					/*!< in: indexes to be dropped */
+	const char**		columns,/*!< in: array of column names */
+	ulint			n_cols)	/*!< in: number of columns */
+{
+	dict_index_t*	index;
+
+	index = dict_table_get_first_index(table);
+
+	while (index != NULL) {
+		if (dict_foreign_qualify_index(table, col_names, columns,
+					       n_cols, index, NULL, true, 0,
+					       NULL, NULL, NULL)
+		    && std::find(drop_index.begin(), drop_index.end(), index)
+		    == drop_index.end()) {
+			return index;
+		}
+
+		index = dict_table_get_next_index(index);
+	}
+
+	return(NULL);
+}
+
+/** Check whether a given column is a base of a stored column.
+@param[in]	col_name	column name
+@param[in]	table		table
+@param[in]	s_cols		list of stored columns
+@return true if the given column is a base of a stored column, else false. */
+static
+bool
+innobase_col_check_fk(
+	const char*		col_name,
+	const dict_table_t*	table,
+	dict_s_col_list*	s_cols)
+{
+	dict_s_col_list::const_iterator	it;
+
+	for (it = s_cols->begin(); it != s_cols->end(); ++it) {
+		for (ulint j = it->num_base; j--; ) {
+			if (!strcmp(col_name, dict_table_get_col_name(
+					    table, it->base_col[j]->ind))) {
+				return(true);
+			}
+		}
+	}
+
+	return(false);
+}
+
+/** Check whether the foreign key constraint is on the base of any stored
+columns.
+@param[in]	foreign	Foreign key constraint information
+@param[in]	table	table to which the foreign key objects
+to be added
+@param[in]	s_cols	list of stored column information in the table.
+@return true if yes, otherwise false.
+*/
+static
+bool
+innobase_check_fk_stored(
+	const dict_foreign_t*	foreign,
+	const dict_table_t*	table,
+	dict_s_col_list*	s_cols)
+{
+	ulint	type = foreign->type;
+
+	type &= ~(DICT_FOREIGN_ON_DELETE_NO_ACTION
+		  | DICT_FOREIGN_ON_UPDATE_NO_ACTION);
+
+	if (type == 0 || s_cols == NULL) {
+		return(false);
+	}
+
+	for (ulint i = 0; i < foreign->n_fields; i++) {
+		if (innobase_col_check_fk(
+			    foreign->foreign_col_names[i], table, s_cols)) {
+			return(true);
+		}
+	}
+
+	return(false);
+}
+
+/** Create InnoDB foreign key structure from MySQL alter_info
+@param[in]	ha_alter_info	alter table info
+@param[in]	table_share	TABLE_SHARE
+@param[in]	table		table object
+@param[in]	col_names	column names, or NULL to use
+table->col_names
+@param[in]	drop_index	indexes to be dropped
+@param[in]	n_drop_index	size of drop_index
+@param[out]	add_fk		foreign constraint added
+@param[out]	n_add_fk	number of foreign constraints
+added
+@param[in]	trx		user transaction
+@param[in]	s_cols		list of stored column information
+@retval true if successful
+@retval false on error (will call my_error()) */
+static MY_ATTRIBUTE((nonnull(1,2,3,7,8), warn_unused_result))
+bool
+innobase_get_foreign_key_info(
+	Alter_inplace_info*
+			ha_alter_info,
+	const TABLE_SHARE*
+			table_share,
+	dict_table_t*	table,
+	const char**	col_names,
+	dict_index_t**	drop_index,
+	ulint		n_drop_index,
+	dict_foreign_t**add_fk,
+	ulint*		n_add_fk,
+	const trx_t*	trx,
+	dict_s_col_list*s_cols)
+{
+	dict_table_t*	referenced_table = NULL;
+	char*		referenced_table_name = NULL;
+	ulint		num_fk = 0;
+	Alter_info*	alter_info = ha_alter_info->alter_info;
+	const CHARSET_INFO*	cs = thd_charset(trx->mysql_thd);
+
+	DBUG_ENTER("innobase_get_foreign_key_info");
+
+	*n_add_fk = 0;
+
+	for (Key& key : alter_info->key_list) {
+		if (key.type != Key::FOREIGN_KEY || key.old) {
+			continue;
+		}
+
+		const char*	column_names[MAX_NUM_FK_COLUMNS];
+		dict_index_t*	index = NULL;
+		const char*	referenced_column_names[MAX_NUM_FK_COLUMNS];
+		dict_index_t*	referenced_index = NULL;
+		ulint		num_col = 0;
+		ulint		referenced_num_col = 0;
+		bool		correct_option;
+
+		Foreign_key* fk_key = static_cast<Foreign_key*>(&key);
+
+		if (fk_key->columns.elements > 0) {
+			ulint	i = 0;
+
+			/* Get all the foreign key column info for the
+			current table */
+			for (const Key_part_spec& column : fk_key->columns) {
+				column_names[i] = column.field_name.str;
+				ut_ad(i < MAX_NUM_FK_COLUMNS);
+				i++;
+			}
+
+			index = innobase_find_fk_index(
+				table, col_names,
+				span<dict_index_t*>(drop_index, n_drop_index),
+				column_names, i);
+
+			/* MySQL would add an index in the creation
+			list if there is no such index for the foreign
+			table, so we have to use DBUG_EXECUTE_IF to
+			simulate the scenario */
+			DBUG_EXECUTE_IF("innodb_test_no_foreign_idx",
+					index = NULL;);
+
+			/* Check whether such an index exists in the
+			index create clause */
+			if (!index && !innobase_find_equiv_index(
+				    column_names, static_cast<uint>(i),
+				    ha_alter_info->key_info_buffer,
+				    span<uint>(ha_alter_info->index_add_buffer,
+					       ha_alter_info->index_add_count))) {
+				my_error(
+					ER_FK_NO_INDEX_CHILD,
+					MYF(0),
+					fk_key->name.str
+					? fk_key->name.str : "",
+					table_share->table_name.str);
+				goto err_exit;
+			}
+
+			num_col = i;
+		}
+
+		add_fk[num_fk] = dict_mem_foreign_create();
+
+		dict_sys.lock(SRW_LOCK_CALL);
+
+		referenced_table_name = dict_get_referenced_table(
+			table->name.m_name,
+			LEX_STRING_WITH_LEN(fk_key->ref_db),
+			LEX_STRING_WITH_LEN(fk_key->ref_table),
+			&referenced_table,
+			add_fk[num_fk]->heap, cs);
+
+		/* Test the case when referenced_table failed to
+		open; if trx->check_foreigns is not set, we should
+		still be able to add the foreign key */
+		DBUG_EXECUTE_IF("innodb_test_open_ref_fail",
+				referenced_table = NULL;);
+
+		if (!referenced_table && trx->check_foreigns) {
+			my_error(ER_FK_CANNOT_OPEN_PARENT,
+				 MYF(0), fk_key->ref_table.str);
+			goto err_exit_unlock;
+		}
+
+		if (fk_key->ref_columns.elements > 0) {
+			ulint	i = 0;
+
+			for (Key_part_spec &column : fk_key->ref_columns) {
+				referenced_column_names[i] =
+					column.field_name.str;
+				ut_ad(i < MAX_NUM_FK_COLUMNS);
+				i++;
+			}
+
+			if (referenced_table) {
+				referenced_index =
+					dict_foreign_find_index(
+						referenced_table, 0,
+						referenced_column_names,
+						i, index,
+						TRUE, FALSE,
+						NULL, NULL, NULL);
+
+				DBUG_EXECUTE_IF(
+					"innodb_test_no_reference_idx",
+					referenced_index = NULL;);
+
+				/* Check whether such an index exists
+				in the index create clause */
+				if (!referenced_index) {
+					my_error(ER_FK_NO_INDEX_PARENT, MYF(0),
+						 fk_key->name.str
+						 ? fk_key->name.str : "",
+						 fk_key->ref_table.str);
+					goto err_exit_unlock;
+				}
+			} else {
+				ut_a(!trx->check_foreigns);
+			}
+
+			referenced_num_col = i;
+		} else {
+			/* Not possible to add a foreign key without a
+			referenced column */
+			my_error(ER_CANNOT_ADD_FOREIGN, MYF(0),
+				 fk_key->ref_table.str);
+			goto err_exit_unlock;
+		}
+
+		if (!innobase_init_foreign(
+			    add_fk[num_fk], fk_key->name.str,
+			    table, index, column_names,
+			    num_col, referenced_table_name,
+			    referenced_table, referenced_index,
+			    referenced_column_names, referenced_num_col)) {
+			my_error(
+				ER_DUP_CONSTRAINT_NAME,
+				MYF(0),
+				"FOREIGN KEY", add_fk[num_fk]->id);
+			goto err_exit_unlock;
+		}
+
+		dict_sys.unlock();
+
+		correct_option = innobase_set_foreign_key_option(
+			add_fk[num_fk], fk_key);
+
+		DBUG_EXECUTE_IF("innodb_test_wrong_fk_option",
+				correct_option = false;);
+
+		if (!correct_option) {
+			my_error(ER_FK_INCORRECT_OPTION,
+				 MYF(0),
+				 table_share->table_name.str,
+				 add_fk[num_fk]->id);
+			goto err_exit;
+		}
+
+		if (innobase_check_fk_stored(
+			    add_fk[num_fk], table, s_cols)) {
+			my_printf_error(
+				HA_ERR_UNSUPPORTED,
+				"Cannot add foreign key on the base column "
+				"of stored column", MYF(0));
+			goto err_exit;
+		}
+
+		num_fk++;
+	}
+
+	*n_add_fk = num_fk;
+
+	DBUG_RETURN(true);
+err_exit_unlock:
+	dict_sys.unlock();
+err_exit:
+	for (ulint i = 0; i <= num_fk; i++) {
+		if (add_fk[i]) {
+			dict_foreign_free(add_fk[i]);
+		}
+	}
+
+	DBUG_RETURN(false);
+}
+
+/*************************************************************//**
+Copies an InnoDB column to a MySQL field. This function is
+adapted from row_sel_field_store_in_mysql_format().
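+
+For illustration only, the DATA_INT branch below behaves like this
+standalone sketch: InnoDB stores integers big-endian with the sign bit
+flipped (so that signed values compare correctly as unsigned byte
+strings), and the copy reverses both transformations:
+
+	static void sketch_int_to_mysql(uchar* dest, const uchar* data,
+					ulint len, bool is_unsigned)
+	{
+		for (uchar* ptr = dest + len; ptr != dest; ) {
+			*--ptr = *data++;	// reverse the byte order
+		}
+		if (!is_unsigned) {
+			dest[len - 1] ^= 0x80;	// restore the sign bit
+		}
+	}
+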
*/ +static +void +innobase_col_to_mysql( +/*==================*/ + const dict_col_t* col, /*!< in: InnoDB column */ + const uchar* data, /*!< in: InnoDB column data */ + ulint len, /*!< in: length of data, in bytes */ + Field* field) /*!< in/out: MySQL field */ +{ + uchar* ptr; + uchar* dest = field->ptr; + ulint flen = field->pack_length(); + + switch (col->mtype) { + case DATA_INT: + ut_ad(len == flen); + + /* Convert integer data from Innobase to little-endian + format, sign bit restored to normal */ + + for (ptr = dest + len; ptr != dest; ) { + *--ptr = *data++; + } + + if (!(col->prtype & DATA_UNSIGNED)) { + ((byte*) dest)[len - 1] ^= 0x80; + } + + break; + + case DATA_VARCHAR: + case DATA_VARMYSQL: + case DATA_BINARY: + field->reset(); + + if (field->type() == MYSQL_TYPE_VARCHAR) { + /* This is a >= 5.0.3 type true VARCHAR. Store the + length of the data to the first byte or the first + two bytes of dest. */ + + dest = row_mysql_store_true_var_len( + dest, len, flen - field->key_length()); + } + + /* Copy the actual data */ + memcpy(dest, data, len); + break; + + case DATA_GEOMETRY: + case DATA_BLOB: + /* Skip MySQL BLOBs when reporting an erroneous row + during index creation or table rebuild. */ + field->set_null(); + break; + +#ifdef UNIV_DEBUG + case DATA_MYSQL: + ut_ad(flen >= len); + ut_ad(col->mbmaxlen >= col->mbminlen); + memcpy(dest, data, len); + break; + + default: + case DATA_SYS_CHILD: + case DATA_SYS: + /* These column types should never be shipped to MySQL. */ + ut_ad(0); + /* fall through */ + case DATA_FLOAT: + case DATA_DOUBLE: + case DATA_DECIMAL: + /* Above are the valid column types for MySQL data. */ + ut_ad(flen == len); + /* fall through */ + case DATA_FIXBINARY: + case DATA_CHAR: + /* We may have flen > len when there is a shorter + prefix on the CHAR and BINARY column. */ + ut_ad(flen >= len); +#else /* UNIV_DEBUG */ + default: +#endif /* UNIV_DEBUG */ + memcpy(dest, data, len); + } +} + +/*************************************************************//** +Copies an InnoDB record to table->record[0]. */ +void +innobase_rec_to_mysql( +/*==================*/ + struct TABLE* table, /*!< in/out: MySQL table */ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index, /*!< in: index */ + const rec_offs* offsets)/*!< in: rec_get_offsets( + rec, index, ...) */ +{ + uint n_fields = table->s->fields; + + ut_ad(n_fields == dict_table_get_n_user_cols(index->table) + - !!(DICT_TF2_FLAG_IS_SET(index->table, + DICT_TF2_FTS_HAS_DOC_ID))); + + for (uint i = 0; i < n_fields; i++) { + Field* field = table->field[i]; + ulint ipos; + ulint ilen; + const uchar* ifield; + ulint prefix_col; + + field->reset(); + + ipos = dict_index_get_nth_col_or_prefix_pos( + index, i, true, false, &prefix_col); + + if (ipos == ULINT_UNDEFINED + || rec_offs_nth_extern(offsets, ipos)) { +null_field: + field->set_null(); + continue; + } + + ifield = rec_get_nth_cfield(rec, index, offsets, ipos, &ilen); + + /* Assign the NULL flag */ + if (ilen == UNIV_SQL_NULL) { + ut_ad(field->real_maybe_null()); + goto null_field; + } + + field->set_notnull(); + + innobase_col_to_mysql( + dict_field_get_col( + dict_index_get_nth_field(index, ipos)), + ifield, ilen, field); + } +} + +/*************************************************************//** +Copies an InnoDB index entry to table->record[0]. 
+This is used in preparation for print_keydup_error() from
+inline add index */
+void
+innobase_fields_to_mysql(
+/*=====================*/
+	struct TABLE*		table,	/*!< in/out: MySQL table */
+	const dict_index_t*	index,	/*!< in: InnoDB index */
+	const dfield_t*		fields)	/*!< in: InnoDB index fields */
+{
+	uint	n_fields	= table->s->fields;
+	ulint	num_v		= 0;
+
+	ut_ad(n_fields == dict_table_get_n_user_cols(index->table)
+	      + dict_table_get_n_v_cols(index->table)
+	      - !!(DICT_TF2_FLAG_IS_SET(index->table,
+					DICT_TF2_FTS_HAS_DOC_ID)));
+
+	for (uint i = 0; i < n_fields; i++) {
+		Field*		field	= table->field[i];
+		ulint		ipos;
+		ulint		prefix_col;
+
+		field->reset();
+
+		const bool is_v = !field->stored_in_db();
+		const ulint col_n = is_v ? num_v++ : i - num_v;
+
+		ipos = dict_index_get_nth_col_or_prefix_pos(
+			index, col_n, true, is_v, &prefix_col);
+
+		if (ipos == ULINT_UNDEFINED
+		    || dfield_is_ext(&fields[ipos])
+		    || dfield_is_null(&fields[ipos])) {
+
+			field->set_null();
+		} else {
+			field->set_notnull();
+
+			const dfield_t*	df	= &fields[ipos];
+
+			innobase_col_to_mysql(
+				dict_field_get_col(
+					dict_index_get_nth_field(index, ipos)),
+				static_cast<const uchar*>(dfield_get_data(df)),
+				dfield_get_len(df), field);
+		}
+	}
+}
+
+/*************************************************************//**
+Copies an InnoDB row to table->record[0].
+This is used in preparation for print_keydup_error() from
+row_log_table_apply() */
+void
+innobase_row_to_mysql(
+/*==================*/
+	struct TABLE*		table,	/*!< in/out: MySQL table */
+	const dict_table_t*	itab,	/*!< in: InnoDB table */
+	const dtuple_t*		row)	/*!< in: InnoDB row */
+{
+	uint	n_fields = table->s->fields;
+	ulint	num_v = 0;
+
+	/* The InnoDB row may contain an extra FTS_DOC_ID column at the end. */
+	ut_ad(row->n_fields == dict_table_get_n_cols(itab));
+	ut_ad(n_fields == row->n_fields - DATA_N_SYS_COLS
+	      + dict_table_get_n_v_cols(itab)
+	      - !!(DICT_TF2_FLAG_IS_SET(itab, DICT_TF2_FTS_HAS_DOC_ID)));
+
+	for (uint i = 0; i < n_fields; i++) {
+		Field*	field = table->field[i];
+
+		field->reset();
+
+		if (!field->stored_in_db()) {
+			/* Virtual columns are not stored in the InnoDB
+			table, so skip them */
+			num_v++;
+			continue;
+		}
+
+		const dfield_t*	df	= dtuple_get_nth_field(row, i - num_v);
+
+		if (dfield_is_ext(df) || dfield_is_null(df)) {
+			field->set_null();
+		} else {
+			field->set_notnull();
+
+			innobase_col_to_mysql(
+				dict_table_get_nth_col(itab, i - num_v),
+				static_cast<const uchar*>(dfield_get_data(df)),
+				dfield_get_len(df), field);
+		}
+	}
+	if (table->vfield) {
+		MY_BITMAP*	old_read_set = tmp_use_all_columns(table, &table->read_set);
+		table->update_virtual_fields(table->file, VCOL_UPDATE_FOR_READ);
+		tmp_restore_column_map(&table->read_set, old_read_set);
+	}
+}
+
+/*******************************************************************//**
+This function checks that index keys are sensible.
+@return 0 or error number */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+int
+innobase_check_index_keys(
+/*======================*/
+	const Alter_inplace_info*	info,
+				/*!< in: indexes to be created or dropped */
+	const dict_table_t*		innodb_table)
+				/*!< in: Existing indexes */
+{
+	for (uint key_num = 0; key_num < info->index_add_count;
+	     key_num++) {
+		const KEY&	key = info->key_info_buffer[
+			info->index_add_buffer[key_num]];
+
+		/* Check that the same index name does not appear
+		twice in indexes to be created.
*/ + + for (ulint i = 0; i < key_num; i++) { + const KEY& key2 = info->key_info_buffer[ + info->index_add_buffer[i]]; + + if (0 == strcmp(key.name.str, key2.name.str)) { + my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0), + key.name.str); + + return(ER_WRONG_NAME_FOR_INDEX); + } + } + + /* Check that the same index name does not already exist. */ + + const dict_index_t* index; + + for (index = dict_table_get_first_index(innodb_table); + index; index = dict_table_get_next_index(index)) { + + if (index->is_committed() + && !strcmp(key.name.str, index->name)) { + break; + } + } + + /* Now we are in a situation where we have "ADD INDEX x" + and an index by the same name already exists. We have 4 + possible cases: + 1. No further clauses for an index x are given. Should reject + the operation. + 2. "DROP INDEX x" is given. Should allow the operation. + 3. "RENAME INDEX x TO y" is given. Should allow the operation. + 4. "DROP INDEX x, RENAME INDEX x TO y" is given. Should allow + the operation, since no name clash occurs. In this particular + case MySQL cancels the operation without calling InnoDB + methods. */ + + if (index) { + /* If a key by the same name is being created and + dropped, the name clash is OK. */ + for (uint i = 0; i < info->index_drop_count; + i++) { + const KEY* drop_key + = info->index_drop_buffer[i]; + + if (0 == strcmp(key.name.str, + drop_key->name.str)) { + goto name_ok; + } + } + + for (const Alter_inplace_info::Rename_key_pair& pair : + info->rename_keys) { + if (0 == strcmp(key.name.str, + pair.old_key->name.str)) { + goto name_ok; + } + } + + my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0), + key.name.str); + return(ER_WRONG_NAME_FOR_INDEX); + } + +name_ok: + for (ulint i = 0; i < key.user_defined_key_parts; i++) { + const KEY_PART_INFO& key_part1 + = key.key_part[i]; + const Field* field + = key_part1.field; + unsigned is_unsigned; + + switch (get_innobase_type_from_mysql_type( + &is_unsigned, field)) { + default: + break; + case DATA_INT: + case DATA_FLOAT: + case DATA_DOUBLE: + case DATA_DECIMAL: + /* Check that MySQL does not try to + create a column prefix index field on + an inappropriate data type. */ + + if (field->type() == MYSQL_TYPE_VARCHAR) { + if (key_part1.length + >= field->pack_length() + - ((Field_varstring*) field) + ->length_bytes) { + break; + } + } else { + if (key_part1.length + >= field->pack_length()) { + break; + } + } + + my_error(ER_WRONG_KEY_COLUMN, MYF(0), "InnoDB", + field->field_name.str); + return(ER_WRONG_KEY_COLUMN); + } + + /* Check that the same column does not appear + twice in the index. */ + + for (ulint j = 0; j < i; j++) { + const KEY_PART_INFO& key_part2 + = key.key_part[j]; + + if (key_part1.fieldnr != key_part2.fieldnr) { + continue; + } + + my_error(ER_WRONG_KEY_COLUMN, MYF(0), "InnoDB", + field->field_name.str); + return(ER_WRONG_KEY_COLUMN); + } + } + } + + return(0); +} + +/** Create index field definition for key part +@param[in] new_clustered true if alter is generating a new clustered +index +@param[in] altered_table MySQL table that is being altered +@param[in] key_part MySQL key definition +@param[out] index_field index field definition for key_part */ +static MY_ATTRIBUTE((nonnull)) +void +innobase_create_index_field_def( + bool new_clustered, + const TABLE* altered_table, + const KEY_PART_INFO* key_part, + index_field_t* index_field) +{ + const Field* field; + unsigned is_unsigned; + unsigned num_v = 0; + + DBUG_ENTER("innobase_create_index_field_def"); + + field = new_clustered + ? 
altered_table->field[key_part->fieldnr] + : key_part->field; + + for (ulint i = 0; i < key_part->fieldnr; i++) { + if (!altered_table->field[i]->stored_in_db()) { + num_v++; + } + } + + auto col_type = get_innobase_type_from_mysql_type( + &is_unsigned, field); + + if ((index_field->is_v_col = !field->stored_in_db())) { + index_field->col_no = num_v; + } else { + index_field->col_no = key_part->fieldnr - num_v; + } + + index_field->descending= !!(key_part->key_part_flag & HA_REVERSE_SORT); + + if (DATA_LARGE_MTYPE(col_type) + || (key_part->length < field->pack_length() + && field->type() != MYSQL_TYPE_VARCHAR) + || (field->type() == MYSQL_TYPE_VARCHAR + && key_part->length < field->pack_length() + - ((Field_varstring*) field)->length_bytes)) { + + index_field->prefix_len = key_part->length; + } else { + index_field->prefix_len = 0; + } + + DBUG_VOID_RETURN; +} + +/** Create index definition for key +@param[in] altered_table MySQL table that is being altered +@param[in] keys key definitions +@param[in] key_number MySQL key number +@param[in] new_clustered true if generating a new clustered +index on the table +@param[in] key_clustered true if this is the new clustered index +@param[out] index index definition +@param[in] heap heap where memory is allocated */ +static MY_ATTRIBUTE((nonnull)) +void +innobase_create_index_def( + const TABLE* altered_table, + const KEY* keys, + ulint key_number, + bool new_clustered, + bool key_clustered, + index_def_t* index, + mem_heap_t* heap) +{ + const KEY* key = &keys[key_number]; + ulint i; + ulint n_fields = key->user_defined_key_parts; + + DBUG_ENTER("innobase_create_index_def"); + DBUG_ASSERT(!key_clustered || new_clustered); + + index->fields = static_cast( + mem_heap_alloc(heap, n_fields * sizeof *index->fields)); + + index->parser = NULL; + index->key_number = key_number; + index->n_fields = n_fields; + index->name = mem_heap_strdup(heap, key->name.str); + index->rebuild = new_clustered; + + if (key_clustered) { + DBUG_ASSERT(!(key->flags & (HA_FULLTEXT | HA_SPATIAL))); + DBUG_ASSERT(key->flags & HA_NOSAME); + index->ind_type = DICT_CLUSTERED | DICT_UNIQUE; + } else if (key->flags & HA_FULLTEXT) { + DBUG_ASSERT(!(key->flags & (HA_SPATIAL | HA_NOSAME))); + DBUG_ASSERT(!(key->flags & HA_KEYFLAG_MASK + & ~(HA_FULLTEXT + | HA_PACK_KEY + | HA_BINARY_PACK_KEY))); + index->ind_type = DICT_FTS; + + /* Note: key->parser is only parser name, + we need to get parser from altered_table instead */ + + if (key->flags & HA_USES_PARSER) { + for (ulint j = 0; j < altered_table->s->keys; j++) { + if (!strcmp(altered_table->key_info[j].name.str, + key->name.str)) { + ut_ad(altered_table->key_info[j].flags + & HA_USES_PARSER); + + plugin_ref parser = + altered_table->key_info[j].parser; + index->parser = + static_cast( + plugin_decl(parser)->info); + + break; + } + } + + DBUG_EXECUTE_IF("fts_instrument_use_default_parser", + index->parser = &fts_default_parser;); + ut_ad(index->parser); + } + } else if (key->flags & HA_SPATIAL) { + DBUG_ASSERT(!(key->flags & HA_NOSAME)); + index->ind_type = DICT_SPATIAL; + ut_ad(n_fields == 1); + ulint num_v = 0; + + /* Need to count the virtual fields before this spatial + indexed field */ + for (ulint i = 0; i < key->key_part->fieldnr; i++) { + num_v += !altered_table->field[i]->stored_in_db(); + } + index->fields[0].col_no = key->key_part[0].fieldnr - num_v; + index->fields[0].prefix_len = 0; + index->fields[0].is_v_col = false; + index->fields[0].descending = false; + + /* Currently, the spatial index cannot be created + on virtual 
columns. It is blocked in the SQL layer. */ + DBUG_ASSERT(key->key_part[0].field->stored_in_db()); + } else { + index->ind_type = (key->flags & HA_NOSAME) ? DICT_UNIQUE : 0; + } + + if (!(key->flags & HA_SPATIAL)) { + for (i = 0; i < n_fields; i++) { + innobase_create_index_field_def( + new_clustered, altered_table, + &key->key_part[i], &index->fields[i]); + + if (index->fields[i].is_v_col) { + index->ind_type |= DICT_VIRTUAL; + } + } + } + + DBUG_VOID_RETURN; +} + +/*******************************************************************//** +Check whether the table has a unique index with FTS_DOC_ID_INDEX_NAME +on the Doc ID column. +@return the status of the FTS_DOC_ID index */ +enum fts_doc_id_index_enum +innobase_fts_check_doc_id_index( +/*============================*/ + const dict_table_t* table, /*!< in: table definition */ + const TABLE* altered_table, /*!< in: MySQL table + that is being altered */ + ulint* fts_doc_col_no) /*!< out: The column number for + Doc ID, or ULINT_UNDEFINED + if it is being created in + ha_alter_info */ +{ + const dict_index_t* index; + const dict_field_t* field; + + if (altered_table) { + /* Check if a unique index with the name of + FTS_DOC_ID_INDEX_NAME is being created. */ + + const ulint fts_n_uniq= altered_table->versioned() ? 2 : 1; + + for (uint i = 0; i < altered_table->s->keys; i++) { + const KEY& key = altered_table->key_info[i]; + + if (innobase_strcasecmp( + key.name.str, FTS_DOC_ID_INDEX_NAME)) { + continue; + } + + if ((key.flags & HA_NOSAME) + && key.user_defined_key_parts == fts_n_uniq + && !(key.key_part[0].key_part_flag + & HA_REVERSE_SORT) + && !strcmp(key.name.str, FTS_DOC_ID_INDEX_NAME) + && !strcmp(key.key_part[0].field->field_name.str, + FTS_DOC_ID_COL_NAME)) { + if (fts_doc_col_no) { + *fts_doc_col_no = ULINT_UNDEFINED; + } + return(FTS_EXIST_DOC_ID_INDEX); + } else { + return(FTS_INCORRECT_DOC_ID_INDEX); + } + } + } + + if (!table) { + return(FTS_NOT_EXIST_DOC_ID_INDEX); + } + + for (index = dict_table_get_first_index(table); + index; index = dict_table_get_next_index(index)) { + + + /* Check if there exists a unique index with the name of + FTS_DOC_ID_INDEX_NAME and ignore the corrupted index */ + if (index->type & DICT_CORRUPT + || innobase_strcasecmp(index->name, FTS_DOC_ID_INDEX_NAME)) { + continue; + } + + if (!dict_index_is_unique(index) + || dict_index_get_n_unique(index) != table->fts_n_uniq() + || strcmp(index->name, FTS_DOC_ID_INDEX_NAME)) { + return(FTS_INCORRECT_DOC_ID_INDEX); + } + + /* Check whether the index has FTS_DOC_ID as its + first column */ + field = dict_index_get_nth_field(index, 0); + + /* The column would be of a BIGINT data type */ + if (strcmp(field->name, FTS_DOC_ID_COL_NAME) == 0 + && !field->descending + && field->col->mtype == DATA_INT + && field->col->len == 8 + && field->col->prtype & DATA_NOT_NULL + && !field->col->is_virtual()) { + if (fts_doc_col_no) { + *fts_doc_col_no = dict_col_get_no(field->col); + } + return(FTS_EXIST_DOC_ID_INDEX); + } else { + return(FTS_INCORRECT_DOC_ID_INDEX); + } + } + + + /* Not found */ + return(FTS_NOT_EXIST_DOC_ID_INDEX); +} +/*******************************************************************//** +Check whether the table has a unique index with FTS_DOC_ID_INDEX_NAME +on the Doc ID column in MySQL create index definition. 
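+The accepted shape is, illustratively (for system-versioned tables the
+row_end column is required as a second key part, and the first key part
+must be ascending):
+
+	CREATE UNIQUE INDEX FTS_DOC_ID_INDEX ON t(FTS_DOC_ID);
+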
+@return FTS_EXIST_DOC_ID_INDEX if there exists the FTS_DOC_ID index, +FTS_INCORRECT_DOC_ID_INDEX if the FTS_DOC_ID index is of wrong format */ +enum fts_doc_id_index_enum +innobase_fts_check_doc_id_index_in_def( +/*===================================*/ + ulint n_key, /*!< in: Number of keys */ + const KEY* key_info) /*!< in: Key definition */ +{ + /* Check whether there is a "FTS_DOC_ID_INDEX" in the to be built index + list */ + const uint fts_n_uniq= key_info->table->versioned() ? 2 : 1; + for (ulint j = 0; j < n_key; j++) { + const KEY* key = &key_info[j]; + + if (innobase_strcasecmp(key->name.str, FTS_DOC_ID_INDEX_NAME)) { + continue; + } + + /* Do a check on FTS DOC ID_INDEX, it must be unique, + named as "FTS_DOC_ID_INDEX" and on column "FTS_DOC_ID" */ + if (!(key->flags & HA_NOSAME) + || key->user_defined_key_parts != fts_n_uniq + || (key->key_part[0].key_part_flag & HA_REVERSE_SORT) + || strcmp(key->name.str, FTS_DOC_ID_INDEX_NAME) + || strcmp(key->key_part[0].field->field_name.str, + FTS_DOC_ID_COL_NAME)) { + return(FTS_INCORRECT_DOC_ID_INDEX); + } + + return(FTS_EXIST_DOC_ID_INDEX); + } + + return(FTS_NOT_EXIST_DOC_ID_INDEX); +} + +/** Create an index table where indexes are ordered as follows: + +IF a new primary key is defined for the table THEN + + 1) New primary key + 2) The remaining keys in key_info + +ELSE + + 1) All new indexes in the order they arrive from MySQL + +ENDIF + +@return key definitions */ +MY_ATTRIBUTE((nonnull, warn_unused_result, malloc)) +inline index_def_t* +ha_innobase_inplace_ctx::create_key_defs( + const Alter_inplace_info* ha_alter_info, + /*!< in: alter operation */ + const TABLE* altered_table, + /*!< in: MySQL table that is being altered */ + ulint& n_fts_add, + /*!< out: number of FTS indexes to be created */ + ulint& fts_doc_id_col, + /*!< in: The column number for Doc ID */ + bool& add_fts_doc_id, + /*!< in: whether we need to add new DOC ID + column for FTS index */ + bool& add_fts_doc_idx, + /*!< in: whether we need to add new DOC ID + index for FTS index */ + const TABLE* table) + /*!< in: MySQL table that is being altered */ +{ + ulint& n_add = num_to_add_index; + const bool got_default_clust = new_table->indexes.start->is_gen_clust(); + + index_def_t* indexdef; + index_def_t* indexdefs; + bool new_primary; + const uint*const add + = ha_alter_info->index_add_buffer; + const KEY*const key_info + = ha_alter_info->key_info_buffer; + + DBUG_ENTER("ha_innobase_inplace_ctx::create_key_defs"); + DBUG_ASSERT(!add_fts_doc_id || add_fts_doc_idx); + DBUG_ASSERT(ha_alter_info->index_add_count == n_add); + + /* If there is a primary key, it is always the first index + defined for the innodb_table. */ + + new_primary = n_add > 0 + && !my_strcasecmp(system_charset_info, + key_info[*add].name.str, "PRIMARY"); + n_fts_add = 0; + + /* If there is a UNIQUE INDEX consisting entirely of NOT NULL + columns and if the index does not contain column prefix(es) + (only prefix/part of the column is indexed), MySQL will treat the + index as a PRIMARY KEY unless the table already has one. 
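+	For example (illustrative case only): if the table has no
+	PRIMARY KEY and column a is NOT NULL, then
+	ALTER TABLE t ADD UNIQUE INDEX(a) promotes the new index to
+	the clustered index, and altered_table->s->primary_key
+	becomes 0 instead of MAX_KEY.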
*/ + + ut_ad(altered_table->s->primary_key == 0 + || altered_table->s->primary_key == MAX_KEY); + + if (got_default_clust && !new_primary) { + new_primary = (altered_table->s->primary_key != MAX_KEY); + } + + const bool rebuild = new_primary || add_fts_doc_id + || innobase_need_rebuild(ha_alter_info, table); + + /* Reserve one more space if new_primary is true, and we might + need to add the FTS_DOC_ID_INDEX */ + indexdef = indexdefs = static_cast( + mem_heap_alloc( + heap, sizeof *indexdef + * (ha_alter_info->key_count + + rebuild + + got_default_clust))); + + if (rebuild) { + ulint primary_key_number; + + if (new_primary) { + DBUG_ASSERT(n_add || got_default_clust); + DBUG_ASSERT(n_add || !altered_table->s->primary_key); + primary_key_number = altered_table->s->primary_key; + } else if (got_default_clust) { + /* Create the GEN_CLUST_INDEX */ + index_def_t* index = indexdef++; + + index->fields = NULL; + index->n_fields = 0; + index->ind_type = DICT_CLUSTERED; + index->name = innobase_index_reserve_name; + index->rebuild = true; + index->key_number = ~0U; + primary_key_number = ULINT_UNDEFINED; + goto created_clustered; + } else { + primary_key_number = 0; + } + + /* Create the PRIMARY key index definition */ + innobase_create_index_def( + altered_table, key_info, primary_key_number, + true, true, indexdef++, heap); + +created_clustered: + n_add = 1; + + for (ulint i = 0; i < ha_alter_info->key_count; i++) { + if (i == primary_key_number) { + continue; + } + /* Copy the index definitions. */ + innobase_create_index_def( + altered_table, key_info, i, true, + false, indexdef, heap); + + if (indexdef->ind_type & DICT_FTS) { + n_fts_add++; + } + + indexdef++; + n_add++; + } + + if (n_fts_add > 0) { + ulint num_v = 0; + + if (!add_fts_doc_id + && !innobase_fts_check_doc_id_col( + NULL, altered_table, + &fts_doc_id_col, &num_v)) { + fts_doc_id_col = altered_table->s->fields - num_v; + add_fts_doc_id = true; + } + + if (!add_fts_doc_idx) { + fts_doc_id_index_enum ret; + ulint doc_col_no; + + ret = innobase_fts_check_doc_id_index( + NULL, altered_table, &doc_col_no); + + /* This should have been checked before */ + ut_ad(ret != FTS_INCORRECT_DOC_ID_INDEX); + + if (ret == FTS_NOT_EXIST_DOC_ID_INDEX) { + add_fts_doc_idx = true; + } else { + ut_ad(ret == FTS_EXIST_DOC_ID_INDEX); + ut_ad(doc_col_no == ULINT_UNDEFINED + || doc_col_no == fts_doc_id_col); + } + } + } + } else { + /* Create definitions for added secondary indexes. 
+		*/
+
+		for (ulint i = 0; i < n_add; i++) {
+			innobase_create_index_def(
+				altered_table, key_info, add[i],
+				false, false, indexdef, heap);
+
+			if (indexdef->ind_type & DICT_FTS) {
+				n_fts_add++;
+			}
+
+			indexdef++;
+		}
+	}
+
+	DBUG_ASSERT(indexdefs + n_add == indexdef);
+
+	if (add_fts_doc_idx) {
+		index_def_t*	index = indexdef++;
+		uint nfields = 1;
+
+		if (altered_table->versioned())
+			++nfields;
+		index->fields = static_cast<index_field_t*>(
+			mem_heap_alloc(heap, sizeof(*index->fields) * nfields));
+		index->n_fields = nfields;
+		index->fields[0].col_no = fts_doc_id_col;
+		index->fields[0].prefix_len = 0;
+		index->fields[0].descending = false;
+		index->fields[0].is_v_col = false;
+		if (nfields == 2) {
+			index->fields[1].col_no
+				= altered_table->s->vers.end_fieldno;
+			index->fields[1].prefix_len = 0;
+			index->fields[1].descending = false;
+			index->fields[1].is_v_col = false;
+		}
+		index->ind_type = DICT_UNIQUE;
+		ut_ad(!rebuild
+		      || !add_fts_doc_id
+		      || fts_doc_id_col <= altered_table->s->fields);
+
+		index->name = FTS_DOC_ID_INDEX_NAME;
+		index->rebuild = rebuild;
+
+		/* TODO: assign a real MySQL key number for this */
+		index->key_number = ULINT_UNDEFINED;
+		n_add++;
+	}
+
+	DBUG_ASSERT(indexdef > indexdefs);
+	DBUG_ASSERT((ulint) (indexdef - indexdefs)
+		    <= ha_alter_info->key_count
+		    + add_fts_doc_idx + got_default_clust);
+	DBUG_ASSERT(ha_alter_info->index_add_count <= n_add);
+	DBUG_RETURN(indexdefs);
+}
+
+MY_ATTRIBUTE((warn_unused_result))
+bool too_big_key_part_length(size_t max_field_len, const KEY& key)
+{
+	for (ulint i = 0; i < key.user_defined_key_parts; i++) {
+		if (key.key_part[i].length > max_field_len) {
+			return true;
+		}
+	}
+	return false;
+}
+
+/********************************************************************//**
+Drop any indexes that we were not able to free previously due to
+open table handles. */
+static
+void
+online_retry_drop_indexes_low(
+/*==========================*/
+	dict_table_t*	table,	/*!< in/out: table */
+	trx_t*		trx)	/*!< in/out: transaction */
+{
+	ut_ad(dict_sys.locked());
+	ut_ad(trx->dict_operation_lock_mode);
+	ut_ad(trx->dict_operation);
+
+	/* We can have table->n_ref_count > 1, because other threads
+	may have prebuilt->table pointing to the table. However, these
+	other threads should be between statements, waiting for the
+	next statement to execute, or for a meta-data lock. */
+	ut_ad(table->get_ref_count() >= 1);
+
+	if (table->drop_aborted) {
+		row_merge_drop_indexes(trx, table, true);
+	}
+}
+
+/** After commit, unlock the data dictionary and close any deleted files.
+@param deleted	handles of deleted files
+@param trx	committed transaction */
+static void unlock_and_close_files(const std::vector<pfs_os_file_t> &deleted,
+                                   trx_t *trx)
+{
+  row_mysql_unlock_data_dictionary(trx);
+  for (pfs_os_file_t d : deleted)
+    os_file_close(d);
+  log_write_up_to(trx->commit_lsn, true);
+}
+
+/** Commit a DDL transaction and unlink any deleted files. */
+static void commit_unlock_and_unlink(trx_t *trx)
+{
+  std::vector<pfs_os_file_t> deleted;
+  trx->commit(deleted);
+  unlock_and_close_files(deleted, trx);
+}
+
+/**
+Drop any indexes that we were not able to free previously due to
+open table handles.
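+(A previous online ADD INDEX that was rolled back while other handles
+kept the table open leaves dict_table_t::drop_aborted set; this function
+retries the deferred cleanup via row_merge_drop_indexes().)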
+@param table InnoDB table +@param thd connection handle +*/ +static void online_retry_drop_indexes(dict_table_t *table, THD *thd) +{ + if (table->drop_aborted) + { + trx_t *trx= innobase_trx_allocate(thd); + + trx_start_for_ddl(trx); + if (lock_sys_tables(trx) == DB_SUCCESS) + { + row_mysql_lock_data_dictionary(trx); + online_retry_drop_indexes_low(table, trx); + commit_unlock_and_unlink(trx); + } + else + trx->commit(); + trx->free(); + } + + ut_d(dict_sys.freeze(SRW_LOCK_CALL)); + ut_d(dict_table_check_for_dup_indexes(table, CHECK_ALL_COMPLETE)); + ut_d(dict_sys.unfreeze()); + ut_ad(!table->drop_aborted); +} + +/** Determines if InnoDB is dropping a foreign key constraint. +@param foreign the constraint +@param drop_fk constraints being dropped +@param n_drop_fk number of constraints that are being dropped +@return whether the constraint is being dropped */ +MY_ATTRIBUTE((pure, nonnull(1), warn_unused_result)) +inline +bool +innobase_dropping_foreign( + const dict_foreign_t* foreign, + dict_foreign_t** drop_fk, + ulint n_drop_fk) +{ + while (n_drop_fk--) { + if (*drop_fk++ == foreign) { + return(true); + } + } + + return(false); +} + +/** Determines if an InnoDB FOREIGN KEY constraint depends on a +column that is being dropped or modified to NOT NULL. +@param user_table InnoDB table as it is before the ALTER operation +@param col_name Name of the column being altered +@param drop_fk constraints being dropped +@param n_drop_fk number of constraints that are being dropped +@param drop true=drop column, false=set NOT NULL +@retval true Not allowed (will call my_error()) +@retval false Allowed +*/ +MY_ATTRIBUTE((pure, nonnull(1,4), warn_unused_result)) +static +bool +innobase_check_foreigns_low( + const dict_table_t* user_table, + dict_foreign_t** drop_fk, + ulint n_drop_fk, + const char* col_name, + bool drop) +{ + dict_foreign_t* foreign; + ut_ad(dict_sys.locked()); + + /* Check if any FOREIGN KEY constraints are defined on this + column. */ + + for (dict_foreign_set::const_iterator it = user_table->foreign_set.begin(); + it != user_table->foreign_set.end(); + ++it) { + + foreign = *it; + + if (!drop && !(foreign->type + & (DICT_FOREIGN_ON_DELETE_SET_NULL + | DICT_FOREIGN_ON_UPDATE_SET_NULL))) { + continue; + } + + if (innobase_dropping_foreign(foreign, drop_fk, n_drop_fk)) { + continue; + } + + for (unsigned f = 0; f < foreign->n_fields; f++) { + if (!strcmp(foreign->foreign_col_names[f], + col_name)) { + my_error(drop + ? ER_FK_COLUMN_CANNOT_DROP + : ER_FK_COLUMN_NOT_NULL, MYF(0), + col_name, foreign->id); + return(true); + } + } + } + + if (!drop) { + /* SET NULL clauses on foreign key constraints of + child tables affect the child tables, not the parent table. + The column can be NOT NULL in the parent table. */ + return(false); + } + + /* Check if any FOREIGN KEY constraints in other tables are + referring to the column that is being dropped. 
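+	For example (illustrative schema, not from this patch):
+
+		CREATE TABLE parent(pk INT PRIMARY KEY) ENGINE=InnoDB;
+		CREATE TABLE child(a INT, FOREIGN KEY(a)
+				   REFERENCES parent(pk)) ENGINE=InnoDB;
+
+	Here ALTER TABLE parent DROP COLUMN pk must fail with
+	ER_FK_COLUMN_CANNOT_DROP_CHILD, naming table child.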
*/ + for (dict_foreign_set::const_iterator it + = user_table->referenced_set.begin(); + it != user_table->referenced_set.end(); + ++it) { + + foreign = *it; + + if (innobase_dropping_foreign(foreign, drop_fk, n_drop_fk)) { + continue; + } + + for (unsigned f = 0; f < foreign->n_fields; f++) { + char display_name[FN_REFLEN]; + + if (strcmp(foreign->referenced_col_names[f], + col_name)) { + continue; + } + + char* buf_end = innobase_convert_name( + display_name, (sizeof display_name) - 1, + foreign->foreign_table_name, + strlen(foreign->foreign_table_name), + NULL); + *buf_end = '\0'; + my_error(ER_FK_COLUMN_CANNOT_DROP_CHILD, + MYF(0), col_name, foreign->id, + display_name); + + return(true); + } + } + + return(false); +} + +/** Determines if an InnoDB FOREIGN KEY constraint depends on a +column that is being dropped or modified to NOT NULL. +@param ha_alter_info Data used during in-place alter +@param altered_table MySQL table that is being altered +@param old_table MySQL table as it is before the ALTER operation +@param user_table InnoDB table as it is before the ALTER operation +@param drop_fk constraints being dropped +@param n_drop_fk number of constraints that are being dropped +@retval true Not allowed (will call my_error()) +@retval false Allowed +*/ +MY_ATTRIBUTE((pure, nonnull(1,2,3), warn_unused_result)) +static +bool +innobase_check_foreigns( + Alter_inplace_info* ha_alter_info, + const TABLE* old_table, + const dict_table_t* user_table, + dict_foreign_t** drop_fk, + ulint n_drop_fk) +{ + for (Field** fp = old_table->field; *fp; fp++) { + ut_ad(!(*fp)->real_maybe_null() + == !!((*fp)->flags & NOT_NULL_FLAG)); + + auto end = ha_alter_info->alter_info->create_list.end(); + auto it = std::find_if( + ha_alter_info->alter_info->create_list.begin(), end, + [fp](const Create_field& field) { + return field.field == *fp; + }); + + if (it == end || (it->flags & NOT_NULL_FLAG)) { + if (innobase_check_foreigns_low( + user_table, drop_fk, n_drop_fk, + (*fp)->field_name.str, it == end)) { + return(true); + } + } + } + + return(false); +} + +/** Convert a default value for ADD COLUMN. +@param[in,out] heap Memory heap where allocated +@param[out] dfield InnoDB data field to copy to +@param[in] field MySQL value for the column +@param[in] old_field Old column if altering; NULL for ADD COLUMN +@param[in] comp nonzero if in compact format. */ +static void innobase_build_col_map_add( + mem_heap_t* heap, + dfield_t* dfield, + const Field* field, + const Field* old_field, + ulint comp) +{ + if (old_field && old_field->real_maybe_null() + && field->real_maybe_null()) { + return; + } + + if (field->is_real_null()) { + dfield_set_null(dfield); + return; + } + + const Field& from = old_field ? *old_field : *field; + ulint size = from.pack_length(); + + byte* buf = static_cast(mem_heap_alloc(heap, size)); + + row_mysql_store_col_in_innobase_format( + dfield, buf, true, from.ptr, size, comp); +} + +/** Construct the translation table for reordering, dropping or +adding columns. 
+
+@param ha_alter_info	Data used during in-place alter
+@param altered_table	MySQL table that is being altered
+@param table		MySQL table as it is before the ALTER operation
+@param new_table	InnoDB table corresponding to MySQL altered_table
+@param old_table	InnoDB table corresponding to MySQL table
+@param defaults		Default values for ADD COLUMN, or NULL if no ADD COLUMN
+@param heap		Memory heap where allocated
+@return array of integers, mapping column numbers in the table
+to column numbers in altered_table */
+static MY_ATTRIBUTE((nonnull(1,2,3,4,5,7), warn_unused_result))
+const ulint*
+innobase_build_col_map(
+/*===================*/
+	Alter_inplace_info*	ha_alter_info,
+	const TABLE*		altered_table,
+	const TABLE*		table,
+	dict_table_t*		new_table,
+	const dict_table_t*	old_table,
+	dtuple_t*		defaults,
+	mem_heap_t*		heap)
+{
+	DBUG_ENTER("innobase_build_col_map");
+	DBUG_ASSERT(altered_table != table);
+	DBUG_ASSERT(new_table != old_table);
+	DBUG_ASSERT(dict_table_get_n_cols(new_table)
+		    + dict_table_get_n_v_cols(new_table)
+		    >= altered_table->s->fields + DATA_N_SYS_COLS);
+	DBUG_ASSERT(dict_table_get_n_cols(old_table)
+		    + dict_table_get_n_v_cols(old_table)
+		    >= table->s->fields + DATA_N_SYS_COLS
+		    || ha_innobase::omits_virtual_cols(*table->s));
+	DBUG_ASSERT(!!defaults == !!(ha_alter_info->handler_flags
+				     & INNOBASE_DEFAULTS));
+	DBUG_ASSERT(!defaults || dtuple_get_n_fields(defaults)
+		    == dict_table_get_n_cols(new_table));
+
+	const uint old_n_v_cols = uint(table->s->fields
+				       - table->s->stored_fields);
+	DBUG_ASSERT(old_n_v_cols == old_table->n_v_cols
+		    || table->s->frm_version < FRM_VER_EXPRESSSIONS);
+	DBUG_ASSERT(!old_n_v_cols || table->s->virtual_fields);
+
+	ulint*	col_map = static_cast<ulint*>(
+		mem_heap_alloc(
+			heap, (size_t(old_table->n_cols) + old_n_v_cols)
+			* sizeof *col_map));
+
+	uint	i = 0;
+	uint	num_v = 0;
+
+	/* Any dropped columns will map to ULINT_UNDEFINED.
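+	For example (hypothetical mapping): if the old table stores
+	columns (a, b, c) and the ALTER drops b, the first entries
+	become col_map = {0, ULINT_UNDEFINED, 1}; only a and c have
+	counterparts in the rebuilt table.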
*/ + for (uint old_i = 0; old_i + DATA_N_SYS_COLS < old_table->n_cols; + old_i++) { + col_map[old_i] = ULINT_UNDEFINED; + } + + for (uint old_i = 0; old_i < old_n_v_cols; old_i++) { + col_map[old_i + old_table->n_cols] = ULINT_UNDEFINED; + } + + const bool omits_virtual = ha_innobase::omits_virtual_cols(*table->s); + + for (const Create_field& new_field : + ha_alter_info->alter_info->create_list) { + bool is_v = !new_field.stored_in_db(); + ulint num_old_v = 0; + + for (uint old_i = 0; table->field[old_i]; old_i++) { + const Field* field = table->field[old_i]; + if (!field->stored_in_db()) { + if (is_v && new_field.field == field) { + if (!omits_virtual) { + col_map[old_table->n_cols + + num_v] + = num_old_v; + } + num_old_v++; + goto found_col; + } + num_old_v++; + continue; + } + + if (new_field.field == field) { + + const Field* altered_field = + altered_table->field[i + num_v]; + + if (defaults) { + innobase_build_col_map_add( + heap, + dtuple_get_nth_field( + defaults, i), + altered_field, + field, + dict_table_is_comp( + new_table)); + } + + col_map[old_i - num_old_v] = i; + if (!old_table->versioned() + || !altered_table->versioned()) { + } else if (old_i == old_table->vers_start) { + new_table->vers_start = (i + num_v) + & dict_index_t::MAX_N_FIELDS; + } else if (old_i == old_table->vers_end) { + new_table->vers_end = (i + num_v) + & dict_index_t::MAX_N_FIELDS; + } + goto found_col; + } + } + + if (!is_v) { + innobase_build_col_map_add( + heap, dtuple_get_nth_field(defaults, i), + altered_table->field[i + num_v], + NULL, + dict_table_is_comp(new_table)); + } +found_col: + if (is_v) { + num_v++; + } else { + i++; + } + } + + DBUG_ASSERT(i == altered_table->s->fields - num_v); + + i = table->s->fields - old_n_v_cols; + + /* Add the InnoDB hidden FTS_DOC_ID column, if any. */ + if (i + DATA_N_SYS_COLS < old_table->n_cols) { + /* There should be exactly one extra field, + the FTS_DOC_ID. 
+		*/
+		DBUG_ASSERT(DICT_TF2_FLAG_IS_SET(old_table,
+						 DICT_TF2_FTS_HAS_DOC_ID));
+		DBUG_ASSERT(i + DATA_N_SYS_COLS + 1 == old_table->n_cols);
+		DBUG_ASSERT(!strcmp(dict_table_get_col_name(
+					    old_table, i),
+				    FTS_DOC_ID_COL_NAME));
+		if (altered_table->s->fields + DATA_N_SYS_COLS
+		    - new_table->n_v_cols
+		    < new_table->n_cols) {
+			DBUG_ASSERT(DICT_TF2_FLAG_IS_SET(
+					    new_table,
+					    DICT_TF2_FTS_HAS_DOC_ID));
+			DBUG_ASSERT(altered_table->s->fields
+				    + DATA_N_SYS_COLS + 1
+				    == static_cast<ulint>(
+					    new_table->n_cols
+					    + new_table->n_v_cols));
+			col_map[i] = altered_table->s->fields
+				     - new_table->n_v_cols;
+		} else {
+			DBUG_ASSERT(!DICT_TF2_FLAG_IS_SET(
+					    new_table,
+					    DICT_TF2_FTS_HAS_DOC_ID));
+			col_map[i] = ULINT_UNDEFINED;
+		}
+
+		i++;
+	} else {
+		DBUG_ASSERT(!DICT_TF2_FLAG_IS_SET(
+				    old_table,
+				    DICT_TF2_FTS_HAS_DOC_ID));
+	}
+
+	for (; i < old_table->n_cols; i++) {
+		col_map[i] = i + new_table->n_cols - old_table->n_cols;
+	}
+
+	DBUG_RETURN(col_map);
+}
+
+/** Get the new non-virtual column names if any columns were renamed
+@param ha_alter_info	Data used during in-place alter
+@param altered_table	MySQL table that is being altered
+@param table		MySQL table as it is before the ALTER operation
+@param user_table	InnoDB table as it is before the ALTER operation
+@param heap		Memory heap for the allocation
+@return array of new column names in rebuilt_table, or NULL if not renamed */
+static MY_ATTRIBUTE((nonnull, warn_unused_result))
+const char**
+innobase_get_col_names(
+	Alter_inplace_info*	ha_alter_info,
+	const TABLE*		altered_table,
+	const TABLE*		table,
+	const dict_table_t*	user_table,
+	mem_heap_t*		heap)
+{
+	const char**		cols;
+	uint			i;
+
+	DBUG_ENTER("innobase_get_col_names");
+	DBUG_ASSERT(user_table->n_t_def > table->s->fields);
+	DBUG_ASSERT(ha_alter_info->handler_flags
+		    & ALTER_COLUMN_NAME);
+
+	cols = static_cast<const char**>(
+		mem_heap_zalloc(heap, user_table->n_def * sizeof *cols));
+
+	i = 0;
+	for (const Create_field& new_field :
+	     ha_alter_info->alter_info->create_list) {
+		ulint	num_v = 0;
+		DBUG_ASSERT(i < altered_table->s->fields);
+
+		if (!new_field.stored_in_db()) {
+			continue;
+		}
+
+		for (uint old_i = 0; table->field[old_i]; old_i++) {
+			num_v += !table->field[old_i]->stored_in_db();
+
+			if (new_field.field == table->field[old_i]) {
+				cols[old_i - num_v] = new_field.field_name.str;
+				break;
+			}
+		}
+
+		i++;
+	}
+
+	/* Copy the internal column names. */
+	i = table->s->fields - user_table->n_v_def;
+	cols[i] = dict_table_get_col_name(user_table, i);
+
+	while (++i < user_table->n_def) {
+		cols[i] = cols[i - 1] + strlen(cols[i - 1]) + 1;
+	}
+
+	DBUG_RETURN(cols);
+}
+
+/** Check whether the column prefix is increased, decreased, or unchanged.
+@param[in]	new_prefix_len	new prefix length
+@param[in]	old_prefix_len	old prefix length
+@retval	1	prefix is increased
+@retval	0	prefix is unchanged
+@retval	-1	prefix is decreased */
+static inline
+lint
+innobase_pk_col_prefix_compare(
+	ulint	new_prefix_len,
+	ulint	old_prefix_len)
+{
+	ut_ad(new_prefix_len < COMPRESSED_REC_MAX_DATA_SIZE);
+	ut_ad(old_prefix_len < COMPRESSED_REC_MAX_DATA_SIZE);
+
+	if (new_prefix_len == old_prefix_len) {
+		return(0);
+	}
+
+	if (new_prefix_len == 0) {
+		new_prefix_len = ULINT_MAX;
+	}
+
+	if (old_prefix_len == 0) {
+		old_prefix_len = ULINT_MAX;
+	}
+
+	if (new_prefix_len > old_prefix_len) {
+		return(1);
+	} else {
+		return(-1);
+	}
+}
+
+/** Check whether a column exists in the old table.
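+(This is a plain linear scan of col_map; an equivalent formulation,
+shown only for clarity, would be
+	std::find(col_map, col_map + col_map_size, new_col_no)
+		!= col_map + col_map_size
+with <algorithm> included.)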
+@param[in] new_col_no new column no +@param[in] col_map mapping of old column numbers to new ones +@param[in] col_map_size the column map size +@return true if the column is existing, otherwise false. */ +static inline +bool +innobase_pk_col_is_existing( + const ulint new_col_no, + const ulint* col_map, + const ulint col_map_size) +{ + for (ulint i = 0; i < col_map_size; i++) { + if (col_map[i] == new_col_no) { + return(true); + } + } + + return(false); +} + +/** Determine whether both the indexes have same set of primary key +fields arranged in the same order. + +Rules when we cannot skip sorting: +(1) Removing existing PK columns somewhere else than at the end of the PK; +(2) Adding existing columns to the PK, except at the end of the PK when no +columns are removed from the PK; +(3) Changing the order of existing PK columns; +(4) Decreasing the prefix length just like removing existing PK columns +follows rule(1), Increasing the prefix length just like adding existing +PK columns follows rule(2); +(5) Changing the ASC/DESC attribute of the existing PK columns. +@param[in] col_map mapping of old column numbers to new ones +@param[in] ha_alter_info Data used during in-place alter +@param[in] old_clust_index index to be compared +@param[in] new_clust_index index to be compared +@retval true if both indexes have same order. +@retval false. */ +static MY_ATTRIBUTE((warn_unused_result)) +bool +innobase_pk_order_preserved( + const ulint* col_map, + const dict_index_t* old_clust_index, + const dict_index_t* new_clust_index) +{ + ulint old_n_uniq + = dict_index_get_n_ordering_defined_by_user( + old_clust_index); + ulint new_n_uniq + = dict_index_get_n_ordering_defined_by_user( + new_clust_index); + + ut_ad(dict_index_is_clust(old_clust_index)); + ut_ad(dict_index_is_clust(new_clust_index)); + ut_ad(old_clust_index->table != new_clust_index->table); + ut_ad(col_map != NULL); + + if (old_n_uniq == 0) { + /* There was no PRIMARY KEY in the table. + If there is no PRIMARY KEY after the ALTER either, + no sorting is needed. */ + return(new_n_uniq == old_n_uniq); + } + + /* DROP PRIMARY KEY is only allowed in combination with + ADD PRIMARY KEY. */ + ut_ad(new_n_uniq > 0); + + /* The order of the last processed new_clust_index key field, + not counting ADD COLUMN, which are constant. */ + lint last_field_order = -1; + ulint existing_field_count = 0; + ulint old_n_cols = dict_table_get_n_cols(old_clust_index->table); + for (ulint new_field = 0; new_field < new_n_uniq; new_field++) { + ulint new_col_no = + new_clust_index->fields[new_field].col->ind; + + /* Check if there is a match in old primary key. */ + ulint old_field = 0; + while (old_field < old_n_uniq) { + ulint old_col_no = + old_clust_index->fields[old_field].col->ind; + + if (col_map[old_col_no] == new_col_no) { + break; + } + + old_field++; + } + + /* The order of key field in the new primary key. + 1. old PK column: idx in old primary key + 2. existing column: old_n_uniq + sequence no + 3. newly added column: no order */ + lint new_field_order; + const bool old_pk_column = old_field < old_n_uniq; + + if (old_pk_column) { + new_field_order = lint(old_field); + } else if (innobase_pk_col_is_existing(new_col_no, col_map, + old_n_cols) + || new_clust_index->table->persistent_autoinc + == new_field + 1) { + /* Adding an existing column or an AUTO_INCREMENT + column may change the existing ordering. */ + new_field_order = lint(old_n_uniq + + existing_field_count++); + } else { + /* Skip newly added column. 
*/ + continue; + } + + if (last_field_order + 1 != new_field_order) { + /* Old PK order is not kept, or existing column + is not added at the end of old PK. */ + return(false); + } + + last_field_order = new_field_order; + + if (!old_pk_column) { + continue; + } + + const dict_field_t &of = old_clust_index->fields[old_field]; + const dict_field_t &nf = new_clust_index->fields[new_field]; + + if (of.descending != nf.descending) { + return false; + } + + /* Check prefix length change. */ + const lint prefix_change = innobase_pk_col_prefix_compare( + nf.prefix_len, of.prefix_len); + + if (prefix_change < 0) { + /* If a column's prefix length is decreased, it should + be the last old PK column in new PK. + Note: we set last_field_order to -2, so that if there + are any old PK colmns or existing columns after it in + new PK, the comparison to new_field_order will fail in + the next round.*/ + last_field_order = -2; + } else if (prefix_change > 0) { + /* If a column's prefix length is increased, it should + be the last PK column in old PK. */ + if (old_field != old_n_uniq - 1) { + return(false); + } + } + } + + return(true); +} + +/** Update the mtype from DATA_BLOB to DATA_GEOMETRY for a specified +GIS column of a table. This is used when we want to create spatial index +on legacy GIS columns coming from 5.6, where we store GIS data as DATA_BLOB +in innodb layer. +@param[in] table_id table id +@param[in] col_name column name +@param[in] trx data dictionary transaction +@retval true Failure +@retval false Success */ +static +bool +innobase_update_gis_column_type( + table_id_t table_id, + const char* col_name, + trx_t* trx) +{ + pars_info_t* info; + dberr_t error; + + DBUG_ENTER("innobase_update_gis_column_type"); + + DBUG_ASSERT(trx->dict_operation); + ut_ad(trx->dict_operation_lock_mode); + ut_ad(dict_sys.locked()); + + info = pars_info_create(); + + pars_info_add_ull_literal(info, "tableid", table_id); + pars_info_add_str_literal(info, "name", col_name); + pars_info_add_int4_literal(info, "mtype", DATA_GEOMETRY); + + trx->op_info = "update column type to DATA_GEOMETRY"; + + error = que_eval_sql( + info, + "PROCEDURE UPDATE_SYS_COLUMNS_PROC () IS\n" + "BEGIN\n" + "UPDATE SYS_COLUMNS SET MTYPE=:mtype\n" + "WHERE TABLE_ID=:tableid AND NAME=:name;\n" + "END;\n", trx); + + trx->error_state = DB_SUCCESS; + trx->op_info = ""; + + DBUG_RETURN(error != DB_SUCCESS); +} + +/** Check if we are creating spatial indexes on GIS columns, which are +legacy columns from earlier MySQL, such as 5.6. If so, we have to update +the mtypes of the old GIS columns to DATA_GEOMETRY. +In 5.6, we store GIS columns as DATA_BLOB in InnoDB layer, it will introduce +confusion when we run latest server on older data. That's why we need to +do the upgrade. 
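+For example (hypothetical upgrade scenario): a POINT column created by
+MySQL 5.6 is recorded in SYS_COLUMNS with MTYPE=DATA_BLOB; before a
+spatial index is built on it, the row must be rewritten with
+MTYPE=DATA_GEOMETRY by innobase_update_gis_column_type() below.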
+@param[in]	ha_alter_info	Data used during in-place alter
+@param[in]	table		Table on which we want to add indexes
+@param[in]	trx		Transaction
+@return DB_SUCCESS if the update succeeded or no columns needed updating,
+otherwise DB_ERROR, which means the mtype could not be updated for some
+column, and creating a spatial index on it would be dangerous */
+static
+dberr_t
+innobase_check_gis_columns(
+	Alter_inplace_info*	ha_alter_info,
+	dict_table_t*		table,
+	trx_t*			trx)
+{
+	DBUG_ENTER("innobase_check_gis_columns");
+
+	for (uint key_num = 0;
+	     key_num < ha_alter_info->index_add_count;
+	     key_num++) {
+
+		const KEY&	key = ha_alter_info->key_info_buffer[
+			ha_alter_info->index_add_buffer[key_num]];
+
+		if (!(key.flags & HA_SPATIAL)) {
+			continue;
+		}
+
+		ut_ad(key.user_defined_key_parts == 1);
+		const KEY_PART_INFO&	key_part = key.key_part[0];
+
+		/* Spatial indexes on virtual columns are not supported. */
+		if (!key_part.field->stored_in_db()) {
+			DBUG_RETURN(DB_UNSUPPORTED);
+		}
+
+		ulint col_nr = dict_table_has_column(
+			table,
+			key_part.field->field_name.str,
+			key_part.fieldnr);
+		ut_ad(col_nr != table->n_def);
+		dict_col_t*	col = &table->cols[col_nr];
+
+		if (col->mtype != DATA_BLOB) {
+			ut_ad(DATA_GEOMETRY_MTYPE(col->mtype));
+			continue;
+		}
+
+		const char* col_name = dict_table_get_col_name(
+			table, col_nr);
+
+		if (innobase_update_gis_column_type(
+			    table->id, col_name, trx)) {
+
+			DBUG_RETURN(DB_ERROR);
+		} else {
+			col->mtype = DATA_GEOMETRY;
+
+			ib::info() << "Updated mtype of column " << col_name
+				<< " in table " << table->name
+				<< ", whose id is " << table->id
+				<< " to DATA_GEOMETRY";
+		}
+	}
+
+	DBUG_RETURN(DB_SUCCESS);
+}
+
+/** Collect virtual column info for its addition
+@param[in] ha_alter_info	Data used during in-place alter
+@param[in] altered_table	MySQL table that is being altered
+@param[in] table		MySQL table as it is before the ALTER operation
+@retval true Failure
+@retval false Success */
+static
+bool
+prepare_inplace_add_virtual(
+	Alter_inplace_info*	ha_alter_info,
+	const TABLE*		altered_table,
+	const TABLE*		table)
+{
+	ha_innobase_inplace_ctx*	ctx;
+	uint16_t i = 0;
+
+	ctx = static_cast<ha_innobase_inplace_ctx*>
+		(ha_alter_info->handler_ctx);
+
+	unsigned j = altered_table->s->virtual_fields + ctx->num_to_drop_vcol;
+
+	ctx->add_vcol = static_cast<dict_v_col_t*>(
+		mem_heap_zalloc(ctx->heap, j * sizeof *ctx->add_vcol));
+	ctx->add_vcol_name = static_cast<const char**>(
+		mem_heap_alloc(ctx->heap, j * sizeof *ctx->add_vcol_name));
+
+	j = 0;
+
+	for (const Create_field& new_field :
+	     ha_alter_info->alter_info->create_list) {
+		const Field* field = altered_table->field[i++];
+
+		if (new_field.field || field->stored_in_db()) {
+			continue;
+		}
+
+		unsigned is_unsigned;
+		auto col_type = get_innobase_type_from_mysql_type(
+			&is_unsigned, field);
+
+		auto col_len = field->pack_length();
+		unsigned field_type = field->type() | is_unsigned;
+
+		if (!field->real_maybe_null()) {
+			field_type |= DATA_NOT_NULL;
+		}
+
+		if (field->binary()) {
+			field_type |= DATA_BINARY_TYPE;
+		}
+
+		unsigned charset_no;
+
+		if (dtype_is_string_type(col_type)) {
+			charset_no = field->charset()->number;
+
+			DBUG_EXECUTE_IF(
+				"ib_alter_add_virtual_fail",
+				charset_no += MAX_CHAR_COLL_NUM;);
+
+			if (charset_no > MAX_CHAR_COLL_NUM) {
+				my_error(ER_WRONG_KEY_COLUMN, MYF(0), "InnoDB",
+					 field->field_name.str);
+				return(true);
+			}
+		} else {
+			charset_no = 0;
+		}
+
+		if (field->type() == MYSQL_TYPE_VARCHAR) {
+			uint32	length_bytes
+				= static_cast<const Field_varstring*>(
+					field)->length_bytes;
+
+			col_len -= length_bytes;
+
+			if (length_bytes == 2) {
+				field_type |=
+					DATA_LONG_TRUE_VARCHAR;
+			}
+		}
+
+		new (&ctx->add_vcol[j]) dict_v_col_t();
+		ctx->add_vcol[j].m_col.prtype = dtype_form_prtype(
+						field_type, charset_no);
+
+		ctx->add_vcol[j].m_col.prtype |= DATA_VIRTUAL;
+
+		ctx->add_vcol[j].m_col.mtype = col_type;
+
+		ctx->add_vcol[j].m_col.len = static_cast<uint16_t>(col_len);
+
+		ctx->add_vcol[j].m_col.ind = (i - 1)
+			& dict_index_t::MAX_N_FIELDS;
+		ctx->add_vcol[j].num_base = 0;
+		ctx->add_vcol_name[j] = field->field_name.str;
+		ctx->add_vcol[j].base_col = NULL;
+		ctx->add_vcol[j].v_pos = (ctx->old_table->n_v_cols
+					  - ctx->num_to_drop_vcol + j)
+			& dict_index_t::MAX_N_FIELDS;
+
+		/* MDEV-17468: Do this on ctx->instant_table later */
+		innodb_base_col_setup(ctx->old_table, field, &ctx->add_vcol[j]);
+		j++;
+	}
+
+	ctx->num_to_add_vcol = j;
+	return(false);
+}
+
+/** Collect virtual column info for its removal
+@param[in] ha_alter_info	Data used during in-place alter
+@param[in] table		MySQL table as it is before the ALTER operation
+@retval true Failure
+@retval false Success */
+static
+bool
+prepare_inplace_drop_virtual(
+	Alter_inplace_info*	ha_alter_info,
+	const TABLE*		table)
+{
+	ha_innobase_inplace_ctx*	ctx;
+	unsigned i = 0, j = 0;
+
+	ctx = static_cast<ha_innobase_inplace_ctx*>
+		(ha_alter_info->handler_ctx);
+
+	ctx->num_to_drop_vcol = 0;
+	for (i = 0; table->field[i]; i++) {
+		const Field* field = table->field[i];
+		if (field->flags & FIELD_IS_DROPPED && !field->stored_in_db()) {
+			ctx->num_to_drop_vcol++;
+		}
+	}
+
+	ctx->drop_vcol = static_cast<dict_v_col_t*>(
+		mem_heap_alloc(ctx->heap, ctx->num_to_drop_vcol
+			       * sizeof *ctx->drop_vcol));
+	ctx->drop_vcol_name = static_cast<const char**>(
+		mem_heap_alloc(ctx->heap, ctx->num_to_drop_vcol
+			       * sizeof *ctx->drop_vcol_name));
+
+	for (i = 0; table->field[i]; i++) {
+		Field *field = table->field[i];
+		if (!(field->flags & FIELD_IS_DROPPED) || field->stored_in_db()) {
+			continue;
+		}
+
+		unsigned is_unsigned;
+
+		auto col_type = get_innobase_type_from_mysql_type(
+			&is_unsigned, field);
+
+		auto col_len = field->pack_length();
+		unsigned field_type = field->type() | is_unsigned;
+
+		if (!field->real_maybe_null()) {
+			field_type |= DATA_NOT_NULL;
+		}
+
+		if (field->binary()) {
+			field_type |= DATA_BINARY_TYPE;
+		}
+
+		unsigned charset_no = 0;
+
+		if (dtype_is_string_type(col_type)) {
+			charset_no = field->charset()->number;
+
+			DBUG_EXECUTE_IF(
+				"ib_alter_add_virtual_fail",
+				charset_no += MAX_CHAR_COLL_NUM;);
+
+			if (charset_no > MAX_CHAR_COLL_NUM) {
+				my_error(ER_WRONG_KEY_COLUMN, MYF(0), "InnoDB",
+					 field->field_name.str);
+				return(true);
+			}
+		} else {
+			charset_no = 0;
+		}
+
+		if (field->type() == MYSQL_TYPE_VARCHAR) {
+			uint32	length_bytes
+				= static_cast<const Field_varstring*>(
+					field)->length_bytes;
+
+			col_len -= length_bytes;
+
+			if (length_bytes == 2) {
+				field_type |= DATA_LONG_TRUE_VARCHAR;
+			}
+		}
+
+		ctx->drop_vcol[j].m_col.prtype = dtype_form_prtype(
+						field_type, charset_no);
+
+		ctx->drop_vcol[j].m_col.prtype |= DATA_VIRTUAL;
+
+		ctx->drop_vcol[j].m_col.mtype = col_type;
+
+		ctx->drop_vcol[j].m_col.len = static_cast<uint16_t>(col_len);
+
+		ctx->drop_vcol[j].m_col.ind = i & dict_index_t::MAX_N_FIELDS;
+
+		ctx->drop_vcol_name[j] = field->field_name.str;
+
+		dict_v_col_t*	v_col = dict_table_get_nth_v_col_mysql(
+					ctx->old_table, i);
+		ctx->drop_vcol[j].v_pos = v_col->v_pos;
+		j++;
+	}
+
+	return(false);
+}
+
+/** Insert a new record into SYS_VIRTUAL
+@param[in]	table		InnoDB table
+@param[in]	pos		virtual column number
+@param[in]	base_pos	base column position
+@param[in]	trx		transaction
+@retval	false	on success
+@retval	true	on failure (my_error() will have been
called) */ +static bool innobase_insert_sys_virtual( + const dict_table_t* table, + ulint pos, + ulint base_pos, + trx_t* trx) +{ + pars_info_t* info = pars_info_create(); + pars_info_add_ull_literal(info, "id", table->id); + pars_info_add_int4_literal(info, "pos", pos); + pars_info_add_int4_literal(info, "base_pos", base_pos); + + if (DB_SUCCESS != que_eval_sql( + info, + "PROCEDURE P () IS\n" + "BEGIN\n" + "INSERT INTO SYS_VIRTUAL VALUES (:id, :pos, :base_pos);\n" + "END;\n", trx)) { + my_error(ER_INTERNAL_ERROR, MYF(0), + "InnoDB: ADD COLUMN...VIRTUAL"); + return true; + } + + return false; +} + +/** Insert a record to the SYS_COLUMNS dictionary table. +@param[in] table_id table id +@param[in] pos position of the column +@param[in] field_name field name +@param[in] mtype main type +@param[in] prtype precise type +@param[in] len fixed length in bytes, or 0 +@param[in] n_base number of base columns of virtual columns, or 0 +@param[in] update whether to update instead of inserting +@retval false on success +@retval true on failure (my_error() will have been called) */ +static bool innodb_insert_sys_columns( + table_id_t table_id, + ulint pos, + const char* field_name, + ulint mtype, + ulint prtype, + ulint len, + ulint n_base, + trx_t* trx, + bool update = false) +{ + pars_info_t* info = pars_info_create(); + pars_info_add_ull_literal(info, "id", table_id); + pars_info_add_int4_literal(info, "pos", pos); + pars_info_add_str_literal(info, "name", field_name); + pars_info_add_int4_literal(info, "mtype", mtype); + pars_info_add_int4_literal(info, "prtype", prtype); + pars_info_add_int4_literal(info, "len", len); + pars_info_add_int4_literal(info, "base", n_base); + + if (update) { + if (DB_SUCCESS != que_eval_sql( + info, + "PROCEDURE UPD_COL () IS\n" + "BEGIN\n" + "UPDATE SYS_COLUMNS SET\n" + "NAME=:name, MTYPE=:mtype, PRTYPE=:prtype, " + "LEN=:len, PREC=:base\n" + "WHERE TABLE_ID=:id AND POS=:pos;\n" + "END;\n", trx)) { + my_error(ER_INTERNAL_ERROR, MYF(0), + "InnoDB: Updating SYS_COLUMNS failed"); + return true; + } + + return false; + } + + if (DB_SUCCESS != que_eval_sql( + info, + "PROCEDURE ADD_COL () IS\n" + "BEGIN\n" + "INSERT INTO SYS_COLUMNS VALUES" + "(:id,:pos,:name,:mtype,:prtype,:len,:base);\n" + "END;\n", trx)) { + my_error(ER_INTERNAL_ERROR, MYF(0), + "InnoDB: Insert into SYS_COLUMNS failed"); + return true; + } + + return false; +} + +/** Update INNODB SYS_COLUMNS on new virtual columns +@param[in] table InnoDB table +@param[in] col_name column name +@param[in] vcol virtual column +@param[in] trx transaction +@retval false on success +@retval true on failure (my_error() will have been called) */ +static bool innobase_add_one_virtual( + const dict_table_t* table, + const char* col_name, + dict_v_col_t* vcol, + trx_t* trx) +{ + ulint pos = dict_create_v_col_pos(vcol->v_pos, + vcol->m_col.ind); + + if (innodb_insert_sys_columns(table->id, pos, col_name, + vcol->m_col.mtype, vcol->m_col.prtype, + vcol->m_col.len, vcol->num_base, trx)) { + return true; + } + + for (unsigned i = 0; i < vcol->num_base; i++) { + if (innobase_insert_sys_virtual( + table, pos, vcol->base_col[i]->ind, trx)) { + return true; + } + } + + return false; +} + +/** Update SYS_TABLES.N_COLS in the data dictionary. 
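+(N_COLS is an encoded value rather than a plain count: judging from the
+dict_table_encode_n_col() call in innobase_instant_try(), the low 16
+bits hold the number of stored user columns, bits 16..30 the number of
+virtual columns, and bit 31 is set for COMPACT or newer row formats.)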
+@param[in] user_table InnoDB table +@param[in] n the new value of SYS_TABLES.N_COLS +@param[in] trx transaction +@return whether the operation failed */ +static bool innodb_update_cols(const dict_table_t* table, ulint n, trx_t* trx) +{ + pars_info_t* info = pars_info_create(); + + pars_info_add_int4_literal(info, "n", n); + pars_info_add_ull_literal(info, "id", table->id); + + if (DB_SUCCESS != que_eval_sql(info, + "PROCEDURE UPDATE_N_COLS () IS\n" + "BEGIN\n" + "UPDATE SYS_TABLES SET N_COLS = :n" + " WHERE ID = :id;\n" + "END;\n", trx)) { + my_error(ER_INTERNAL_ERROR, MYF(0), + "InnoDB: Updating SYS_TABLES.N_COLS failed"); + return true; + } + + return false; +} + +/** Update system table for adding virtual column(s) +@param[in] ha_alter_info Data used during in-place alter +@param[in] user_table InnoDB table +@param[in] trx transaction +@retval true Failure +@retval false Success */ +static +bool +innobase_add_virtual_try( + const Alter_inplace_info* ha_alter_info, + const dict_table_t* user_table, + trx_t* trx) +{ + ha_innobase_inplace_ctx* ctx = static_cast( + ha_alter_info->handler_ctx); + + for (ulint i = 0; i < ctx->num_to_add_vcol; i++) { + if (innobase_add_one_virtual( + user_table, ctx->add_vcol_name[i], + &ctx->add_vcol[i], trx)) { + return true; + } + } + + return false; +} + +/** Delete metadata from SYS_COLUMNS and SYS_VIRTUAL. +@param[in] id table id +@param[in] pos first SYS_COLUMNS.POS +@param[in,out] trx data dictionary transaction +@retval true Failure +@retval false Success. */ +static bool innobase_instant_drop_cols(table_id_t id, ulint pos, trx_t* trx) +{ + pars_info_t* info = pars_info_create(); + pars_info_add_ull_literal(info, "id", id); + pars_info_add_int4_literal(info, "pos", pos); + + dberr_t err = que_eval_sql( + info, + "PROCEDURE DELETE_COL () IS\n" + "BEGIN\n" + "DELETE FROM SYS_COLUMNS WHERE\n" + "TABLE_ID = :id AND POS >= :pos;\n" + "DELETE FROM SYS_VIRTUAL WHERE TABLE_ID = :id;\n" + "END;\n", trx); + if (err != DB_SUCCESS) { + my_error(ER_INTERNAL_ERROR, MYF(0), + "InnoDB: DELETE from SYS_COLUMNS/SYS_VIRTUAL failed"); + return true; + } + + return false; +} + +/** Update INNODB SYS_COLUMNS on new virtual column's position +@param[in] table InnoDB table +@param[in] old_pos old position +@param[in] new_pos new position +@param[in] trx transaction +@return DB_SUCCESS if successful, otherwise error code */ +static +dberr_t +innobase_update_v_pos_sys_columns( + const dict_table_t* table, + ulint old_pos, + ulint new_pos, + trx_t* trx) +{ + pars_info_t* info = pars_info_create(); + + pars_info_add_int4_literal(info, "pos", old_pos); + pars_info_add_int4_literal(info, "val", new_pos); + pars_info_add_ull_literal(info, "id", table->id); + + dberr_t error = que_eval_sql( + info, + "PROCEDURE P () IS\n" + "BEGIN\n" + "UPDATE SYS_COLUMNS\n" + "SET POS = :val\n" + "WHERE POS = :pos\n" + "AND TABLE_ID = :id;\n" + "END;\n", trx); + + return(error); +} + +/** Update INNODB SYS_VIRTUAL table with new virtual column position +@param[in] table InnoDB table +@param[in] old_pos old position +@param[in] new_pos new position +@param[in] trx transaction +@return DB_SUCCESS if successful, otherwise error code */ +static +dberr_t +innobase_update_v_pos_sys_virtual( + const dict_table_t* table, + ulint old_pos, + ulint new_pos, + trx_t* trx) +{ + pars_info_t* info = pars_info_create(); + + pars_info_add_int4_literal(info, "pos", old_pos); + pars_info_add_int4_literal(info, "val", new_pos); + pars_info_add_ull_literal(info, "id", table->id); + + dberr_t error = que_eval_sql( + 
info, + "PROCEDURE P () IS\n" + "BEGIN\n" + "UPDATE SYS_VIRTUAL\n" + "SET POS = :val\n" + "WHERE POS = :pos\n" + "AND TABLE_ID = :id;\n" + "END;\n", trx); + + return(error); +} + +/** Update InnoDB system tables on dropping a virtual column +@param[in] table InnoDB table +@param[in] col_name column name of the dropping column +@param[in] drop_col col information for the dropping column +@param[in] n_prev_dropped number of previously dropped columns in the + same alter clause +@param[in] trx transaction +@return DB_SUCCESS if successful, otherwise error code */ +static +dberr_t +innobase_drop_one_virtual_sys_columns( + const dict_table_t* table, + const char* col_name, + dict_col_t* drop_col, + ulint n_prev_dropped, + trx_t* trx) +{ + pars_info_t* info = pars_info_create(); + pars_info_add_ull_literal(info, "id", table->id); + + pars_info_add_str_literal(info, "name", col_name); + + dberr_t error = que_eval_sql( + info, + "PROCEDURE P () IS\n" + "BEGIN\n" + "DELETE FROM SYS_COLUMNS\n" + "WHERE TABLE_ID = :id\n" + "AND NAME = :name;\n" + "END;\n", trx); + + if (error != DB_SUCCESS) { + return(error); + } + + dict_v_col_t* v_col = dict_table_get_nth_v_col_mysql( + table, drop_col->ind); + + /* Adjust column positions for all subsequent columns */ + for (ulint i = v_col->v_pos + 1; i < table->n_v_cols; i++) { + dict_v_col_t* t_col = dict_table_get_nth_v_col(table, i); + ulint old_p = dict_create_v_col_pos( + t_col->v_pos - n_prev_dropped, + t_col->m_col.ind - n_prev_dropped); + ulint new_p = dict_create_v_col_pos( + t_col->v_pos - 1 - n_prev_dropped, + ulint(t_col->m_col.ind) - 1 - n_prev_dropped); + + error = innobase_update_v_pos_sys_columns( + table, old_p, new_p, trx); + if (error != DB_SUCCESS) { + return(error); + } + error = innobase_update_v_pos_sys_virtual( + table, old_p, new_p, trx); + if (error != DB_SUCCESS) { + return(error); + } + } + + return(error); +} + +/** Delete virtual column's info from INNODB SYS_VIRTUAL +@param[in] table InnoDB table +@param[in] pos position of the virtual column to be deleted +@param[in] trx transaction +@return DB_SUCCESS if successful, otherwise error code */ +static +dberr_t +innobase_drop_one_virtual_sys_virtual( + const dict_table_t* table, + ulint pos, + trx_t* trx) +{ + pars_info_t* info = pars_info_create(); + pars_info_add_ull_literal(info, "id", table->id); + + pars_info_add_int4_literal(info, "pos", pos); + + dberr_t error = que_eval_sql( + info, + "PROCEDURE P () IS\n" + "BEGIN\n" + "DELETE FROM SYS_VIRTUAL\n" + "WHERE TABLE_ID = :id\n" + "AND POS = :pos;\n" + "END;\n", trx); + + return(error); +} + +/** Update system table for dropping virtual column(s) +@param[in] ha_alter_info Data used during in-place alter +@param[in] user_table InnoDB table +@param[in] trx transaction +@retval true Failure +@retval false Success */ +static +bool +innobase_drop_virtual_try( + const Alter_inplace_info* ha_alter_info, + const dict_table_t* user_table, + trx_t* trx) +{ + ha_innobase_inplace_ctx* ctx; + dberr_t err = DB_SUCCESS; + + ctx = static_cast + (ha_alter_info->handler_ctx); + + for (unsigned i = 0; i < ctx->num_to_drop_vcol; i++) { + + ulint pos = dict_create_v_col_pos( + ctx->drop_vcol[i].v_pos - i, + ctx->drop_vcol[i].m_col.ind - i); + err = innobase_drop_one_virtual_sys_virtual( + user_table, pos, trx); + + if (err != DB_SUCCESS) { + my_error(ER_INTERNAL_ERROR, MYF(0), + "InnoDB: DROP COLUMN...VIRTUAL"); + return(true); + } + + err = innobase_drop_one_virtual_sys_columns( + user_table, ctx->drop_vcol_name[i], + &(ctx->drop_vcol[i].m_col), i, 
trx); + + if (err != DB_SUCCESS) { + my_error(ER_INTERNAL_ERROR, MYF(0), + "InnoDB: DROP COLUMN...VIRTUAL"); + return(true); + } + } + + return false; +} + +/** Serialise metadata of dropped or reordered columns. +@param[in,out] heap memory heap for allocation +@param[out] field data field with the metadata */ +inline +void dict_table_t::serialise_columns(mem_heap_t* heap, dfield_t* field) const +{ + DBUG_ASSERT(instant); + const dict_index_t& index = *UT_LIST_GET_FIRST(indexes); + unsigned n_fixed = index.first_user_field(); + unsigned num_non_pk_fields = index.n_fields - n_fixed; + + ulint len = 4 + num_non_pk_fields * 2; + + byte* data = static_cast(mem_heap_alloc(heap, len)); + + dfield_set_data(field, data, len); + + mach_write_to_4(data, num_non_pk_fields); + + data += 4; + + for (ulint i = n_fixed; i < index.n_fields; i++) { + mach_write_to_2(data, instant->field_map[i - n_fixed]); + data += 2; + } +} + +/** Construct the metadata record for instant ALTER TABLE. +@param[in] row dummy or default values for existing columns +@param[in,out] heap memory heap for allocations +@return metadata record */ +inline +dtuple_t* +dict_index_t::instant_metadata(const dtuple_t& row, mem_heap_t* heap) const +{ + ut_ad(is_primary()); + dtuple_t* entry; + + if (!table->instant) { + entry = row_build_index_entry(&row, NULL, this, heap); + entry->info_bits = REC_INFO_METADATA_ADD; + return entry; + } + + entry = dtuple_create(heap, n_fields + 1); + entry->n_fields_cmp = n_uniq; + entry->info_bits = REC_INFO_METADATA_ALTER; + + const dict_field_t* field = fields; + + for (uint i = 0; i <= n_fields; i++, field++) { + dfield_t* dfield = dtuple_get_nth_field(entry, i); + + if (i == first_user_field()) { + table->serialise_columns(heap, dfield); + dfield->type.metadata_blob_init(); + field--; + continue; + } + + ut_ad(!field->col->is_virtual()); + + if (field->col->is_dropped()) { + dict_col_copy_type(field->col, &dfield->type); + if (field->col->is_nullable()) { + dfield_set_null(dfield); + } else { + dfield_set_data(dfield, field_ref_zero, + field->fixed_len); + } + continue; + } + + const dfield_t* s = dtuple_get_nth_field(&row, field->col->ind); + ut_ad(dict_col_type_assert_equal(field->col, &s->type)); + *dfield = *s; + + if (dfield_is_null(dfield)) { + continue; + } + + if (dfield_is_ext(dfield)) { + ut_ad(i > first_user_field()); + ut_ad(!field->prefix_len); + ut_ad(dfield->len >= FIELD_REF_SIZE); + dfield_set_len(dfield, dfield->len - FIELD_REF_SIZE); + } + + if (!field->prefix_len) { + continue; + } + + ut_ad(field->col->ord_part); + ut_ad(i < n_uniq); + + ulint len = dtype_get_at_most_n_mbchars( + field->col->prtype, + field->col->mbminlen, field->col->mbmaxlen, + field->prefix_len, dfield->len, + static_cast(dfield_get_data(dfield))); + dfield_set_len(dfield, len); + } + + return entry; +} + +/** Insert or update SYS_COLUMNS and the hidden metadata record +for instant ALTER TABLE. 
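+(The hidden metadata record is the REC_INFO_METADATA_ADD or
+REC_INFO_METADATA_ALTER record built by dict_index_t::instant_metadata()
+above; it carries the defaults of instantly added columns and, for
+REC_INFO_METADATA_ALTER, the serialised map of dropped or reordered
+columns.)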
+@param[in] ha_alter_info ALTER TABLE context +@param[in,out] ctx ALTER TABLE context for the current partition +@param[in] altered_table MySQL table that is being altered +@param[in] table MySQL table as it is before the ALTER operation +@param[in,out] trx dictionary transaction +@retval true failure +@retval false success */ +static bool innobase_instant_try( + const Alter_inplace_info* ha_alter_info, + ha_innobase_inplace_ctx* ctx, + const TABLE* altered_table, + const TABLE* table, + trx_t* trx) +{ + DBUG_ASSERT(!ctx->need_rebuild()); + DBUG_ASSERT(ctx->is_instant()); + + dict_table_t* user_table = ctx->old_table; + + dict_index_t* index = dict_table_get_first_index(user_table); + const unsigned n_old_fields = index->n_fields; + const dict_col_t* old_cols = user_table->cols; + DBUG_ASSERT(user_table->n_cols == ctx->old_n_cols); + +#ifdef BTR_CUR_HASH_ADAPT + /* Acquire the ahi latch to avoid a race condition + between ahi access and instant alter table */ + srw_spin_lock* ahi_latch = btr_search_sys.get_latch(*index); + ahi_latch->wr_lock(SRW_LOCK_CALL); +#endif /* BTR_CUR_HASH_ADAPT */ + const bool metadata_changed = ctx->instant_column(); +#ifdef BTR_CUR_HASH_ADAPT + ahi_latch->wr_unlock(); +#endif /* BTR_CUR_HASH_ADAPT */ + + DBUG_ASSERT(index->n_fields >= n_old_fields); + /* The table may have been emptied and may have lost its + 'instantness' during this ALTER TABLE. */ + + /* Construct a table row of default values for the stored columns. */ + dtuple_t* row = dtuple_create(ctx->heap, user_table->n_cols); + dict_table_copy_types(row, user_table); + Field** af = altered_table->field; + Field** const end = altered_table->field + altered_table->s->fields; + ut_d(List_iterator_fast cf_it( + ha_alter_info->alter_info->create_list)); + if (ctx->first_alter_pos + && innobase_instant_drop_cols(user_table->id, + ctx->first_alter_pos - 1, trx)) { + return true; + } + for (uint i = 0; af < end; af++) { + if (!(*af)->stored_in_db()) { + ut_d(cf_it++); + continue; + } + + const dict_col_t* old = dict_table_t::find(old_cols, + ctx->col_map, + ctx->old_n_cols, i); + DBUG_ASSERT(!old || i >= ctx->old_n_cols - DATA_N_SYS_COLS + || old->ind == i + || (ctx->first_alter_pos + && old->ind >= ctx->first_alter_pos - 1)); + + dfield_t* d = dtuple_get_nth_field(row, i); + const dict_col_t* col = dict_table_get_nth_col(user_table, i); + DBUG_ASSERT(!col->is_virtual()); + DBUG_ASSERT(!col->is_dropped()); + DBUG_ASSERT(col->mtype != DATA_SYS); + DBUG_ASSERT(!strcmp((*af)->field_name.str, + dict_table_get_col_name(user_table, i))); + DBUG_ASSERT(old || col->is_added()); + + ut_d(const Create_field* new_field = cf_it++); + /* new_field->field would point to an existing column. + If it is NULL, the column was added by this ALTER TABLE. */ + ut_ad(!new_field->field == !old); + + if (col->is_added()) { + dfield_set_data(d, col->def_val.data, + col->def_val.len); + } else if ((*af)->real_maybe_null()) { + /* Store NULL for nullable 'core' columns. */ + dfield_set_null(d); + } else { + switch ((*af)->type()) { + case MYSQL_TYPE_VARCHAR: + case MYSQL_TYPE_GEOMETRY: + case MYSQL_TYPE_TINY_BLOB: + case MYSQL_TYPE_MEDIUM_BLOB: + case MYSQL_TYPE_BLOB: + case MYSQL_TYPE_LONG_BLOB: + variable_length: + /* Store the empty string for 'core' + variable-length NOT NULL columns. 
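+				An empty string is valid for any such
+				column and needs no type-specific
+				conversion, so field_ref_zero with
+				length 0 can stand in for all of them.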
*/ + dfield_set_data(d, field_ref_zero, 0); + break; + case MYSQL_TYPE_STRING: + if (col->mbminlen != col->mbmaxlen + && user_table->not_redundant()) { + goto variable_length; + } + /* fall through */ + default: + /* For fixed-length NOT NULL 'core' columns, + get a dummy default value from SQL. Note that + we will preserve the old values of these + columns when updating the metadata + record, to avoid unnecessary updates. */ + ulint len = (*af)->pack_length(); + DBUG_ASSERT(d->type.mtype != DATA_INT + || len <= 8); + row_mysql_store_col_in_innobase_format( + d, d->type.mtype == DATA_INT + ? static_cast<byte*>( + mem_heap_alloc(ctx->heap, len)) + : NULL, true, (*af)->ptr, len, + dict_table_is_comp(user_table)); + ut_ad(new_field->field->pack_length() == len); + } + } + + bool update = old && (!ctx->first_alter_pos + || i < ctx->first_alter_pos - 1); + DBUG_ASSERT(!old || col->same_format(*old)); + if (update + && old->prtype == d->type.prtype) { + /* The record is already present in SYS_COLUMNS. */ + } else if (innodb_insert_sys_columns(user_table->id, i, + (*af)->field_name.str, + d->type.mtype, + d->type.prtype, + d->type.len, 0, trx, + update)) { + return true; + } + + i++; + } + + if (innodb_update_cols(user_table, dict_table_encode_n_col( + unsigned(user_table->n_cols) + - DATA_N_SYS_COLS, + user_table->n_v_cols) + | (user_table->flags & DICT_TF_COMPACT) << 31, + trx)) { + return true; + } + + if (ctx->first_alter_pos) { +add_all_virtual: + for (uint i = 0; i < user_table->n_v_cols; i++) { + if (innobase_add_one_virtual( + user_table, + dict_table_get_v_col_name(user_table, i), + &user_table->v_cols[i], trx)) { + return true; + } + } + } else if (ha_alter_info->handler_flags & ALTER_DROP_VIRTUAL_COLUMN) { + if (innobase_instant_drop_cols(user_table->id, 65536, trx)) { + return true; + } + goto add_all_virtual; + } else if ((ha_alter_info->handler_flags & ALTER_ADD_VIRTUAL_COLUMN) + && innobase_add_virtual_try(ha_alter_info, user_table, + trx)) { + return true; + } + + if (!user_table->space) { + /* In case of ALTER TABLE...DISCARD TABLESPACE, + update only the metadata and transform the dictionary + cache entry to the canonical format. */ + index->clear_instant_alter(); + return false; + } + + unsigned i = unsigned(user_table->n_cols) - DATA_N_SYS_COLS; + DBUG_ASSERT(i >= altered_table->s->stored_fields); + DBUG_ASSERT(i <= altered_table->s->stored_fields + 1); + if (i > altered_table->s->fields) { + const dict_col_t& fts_doc_id = user_table->cols[i - 1]; + DBUG_ASSERT(!strcmp(fts_doc_id.name(*user_table), + FTS_DOC_ID_COL_NAME)); + DBUG_ASSERT(!fts_doc_id.is_nullable()); + DBUG_ASSERT(fts_doc_id.len == 8); + dfield_set_data(dtuple_get_nth_field(row, i - 1), + field_ref_zero, fts_doc_id.len); + } + byte trx_id[DATA_TRX_ID_LEN], roll_ptr[DATA_ROLL_PTR_LEN]; + dfield_set_data(dtuple_get_nth_field(row, i++), field_ref_zero, + DATA_ROW_ID_LEN); + dfield_set_data(dtuple_get_nth_field(row, i++), trx_id, sizeof trx_id); + dfield_set_data(dtuple_get_nth_field(row, i), roll_ptr, sizeof roll_ptr); + DBUG_ASSERT(i + 1 == user_table->n_cols); + + trx_write_trx_id(trx_id, trx->id); + /* The DB_ROLL_PTR will be assigned later, when allocating undo log. + Silence a Valgrind warning in dtuple_validate() when + row_ins_clust_index_entry_low() searches for the insert position.
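+(The DB_TRX_ID buffer was just filled above with trx_write_trx_id(trx_id, trx->id), so only the roll pointer of the metadata record remains unset at this point.)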
*/ + memset(roll_ptr, 0, sizeof roll_ptr); + + dtuple_t* entry = index->instant_metadata(*row, ctx->heap); + mtr_t mtr; + mtr.start(); + index->set_modified(mtr); + btr_pcur_t pcur; + dberr_t err= pcur.open_leaf(true, index, BTR_MODIFY_TREE, &mtr); + if (err != DB_SUCCESS) { +func_exit: + mtr.commit(); + + if (err != DB_SUCCESS) { + my_error_innodb(err, table->s->table_name.str, + user_table->flags); + return true; + } + return false; + } + ut_ad(btr_pcur_is_before_first_on_page(&pcur)); + + buf_block_t* block = btr_pcur_get_block(&pcur); + ut_ad(page_is_leaf(block->page.frame)); + ut_ad(!page_has_prev(block->page.frame)); + ut_ad(!buf_block_get_page_zip(block)); + const rec_t* rec = btr_pcur_move_to_next_on_page(&pcur); + if (UNIV_UNLIKELY(!rec)) { + err = DB_CORRUPTION; + goto func_exit; + } + + que_thr_t* thr = pars_complete_graph_for_exec( + NULL, trx, ctx->heap, NULL); + page_id_t id{block->page.id()}; + const bool is_root = id.page_no() == index->page; + + if (rec_is_metadata(rec, *index)) { + ut_ad(page_rec_is_user_rec(rec)); + if (is_root + && !rec_is_alter_metadata(rec, *index) + && !index->table->instant + && !page_has_next(block->page.frame) + && page_rec_is_last(rec, block->page.frame)) { + goto empty_table; + } + + if (!metadata_changed) { + goto func_exit; + } + + /* Ensure that the root page is in the correct format. */ + id.set_page_no(index->page); + buf_block_t* root = mtr.get_already_latched( + id, MTR_MEMO_PAGE_SX_FIX); + + if (UNIV_UNLIKELY(!root)) { + err = DB_CORRUPTION; + goto func_exit; + } + + if (fil_page_get_type(root->page.frame) + != FIL_PAGE_TYPE_INSTANT) { + DBUG_ASSERT("wrong page type" == 0); + err = DB_CORRUPTION; + goto func_exit; + } + + btr_set_instant(root, *index, &mtr); + + /* Extend the record with any added columns. */ + uint n = uint(index->n_fields) - n_old_fields; + /* Reserve room for DB_TRX_ID,DB_ROLL_PTR and any + non-updated off-page columns in case they are moved off + page as a result of the update. */ + const uint16_t f = user_table->instant != NULL; + upd_t* update = upd_create(index->n_fields + f, ctx->heap); + update->n_fields = n + f; + update->info_bits = f + ? 
REC_INFO_METADATA_ALTER + : REC_INFO_METADATA_ADD; + if (f) { + upd_field_t* uf = upd_get_nth_field(update, 0); + uf->field_no = index->first_user_field(); + uf->new_val = entry->fields[uf->field_no]; + DBUG_ASSERT(!dfield_is_ext(&uf->new_val)); + DBUG_ASSERT(!dfield_is_null(&uf->new_val)); + } + + /* Add the default values for instantly added columns */ + unsigned j = f; + + for (unsigned k = n_old_fields; k < index->n_fields; k++) { + upd_field_t* uf = upd_get_nth_field(update, j++); + uf->field_no = static_cast<uint16_t>(k + f); + uf->new_val = entry->fields[k + f]; + + ut_ad(j <= n + f); + } + + ut_ad(j == n + f); + + rec_offs* offsets = NULL; + mem_heap_t* offsets_heap = NULL; + big_rec_t* big_rec; + err = btr_cur_pessimistic_update( + BTR_NO_LOCKING_FLAG | BTR_KEEP_POS_FLAG, + btr_pcur_get_btr_cur(&pcur), + &offsets, &offsets_heap, ctx->heap, + &big_rec, update, UPD_NODE_NO_ORD_CHANGE, + thr, trx->id, &mtr); + if (err == DB_SUCCESS) { + offsets = rec_get_offsets( + btr_pcur_get_rec(&pcur), index, offsets, + index->n_core_fields, ULINT_UNDEFINED, + &offsets_heap); + } + + if (big_rec) { + if (err == DB_SUCCESS) { + err = btr_store_big_rec_extern_fields( + &pcur, offsets, big_rec, &mtr, + BTR_STORE_UPDATE); + } + + dtuple_big_rec_free(big_rec); + } + if (offsets_heap) { + mem_heap_free(offsets_heap); + } + ut_free(pcur.old_rec_buf); + goto func_exit; + } else if (is_root && page_rec_is_supremum(rec) + && !index->table->instant) { +empty_table: + /* The table is empty. */ + ut_ad(fil_page_index_page_check(block->page.frame)); + ut_ad(!page_has_siblings(block->page.frame)); + ut_ad(block->page.id().page_no() == index->page); + /* MDEV-17383: free metadata BLOBs! */ + btr_page_empty(block, NULL, index, 0, &mtr); + if (index->is_instant()) { + index->clear_instant_add(); + } + goto func_exit; + } else if (!user_table->is_instant()) { + ut_ad(!user_table->not_redundant()); + goto func_exit; + } + + /* Convert the table to the instant ALTER TABLE format. */ + mtr.commit(); + mtr.start(); + index->set_modified(mtr); + if (buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, &mtr, + &err)) { + if (fil_page_get_type(root->page.frame) != FIL_PAGE_INDEX) { + DBUG_ASSERT("wrong page type" == 0); + err = DB_CORRUPTION; + goto func_exit; + } + + btr_set_instant(root, *index, &mtr); + mtr.commit(); + mtr.start(); + index->set_modified(mtr); + err = row_ins_clust_index_entry_low( + BTR_NO_LOCKING_FLAG, BTR_MODIFY_TREE, index, + index->n_uniq, entry, 0, thr); + } + + goto func_exit; +} + +/** Adjust the create index column number from the "New table" to +the "old InnoDB table" while we are dropping virtual columns, since we +do not create a separate new table for dropping/adding virtual columns. +To correctly find the indexed column, we will need to find its col_no +in the "Old Table", not the "New table".
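+For example (hypothetical schema): if the old table has virtual columns (v1, v2, v3), this ALTER TABLE drops v2, and an index is being created on v3, then v3 has virtual position 1 in the new column list but position 2 in the old table, so its col_no must be adjusted from 1 to 2.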
+@param[in] ha_alter_info Data used during in-place alter +@param[in] old_table MySQL table as it is before the ALTER operation +@param[in] num_v_dropped number of virtual columns dropped +@param[in,out] index_def index definition */ +static +void +innodb_v_adjust_idx_col( + const Alter_inplace_info* ha_alter_info, + const TABLE* old_table, + ulint num_v_dropped, + index_def_t* index_def) +{ + for (ulint i = 0; i < index_def->n_fields; i++) { +#ifdef UNIV_DEBUG + bool col_found = false; +#endif /* UNIV_DEBUG */ + ulint num_v = 0; + + index_field_t* index_field = &index_def->fields[i]; + + /* Only adjust virtual column col_no, since a non-virtual + column position (in the non-vcol list) won't change unless + the table is rebuilt */ + if (!index_field->is_v_col) { + continue; + } + + const Field* field = NULL; + + /* Found the field in the new table */ + for (const Create_field& new_field : + ha_alter_info->alter_info->create_list) { + if (new_field.stored_in_db()) { + continue; + } + + field = new_field.field; + + if (num_v == index_field->col_no) { + break; + } + num_v++; + } + + if (!field) { + /* This means the field is a newly added field; + this should have been blocked when we drop a + virtual column at the same time */ + ut_ad(num_v_dropped > 0); + ut_a(0); + } + + ut_ad(!field->stored_in_db()); + + num_v = 0; + + /* Look for its position in old table */ + for (uint old_i = 0; old_table->field[old_i]; old_i++) { + if (old_table->field[old_i] == field) { + /* Found it, adjust its col_no to its position + in old table */ + index_def->fields[i].col_no = num_v; + ut_d(col_found = true); + break; + } + + num_v += !old_table->field[old_i]->stored_in_db(); + } + + ut_ad(col_found); + } +} + +/** Create index metadata in the data dictionary. +@param[in,out] trx dictionary transaction +@param[in,out] index index being created +@param[in] mode encryption mode (for creating a table) +@param[in] key_id encryption key identifier (for creating a table) +@param[in] add_v virtual columns that are being added, or NULL +@return the created index */ +MY_ATTRIBUTE((nonnull(1,2), warn_unused_result)) +static +dict_index_t* +create_index_dict( + trx_t* trx, + dict_index_t* index, + fil_encryption_t mode, + uint32_t key_id, + const dict_add_v_col_t* add_v) +{ + DBUG_ENTER("create_index_dict"); + + mem_heap_t* heap = mem_heap_create(512); + ind_node_t* node = ind_create_graph_create( + index, index->table->name.m_name, heap, mode, key_id, add_v); + que_thr_t* thr = pars_complete_graph_for_exec(node, trx, heap, NULL); + + que_fork_start_command( + static_cast<que_fork_t*>(que_node_get_parent(thr))); + + que_run_threads(thr); + + DBUG_ASSERT(trx->error_state != DB_SUCCESS || index != node->index); + DBUG_ASSERT(trx->error_state != DB_SUCCESS || node->index); + index = node->index; + + que_graph_free((que_t*) que_node_get_parent(thr)); + + DBUG_RETURN(index); +} + +/** Update internal structures with concurrent writes blocked, +while preparing ALTER TABLE. + +@param ha_alter_info Data used during in-place alter +@param altered_table MySQL table that is being altered +@param old_table MySQL table as it is before the ALTER operation +@param table_name Table name in MySQL +@param flags Table and tablespace flags +@param flags2 Additional table flags +@param fts_doc_id_col The column number of FTS_DOC_ID +@param add_fts_doc_id Flag: add column FTS_DOC_ID? +@param add_fts_doc_id_idx Flag: add index FTS_DOC_ID_INDEX (FTS_DOC_ID)?
+ +@retval true Failure +@retval false Success +*/ +static MY_ATTRIBUTE((warn_unused_result, nonnull(1,2,3,4))) +bool +prepare_inplace_alter_table_dict( +/*=============================*/ + Alter_inplace_info* ha_alter_info, + const TABLE* altered_table, + const TABLE* old_table, + const char* table_name, + ulint flags, + ulint flags2, + ulint fts_doc_id_col, + bool add_fts_doc_id, + bool add_fts_doc_id_idx) +{ + bool dict_locked = false; + ulint* add_key_nums; /* MySQL key numbers */ + index_def_t* index_defs; /* index definitions */ + dict_table_t* user_table; + dict_index_t* fts_index = NULL; + bool new_clustered = false; + dberr_t error = DB_SUCCESS; + ulint num_fts_index; + dict_add_v_col_t* add_v = NULL; + ha_innobase_inplace_ctx*ctx; + + DBUG_ENTER("prepare_inplace_alter_table_dict"); + + ctx = static_cast<ha_innobase_inplace_ctx*> + (ha_alter_info->handler_ctx); + + DBUG_ASSERT((ctx->add_autoinc != ULINT_UNDEFINED) + == (ctx->sequence.max_value() > 0)); + DBUG_ASSERT(!ctx->num_to_drop_index == !ctx->drop_index); + DBUG_ASSERT(!ctx->num_to_drop_fk == !ctx->drop_fk); + DBUG_ASSERT(!add_fts_doc_id || add_fts_doc_id_idx); + DBUG_ASSERT(!add_fts_doc_id_idx + || innobase_fulltext_exist(altered_table)); + DBUG_ASSERT(!ctx->defaults); + DBUG_ASSERT(!ctx->add_index); + DBUG_ASSERT(!ctx->add_key_numbers); + DBUG_ASSERT(!ctx->num_to_add_index); + + user_table = ctx->new_table; + + switch (ha_alter_info->inplace_supported) { + default: break; + case HA_ALTER_INPLACE_INSTANT: + case HA_ALTER_INPLACE_NOCOPY_LOCK: + case HA_ALTER_INPLACE_NOCOPY_NO_LOCK: + /* If we promised ALGORITHM=NOCOPY or ALGORITHM=INSTANT, + we must retain the original ROW_FORMAT of the table. */ + flags = (user_table->flags & (DICT_TF_MASK_COMPACT + | DICT_TF_MASK_ZIP_SSIZE + | DICT_TF_MASK_ATOMIC_BLOBS)) + | (flags & ~(DICT_TF_MASK_COMPACT + | DICT_TF_MASK_ZIP_SSIZE + | DICT_TF_MASK_ATOMIC_BLOBS)); + } + + trx_start_if_not_started_xa(ctx->prebuilt->trx, true); + + if (ha_alter_info->handler_flags + & ALTER_DROP_VIRTUAL_COLUMN) { + if (prepare_inplace_drop_virtual(ha_alter_info, old_table)) { + DBUG_RETURN(true); + } + } + + if (ha_alter_info->handler_flags + & ALTER_ADD_VIRTUAL_COLUMN) { + if (prepare_inplace_add_virtual( + ha_alter_info, altered_table, old_table)) { + DBUG_RETURN(true); + } + + /* Need information for newly added virtual columns + for create index */ + + if (ha_alter_info->handler_flags + & ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX) { + for (ulint i = 0; i < ctx->num_to_add_vcol; i++) { + /* Set mbminmax for newly added column */ + dict_col_t& col = ctx->add_vcol[i].m_col; + unsigned mbminlen, mbmaxlen; + dtype_get_mblen(col.mtype, col.prtype, + &mbminlen, &mbmaxlen); + col.mbminlen = mbminlen & 7; + col.mbmaxlen = mbmaxlen & 7; + } + add_v = static_cast<dict_add_v_col_t*>( + mem_heap_alloc(ctx->heap, sizeof *add_v)); + add_v->n_v_col = ctx->num_to_add_vcol; + add_v->v_col = ctx->add_vcol; + add_v->v_col_name = ctx->add_vcol_name; + } + } + + /* There should be no order change for virtual columns coming in + here */ + ut_ad(check_v_col_in_order(old_table, altered_table, ha_alter_info)); + + /* Create table containing all indexes to be built in this + ALTER TABLE ADD INDEX so that they are in the correct order + in the table.
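+(create_key_defs() below returns the definitions in that order; whenever the table must be rebuilt, index_defs[0] is the clustered index definition, which is what the DICT_CLUSTERED check on index_defs[0].ind_type relies on.)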
*/ + + ctx->num_to_add_index = ha_alter_info->index_add_count; + + ut_ad(ctx->prebuilt->trx->mysql_thd != NULL); + const char* path = thd_innodb_tmpdir( + ctx->prebuilt->trx->mysql_thd); + + index_defs = ctx->create_key_defs( + ha_alter_info, altered_table, + num_fts_index, + fts_doc_id_col, add_fts_doc_id, add_fts_doc_id_idx, + old_table); + + new_clustered = (DICT_CLUSTERED & index_defs[0].ind_type) != 0; + + create_table_info_t info(ctx->prebuilt->trx->mysql_thd, altered_table, + ha_alter_info->create_info, NULL, NULL, + srv_file_per_table); + + /* The primary index would be rebuilt if a FTS Doc ID + column is to be added, and the primary index definition + is just copied from old table and stored in index_defs[0] */ + DBUG_ASSERT(!add_fts_doc_id || new_clustered); + DBUG_ASSERT(!!new_clustered == + (innobase_need_rebuild(ha_alter_info, old_table) + || add_fts_doc_id)); + + /* Allocate memory for dictionary index definitions */ + + ctx->add_index = static_cast<dict_index_t**>( + mem_heap_zalloc(ctx->heap, ctx->num_to_add_index + * sizeof *ctx->add_index)); + ctx->add_key_numbers = add_key_nums = static_cast<ulint*>( + mem_heap_alloc(ctx->heap, ctx->num_to_add_index + * sizeof *ctx->add_key_numbers)); + + const bool fts_exist = ctx->new_table->flags2 + & (DICT_TF2_FTS_HAS_DOC_ID | DICT_TF2_FTS); + /* Acquire a lock on the table before creating any indexes. */ + bool table_lock_failed = false; + + if (!ctx->online) { +acquire_lock: + ctx->prebuilt->trx->op_info = "acquiring table lock"; + error = lock_table_for_trx(user_table, ctx->trx, LOCK_S); + } else if (add_key_nums) { + /* FIXME: trx_resurrect_table_locks() will not resurrect + MDL for any recovered transactions that may hold locks on + the table. We will prevent race conditions by "unnecessarily" + acquiring an InnoDB table lock even for online operation, + to ensure that the rollback of recovered transactions will + not run concurrently with online ADD INDEX. */ + user_table->lock_mutex_lock(); + for (lock_t *lock = UT_LIST_GET_FIRST(user_table->locks); + lock; + lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock)) { + if (lock->trx->is_recovered) { + user_table->lock_mutex_unlock(); + goto acquire_lock; + } + } + user_table->lock_mutex_unlock(); + } + + if (fts_exist) { + purge_sys.stop_FTS(*ctx->new_table); + if (error == DB_SUCCESS) { + error = fts_lock_tables(ctx->trx, *ctx->new_table); + } + } + + if (error == DB_SUCCESS) { + error = lock_sys_tables(ctx->trx); + } + + if (error != DB_SUCCESS) { + table_lock_failed = true; + goto error_handling; + } + + /* Latch the InnoDB data dictionary exclusively so that no deadlocks + or lock waits can happen in it during an index create operation. */ + + row_mysql_lock_data_dictionary(ctx->trx); + dict_locked = true; + online_retry_drop_indexes_low(ctx->new_table, ctx->trx); + + ut_d(dict_table_check_for_dup_indexes( + ctx->new_table, CHECK_ABORTED_OK)); + + DBUG_EXECUTE_IF("innodb_OOM_prepare_inplace_alter", + error = DB_OUT_OF_MEMORY; + goto error_handling;); + + /* If a new clustered index is defined for the table we need + to rebuild the table with a temporary name.
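+(The temporary name is derived below from altered_table->s->path with the mysql_data_home prefix stripped and any partition suffix appended.)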
*/ + + if (new_clustered) { + if (innobase_check_foreigns( + ha_alter_info, old_table, + user_table, ctx->drop_fk, ctx->num_to_drop_fk)) { +new_clustered_failed: + DBUG_ASSERT(ctx->trx != ctx->prebuilt->trx); + ctx->trx->rollback(); + + ut_ad(user_table->get_ref_count() == 1); + + if (user_table->drop_aborted) { + row_mysql_unlock_data_dictionary(ctx->trx); + trx_start_for_ddl(ctx->trx); + if (lock_sys_tables(ctx->trx) == DB_SUCCESS) { + row_mysql_lock_data_dictionary( + ctx->trx); + online_retry_drop_indexes_low( + user_table, ctx->trx); + commit_unlock_and_unlink(ctx->trx); + } else { + ctx->trx->commit(); + } + row_mysql_lock_data_dictionary(ctx->trx); + } + + if (ctx->need_rebuild()) { + if (ctx->new_table) { + ut_ad(!ctx->new_table->cached); + dict_mem_table_free(ctx->new_table); + } + ctx->new_table = ctx->old_table; + } + + while (ctx->num_to_add_index--) { + if (dict_index_t*& i = ctx->add_index[ + ctx->num_to_add_index]) { + dict_mem_index_free(i); + i = NULL; + } + } + + goto err_exit; + } + + size_t prefixlen= strlen(mysql_data_home); + if (mysql_data_home[prefixlen-1] != FN_LIBCHAR) + prefixlen++; + size_t tablen = altered_table->s->path.length - prefixlen; + const char* part = ctx->old_table->name.part(); + size_t partlen = part ? strlen(part) : 0; + char* new_table_name = static_cast<char*>( + mem_heap_alloc(ctx->heap, tablen + partlen + 1)); + memcpy(new_table_name, + altered_table->s->path.str + prefixlen, tablen); +#ifdef _WIN32 + { + char *sep= strchr(new_table_name, FN_LIBCHAR); + sep[0]= '/'; + } +#endif + memcpy(new_table_name + tablen, part ? part : "", partlen + 1); + ulint n_cols = 0; + ulint n_v_cols = 0; + dtuple_t* defaults; + ulint z = 0; + + for (uint i = 0; i < altered_table->s->fields; i++) { + const Field* field = altered_table->field[i]; + + if (!field->stored_in_db()) { + n_v_cols++; + } else { + n_cols++; + } + } + + ut_ad(n_cols + n_v_cols == altered_table->s->fields); + + if (add_fts_doc_id) { + n_cols++; + DBUG_ASSERT(flags2 & DICT_TF2_FTS); + DBUG_ASSERT(add_fts_doc_id_idx); + flags2 |= DICT_TF2_FTS_ADD_DOC_ID + | DICT_TF2_FTS_HAS_DOC_ID + | DICT_TF2_FTS; + } + + DBUG_ASSERT(!add_fts_doc_id_idx || (flags2 & DICT_TF2_FTS)); + + ctx->new_table = dict_table_t::create( + {new_table_name, tablen + partlen}, nullptr, + n_cols + n_v_cols, n_v_cols, flags, flags2); + + /* The rebuilt indexed_table will use the renamed + column names.
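+(Hence ctx->col_names is reset to NULL below; the new dict_table_t is populated directly from altered_table's fields.)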
*/ + ctx->col_names = NULL; + + if (DICT_TF_HAS_DATA_DIR(flags)) { + ctx->new_table->data_dir_path = + mem_heap_strdup(ctx->new_table->heap, + user_table->data_dir_path); + } + + for (uint i = 0; i < altered_table->s->fields; i++) { + const Field* field = altered_table->field[i]; + unsigned is_unsigned; + auto col_type = get_innobase_type_from_mysql_type( + &is_unsigned, field); + unsigned field_type = field->type() | is_unsigned; + const bool is_virtual = !field->stored_in_db(); + + /* we assume in dtype_form_prtype() that this + fits in two bytes */ + ut_a(field_type <= MAX_CHAR_COLL_NUM); + + if (!field->real_maybe_null()) { + field_type |= DATA_NOT_NULL; + } + + if (field->binary()) { + field_type |= DATA_BINARY_TYPE; + } + + if (altered_table->versioned()) { + if (i == altered_table->s->vers.start_fieldno) { + field_type |= DATA_VERS_START; + } else if (i == + altered_table->s->vers.end_fieldno) { + field_type |= DATA_VERS_END; + } else if (!(field->flags + & VERS_UPDATE_UNVERSIONED_FLAG)) { + field_type |= DATA_VERSIONED; + } + } + + unsigned charset_no; + + if (dtype_is_string_type(col_type)) { + charset_no = field->charset()->number; + + if (charset_no > MAX_CHAR_COLL_NUM) { + my_error(ER_WRONG_KEY_COLUMN, MYF(0), "InnoDB", + field->field_name.str); + goto new_clustered_failed; + } + } else { + charset_no = 0; + } + + auto col_len = field->pack_length(); + + /* The MySQL pack length contains 1 or 2 bytes + length field for a true VARCHAR. Let us + subtract that, so that the InnoDB column + length in the InnoDB data dictionary is the + real maximum byte length of the actual data. */ + + if (field->type() == MYSQL_TYPE_VARCHAR) { + uint32 length_bytes + = static_cast<const Field_varstring*>( + field)->length_bytes; + + col_len -= length_bytes; + + if (length_bytes == 2) { + field_type |= DATA_LONG_TRUE_VARCHAR; + } + + } + + if (dict_col_name_is_reserved(field->field_name.str)) { +wrong_column_name: + dict_mem_table_free(ctx->new_table); + ctx->new_table = ctx->old_table; + my_error(ER_WRONG_COLUMN_NAME, MYF(0), + field->field_name.str); + goto new_clustered_failed; + } + + /** Note the FTS_DOC_ID name is case sensitive due + to internal query parser.
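+(The checks below accept the name case-insensitively via innobase_strcasecmp() and then reject any spelling other than the exact uppercase FTS_DOC_ID with strcmp().)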
+ FTS_DOC_ID column must be of BIGINT NOT NULL type + and it should be in all capitalized characters */ + if (!innobase_strcasecmp(field->field_name.str, + FTS_DOC_ID_COL_NAME)) { + if (col_type != DATA_INT + || field->real_maybe_null() + || col_len != sizeof(doc_id_t) + || strcmp(field->field_name.str, + FTS_DOC_ID_COL_NAME)) { + goto wrong_column_name; + } + } + + if (is_virtual) { + dict_mem_table_add_v_col( + ctx->new_table, ctx->heap, + field->field_name.str, + col_type, + dtype_form_prtype( + field_type, charset_no) + | DATA_VIRTUAL, + col_len, i, 0); + } else { + dict_mem_table_add_col( + ctx->new_table, ctx->heap, + field->field_name.str, + col_type, + dtype_form_prtype( + field_type, charset_no), + col_len); + } + } + + if (n_v_cols) { + for (uint i = 0; i < altered_table->s->fields; i++) { + dict_v_col_t* v_col; + const Field* field = altered_table->field[i]; + + if (!!field->stored_in_db()) { + continue; + } + v_col = dict_table_get_nth_v_col( + ctx->new_table, z); + z++; + innodb_base_col_setup( + ctx->new_table, field, v_col); + } + } + + if (add_fts_doc_id) { + fts_add_doc_id_column(ctx->new_table, ctx->heap); + ctx->new_table->fts->doc_col = fts_doc_id_col; + ut_ad(fts_doc_id_col + == altered_table->s->fields - n_v_cols); + } else if (ctx->new_table->fts) { + ctx->new_table->fts->doc_col = fts_doc_id_col; + } + + dict_table_add_system_columns(ctx->new_table, ctx->heap); + + if (ha_alter_info->handler_flags & INNOBASE_DEFAULTS) { + defaults = dtuple_create_with_vcol( + ctx->heap, + dict_table_get_n_cols(ctx->new_table), + dict_table_get_n_v_cols(ctx->new_table)); + + dict_table_copy_types(defaults, ctx->new_table); + } else { + defaults = NULL; + } + + ctx->col_map = innobase_build_col_map( + ha_alter_info, altered_table, old_table, + ctx->new_table, user_table, defaults, ctx->heap); + ctx->defaults = defaults; + } else { + DBUG_ASSERT(!innobase_need_rebuild(ha_alter_info, old_table)); + DBUG_ASSERT(old_table->s->primary_key + == altered_table->s->primary_key); + + for (dict_index_t* index + = dict_table_get_first_index(user_table); + index != NULL; + index = dict_table_get_next_index(index)) { + if (!index->to_be_dropped && index->is_corrupted()) { + my_error(ER_CHECK_NO_SUCH_TABLE, MYF(0)); + goto error_handled; + } + } + + for (dict_index_t* index + = dict_table_get_first_index(user_table); + index != NULL; + index = dict_table_get_next_index(index)) { + if (!index->to_be_dropped && index->is_corrupted()) { + my_error(ER_CHECK_NO_SUCH_TABLE, MYF(0)); + goto error_handled; + } + } + + if (!ctx->new_table->fts + && innobase_fulltext_exist(altered_table)) { + ctx->new_table->fts = fts_create( + ctx->new_table); + ctx->new_table->fts->doc_col = fts_doc_id_col; + } + + /* Check if we need to update mtypes of legacy GIS columns. + This check is only needed when we don't have to rebuild + the table, since rebuild would update all mtypes for GIS + columns */ + error = innobase_check_gis_columns( + ha_alter_info, ctx->new_table, ctx->trx); + if (error != DB_SUCCESS) { + ut_ad(error == DB_ERROR); + my_error(ER_TABLE_CANT_HANDLE_SPKEYS, MYF(0), "SYS_COLUMNS"); + goto error_handled; + } + } + + ut_ad(new_clustered == ctx->need_rebuild()); + + /* Create the index metadata. 
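+ Each definition in index_defs[] is materialised below by row_merge_create_index(); definitions on virtual columns may first need their column numbers remapped by innodb_v_adjust_idx_col() when virtual columns are being dropped without a table rebuild.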
*/ + for (ulint a = 0; a < ctx->num_to_add_index; a++) { + if (index_defs[a].ind_type & DICT_VIRTUAL + && ctx->num_to_drop_vcol > 0 && !new_clustered) { + innodb_v_adjust_idx_col(ha_alter_info, old_table, + ctx->num_to_drop_vcol, + &index_defs[a]); + } + + ctx->add_index[a] = row_merge_create_index( + ctx->new_table, &index_defs[a], add_v); + + add_key_nums[a] = index_defs[a].key_number; + + DBUG_ASSERT(ctx->add_index[a]->is_committed() + == !!new_clustered); + } + + DBUG_ASSERT(!ctx->need_rebuild() + || !ctx->new_table->persistent_autoinc); + + if (ctx->need_rebuild() && instant_alter_column_possible( + *user_table, ha_alter_info, old_table, altered_table, + ha_innobase::is_innodb_strict_mode(ctx->trx->mysql_thd))) { + for (uint a = 0; a < ctx->num_to_add_index; a++) { + ctx->add_index[a]->table = ctx->new_table; + error = dict_index_add_to_cache( + ctx->add_index[a], FIL_NULL, add_v); + ut_a(error == DB_SUCCESS); + } + + DBUG_ASSERT(ha_alter_info->key_count + /* hidden GEN_CLUST_INDEX in InnoDB */ + + dict_index_is_auto_gen_clust( + dict_table_get_first_index(ctx->new_table)) + /* hidden FTS_DOC_ID_INDEX in InnoDB */ + + (ctx->old_table->fts_doc_id_index + && innobase_fts_check_doc_id_index_in_def( + altered_table->s->keys, + altered_table->key_info) + != FTS_EXIST_DOC_ID_INDEX) + == ctx->num_to_add_index); + + ctx->num_to_add_index = 0; + ctx->add_index = NULL; + + uint i = 0; // index of stored columns ctx->new_table->cols[] + Field **af = altered_table->field; + + for (const Create_field& new_field : + ha_alter_info->alter_info->create_list) { + DBUG_ASSERT(!new_field.field + || std::find(old_table->field, + old_table->field + + old_table->s->fields, + new_field.field) != + old_table->field + old_table->s->fields); + DBUG_ASSERT(new_field.field + || !strcmp(new_field.field_name.str, + (*af)->field_name.str)); + + if (!(*af)->stored_in_db()) { + af++; + continue; + } + + dict_col_t* col = dict_table_get_nth_col( + ctx->new_table, i); + DBUG_ASSERT(!strcmp((*af)->field_name.str, + dict_table_get_col_name(ctx->new_table, + i))); + DBUG_ASSERT(!col->is_added()); + + if (new_field.field) { + /* This is a pre-existing column, + possibly at a different position. */ + } else if ((*af)->is_real_null()) { + /* DEFAULT NULL */ + col->def_val.len = UNIV_SQL_NULL; + } else { + switch ((*af)->type()) { + case MYSQL_TYPE_VARCHAR: + col->def_val.len = reinterpret_cast + <const Field_varstring*> + ((*af))->get_length(); + col->def_val.data = reinterpret_cast + <const Field_varstring*> + ((*af))->get_data(); + break; + case MYSQL_TYPE_GEOMETRY: + case MYSQL_TYPE_TINY_BLOB: + case MYSQL_TYPE_MEDIUM_BLOB: + case MYSQL_TYPE_BLOB: + case MYSQL_TYPE_LONG_BLOB: + col->def_val.len = reinterpret_cast + <const Field_blob*> + ((*af))->get_length(); + col->def_val.data = reinterpret_cast + <const Field_blob*> + ((*af))->get_ptr(); + break; + default: + dfield_t d; + dict_col_copy_type(col, &d.type); + ulint len = (*af)->pack_length(); + DBUG_ASSERT(len <= 8 + || d.type.mtype + != DATA_INT); + row_mysql_store_col_in_innobase_format( + &d, + d.type.mtype == DATA_INT + ? static_cast<byte*>( + mem_heap_alloc( + ctx->heap, + len)) + : NULL, + true, (*af)->ptr, len, + dict_table_is_comp( + user_table)); + col->def_val.len = d.len; + col->def_val.data = d.data; + } + } + + i++; + af++; + } + + DBUG_ASSERT(af == altered_table->field + + altered_table->s->fields); + /* There might exist a hidden FTS_DOC_ID column for + FULLTEXT INDEX. If it exists, the columns should have + been implicitly added by ADD FULLTEXT INDEX together + with instant ADD COLUMN.
(If a hidden FTS_DOC_ID pre-existed, + then the ctx->col_map[] check should have prevented + adding visible user columns after that.) */ + DBUG_ASSERT(DATA_N_SYS_COLS + i == ctx->new_table->n_cols + || (1 + DATA_N_SYS_COLS + i + == ctx->new_table->n_cols + && !strcmp(dict_table_get_col_name( + ctx->new_table, i), + FTS_DOC_ID_COL_NAME))); + + if (altered_table->found_next_number_field) { + ctx->new_table->persistent_autoinc + = ctx->old_table->persistent_autoinc; + } + + ctx->prepare_instant(); + } + + if (ctx->need_rebuild()) { + DBUG_ASSERT(ctx->need_rebuild()); + DBUG_ASSERT(!ctx->is_instant()); + DBUG_ASSERT(num_fts_index <= 1); + DBUG_ASSERT(!ctx->online || num_fts_index == 0); + DBUG_ASSERT(!ctx->online + || !ha_alter_info->mdl_exclusive_after_prepare + || ctx->add_autoinc == ULINT_UNDEFINED); + DBUG_ASSERT(!ctx->online + || !innobase_need_rebuild(ha_alter_info, old_table) + || !innobase_fulltext_exist(altered_table)); + + uint32_t key_id = FIL_DEFAULT_ENCRYPTION_KEY; + fil_encryption_t mode = FIL_ENCRYPTION_DEFAULT; + + if (fil_space_t* s = user_table->space) { + if (const fil_space_crypt_t* c = s->crypt_data) { + key_id = c->key_id; + mode = c->encryption; + } + } + + if (ha_alter_info->handler_flags & ALTER_OPTIONS) { + const ha_table_option_struct& alt_opt= + *ha_alter_info->create_info->option_struct; + const ha_table_option_struct& opt= + *old_table->s->option_struct; + if (alt_opt.encryption != opt.encryption + || alt_opt.encryption_key_id + != opt.encryption_key_id) { + key_id = uint32_t(alt_opt.encryption_key_id); + mode = fil_encryption_t(alt_opt.encryption); + } + } + + if (dict_sys.find_table( + {ctx->new_table->name.m_name, + strlen(ctx->new_table->name.m_name)})) { + my_error(ER_TABLE_EXISTS_ERROR, MYF(0), + ctx->new_table->name.m_name); + goto new_clustered_failed; + } + + /* Create the table. */ + ctx->trx->dict_operation = true; + + error = row_create_table_for_mysql(ctx->new_table, ctx->trx); + + switch (error) { + case DB_SUCCESS: + DBUG_ASSERT(ctx->new_table->get_ref_count() == 0); + DBUG_ASSERT(ctx->new_table->id != 0); + break; + case DB_DUPLICATE_KEY: + my_error(HA_ERR_TABLE_EXIST, MYF(0), + altered_table->s->table_name.str); + goto new_table_failed; + case DB_UNSUPPORTED: + my_error(ER_UNSUPPORTED_EXTENSION, MYF(0), + altered_table->s->table_name.str); + goto new_table_failed; + default: + my_error_innodb(error, table_name, flags); +new_table_failed: + DBUG_ASSERT(ctx->trx != ctx->prebuilt->trx); + ctx->new_table = NULL; + goto new_clustered_failed; + } + + for (ulint a = 0; a < ctx->num_to_add_index; a++) { + dict_index_t* index = ctx->add_index[a]; + const ulint n_v_col = index->get_new_n_vcol(); + index = create_index_dict(ctx->trx, index, + mode, key_id, add_v); + error = ctx->trx->error_state; + if (error != DB_SUCCESS) { + if (index) { + dict_mem_index_free(index); + } +error_handling_drop_uncached_1: + while (++a < ctx->num_to_add_index) { + dict_mem_index_free(ctx->add_index[a]); + } + goto error_handling; + } else { + DBUG_ASSERT(index != ctx->add_index[a]); + } + + ctx->add_index[a] = index; + /* For ALTER TABLE...FORCE or OPTIMIZE TABLE, + we may only issue warnings, because there will + be no schema change from the user perspective. 
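+ That is why row_size_is_acceptable() below is invoked in strict mode only when handler_flags contain a change beyond INNOBASE_INPLACE_IGNORE, INNOBASE_ALTER_NOVALIDATE and ALTER_RECREATE_TABLE.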
*/ + if (!info.row_size_is_acceptable( + *index, + !!(ha_alter_info->handler_flags + & ~(INNOBASE_INPLACE_IGNORE + | INNOBASE_ALTER_NOVALIDATE + | ALTER_RECREATE_TABLE)))) { + error = DB_TOO_BIG_RECORD; + goto error_handling_drop_uncached_1; + } + index->parser = index_defs[a].parser; + if (n_v_col) { + index->assign_new_v_col(n_v_col); + } + /* Note the id of the transaction that created this + index, we use it to restrict readers from accessing + this index, to ensure read consistency. */ + ut_ad(index->trx_id == ctx->trx->id); + + if (index->type & DICT_FTS) { + DBUG_ASSERT(num_fts_index == 1); + DBUG_ASSERT(!fts_index); + DBUG_ASSERT(index->type == DICT_FTS); + fts_index = ctx->add_index[a]; + } + } + + dict_index_t* clust_index = dict_table_get_first_index( + user_table); + dict_index_t* new_clust_index = dict_table_get_first_index( + ctx->new_table); + ut_ad(!new_clust_index->is_instant()); + /* row_merge_build_index() depends on the correct value */ + ut_ad(new_clust_index->n_core_null_bytes + == UT_BITS_IN_BYTES(new_clust_index->n_nullable)); + + if (const Field* ai = altered_table->found_next_number_field) { + const unsigned col_no = innodb_col_no(ai); + + ctx->new_table->persistent_autoinc = + (dict_table_get_nth_col_pos( + ctx->new_table, col_no, NULL) + 1) + & dict_index_t::MAX_N_FIELDS; + + /* Initialize the AUTO_INCREMENT sequence + to the rebuilt table from the old one. */ + if (!old_table->found_next_number_field + || !user_table->space) { + } else if (ib_uint64_t autoinc + = btr_read_autoinc(clust_index)) { + btr_write_autoinc(new_clust_index, autoinc); + } + } + + ctx->skip_pk_sort = innobase_pk_order_preserved( + ctx->col_map, clust_index, new_clust_index); + + DBUG_EXECUTE_IF("innodb_alter_table_pk_assert_no_sort", + DBUG_ASSERT(ctx->skip_pk_sort);); + + if (ctx->online) { + /* Allocate a log for online table rebuild. */ + clust_index->lock.x_lock(SRW_LOCK_CALL); + bool ok = row_log_allocate( + ctx->prebuilt->trx, + clust_index, ctx->new_table, + !(ha_alter_info->handler_flags + & ALTER_ADD_PK_INDEX), + ctx->defaults, ctx->col_map, path, + old_table, + ctx->allow_not_null); + clust_index->lock.x_unlock(); + + if (!ok) { + error = DB_OUT_OF_MEMORY; + goto error_handling; + } + } + } else if (ctx->num_to_add_index) { + ut_ad(!ctx->is_instant()); + + for (ulint a = 0; a < ctx->num_to_add_index; a++) { + dict_index_t* index = ctx->add_index[a]; + const ulint n_v_col = index->get_new_n_vcol(); + DBUG_EXECUTE_IF( + "create_index_metadata_fail", + if (a + 1 == ctx->num_to_add_index) { + ctx->trx->error_state = + DB_OUT_OF_FILE_SPACE; + goto index_created; + }); + index = create_index_dict(ctx->trx, index, + FIL_ENCRYPTION_DEFAULT, + FIL_DEFAULT_ENCRYPTION_KEY, + add_v); +#ifndef DBUG_OFF +index_created: +#endif + error = ctx->trx->error_state; + if (error != DB_SUCCESS) { + if (index) { + dict_mem_index_free(index); + } +error_handling_drop_uncached: + while (++a < ctx->num_to_add_index) { + dict_mem_index_free(ctx->add_index[a]); + } + goto error_handling; + } else { + DBUG_ASSERT(index != ctx->add_index[a]); + } + ctx->add_index[a]= index; + if (!info.row_size_is_acceptable(*index, true)) { + error = DB_TOO_BIG_RECORD; + goto error_handling_drop_uncached; + } + + index->parser = index_defs[a].parser; + if (n_v_col) { + index->assign_new_v_col(n_v_col); + } + + ctx->change_col_collation(index, *altered_table); + /* Note the id of the transaction that created this + index, we use it to restrict readers from accessing + this index, to ensure read consistency. 
*/ + ut_ad(index->trx_id == ctx->trx->id); + + /* If ADD INDEX with LOCK=NONE has been + requested, allocate a modification log. */ + if (index->type & DICT_FTS) { + DBUG_ASSERT(num_fts_index == 1); + DBUG_ASSERT(!fts_index); + DBUG_ASSERT(index->type == DICT_FTS); + fts_index = ctx->add_index[a]; + /* Fulltext indexes are not covered + by a modification log. */ + } else if (!ctx->online + || !user_table->is_readable() + || !user_table->space) { + /* No need to allocate a modification log. */ + DBUG_ASSERT(!index->online_log); + } else { + index->lock.x_lock(SRW_LOCK_CALL); + + bool ok = row_log_allocate( + ctx->prebuilt->trx, + index, + NULL, true, NULL, NULL, + path, old_table, + ctx->allow_not_null); + + index->lock.x_unlock(); + + DBUG_EXECUTE_IF( + "innodb_OOM_prepare_add_index", + if (ok && a == 1) { + row_log_free( + index->online_log); + index->online_log = NULL; + ctx->old_table->indexes.start + ->online_log = nullptr; + ok = false; + }); + + if (!ok) { + error = DB_OUT_OF_MEMORY; + goto error_handling_drop_uncached; + } + } + } + } else if (ctx->is_instant() + && !info.row_size_is_acceptable(*user_table, true)) { + error = DB_TOO_BIG_RECORD; + goto error_handling; + } + + if (ctx->online && ctx->num_to_add_index) { + /* Assign a consistent read view for + row_merge_read_clustered_index(). */ + ctx->prebuilt->trx->read_view.open(ctx->prebuilt->trx); + } + + if (fts_index) { + ut_ad(ctx->trx->dict_operation); + ut_ad(ctx->trx->dict_operation_lock_mode); + ut_ad(dict_sys.locked()); + + DICT_TF2_FLAG_SET(ctx->new_table, DICT_TF2_FTS); + if (ctx->need_rebuild()) { + /* For !ctx->need_rebuild(), this will be set at + commit_cache_norebuild(). */ + ctx->new_table->fts_doc_id_index + = dict_table_get_index_on_name( + ctx->new_table, FTS_DOC_ID_INDEX_NAME); + DBUG_ASSERT(ctx->new_table->fts_doc_id_index != NULL); + } + + error = fts_create_index_tables(ctx->trx, fts_index, + ctx->new_table->id); + + DBUG_EXECUTE_IF("innodb_test_fail_after_fts_index_table", + error = DB_LOCK_WAIT_TIMEOUT; + goto error_handling;); + + if (error != DB_SUCCESS) { + goto error_handling; + } + + if (!ctx->new_table->fts + || ib_vector_size(ctx->new_table->fts->indexes) == 0) { + error = fts_create_common_tables( + ctx->trx, ctx->new_table, true); + + DBUG_EXECUTE_IF( + "innodb_test_fail_after_fts_common_table", + error = DB_LOCK_WAIT_TIMEOUT;); + + if (error != DB_SUCCESS) { + goto error_handling; + } + + error = innobase_fts_load_stopword( + ctx->new_table, ctx->trx, + ctx->prebuilt->trx->mysql_thd) + ? DB_SUCCESS : DB_ERROR; + + if (error != DB_SUCCESS) { + goto error_handling; + } + } + } + + DBUG_ASSERT(error == DB_SUCCESS); + + { + /* Commit the data dictionary transaction in order to release + the table locks on the system tables. This means that if + MariaDB is killed while rebuilding the table inside + row_merge_build_indexes(), ctx->new_table will not be dropped + by trx_rollback_active(). */ + ut_d(dict_table_check_for_dup_indexes(user_table, + CHECK_PARTIAL_OK)); + if (ctx->need_rebuild()) { + ctx->new_table->acquire(); + } + + /* fts_create_common_tables() may drop old common tables, + whose files would be deleted here. */ + commit_unlock_and_unlink(ctx->trx); + if (fts_exist) { + purge_sys.resume_FTS(); + } + + trx_start_for_ddl(ctx->trx); + ctx->prebuilt->trx_id = ctx->trx->id; + } + + if (ctx->old_table->fts) { + fts_sync_during_ddl(ctx->old_table); + } + + DBUG_RETURN(false); + +error_handling: + /* After an error, remove all those index definitions from the + dictionary which were defined. 
*/ + + switch (error) { + case DB_TABLESPACE_EXISTS: + my_error(ER_TABLESPACE_EXISTS, MYF(0), "(unknown)"); + break; + case DB_DUPLICATE_KEY: + my_error(ER_DUP_KEY, MYF(0), "SYS_INDEXES"); + break; + default: + my_error_innodb(error, table_name, user_table->flags); + } + + ctx->trx->rollback(); + + ut_ad(!ctx->need_rebuild() + || !user_table->indexes.start->online_log); + + ctx->prebuilt->trx->error_info = NULL; + ctx->trx->error_state = DB_SUCCESS; + + if (false) { +error_handled: + ut_ad(!table_lock_failed); + ut_ad(ctx->trx->state == TRX_STATE_ACTIVE); + ut_ad(!ctx->trx->undo_no); + ut_ad(dict_locked); + } else if (table_lock_failed) { + if (!dict_locked) { + row_mysql_lock_data_dictionary(ctx->trx); + } + goto err_exit; + } else { + ut_ad(ctx->trx->state == TRX_STATE_NOT_STARTED); + if (new_clustered && !user_table->drop_aborted) { + goto err_exit; + } + if (dict_locked) { + row_mysql_unlock_data_dictionary(ctx->trx); + } + trx_start_for_ddl(ctx->trx); + dberr_t err= lock_sys_tables(ctx->trx); + row_mysql_lock_data_dictionary(ctx->trx); + if (err != DB_SUCCESS) { + goto err_exit; + } + } + + /* n_ref_count must be 1, because background threads cannot + be executing on this very table as we are + holding MDL_EXCLUSIVE. */ + ut_ad(ctx->online || user_table->get_ref_count() == 1); + + if (new_clustered) { + online_retry_drop_indexes_low(user_table, ctx->trx); + commit_unlock_and_unlink(ctx->trx); + row_mysql_lock_data_dictionary(ctx->trx); + } else { + row_merge_drop_indexes(ctx->trx, user_table, true); + ctx->trx->commit(); + } + + ut_d(dict_table_check_for_dup_indexes(user_table, CHECK_ALL_COMPLETE)); + ut_ad(!user_table->drop_aborted); + +err_exit: + /* Clear the to_be_dropped flag in the data dictionary cache. */ + for (ulint i = 0; i < ctx->num_to_drop_index; i++) { + DBUG_ASSERT(ctx->drop_index[i]->is_committed()); + DBUG_ASSERT(ctx->drop_index[i]->to_be_dropped); + ctx->drop_index[i]->to_be_dropped = 0; + } + + if (ctx->trx) { + row_mysql_unlock_data_dictionary(ctx->trx); + ctx->trx->rollback(); + ctx->trx->free(); + } + trx_commit_for_mysql(ctx->prebuilt->trx); + if (fts_exist) { + purge_sys.resume_FTS(); + } + + for (uint i = 0; i < ctx->num_to_add_fk; i++) { + if (ctx->add_fk[i]) { + dict_foreign_free(ctx->add_fk[i]); + } + } + + delete ctx; + ha_alter_info->handler_ctx = NULL; + + DBUG_RETURN(true); +} + +/* Check whether an index is needed for the foreign key constraint. +If so, and the index is to be dropped, check whether an equivalent +index can take over its role. +@return true if the index is needed and can't be dropped */ +static MY_ATTRIBUTE((nonnull(1,2,3,5), warn_unused_result)) +bool +innobase_check_foreign_key_index( +/*=============================*/ + Alter_inplace_info* ha_alter_info, /*!< in: Structure describing + changes to be done by ALTER + TABLE */ + dict_index_t* index, /*!< in: index to check */ + dict_table_t* indexed_table, /*!< in: table that owns the + foreign keys */ + const char** col_names, /*!< in: column names, or NULL + for indexed_table->col_names */ + trx_t* trx, /*!< in/out: transaction */ + dict_foreign_t** drop_fk, /*!< in: Foreign key constraints + to drop */ + ulint n_drop_fk) /*!< in: Number of foreign keys + to drop */ +{ + const dict_foreign_set* fks = &indexed_table->referenced_set; + + /* Check for all FK references from other tables to the index.
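+A referencing constraint blocks the drop only if neither another existing index (dict_foreign_find_index()) nor one of the indexes being added (innobase_find_equiv_index()) can serve it.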
*/ + for (dict_foreign_set::const_iterator it = fks->begin(); + it != fks->end(); ++it) { + + dict_foreign_t* foreign = *it; + if (foreign->referenced_index != index) { + continue; + } + ut_ad(indexed_table == foreign->referenced_table); + + if (NULL == dict_foreign_find_index( + indexed_table, col_names, + foreign->referenced_col_names, + foreign->n_fields, index, + /*check_charsets=*/TRUE, + /*check_null=*/FALSE, + NULL, NULL, NULL) + && NULL == innobase_find_equiv_index( + foreign->referenced_col_names, + foreign->n_fields, + ha_alter_info->key_info_buffer, + span<uint>(ha_alter_info->index_add_buffer, + ha_alter_info->index_add_count))) { + + /* Index cannot be dropped. */ + trx->error_info = index; + return(true); + } + } + + fks = &indexed_table->foreign_set; + + /* Check for all FK references in current table using the index. */ + for (dict_foreign_set::const_iterator it = fks->begin(); + it != fks->end(); ++it) { + + dict_foreign_t* foreign = *it; + if (foreign->foreign_index != index) { + continue; + } + + ut_ad(indexed_table == foreign->foreign_table); + + if (!innobase_dropping_foreign( + foreign, drop_fk, n_drop_fk) + && NULL == dict_foreign_find_index( + indexed_table, col_names, + foreign->foreign_col_names, + foreign->n_fields, index, + /*check_charsets=*/TRUE, + /*check_null=*/FALSE, + NULL, NULL, NULL) + && NULL == innobase_find_equiv_index( + foreign->foreign_col_names, + foreign->n_fields, + ha_alter_info->key_info_buffer, + span<uint>(ha_alter_info->index_add_buffer, + ha_alter_info->index_add_count))) { + + /* Index cannot be dropped. */ + trx->error_info = index; + return(true); + } + } + + return(false); +} + +/** +Rename a given index in the InnoDB data dictionary. + +@param index index to rename +@param new_name new name of the index +@param[in,out] trx dict transaction to use, not going to be committed here + +@retval true Failure +@retval false Success */ +static MY_ATTRIBUTE((warn_unused_result)) +bool +rename_index_try( + const dict_index_t* index, + const char* new_name, + trx_t* trx) +{ + DBUG_ENTER("rename_index_try"); + ut_ad(dict_sys.locked()); + ut_ad(trx->dict_operation_lock_mode); + + pars_info_t* pinfo; + dberr_t err; + + pinfo = pars_info_create(); + + pars_info_add_ull_literal(pinfo, "table_id", index->table->id); + pars_info_add_ull_literal(pinfo, "index_id", index->id); + pars_info_add_str_literal(pinfo, "new_name", new_name); + + trx->op_info = "Renaming an index in SYS_INDEXES"; + + DBUG_EXECUTE_IF( + "ib_rename_index_fail1", + DBUG_SET("+d,innodb_report_deadlock"); + ); + + err = que_eval_sql( + pinfo, + "PROCEDURE RENAME_INDEX_IN_SYS_INDEXES () IS\n" + "BEGIN\n" + "UPDATE SYS_INDEXES SET\n" + "NAME = :new_name\n" + "WHERE\n" + "ID = :index_id AND\n" + "TABLE_ID = :table_id;\n" + "END;\n", trx); /* pinfo is freed by que_eval_sql() */ + + DBUG_EXECUTE_IF( + "ib_rename_index_fail1", + DBUG_SET("-d,innodb_report_deadlock"); + ); + + trx->op_info = ""; + + if (err != DB_SUCCESS) { + my_error_innodb(err, index->table->name.m_name, 0); + DBUG_RETURN(true); + } + + DBUG_RETURN(false); +} + + +/** +Rename a given index in the InnoDB data dictionary cache.
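+Only the in-memory name is changed here; the persistent SYS_INDEXES record is updated separately by rename_index_try().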
+ +@param[in,out] index index to rename +@param new_name new index name +*/ +static +void +innobase_rename_index_cache(dict_index_t* index, const char* new_name) +{ + DBUG_ENTER("innobase_rename_index_cache"); + ut_ad(dict_sys.locked()); + + size_t old_name_len = strlen(index->name); + size_t new_name_len = strlen(new_name); + + if (old_name_len < new_name_len) { + index->name = static_cast<char*>( + mem_heap_alloc(index->heap, new_name_len + 1)); + } + + memcpy(const_cast<char*>(index->name()), new_name, new_name_len + 1); + + DBUG_VOID_RETURN; +} + + +/** Rename the index name in cache. +@param[in] ctx alter context +@param[in] ha_alter_info Data used during inplace alter. */ +static void +innobase_rename_indexes_cache(const ha_innobase_inplace_ctx *ctx, + const Alter_inplace_info *ha_alter_info) +{ + DBUG_ASSERT(ha_alter_info->handler_flags & ALTER_RENAME_INDEX); + + std::vector<std::pair<dict_index_t*, const char*>> rename_info; + rename_info.reserve(ha_alter_info->rename_keys.size()); + + for (const Alter_inplace_info::Rename_key_pair &pair : + ha_alter_info->rename_keys) + { + dict_index_t *index= + dict_table_get_index_on_name(ctx->old_table, pair.old_key->name.str); + ut_ad(index); + + rename_info.emplace_back(index, pair.new_key->name.str); + } + + for (const auto &pair : rename_info) + innobase_rename_index_cache(pair.first, pair.second); +} + +/** Fill the stored column information in s_cols list. +@param[in] altered_table mysql table object +@param[in] table innodb table object +@param[out] s_cols list of stored column +@param[out] s_heap heap for storing stored +column information. */ +static +void +alter_fill_stored_column( + const TABLE* altered_table, + dict_table_t* table, + dict_s_col_list** s_cols, + mem_heap_t** s_heap) +{ + ulint n_cols = altered_table->s->fields; + ulint stored_col_no = 0; + + for (ulint i = 0; i < n_cols; i++) { + Field* field = altered_table->field[i]; + dict_s_col_t s_col; + + if (field->stored_in_db()) { + stored_col_no++; + } + + if (!innobase_is_s_fld(field)) { + continue; + } + + ulint num_base = 0; + dict_col_t* col = dict_table_get_nth_col(table, + stored_col_no); + + s_col.m_col = col; + s_col.s_pos = i; + + if (*s_cols == NULL) { + *s_cols = UT_NEW_NOKEY(dict_s_col_list()); + *s_heap = mem_heap_create(1000); + } + + if (num_base != 0) { + s_col.base_col = static_cast<dict_col_t**>(mem_heap_zalloc( + *s_heap, num_base * sizeof(dict_col_t*))); + } else { + s_col.base_col = NULL; + } + + s_col.num_base = num_base; + innodb_base_col_setup_for_stored(table, field, &s_col); + (*s_cols)->push_front(s_col); + } +} + +static bool alter_templ_needs_rebuild(const TABLE* altered_table, + const Alter_inplace_info* ha_alter_info, + const dict_table_t* table); + +/** Check whether the column is present in table foreign key +relations.
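+Both directions are covered: constraints owned by the table (foreign_set, excluding those being dropped), constraints being added by this ALTER TABLE, and constraints of other tables that reference this one (referenced_set).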
+@param table table which has the foreign key relation +@param col column to be checked +@param col_name column name to be displayed in the error message +@param drop_fk foreign key constraints being dropped +@param add_fk foreign key constraints being added */ +static +bool check_col_is_in_fk_indexes( + const dict_table_t *table, const dict_col_t *col, + const char* col_name, + span<const dict_foreign_t*> drop_fk, + span<const dict_foreign_t*> add_fk) +{ + char *fk_id= nullptr; + + for (const auto &f : table->foreign_set) + { + if (!f->foreign_index || + std::find(drop_fk.begin(), drop_fk.end(), f) != drop_fk.end()) + continue; + for (ulint i= 0; i < f->n_fields; i++) + if (f->foreign_index->fields[i].col == col) + { + fk_id= f->id; + goto err_exit; + } + } + + for (const auto &a : add_fk) + { + for (ulint i= 0; i < a->n_fields; i++) + { + if (a->foreign_index->fields[i].col == col) + { + fk_id= a->id; + goto err_exit; + } + } + } + + for (const auto &f : table->referenced_set) + { + if (!f->referenced_index) continue; + for (ulint i= 0; i < f->n_fields; i++) + { + if (f->referenced_index->fields[i].col == col) + { + my_error(ER_FK_COLUMN_CANNOT_CHANGE_CHILD, MYF(0), + col_name, f->id, f->foreign_table_name); + return true; + } + } + } + return false; +err_exit: + my_error(ER_FK_COLUMN_CANNOT_CHANGE, MYF(0), col_name, + fk_id ? fk_id : + (std::string(table->name.m_name) + "_ibfk_0").c_str()); + return true; +} + +/** Allows InnoDB to update internal structures with concurrent +writes blocked (provided that check_if_supported_inplace_alter() +did not return HA_ALTER_INPLACE_NO_LOCK). +This will be invoked before inplace_alter_table(). + +@param altered_table TABLE object for new version of table. +@param ha_alter_info Structure describing changes to be done +by ALTER TABLE and holding data used during in-place alter.
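+ +Before any dictionary change is made, this function validates the requested changes: reserved index names, column renames, index key sanity, column prefix length limits, and the FOREIGN KEY and FULLTEXT constraints involved.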
+ +@retval true Failure +@retval false Success +*/ + +bool +ha_innobase::prepare_inplace_alter_table( +/*=====================================*/ + TABLE* altered_table, + Alter_inplace_info* ha_alter_info) +{ + dict_index_t** drop_index; /*!< Index to be dropped */ + ulint n_drop_index; /*!< Number of indexes to drop */ + dict_foreign_t**drop_fk; /*!< Foreign key constraints to drop */ + ulint n_drop_fk; /*!< Number of foreign keys to drop */ + dict_foreign_t**add_fk = NULL; /*!< Foreign key constraints to add */ + ulint n_add_fk= 0; /*!< Number of foreign keys to add */ + dict_table_t* indexed_table; /*!< Table where indexes are created */ + mem_heap_t* heap; + const char** col_names; + int error; + ulint add_autoinc_col_no = ULINT_UNDEFINED; + ulonglong autoinc_col_max_value = 0; + ulint fts_doc_col_no = ULINT_UNDEFINED; + bool add_fts_doc_id = false; + bool add_fts_doc_id_idx = false; + bool add_fts_idx = false; + dict_s_col_list*s_cols = NULL; + mem_heap_t* s_heap = NULL; + + DBUG_ENTER("prepare_inplace_alter_table"); + DBUG_ASSERT(!ha_alter_info->handler_ctx); + DBUG_ASSERT(ha_alter_info->create_info); + DBUG_ASSERT(!srv_read_only_mode); + + /* Init online ddl status variables */ + onlineddl_rowlog_rows = 0; + onlineddl_rowlog_pct_used = 0; + onlineddl_pct_progress = 0; + + MONITOR_ATOMIC_INC(MONITOR_PENDING_ALTER_TABLE); + +#ifdef UNIV_DEBUG + for (dict_index_t* index = dict_table_get_first_index(m_prebuilt->table); + index; + index = dict_table_get_next_index(index)) { + ut_ad(!index->to_be_dropped); + } +#endif /* UNIV_DEBUG */ + + ut_d(dict_sys.freeze(SRW_LOCK_CALL)); + ut_d(dict_table_check_for_dup_indexes( + m_prebuilt->table, CHECK_ABORTED_OK)); + ut_d(dict_sys.unfreeze()); + + if (!(ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE)) { + /* Nothing to do */ + DBUG_ASSERT(!m_prebuilt->trx->dict_operation_lock_mode); + m_prebuilt->trx_id = 0; + DBUG_RETURN(false); + } + +#ifdef WITH_PARTITION_STORAGE_ENGINE + if (table->part_info == NULL) { +#endif + /* Ignore the MDL downgrade when table is empty. + This optimization is disabled for partition table. */ + ha_alter_info->mdl_exclusive_after_prepare = + innobase_table_is_empty(m_prebuilt->table, false); + if (ha_alter_info->online + && ha_alter_info->mdl_exclusive_after_prepare) { + ha_alter_info->online = false; + } +#ifdef WITH_PARTITION_STORAGE_ENGINE + } +#endif + indexed_table = m_prebuilt->table; + + /* ALTER TABLE will not implicitly move a table from a single-table + tablespace to the system tablespace when innodb_file_per_table=OFF. + But it will implicitly move a table from the system tablespace to a + single-table tablespace if innodb_file_per_table = ON. */ + + create_table_info_t info(m_user_thd, + altered_table, + ha_alter_info->create_info, + NULL, + NULL, + srv_file_per_table); + + info.set_tablespace_type(indexed_table->space != fil_system.sys_space); + + if (ha_alter_info->handler_flags & ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX) { + if (info.gcols_in_fulltext_or_spatial()) { + goto err_exit_no_heap; + } + } + + if (indexed_table->is_readable()) { + } else { + if (indexed_table->corrupted) { + /* Handled below */ + } else { + if (const fil_space_t* space = indexed_table->space) { + String str; + const char* engine= table_type(); + + push_warning_printf( + m_user_thd, + Sql_condition::WARN_LEVEL_WARN, + HA_ERR_DECRYPTION_FAILED, + "Table %s in file %s is encrypted but encryption service or" + " used key_id is not available. "
" + " Can't continue reading table.", + table_share->table_name.str, + space->chain.start->name); + + my_error(ER_GET_ERRMSG, MYF(0), HA_ERR_DECRYPTION_FAILED, str.c_ptr(), engine); + DBUG_RETURN(true); + } + } + } + + if (indexed_table->corrupted + || dict_table_get_first_index(indexed_table) == NULL + || dict_table_get_first_index(indexed_table)->is_corrupted()) { + /* The clustered index is corrupted. */ + my_error(ER_CHECK_NO_SUCH_TABLE, MYF(0)); + DBUG_RETURN(true); + } else { + const char* invalid_opt = info.create_options_are_invalid(); + + /* Check engine specific table options */ + if (const char* invalid_tbopt = info.check_table_options()) { + my_error(ER_ILLEGAL_HA_CREATE_OPTION, MYF(0), + table_type(), invalid_tbopt); + goto err_exit_no_heap; + } + + if (invalid_opt) { + my_error(ER_ILLEGAL_HA_CREATE_OPTION, MYF(0), + table_type(), invalid_opt); + goto err_exit_no_heap; + } + } + + /* Check if any index name is reserved. */ + if (innobase_index_name_is_reserved( + m_user_thd, + ha_alter_info->key_info_buffer, + ha_alter_info->key_count)) { +err_exit_no_heap: + DBUG_ASSERT(!m_prebuilt->trx->dict_operation_lock_mode); + online_retry_drop_indexes(m_prebuilt->table, m_user_thd); + DBUG_RETURN(true); + } + + indexed_table = m_prebuilt->table; + + /* Check that index keys are sensible */ + error = innobase_check_index_keys(ha_alter_info, indexed_table); + + if (error) { + goto err_exit_no_heap; + } + + /* Prohibit renaming a column to something that the table + already contains. */ + if (ha_alter_info->handler_flags + & ALTER_COLUMN_NAME) { + for (Field** fp = table->field; *fp; fp++) { + if (!((*fp)->flags & FIELD_IS_RENAMED)) { + continue; + } + + const char* name = 0; + + for (const Create_field& cf : + ha_alter_info->alter_info->create_list) { + if (cf.field == *fp) { + name = cf.field_name.str; + goto check_if_ok_to_rename; + } + } + + ut_error; +check_if_ok_to_rename: + /* Prohibit renaming a column from FTS_DOC_ID + if full-text indexes exist. */ + if (!my_strcasecmp(system_charset_info, + (*fp)->field_name.str, + FTS_DOC_ID_COL_NAME) + && innobase_fulltext_exist(altered_table)) { + my_error(ER_INNODB_FT_WRONG_DOCID_COLUMN, + MYF(0), name); + goto err_exit_no_heap; + } + + /* Prohibit renaming a column to an internal column. */ + const char* s = m_prebuilt->table->col_names; + unsigned j; + /* Skip user columns. + MySQL should have checked these already. + We want to allow renaming of c1 to c2, c2 to c1. */ + for (j = 0; j < table->s->fields; j++) { + if (table->field[j]->stored_in_db()) { + s += strlen(s) + 1; + } + } + + for (; j < m_prebuilt->table->n_def; j++) { + if (!my_strcasecmp( + system_charset_info, name, s)) { + my_error(ER_WRONG_COLUMN_NAME, MYF(0), + s); + goto err_exit_no_heap; + } + + s += strlen(s) + 1; + } + } + } + + if (!info.innobase_table_flags()) { + my_error(ER_ILLEGAL_HA_CREATE_OPTION, MYF(0), + table_type(), "PAGE_COMPRESSED"); + goto err_exit_no_heap; + } + + if (info.flags2() & DICT_TF2_USE_FILE_PER_TABLE) { + /* Preserve the DATA DIRECTORY attribute, because it + currently cannot be changed during ALTER TABLE. */ + info.flags_set(m_prebuilt->table->flags + & 1U << DICT_TF_POS_DATA_DIR); + } + + + /* ALGORITHM=INPLACE without rebuild (10.3+ ALGORITHM=NOCOPY) + must use the current ROW_FORMAT of the table. */ + const ulint max_col_len = DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG( + innobase_need_rebuild(ha_alter_info, this->table) + ? 
info.flags() + : m_prebuilt->table->flags); + + /* Check each index's column length to make sure they do not + exceed limit */ + for (ulint i = 0; i < ha_alter_info->key_count; i++) { + const KEY* key = &ha_alter_info->key_info_buffer[i]; + + if (key->flags & HA_FULLTEXT) { + /* The column length does not matter for + fulltext search indexes. But, UNIQUE + fulltext indexes are not supported. */ + DBUG_ASSERT(!(key->flags & HA_NOSAME)); + DBUG_ASSERT(!(key->flags & HA_KEYFLAG_MASK + & ~(HA_FULLTEXT + | HA_PACK_KEY + | HA_BINARY_PACK_KEY))); + add_fts_idx = true; + continue; + } + + if (too_big_key_part_length(max_col_len, *key)) { + my_error(ER_INDEX_COLUMN_TOO_LONG, MYF(0), + max_col_len); + goto err_exit_no_heap; + } + } + + /* We won't be allowed to add fts index to a table with + fts indexes already but without AUX_HEX_NAME set. + This means the aux tables of the table failed to + rename to hex format but new created aux tables + shall be in hex format, which is contradictory. */ + if (!DICT_TF2_FLAG_IS_SET(indexed_table, DICT_TF2_FTS_AUX_HEX_NAME) + && indexed_table->fts != NULL && add_fts_idx) { + my_error(ER_INNODB_FT_AUX_NOT_HEX_ID, MYF(0)); + goto err_exit_no_heap; + } + + /* Check existing index definitions for too-long column + prefixes as well, in case max_col_len shrunk. */ + for (const dict_index_t* index + = dict_table_get_first_index(indexed_table); + index; + index = dict_table_get_next_index(index)) { + if (index->type & DICT_FTS) { + DBUG_ASSERT(index->type == DICT_FTS + || (index->type & DICT_CORRUPT)); + + /* We need to drop any corrupted fts indexes + before we add a new fts index. */ + if (add_fts_idx && index->type & DICT_CORRUPT) { + ib_errf(m_user_thd, IB_LOG_LEVEL_ERROR, + ER_INNODB_INDEX_CORRUPT, + "Fulltext index '%s' is corrupt. " + "you should drop this index first.", + index->name()); + + goto err_exit_no_heap; + } + + continue; + } + + for (ulint i = 0; i < dict_index_get_n_fields(index); i++) { + const dict_field_t* field + = dict_index_get_nth_field(index, i); + if (field->prefix_len > max_col_len) { + my_error(ER_INDEX_COLUMN_TOO_LONG, MYF(0), + max_col_len); + goto err_exit_no_heap; + } + } + } + + n_drop_index = 0; + n_drop_fk = 0; + + if (ha_alter_info->handler_flags + & (INNOBASE_ALTER_NOREBUILD | INNOBASE_ALTER_REBUILD + | INNOBASE_ALTER_INSTANT)) { + heap = mem_heap_create(1024); + + if (ha_alter_info->handler_flags + & ALTER_COLUMN_NAME) { + col_names = innobase_get_col_names( + ha_alter_info, altered_table, table, + indexed_table, heap); + } else { + col_names = NULL; + } + } else { + heap = NULL; + col_names = NULL; + } + + if (ha_alter_info->handler_flags + & ALTER_DROP_FOREIGN_KEY) { + DBUG_ASSERT(ha_alter_info->alter_info->drop_list.elements > 0); + + drop_fk = static_cast( + mem_heap_alloc( + heap, + ha_alter_info->alter_info->drop_list.elements + * sizeof(dict_foreign_t*))); + + for (Alter_drop& drop : ha_alter_info->alter_info->drop_list) { + if (drop.type != Alter_drop::FOREIGN_KEY) { + continue; + } + + dict_foreign_t* foreign; + + for (dict_foreign_set::iterator it + = m_prebuilt->table->foreign_set.begin(); + it != m_prebuilt->table->foreign_set.end(); + ++it) { + + foreign = *it; + const char* fid = strchr(foreign->id, '/'); + + DBUG_ASSERT(fid); + /* If no database/ prefix was present in + the FOREIGN KEY constraint name, compare + to the full constraint name. */ + fid = fid ? 
fid + 1 : foreign->id; + + if (!my_strcasecmp(system_charset_info, + fid, drop.name)) { + goto found_fk; + } + } + + my_error(ER_CANT_DROP_FIELD_OR_KEY, MYF(0), + drop.type_name(), drop.name); + goto err_exit; +found_fk: + for (ulint i = n_drop_fk; i--; ) { + if (drop_fk[i] == foreign) { + goto dup_fk; + } + } + drop_fk[n_drop_fk++] = foreign; +dup_fk: + continue; + } + + DBUG_ASSERT(n_drop_fk > 0); + + DBUG_ASSERT(n_drop_fk + <= ha_alter_info->alter_info->drop_list.elements); + } else { + drop_fk = NULL; + } + + if (ha_alter_info->index_drop_count) { + dict_index_t* drop_primary = NULL; + + DBUG_ASSERT(ha_alter_info->handler_flags + & (ALTER_DROP_NON_UNIQUE_NON_PRIM_INDEX + | ALTER_DROP_UNIQUE_INDEX + | ALTER_DROP_PK_INDEX)); + /* Check which indexes to drop. */ + drop_index = static_cast( + mem_heap_alloc( + heap, (ha_alter_info->index_drop_count + 1) + * sizeof *drop_index)); + + for (uint i = 0; i < ha_alter_info->index_drop_count; i++) { + const KEY* key + = ha_alter_info->index_drop_buffer[i]; + dict_index_t* index + = dict_table_get_index_on_name( + indexed_table, key->name.str); + + if (!index) { + push_warning_printf( + m_user_thd, + Sql_condition::WARN_LEVEL_WARN, + HA_ERR_WRONG_INDEX, + "InnoDB could not find key" + " with name %s", key->name.str); + } else { + ut_ad(!index->to_be_dropped); + if (!index->is_primary()) { + drop_index[n_drop_index++] = index; + } else { + drop_primary = index; + } + } + } + + /* If all FULLTEXT indexes were removed, drop an + internal FTS_DOC_ID_INDEX as well, unless it exists in + the table. */ + + if (innobase_fulltext_exist(table) + && !innobase_fulltext_exist(altered_table) + && !DICT_TF2_FLAG_IS_SET( + indexed_table, DICT_TF2_FTS_HAS_DOC_ID)) { + dict_index_t* fts_doc_index + = indexed_table->fts_doc_id_index; + ut_ad(fts_doc_index); + + // Add some fault tolerance for non-debug builds. + if (fts_doc_index == NULL) { + goto check_if_can_drop_indexes; + } + + DBUG_ASSERT(!fts_doc_index->to_be_dropped); + + for (uint i = 0; i < table->s->keys; i++) { + if (!my_strcasecmp( + system_charset_info, + FTS_DOC_ID_INDEX_NAME, + table->key_info[i].name.str)) { + /* The index exists in the MySQL + data dictionary. Do not drop it, + even though it is no longer needed + by InnoDB fulltext search. */ + goto check_if_can_drop_indexes; + } + } + + drop_index[n_drop_index++] = fts_doc_index; + } + +check_if_can_drop_indexes: + /* Check if the indexes can be dropped. */ + + /* Prevent a race condition between DROP INDEX and + CREATE TABLE adding FOREIGN KEY constraints. */ + row_mysql_lock_data_dictionary(m_prebuilt->trx); + + if (!n_drop_index) { + drop_index = NULL; + } else { + /* Flag all indexes that are to be dropped. 
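+
+/* Illustrative sketch (not part of the upstream patch): the loop above
+matches a DROP FOREIGN KEY name against dict_foreign_t::id, which InnoDB
+stores as "db_name/constraint_name" (the prefix may be absent). A minimal
+self-contained version of the suffix comparison; plain strcasecmp() stands
+in for my_strcasecmp(system_charset_info, ...): */
+#include <cstring>	/* strchr() */
+#include <strings.h>	/* strcasecmp(); POSIX */
+
+/* Return true when the stored id "db/name" (or bare "name") refers to the
+constraint the user asked to drop. */
+static bool foreign_id_matches(const char* stored_id, const char* drop_name)
+{
+	const char* name = strchr(stored_id, '/');
+	name = name ? name + 1 : stored_id;	/* skip "db/", if present */
+	return strcasecmp(name, drop_name) == 0;
+}
+/* e.g. foreign_id_matches("test/fk_child", "FK_CHILD") is true. */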
*/ + for (ulint i = 0; i < n_drop_index; i++) { + ut_ad(!drop_index[i]->to_be_dropped); + drop_index[i]->to_be_dropped = 1; + } + } + + if (m_prebuilt->trx->check_foreigns) { + for (uint i = 0; i < n_drop_index; i++) { + dict_index_t* index = drop_index[i]; + + if (innobase_check_foreign_key_index( + ha_alter_info, index, + indexed_table, col_names, + m_prebuilt->trx, drop_fk, n_drop_fk)) { + row_mysql_unlock_data_dictionary( + m_prebuilt->trx); + m_prebuilt->trx->error_info = index; + print_error(HA_ERR_DROP_INDEX_FK, + MYF(0)); + goto err_exit; + } + } + + /* If a primary index is dropped, need to check + any depending foreign constraints get affected */ + if (drop_primary + && innobase_check_foreign_key_index( + ha_alter_info, drop_primary, + indexed_table, col_names, + m_prebuilt->trx, drop_fk, n_drop_fk)) { + row_mysql_unlock_data_dictionary(m_prebuilt->trx); + print_error(HA_ERR_DROP_INDEX_FK, MYF(0)); + goto err_exit; + } + } + + row_mysql_unlock_data_dictionary(m_prebuilt->trx); + } else { + drop_index = NULL; + } + + /* Check if any of the existing indexes are marked as corruption + and if they are, refuse adding more indexes. */ + if (ha_alter_info->handler_flags & ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX) { + for (dict_index_t* index = dict_table_get_first_index(indexed_table); + index != NULL; index = dict_table_get_next_index(index)) { + + if (!index->to_be_dropped && index->is_committed() + && index->is_corrupted()) { + my_error(ER_INDEX_CORRUPT, MYF(0), index->name()); + goto err_exit; + } + } + } + + if (ha_alter_info->handler_flags + & ALTER_ADD_FOREIGN_KEY) { + ut_ad(!m_prebuilt->trx->check_foreigns); + + alter_fill_stored_column(altered_table, m_prebuilt->table, + &s_cols, &s_heap); + + add_fk = static_cast( + mem_heap_zalloc( + heap, + ha_alter_info->alter_info->key_list.elements + * sizeof(dict_foreign_t*))); + + if (!innobase_get_foreign_key_info( + ha_alter_info, table_share, + m_prebuilt->table, col_names, + drop_index, n_drop_index, + add_fk, &n_add_fk, m_prebuilt->trx, s_cols)) { +err_exit: + if (n_drop_index) { + row_mysql_lock_data_dictionary(m_prebuilt->trx); + + /* Clear the to_be_dropped flags, which might + have been set at this point. 
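+
+/* Illustrative sketch (not part of the upstream patch): the code above sets
+index->to_be_dropped before validating FOREIGN KEY dependencies and must
+clear the flags again on every error path (see the err_exit label below).
+The same protocol expressed as a scope guard; IndexStub is a hypothetical
+stand-in for dict_index_t: */
+#include <cstddef>
+
+struct IndexStub { unsigned to_be_dropped:1; };
+
+class ToBeDroppedGuard {
+	IndexStub**	m_idx;
+	size_t		m_n;
+	bool		m_committed = false;
+public:
+	ToBeDroppedGuard(IndexStub** idx, size_t n) : m_idx(idx), m_n(n) {
+		for (size_t i = 0; i < n; i++) idx[i]->to_be_dropped = 1;
+	}
+	void commit() { m_committed = true; }
+	~ToBeDroppedGuard() {
+		/* roll the flags back unless commit() was reached */
+		if (!m_committed)
+			for (size_t i = 0; i < m_n; i++)
+				m_idx[i]->to_be_dropped = 0;
+	}
+};
+/* The handler keeps explicit goto-based cleanup instead, mainly because the
+flags may only be toggled while holding the data dictionary latch. */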
*/ + for (ulint i = 0; i < n_drop_index; i++) { + ut_ad(drop_index[i]->is_committed()); + drop_index[i]->to_be_dropped = 0; + } + + row_mysql_unlock_data_dictionary( + m_prebuilt->trx); + } + + for (uint i = 0; i < n_add_fk; i++) { + if (add_fk[i]) { + dict_foreign_free(add_fk[i]); + } + } + + if (heap) { + mem_heap_free(heap); + } + + if (s_cols != NULL) { + UT_DELETE(s_cols); + mem_heap_free(s_heap); + } + + goto err_exit_no_heap; + } + + if (s_cols != NULL) { + UT_DELETE(s_cols); + mem_heap_free(s_heap); + } + } + + /** Alter shouldn't support if the foreign and referenced + index columns are modified */ + if (ha_alter_info->handler_flags + & ALTER_COLUMN_TYPE_CHANGE_BY_ENGINE) { + + for (uint i= 0, n_v_col= 0; i < table->s->fields; + i++) { + Field* field = table->field[i]; + + /* Altering the virtual column is not + supported for inplace alter algorithm */ + if (field->vcol_info) { + n_v_col++; + continue; + } + + for (const Create_field& new_field : + ha_alter_info->alter_info->create_list) { + if (new_field.field == field) { + if (!field->is_equal(new_field)) { + goto field_changed; + } + break; + } + } + + continue; +field_changed: + const char* col_name= field->field_name.str; + dict_col_t *col= dict_table_get_nth_col( + m_prebuilt->table, i - n_v_col); + if (check_col_is_in_fk_indexes( + m_prebuilt->table, col, col_name, + span( + const_cast( + drop_fk), n_drop_fk), + span( + const_cast( + add_fk), n_add_fk))) + goto err_exit; + } + } + + if (ha_alter_info->handler_flags & ALTER_RENAME_INDEX) { + for (const Alter_inplace_info::Rename_key_pair& pair : + ha_alter_info->rename_keys) { + dict_index_t* index = dict_table_get_index_on_name( + indexed_table, pair.old_key->name.str); + + if (!index || index->is_corrupted()) { + my_error(ER_INDEX_CORRUPT, MYF(0), + index->name()); + goto err_exit; + } + } + } + + const ha_table_option_struct& alt_opt= + *ha_alter_info->create_info->option_struct; + + ha_innobase_inplace_ctx *ctx = NULL; + + if (!(ha_alter_info->handler_flags & INNOBASE_ALTER_DATA) + || ((ha_alter_info->handler_flags & ~(INNOBASE_INPLACE_IGNORE + | INNOBASE_ALTER_NOCREATE + | INNOBASE_ALTER_INSTANT)) + == ALTER_OPTIONS + && !alter_options_need_rebuild(ha_alter_info, table))) { + + DBUG_ASSERT(!m_prebuilt->trx->dict_operation_lock_mode); + online_retry_drop_indexes(m_prebuilt->table, m_user_thd); + + if (heap) { + ctx = new ha_innobase_inplace_ctx( + m_prebuilt, + drop_index, n_drop_index, + drop_fk, n_drop_fk, + add_fk, n_add_fk, + ha_alter_info->online, + heap, indexed_table, + col_names, ULINT_UNDEFINED, 0, 0, + (ha_alter_info->ignore + || !thd_is_strict_mode(m_user_thd)), + alt_opt.page_compressed, + alt_opt.page_compression_level); + ha_alter_info->handler_ctx = ctx; + } + + if ((ha_alter_info->handler_flags + & ALTER_DROP_VIRTUAL_COLUMN) + && prepare_inplace_drop_virtual(ha_alter_info, table)) { + DBUG_RETURN(true); + } + + if ((ha_alter_info->handler_flags + & ALTER_ADD_VIRTUAL_COLUMN) + && prepare_inplace_add_virtual( + ha_alter_info, altered_table, table)) { + DBUG_RETURN(true); + } + + if (!(ha_alter_info->handler_flags & INNOBASE_ALTER_DATA) + && alter_templ_needs_rebuild(altered_table, ha_alter_info, + ctx->new_table) + && ctx->new_table->n_v_cols > 0) { + /* Changing maria record structure may end up here only + if virtual columns were altered. In this case, however, + vc_templ should be rebuilt. Since we don't actually + change any stored data, we can just dispose vc_templ; + it will be recreated on next ha_innobase::open(). 
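+
+/* Illustrative sketch (not part of the upstream patch): the type-change loop
+above converts a server-side field index into an InnoDB stored-column
+ordinal with i - n_v_col, because TABLE::field[] interleaves stored and
+virtual columns while InnoDB numbers them separately. A self-contained
+model of that mapping: */
+#include <vector>
+#include <cstddef>
+#include <cstdint>	/* SIZE_MAX */
+
+struct FieldStub { bool is_virtual; };
+
+/* Return the stored-column ordinal of fields[field_no], or SIZE_MAX when
+the field is virtual and has no stored ordinal. */
+static size_t stored_col_no(const std::vector<FieldStub>& fields,
+			    size_t field_no)
+{
+	size_t n_virtual = 0;	/* virtual fields seen before field_no */
+	for (size_t i = 0; i < field_no; i++)
+		if (fields[i].is_virtual)
+			n_virtual++;
+	return fields[field_no].is_virtual ? SIZE_MAX : field_no - n_virtual;
+}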
*/ + + DBUG_ASSERT(ctx->new_table == ctx->old_table); + + dict_free_vc_templ(ctx->new_table->vc_templ); + UT_DELETE(ctx->new_table->vc_templ); + + ctx->new_table->vc_templ = NULL; + } + + +success: + /* Memorize the future transaction ID for committing + the data dictionary change, to be reported by + ha_innobase::table_version(). */ + m_prebuilt->trx_id = (ha_alter_info->handler_flags + & ~INNOBASE_INPLACE_IGNORE) + ? static_cast + (ha_alter_info->handler_ctx)->trx->id + : 0; + DBUG_RETURN(false); + } + + /* If we are to build a full-text search index, check whether + the table already has a DOC ID column. If not, we will need to + add a Doc ID hidden column and rebuild the primary index */ + if (innobase_fulltext_exist(altered_table)) { + ulint doc_col_no; + ulint num_v = 0; + + if (!innobase_fts_check_doc_id_col( + m_prebuilt->table, + altered_table, &fts_doc_col_no, &num_v)) { + + fts_doc_col_no = altered_table->s->fields - num_v; + add_fts_doc_id = true; + add_fts_doc_id_idx = true; + + } else if (fts_doc_col_no == ULINT_UNDEFINED) { + goto err_exit; + } + + switch (innobase_fts_check_doc_id_index( + m_prebuilt->table, altered_table, + &doc_col_no)) { + case FTS_NOT_EXIST_DOC_ID_INDEX: + add_fts_doc_id_idx = true; + break; + case FTS_INCORRECT_DOC_ID_INDEX: + my_error(ER_INNODB_FT_WRONG_DOCID_INDEX, MYF(0), + FTS_DOC_ID_INDEX_NAME); + goto err_exit; + case FTS_EXIST_DOC_ID_INDEX: + DBUG_ASSERT( + doc_col_no == fts_doc_col_no + || doc_col_no == ULINT_UNDEFINED + || (ha_alter_info->handler_flags + & (ALTER_STORED_COLUMN_ORDER + | ALTER_DROP_STORED_COLUMN + | ALTER_ADD_STORED_BASE_COLUMN))); + } + } + + /* See if an AUTO_INCREMENT column was added. */ + uint i = 0; + ulint num_v = 0; + for (const Create_field& new_field : + ha_alter_info->alter_info->create_list) { + const Field* field; + + DBUG_ASSERT(i < altered_table->s->fields); + + for (uint old_i = 0; table->field[old_i]; old_i++) { + if (new_field.field == table->field[old_i]) { + goto found_col; + } + } + + /* This is an added column. */ + DBUG_ASSERT(!new_field.field); + DBUG_ASSERT(ha_alter_info->handler_flags + & ALTER_ADD_COLUMN); + + field = altered_table->field[i]; + + DBUG_ASSERT((field->unireg_check + == Field::NEXT_NUMBER) + == !!(field->flags & AUTO_INCREMENT_FLAG)); + + if (field->flags & AUTO_INCREMENT_FLAG) { + if (add_autoinc_col_no != ULINT_UNDEFINED) { + /* This should have been blocked earlier. 
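+
+/* Illustrative sketch (not part of the upstream patch): the block above
+decides whether the rebuild must add the hidden FTS_DOC_ID column and/or
+FTS_DOC_ID_INDEX. The decision, condensed into a pure function over the
+results of the two probes (hypothetical stand-ins for
+innobase_fts_check_doc_id_col() and innobase_fts_check_doc_id_index()): */
+enum class DocIdIndexState { MISSING, INCORRECT, EXISTS };
+
+struct FtsDocIdPlan {
+	bool add_doc_id_col;
+	bool add_doc_id_index;
+	bool fail;	/* an incompatible FTS_DOC_ID_INDEX already exists */
+};
+
+static FtsDocIdPlan plan_fts_doc_id(bool have_doc_id_col,
+				    DocIdIndexState idx)
+{
+	FtsDocIdPlan p = {false, false, false};
+	if (!have_doc_id_col)
+		/* adding the column implies adding its unique index */
+		p.add_doc_id_col = p.add_doc_id_index = true;
+	switch (idx) {
+	case DocIdIndexState::MISSING:	 p.add_doc_id_index = true; break;
+	case DocIdIndexState::INCORRECT: p.fail = true; break;
+	case DocIdIndexState::EXISTS:	 break;
+	}
+	return p;
+}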
*/ + ut_ad(0); + my_error(ER_WRONG_AUTO_KEY, MYF(0)); + goto err_exit; + } + + /* Get the col no of the old table non-virtual column array */ + add_autoinc_col_no = i - num_v; + + autoinc_col_max_value = innobase_get_int_col_max_value(field); + } +found_col: + num_v += !new_field.stored_in_db(); + i++; + } + + DBUG_ASSERT(heap); + DBUG_ASSERT(m_user_thd == m_prebuilt->trx->mysql_thd); + DBUG_ASSERT(!ha_alter_info->handler_ctx); + + ha_alter_info->handler_ctx = new ha_innobase_inplace_ctx( + m_prebuilt, + drop_index, n_drop_index, + drop_fk, n_drop_fk, add_fk, n_add_fk, + ha_alter_info->online, + heap, m_prebuilt->table, col_names, + add_autoinc_col_no, + ha_alter_info->create_info->auto_increment_value, + autoinc_col_max_value, + ha_alter_info->ignore || !thd_is_strict_mode(m_user_thd), + alt_opt.page_compressed, alt_opt.page_compression_level); + + if (!prepare_inplace_alter_table_dict( + ha_alter_info, altered_table, table, + table_share->table_name.str, + info.flags(), info.flags2(), + fts_doc_col_no, add_fts_doc_id, + add_fts_doc_id_idx)) { + goto success; + } + + DBUG_RETURN(true); +} + +/* Check whether a columnn length change alter operation requires +to rebuild the template. +@param[in] altered_table TABLE object for new version of table. +@param[in] ha_alter_info Structure describing changes to be done + by ALTER TABLE and holding data used + during in-place alter. +@param[in] table table being altered +@return TRUE if needs rebuild. */ +static +bool +alter_templ_needs_rebuild( + const TABLE* altered_table, + const Alter_inplace_info* ha_alter_info, + const dict_table_t* table) +{ + ulint i = 0; + + for (Field** fp = altered_table->field; *fp; fp++, i++) { + for (const Create_field& cf : + ha_alter_info->alter_info->create_list) { + for (ulint j=0; j < table->n_cols; j++) { + dict_col_t* cols + = dict_table_get_nth_col(table, j); + if (cf.length > cols->len) { + return(true); + } + } + } + } + + return(false); +} + +/** Alter the table structure in-place with operations +specified using Alter_inplace_info. +The level of concurrency allowed during this operation depends +on the return value from check_if_supported_inplace_alter(). + +@param altered_table TABLE object for new version of table. +@param ha_alter_info Structure describing changes to be done +by ALTER TABLE and holding data used during in-place alter. 
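+
+/* Illustrative sketch (not part of the upstream patch): the helper defined
+above decides whether the cached virtual-column template is stale because a
+column definition grew. A simplified, self-contained version comparing new
+lengths against the lengths the template was built for (the InnoDB helper
+is coarser and compares every Create_field against every column): */
+#include <vector>
+#include <cstdint>
+
+static bool templ_needs_rebuild(const std::vector<uint32_t>& new_lens,
+				const std::vector<uint32_t>& cached_lens)
+{
+	for (size_t i = 0; i < new_lens.size() && i < cached_lens.size(); i++)
+		if (new_lens[i] > cached_lens[i])
+			return true;	/* some column got longer */
+	return false;
+}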
+ +@retval true Failure +@retval false Success +*/ + +bool +ha_innobase::inplace_alter_table( +/*=============================*/ + TABLE* altered_table, + Alter_inplace_info* ha_alter_info) +{ + dberr_t error; + dict_add_v_col_t* add_v = NULL; + dict_vcol_templ_t* s_templ = NULL; + dict_vcol_templ_t* old_templ = NULL; + struct TABLE* eval_table = altered_table; + bool rebuild_templ = false; + DBUG_ENTER("inplace_alter_table"); + DBUG_ASSERT(!srv_read_only_mode); + + DEBUG_SYNC(m_user_thd, "innodb_inplace_alter_table_enter"); + + /* Ignore the inplace alter phase when table is empty */ + if (!(ha_alter_info->handler_flags & INNOBASE_ALTER_DATA) + || ha_alter_info->mdl_exclusive_after_prepare) { +ok_exit: + DEBUG_SYNC(m_user_thd, "innodb_after_inplace_alter_table"); + DBUG_RETURN(false); + } + + if ((ha_alter_info->handler_flags & ~(INNOBASE_INPLACE_IGNORE + | INNOBASE_ALTER_NOCREATE + | INNOBASE_ALTER_INSTANT)) + == ALTER_OPTIONS + && !alter_options_need_rebuild(ha_alter_info, table)) { + goto ok_exit; + } + + ha_innobase_inplace_ctx* ctx + = static_cast + (ha_alter_info->handler_ctx); + + DBUG_ASSERT(ctx); + DBUG_ASSERT(ctx->trx); + DBUG_ASSERT(ctx->prebuilt == m_prebuilt); + + if (ctx->is_instant()) goto ok_exit; + + dict_index_t* pk = dict_table_get_first_index(m_prebuilt->table); + ut_ad(pk != NULL); + + /* For partitioned tables this could be already allocated from a + previous partition invocation. For normal tables this is NULL. */ + UT_DELETE(ctx->m_stage); + + ctx->m_stage = UT_NEW_NOKEY(ut_stage_alter_t(pk)); + + if (!m_prebuilt->table->is_readable()) { + goto all_done; + } + + /* If we are doing a table rebuilding or having added virtual + columns in the same clause, we will need to build a table template + that carries translation information between MySQL TABLE and InnoDB + table, which indicates the virtual columns and their base columns + info. This is used to do the computation callback, so that the + data in base columns can be extracted send to server. + If the Column length changes and it is a part of virtual + index then we need to rebuild the template. */ + rebuild_templ + = ctx->need_rebuild() + || ((ha_alter_info->handler_flags + & ALTER_COLUMN_TYPE_CHANGE_BY_ENGINE) + && alter_templ_needs_rebuild( + altered_table, ha_alter_info, ctx->new_table)); + + if ((ctx->new_table->n_v_cols > 0) && rebuild_templ) { + /* Save the templ if isn't NULL so as to restore the + original state in case of alter operation failures. */ + if (ctx->new_table->vc_templ != NULL && !ctx->need_rebuild()) { + old_templ = ctx->new_table->vc_templ; + } + s_templ = UT_NEW_NOKEY(dict_vcol_templ_t()); + + innobase_build_v_templ( + altered_table, ctx->new_table, s_templ, NULL, false); + + ctx->new_table->vc_templ = s_templ; + } else if (ctx->num_to_add_vcol > 0 && ctx->num_to_drop_vcol == 0) { + /* if there is ongoing drop virtual column, then we disallow + inplace add index on newly added virtual column, so it does + not need to come in here to rebuild template with add_v. 
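+
+/* Illustrative sketch (not part of the upstream patch): above, the current
+vc_templ pointer is remembered in old_templ before a freshly built template
+is installed, so a failed ALTER can put the original back. The same
+save/swap/restore pattern in isolation; TemplStub and TableStub are
+hypothetical stand-ins for dict_vcol_templ_t and dict_table_t: */
+struct TemplStub { int placeholder; };
+struct TableStub { TemplStub* vc_templ = nullptr; };
+
+struct TemplSwap {
+	TableStub&	table;
+	TemplStub*	saved;
+	TemplSwap(TableStub& t, TemplStub* fresh)
+		: table(t), saved(t.vc_templ) {
+		table.vc_templ = fresh;	/* install the new template */
+	}
+	void rollback() { table.vc_templ = saved; }	/* on ALTER failure */
+};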
+ Please also see the assertion in innodb_v_adjust_idx_col() */ + + s_templ = UT_NEW_NOKEY(dict_vcol_templ_t()); + + add_v = static_cast( + mem_heap_alloc(ctx->heap, sizeof *add_v)); + add_v->n_v_col = ctx->num_to_add_vcol; + add_v->v_col = ctx->add_vcol; + add_v->v_col_name = ctx->add_vcol_name; + + innobase_build_v_templ( + altered_table, ctx->new_table, s_templ, add_v, false); + old_templ = ctx->new_table->vc_templ; + ctx->new_table->vc_templ = s_templ; + } + + /* Drop virtual column without rebuild will keep dict table + unchanged, we use old table to evaluate virtual column value + in innobase_get_computed_value(). */ + if (!ctx->need_rebuild() && ctx->num_to_drop_vcol > 0) { + eval_table = table; + } + + /* Read the clustered index of the table and build + indexes based on this information using temporary + files and merge sort. */ + DBUG_EXECUTE_IF("innodb_OOM_inplace_alter", + error = DB_OUT_OF_MEMORY; goto oom;); + + error = row_merge_build_indexes( + m_prebuilt->trx, + m_prebuilt->table, ctx->new_table, + ctx->online, + ctx->add_index, ctx->add_key_numbers, ctx->num_to_add_index, + altered_table, ctx->defaults, ctx->col_map, + ctx->add_autoinc, ctx->sequence, ctx->skip_pk_sort, + ctx->m_stage, add_v, eval_table, ctx->allow_not_null, + ctx->change_col_collate.empty() + ? nullptr : &ctx->change_col_collate); + +#ifndef DBUG_OFF +oom: +#endif /* !DBUG_OFF */ + if (error == DB_SUCCESS && ctx->online && ctx->need_rebuild()) { + DEBUG_SYNC_C("row_log_table_apply1_before"); + error = row_log_table_apply( + ctx->thr, m_prebuilt->table, altered_table, + ctx->m_stage, ctx->new_table); + } + + /* Init online ddl status variables */ + onlineddl_rowlog_rows = 0; + onlineddl_rowlog_pct_used = 0; + onlineddl_pct_progress = 0; + + if (s_templ) { + ut_ad(ctx->need_rebuild() || ctx->num_to_add_vcol > 0 + || rebuild_templ); + dict_free_vc_templ(s_templ); + UT_DELETE(s_templ); + + ctx->new_table->vc_templ = old_templ; + } + + DEBUG_SYNC_C("inplace_after_index_build"); + + DBUG_EXECUTE_IF("create_index_fail", + error = DB_DUPLICATE_KEY; + m_prebuilt->trx->error_key_num = ULINT_UNDEFINED;); + + /* After an error, remove all those index definitions + from the dictionary which were defined. */ + + switch (error) { + KEY* dup_key; + default: + my_error_innodb(error, + table_share->table_name.str, + m_prebuilt->table->flags); + break; + all_done: + case DB_SUCCESS: + ut_d(dict_sys.freeze(SRW_LOCK_CALL)); + ut_d(dict_table_check_for_dup_indexes( + m_prebuilt->table, CHECK_PARTIAL_OK)); + ut_d(dict_sys.unfreeze()); + /* prebuilt->table->n_ref_count can be anything here, + given that we hold at most a shared lock on the table. */ + goto ok_exit; + case DB_DUPLICATE_KEY: + if (m_prebuilt->trx->error_key_num == ULINT_UNDEFINED + || ha_alter_info->key_count == 0) { + /* This should be the hidden index on + FTS_DOC_ID, or there is no PRIMARY KEY in the + table. Either way, we should be seeing and + reporting a bogus duplicate key error. 
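+
+/* Illustrative sketch (not part of the upstream patch): the calls above are
+the two phases of a LOCK=NONE rebuild: row_merge_build_indexes() bulk-loads
+the new indexes from a snapshot of the clustered index while concurrent DML
+is recorded in an online row log, and row_log_table_apply() then replays
+that log onto the rebuilt table. The control flow, schematically;
+build_indexes_from_snapshot() and replay_row_log() are hypothetical
+stand-ins: */
+enum class BuildErr { SUCCESS, DUPLICATE_KEY, LOG_TOO_BIG };
+
+static BuildErr build_indexes_from_snapshot();	/* phase 1: merge sort */
+static BuildErr replay_row_log();		/* phase 2: catch up DML */
+
+static BuildErr online_rebuild(bool online)
+{
+	BuildErr err = build_indexes_from_snapshot();
+	if (err == BuildErr::SUCCESS && online)
+		err = replay_row_log();
+	return err;
+}
+
+/* trivial stubs so the sketch stands alone */
+static BuildErr build_indexes_from_snapshot() { return BuildErr::SUCCESS; }
+static BuildErr replay_row_log() { return BuildErr::SUCCESS; }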
*/ + dup_key = NULL; + } else { + DBUG_ASSERT(m_prebuilt->trx->error_key_num + < ha_alter_info->key_count); + dup_key = &ha_alter_info->key_info_buffer[ + m_prebuilt->trx->error_key_num]; + } + print_keydup_error(altered_table, dup_key, MYF(0)); + break; + case DB_ONLINE_LOG_TOO_BIG: + DBUG_ASSERT(ctx->online); + my_error(ER_INNODB_ONLINE_LOG_TOO_BIG, MYF(0), + get_error_key_name(m_prebuilt->trx->error_key_num, + ha_alter_info, m_prebuilt->table)); + break; + case DB_INDEX_CORRUPT: + my_error(ER_INDEX_CORRUPT, MYF(0), + get_error_key_name(m_prebuilt->trx->error_key_num, + ha_alter_info, m_prebuilt->table)); + break; + case DB_DECRYPTION_FAILED: + String str; + const char* engine= table_type(); + get_error_message(HA_ERR_DECRYPTION_FAILED, &str); + my_error(ER_GET_ERRMSG, MYF(0), HA_ERR_DECRYPTION_FAILED, + str.c_ptr(), engine); + break; + } + + /* prebuilt->table->n_ref_count can be anything here, given + that we hold at most a shared lock on the table. */ + m_prebuilt->trx->error_info = NULL; + ctx->trx->error_state = DB_SUCCESS; + + DBUG_RETURN(true); +} + +/** Free the modification log for online table rebuild. +@param table table that was being rebuilt online */ +static +void +innobase_online_rebuild_log_free( +/*=============================*/ + dict_table_t* table) +{ + dict_index_t* clust_index = dict_table_get_first_index(table); + ut_ad(dict_sys.locked()); + clust_index->lock.x_lock(SRW_LOCK_CALL); + + if (clust_index->online_log) { + ut_ad(dict_index_get_online_status(clust_index) + == ONLINE_INDEX_CREATION); + clust_index->online_status = ONLINE_INDEX_COMPLETE; + row_log_free(clust_index->online_log); + clust_index->online_log = NULL; + DEBUG_SYNC_C("innodb_online_rebuild_log_free_aborted"); + } + + DBUG_ASSERT(dict_index_get_online_status(clust_index) + == ONLINE_INDEX_COMPLETE); + clust_index->lock.x_unlock(); +} + +/** For each user column, which is part of an index which is not going to be +dropped, it checks if the column number of the column is same as col_no +argument passed. +@param[in] table table +@param[in] col_no column number +@param[in] is_v if this is a virtual column +@param[in] only_committed whether to consider only committed indexes +@retval true column exists +@retval false column does not exist, true if column is system column or +it is in the index. */ +static +bool +check_col_exists_in_indexes( + const dict_table_t* table, + ulint col_no, + bool is_v, + bool only_committed = false) +{ + /* This function does not check system columns */ + if (!is_v && dict_table_get_nth_col(table, col_no)->mtype == DATA_SYS) { + return(true); + } + + for (const dict_index_t* index = dict_table_get_first_index(table); + index; + index = dict_table_get_next_index(index)) { + + if (only_committed + ? 
!index->is_committed() + : index->to_be_dropped) { + continue; + } + + for (ulint i = 0; i < index->n_user_defined_cols; i++) { + const dict_col_t* idx_col + = dict_index_get_nth_col(index, i); + + if (is_v && idx_col->is_virtual()) { + const dict_v_col_t* v_col = reinterpret_cast< + const dict_v_col_t*>(idx_col); + if (v_col->v_pos == col_no) { + return(true); + } + } + + if (!is_v && !idx_col->is_virtual() + && dict_col_get_no(idx_col) == col_no) { + return(true); + } + } + } + + return(false); +} + +/** Rollback a secondary index creation, drop the indexes with +temparary index prefix +@param user_table InnoDB table +@param table the TABLE +@param locked TRUE=table locked, FALSE=may need to do a lazy drop +@param trx the transaction +@param alter_trx transaction which takes S-lock on the table + while creating the index */ +static +void +innobase_rollback_sec_index( + dict_table_t* user_table, + const TABLE* table, + bool locked, + trx_t* trx, + const trx_t* alter_trx=NULL) +{ + row_merge_drop_indexes(trx, user_table, locked, alter_trx); + + /* Free the table->fts only if there is no FTS_DOC_ID + in the table */ + if (user_table->fts + && !DICT_TF2_FLAG_IS_SET(user_table, + DICT_TF2_FTS_HAS_DOC_ID) + && !innobase_fulltext_exist(table)) { + user_table->fts->~fts_t(); + user_table->fts = nullptr; + } +} + +MY_ATTRIBUTE((nonnull, warn_unused_result)) +/** Roll back the changes made during prepare_inplace_alter_table() +and inplace_alter_table() inside the storage engine. Note that the +allowed level of concurrency during this operation will be the same as +for inplace_alter_table() and thus might be higher than during +prepare_inplace_alter_table(). (E.g concurrent writes were blocked +during prepare, but might not be during commit). + +@param ha_alter_info Data used during in-place alter. +@param table the TABLE +@param prebuilt the prebuilt struct +@retval true Failure +@retval false Success +*/ +inline bool rollback_inplace_alter_table(Alter_inplace_info *ha_alter_info, + const TABLE *table, + row_prebuilt_t *prebuilt) +{ + bool fail= false; + ha_innobase_inplace_ctx *ctx= static_cast + (ha_alter_info->handler_ctx); + + DBUG_ENTER("rollback_inplace_alter_table"); + + DEBUG_SYNC_C("innodb_rollback_inplace_alter_table"); + if (!ctx) + /* If we have not started a transaction yet, + (almost) nothing has been or needs to be done. */ + dict_sys.lock(SRW_LOCK_CALL); + else if (ctx->trx->state == TRX_STATE_NOT_STARTED) + goto free_and_exit; + else if (ctx->new_table) + { + ut_ad(ctx->trx->state == TRX_STATE_ACTIVE); + const bool fts_exist= (ctx->new_table->flags2 & + (DICT_TF2_FTS_HAS_DOC_ID | DICT_TF2_FTS)) || + ctx->adding_fulltext_index(); + if (ctx->need_rebuild()) + { + if (fts_exist) + { + fts_optimize_remove_table(ctx->new_table); + purge_sys.stop_FTS(*ctx->new_table); + } + + dberr_t err= lock_table_for_trx(ctx->new_table, ctx->trx, LOCK_X); + if (fts_exist) + { + if (err == DB_SUCCESS) + err= fts_lock_common_tables(ctx->trx, *ctx->new_table); + for (const dict_index_t* index= ctx->new_table->indexes.start; + err == DB_SUCCESS && index; index= index->indexes.next) + if (index->type & DICT_FTS) + err= fts_lock_index_tables(ctx->trx, *index); + } + if (err == DB_SUCCESS) + err= lock_sys_tables(ctx->trx); + + row_mysql_lock_data_dictionary(ctx->trx); + /* Detach ctx->new_table from dict_index_t::online_log. 
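+
+/* Illustrative sketch (not part of the upstream patch): the helper above is
+used to decide whether dict_col_t::ord_part may be cleared, i.e. whether
+any surviving index still references the column. A self-contained model
+with plain integers as column numbers: */
+#include <vector>
+
+struct IndexModel {
+	bool			to_be_dropped;
+	std::vector<unsigned>	user_cols;	/* column numbers in the key */
+};
+
+static bool col_used_by_surviving_index(
+	const std::vector<IndexModel>&	indexes,
+	unsigned			col_no)
+{
+	for (const IndexModel& index : indexes) {
+		if (index.to_be_dropped)
+			continue;	/* this index is going away */
+		for (unsigned c : index.user_cols)
+			if (c == col_no)
+				return true;
+	}
+	return false;
+}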
*/ + innobase_online_rebuild_log_free(ctx->old_table); + + ut_d(const bool last_handle=) ctx->new_table->release(); + ut_ad(last_handle); + if (err == DB_SUCCESS) + err= ctx->trx->drop_table(*ctx->new_table); + + if (err == DB_SUCCESS) + for (const dict_index_t* index= ctx->new_table->indexes.start; index; + index= index->indexes.next) + if (index->type & DICT_FTS) + if (dberr_t err2= fts_drop_index_tables(ctx->trx, *index)) + err= err2; + + if (err != DB_SUCCESS) + { + my_error_innodb(err, table->s->table_name.str, ctx->new_table->flags); + fail= true; + } + } + else + { + DBUG_ASSERT(!(ha_alter_info->handler_flags & ALTER_ADD_PK_INDEX)); + DBUG_ASSERT(ctx->old_table == prebuilt->table); + uint &innodb_lock_wait_timeout= + thd_lock_wait_timeout(ctx->trx->mysql_thd); + const uint save_timeout= innodb_lock_wait_timeout; + innodb_lock_wait_timeout= ~0U; /* infinite */ + dict_index_t *old_clust_index= ctx->old_table->indexes.start; + old_clust_index->lock.x_lock(SRW_LOCK_CALL); + old_clust_index->online_log= nullptr; + old_clust_index->lock.x_unlock(); + if (fts_exist) + { + const dict_index_t *fts_index= nullptr; + for (ulint a= 0; a < ctx->num_to_add_index; a++) + { + const dict_index_t *index = ctx->add_index[a]; + if (index->type & DICT_FTS) + fts_index= index; + } + + /* Remove the fts table from fts_optimize_wq if there are + no FTS secondary index exist other than newly added one */ + if (fts_index && + (ib_vector_is_empty(prebuilt->table->fts->indexes) || + (ib_vector_size(prebuilt->table->fts->indexes) == 1 && + fts_index == static_cast( + ib_vector_getp(prebuilt->table->fts->indexes, 0))))) + fts_optimize_remove_table(prebuilt->table); + + purge_sys.stop_FTS(*prebuilt->table); + ut_a(!fts_index || !fts_lock_index_tables(ctx->trx, *fts_index)); + ut_a(!fts_lock_common_tables(ctx->trx, *ctx->new_table)); + ut_a(!lock_sys_tables(ctx->trx)); + } + else + { + ut_a(!lock_table_for_trx(dict_sys.sys_indexes, ctx->trx, LOCK_X)); + ut_a(!lock_table_for_trx(dict_sys.sys_fields, ctx->trx, LOCK_X)); + } + innodb_lock_wait_timeout= save_timeout; + DEBUG_SYNC_C("innodb_rollback_after_fts_lock"); + row_mysql_lock_data_dictionary(ctx->trx); + ctx->rollback_instant(); + innobase_rollback_sec_index(ctx->old_table, table, + ha_alter_info->alter_info->requested_lock == + Alter_info::ALTER_TABLE_LOCK_EXCLUSIVE, + ctx->trx, prebuilt->trx); + ctx->clean_new_vcol_index(); + ctx->cleanup_col_collation(); + ut_d(dict_table_check_for_dup_indexes(ctx->old_table, CHECK_ABORTED_OK)); + } + + DEBUG_SYNC(ctx->trx->mysql_thd, "before_commit_rollback_inplace"); + commit_unlock_and_unlink(ctx->trx); + if (fts_exist) + purge_sys.resume_FTS(); + if (ctx->old_table->fts) + { + dict_sys.lock(SRW_LOCK_CALL); + ut_ad(fts_check_cached_index(ctx->old_table)); + fts_optimize_add_table(ctx->old_table); + dict_sys.unlock(); + } + goto free_and_exit; + } + else + { +free_and_exit: + DBUG_ASSERT(ctx->prebuilt == prebuilt); + ctx->trx->free(); + ctx->trx= nullptr; + + dict_sys.lock(SRW_LOCK_CALL); + + if (ctx->add_vcol) + { + for (ulint i = 0; i < ctx->num_to_add_vcol; i++) + ctx->add_vcol[i].~dict_v_col_t(); + ctx->num_to_add_vcol= 0; + ctx->add_vcol= nullptr; + } + + for (ulint i= 0; i < ctx->num_to_add_fk; i++) + dict_foreign_free(ctx->add_fk[i]); + /* Clear the to_be_dropped flags in the data dictionary cache. + The flags may already have been cleared, in case an error was + detected in commit_inplace_alter_table(). 
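+
+/* Illustrative sketch (not part of the upstream patch): above, the session's
+innodb_lock_wait_timeout is forced to ~0U ("infinite") while the rollback
+acquires locks it must not abandon, and is then restored from save_timeout.
+The same save/override/restore protocol as an RAII guard: */
+struct LockTimeoutOverride {
+	unsigned&	var;	/* reference to the session variable */
+	unsigned	saved;
+	explicit LockTimeoutOverride(unsigned& v) : var(v), saved(v) {
+		var = ~0U;	/* wait forever; the rollback must succeed */
+	}
+	~LockTimeoutOverride() { var = saved; }
+};
+/* usage: { LockTimeoutOverride guard(timeout); take_locks(); } */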
*/ + for (ulint i= 0; i < ctx->num_to_drop_index; i++) + { + dict_index_t *index= ctx->drop_index[i]; + DBUG_ASSERT(index->is_committed()); + index->to_be_dropped= 0; + } + } + + DBUG_ASSERT(!prebuilt->table->indexes.start->online_log); + DBUG_ASSERT(prebuilt->table->indexes.start->online_status == + ONLINE_INDEX_COMPLETE); + + /* Reset dict_col_t::ord_part for unindexed columns */ + for (ulint i= 0; i < dict_table_get_n_cols(prebuilt->table); i++) + { + dict_col_t &col= prebuilt->table->cols[i]; + if (col.ord_part && !check_col_exists_in_indexes(prebuilt->table, i, false, + true)) + col.ord_part= 0; + } + + for (ulint i = 0; i < dict_table_get_n_v_cols(prebuilt->table); i++) + { + dict_col_t &col = prebuilt->table->v_cols[i].m_col; + if (col.ord_part && !check_col_exists_in_indexes(prebuilt->table, i, true, + true)) + col.ord_part= 0; + } + dict_sys.unlock(); + trx_commit_for_mysql(prebuilt->trx); + prebuilt->trx_id = 0; + MONITOR_ATOMIC_DEC(MONITOR_PENDING_ALTER_TABLE); + DBUG_RETURN(fail); +} + +/** Drop a FOREIGN KEY constraint from the data dictionary tables. +@param trx data dictionary transaction +@param table_name Table name in MySQL +@param foreign_id Foreign key constraint identifier +@retval true Failure +@retval false Success */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +bool +innobase_drop_foreign_try( +/*======================*/ + trx_t* trx, + const char* table_name, + const char* foreign_id) +{ + DBUG_ENTER("innobase_drop_foreign_try"); + + DBUG_ASSERT(trx->dict_operation); + ut_ad(trx->dict_operation_lock_mode); + ut_ad(dict_sys.locked()); + + /* Drop the constraint from the data dictionary. */ + static const char sql[] = + "PROCEDURE DROP_FOREIGN_PROC () IS\n" + "BEGIN\n" + "DELETE FROM SYS_FOREIGN WHERE ID=:id;\n" + "DELETE FROM SYS_FOREIGN_COLS WHERE ID=:id;\n" + "END;\n"; + + dberr_t error; + pars_info_t* info; + + info = pars_info_create(); + pars_info_add_str_literal(info, "id", foreign_id); + + trx->op_info = "dropping foreign key constraint from dictionary"; + error = que_eval_sql(info, sql, trx); + trx->op_info = ""; + + DBUG_EXECUTE_IF("ib_drop_foreign_error", + error = DB_OUT_OF_FILE_SPACE;); + + if (error != DB_SUCCESS) { + my_error_innodb(error, table_name, 0); + trx->error_state = DB_SUCCESS; + DBUG_RETURN(true); + } + + DBUG_RETURN(false); +} + +/** Rename a column in the data dictionary tables. 
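+
+/* Illustrative sketch (not part of the upstream patch): dictionary changes
+such as DROP_FOREIGN_PROC above are written as small InnoDB SQL procedures
+whose named parameters are bound through pars_info_t before que_eval_sql()
+runs them. The general shape, with exec_dict_sql() as a hypothetical
+stand-in for the pars_info_create()/que_eval_sql() pair: */
+#include <string>
+#include <map>
+
+using Params = std::map<std::string, std::string>;
+
+static int exec_dict_sql(const std::string& sql, const Params& params);
+
+/* Both dictionary tables are updated in one procedure, so the change is
+atomic within the data dictionary transaction. */
+static int drop_foreign_sketch(const std::string& foreign_id)
+{
+	static const char sql[] =
+		"PROCEDURE DROP_FOREIGN_PROC () IS\n"
+		"BEGIN\n"
+		"DELETE FROM SYS_FOREIGN WHERE ID=:id;\n"
+		"DELETE FROM SYS_FOREIGN_COLS WHERE ID=:id;\n"
+		"END;\n";
+	return exec_dict_sql(sql, {{"id", foreign_id}});
+}
+
+static int exec_dict_sql(const std::string&, const Params&) { return 0; }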
+@param[in] ctx ALTER TABLE context +@param[in,out] trx Data dictionary transaction +@param[in] table_name Table name in MySQL +@param[in] from old column name +@param[in] to new column name +@retval true Failure +@retval false Success */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +bool +innobase_rename_column_try( + const ha_innobase_inplace_ctx& ctx, + trx_t* trx, + const char* table_name, + const char* from, + const char* to) +{ + dberr_t error; + bool clust_has_wide_format = false; + + DBUG_ENTER("innobase_rename_column_try"); + + DBUG_ASSERT(trx->dict_operation); + ut_ad(trx->dict_operation_lock_mode); + ut_ad(dict_sys.locked()); + + if (ctx.need_rebuild()) { + goto rename_foreign; + } + + error = DB_SUCCESS; + + trx->op_info = "renaming column in SYS_FIELDS"; + + for (const dict_index_t* index = dict_table_get_first_index( + ctx.old_table); + index != NULL; + index = dict_table_get_next_index(index)) { + + bool wide_format = false; + for (size_t i = 0; i < dict_index_get_n_fields(index); i++) { + dict_field_t* field= dict_index_get_nth_field(index, i); + if (field->prefix_len || field->descending) { + wide_format = true; + break; + } + } + + for (ulint i = 0; i < dict_index_get_n_fields(index); i++) { + const dict_field_t& f = index->fields[i]; + DBUG_ASSERT(!f.name == f.col->is_dropped()); + + if (!f.name || my_strcasecmp(system_charset_info, + f.name, from)) { + continue; + } + + pars_info_t* info = pars_info_create(); + ulint pos = wide_format + ? i << 16 | f.prefix_len + | !!f.descending << 15 + : i; + pars_info_add_ull_literal(info, "indexid", index->id); + pars_info_add_int4_literal(info, "nth", pos); + pars_info_add_str_literal(info, "new", to); + + error = que_eval_sql( + info, + "PROCEDURE RENAME_SYS_FIELDS_PROC () IS\n" + "BEGIN\n" + "UPDATE SYS_FIELDS SET COL_NAME=:new\n" + "WHERE INDEX_ID=:indexid\n" + "AND POS=:nth;\n" + "END;\n", trx); + DBUG_EXECUTE_IF("ib_rename_column_error", + error = DB_OUT_OF_FILE_SPACE;); + + if (error != DB_SUCCESS) { + goto err_exit; + } + + if (!wide_format || !clust_has_wide_format + || f.prefix_len || f.descending) { + continue; + } + + /* For secondary indexes, the + wide_format check can be 'polluted' + by PRIMARY KEY column prefix or descending + field. Try also the simpler encoding + of SYS_FIELDS.POS. 
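+
+/* Illustrative sketch (not part of the upstream patch): as computed above,
+SYS_FIELDS.POS switches to a packed format whenever some field of the index
+has a column prefix or is descending: bits 16 and up hold the field
+ordinal, bit 15 the descending flag, bits 0..14 the prefix length.
+Encode/decode helpers for that layout: */
+#include <cstdint>
+#include <cassert>
+
+static uint32_t sys_fields_pos(uint32_t ordinal, uint32_t prefix_len,
+			       bool descending, bool wide_format)
+{
+	if (!wide_format)
+		return ordinal;	/* simple format: POS is just the ordinal */
+	assert(prefix_len < (1U << 15));
+	return ordinal << 16 | (descending ? 1U : 0U) << 15 | prefix_len;
+}
+
+static void decode_wide_pos(uint32_t pos, uint32_t& ordinal,
+			    uint32_t& prefix_len, bool& descending)
+{
+	ordinal    = pos >> 16;
+	descending = (pos >> 15) & 1;
+	prefix_len = pos & 0x7fff;
+}
+/* e.g. field #2, prefix 10, descending: 2<<16 | 1<<15 | 10 = 0x2800a */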
*/ + info = pars_info_create(); + + pars_info_add_ull_literal(info, "indexid", index->id); + pars_info_add_int4_literal(info, "nth", i); + pars_info_add_str_literal(info, "new", to); + + error = que_eval_sql( + info, + "PROCEDURE RENAME_SYS_FIELDS_PROC () IS\n" + "BEGIN\n" + "UPDATE SYS_FIELDS SET COL_NAME=:new\n" + "WHERE INDEX_ID=:indexid\n" + "AND POS=:nth;\n" + "END;\n", trx); + + if (error != DB_SUCCESS) { + goto err_exit; + } + } + + if (index == dict_table_get_first_index(ctx.old_table)) { + clust_has_wide_format = wide_format; + } + } + + if (error != DB_SUCCESS) { +err_exit: + my_error_innodb(error, table_name, 0); + trx->error_state = DB_SUCCESS; + trx->op_info = ""; + DBUG_RETURN(true); + } + +rename_foreign: + trx->op_info = "renaming column in SYS_FOREIGN_COLS"; + + std::set fk_evict; + bool foreign_modified; + + for (dict_foreign_set::const_iterator it = ctx.old_table->foreign_set.begin(); + it != ctx.old_table->foreign_set.end(); + ++it) { + + dict_foreign_t* foreign = *it; + foreign_modified = false; + + for (unsigned i = 0; i < foreign->n_fields; i++) { + if (my_strcasecmp(system_charset_info, + foreign->foreign_col_names[i], + from)) { + continue; + } + + /* Ignore the foreign key rename if fk info + is being dropped. */ + if (innobase_dropping_foreign( + foreign, ctx.drop_fk, + ctx.num_to_drop_fk)) { + continue; + } + + pars_info_t* info = pars_info_create(); + + pars_info_add_str_literal(info, "id", foreign->id); + pars_info_add_int4_literal(info, "nth", i); + pars_info_add_str_literal(info, "new", to); + + error = que_eval_sql( + info, + "PROCEDURE RENAME_SYS_FOREIGN_F_PROC () IS\n" + "BEGIN\n" + "UPDATE SYS_FOREIGN_COLS\n" + "SET FOR_COL_NAME=:new\n" + "WHERE ID=:id AND POS=:nth;\n" + "END;\n", trx); + + if (error != DB_SUCCESS) { + goto err_exit; + } + foreign_modified = true; + } + + if (foreign_modified) { + fk_evict.insert(foreign); + } + } + + for (dict_foreign_set::const_iterator it + = ctx.old_table->referenced_set.begin(); + it != ctx.old_table->referenced_set.end(); + ++it) { + + foreign_modified = false; + dict_foreign_t* foreign = *it; + + for (unsigned i = 0; i < foreign->n_fields; i++) { + if (my_strcasecmp(system_charset_info, + foreign->referenced_col_names[i], + from)) { + continue; + } + + pars_info_t* info = pars_info_create(); + + pars_info_add_str_literal(info, "id", foreign->id); + pars_info_add_int4_literal(info, "nth", i); + pars_info_add_str_literal(info, "new", to); + + error = que_eval_sql( + info, + "PROCEDURE RENAME_SYS_FOREIGN_R_PROC () IS\n" + "BEGIN\n" + "UPDATE SYS_FOREIGN_COLS\n" + "SET REF_COL_NAME=:new\n" + "WHERE ID=:id AND POS=:nth;\n" + "END;\n", trx); + + if (error != DB_SUCCESS) { + goto err_exit; + } + foreign_modified = true; + } + + if (foreign_modified) { + fk_evict.insert(foreign); + } + } + + /* Reload the foreign key info for instant table too. */ + if (ctx.need_rebuild() || ctx.is_instant()) { + std::for_each(fk_evict.begin(), fk_evict.end(), + dict_foreign_remove_from_cache); + } + + trx->op_info = ""; + DBUG_RETURN(false); +} + +/** Rename columns in the data dictionary tables. +@param ha_alter_info Data used during in-place alter. 
+@param ctx In-place ALTER TABLE context +@param table the TABLE +@param trx data dictionary transaction +@param table_name Table name in MySQL +@retval true Failure +@retval false Success */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +bool +innobase_rename_columns_try( +/*========================*/ + Alter_inplace_info* ha_alter_info, + ha_innobase_inplace_ctx*ctx, + const TABLE* table, + trx_t* trx, + const char* table_name) +{ + uint i = 0; + + DBUG_ASSERT(ctx->need_rebuild()); + DBUG_ASSERT(ha_alter_info->handler_flags + & ALTER_COLUMN_NAME); + + for (Field** fp = table->field; *fp; fp++, i++) { + if (!((*fp)->flags & FIELD_IS_RENAMED)) { + continue; + } + + for (const Create_field& cf : + ha_alter_info->alter_info->create_list) { + if (cf.field == *fp) { + if (innobase_rename_column_try( + *ctx, trx, table_name, + cf.field->field_name.str, + cf.field_name.str)) { + return(true); + } + goto processed_field; + } + } + + ut_error; +processed_field: + continue; + } + + return(false); +} + +/** Enlarge a column in the data dictionary tables. +@param ctx In-place ALTER TABLE context +@param trx data dictionary transaction +@param table_name Table name in MySQL +@param pos 0-based index to user_table->cols[] or user_table->v_cols[] +@param f new column +@param is_v if it's a virtual column +@retval true Failure +@retval false Success */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +bool +innobase_rename_or_enlarge_column_try( + ha_innobase_inplace_ctx*ctx, + trx_t* trx, + const char* table_name, + ulint pos, + const Field& f, + bool is_v) +{ + dict_col_t* col; + dict_table_t* user_table = ctx->old_table; + + DBUG_ENTER("innobase_rename_or_enlarge_column_try"); + DBUG_ASSERT(!ctx->need_rebuild()); + + DBUG_ASSERT(trx->dict_operation); + ut_ad(trx->dict_operation_lock_mode); + ut_ad(dict_sys.locked()); + + ulint n_base; + + if (is_v) { + dict_v_col_t* v_col= dict_table_get_nth_v_col(user_table, pos); + pos = dict_create_v_col_pos(v_col->v_pos, v_col->m_col.ind); + col = &v_col->m_col; + n_base = v_col->num_base; + } else { + col = dict_table_get_nth_col(user_table, pos); + n_base = 0; + } + + unsigned prtype; + uint8_t mtype; + uint16_t len; + get_type(f, prtype, mtype, len); + DBUG_ASSERT(!dtype_is_string_type(col->mtype) + || col->mbminlen == f.charset()->mbminlen); + DBUG_ASSERT(col->len <= len); + +#ifdef UNIV_DEBUG + ut_ad(col->mbminlen <= col->mbmaxlen); + switch (mtype) { + case DATA_MYSQL: + if (!(prtype & DATA_BINARY_TYPE) || user_table->not_redundant() + || col->mbminlen != col->mbmaxlen) { + /* NOTE: we could allow this when !(prtype & + DATA_BINARY_TYPE) and ROW_FORMAT is not REDUNDANT and + mbminlenlen == len); + break; + case DATA_BINARY: + case DATA_VARCHAR: + case DATA_VARMYSQL: + case DATA_DECIMAL: + case DATA_BLOB: + break; + default: + ut_ad(!((col->prtype ^ prtype) & ~DATA_VERSIONED)); + ut_ad(col->mtype == mtype); + ut_ad(col->len == len); + } +#endif /* UNIV_DEBUG */ + + const char* col_name = col->name(*user_table); + const bool same_name = !strcmp(col_name, f.field_name.str); + + if (!same_name + && innobase_rename_column_try(*ctx, trx, table_name, + col_name, f.field_name.str)) { + DBUG_RETURN(true); + } + + if (same_name + && col->prtype == prtype && col->mtype == mtype + && col->len == len) { + DBUG_RETURN(false); + } + + DBUG_RETURN(innodb_insert_sys_columns(user_table->id, pos, + f.field_name.str, + mtype, prtype, len, + n_base, trx, true)); +} + +/** Rename or enlarge columns in the data dictionary cache +as part of commit_try_norebuild(). 
+@param ha_alter_info Data used during in-place alter. +@param ctx In-place ALTER TABLE context +@param altered_table metadata after ALTER TABLE +@param table metadata before ALTER TABLE +@param trx data dictionary transaction +@param table_name Table name in MySQL +@retval true Failure +@retval false Success */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +bool +innobase_rename_or_enlarge_columns_try( + Alter_inplace_info* ha_alter_info, + ha_innobase_inplace_ctx*ctx, + const TABLE* altered_table, + const TABLE* table, + trx_t* trx, + const char* table_name) +{ + DBUG_ENTER("innobase_rename_or_enlarge_columns_try"); + + if (!(ha_alter_info->handler_flags + & (ALTER_COLUMN_TYPE_CHANGE_BY_ENGINE + | ALTER_COLUMN_NAME))) { + DBUG_RETURN(false); + } + + ulint i = 0; + ulint num_v = 0; + + for (Field** fp = table->field; *fp; fp++, i++) { + const bool is_v = !(*fp)->stored_in_db(); + ulint idx = is_v ? num_v++ : i - num_v; + + Field** af = altered_table->field; + for (const Create_field& cf : + ha_alter_info->alter_info->create_list) { + if (cf.field == *fp) { + if (innobase_rename_or_enlarge_column_try( + ctx, trx, table_name, + idx, **af, is_v)) { + DBUG_RETURN(true); + } + break; + } + af++; + } + } + + DBUG_RETURN(false); +} + +/** Rename or enlarge columns in the data dictionary cache +as part of commit_cache_norebuild(). +@param ha_alter_info Data used during in-place alter. +@param altered_table metadata after ALTER TABLE +@param table metadata before ALTER TABLE +@param user_table InnoDB table that was being altered */ +static MY_ATTRIBUTE((nonnull)) +void +innobase_rename_or_enlarge_columns_cache( +/*=====================================*/ + Alter_inplace_info* ha_alter_info, + const TABLE* altered_table, + const TABLE* table, + dict_table_t* user_table) +{ + if (!(ha_alter_info->handler_flags + & (ALTER_COLUMN_TYPE_CHANGE_BY_ENGINE + | ALTER_COLUMN_NAME))) { + return; + } + + uint i = 0; + ulint num_v = 0; + + for (Field** fp = table->field; *fp; fp++, i++) { + const bool is_virtual = !(*fp)->stored_in_db(); + + Field** af = altered_table->field; + for (Create_field& cf : + ha_alter_info->alter_info->create_list) { + if (cf.field != *fp) { + af++; + continue; + } + + ulint col_n = is_virtual ? num_v : i - num_v; + dict_col_t *col = is_virtual + ? &dict_table_get_nth_v_col(user_table, col_n) + ->m_col + : dict_table_get_nth_col(user_table, col_n); + const bool is_string= dtype_is_string_type(col->mtype); + DBUG_ASSERT(col->mbminlen + == (is_string + ? (*af)->charset()->mbminlen : 0)); + unsigned prtype; + uint8_t mtype; + uint16_t len; + get_type(**af, prtype, mtype, len); + DBUG_ASSERT(is_string == dtype_is_string_type(mtype)); + + col->prtype = prtype; + col->mtype = mtype; + col->len = len; + col->mbmaxlen = is_string + ? (*af)->charset()->mbmaxlen & 7: 0; + + if ((*fp)->flags & FIELD_IS_RENAMED) { + dict_mem_table_col_rename( + user_table, col_n, + cf.field->field_name.str, + (*af)->field_name.str, is_virtual); + } + + break; + } + + if (is_virtual) { + num_v++; + } + } +} + +/** Set the auto-increment value of the table on commit. 
+@param ha_alter_info Data used during in-place alter +@param ctx In-place ALTER TABLE context +@param altered_table MySQL table that is being altered +@param old_table MySQL table as it is before the ALTER operation +@return whether the operation failed (and my_error() was called) */ +static MY_ATTRIBUTE((nonnull)) +bool +commit_set_autoinc( + Alter_inplace_info* ha_alter_info, + ha_innobase_inplace_ctx*ctx, + const TABLE* altered_table, + const TABLE* old_table) +{ + DBUG_ENTER("commit_set_autoinc"); + + if (!altered_table->found_next_number_field) { + /* There is no AUTO_INCREMENT column in the table + after the ALTER operation. */ + } else if (ctx->add_autoinc != ULINT_UNDEFINED) { + ut_ad(ctx->need_rebuild()); + /* An AUTO_INCREMENT column was added. Get the last + value from the sequence, which may be based on a + supplied AUTO_INCREMENT value. */ + ib_uint64_t autoinc = ctx->sequence.last(); + ctx->new_table->autoinc = autoinc; + /* Bulk index creation does not update + PAGE_ROOT_AUTO_INC, so we must persist the "last used" + value here. */ + btr_write_autoinc(dict_table_get_first_index(ctx->new_table), + autoinc - 1, true); + } else if ((ha_alter_info->handler_flags + & ALTER_CHANGE_CREATE_OPTION) + && (ha_alter_info->create_info->used_fields + & HA_CREATE_USED_AUTO)) { + + if (!ctx->old_table->space) { + my_error(ER_TABLESPACE_DISCARDED, MYF(0), + old_table->s->table_name.str); + DBUG_RETURN(true); + } + + /* An AUTO_INCREMENT value was supplied by the user. + It must be persisted to the data file. */ + const Field* ai = old_table->found_next_number_field; + ut_ad(!strcmp(dict_table_get_col_name(ctx->old_table, + innodb_col_no(ai)), + ai->field_name.str)); + + ib_uint64_t autoinc + = ha_alter_info->create_info->auto_increment_value; + if (autoinc == 0) { + autoinc = 1; + } + + if (autoinc >= ctx->old_table->autoinc) { + /* Persist the predecessor of the + AUTO_INCREMENT value as the last used one. */ + ctx->new_table->autoinc = autoinc--; + } else { + /* Mimic ALGORITHM=COPY in the following scenario: + + CREATE TABLE t (a SERIAL); + INSERT INTO t SET a=100; + ALTER TABLE t AUTO_INCREMENT = 1; + INSERT INTO t SET a=NULL; + SELECT * FROM t; + + By default, ALGORITHM=INPLACE would reset the + sequence to 1, while after ALGORITHM=COPY, the + last INSERT would use a value larger than 100. + + We could only search the tree to know current + max counter in the table and compare. */ + const dict_col_t* autoinc_col + = dict_table_get_nth_col(ctx->old_table, + innodb_col_no(ai)); + dict_index_t* index + = dict_table_get_first_index(ctx->old_table); + while (index != NULL + && index->fields[0].col != autoinc_col) { + index = dict_table_get_next_index(index); + } + + ut_ad(index); + + ib_uint64_t max_in_table = index + ? row_search_max_autoinc(index) + : 0; + + if (autoinc <= max_in_table) { + ctx->new_table->autoinc = innobase_next_autoinc( + max_in_table, 1, + ctx->prebuilt->autoinc_increment, + ctx->prebuilt->autoinc_offset, + innobase_get_int_col_max_value(ai)); + /* Persist the maximum value as the + last used one. */ + autoinc = max_in_table; + } else { + /* Persist the predecessor of the + AUTO_INCREMENT value as the last used one. */ + ctx->new_table->autoinc = autoinc--; + } + } + + btr_write_autoinc(dict_table_get_first_index(ctx->new_table), + autoinc, true); + } else if (ctx->need_rebuild()) { + /* No AUTO_INCREMENT value was specified. + Copy it from the old table. 
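+
+/* Illustrative sketch (not part of the upstream patch): commit_set_autoinc()
+above persists the predecessor of the next AUTO_INCREMENT value in
+PAGE_ROOT_AUTO_INC and clamps a user-supplied value against the current
+maximum in the table, so ALGORITHM=INPLACE matches ALGORITHM=COPY. A
+condensed model (the real code steps with innobase_next_autoinc() using
+the session's auto_increment_increment/offset; here we step by 1): */
+#include <cstdint>
+
+struct AutoincDecision {
+	uint64_t next;		/* new in-memory dict_table_t::autoinc */
+	uint64_t persisted;	/* value written to PAGE_ROOT_AUTO_INC */
+};
+
+static AutoincDecision set_autoinc(uint64_t requested, uint64_t cur_next,
+				   uint64_t max_in_table)
+{
+	if (requested == 0)
+		requested = 1;		/* AUTO_INCREMENT counts from 1 */
+	if (requested >= cur_next)
+		return {requested, requested - 1};
+	if (requested <= max_in_table)	/* cannot go below existing rows */
+		return {max_in_table + 1, max_in_table};
+	return {requested, requested - 1};
+}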
*/ + ctx->new_table->autoinc = ctx->old_table->autoinc; + /* The persistent value was already copied in + prepare_inplace_alter_table_dict() when ctx->new_table + was created. If this was a LOCK=NONE operation, the + AUTO_INCREMENT values would be updated during + row_log_table_apply(). If this was LOCK!=NONE, + the table contents could not possibly have changed + between prepare_inplace and commit_inplace. */ + } + + DBUG_RETURN(false); +} + +/** Add or drop foreign key constraints to the data dictionary tables, +but do not touch the data dictionary cache. +@param ha_alter_info Data used during in-place alter +@param ctx In-place ALTER TABLE context +@param trx Data dictionary transaction +@param table_name Table name in MySQL +@retval true Failure +@retval false Success +*/ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +bool +innobase_update_foreign_try( +/*========================*/ + ha_innobase_inplace_ctx*ctx, + trx_t* trx, + const char* table_name) +{ + ulint foreign_id; + ulint i; + + DBUG_ENTER("innobase_update_foreign_try"); + + foreign_id = dict_table_get_highest_foreign_id(ctx->new_table); + + foreign_id++; + + for (i = 0; i < ctx->num_to_add_fk; i++) { + dict_foreign_t* fk = ctx->add_fk[i]; + + ut_ad(fk->foreign_table == ctx->new_table + || fk->foreign_table == ctx->old_table); + + dberr_t error = dict_create_add_foreign_id( + &foreign_id, ctx->old_table->name.m_name, fk); + + if (error != DB_SUCCESS) { + my_error(ER_TOO_LONG_IDENT, MYF(0), + fk->id); + DBUG_RETURN(true); + } + + if (!fk->foreign_index) { + fk->foreign_index = dict_foreign_find_index( + ctx->new_table, ctx->col_names, + fk->foreign_col_names, + fk->n_fields, fk->referenced_index, TRUE, + fk->type + & (DICT_FOREIGN_ON_DELETE_SET_NULL + | DICT_FOREIGN_ON_UPDATE_SET_NULL), + NULL, NULL, NULL); + if (!fk->foreign_index) { + my_error(ER_FK_INCORRECT_OPTION, + MYF(0), table_name, fk->id); + DBUG_RETURN(true); + } + } + + /* The fk->foreign_col_names[] uses renamed column + names, while the columns in ctx->old_table have not + been renamed yet. */ + error = dict_create_add_foreign_to_dictionary( + ctx->old_table->name.m_name, fk, trx); + + DBUG_EXECUTE_IF( + "innodb_test_cannot_add_fk_system", + error = DB_ERROR;); + + if (error != DB_SUCCESS) { + my_error(ER_FK_FAIL_ADD_SYSTEM, MYF(0), + fk->id); + DBUG_RETURN(true); + } + } + + for (i = 0; i < ctx->num_to_drop_fk; i++) { + dict_foreign_t* fk = ctx->drop_fk[i]; + + DBUG_ASSERT(fk->foreign_table == ctx->old_table); + + if (innobase_drop_foreign_try(trx, table_name, fk->id)) { + DBUG_RETURN(true); + } + } + + DBUG_RETURN(false); +} + +/** Update the foreign key constraint definitions in the data dictionary cache +after the changes to data dictionary tables were committed. +@param ctx In-place ALTER TABLE context +@param user_thd MySQL connection +@return InnoDB error code (should always be DB_SUCCESS) */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +dberr_t +innobase_update_foreign_cache( +/*==========================*/ + ha_innobase_inplace_ctx* ctx, + THD* user_thd) +{ + dict_table_t* user_table; + dberr_t err = DB_SUCCESS; + + DBUG_ENTER("innobase_update_foreign_cache"); + + ut_ad(dict_sys.locked()); + + user_table = ctx->old_table; + + /* Discard the added foreign keys, because we will + load them from the data dictionary. */ + for (ulint i = 0; i < ctx->num_to_add_fk; i++) { + dict_foreign_t* fk = ctx->add_fk[i]; + dict_foreign_free(fk); + } + + if (ctx->need_rebuild()) { + /* The rebuilt table is already using the renamed + column names. 
No need to pass col_names or to drop + constraints from the data dictionary cache. */ + DBUG_ASSERT(!ctx->col_names); + user_table = ctx->new_table; + } else { + /* Drop the foreign key constraints if the + table was not rebuilt. If the table is rebuilt, + there would not be any foreign key contraints for + it yet in the data dictionary cache. */ + for (ulint i = 0; i < ctx->num_to_drop_fk; i++) { + dict_foreign_t* fk = ctx->drop_fk[i]; + dict_foreign_remove_from_cache(fk); + } + } + + /* Load the old or added foreign keys from the data dictionary + and prevent the table from being evicted from the data + dictionary cache (work around the lack of WL#6049). */ + dict_names_t fk_tables; + + err = dict_load_foreigns(user_table->name.m_name, + ctx->col_names, 1, true, + DICT_ERR_IGNORE_FK_NOKEY, + fk_tables); + + if (err == DB_CANNOT_ADD_CONSTRAINT) { + fk_tables.clear(); + + /* It is possible there are existing foreign key are + loaded with "foreign_key checks" off, + so let's retry the loading with charset_check is off */ + err = dict_load_foreigns(user_table->name.m_name, + ctx->col_names, 1, false, + DICT_ERR_IGNORE_NONE, + fk_tables); + + /* The load with "charset_check" off is successful, warn + the user that the foreign key has loaded with mis-matched + charset */ + if (err == DB_SUCCESS) { + push_warning_printf( + user_thd, + Sql_condition::WARN_LEVEL_WARN, + ER_ALTER_INFO, + "Foreign key constraints for table '%s'" + " are loaded with charset check off", + user_table->name.m_name); + } + } + + /* For complete loading of foreign keys, all associated tables must + also be loaded. */ + while (err == DB_SUCCESS && !fk_tables.empty()) { + const char *f = fk_tables.front(); + if (!dict_sys.load_table({f, strlen(f)})) { + err = DB_TABLE_NOT_FOUND; + ib::error() + << "Failed to load table " + << table_name_t(const_cast(f)) + << " which has a foreign key constraint with" + << user_table->name; + break; + } + + fk_tables.pop_front(); + } + + DBUG_RETURN(err); +} + +/** Changes SYS_COLUMNS.PRTYPE for one column. +@param[in,out] trx transaction +@param[in] table_name table name +@param[in] tableid table ID as in SYS_TABLES +@param[in] pos column position +@param[in] prtype new precise type +@return boolean flag +@retval true on failure +@retval false on success */ +static +bool +vers_change_field_try( + trx_t* trx, + const char* table_name, + const table_id_t tableid, + const ulint pos, + const ulint prtype) +{ + DBUG_ENTER("vers_change_field_try"); + + pars_info_t* info = pars_info_create(); + + pars_info_add_int4_literal(info, "prtype", prtype); + pars_info_add_ull_literal(info,"tableid", tableid); + pars_info_add_int4_literal(info, "pos", pos); + + dberr_t error = que_eval_sql(info, + "PROCEDURE CHANGE_COLUMN_MTYPE () IS\n" + "BEGIN\n" + "UPDATE SYS_COLUMNS SET PRTYPE=:prtype\n" + "WHERE TABLE_ID=:tableid AND POS=:pos;\n" + "END;\n", trx); + + if (error != DB_SUCCESS) { + my_error_innodb(error, table_name, 0); + trx->error_state = DB_SUCCESS; + trx->op_info = ""; + DBUG_RETURN(true); + } + + DBUG_RETURN(false); +} + +/** Changes fields WITH/WITHOUT SYSTEM VERSIONING property in SYS_COLUMNS. 
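+
+/* Illustrative sketch (not part of the upstream patch): above, the foreign
+keys are first reloaded with the strict charset check; on
+DB_CANNOT_ADD_CONSTRAINT the load is retried with the check disabled and a
+warning is pushed, so constraints created under relaxed settings still come
+back. The retry shape; load_foreigns() is a hypothetical stand-in: */
+#include <cstdio>
+
+enum class DictErr { SUCCESS, CANNOT_ADD_CONSTRAINT };
+
+static DictErr load_foreigns(bool strict_charset_check);
+
+static DictErr reload_foreign_keys(const char* table_name)
+{
+	DictErr err = load_foreigns(true);
+	if (err == DictErr::CANNOT_ADD_CONSTRAINT) {
+		err = load_foreigns(false);	/* relaxed retry */
+		if (err == DictErr::SUCCESS)
+			std::fprintf(stderr, "Warning: foreign keys for '%s'"
+				     " loaded with charset check off\n",
+				     table_name);
+	}
+	return err;
+}
+
+static DictErr load_foreigns(bool) { return DictErr::SUCCESS; }	/* stub */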
+@param[in] ha_alter_info alter info +@param[in] ctx alter inplace context +@param[in] trx transaction +@param[in] table old table +@return boolean flag +@retval true on failure +@retval false on success */ +static +bool +vers_change_fields_try( + const Alter_inplace_info* ha_alter_info, + const ha_innobase_inplace_ctx* ctx, + trx_t* trx, + const TABLE* table) +{ + DBUG_ENTER("vers_change_fields_try"); + + DBUG_ASSERT(ha_alter_info); + DBUG_ASSERT(ctx); + + for (const Create_field& create_field : ha_alter_info->alter_info->create_list) { + if (!create_field.field) { + continue; + } + if (create_field.versioning + == Column_definition::VERSIONING_NOT_SET) { + continue; + } + + const dict_table_t* new_table = ctx->new_table; + const uint pos = innodb_col_no(create_field.field); + const dict_col_t* col = dict_table_get_nth_col(new_table, pos); + + DBUG_ASSERT(!col->vers_sys_start()); + DBUG_ASSERT(!col->vers_sys_end()); + + ulint new_prtype + = create_field.versioning + == Column_definition::WITHOUT_VERSIONING + ? col->prtype & ~DATA_VERSIONED + : col->prtype | DATA_VERSIONED; + + if (vers_change_field_try(trx, table->s->table_name.str, + new_table->id, pos, + new_prtype)) { + DBUG_RETURN(true); + } + } + + DBUG_RETURN(false); +} + +/** Changes WITH/WITHOUT SYSTEM VERSIONING for fields +in the data dictionary cache. +@param ha_alter_info Data used during in-place alter +@param ctx In-place ALTER TABLE context +@param table MySQL table as it is before the ALTER operation */ +static +void +vers_change_fields_cache( + Alter_inplace_info* ha_alter_info, + const ha_innobase_inplace_ctx* ctx, + const TABLE* table) +{ + DBUG_ENTER("vers_change_fields_cache"); + + DBUG_ASSERT(ha_alter_info); + DBUG_ASSERT(ctx); + DBUG_ASSERT(ha_alter_info->handler_flags & ALTER_COLUMN_UNVERSIONED); + + for (const Create_field& create_field : + ha_alter_info->alter_info->create_list) { + if (!create_field.field || create_field.field->vcol_info) { + continue; + } + dict_col_t* col = dict_table_get_nth_col( + ctx->new_table, innodb_col_no(create_field.field)); + + if (create_field.versioning + == Column_definition::WITHOUT_VERSIONING) { + + DBUG_ASSERT(!col->vers_sys_start()); + DBUG_ASSERT(!col->vers_sys_end()); + col->prtype &= ~DATA_VERSIONED; + } else if (create_field.versioning + == Column_definition::WITH_VERSIONING) { + + DBUG_ASSERT(!col->vers_sys_start()); + DBUG_ASSERT(!col->vers_sys_end()); + col->prtype |= DATA_VERSIONED; + } + } + + DBUG_VOID_RETURN; +} + +/** Commit the changes made during prepare_inplace_alter_table() +and inplace_alter_table() inside the data dictionary tables, +when rebuilding the table. 
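+
+/* Illustrative sketch (not part of the upstream patch): the versioning
+switch above reduces to setting or clearing a single bit in the column's
+precise type; DATA_VERSIONED_STUB is a hypothetical stand-in for the real
+DATA_VERSIONED flag value: */
+#include <cstdint>
+
+constexpr uint32_t DATA_VERSIONED_STUB = 1U << 23;	/* assumed bit */
+
+static uint32_t apply_versioning(uint32_t prtype, bool with_versioning)
+{
+	return with_versioning
+		? prtype | DATA_VERSIONED_STUB	/* WITH SYSTEM VERSIONING */
+		: prtype & ~DATA_VERSIONED_STUB;	/* WITHOUT */
+}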
+@param ha_alter_info Data used during in-place alter +@param ctx In-place ALTER TABLE context +@param altered_table MySQL table that is being altered +@param old_table MySQL table as it is before the ALTER operation +@param statistics_exist whether the persistent statistics tables exist +@param trx Data dictionary transaction +@param table_name Table name in MySQL +@retval true Failure +@retval false Success +*/ +inline MY_ATTRIBUTE((nonnull, warn_unused_result)) +bool +commit_try_rebuild( +/*===============*/ + Alter_inplace_info* ha_alter_info, + ha_innobase_inplace_ctx*ctx, + TABLE* altered_table, + const TABLE* old_table, + bool statistics_exist, + trx_t* trx, + const char* table_name) +{ + dict_table_t* rebuilt_table = ctx->new_table; + dict_table_t* user_table = ctx->old_table; + + DBUG_ENTER("commit_try_rebuild"); + DBUG_ASSERT(ctx->need_rebuild()); + DBUG_ASSERT(trx->dict_operation_lock_mode); + DBUG_ASSERT(!(ha_alter_info->handler_flags + & ALTER_DROP_FOREIGN_KEY) + || ctx->num_to_drop_fk > 0); + DBUG_ASSERT(ctx->num_to_drop_fk + <= ha_alter_info->alter_info->drop_list.elements); + + innobase_online_rebuild_log_free(user_table); + + for (dict_index_t* index = dict_table_get_first_index(rebuilt_table); + index; + index = dict_table_get_next_index(index)) { + DBUG_ASSERT(dict_index_get_online_status(index) + == ONLINE_INDEX_COMPLETE); + DBUG_ASSERT(index->is_committed()); + if (index->is_corrupted()) { + my_error(ER_INDEX_CORRUPT, MYF(0), index->name()); + DBUG_RETURN(true); + } + } + + if (innobase_update_foreign_try(ctx, trx, table_name)) { + DBUG_RETURN(true); + } + + /* Clear the to_be_dropped flag in the data dictionary cache + of user_table. */ + for (ulint i = 0; i < ctx->num_to_drop_index; i++) { + dict_index_t* index = ctx->drop_index[i]; + DBUG_ASSERT(index->table == user_table); + DBUG_ASSERT(index->is_committed()); + DBUG_ASSERT(index->to_be_dropped); + index->to_be_dropped = 0; + } + + if ((ha_alter_info->handler_flags + & ALTER_COLUMN_NAME) + && innobase_rename_columns_try(ha_alter_info, ctx, old_table, + trx, table_name)) { + DBUG_RETURN(true); + } + + /* The rebuilt table must inherit the tablespace-discarded + state from the "parent" table. */ + if (!user_table->space) { + rebuilt_table->file_unreadable = true; + rebuilt_table->flags2 |= DICT_TF2_DISCARDED; + } + + /* We can now rename the old table as a temporary table, + rename the new temporary table as the old table and drop the + old table. */ + char* old_name= mem_heap_strdup(ctx->heap, user_table->name.m_name); + + dberr_t error = row_rename_table_for_mysql(user_table->name.m_name, + ctx->tmp_name, trx, false); + if (error == DB_SUCCESS) { + error = row_rename_table_for_mysql( + rebuilt_table->name.m_name, old_name, trx, false); + if (error == DB_SUCCESS) { + /* The statistics for the surviving indexes will be + re-inserted in alter_stats_rebuild(). */ + if (statistics_exist) { + error = trx->drop_table_statistics(old_name); + } + if (error == DB_SUCCESS) { + error = trx->drop_table(*user_table); + } + } + } + + /* We must still be holding a table handle.
*/ + DBUG_ASSERT(user_table->get_ref_count() == 1); + DBUG_EXECUTE_IF("ib_rebuild_cannot_rename", error = DB_ERROR;); + + switch (error) { + case DB_SUCCESS: + DBUG_RETURN(false); + case DB_TABLESPACE_EXISTS: + ut_a(rebuilt_table->get_ref_count() == 1); + my_error(ER_TABLESPACE_EXISTS, MYF(0), ctx->tmp_name); + DBUG_RETURN(true); + case DB_DUPLICATE_KEY: + ut_a(rebuilt_table->get_ref_count() == 1); + my_error(ER_TABLE_EXISTS_ERROR, MYF(0), ctx->tmp_name); + DBUG_RETURN(true); + default: + my_error_innodb(error, table_name, user_table->flags); + DBUG_RETURN(true); + } +} + +/** Rename indexes in dictionary. +@param[in] ctx alter info context +@param[in] ha_alter_info Operation used during inplace alter +@param[out] trx transaction to change the index name + in dictionary +@retval true if renaming failed +@retval false on success */ +static +bool +rename_indexes_try( + const ha_innobase_inplace_ctx* ctx, + const Alter_inplace_info* ha_alter_info, + trx_t* trx) +{ + DBUG_ASSERT(ha_alter_info->handler_flags & ALTER_RENAME_INDEX); + + for (const Alter_inplace_info::Rename_key_pair& pair : + ha_alter_info->rename_keys) { + dict_index_t* index = dict_table_get_index_on_name( + ctx->old_table, pair.old_key->name.str); + // This was checked previously in + // ha_innobase::prepare_inplace_alter_table() + ut_ad(index); + + if (rename_index_try(index, pair.new_key->name.str, trx)) { + return true; + } + } + + return false; +} + +/** Set of column numbers */ +typedef std::set<ulint, std::less<ulint>, ut_allocator<ulint> > col_set; + +/** Collect (not instantly dropped) columns from dropped indexes +@param[in] ctx In-place ALTER TABLE context +@param[in, out] drop_col_list list to be filled with the columns + that are part of the indexes being dropped +@param[in, out] drop_v_col_list list to be filled with the virtual + columns that are part of the indexes + being dropped */ +static +void +collect_columns_from_dropped_indexes( + const ha_innobase_inplace_ctx* ctx, + col_set& drop_col_list, + col_set& drop_v_col_list) +{ + for (ulint index_count = 0; index_count < ctx->num_to_drop_index; + index_count++) { + const dict_index_t* index = ctx->drop_index[index_count]; + + for (ulint col = 0; col < index->n_user_defined_cols; col++) { + const dict_col_t* idx_col + = dict_index_get_nth_col(index, col); + + if (idx_col->is_virtual()) { + const dict_v_col_t* v_col + = reinterpret_cast< + const dict_v_col_t*>(idx_col); + drop_v_col_list.insert(v_col->v_pos); + + } else { + ulint col_no = dict_col_get_no(idx_col); + if (ctx->col_map + && ctx->col_map[col_no] + == ULINT_UNDEFINED) { + // this column was instantly dropped + continue; + } + drop_col_list.insert(col_no); + } + } + } +} + +/** Change PAGE_COMPRESSED to ON or change the PAGE_COMPRESSION_LEVEL.
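+The new table flags are computed by clearing the old compression level and +setting the PAGE_COMPRESSED bit together with the new level. A sketch with +hypothetical bit positions POS_FLAG and POS_LEVEL (the real positions are +DICT_TF_POS_PAGE_COMPRESSION and DICT_TF_POS_PAGE_COMPRESSION_LEVEL): + unsigned f = old_flags & ~(0xFU << POS_LEVEL); /* clear the old level */ + f |= 1U << POS_FLAG /* PAGE_COMPRESSED=ON */ + | level << POS_LEVEL; /* new level, 1..9 */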
+@param[in] level PAGE_COMPRESSION_LEVEL +@param[in] table table before the change +@param[in,out] trx data dictionary transaction +@param[in] table_name table name in MariaDB +@return whether the operation succeeded */ +MY_ATTRIBUTE((nonnull, warn_unused_result)) +static +bool +innobase_page_compression_try( + uint level, + const dict_table_t* table, + trx_t* trx, + const char* table_name) +{ + DBUG_ENTER("innobase_page_compression_try"); + DBUG_ASSERT(level >= 1); + DBUG_ASSERT(level <= 9); + + unsigned flags = table->flags + & ~(0xFU << DICT_TF_POS_PAGE_COMPRESSION_LEVEL); + flags |= 1U << DICT_TF_POS_PAGE_COMPRESSION + | level << DICT_TF_POS_PAGE_COMPRESSION_LEVEL; + + if (table->flags == flags) { + DBUG_RETURN(false); + } + + pars_info_t* info = pars_info_create(); + + pars_info_add_ull_literal(info, "id", table->id); + pars_info_add_int4_literal(info, "type", + dict_tf_to_sys_tables_type(flags)); + + dberr_t error = que_eval_sql(info, + "PROCEDURE CHANGE_COMPRESSION () IS\n" + "BEGIN\n" + "UPDATE SYS_TABLES SET TYPE=:type\n" + "WHERE ID=:id;\n" + "END;\n", trx); + + if (error != DB_SUCCESS) { + my_error_innodb(error, table_name, 0); + trx->error_state = DB_SUCCESS; + trx->op_info = ""; + DBUG_RETURN(true); + } + + DBUG_RETURN(false); +} + +/** Evict the table from cache and reopen it. Drop outdated statistics. +@param thd MariaDB THD +@param table innodb table +@param table_name user-friendly table name for errors +@param ctx ALTER TABLE context +@return newly opened table */ +static dict_table_t *innobase_reload_table(THD *thd, dict_table_t *table, + const LEX_CSTRING &table_name, + ha_innobase_inplace_ctx &ctx) +{ + if (ctx.is_instant()) + { + for (auto i= ctx.old_n_v_cols; i--; ) + { + ctx.old_v_cols[i].~dict_v_col_t(); + const_cast<unsigned&>(ctx.old_n_v_cols)= 0; + } + } + + const table_id_t id= table->id; + table->release(); + dict_sys.remove(table); + return dict_table_open_on_id(id, true, DICT_TABLE_OP_NORMAL); +} + +/** Commit the changes made during prepare_inplace_alter_table() +and inplace_alter_table() inside the data dictionary tables, +when not rebuilding the table.
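+In outline, the function below (1) updates the PAGE_COMPRESSION flags in +SYS_TABLES if requested, (2) fails if any added index was flagged corrupted, +(3) updates FOREIGN KEY and system-versioning metadata, (4) renames the added +indexes into place and deletes the SYS_FIELDS/SYS_INDEXES rows (and index +statistics) of dropped indexes, (5) renames index statistics for renamed +indexes, and (6) applies column renames or enlargements, instant ALTER TABLE +changes, and virtual column additions or drops.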
+@param ha_alter_info Data used during in-place alter +@param ctx In-place ALTER TABLE context +@param old_table MySQL table as it is before the ALTER operation +@param trx Data dictionary transaction +@param table_name Table name in MySQL +@retval true Failure +@retval false Success +*/ +inline MY_ATTRIBUTE((nonnull, warn_unused_result)) +bool +commit_try_norebuild( +/*=================*/ + Alter_inplace_info* ha_alter_info, + ha_innobase_inplace_ctx*ctx, + TABLE* altered_table, + const TABLE* old_table, + trx_t* trx, + const char* table_name) +{ + DBUG_ENTER("commit_try_norebuild"); + DBUG_ASSERT(!ctx->need_rebuild()); + DBUG_ASSERT(trx->dict_operation_lock_mode); + DBUG_ASSERT(!(ha_alter_info->handler_flags + & ALTER_DROP_FOREIGN_KEY) + || ctx->num_to_drop_fk > 0); + DBUG_ASSERT(ctx->num_to_drop_fk + <= ha_alter_info->alter_info->drop_list.elements + || ctx->num_to_drop_vcol + == ha_alter_info->alter_info->drop_list.elements); + + if (ctx->page_compression_level + && innobase_page_compression_try(ctx->page_compression_level, + ctx->new_table, trx, + table_name)) { + DBUG_RETURN(true); + } + + for (ulint i = 0; i < ctx->num_to_add_index; i++) { + dict_index_t* index = ctx->add_index[i]; + DBUG_ASSERT(dict_index_get_online_status(index) + == ONLINE_INDEX_COMPLETE); + DBUG_ASSERT(!index->is_committed()); + if (index->is_corrupted()) { + /* Report a duplicate key + error for the index that was + flagged corrupted, most likely + because a duplicate value was + inserted (directly or by + rollback) after + ha_innobase::inplace_alter_table() + completed. + TODO: report this as a corruption + with a detailed reason once + WL#6379 has been implemented. */ + my_error(ER_DUP_UNKNOWN_IN_INDEX, + MYF(0), index->name()); + DBUG_RETURN(true); + } + } + + if (innobase_update_foreign_try(ctx, trx, table_name)) { + DBUG_RETURN(true); + } + + if ((ha_alter_info->handler_flags & ALTER_COLUMN_UNVERSIONED) + && vers_change_fields_try(ha_alter_info, ctx, trx, old_table)) { + DBUG_RETURN(true); + } + + dberr_t error = DB_SUCCESS; + dict_index_t* index; + const char *op = "rename index to add"; + ulint num_fts_index = 0; + + /* We altered the table in place. Mark the indexes as committed. 
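+Note on the statistics renames further below: rows in +mysql.innodb_index_stats are renamed in two passes through temporary names +whose first byte is 0xff (which cannot collide with real index names), so +that swapping the names of two indexes never hits a duplicate key; roughly: + pass 1: old_name[i] -> "\xff<i>" for each renamed index i + pass 2: "\xff<i>" -> new_name[i]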
*/ + for (ulint i = 0; i < ctx->num_to_add_index; i++) { + index = ctx->add_index[i]; + DBUG_ASSERT(dict_index_get_online_status(index) + == ONLINE_INDEX_COMPLETE); + DBUG_ASSERT(!index->is_committed()); + error = row_merge_rename_index_to_add( + trx, ctx->new_table->id, index->id); + if (error) { + goto handle_error; + } + } + + for (dict_index_t *index = UT_LIST_GET_FIRST(ctx->old_table->indexes); + index; index = UT_LIST_GET_NEXT(indexes, index)) { + if (index->type & DICT_FTS) { + num_fts_index++; + } + } + + char db[MAX_DB_UTF8_LEN], table[MAX_TABLE_UTF8_LEN]; + if (ctx->num_to_drop_index) { + dict_fs2utf8(ctx->old_table->name.m_name, + db, sizeof db, table, sizeof table); + } + + for (ulint i = 0; i < ctx->num_to_drop_index; i++) { + index = ctx->drop_index[i]; + DBUG_ASSERT(index->is_committed()); + DBUG_ASSERT(index->table == ctx->new_table); + DBUG_ASSERT(index->to_be_dropped); + op = "DROP INDEX"; + + static const char drop_index[] = + "PROCEDURE DROP_INDEX_PROC () IS\n" + "BEGIN\n" + "DELETE FROM SYS_FIELDS WHERE INDEX_ID=:indexid;\n" + "DELETE FROM SYS_INDEXES WHERE ID=:indexid;\n" + "END;\n"; + + pars_info_t* info = pars_info_create(); + pars_info_add_ull_literal(info, "indexid", index->id); + error = que_eval_sql(info, drop_index, trx); + + if (error == DB_SUCCESS && index->type & DICT_FTS) { + DBUG_ASSERT(index->table->fts); + DEBUG_SYNC_C("norebuild_fts_drop"); + error = fts_drop_index(index->table, index, trx); + ut_ad(num_fts_index); + num_fts_index--; + } + + if (error != DB_SUCCESS) { + goto handle_error; + } + + error = dict_stats_delete_from_index_stats(db, table, + index->name, trx); + switch (error) { + case DB_SUCCESS: + case DB_STATS_DO_NOT_EXIST: + continue; + default: + goto handle_error; + } + } + + if (const size_t size = ha_alter_info->rename_keys.size()) { + char tmp_name[5]; + char db[MAX_DB_UTF8_LEN], table[MAX_TABLE_UTF8_LEN]; + + dict_fs2utf8(ctx->new_table->name.m_name, db, sizeof db, + table, sizeof table); + tmp_name[0]= (char)0xff; + for (size_t i = 0; error == DB_SUCCESS && i < size; i++) { + snprintf(tmp_name+1, sizeof(tmp_name)-1, "%zu", i); + error = dict_stats_rename_index(db, table, + ha_alter_info-> + rename_keys[i]. + old_key->name.str, + tmp_name, trx); + } + for (size_t i = 0; error == DB_SUCCESS && i < size; i++) { + snprintf(tmp_name+1, sizeof(tmp_name)-1, "%zu", i); + error = dict_stats_rename_index(db, table, tmp_name, + ha_alter_info + ->rename_keys[i]. 
+ new_key->name.str, + trx); + } + + switch (error) { + case DB_SUCCESS: + case DB_STATS_DO_NOT_EXIST: + break; + case DB_DUPLICATE_KEY: + my_error(ER_DUP_KEY, MYF(0), + "mysql.innodb_index_stats"); + DBUG_RETURN(true); + default: + goto handle_error; + } + } + + if ((ctx->old_table->flags2 & DICT_TF2_FTS) && !num_fts_index) { + error = fts_drop_tables(trx, *ctx->old_table); + if (error != DB_SUCCESS) { +handle_error: + switch (error) { + case DB_TOO_MANY_CONCURRENT_TRXS: + my_error(ER_TOO_MANY_CONCURRENT_TRXS, MYF(0)); + break; + case DB_LOCK_WAIT_TIMEOUT: + my_error(ER_LOCK_WAIT_TIMEOUT, MYF(0)); + break; + default: + sql_print_error("InnoDB: %s: %s\n", op, + ut_strerr(error)); + DBUG_ASSERT(error == DB_IO_ERROR + || error == DB_LOCK_TABLE_FULL + || error == DB_DECRYPTION_FAILED + || error == DB_PAGE_CORRUPTED + || error == DB_CORRUPTION); + my_error(ER_INTERNAL_ERROR, MYF(0), op); + } + + DBUG_RETURN(true); + } + } + + if (innobase_rename_or_enlarge_columns_try(ha_alter_info, ctx, + altered_table, old_table, + trx, table_name)) { + DBUG_RETURN(true); + } + + if ((ha_alter_info->handler_flags & ALTER_RENAME_INDEX) + && rename_indexes_try(ctx, ha_alter_info, trx)) { + DBUG_RETURN(true); + } + + if (ctx->is_instant()) { + DBUG_RETURN(innobase_instant_try(ha_alter_info, ctx, + altered_table, old_table, + trx)); + } + + if (ha_alter_info->handler_flags + & (ALTER_DROP_VIRTUAL_COLUMN | ALTER_ADD_VIRTUAL_COLUMN)) { + if ((ha_alter_info->handler_flags & ALTER_DROP_VIRTUAL_COLUMN) + && innobase_drop_virtual_try(ha_alter_info, ctx->old_table, + trx)) { + DBUG_RETURN(true); + } + + if ((ha_alter_info->handler_flags & ALTER_ADD_VIRTUAL_COLUMN) + && innobase_add_virtual_try(ha_alter_info, ctx->old_table, + trx)) { + DBUG_RETURN(true); + } + + unsigned n_col = ctx->old_table->n_cols + - DATA_N_SYS_COLS; + unsigned n_v_col = ctx->old_table->n_v_cols + + ctx->num_to_add_vcol - ctx->num_to_drop_vcol; + + if (innodb_update_cols( + ctx->old_table, + dict_table_encode_n_col(n_col, n_v_col) + | unsigned(ctx->old_table->flags & DICT_TF_COMPACT) + << 31, trx)) { + DBUG_RETURN(true); + } + } + + DBUG_RETURN(false); +} + +/** Commit the changes to the data dictionary cache +after a successful commit_try_norebuild() call. 
+@param ha_alter_info algorithm=inplace context +@param ctx In-place ALTER TABLE context for the current partition +@param altered_table the TABLE after the ALTER +@param table the TABLE before the ALTER +@param trx Data dictionary transaction +(will be started and committed, for DROP INDEX) +@return whether all replacements were found for dropped indexes */ +inline MY_ATTRIBUTE((nonnull)) +bool +commit_cache_norebuild( +/*===================*/ + Alter_inplace_info* ha_alter_info, + ha_innobase_inplace_ctx*ctx, + const TABLE* altered_table, + const TABLE* table, + trx_t* trx) +{ + DBUG_ENTER("commit_cache_norebuild"); + DBUG_ASSERT(!ctx->need_rebuild()); + DBUG_ASSERT(ctx->new_table->space != fil_system.temp_space); + DBUG_ASSERT(!ctx->new_table->is_temporary()); + + bool found = true; + + if (ctx->page_compression_level) { + DBUG_ASSERT(ctx->new_table->space != fil_system.sys_space); +#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6 +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wconversion" /* GCC 4 and 5 need this here */ +#endif + ctx->new_table->flags + = static_cast<uint16_t>( + (ctx->new_table->flags + & ~(0xFU + << DICT_TF_POS_PAGE_COMPRESSION_LEVEL)) + | 1 << DICT_TF_POS_PAGE_COMPRESSION + | (ctx->page_compression_level & 0xF) + << DICT_TF_POS_PAGE_COMPRESSION_LEVEL) + & ((1U << DICT_TF_BITS) - 1); +#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6 +# pragma GCC diagnostic pop +#endif + + if (fil_space_t* space = ctx->new_table->space) { + bool update = !(space->flags + & FSP_FLAGS_MASK_PAGE_COMPRESSION); + mysql_mutex_lock(&fil_system.mutex); + space->flags &= ~FSP_FLAGS_MASK_MEM_COMPRESSION_LEVEL; + space->flags |= ctx->page_compression_level + << FSP_FLAGS_MEM_COMPRESSION_LEVEL; + if (!space->full_crc32()) { + space->flags + |= FSP_FLAGS_MASK_PAGE_COMPRESSION; + } else if (!space->is_compressed()) { + space->flags |= static_cast<uint32_t>( + innodb_compression_algorithm) + << FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO; + } + mysql_mutex_unlock(&fil_system.mutex); + + if (update) { + /* Maybe we should introduce an undo + log record for updating tablespace + flags, and perform the update already + in innobase_page_compression_try(). + + If the server is killed before the + following mini-transaction commit + becomes durable, fsp_flags_try_adjust() + will perform the equivalent adjustment + and warn "adjusting FSP_SPACE_FLAGS". */ + mtr_t mtr; + mtr.start(); + if (buf_block_t* b = buf_page_get( + page_id_t(space->id, 0), + space->zip_size(), + RW_X_LATCH, &mtr)) { + byte* f = FSP_HEADER_OFFSET + + FSP_SPACE_FLAGS + + b->page.frame; + const auto sf = space->flags + & ~FSP_FLAGS_MEM_MASK; + if (mach_read_from_4(f) != sf) { + mtr.set_named_space(space); + mtr.write<4,mtr_t::FORCED>( + *b, f, sf); + } + } + mtr.commit(); + } + } + } + + col_set drop_list; + col_set v_drop_list; + + /* Check whether each column that was part of a dropped index is + still part of some index that is not being dropped. If it is not, + clear the ord_part flag of the column.
*/ + collect_columns_from_dropped_indexes(ctx, drop_list, v_drop_list); + + for (ulint col : drop_list) { + if (!check_col_exists_in_indexes(ctx->new_table, col, false)) { + ctx->new_table->cols[col].ord_part = 0; + } + } + + for (ulint col : v_drop_list) { + if (!check_col_exists_in_indexes(ctx->new_table, col, true)) { + ctx->new_table->v_cols[col].m_col.ord_part = 0; + } + } + + for (ulint i = 0; i < ctx->num_to_add_index; i++) { + dict_index_t* index = ctx->add_index[i]; + DBUG_ASSERT(dict_index_get_online_status(index) + == ONLINE_INDEX_COMPLETE); + DBUG_ASSERT(!index->is_committed()); + index->change_col_info = nullptr; + index->set_committed(true); + } + + for (ulint i = 0; i < ctx->num_to_drop_index; i++) { + dict_index_t* index = ctx->drop_index[i]; + DBUG_ASSERT(index->is_committed()); + DBUG_ASSERT(index->table == ctx->new_table); + DBUG_ASSERT(index->to_be_dropped); + + if (!dict_foreign_replace_index(index->table, ctx->col_names, + index)) { + found = false; + } + + dict_index_remove_from_cache(index->table, index); + } + + fts_clear_all(ctx->old_table); + + if (!ctx->is_instant()) { + innobase_rename_or_enlarge_columns_cache( + ha_alter_info, altered_table, table, ctx->new_table); + } else { + ut_ad(ctx->col_map); + + if (fts_t* fts = ctx->new_table->fts) { + ut_ad(fts->doc_col != ULINT_UNDEFINED); + ut_ad(ctx->new_table->n_cols > DATA_N_SYS_COLS); + const ulint c = ctx->col_map[fts->doc_col]; + ut_ad(c < ulint(ctx->new_table->n_cols) + - DATA_N_SYS_COLS); + ut_d(const dict_col_t& col = ctx->new_table->cols[c]); + ut_ad(!col.is_nullable()); + ut_ad(!col.is_virtual()); + ut_ad(!col.is_added()); + ut_ad(col.prtype & DATA_UNSIGNED); + ut_ad(col.mtype == DATA_INT); + ut_ad(col.len == 8); + ut_ad(col.ord_part); + fts->doc_col = c; + } + + if (ha_alter_info->handler_flags & ALTER_DROP_STORED_COLUMN) { + const dict_index_t* index = ctx->new_table->indexes.start; + + for (const dict_field_t* f = index->fields, + * const end = f + index->n_fields; + f != end; f++) { + dict_col_t& c = *f->col; + if (c.is_dropped()) { + c.set_dropped(!c.is_nullable(), + DATA_LARGE_MTYPE(c.mtype) + || (!f->fixed_len + && c.len > 255), + f->fixed_len); + } + } + } + + if (!ctx->instant_table->persistent_autoinc) { + ctx->new_table->persistent_autoinc = 0; + } + } + + if (ha_alter_info->handler_flags & ALTER_COLUMN_UNVERSIONED) { + vers_change_fields_cache(ha_alter_info, ctx, table); + } + + if (ha_alter_info->handler_flags & ALTER_RENAME_INDEX) { + innobase_rename_indexes_cache(ctx, ha_alter_info); + } + + ctx->new_table->fts_doc_id_index + = ctx->new_table->fts + ? dict_table_get_index_on_name( + ctx->new_table, FTS_DOC_ID_INDEX_NAME) + : NULL; + DBUG_ASSERT((ctx->new_table->fts == NULL) + == (ctx->new_table->fts_doc_id_index == NULL)); + if (table->found_next_number_field + && !altered_table->found_next_number_field) { + ctx->prebuilt->table->persistent_autoinc = 0; + } + DBUG_RETURN(found); +} + +/** Adjust the persistent statistics after non-rebuilding ALTER TABLE. +Remove statistics for dropped indexes, add statistics for created indexes +and rename statistics for renamed indexes. 
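+(Statistics of dropped and renamed indexes were already adjusted in +commit_try_norebuild() via dict_stats_delete_from_index_stats() and +dict_stats_rename_index(); what remains here is to compute statistics for the +newly created non-FTS indexes, as the loop below does.)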
+@param ha_alter_info Data used during in-place alter +@param ctx In-place ALTER TABLE context +@param thd MySQL connection +*/ +static +void +alter_stats_norebuild( +/*==================*/ + Alter_inplace_info* ha_alter_info, + ha_innobase_inplace_ctx* ctx, + THD* thd) +{ + DBUG_ENTER("alter_stats_norebuild"); + DBUG_ASSERT(!ctx->need_rebuild()); + + if (!dict_stats_is_persistent_enabled(ctx->new_table)) { + DBUG_VOID_RETURN; + } + + for (ulint i = 0; i < ctx->num_to_add_index; i++) { + dict_index_t* index = ctx->add_index[i]; + DBUG_ASSERT(index->table == ctx->new_table); + + if (!(index->type & DICT_FTS)) { + dict_stats_init(ctx->new_table); + dict_stats_update_for_index(index); + } + } + + DBUG_VOID_RETURN; +} + +/** Adjust the persistent statistics after rebuilding ALTER TABLE. +Remove statistics for dropped indexes, add statistics for created indexes +and rename statistics for renamed indexes. +@param table InnoDB table that was rebuilt by ALTER TABLE +@param table_name Table name in MySQL +@param thd MySQL connection +*/ +static +void +alter_stats_rebuild( +/*================*/ + dict_table_t* table, + const char* table_name, + THD* thd) +{ + DBUG_ENTER("alter_stats_rebuild"); + + if (!table->space + || !dict_stats_is_persistent_enabled(table)) { + DBUG_VOID_RETURN; + } + + dberr_t ret = dict_stats_update(table, DICT_STATS_RECALC_PERSISTENT); + + if (ret != DB_SUCCESS) { + push_warning_printf( + thd, + Sql_condition::WARN_LEVEL_WARN, + ER_ALTER_INFO, + "Error updating stats for table '%s'" + " after table rebuild: %s", + table_name, ut_strerr(ret)); + } + + DBUG_VOID_RETURN; +} + +/** Apply the log for the table rebuild operation. +@param[in] ctx Inplace Alter table context +@param[in] ha_alter_info ALTER TABLE information +@param[in] altered_table MySQL table that is being altered +@return true Failure, else false. */ +static bool alter_rebuild_apply_log( + ha_innobase_inplace_ctx* ctx, + Alter_inplace_info* ha_alter_info, + TABLE* altered_table) +{ + DBUG_ENTER("alter_rebuild_apply_log"); + + if (!ctx->online) { + DBUG_RETURN(false); + } + + /* We copied the table. Any indexes that were requested to be + dropped were not created in the copy of the table. Apply any + last bit of the rebuild log and then rename the tables. */ + dict_table_t* user_table = ctx->old_table; + + DEBUG_SYNC_C("row_log_table_apply2_before"); + + dict_vcol_templ_t* s_templ = NULL; + + if (ctx->new_table->n_v_cols > 0) { + s_templ = UT_NEW_NOKEY( + dict_vcol_templ_t()); + s_templ->vtempl = NULL; + + innobase_build_v_templ(altered_table, ctx->new_table, s_templ, + NULL, true); + ctx->new_table->vc_templ = s_templ; + } + + dberr_t error = row_log_table_apply( + ctx->thr, user_table, altered_table, + static_cast<ha_innobase_inplace_ctx*>( + ha_alter_info->handler_ctx)->m_stage, + ctx->new_table); + + if (s_templ) { + ut_ad(ctx->need_rebuild()); + dict_free_vc_templ(s_templ); + UT_DELETE(s_templ); + ctx->new_table->vc_templ = NULL; + } + + DBUG_RETURN(ctx->log_failure( + ha_alter_info, altered_table, error)); +} + +/** Commit or rollback the changes made during +prepare_inplace_alter_table() and inplace_alter_table() inside +the storage engine. Note that the allowed level of concurrency +during this operation will be the same as for +inplace_alter_table() and thus might be higher than during +prepare_inplace_alter_table(). (E.g. concurrent writes were +blocked during prepare, but might not be during commit). +@param altered_table TABLE object for new version of table.
+@param ha_alter_info Structure describing changes to be done +by ALTER TABLE and holding data used during in-place alter. +@param commit true => Commit, false => Rollback. +@retval true Failure +@retval false Success +*/ + +bool +ha_innobase::commit_inplace_alter_table( +/*====================================*/ + TABLE* altered_table, + Alter_inplace_info* ha_alter_info, + bool commit) +{ + ha_innobase_inplace_ctx*ctx0; + + ctx0 = static_cast<ha_innobase_inplace_ctx*> + (ha_alter_info->handler_ctx); + +#ifndef DBUG_OFF + uint failure_inject_count = 1; +#endif /* DBUG_OFF */ + + DBUG_ENTER("commit_inplace_alter_table"); + DBUG_ASSERT(!srv_read_only_mode); + DBUG_ASSERT(!ctx0 || ctx0->prebuilt == m_prebuilt); + DBUG_ASSERT(!ctx0 || ctx0->old_table == m_prebuilt->table); + + DEBUG_SYNC_C("innodb_commit_inplace_alter_table_enter"); + + DEBUG_SYNC_C("innodb_commit_inplace_alter_table_wait"); + + if (ctx0 != NULL && ctx0->m_stage != NULL) { + ctx0->m_stage->begin_phase_end(); + } + + if (!commit) { + /* A rollback is being requested. So far we may at + most have created stubs for ADD INDEX or a copy of the + table for rebuild. */ + DBUG_RETURN(rollback_inplace_alter_table( + ha_alter_info, table, m_prebuilt)); + } + + if (!(ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE)) { + DBUG_ASSERT(!ctx0); + MONITOR_ATOMIC_DEC(MONITOR_PENDING_ALTER_TABLE); + if (table->found_next_number_field + && !altered_table->found_next_number_field) { + m_prebuilt->table->persistent_autoinc = 0; + /* Don't reset ha_alter_info->group_commit_ctx, so + that the partitioning engine will call this function + for all partitions. */ + } + else + ha_alter_info->group_commit_ctx = NULL; + DBUG_RETURN(false); + } + + DBUG_ASSERT(ctx0); + + inplace_alter_handler_ctx** ctx_array; + inplace_alter_handler_ctx* ctx_single[2]; + + if (ha_alter_info->group_commit_ctx) { + ctx_array = ha_alter_info->group_commit_ctx; + } else { + ctx_single[0] = ctx0; + ctx_single[1] = NULL; + ctx_array = ctx_single; + } + + DBUG_ASSERT(ctx0 == ctx_array[0]); + ut_ad(m_prebuilt->table == ctx0->old_table); + ha_alter_info->group_commit_ctx = NULL; + + const bool new_clustered = ctx0->need_rebuild(); + trx_t* const trx = ctx0->trx; + trx->op_info = "acquiring table lock"; + bool fts_exist = false; + for (inplace_alter_handler_ctx** pctx = ctx_array; *pctx; pctx++) { + auto ctx = static_cast<ha_innobase_inplace_ctx*>(*pctx); + DBUG_ASSERT(ctx->prebuilt->trx == m_prebuilt->trx); + ut_ad(m_prebuilt != ctx->prebuilt || ctx == ctx0); + DBUG_ASSERT(new_clustered == ctx->need_rebuild()); + /* If decryption failed for the old table or the new table, + fail here.
*/ + if ((!ctx->old_table->is_readable() + && ctx->old_table->space) + || (!ctx->new_table->is_readable() + && ctx->new_table->space)) { + String str; + const char* engine= table_type(); + get_error_message(HA_ERR_DECRYPTION_FAILED, &str); + my_error(ER_GET_ERRMSG, MYF(0), HA_ERR_DECRYPTION_FAILED, str.c_ptr(), engine); + DBUG_RETURN(true); + } + if ((ctx->old_table->flags2 | ctx->new_table->flags2) + & (DICT_TF2_FTS_HAS_DOC_ID | DICT_TF2_FTS)) { + fts_exist = true; + } + } + + bool already_stopped= false; + for (inplace_alter_handler_ctx** pctx = ctx_array; *pctx; pctx++) { + auto ctx = static_cast<ha_innobase_inplace_ctx*>(*pctx); + dberr_t error = DB_SUCCESS; + + if (fts_exist) { + purge_sys.stop_FTS(*ctx->old_table, already_stopped); + already_stopped = true; + } + + if (new_clustered && ctx->old_table->fts) { + ut_ad(!ctx->old_table->fts->add_wq); + fts_optimize_remove_table(ctx->old_table); + } + + dict_sys.freeze(SRW_LOCK_CALL); + for (auto f : ctx->old_table->referenced_set) { + if (dict_table_t* child = f->foreign_table) { + error = lock_table_for_trx(child, trx, LOCK_X); + if (error != DB_SUCCESS) { + break; + } + } + } + dict_sys.unfreeze(); + + if (ctx->new_table->fts) { + ut_ad(!ctx->new_table->fts->add_wq); + fts_optimize_remove_table(ctx->new_table); + fts_sync_during_ddl(ctx->new_table); + } + + /* Exclusively lock the table, to ensure that no other + transaction is holding locks on the table while we + change the table definition. Any recovered incomplete + transactions would be holding InnoDB locks only, not MDL. */ + if (error == DB_SUCCESS) { + error = lock_table_for_trx(ctx->new_table, trx, + LOCK_X); + } + + DBUG_EXECUTE_IF("deadlock_table_fail", + { + error= DB_DEADLOCK; + trx_rollback_for_mysql(trx); + }); + + if (error != DB_SUCCESS) { +lock_fail: + my_error_innodb( + error, table_share->table_name.str, 0); + if (fts_exist) { + purge_sys.resume_FTS(); + } + + /* A deadlock was encountered and the + transaction was rolled back. Restart the + transaction to remove the newly created table or + index from the data dictionary and table cache + in rollback_inplace_alter_table() */ + if (trx->state == TRX_STATE_NOT_STARTED) { + trx_start_for_ddl(trx); + } + + DBUG_RETURN(true); + } else if ((ctx->new_table->flags2 + & (DICT_TF2_FTS_HAS_DOC_ID | DICT_TF2_FTS)) + && (error = fts_lock_tables(trx, *ctx->new_table)) + != DB_SUCCESS) { + goto lock_fail; + } else if (!new_clustered) { + } else if ((error = lock_table_for_trx(ctx->old_table, trx, + LOCK_X)) + != DB_SUCCESS) { + goto lock_fail; + } else if ((ctx->old_table->flags2 + & (DICT_TF2_FTS_HAS_DOC_ID | DICT_TF2_FTS)) + && (error = fts_lock_tables(trx, *ctx->old_table)) + != DB_SUCCESS) { + goto lock_fail; + } + } + + DEBUG_SYNC(m_user_thd, "innodb_alter_commit_after_lock_table"); + + if (new_clustered) { + /* We are holding MDL_EXCLUSIVE as well as exclusive + InnoDB table locks. Let us apply any table rebuild log + before locking dict_sys.
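+For a rebuild, the accumulated changes are applied by row_log_table_apply() +(see alter_rebuild_apply_log() above); in the non-rebuild branch below, each +online-created index instead drains its own log via row_log_apply().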
*/ + for (inplace_alter_handler_ctx** pctx= ctx_array; *pctx; + pctx++) { + auto ctx= static_cast<ha_innobase_inplace_ctx*>(*pctx); + DBUG_ASSERT(ctx->need_rebuild()); + if (alter_rebuild_apply_log(ctx, ha_alter_info, + altered_table)) { + if (fts_exist) { + purge_sys.resume_FTS(); + } + DBUG_RETURN(true); + } + } + } else { + dberr_t error= DB_SUCCESS; + for (inplace_alter_handler_ctx** pctx= ctx_array; *pctx; + pctx++) { + auto ctx= static_cast<ha_innobase_inplace_ctx*>(*pctx); + + if (!ctx->online || !ctx->old_table->space + || !ctx->old_table->is_readable()) { + continue; + } + + for (ulint i = 0; i < ctx->num_to_add_index; i++) { + dict_index_t *index= ctx->add_index[i]; + + ut_ad(!(index->type & + (DICT_FTS | DICT_SPATIAL))); + + index->lock.x_lock(SRW_LOCK_CALL); + if (!index->online_log) { + /* The online log would already + have been freed when the error + was detected in another index */ + index->lock.x_unlock(); + continue; + } + + if (index->is_corrupted()) { + /* The online index log has been + preserved to report the error + that happened during + row_log_apply() in a DML thread */ + error= row_log_get_error(index); +err_index: + ut_ad(error != DB_SUCCESS); + ctx->log_failure( + ha_alter_info, + altered_table, error); + row_log_free(index->online_log); + index->online_log= nullptr; + index->lock.x_unlock(); + + ctx->old_table->indexes.start + ->online_log= nullptr; + if (fts_exist) { + purge_sys.resume_FTS(); + } + MONITOR_ATOMIC_INC( + MONITOR_BACKGROUND_DROP_INDEX); + DBUG_RETURN(true); + } + + index->lock.x_unlock(); + + error = row_log_apply( + m_prebuilt->trx, index, altered_table, + ctx->m_stage); + + index->lock.x_lock(SRW_LOCK_CALL); + + if (error != DB_SUCCESS) { + goto err_index; + } + + row_log_free(index->online_log); + index->online_log= nullptr; + index->lock.x_unlock(); + } + + ctx->old_table->indexes.start->online_log= nullptr; + } + } + + dict_table_t *table_stats = nullptr, *index_stats = nullptr; + MDL_ticket *mdl_table = nullptr, *mdl_index = nullptr; + dberr_t error = DB_SUCCESS; + if (!ctx0->old_table->is_stats_table() && + !ctx0->new_table->is_stats_table()) { + table_stats = dict_table_open_on_name( + TABLE_STATS_NAME, false, DICT_ERR_IGNORE_NONE); + if (table_stats) { + dict_sys.freeze(SRW_LOCK_CALL); + table_stats = dict_acquire_mdl_shared( + table_stats, m_user_thd, &mdl_table); + dict_sys.unfreeze(); + } + index_stats = dict_table_open_on_name( + INDEX_STATS_NAME, false, DICT_ERR_IGNORE_NONE); + if (index_stats) { + dict_sys.freeze(SRW_LOCK_CALL); + index_stats = dict_acquire_mdl_shared( + index_stats, m_user_thd, &mdl_index); + dict_sys.unfreeze(); + } + + if (table_stats && index_stats + && !strcmp(table_stats->name.m_name, TABLE_STATS_NAME) + && !strcmp(index_stats->name.m_name, INDEX_STATS_NAME) + && !(error = lock_table_for_trx(table_stats, + trx, LOCK_X))) { + error = lock_table_for_trx(index_stats, trx, LOCK_X); + } + } + + DBUG_EXECUTE_IF("stats_lock_fail", + error = DB_LOCK_WAIT_TIMEOUT; + trx_rollback_for_mysql(trx);); + + if (error == DB_SUCCESS) { + error = lock_sys_tables(trx); + } + if (error != DB_SUCCESS) { + if (table_stats) { + dict_table_close(table_stats, false, m_user_thd, + mdl_table); + } + if (index_stats) { + dict_table_close(index_stats, false, m_user_thd, + mdl_index); + } + my_error_innodb(error, table_share->table_name.str, 0); + if (fts_exist) { + purge_sys.resume_FTS(); + } + + if (trx->state == TRX_STATE_NOT_STARTED) { + /* Transaction may have been rolled back + due to a lock wait timeout, deadlock, + or a KILL statement.
So restart the + transaction to remove the newly created + table or index stubs from data dictionary + and table cache in + rollback_inplace_alter_table() */ + trx_start_for_ddl(trx); + } + + DBUG_RETURN(true); + } + + row_mysql_lock_data_dictionary(trx); + + /* Apply the changes to the data dictionary tables, for all + partitions. */ + for (inplace_alter_handler_ctx** pctx = ctx_array; *pctx; pctx++) { + auto ctx = static_cast<ha_innobase_inplace_ctx*>(*pctx); + + DBUG_ASSERT(new_clustered == ctx->need_rebuild()); + if (ctx->need_rebuild() && !ctx->old_table->space) { + my_error(ER_TABLESPACE_DISCARDED, MYF(0), + table->s->table_name.str); +fail: + trx->rollback(); + ut_ad(!trx->fts_trx); + if (table_stats) { + dict_table_close(table_stats, true, m_user_thd, + mdl_table); + } + if (index_stats) { + dict_table_close(index_stats, true, m_user_thd, + mdl_index); + } + row_mysql_unlock_data_dictionary(trx); + if (fts_exist) { + purge_sys.resume_FTS(); + } + trx_start_for_ddl(trx); + DBUG_RETURN(true); + } + + if (commit_set_autoinc(ha_alter_info, ctx, + altered_table, table)) { + goto fail; + } + + if (ctx->need_rebuild()) { + ctx->tmp_name = dict_mem_create_temporary_tablename( + ctx->heap, ctx->new_table->name.m_name, + ctx->new_table->id); + + if (commit_try_rebuild(ha_alter_info, ctx, + altered_table, table, + table_stats && index_stats, + trx, + table_share->table_name.str)) { + goto fail; + } + } else if (commit_try_norebuild(ha_alter_info, ctx, + altered_table, table, trx, + table_share->table_name.str)) { + goto fail; + } +#ifndef DBUG_OFF + { + /* Generate a dynamic dbug text. */ + char buf[32]; + + snprintf(buf, sizeof buf, + "ib_commit_inplace_fail_%u", + failure_inject_count++); + + DBUG_EXECUTE_IF(buf, + my_error(ER_INTERNAL_ERROR, MYF(0), + "Injected error!"); + goto fail; + ); + } +#endif + } + + if (table_stats) { + dict_table_close(table_stats, true, m_user_thd, mdl_table); + } + if (index_stats) { + dict_table_close(index_stats, true, m_user_thd, mdl_index); + } + + /* Commit or roll back the changes to the data dictionary. */ + DEBUG_SYNC(m_user_thd, "innodb_alter_inplace_before_commit"); + + if (new_clustered) { + ut_ad(trx->has_logged_persistent()); + for (inplace_alter_handler_ctx** pctx = ctx_array; *pctx; + pctx++) { + auto ctx= static_cast<ha_innobase_inplace_ctx*>(*pctx); + ut_ad(!strcmp(ctx->old_table->name.m_name, + ctx->tmp_name)); + ut_ad(ctx->new_table->get_ref_count() == 1); + const bool own = m_prebuilt == ctx->prebuilt; + trx_t* const user_trx = m_prebuilt->trx; + ctx->prebuilt->table->release(); + ctx->prebuilt->table = nullptr; + row_prebuilt_free(ctx->prebuilt); + /* Rebuild the prebuilt object. */ + ctx->prebuilt = row_create_prebuilt( + ctx->new_table, altered_table->s->reclength); + if (own) { + m_prebuilt = ctx->prebuilt; + } + trx_start_if_not_started(user_trx, true); + m_prebuilt->trx = user_trx; + } + } + + ut_ad(!trx->fts_trx); + + std::vector<pfs_os_file_t> deleted; + DBUG_EXECUTE_IF("innodb_alter_commit_crash_before_commit", + log_buffer_flush_to_disk(); DBUG_SUICIDE();); + /* The SQL layer recovery of ALTER TABLE will invoke + innodb_check_version() to know whether our trx->id, which we + reported via ha_innobase::table_version() after + ha_innobase::prepare_inplace_alter_table(), was committed. + + If this trx was committed (the log write below completed), + we will be able to recover our trx->id to + dict_table_t::def_trx_id from the data dictionary tables.
+ + For this logic to work, purge_sys.stop_SYS() and + purge_sys.resume_SYS() will ensure that the DB_TRX_ID that we + wrote to the SYS_ tables will be preserved until the SQL layer + has durably marked the ALTER TABLE operation as completed. + + During recovery, the purge of InnoDB transaction history will + not start until innodb_ddl_recovery_done(). */ + ha_alter_info->inplace_alter_table_committed = purge_sys.resume_SYS; + purge_sys.stop_SYS(); + trx->commit(deleted); + + /* At this point, the changes to the persistent storage have + been committed or rolled back. What remains to be done is to + update the in-memory structures, close some handles, release + temporary files, and (unless we rolled back) update persistent + statistics. */ + for (inplace_alter_handler_ctx** pctx = ctx_array; + *pctx; pctx++) { + ha_innobase_inplace_ctx* ctx + = static_cast<ha_innobase_inplace_ctx*>(*pctx); + + DBUG_ASSERT(ctx->need_rebuild() == new_clustered); + + innobase_copy_frm_flags_from_table_share( + ctx->new_table, altered_table->s); + + if (new_clustered) { + DBUG_PRINT("to_be_dropped", + ("table: %s", ctx->old_table->name.m_name)); + + if (innobase_update_foreign_cache(ctx, m_user_thd) + != DB_SUCCESS + && m_prebuilt->trx->check_foreigns) { +foreign_fail: + push_warning_printf( + m_user_thd, + Sql_condition::WARN_LEVEL_WARN, + ER_ALTER_INFO, + "failed to load FOREIGN KEY" + " constraints"); + } + } else { + bool fk_fail = innobase_update_foreign_cache( + ctx, m_user_thd) != DB_SUCCESS; + + if (!commit_cache_norebuild(ha_alter_info, ctx, + altered_table, table, + trx)) { + fk_fail = true; + } + + if (fk_fail && m_prebuilt->trx->check_foreigns) { + goto foreign_fail; + } + } + + dict_mem_table_free_foreign_vcol_set(ctx->new_table); + dict_mem_table_fill_foreign_vcol_set(ctx->new_table); + } + + ut_ad(trx == ctx0->trx); + ctx0->trx = nullptr; + + /* Free the ctx->trx of other partitions, if any. We will only + use the ctx0->trx here. Others may have been allocated in + the prepare stage. */ + + for (inplace_alter_handler_ctx** pctx = &ctx_array[1]; *pctx; + pctx++) { + ha_innobase_inplace_ctx* ctx + = static_cast<ha_innobase_inplace_ctx*>(*pctx); + + if (ctx->trx) { + ctx->trx->rollback(); + ctx->trx->free(); + ctx->trx = NULL; + } + } + + /* MDEV-17468: Avoid this at least when ctx->is_instant(). + Currently dict_load_column_low() is the only place where + num_base for virtual columns is assigned to nonzero.
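+Hence, whenever virtual columns were added or dropped, or column collations +were changed, the condition below evicts the table from the dictionary cache +and reloads it via innobase_reload_table(), so that the virtual column +metadata is rebuilt from the data dictionary.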
*/ + if (ctx0->num_to_drop_vcol || ctx0->num_to_add_vcol + || (ctx0->new_table->n_v_cols && !new_clustered + && (ha_alter_info->alter_info->drop_list.elements + || ha_alter_info->alter_info->create_list.elements)) + || (ctx0->is_instant() + && m_prebuilt->table->n_v_cols + && ha_alter_info->handler_flags & ALTER_STORED_COLUMN_ORDER) + || !ctx0->change_col_collate.empty()) { + DBUG_ASSERT(ctx0->old_table->get_ref_count() == 1); + ut_ad(ctx0->prebuilt == m_prebuilt); + + for (inplace_alter_handler_ctx** pctx = ctx_array; *pctx; + pctx++) { + auto ctx= static_cast<ha_innobase_inplace_ctx*>(*pctx); + ctx->prebuilt->table = innobase_reload_table( + m_user_thd, ctx->prebuilt->table, + table->s->table_name, *ctx); + innobase_copy_frm_flags_from_table_share( + ctx->prebuilt->table, altered_table->s); + } + + unlock_and_close_files(deleted, trx); + log_write_up_to(trx->commit_lsn, true); + DBUG_EXECUTE_IF("innodb_alter_commit_crash_after_commit", + DBUG_SUICIDE();); + trx->free(); + if (fts_exist) { + purge_sys.resume_FTS(); + } + MONITOR_ATOMIC_DEC(MONITOR_PENDING_ALTER_TABLE); + /* There is no need to reset dict_table_t::persistent_autoinc + as the table is reloaded */ + DBUG_RETURN(false); + } + + for (inplace_alter_handler_ctx** pctx = ctx_array; + *pctx; pctx++) { + ha_innobase_inplace_ctx* ctx + = static_cast<ha_innobase_inplace_ctx*> + (*pctx); + DBUG_ASSERT(ctx->need_rebuild() == new_clustered); + + /* Publish the created fulltext index, if any. + Note that a fulltext index can be created without + creating the clustered index, if there already exists + a suitable FTS_DOC_ID column. If not, one will be + created, implying new_clustered */ + for (ulint i = 0; i < ctx->num_to_add_index; i++) { + dict_index_t* index = ctx->add_index[i]; + + if (index->type & DICT_FTS) { + DBUG_ASSERT(index->type == DICT_FTS); + /* We reset DICT_TF2_FTS here because the bit + is left unset when a drop precedes the add. */ + DICT_TF2_FLAG_SET(ctx->new_table, DICT_TF2_FTS); + fts_add_index(index, ctx->new_table); + } + } + + ut_d(dict_table_check_for_dup_indexes( + ctx->new_table, CHECK_ALL_COMPLETE)); + + /* Start/Restart the FTS background operations. */ + if (ctx->new_table->fts) { + fts_optimize_add_table(ctx->new_table); + } + + ut_d(dict_table_check_for_dup_indexes( + ctx->new_table, CHECK_ABORTED_OK)); + +#ifdef UNIV_DEBUG + if (!(ctx->new_table->fts != NULL + && ctx->new_table->fts->cache->sync->in_progress)) { + ut_a(fts_check_cached_index(ctx->new_table)); + } +#endif + } + + unlock_and_close_files(deleted, trx); + log_write_up_to(trx->commit_lsn, true); + DBUG_EXECUTE_IF("innodb_alter_commit_crash_after_commit", + DBUG_SUICIDE();); + trx->free(); + if (fts_exist) { + purge_sys.resume_FTS(); + } + + /* TODO: The following code could be executed + while allowing concurrent access to the table + (MDL downgrade). */ + + if (new_clustered) { + for (inplace_alter_handler_ctx** pctx = ctx_array; + *pctx; pctx++) { + ha_innobase_inplace_ctx* ctx + = static_cast<ha_innobase_inplace_ctx*> + (*pctx); + DBUG_ASSERT(ctx->need_rebuild()); + + alter_stats_rebuild( + ctx->new_table, table->s->table_name.str, + m_user_thd); + } + } else { + for (inplace_alter_handler_ctx** pctx = ctx_array; + *pctx; pctx++) { + ha_innobase_inplace_ctx* ctx + = static_cast<ha_innobase_inplace_ctx*> + (*pctx); + DBUG_ASSERT(!ctx->need_rebuild()); + + alter_stats_norebuild(ha_alter_info, ctx, m_user_thd); + } + } + + innobase_parse_hint_from_comment( + m_user_thd, m_prebuilt->table, altered_table->s); + + /* TODO: Also perform DROP TABLE and DROP INDEX after + the MDL downgrade.
*/ + +#ifndef DBUG_OFF + dict_index_t* clust_index = dict_table_get_first_index( + ctx0->prebuilt->table); + DBUG_ASSERT(!clust_index->online_log); + DBUG_ASSERT(dict_index_get_online_status(clust_index) + == ONLINE_INDEX_COMPLETE); + + for (dict_index_t* index = clust_index; + index; + index = dict_table_get_next_index(index)) { + DBUG_ASSERT(!index->to_be_dropped); + } +#endif /* DBUG_OFF */ + MONITOR_ATOMIC_DEC(MONITOR_PENDING_ALTER_TABLE); + DBUG_RETURN(false); +} + +/** +@param thd the session +@param start_value the lower bound +@param max_value the upper bound (inclusive) */ + +ib_sequence_t::ib_sequence_t( + THD* thd, + ulonglong start_value, + ulonglong max_value) + : + m_max_value(max_value), + m_increment(0), + m_offset(0), + m_next_value(start_value), + m_eof(false) +{ + if (thd != 0 && m_max_value > 0) { + + thd_get_autoinc(thd, &m_offset, &m_increment); + + if (m_increment > 1 || m_offset > 1) { + + /* If there is an offset or increment specified + then we need to work out the exact next value. */ + + m_next_value = innobase_next_autoinc( + start_value, 1, + m_increment, m_offset, m_max_value); + + } else if (start_value == 0) { + /* The next value can never be 0. */ + m_next_value = 1; + } + } else { + m_eof = true; + } +} + +/** +Postfix increment +@return the next value to insert */ + +ulonglong +ib_sequence_t::operator++(int) UNIV_NOTHROW +{ + ulonglong current = m_next_value; + + ut_ad(!m_eof); + ut_ad(m_max_value > 0); + + m_next_value = innobase_next_autoinc( + current, 1, m_increment, m_offset, m_max_value); + + if (m_next_value == m_max_value && current == m_next_value) { + m_eof = true; + } + + return(current); +} diff --git a/storage/innobase/handler/i_s.cc b/storage/innobase/handler/i_s.cc new file mode 100644 index 00000000..b00308d7 --- /dev/null +++ b/storage/innobase/handler/i_s.cc @@ -0,0 +1,6506 @@ +/***************************************************************************** + +Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2014, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file handler/i_s.cc +InnoDB INFORMATION SCHEMA tables interface to MySQL. 
+ +Created July 18, 2007 Vasil Dimov +*******************************************************/ + +#include "univ.i" +#include +#include + +#include +#include +#include + +#include "i_s.h" +#include "btr0pcur.h" +#include "btr0types.h" +#include "dict0dict.h" +#include "dict0load.h" +#include "buf0buddy.h" +#include "buf0buf.h" +#include "ibuf0ibuf.h" +#include "dict0mem.h" +#include "dict0types.h" +#include "srv0start.h" +#include "trx0i_s.h" +#include "trx0trx.h" +#include "srv0mon.h" +#include "pars0pars.h" +#include "fts0types.h" +#include "fts0opt.h" +#include "fts0priv.h" +#include "btr0btr.h" +#include "page0zip.h" +#include "fil0fil.h" +#include "fil0crypt.h" +#include "dict0crea.h" +#include "fts0vlc.h" +#include "scope.h" +#include "log.h" + +/** The latest successfully looked up innodb_fts_aux_table */ +table_id_t innodb_ft_aux_table_id; + +/** structure associates a name string with a file page type and/or buffer +page state. */ +struct buf_page_desc_t{ + const char* type_str; /*!< String explain the page + type/state */ + ulint type_value; /*!< Page type or page state */ +}; + +/** We also define I_S_PAGE_TYPE_INDEX as the Index Page's position +in i_s_page_type[] array */ +#define I_S_PAGE_TYPE_INDEX 1 + +/** Any unassigned FIL_PAGE_TYPE will be treated as unknown. */ +#define I_S_PAGE_TYPE_UNKNOWN FIL_PAGE_TYPE_UNKNOWN + +/** R-tree index page */ +#define I_S_PAGE_TYPE_RTREE (FIL_PAGE_TYPE_LAST + 1) + +/** Change buffer B-tree page */ +#define I_S_PAGE_TYPE_IBUF (FIL_PAGE_TYPE_LAST + 2) + +#define I_S_PAGE_TYPE_LAST I_S_PAGE_TYPE_IBUF + +#define I_S_PAGE_TYPE_BITS 4 + +/** Name string for File Page Types */ +static buf_page_desc_t i_s_page_type[] = { + {"ALLOCATED", FIL_PAGE_TYPE_ALLOCATED}, + {"INDEX", FIL_PAGE_INDEX}, + {"UNDO_LOG", FIL_PAGE_UNDO_LOG}, + {"INODE", FIL_PAGE_INODE}, + {"IBUF_FREE_LIST", FIL_PAGE_IBUF_FREE_LIST}, + {"IBUF_BITMAP", FIL_PAGE_IBUF_BITMAP}, + {"SYSTEM", FIL_PAGE_TYPE_SYS}, + {"TRX_SYSTEM", FIL_PAGE_TYPE_TRX_SYS}, + {"FILE_SPACE_HEADER", FIL_PAGE_TYPE_FSP_HDR}, + {"EXTENT_DESCRIPTOR", FIL_PAGE_TYPE_XDES}, + {"BLOB", FIL_PAGE_TYPE_BLOB}, + {"COMPRESSED_BLOB", FIL_PAGE_TYPE_ZBLOB}, + {"COMPRESSED_BLOB2", FIL_PAGE_TYPE_ZBLOB2}, + {"UNKNOWN", I_S_PAGE_TYPE_UNKNOWN}, + {"RTREE_INDEX", I_S_PAGE_TYPE_RTREE}, + {"IBUF_INDEX", I_S_PAGE_TYPE_IBUF}, + {"PAGE COMPRESSED", FIL_PAGE_PAGE_COMPRESSED}, + {"PAGE COMPRESSED AND ENCRYPTED", FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED}, +}; + +/** This structure defines information we will fetch from pages +currently cached in the buffer pool. 
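+(The page_type member below is stored in I_S_PAGE_TYPE_BITS = 4 bits, so +every code up to I_S_PAGE_TYPE_LAST, including the pseudo types +I_S_PAGE_TYPE_RTREE and I_S_PAGE_TYPE_IBUF defined above, must fit in that +width.)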
It will be used to populate +table INFORMATION_SCHEMA.INNODB_BUFFER_PAGE */ +struct buf_page_info_t{ + ulint block_id; /*!< Buffer Pool block ID */ + /** page identifier */ + page_id_t id; + uint32_t access_time; /*!< Time of first access */ + uint32_t state; /*!< buf_page_t::state() */ +#ifdef BTR_CUR_HASH_ADAPT + unsigned hashed:1; /*!< Whether hash index has been + built on this page */ +#endif /* BTR_CUR_HASH_ADAPT */ + unsigned is_old:1; /*!< TRUE if the block is in the old + blocks in buf_pool.LRU_old */ + unsigned freed_page_clock:31; /*!< the value of + buf_pool.freed_page_clock */ + unsigned zip_ssize:PAGE_ZIP_SSIZE_BITS; + /*!< Compressed page size */ + unsigned compressed_only:1; /*!< ROW_FORMAT=COMPRESSED only */ + unsigned page_type:I_S_PAGE_TYPE_BITS; /*!< Page type */ + unsigned num_recs:UNIV_PAGE_SIZE_SHIFT_MAX-2; + /*!< Number of records on Page */ + unsigned data_size:UNIV_PAGE_SIZE_SHIFT_MAX; + /*!< Sum of the sizes of the records */ + lsn_t newest_mod; /*!< Log sequence number of + the youngest modification */ + lsn_t oldest_mod; /*!< Log sequence number of + the oldest modification */ + index_id_t index_id; /*!< Index ID if a index page */ +}; + +/* +Use the following types mapping: + +C type ST_FIELD_INFO::field_type +--------------------------------- +long MYSQL_TYPE_LONGLONG +(field_length=MY_INT64_NUM_DECIMAL_DIGITS) + +long unsigned MYSQL_TYPE_LONGLONG +(field_length=MY_INT64_NUM_DECIMAL_DIGITS, field_flags=MY_I_S_UNSIGNED) + +char* MYSQL_TYPE_STRING +(field_length=n) + +float MYSQL_TYPE_FLOAT +(field_length=0 is ignored) + +void* MYSQL_TYPE_LONGLONG +(field_length=MY_INT64_NUM_DECIMAL_DIGITS, field_flags=MY_I_S_UNSIGNED) + +boolean (if else) MYSQL_TYPE_LONG +(field_length=1) + +time_t MYSQL_TYPE_DATETIME +(field_length=0 ignored) +--------------------------------- +*/ + +/** +Common function to fill any of the dynamic tables: +INFORMATION_SCHEMA.innodb_trx +INFORMATION_SCHEMA.innodb_locks +INFORMATION_SCHEMA.innodb_lock_waits +@retval false if access to the table is blocked +@retval true if something should be filled in */ +static bool trx_i_s_common_fill_table(THD *thd, TABLE_LIST *tables) +{ + DBUG_ENTER("trx_i_s_common_fill_table"); + + /* deny access to non-superusers */ + if (check_global_access(thd, PROCESS_ACL)) + DBUG_RETURN(false); + + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); + + /* update the cache */ + trx_i_s_cache_start_write(trx_i_s_cache); + trx_i_s_possibly_fetch_data_into_cache(trx_i_s_cache); + trx_i_s_cache_end_write(trx_i_s_cache); + + if (trx_i_s_cache_is_truncated(trx_i_s_cache)) + sql_print_warning("InnoDB: Data in %.*s truncated due to memory limit" + " of %u bytes", + int(tables->schema_table_name.length), + tables->schema_table_name.str, + TRX_I_S_MEM_LIMIT); + + DBUG_RETURN(true); +} + +/*******************************************************************//** +Unbind a dynamic INFORMATION_SCHEMA table. +@return 0 on success */ +static +int +i_s_common_deinit( +/*==============*/ + void* p); /*!< in/out: table schema object */ +/*******************************************************************//** +Auxiliary function to store time_t value in MYSQL_TYPE_DATETIME +field. 
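+For example, fill_innodb_trx_from_cache() below stores the transaction start +time as + field_store_time_t(fields[IDX_TRX_STARTED], (time_t) row->trx_started);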
+@return 0 on success */ +static +int +field_store_time_t( +/*===============*/ + Field* field, /*!< in/out: target field for storage */ + time_t time) /*!< in: value to store */ +{ + MYSQL_TIME my_time; + struct tm tm_time; + + if (time) { +#if 0 + /* use this if you are sure that `variables' and `time_zone' + are always initialized */ + thd->variables.time_zone->gmt_sec_to_TIME( + &my_time, (my_time_t) time); +#else + localtime_r(&time, &tm_time); + localtime_to_TIME(&my_time, &tm_time); + my_time.time_type = MYSQL_TIMESTAMP_DATETIME; +#endif + } else { + memset(&my_time, 0, sizeof(my_time)); + } + + /* JAN: TODO: MySQL 5.7 + return(field->store_time(&my_time, MYSQL_TIMESTAMP_DATETIME)); + */ + return(field->store_time(&my_time)); +} + +/*******************************************************************//** +Auxiliary function to store char* value in MYSQL_TYPE_STRING field. +@return 0 on success */ +static +int +field_store_string( +/*===============*/ + Field* field, /*!< in/out: target field for storage */ + const char* str) /*!< in: NUL-terminated utf-8 string, + or NULL */ +{ + if (!str) { + field->set_null(); + return 0; + } + + field->set_notnull(); + return field->store(str, uint(strlen(str)), system_charset_info); +} + +#ifdef BTR_CUR_HASH_ADAPT +# define I_S_AHI 1 /* Include the IS_HASHED column */ +#else +# define I_S_AHI 0 /* Omit the IS_HASHED column */ +#endif + +static const LEX_CSTRING isolation_level_values[] = +{ + { STRING_WITH_LEN("READ UNCOMMITTED") }, + { STRING_WITH_LEN("READ COMMITTED") }, + { STRING_WITH_LEN("REPEATABLE READ") }, + { STRING_WITH_LEN("SERIALIZABLE") } +}; + +static TypelibBuffer<4> isolation_level_values_typelib(isolation_level_values); + +namespace Show { + +/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_trx */ +static ST_FIELD_INFO innodb_trx_fields_info[]= +{ +#define IDX_TRX_ID 0 + Column("trx_id", ULonglong(), NOT_NULL), + +#define IDX_TRX_STATE 1 + Column("trx_state", Varchar(13), NOT_NULL), + +#define IDX_TRX_STARTED 2 + Column("trx_started", Datetime(0), NOT_NULL), + +#define IDX_TRX_REQUESTED_LOCK_ID 3 + Column("trx_requested_lock_id", + Varchar(TRX_I_S_LOCK_ID_MAX_LEN + 1), NULLABLE), + +#define IDX_TRX_WAIT_STARTED 4 + Column("trx_wait_started", Datetime(0), NULLABLE), + +#define IDX_TRX_WEIGHT 5 + Column("trx_weight", ULonglong(), NOT_NULL), + +#define IDX_TRX_MYSQL_THREAD_ID 6 + Column("trx_mysql_thread_id", ULonglong(), NOT_NULL), + +#define IDX_TRX_QUERY 7 + Column("trx_query", Varchar(TRX_I_S_TRX_QUERY_MAX_LEN), NULLABLE), + +#define IDX_TRX_OPERATION_STATE 8 + Column("trx_operation_state", Varchar(64), NULLABLE), + +#define IDX_TRX_TABLES_IN_USE 9 + Column("trx_tables_in_use", ULonglong(), NOT_NULL), + +#define IDX_TRX_TABLES_LOCKED 10 + Column("trx_tables_locked", ULonglong(), NOT_NULL), + +#define IDX_TRX_LOCK_STRUCTS 11 + Column("trx_lock_structs", ULonglong(), NOT_NULL), + +#define IDX_TRX_LOCK_MEMORY_BYTES 12 + Column("trx_lock_memory_bytes", ULonglong(), NOT_NULL), + +#define IDX_TRX_ROWS_LOCKED 13 + Column("trx_rows_locked", ULonglong(), NOT_NULL), + +#define IDX_TRX_ROWS_MODIFIED 14 + Column("trx_rows_modified", ULonglong(), NOT_NULL), + +#define IDX_TRX_CONNCURRENCY_TICKETS 15 + Column("trx_concurrency_tickets", ULonglong(), NOT_NULL), + +#define IDX_TRX_ISOLATION_LEVEL 16 + Column("trx_isolation_level", + Enum(&isolation_level_values_typelib), NOT_NULL), + +#define IDX_TRX_UNIQUE_CHECKS 17 + Column("trx_unique_checks", SLong(1), NOT_NULL), + +#define IDX_TRX_FOREIGN_KEY_CHECKS 18 + 
Column("trx_foreign_key_checks", SLong(1), NOT_NULL), + +#define IDX_TRX_LAST_FOREIGN_KEY_ERROR 19 + Column("trx_last_foreign_key_error", + Varchar(TRX_I_S_TRX_FK_ERROR_MAX_LEN),NULLABLE), + +#define IDX_TRX_READ_ONLY 20 + Column("trx_is_read_only", SLong(1), NOT_NULL), + +#define IDX_TRX_AUTOCOMMIT_NON_LOCKING 21 + Column("trx_autocommit_non_locking", SLong(1), NOT_NULL), + + CEnd() +}; + +} // namespace Show + +/*******************************************************************//** +Read data from cache buffer and fill the INFORMATION_SCHEMA.innodb_trx +table with it. +@retval 0 on success +@retval 1 on failure */ +static int fill_innodb_trx_from_cache(THD *thd, TABLE_LIST *tables, Item*) +{ + ulint rows_num; + char lock_id[TRX_I_S_LOCK_ID_MAX_LEN + 1]; + ulint i; + + DBUG_ENTER("fill_innodb_trx_from_cache"); + + if (!trx_i_s_common_fill_table(thd, tables)) { + DBUG_RETURN(0); + } + + struct cache + { + cache() { trx_i_s_cache_start_read(trx_i_s_cache); } + ~cache() { trx_i_s_cache_end_read(trx_i_s_cache); } + } c; + + Field** fields = tables->table->field; + + rows_num = trx_i_s_cache_get_rows_used(trx_i_s_cache, + I_S_INNODB_TRX); + + for (i = 0; i < rows_num; i++) { + + i_s_trx_row_t* row; + + row = (i_s_trx_row_t*) + trx_i_s_cache_get_nth_row( + trx_i_s_cache, I_S_INNODB_TRX, i); + + /* trx_id */ + OK(fields[IDX_TRX_ID]->store(row->trx_id, true)); + + /* trx_state */ + OK(field_store_string(fields[IDX_TRX_STATE], + row->trx_state)); + + /* trx_started */ + OK(field_store_time_t(fields[IDX_TRX_STARTED], + (time_t) row->trx_started)); + + /* trx_requested_lock_id */ + /* trx_wait_started */ + if (row->trx_wait_started != 0) { + + OK(field_store_string( + fields[IDX_TRX_REQUESTED_LOCK_ID], + trx_i_s_create_lock_id( + row->requested_lock_row, + lock_id, sizeof(lock_id)))); + /* field_store_string() sets it no notnull */ + + OK(field_store_time_t( + fields[IDX_TRX_WAIT_STARTED], + (time_t) row->trx_wait_started)); + fields[IDX_TRX_WAIT_STARTED]->set_notnull(); + } else { + + fields[IDX_TRX_REQUESTED_LOCK_ID]->set_null(); + fields[IDX_TRX_WAIT_STARTED]->set_null(); + } + + /* trx_weight */ + OK(fields[IDX_TRX_WEIGHT]->store(row->trx_weight, true)); + + /* trx_mysql_thread_id */ + OK(fields[IDX_TRX_MYSQL_THREAD_ID]->store( + row->trx_mysql_thread_id, true)); + + /* trx_query */ + if (row->trx_query) { + /* store will do appropriate character set + conversion check */ + fields[IDX_TRX_QUERY]->store( + row->trx_query, + static_cast(strlen(row->trx_query)), + row->trx_query_cs); + fields[IDX_TRX_QUERY]->set_notnull(); + } else { + fields[IDX_TRX_QUERY]->set_null(); + } + + /* trx_operation_state */ + OK(field_store_string(fields[IDX_TRX_OPERATION_STATE], + row->trx_operation_state)); + + /* trx_tables_in_use */ + OK(fields[IDX_TRX_TABLES_IN_USE]->store( + row->trx_tables_in_use, true)); + + /* trx_tables_locked */ + OK(fields[IDX_TRX_TABLES_LOCKED]->store( + row->trx_tables_locked, true)); + + /* trx_lock_structs */ + OK(fields[IDX_TRX_LOCK_STRUCTS]->store( + row->trx_lock_structs, true)); + + /* trx_lock_memory_bytes */ + OK(fields[IDX_TRX_LOCK_MEMORY_BYTES]->store( + row->trx_lock_memory_bytes, true)); + + /* trx_rows_locked */ + OK(fields[IDX_TRX_ROWS_LOCKED]->store( + row->trx_rows_locked, true)); + + /* trx_rows_modified */ + OK(fields[IDX_TRX_ROWS_MODIFIED]->store( + row->trx_rows_modified, true)); + + /* trx_concurrency_tickets */ + OK(fields[IDX_TRX_CONNCURRENCY_TICKETS]->store(0, true)); + + /* trx_isolation_level */ + OK(fields[IDX_TRX_ISOLATION_LEVEL]->store( + 1 + 
row->trx_isolation_level, true)); + + /* trx_unique_checks */ + OK(fields[IDX_TRX_UNIQUE_CHECKS]->store( + row->trx_unique_checks, true)); + + /* trx_foreign_key_checks */ + OK(fields[IDX_TRX_FOREIGN_KEY_CHECKS]->store( + row->trx_foreign_key_checks, true)); + + /* trx_last_foreign_key_error */ + OK(field_store_string(fields[IDX_TRX_LAST_FOREIGN_KEY_ERROR], + row->trx_foreign_key_error)); + + /* trx_is_read_only*/ + OK(fields[IDX_TRX_READ_ONLY]->store( + row->trx_is_read_only, true)); + + /* trx_is_autocommit_non_locking */ + OK(fields[IDX_TRX_AUTOCOMMIT_NON_LOCKING]->store( + row->trx_is_autocommit_non_locking, true)); + + OK(schema_table_store_record(thd, tables->table)); + } + + DBUG_RETURN(0); +} + +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.innodb_trx +@return 0 on success */ +static +int +innodb_trx_init( +/*============*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("innodb_trx_init"); + + schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::innodb_trx_fields_info; + schema->fill_table = fill_innodb_trx_from_cache; + + DBUG_RETURN(0); +} + +static struct st_mysql_information_schema i_s_info = +{ + MYSQL_INFORMATION_SCHEMA_INTERFACE_VERSION +}; + +/** version number reported by SHOW PLUGINS */ +constexpr unsigned i_s_version= MYSQL_VERSION_MAJOR << 8 | MYSQL_VERSION_MINOR; + +struct st_maria_plugin i_s_innodb_trx = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_TRX", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "InnoDB transactions", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + innodb_trx_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + +static const LEX_CSTRING lock_mode_values[] = +{ + { STRING_WITH_LEN("S") }, + { STRING_WITH_LEN("S,GAP") }, + { STRING_WITH_LEN("X") }, + { STRING_WITH_LEN("X,GAP") }, + { STRING_WITH_LEN("IS") }, + { STRING_WITH_LEN("IS,GAP") }, + { STRING_WITH_LEN("IX") }, + { STRING_WITH_LEN("IX,GAP") }, + { STRING_WITH_LEN("AUTO_INC") } +}; + +static TypelibBuffer<9> lock_mode_values_typelib(lock_mode_values); + +static const LEX_CSTRING lock_type_values[] = +{ + { STRING_WITH_LEN("RECORD") }, + { STRING_WITH_LEN("TABLE") } +}; + +static TypelibBuffer<2> lock_type_values_typelib(lock_type_values); + +namespace Show { +/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_locks */ +static ST_FIELD_INFO innodb_locks_fields_info[]= +{ +#define IDX_LOCK_ID 0 + Column("lock_id", Varchar(TRX_I_S_LOCK_ID_MAX_LEN + 1), NOT_NULL), + +#define IDX_LOCK_TRX_ID 1 + Column("lock_trx_id", ULonglong(), NOT_NULL), + +#define IDX_LOCK_MODE 2 + Column("lock_mode", Enum(&lock_mode_values_typelib), NOT_NULL), + +#define IDX_LOCK_TYPE 3 + Column("lock_type", Enum(&lock_type_values_typelib), NOT_NULL), + +#define IDX_LOCK_TABLE 4 + Column("lock_table", Varchar(1024), NOT_NULL), + +#define IDX_LOCK_INDEX 5 + Column("lock_index", Varchar(1024), NULLABLE), + +#define IDX_LOCK_SPACE 6 + 
Column("lock_space", ULong(), NULLABLE), + +#define IDX_LOCK_PAGE 7 + Column("lock_page", ULong(), NULLABLE), + +#define IDX_LOCK_REC 8 + Column("lock_rec", ULong(), NULLABLE), + +#define IDX_LOCK_DATA 9 + Column("lock_data", Varchar(TRX_I_S_LOCK_DATA_MAX_LEN), NULLABLE), + CEnd() +}; +} // namespace Show + +/*******************************************************************//** +Read data from cache buffer and fill the INFORMATION_SCHEMA.innodb_locks +table with it. +@return 0 on success */ +static +int +fill_innodb_locks_from_cache( +/*=========================*/ + THD* thd, /*!< in: MySQL client connection */ + TABLE_LIST* tables, /*!< in/out: fill this table */ + Item*) +{ + ulint rows_num; + char lock_id[TRX_I_S_LOCK_ID_MAX_LEN + 1]; + ulint i; + + DBUG_ENTER("fill_innodb_locks_from_cache"); + + if (!trx_i_s_common_fill_table(thd, tables)) { + DBUG_RETURN(0); + } + + struct cache + { + cache() { trx_i_s_cache_start_read(trx_i_s_cache); } + ~cache() { trx_i_s_cache_end_read(trx_i_s_cache); } + } c; + + Field** fields = tables->table->field; + + rows_num = trx_i_s_cache_get_rows_used(trx_i_s_cache, + I_S_INNODB_LOCKS); + + for (i = 0; i < rows_num; i++) { + + i_s_locks_row_t* row; + char buf[MAX_FULL_NAME_LEN + 1]; + const char* bufend; + + row = (i_s_locks_row_t*) + trx_i_s_cache_get_nth_row( + trx_i_s_cache, I_S_INNODB_LOCKS, i); + + /* lock_id */ + trx_i_s_create_lock_id(row, lock_id, sizeof(lock_id)); + OK(field_store_string(fields[IDX_LOCK_ID], + lock_id)); + + /* lock_trx_id */ + OK(fields[IDX_LOCK_TRX_ID]->store(row->lock_trx_id, true)); + + /* lock_mode */ + OK(fields[IDX_LOCK_MODE]->store(row->lock_mode, true)); + + /* lock_type */ + OK(fields[IDX_LOCK_TYPE]->store( + row->lock_index ? 1 : 2, true)); + + /* lock_table */ + bufend = innobase_convert_name(buf, sizeof(buf), + row->lock_table, + strlen(row->lock_table), + thd); + OK(fields[IDX_LOCK_TABLE]->store( + buf, uint(bufend - buf), system_charset_info)); + + if (row->lock_index) { + /* record lock */ + OK(field_store_string(fields[IDX_LOCK_INDEX], + row->lock_index)); + OK(fields[IDX_LOCK_SPACE]->store( + row->lock_page.space(), true)); + fields[IDX_LOCK_SPACE]->set_notnull(); + OK(fields[IDX_LOCK_PAGE]->store( + row->lock_page.page_no(), true)); + fields[IDX_LOCK_PAGE]->set_notnull(); + OK(fields[IDX_LOCK_REC]->store( + row->lock_rec, true)); + fields[IDX_LOCK_REC]->set_notnull(); + OK(field_store_string(fields[IDX_LOCK_DATA], + row->lock_data)); + } else { + fields[IDX_LOCK_INDEX]->set_null(); + fields[IDX_LOCK_SPACE]->set_null(); + fields[IDX_LOCK_REC]->set_null(); + fields[IDX_LOCK_DATA]->set_null(); + } + + OK(schema_table_store_record(thd, tables->table)); + } + + DBUG_RETURN(0); +} + +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.innodb_locks +@return 0 on success */ +static +int +innodb_locks_init( +/*==============*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("innodb_locks_init"); + + schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::innodb_locks_fields_info; + schema->fill_table = fill_innodb_locks_from_cache; + + DBUG_RETURN(0); +} + +struct st_maria_plugin i_s_innodb_locks = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_LOCKS", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + 
plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "InnoDB conflicting locks", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + innodb_locks_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + + +namespace Show { +/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_lock_waits */ +static ST_FIELD_INFO innodb_lock_waits_fields_info[]= +{ +#define IDX_REQUESTING_TRX_ID 0 + Column("requesting_trx_id", ULonglong(), NOT_NULL), + +#define IDX_REQUESTED_LOCK_ID 1 + Column("requested_lock_id", Varchar(TRX_I_S_LOCK_ID_MAX_LEN + 1), NOT_NULL), + +#define IDX_BLOCKING_TRX_ID 2 + Column("blocking_trx_id", ULonglong(), NOT_NULL), + +#define IDX_BLOCKING_LOCK_ID 3 + Column("blocking_lock_id", Varchar(TRX_I_S_LOCK_ID_MAX_LEN + 1), NOT_NULL), + CEnd() +}; +} // namespace Show + +/*******************************************************************//** +Read data from cache buffer and fill the +INFORMATION_SCHEMA.innodb_lock_waits table with it. +@return 0 on success */ +static +int +fill_innodb_lock_waits_from_cache( +/*==============================*/ + THD* thd, /*!< in: used to call + schema_table_store_record() */ + TABLE_LIST* tables, /*!< in/out: fill this table */ + Item*) +{ + ulint rows_num; + char requested_lock_id[TRX_I_S_LOCK_ID_MAX_LEN + 1]; + char blocking_lock_id[TRX_I_S_LOCK_ID_MAX_LEN + 1]; + ulint i; + + DBUG_ENTER("fill_innodb_lock_waits_from_cache"); + + if (!trx_i_s_common_fill_table(thd, tables)) { + DBUG_RETURN(0); + } + + struct cache + { + cache() { trx_i_s_cache_start_read(trx_i_s_cache); } + ~cache() { trx_i_s_cache_end_read(trx_i_s_cache); } + } c; + + Field** fields = tables->table->field; + + rows_num = trx_i_s_cache_get_rows_used(trx_i_s_cache, + I_S_INNODB_LOCK_WAITS); + + for (i = 0; i < rows_num; i++) { + + i_s_lock_waits_row_t* row; + + row = (i_s_lock_waits_row_t*) + trx_i_s_cache_get_nth_row( + trx_i_s_cache, I_S_INNODB_LOCK_WAITS, i); + + /* requesting_trx_id */ + OK(fields[IDX_REQUESTING_TRX_ID]->store( + row->requested_lock_row->lock_trx_id, true)); + + /* requested_lock_id */ + OK(field_store_string( + fields[IDX_REQUESTED_LOCK_ID], + trx_i_s_create_lock_id( + row->requested_lock_row, + requested_lock_id, + sizeof(requested_lock_id)))); + + /* blocking_trx_id */ + OK(fields[IDX_BLOCKING_TRX_ID]->store( + row->blocking_lock_row->lock_trx_id, true)); + + /* blocking_lock_id */ + OK(field_store_string( + fields[IDX_BLOCKING_LOCK_ID], + trx_i_s_create_lock_id( + row->blocking_lock_row, + blocking_lock_id, + sizeof(blocking_lock_id)))); + + OK(schema_table_store_record(thd, tables->table)); + } + + DBUG_RETURN(0); +} + +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.innodb_lock_waits +@return 0 on success */ +static +int +innodb_lock_waits_init( +/*===================*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("innodb_lock_waits_init"); + + schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::innodb_lock_waits_fields_info; + schema->fill_table = fill_innodb_lock_waits_from_cache; + + DBUG_RETURN(0); +} + +struct st_maria_plugin i_s_innodb_lock_waits = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + 
MYSQL_INFORMATION_SCHEMA_PLUGIN,
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	&i_s_info,
+
+	/* plugin name */
+	/* const char* */
+	"INNODB_LOCK_WAITS",
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	plugin_author,
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	"InnoDB which lock is blocking which",
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	PLUGIN_LICENSE_GPL,
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	innodb_lock_waits_init,
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	i_s_common_deinit,
+
+	i_s_version, nullptr, nullptr, PACKAGE_VERSION,
+	MariaDB_PLUGIN_MATURITY_STABLE
+};
+
+namespace Show {
+/* Fields of the dynamic table information_schema.innodb_cmp. */
+static ST_FIELD_INFO	i_s_cmp_fields_info[] =
+{
+	Column("page_size", SLong(5), NOT_NULL, "Compressed Page Size"),
+	Column("compress_ops", SLong(), NOT_NULL, "Total Number of Compressions"),
+	Column("compress_ops_ok", SLong(), NOT_NULL, "Total Number of"
+	       " Successful Compressions"),
+	Column("compress_time", SLong(), NOT_NULL, "Total Duration of"
+	       " Compressions, in Seconds"),
+	Column("uncompress_ops", SLong(), NOT_NULL, "Total Number of Decompressions"),
+	Column("uncompress_time", SLong(), NOT_NULL, "Total Duration of"
+	       " Decompressions, in Seconds"),
+	CEnd(),
+};
+} // namespace Show
+
+
+/*******************************************************************//**
+Fill the dynamic table information_schema.innodb_cmp or
+innodb_cmp_reset.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_cmp_fill_low(
+/*=============*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		,	/*!< in: condition (ignored) */
+	ibool		reset)	/*!< in: TRUE=reset cumulated counts */
+{
+	TABLE*	table	= (TABLE*) tables->table;
+	int	status	= 0;
+
+	DBUG_ENTER("i_s_cmp_fill_low");
+
+	/* deny access to non-superusers */
+	if (check_global_access(thd, PROCESS_ACL)) {
+
+		DBUG_RETURN(0);
+	}
+
+	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
+
+	for (uint i = 0; i < PAGE_ZIP_SSIZE_MAX; i++) {
+		page_zip_stat_t*	zip_stat = &page_zip_stat[i];
+
+		table->field[0]->store(UNIV_ZIP_SIZE_MIN << i);
+
+		/* The cumulated counts are not protected by any
+		mutex. Thus, some operation in page0zip.cc could
+		increment a counter between the time we read it and
+		clear it. We could introduce mutex protection, but it
+		could cause a measurable performance hit in
+		page0zip.cc. */
+		table->field[1]->store(zip_stat->compressed, true);
+		table->field[2]->store(zip_stat->compressed_ok, true);
+		table->field[3]->store(zip_stat->compressed_usec / 1000000,
+				       true);
+		table->field[4]->store(zip_stat->decompressed, true);
+		table->field[5]->store(zip_stat->decompressed_usec / 1000000,
+				       true);
+
+		if (reset) {
+			new (zip_stat) page_zip_stat_t();
+		}
+
+		if (schema_table_store_record(thd, table)) {
+			status = 1;
+			break;
+		}
+	}
+
+	DBUG_RETURN(status);
+}
+
+/*******************************************************************//**
+Fill the dynamic table information_schema.innodb_cmp.
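+A typical query against this table looks like (illustrative only):
+  SELECT page_size, compress_ops, compress_ops_ok
+  FROM information_schema.innodb_cmp;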
+@return 0 on success, 1 on failure */ +static +int +i_s_cmp_fill( +/*=========*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* cond) /*!< in: condition (ignored) */ +{ + return(i_s_cmp_fill_low(thd, tables, cond, FALSE)); +} + +/*******************************************************************//** +Fill the dynamic table information_schema.innodb_cmp_reset. +@return 0 on success, 1 on failure */ +static +int +i_s_cmp_reset_fill( +/*===============*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* cond) /*!< in: condition (ignored) */ +{ + return(i_s_cmp_fill_low(thd, tables, cond, TRUE)); +} + +/*******************************************************************//** +Bind the dynamic table information_schema.innodb_cmp. +@return 0 on success */ +static +int +i_s_cmp_init( +/*=========*/ + void* p) /*!< in/out: table schema object */ +{ + DBUG_ENTER("i_s_cmp_init"); + ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::i_s_cmp_fields_info; + schema->fill_table = i_s_cmp_fill; + + DBUG_RETURN(0); +} + +/*******************************************************************//** +Bind the dynamic table information_schema.innodb_cmp_reset. +@return 0 on success */ +static +int +i_s_cmp_reset_init( +/*===============*/ + void* p) /*!< in/out: table schema object */ +{ + DBUG_ENTER("i_s_cmp_reset_init"); + ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::i_s_cmp_fields_info; + schema->fill_table = i_s_cmp_reset_fill; + + DBUG_RETURN(0); +} + +struct st_maria_plugin i_s_innodb_cmp = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_CMP", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "Statistics for the InnoDB compression", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + i_s_cmp_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + +struct st_maria_plugin i_s_innodb_cmp_reset = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_CMP_RESET", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "Statistics for the InnoDB compression;" + " reset cumulated counts", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + i_s_cmp_reset_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + + +namespace Show { +/* Fields of the dynamic tables +information_schema.innodb_cmp_per_index and +information_schema.innodb_cmp_per_index_reset. 
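+(These per-index statistics are only collected while the
+innodb_cmp_per_index_enabled system variable is ON.)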
*/ +static ST_FIELD_INFO i_s_cmp_per_index_fields_info[]= +{ +#define IDX_DATABASE_NAME 0 + Column("database_name", Varchar(NAME_CHAR_LEN), NOT_NULL), + +#define IDX_TABLE_NAME 1 /* FIXME: this is in my_charset_filename! */ + Column("table_name", Varchar(NAME_CHAR_LEN), NOT_NULL), + +#define IDX_INDEX_NAME 2 + Column("index_name", Varchar(NAME_CHAR_LEN), NOT_NULL), + +#define IDX_COMPRESS_OPS 3 + Column("compress_ops", SLong(), NOT_NULL), + +#define IDX_COMPRESS_OPS_OK 4 + Column("compress_ops_ok", SLong(), NOT_NULL), + +#define IDX_COMPRESS_TIME 5 + Column("compress_time", SLong(), NOT_NULL), + +#define IDX_UNCOMPRESS_OPS 6 + Column("uncompress_ops", SLong(), NOT_NULL), + +#define IDX_UNCOMPRESS_TIME 7 + Column("uncompress_time", SLong(), NOT_NULL), + + CEnd() +}; + +} // namespace Show + +/*******************************************************************//** +Fill the dynamic table +information_schema.innodb_cmp_per_index or +information_schema.innodb_cmp_per_index_reset. +@return 0 on success, 1 on failure */ +static +int +i_s_cmp_per_index_fill_low( +/*=======================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* , /*!< in: condition (ignored) */ + ibool reset) /*!< in: TRUE=reset cumulated counts */ +{ + TABLE* table = tables->table; + Field** fields = table->field; + int status = 0; + + DBUG_ENTER("i_s_cmp_per_index_fill_low"); + + /* deny access to non-superusers */ + if (check_global_access(thd, PROCESS_ACL)) { + + DBUG_RETURN(0); + } + + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); + + /* Create a snapshot of the stats so we do not bump into lock + order violations with dict_sys.latch below. */ + mysql_mutex_lock(&page_zip_stat_per_index_mutex); + page_zip_stat_per_index_t snap (page_zip_stat_per_index); + mysql_mutex_unlock(&page_zip_stat_per_index_mutex); + + dict_sys.freeze(SRW_LOCK_CALL); + + page_zip_stat_per_index_t::iterator iter; + ulint i; + + for (iter = snap.begin(), i = 0; iter != snap.end(); iter++, i++) { + + if (dict_index_t* index + = dict_index_get_if_in_cache_low(iter->first)) { + char db_utf8[MAX_DB_UTF8_LEN]; + char table_utf8[MAX_TABLE_UTF8_LEN]; + + dict_fs2utf8(index->table->name.m_name, + db_utf8, sizeof(db_utf8), + table_utf8, sizeof(table_utf8)); + + status = field_store_string(fields[IDX_DATABASE_NAME], + db_utf8) + || field_store_string(fields[IDX_TABLE_NAME], + table_utf8) + || field_store_string(fields[IDX_INDEX_NAME], + index->name); + } else { + /* index not found */ + char name[MY_INT64_NUM_DECIMAL_DIGITS + + sizeof "index_id: "]; + fields[IDX_DATABASE_NAME]->set_null(); + fields[IDX_TABLE_NAME]->set_null(); + fields[IDX_INDEX_NAME]->set_notnull(); + status = fields[IDX_INDEX_NAME]->store( + name, + uint(snprintf(name, sizeof name, + "index_id: " IB_ID_FMT, + iter->first)), + system_charset_info); + } + + if (status + || fields[IDX_COMPRESS_OPS]->store( + iter->second.compressed, true) + || fields[IDX_COMPRESS_OPS_OK]->store( + iter->second.compressed_ok, true) + || fields[IDX_COMPRESS_TIME]->store( + iter->second.compressed_usec / 1000000, true) + || fields[IDX_UNCOMPRESS_OPS]->store( + iter->second.decompressed, true) + || fields[IDX_UNCOMPRESS_TIME]->store( + iter->second.decompressed_usec / 1000000, true) + || schema_table_store_record(thd, table)) { + status = 1; + break; + } + /* Release and reacquire the dict_sys.latch to allow other + threads to proceed. 
This could eventually result in the + contents of INFORMATION_SCHEMA.innodb_cmp_per_index being + inconsistent, but it is an acceptable compromise. */ + if (i == 1000) { + dict_sys.unfreeze(); + i = 0; + dict_sys.freeze(SRW_LOCK_CALL); + } + } + + dict_sys.unfreeze(); + + if (reset) { + page_zip_reset_stat_per_index(); + } + + DBUG_RETURN(status); +} + +/*******************************************************************//** +Fill the dynamic table information_schema.innodb_cmp_per_index. +@return 0 on success, 1 on failure */ +static +int +i_s_cmp_per_index_fill( +/*===================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* cond) /*!< in: condition (ignored) */ +{ + return(i_s_cmp_per_index_fill_low(thd, tables, cond, FALSE)); +} + +/*******************************************************************//** +Fill the dynamic table information_schema.innodb_cmp_per_index_reset. +@return 0 on success, 1 on failure */ +static +int +i_s_cmp_per_index_reset_fill( +/*=========================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* cond) /*!< in: condition (ignored) */ +{ + return(i_s_cmp_per_index_fill_low(thd, tables, cond, TRUE)); +} + +/*******************************************************************//** +Bind the dynamic table information_schema.innodb_cmp_per_index. +@return 0 on success */ +static +int +i_s_cmp_per_index_init( +/*===================*/ + void* p) /*!< in/out: table schema object */ +{ + DBUG_ENTER("i_s_cmp_init"); + ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::i_s_cmp_per_index_fields_info; + schema->fill_table = i_s_cmp_per_index_fill; + + DBUG_RETURN(0); +} + +/*******************************************************************//** +Bind the dynamic table information_schema.innodb_cmp_per_index_reset. 
+@return 0 on success */ +static +int +i_s_cmp_per_index_reset_init( +/*=========================*/ + void* p) /*!< in/out: table schema object */ +{ + DBUG_ENTER("i_s_cmp_reset_init"); + ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::i_s_cmp_per_index_fields_info; + schema->fill_table = i_s_cmp_per_index_reset_fill; + + DBUG_RETURN(0); +} + +struct st_maria_plugin i_s_innodb_cmp_per_index = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_CMP_PER_INDEX", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "Statistics for the InnoDB compression (per index)", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + i_s_cmp_per_index_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + +struct st_maria_plugin i_s_innodb_cmp_per_index_reset = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_CMP_PER_INDEX_RESET", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "Statistics for the InnoDB compression (per index);" + " reset cumulated counts", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + i_s_cmp_per_index_reset_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + + +namespace Show { +/* Fields of the dynamic table information_schema.innodb_cmpmem. */ +static ST_FIELD_INFO i_s_cmpmem_fields_info[] = +{ + Column("page_size", SLong(5), NOT_NULL, "Buddy Block Size"), + Column("buffer_pool_instance", SLong(), NOT_NULL, "Buffer Pool Id"), + Column("pages_used", SLong(), NOT_NULL, "Currently in Use"), + Column("pages_free", SLong(), NOT_NULL, "Currently Available"), + Column("relocation_ops", SLonglong(), NOT_NULL, "Total Number of Relocations"), + Column("relocation_time", SLong(), NOT_NULL, "Total Duration of Relocations," + " in Seconds"), + CEnd() +}; +} // namespace Show + +/*******************************************************************//** +Fill the dynamic table information_schema.innodb_cmpmem or +innodb_cmpmem_reset. 
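+A typical query against this table looks like (illustrative only):
+  SELECT page_size, pages_used, pages_free
+  FROM information_schema.innodb_cmpmem;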
+@return 0 on success, 1 on failure */ +static +int +i_s_cmpmem_fill_low( +/*================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* , /*!< in: condition (ignored) */ + ibool reset) /*!< in: TRUE=reset cumulated counts */ +{ + TABLE* table = (TABLE*) tables->table; + + DBUG_ENTER("i_s_cmpmem_fill_low"); + + /* deny access to non-superusers */ + if (check_global_access(thd, PROCESS_ACL)) { + + DBUG_RETURN(0); + } + + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); + + ulint zip_free_len_local[BUF_BUDDY_SIZES_MAX + 1]; + buf_buddy_stat_t buddy_stat_local[BUF_BUDDY_SIZES_MAX + 1]; + + /* Save buddy stats for buffer pool in local variables. */ + mysql_mutex_lock(&buf_pool.mutex); + + for (uint x = 0; x <= BUF_BUDDY_SIZES; x++) { + zip_free_len_local[x] = (x < BUF_BUDDY_SIZES) ? + UT_LIST_GET_LEN(buf_pool.zip_free[x]) : 0; + + buddy_stat_local[x] = buf_pool.buddy_stat[x]; + + if (reset) { + /* This is protected by buf_pool.mutex. */ + buf_pool.buddy_stat[x].relocated = 0; + buf_pool.buddy_stat[x].relocated_usec = 0; + } + } + + mysql_mutex_unlock(&buf_pool.mutex); + + for (uint x = 0; x <= BUF_BUDDY_SIZES; x++) { + buf_buddy_stat_t* buddy_stat = &buddy_stat_local[x]; + + Field **field = table->field; + + (*field++)->store(BUF_BUDDY_LOW << x); + (*field++)->store(0, true); + (*field++)->store(buddy_stat->used, true); + (*field++)->store(zip_free_len_local[x], true); + (*field++)->store(buddy_stat->relocated, true); + (*field)->store(buddy_stat->relocated_usec / 1000000, true); + + if (schema_table_store_record(thd, table)) { + DBUG_RETURN(1); + } + } + + DBUG_RETURN(0); +} + +/*******************************************************************//** +Fill the dynamic table information_schema.innodb_cmpmem. +@return 0 on success, 1 on failure */ +static +int +i_s_cmpmem_fill( +/*============*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* cond) /*!< in: condition (ignored) */ +{ + return(i_s_cmpmem_fill_low(thd, tables, cond, FALSE)); +} + +/*******************************************************************//** +Fill the dynamic table information_schema.innodb_cmpmem_reset. +@return 0 on success, 1 on failure */ +static +int +i_s_cmpmem_reset_fill( +/*==================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* cond) /*!< in: condition (ignored) */ +{ + return(i_s_cmpmem_fill_low(thd, tables, cond, TRUE)); +} + +/*******************************************************************//** +Bind the dynamic table information_schema.innodb_cmpmem. +@return 0 on success */ +static +int +i_s_cmpmem_init( +/*============*/ + void* p) /*!< in/out: table schema object */ +{ + DBUG_ENTER("i_s_cmpmem_init"); + ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::i_s_cmpmem_fields_info; + schema->fill_table = i_s_cmpmem_fill; + + DBUG_RETURN(0); +} + +/*******************************************************************//** +Bind the dynamic table information_schema.innodb_cmpmem_reset. 
+@return 0 on success */ +static +int +i_s_cmpmem_reset_init( +/*==================*/ + void* p) /*!< in/out: table schema object */ +{ + DBUG_ENTER("i_s_cmpmem_reset_init"); + ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::i_s_cmpmem_fields_info; + schema->fill_table = i_s_cmpmem_reset_fill; + + DBUG_RETURN(0); +} + +struct st_maria_plugin i_s_innodb_cmpmem = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_CMPMEM", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "Statistics for the InnoDB compressed buffer pool", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + i_s_cmpmem_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + +struct st_maria_plugin i_s_innodb_cmpmem_reset = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_CMPMEM_RESET", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "Statistics for the InnoDB compressed buffer pool;" + " reset cumulated counts", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + i_s_cmpmem_reset_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + + +static const LEX_CSTRING metric_type_values[] = +{ + { STRING_WITH_LEN("value") }, + { STRING_WITH_LEN("status_counter") }, + { STRING_WITH_LEN("set_owner") }, + { STRING_WITH_LEN("set_member") }, + { STRING_WITH_LEN("counter") } +}; + +static TypelibBuffer<5> metric_type_values_typelib(metric_type_values); + +namespace Show { +/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_metrics */ +static ST_FIELD_INFO innodb_metrics_fields_info[]= +{ +#define METRIC_NAME 0 + Column("NAME", Varchar(NAME_LEN + 1), NOT_NULL), + +#define METRIC_SUBSYS 1 + Column("SUBSYSTEM", Varchar(NAME_LEN + 1), NOT_NULL), + +#define METRIC_VALUE_START 2 + Column("COUNT", SLonglong(), NOT_NULL), + +#define METRIC_MAX_VALUE_START 3 + Column("MAX_COUNT", SLonglong(), NULLABLE), + +#define METRIC_MIN_VALUE_START 4 + Column("MIN_COUNT", SLonglong(), NULLABLE), + +#define METRIC_AVG_VALUE_START 5 + Column("AVG_COUNT", Float(MAX_FLOAT_STR_LENGTH), NULLABLE), + +#define METRIC_VALUE_RESET 6 + Column("COUNT_RESET", SLonglong(), NOT_NULL), + +#define METRIC_MAX_VALUE_RESET 7 + Column("MAX_COUNT_RESET", SLonglong(), NULLABLE), + +#define METRIC_MIN_VALUE_RESET 8 + Column("MIN_COUNT_RESET", SLonglong(), NULLABLE), + +#define METRIC_AVG_VALUE_RESET 9 + Column("AVG_COUNT_RESET", Float(MAX_FLOAT_STR_LENGTH), NULLABLE), + +#define METRIC_START_TIME 10 + Column("TIME_ENABLED", Datetime(0), NULLABLE), + +#define METRIC_STOP_TIME 
11 + Column("TIME_DISABLED", Datetime(0), NULLABLE), + +#define METRIC_TIME_ELAPSED 12 + Column("TIME_ELAPSED", SLonglong(), NULLABLE), + +#define METRIC_RESET_TIME 13 + Column("TIME_RESET", Datetime(0), NULLABLE), + +#define METRIC_STATUS 14 + Column("ENABLED", SLong(1), NOT_NULL), + +#define METRIC_TYPE 15 + Column("TYPE", Enum(&metric_type_values_typelib), NOT_NULL), + +#define METRIC_DESC 16 + Column("COMMENT", Varchar(NAME_LEN + 1), NOT_NULL), + CEnd() +}; +} // namespace Show + +/**********************************************************************//** +Fill the information schema metrics table. +@return 0 on success */ +static +int +i_s_metrics_fill( +/*=============*/ + THD* thd, /*!< in: thread */ + TABLE* table_to_fill) /*!< in/out: fill this table */ +{ + int count; + Field** fields; + double time_diff = 0; + monitor_info_t* monitor_info; + mon_type_t min_val; + mon_type_t max_val; + + DBUG_ENTER("i_s_metrics_fill"); + fields = table_to_fill->field; + + for (count = 0; count < NUM_MONITOR; count++) { + monitor_info = srv_mon_get_info((monitor_id_t) count); + + /* A good place to sanity check the Monitor ID */ + ut_a(count == monitor_info->monitor_id); + + /* If the item refers to a Module, nothing to fill, + continue. */ + if ((monitor_info->monitor_type & MONITOR_MODULE) + || (monitor_info->monitor_type & MONITOR_HIDDEN)) { + continue; + } + + /* If this is an existing "status variable", and + its corresponding counter is still on, we need + to calculate the result from its corresponding + counter. */ + if (monitor_info->monitor_type & MONITOR_EXISTING + && MONITOR_IS_ON(count)) { + srv_mon_process_existing_counter((monitor_id_t) count, + MONITOR_GET_VALUE); + } + + /* Fill in counter's basic information */ + OK(field_store_string(fields[METRIC_NAME], + monitor_info->monitor_name)); + + OK(field_store_string(fields[METRIC_SUBSYS], + monitor_info->monitor_module)); + + OK(field_store_string(fields[METRIC_DESC], + monitor_info->monitor_desc)); + + /* Fill in counter values */ + OK(fields[METRIC_VALUE_RESET]->store( + MONITOR_VALUE(count), FALSE)); + + OK(fields[METRIC_VALUE_START]->store( + MONITOR_VALUE_SINCE_START(count), FALSE)); + + /* If the max value is MAX_RESERVED, counter max + value has not been updated. Set the column value + to NULL. */ + if (MONITOR_MAX_VALUE(count) == MAX_RESERVED + || MONITOR_MAX_MIN_NOT_INIT(count)) { + fields[METRIC_MAX_VALUE_RESET]->set_null(); + } else { + OK(fields[METRIC_MAX_VALUE_RESET]->store( + MONITOR_MAX_VALUE(count), FALSE)); + fields[METRIC_MAX_VALUE_RESET]->set_notnull(); + } + + /* If the min value is MAX_RESERVED, counter min + value has not been updated. Set the column value + to NULL. 
*/
+		if (MONITOR_MIN_VALUE(count) == MIN_RESERVED
+		    || MONITOR_MAX_MIN_NOT_INIT(count)) {
+			fields[METRIC_MIN_VALUE_RESET]->set_null();
+		} else {
+			OK(fields[METRIC_MIN_VALUE_RESET]->store(
+				   MONITOR_MIN_VALUE(count), FALSE));
+			fields[METRIC_MIN_VALUE_RESET]->set_notnull();
+		}
+
+		/* Calculate the max value since counter started */
+		max_val = srv_mon_calc_max_since_start((monitor_id_t) count);
+
+		if (max_val == MAX_RESERVED
+		    || MONITOR_MAX_MIN_NOT_INIT(count)) {
+			fields[METRIC_MAX_VALUE_START]->set_null();
+		} else {
+			OK(fields[METRIC_MAX_VALUE_START]->store(
+				   max_val, FALSE));
+			fields[METRIC_MAX_VALUE_START]->set_notnull();
+		}
+
+		/* Calculate the min value since counter started */
+		min_val = srv_mon_calc_min_since_start((monitor_id_t) count);
+
+		if (min_val == MIN_RESERVED
+		    || MONITOR_MAX_MIN_NOT_INIT(count)) {
+			fields[METRIC_MIN_VALUE_START]->set_null();
+		} else {
+			OK(fields[METRIC_MIN_VALUE_START]->store(
+				   min_val, FALSE));
+
+			fields[METRIC_MIN_VALUE_START]->set_notnull();
+		}
+
+		/* If the monitor has ever been enabled (whether or not
+		it is currently disabled), fill the METRIC_START_TIME
+		and METRIC_TIME_ELAPSED fields */
+		if (MONITOR_FIELD(count, mon_start_time)) {
+			OK(field_store_time_t(fields[METRIC_START_TIME],
+				(time_t)MONITOR_FIELD(count, mon_start_time)));
+			fields[METRIC_START_TIME]->set_notnull();
+
+			/* If the monitor is enabled, TIME_ELAPSED is the
+			time difference between the current time and the
+			time when the monitor was enabled. Otherwise, it
+			is the time difference between the time when the
+			monitor was enabled and the time when it was
+			disabled */
+			if (MONITOR_IS_ON(count)) {
+				time_diff = difftime(time(NULL),
+					MONITOR_FIELD(count, mon_start_time));
+			} else {
+				time_diff = difftime(
+					MONITOR_FIELD(count, mon_stop_time),
+					MONITOR_FIELD(count, mon_start_time));
+			}
+
+			OK(fields[METRIC_TIME_ELAPSED]->store(
+				   time_diff));
+			fields[METRIC_TIME_ELAPSED]->set_notnull();
+		} else {
+			fields[METRIC_START_TIME]->set_null();
+			fields[METRIC_TIME_ELAPSED]->set_null();
+			time_diff = 0;
+		}
+
+		/* Unless MONITOR_NO_AVERAGE is set, we must
+		calculate the average value. If this is a monitor set
+		owner marked by MONITOR_SET_OWNER, divide
+		the value by another counter (the number of calls)
+		designated by monitor_info->monitor_related_id.
+		Otherwise, average the counter value over the time
+		between when the counter was enabled and when it was
+		disabled or last sampled.
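+		Informally: for a set owner, AVG_COUNT = COUNT / the
+		related counter's value; otherwise AVG_COUNT = COUNT /
+		seconds since the monitor was enabled, and
+		AVG_COUNT_RESET uses the seconds since the last reset
+		instead.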
+		*/
+		if ((monitor_info->monitor_type
+		     & (MONITOR_NO_AVERAGE | MONITOR_SET_OWNER))
+		    == MONITOR_SET_OWNER
+		    && monitor_info->monitor_related_id) {
+			mon_type_t value_start
+				= MONITOR_VALUE_SINCE_START(
+					monitor_info->monitor_related_id);
+
+			if (value_start) {
+				OK(fields[METRIC_AVG_VALUE_START]->store(
+					   MONITOR_VALUE_SINCE_START(count)
+					   / value_start, FALSE));
+
+				fields[METRIC_AVG_VALUE_START]->set_notnull();
+			} else {
+				fields[METRIC_AVG_VALUE_START]->set_null();
+			}
+
+			if (mon_type_t related_value =
+			    MONITOR_VALUE(monitor_info->monitor_related_id)) {
+				OK(fields[METRIC_AVG_VALUE_RESET]
+				   ->store(MONITOR_VALUE(count)
+					   / related_value, false));
+				fields[METRIC_AVG_VALUE_RESET]->set_notnull();
+			} else {
+				fields[METRIC_AVG_VALUE_RESET]->set_null();
+			}
+		} else if (!(monitor_info->monitor_type
+			     & (MONITOR_NO_AVERAGE
+				| MONITOR_DISPLAY_CURRENT))) {
+			if (time_diff != 0) {
+				OK(fields[METRIC_AVG_VALUE_START]->store(
+					(double) MONITOR_VALUE_SINCE_START(
+						count) / time_diff));
+				fields[METRIC_AVG_VALUE_START]->set_notnull();
+			} else {
+				fields[METRIC_AVG_VALUE_START]->set_null();
+			}
+
+			if (MONITOR_FIELD(count, mon_reset_time)) {
+				/* calculate the time difference since the
+				last reset */
+				if (MONITOR_IS_ON(count)) {
+					time_diff = difftime(
+						time(NULL), MONITOR_FIELD(
+							count, mon_reset_time));
+				} else {
+					time_diff = difftime(
+					MONITOR_FIELD(count, mon_stop_time),
+					MONITOR_FIELD(count, mon_reset_time));
+				}
+			} else {
+				time_diff = 0;
+			}
+
+			if (time_diff != 0) {
+				OK(fields[METRIC_AVG_VALUE_RESET]->store(
+					static_cast<double>(
+						MONITOR_VALUE(count))
+					/ time_diff));
+				fields[METRIC_AVG_VALUE_RESET]->set_notnull();
+			} else {
+				fields[METRIC_AVG_VALUE_RESET]->set_null();
+			}
+		} else {
+			fields[METRIC_AVG_VALUE_START]->set_null();
+			fields[METRIC_AVG_VALUE_RESET]->set_null();
+		}
+
+		if (MONITOR_IS_ON(count)) {
+			/* If the monitor is on, the stop time is set
+			to NULL */
+			fields[METRIC_STOP_TIME]->set_null();
+
+			/* Display the latest Monitor Reset Time only if
+			the Monitor counter is on. */
+			if (MONITOR_FIELD(count, mon_reset_time)) {
+				OK(field_store_time_t(
+					   fields[METRIC_RESET_TIME],
+					   (time_t)MONITOR_FIELD(
+						   count, mon_reset_time)));
+				fields[METRIC_RESET_TIME]->set_notnull();
+			} else {
+				fields[METRIC_RESET_TIME]->set_null();
+			}
+
+			OK(fields[METRIC_STATUS]->store(1, true));
+		} else {
+			if (MONITOR_FIELD(count, mon_stop_time)) {
+				OK(field_store_time_t(fields[METRIC_STOP_TIME],
+				(time_t)MONITOR_FIELD(count, mon_stop_time)));
+				fields[METRIC_STOP_TIME]->set_notnull();
+			} else {
+				fields[METRIC_STOP_TIME]->set_null();
+			}
+
+			fields[METRIC_RESET_TIME]->set_null();
+
+			OK(fields[METRIC_STATUS]->store(0, true));
+		}
+
+		uint metric_type;
+
+		if (monitor_info->monitor_type & MONITOR_DISPLAY_CURRENT) {
+			metric_type = 1; /* "value" */
+		} else if (monitor_info->monitor_type & MONITOR_EXISTING) {
+			metric_type = 2; /* "status_counter" */
+		} else if (monitor_info->monitor_type & MONITOR_SET_OWNER) {
+			metric_type = 3; /* "set_owner" */
+		} else if (monitor_info->monitor_type & MONITOR_SET_MEMBER) {
+			metric_type = 4; /* "set_member" */
+		} else {
+			metric_type = 5; /* "counter" */
+		}
+
+		OK(fields[METRIC_TYPE]->store(metric_type, true));
+
+		OK(schema_table_store_record(thd, table_to_fill));
+	}
+
+	DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Function to fill the information schema metrics tables.
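+Example (illustrative only):
+  SELECT name, subsystem, count
+  FROM information_schema.innodb_metrics WHERE enabled = 1;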
+@return 0 on success */ +static +int +i_s_metrics_fill_table( +/*===================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* ) /*!< in: condition (not used) */ +{ + DBUG_ENTER("i_s_metrics_fill_table"); + + /* deny access to non-superusers */ + if (check_global_access(thd, PROCESS_ACL)) { + DBUG_RETURN(0); + } + + i_s_metrics_fill(thd, tables->table); + + DBUG_RETURN(0); +} +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.innodb_metrics +@return 0 on success */ +static +int +innodb_metrics_init( +/*================*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("innodb_metrics_init"); + + schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::innodb_metrics_fields_info; + schema->fill_table = i_s_metrics_fill_table; + + DBUG_RETURN(0); +} + +struct st_maria_plugin i_s_innodb_metrics = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_METRICS", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "InnoDB Metrics Info", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + innodb_metrics_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + +namespace Show { +/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_ft_default_stopword */ +static ST_FIELD_INFO i_s_stopword_fields_info[]= +{ +#define STOPWORD_VALUE 0 + Column("value", Varchar(TRX_ID_MAX_LEN + 1), NOT_NULL), + CEnd() +}; +} // namespace Show + +/*******************************************************************//** +Fill the dynamic table information_schema.innodb_ft_default_stopword. +@return 0 on success, 1 on failure */ +static +int +i_s_stopword_fill( +/*==============*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* ) /*!< in: condition (not used) */ +{ + Field** fields; + ulint i = 0; + TABLE* table = (TABLE*) tables->table; + + DBUG_ENTER("i_s_stopword_fill"); + + fields = table->field; + + /* Fill with server default stopword list in array + fts_default_stopword */ + while (fts_default_stopword[i]) { + OK(field_store_string(fields[STOPWORD_VALUE], + fts_default_stopword[i])); + + OK(schema_table_store_record(thd, table)); + i++; + } + + DBUG_RETURN(0); +} + +/*******************************************************************//** +Bind the dynamic table information_schema.innodb_ft_default_stopword. 
+@return 0 on success */
+static
+int
+i_s_stopword_init(
+/*==============*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	DBUG_ENTER("i_s_stopword_init");
+	ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+	schema->fields_info = Show::i_s_stopword_fields_info;
+	schema->fill_table = i_s_stopword_fill;
+
+	DBUG_RETURN(0);
+}
+
+struct st_maria_plugin	i_s_innodb_ft_default_stopword =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	MYSQL_INFORMATION_SCHEMA_PLUGIN,
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	&i_s_info,
+
+	/* plugin name */
+	/* const char* */
+	"INNODB_FT_DEFAULT_STOPWORD",
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	plugin_author,
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	"Default stopword list for InnoDB Full Text Search",
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	PLUGIN_LICENSE_GPL,
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	i_s_stopword_init,
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	i_s_common_deinit,
+
+	i_s_version, nullptr, nullptr, PACKAGE_VERSION,
+	MariaDB_PLUGIN_MATURITY_STABLE
+};
+
+namespace Show {
+/* Fields of the dynamic tables INFORMATION_SCHEMA.INNODB_FT_DELETED and
+INFORMATION_SCHEMA.INNODB_FT_BEING_DELETED */
+static ST_FIELD_INFO	i_s_fts_doc_fields_info[]=
+{
+#define	I_S_FTS_DOC_ID			0
+	Column("DOC_ID", ULonglong(), NOT_NULL),
+	CEnd()
+};
+} // namespace Show
+
+/*******************************************************************//**
+Fill the dynamic table INFORMATION_SCHEMA.INNODB_FT_DELETED or
+INFORMATION_SCHEMA.INNODB_FT_BEING_DELETED
+@return 0 on success, 1 on failure */
+static
+int
+i_s_fts_deleted_generic_fill(
+/*=========================*/
+	THD*		thd,		/*!< in: thread */
+	TABLE_LIST*	tables,		/*!< in/out: tables to fill */
+	ibool		being_deleted)	/*!< in: BEING_DELETED table */
+{
+	Field**		fields;
+	TABLE*		table = (TABLE*) tables->table;
+	trx_t*		trx;
+	fts_table_t	fts_table;
+	fts_doc_ids_t*	deleted;
+	dict_table_t*	user_table;
+
+	DBUG_ENTER("i_s_fts_deleted_generic_fill");
+
+	/* deny access to non-superusers */
+	if (check_global_access(thd, PROCESS_ACL)) {
+		DBUG_RETURN(0);
+	}
+
+	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
+
+	MDL_ticket* mdl_ticket = nullptr;
+	user_table = dict_table_open_on_id(
+		innodb_ft_aux_table_id, false, DICT_TABLE_OP_NORMAL,
+		thd, &mdl_ticket);
+
+	if (!user_table) {
+		DBUG_RETURN(0);
+	} else if (!dict_table_has_fts_index(user_table)
+		   || !user_table->is_readable()) {
+		dict_table_close(user_table, false, thd, mdl_ticket);
+		DBUG_RETURN(0);
+	}
+
+	deleted = fts_doc_ids_create();
+
+	trx = trx_create();
+	trx->op_info = "Select for FTS DELETE TABLE";
+
+	FTS_INIT_FTS_TABLE(&fts_table,
+			   (being_deleted) ?
"BEING_DELETED" : "DELETED", + FTS_COMMON_TABLE, user_table); + + fts_table_fetch_doc_ids(trx, &fts_table, deleted); + + dict_table_close(user_table, false, thd, mdl_ticket); + + trx->free(); + + fields = table->field; + + int ret = 0; + + for (ulint j = 0; j < ib_vector_size(deleted->doc_ids); ++j) { + doc_id_t doc_id; + + doc_id = *(doc_id_t*) ib_vector_get_const(deleted->doc_ids, j); + + BREAK_IF(ret = fields[I_S_FTS_DOC_ID]->store(doc_id, true)); + + BREAK_IF(ret = schema_table_store_record(thd, table)); + } + + fts_doc_ids_free(deleted); + + DBUG_RETURN(ret); +} + +/*******************************************************************//** +Fill the dynamic table INFORMATION_SCHEMA.INNODB_FT_DELETED +@return 0 on success, 1 on failure */ +static +int +i_s_fts_deleted_fill( +/*=================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* ) /*!< in: condition (ignored) */ +{ + DBUG_ENTER("i_s_fts_deleted_fill"); + + DBUG_RETURN(i_s_fts_deleted_generic_fill(thd, tables, FALSE)); +} + +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.INNODB_FT_DELETED +@return 0 on success */ +static +int +i_s_fts_deleted_init( +/*=================*/ + void* p) /*!< in/out: table schema object */ +{ + DBUG_ENTER("i_s_fts_deleted_init"); + ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::i_s_fts_doc_fields_info; + schema->fill_table = i_s_fts_deleted_fill; + + DBUG_RETURN(0); +} + +struct st_maria_plugin i_s_innodb_ft_deleted = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_FT_DELETED", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "INNODB AUXILIARY FTS DELETED TABLE", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + i_s_fts_deleted_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + +/*******************************************************************//** +Fill the dynamic table INFORMATION_SCHEMA.INNODB_FT_BEING_DELETED +@return 0 on success, 1 on failure */ +static +int +i_s_fts_being_deleted_fill( +/*=======================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* ) /*!< in: condition (ignored) */ +{ + DBUG_ENTER("i_s_fts_being_deleted_fill"); + + DBUG_RETURN(i_s_fts_deleted_generic_fill(thd, tables, TRUE)); +} + +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.INNODB_FT_BEING_DELETED +@return 0 on success */ +static +int +i_s_fts_being_deleted_init( +/*=======================*/ + void* p) /*!< in/out: table schema object */ +{ + DBUG_ENTER("i_s_fts_deleted_init"); + ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::i_s_fts_doc_fields_info; + schema->fill_table = i_s_fts_being_deleted_fill; + + DBUG_RETURN(0); +} + +struct st_maria_plugin i_s_innodb_ft_being_deleted = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + 
MYSQL_INFORMATION_SCHEMA_PLUGIN,
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	&i_s_info,
+
+	/* plugin name */
+	/* const char* */
+	"INNODB_FT_BEING_DELETED",
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	plugin_author,
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	"INNODB AUXILIARY FTS BEING DELETED TABLE",
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	PLUGIN_LICENSE_GPL,
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	i_s_fts_being_deleted_init,
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	i_s_common_deinit,
+
+	i_s_version, nullptr, nullptr, PACKAGE_VERSION,
+	MariaDB_PLUGIN_MATURITY_STABLE
+};
+
+
+namespace Show {
+/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_FT_INDEX_CACHED and
+INFORMATION_SCHEMA.INNODB_FT_INDEX_TABLE */
+static ST_FIELD_INFO	i_s_fts_index_fields_info[]=
+{
+#define	I_S_FTS_WORD			0
+	Column("WORD", Varchar(FTS_MAX_WORD_LEN + 1), NOT_NULL),
+
+#define	I_S_FTS_FIRST_DOC_ID		1
+	Column("FIRST_DOC_ID", ULonglong(), NOT_NULL),
+
+#define	I_S_FTS_LAST_DOC_ID		2
+	Column("LAST_DOC_ID", ULonglong(), NOT_NULL),
+
+#define	I_S_FTS_DOC_COUNT		3
+	Column("DOC_COUNT", ULonglong(), NOT_NULL),
+
+#define	I_S_FTS_ILIST_DOC_ID		4
+	Column("DOC_ID", ULonglong(), NOT_NULL),
+
+#define	I_S_FTS_ILIST_DOC_POS		5
+	Column("POSITION", ULonglong(), NOT_NULL),
+	CEnd()
+};
+} // namespace Show
+
+/*******************************************************************//**
+Go through the Doc Node and its ilist, fill the dynamic table
+INFORMATION_SCHEMA.INNODB_FT_INDEX_CACHED for one FTS index on the table.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_fts_index_cache_fill_one_index(
+/*===============================*/
+	fts_index_cache_t*	index_cache,	/*!< in: FTS index cache */
+	THD*			thd,		/*!< in: thread */
+	fts_string_t*		conv_str,	/*!< in/out: buffer */
+	TABLE_LIST*		tables)		/*!< in/out: tables to fill */
+{
+	TABLE*			table = (TABLE*) tables->table;
+	Field**			fields;
+	CHARSET_INFO*		index_charset;
+	const ib_rbt_node_t*	rbt_node;
+	uint			dummy_errors;
+	char*			word_str;
+
+	DBUG_ENTER("i_s_fts_index_cache_fill_one_index");
+
+	fields = table->field;
+
+	index_charset = index_cache->charset;
+	conv_str->f_n_char = 0;
+
+	int	ret = 0;
+
+	/* Go through each word in the index cache */
+	for (rbt_node = rbt_first(index_cache->words);
+	     rbt_node;
+	     rbt_node = rbt_next(index_cache->words, rbt_node)) {
+		fts_tokenizer_word_t*	word;
+
+		word = rbt_value(fts_tokenizer_word_t, rbt_node);
+
+		/* Convert word from index charset to system_charset_info */
+		if (index_charset->cset != system_charset_info->cset) {
+			conv_str->f_n_char = my_convert(
+				reinterpret_cast<char*>(conv_str->f_str),
+				static_cast<uint32>(conv_str->f_len),
+				system_charset_info,
+				reinterpret_cast<char*>(word->text.f_str),
+				static_cast<uint32>(word->text.f_len),
+				index_charset, &dummy_errors);
+			ut_ad(conv_str->f_n_char <= conv_str->f_len);
+			conv_str->f_str[conv_str->f_n_char] = 0;
+			word_str = reinterpret_cast<char*>(conv_str->f_str);
+		} else {
+			word_str = reinterpret_cast<char*>(word->text.f_str);
+		}
+
+		/* Decode the ilist, and display Doc ID and word position */
+		for (ulint i = 0; i < ib_vector_size(word->nodes); i++) {
+			fts_node_t*	node;
+			const byte*	ptr;
+			ulint		decoded = 0;
+			doc_id_t	doc_id = 0;
+
+			node = static_cast<fts_node_t*>(ib_vector_get(
+				word->nodes, i));
+
+			ptr = node->ilist;
+
+			while (decoded < node->ilist_size) {
+
+				doc_id += fts_decode_vlc(&ptr);
+
+				/* Get position info */
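+				/* Each position is a vlc-encoded
+				delta; a 0 byte terminates the
+				position list for this doc id. */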
+				while (*ptr) {
+
+					OK(field_store_string(
+						   fields[I_S_FTS_WORD],
+						   word_str));
+
+					OK(fields[I_S_FTS_FIRST_DOC_ID]->store(
+						   node->first_doc_id,
+						   true));
+
+					OK(fields[I_S_FTS_LAST_DOC_ID]->store(
+						   node->last_doc_id,
+						   true));
+
+					OK(fields[I_S_FTS_DOC_COUNT]->store(
+						   node->doc_count, true));
+
+					OK(fields[I_S_FTS_ILIST_DOC_ID]->store(
+						   doc_id, true));
+
+					OK(fields[I_S_FTS_ILIST_DOC_POS]->store(
+						   fts_decode_vlc(&ptr), true));
+
+					OK(schema_table_store_record(
+						   thd, table));
+				}
+
+				++ptr;
+
+				decoded = ptr - (byte*) node->ilist;
+			}
+		}
+	}
+
+	DBUG_RETURN(ret);
+}
+/*******************************************************************//**
+Fill the dynamic table INFORMATION_SCHEMA.INNODB_FT_INDEX_CACHED
+@return 0 on success, 1 on failure */
+static
+int
+i_s_fts_index_cache_fill(
+/*=====================*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		)	/*!< in: condition (ignored) */
+{
+	dict_table_t*	user_table;
+	fts_cache_t*	cache;
+
+	DBUG_ENTER("i_s_fts_index_cache_fill");
+
+	/* deny access to non-superusers */
+	if (check_global_access(thd, PROCESS_ACL)) {
+		DBUG_RETURN(0);
+	}
+
+	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
+
+	MDL_ticket* mdl_ticket = nullptr;
+	user_table = dict_table_open_on_id(
+		innodb_ft_aux_table_id, false, DICT_TABLE_OP_NORMAL,
+		thd, &mdl_ticket);
+
+	if (!user_table) {
+		DBUG_RETURN(0);
+	}
+
+	if (!user_table->fts || !user_table->fts->cache) {
+		dict_table_close(user_table, false, thd, mdl_ticket);
+		DBUG_RETURN(0);
+	}
+
+	cache = user_table->fts->cache;
+
+	int	ret = 0;
+	fts_string_t	conv_str;
+	byte		word[HA_FT_MAXBYTELEN + 1];
+	conv_str.f_len = sizeof word;
+	conv_str.f_str = word;
+
+	mysql_mutex_lock(&cache->lock);
+
+	for (ulint i = 0; i < ib_vector_size(cache->indexes); i++) {
+		fts_index_cache_t*	index_cache;
+
+		index_cache = static_cast<fts_index_cache_t*>(
+			ib_vector_get(cache->indexes, i));
+
+		BREAK_IF(ret = i_s_fts_index_cache_fill_one_index(
+				 index_cache, thd, &conv_str, tables));
+	}
+
+	mysql_mutex_unlock(&cache->lock);
+	dict_table_close(user_table, false, thd, mdl_ticket);
+
+	DBUG_RETURN(ret);
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.INNODB_FT_INDEX_CACHE
+@return 0 on success */
+static
+int
+i_s_fts_index_cache_init(
+/*=====================*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	DBUG_ENTER("i_s_fts_index_cache_init");
+	ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+	schema->fields_info = Show::i_s_fts_index_fields_info;
+	schema->fill_table = i_s_fts_index_cache_fill;
+
+	DBUG_RETURN(0);
+}
+
+struct st_maria_plugin	i_s_innodb_ft_index_cache =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	MYSQL_INFORMATION_SCHEMA_PLUGIN,
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	&i_s_info,
+
+	/* plugin name */
+	/* const char* */
+	"INNODB_FT_INDEX_CACHE",
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	plugin_author,
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	"INNODB AUXILIARY FTS INDEX CACHED",
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	PLUGIN_LICENSE_GPL,
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	i_s_fts_index_cache_init,
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	i_s_common_deinit,
+
+	i_s_version, nullptr, nullptr, PACKAGE_VERSION,
+	MariaDB_PLUGIN_MATURITY_STABLE
+};
+
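+/* Usage sketch for the INNODB_FT_* tables above (illustrative;
+'db_name/table_name' is a placeholder, and rows are only produced for
+the table named by the global innodb_ft_aux_table variable):
+
+	SET GLOBAL innodb_ft_aux_table = 'db_name/table_name';
+	SELECT * FROM information_schema.innodb_ft_index_cache;
+*/
+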
+/*******************************************************************//** +Go through a FTS index auxiliary table, fetch its rows and fill +FTS word cache structure. +@return DB_SUCCESS on success, otherwise error code */ +static +dberr_t +i_s_fts_index_table_fill_selected( +/*==============================*/ + dict_index_t* index, /*!< in: FTS index */ + ib_vector_t* words, /*!< in/out: vector to hold + fetched words */ + ulint selected, /*!< in: selected FTS index */ + fts_string_t* word) /*!< in: word to select */ +{ + pars_info_t* info; + fts_table_t fts_table; + trx_t* trx; + que_t* graph; + dberr_t error; + fts_fetch_t fetch; + char table_name[MAX_FULL_NAME_LEN]; + + info = pars_info_create(); + + fetch.read_arg = words; + fetch.read_record = fts_optimize_index_fetch_node; + fetch.total_memory = 0; + + DBUG_EXECUTE_IF("fts_instrument_result_cache_limit", + fts_result_cache_limit = 8192; + ); + + trx = trx_create(); + + trx->op_info = "fetching FTS index nodes"; + + pars_info_bind_function(info, "my_func", fetch.read_record, &fetch); + pars_info_bind_varchar_literal(info, "word", word->f_str, word->f_len); + + FTS_INIT_INDEX_TABLE(&fts_table, fts_get_suffix(selected), + FTS_INDEX_TABLE, index); + fts_get_table_name(&fts_table, table_name); + pars_info_bind_id(info, "table_name", table_name); + + graph = fts_parse_sql( + &fts_table, info, + "DECLARE FUNCTION my_func;\n" + "DECLARE CURSOR c IS" + " SELECT word, doc_count, first_doc_id, last_doc_id," + " ilist\n" + " FROM $table_name WHERE word >= :word;\n" + "BEGIN\n" + "\n" + "OPEN c;\n" + "WHILE 1 = 1 LOOP\n" + " FETCH c INTO my_func();\n" + " IF c % NOTFOUND THEN\n" + " EXIT;\n" + " END IF;\n" + "END LOOP;\n" + "CLOSE c;"); + + for (;;) { + error = fts_eval_sql(trx, graph); + + if (UNIV_LIKELY(error == DB_SUCCESS)) { + fts_sql_commit(trx); + + break; + } else { + fts_sql_rollback(trx); + + if (error == DB_LOCK_WAIT_TIMEOUT) { + ib::warn() << "Lock wait timeout reading" + " FTS index. Retrying!"; + + trx->error_state = DB_SUCCESS; + } else { + ib::error() << "Error occurred while reading" + " FTS index: " << error; + break; + } + } + } + + que_graph_free(graph); + + trx->free(); + + if (fetch.total_memory >= fts_result_cache_limit) { + error = DB_FTS_EXCEED_RESULT_CACHE_LIMIT; + } + + return(error); +} + +/*******************************************************************//** +Free words. */ +static +void +i_s_fts_index_table_free_one_fetch( +/*===============================*/ + ib_vector_t* words) /*!< in: words fetched */ +{ + for (ulint i = 0; i < ib_vector_size(words); i++) { + fts_word_t* word; + + word = static_cast<fts_word_t*>(ib_vector_get(words, i)); + + for (ulint j = 0; j < ib_vector_size(word->nodes); j++) { + fts_node_t* node; + + node = static_cast<fts_node_t*> (ib_vector_get( + word->nodes, j)); + ut_free(node->ilist); + } + + fts_word_free(word); + } + + ib_vector_reset(words); +} + +/*******************************************************************//** +Go through words, fill INFORMATION_SCHEMA.INNODB_FT_INDEX_TABLE. 
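+When has_more is set, the last word of the batch was cut short by the +result cache limit and will be refetched by the caller, so it is skipped here.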
+@return 0 on success, 1 on failure */ +static +int +i_s_fts_index_table_fill_one_fetch( +/*===============================*/ + CHARSET_INFO* index_charset, /*!< in: FTS index charset */ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + ib_vector_t* words, /*!< in: words fetched */ + fts_string_t* conv_str, /*!< in: string for conversion*/ + bool has_more) /*!< in: has more to fetch */ +{ + TABLE* table = (TABLE*) tables->table; + Field** fields; + uint dummy_errors; + char* word_str; + ulint words_size; + int ret = 0; + + DBUG_ENTER("i_s_fts_index_table_fill_one_fetch"); + + fields = table->field; + + words_size = ib_vector_size(words); + if (has_more) { + /* the last word is not fetched completely. */ + ut_ad(words_size > 1); + words_size -= 1; + } + + /* Go through each word in the index cache */ + for (ulint i = 0; i < words_size; i++) { + fts_word_t* word; + + word = static_cast<fts_word_t*>(ib_vector_get(words, i)); + + word->text.f_str[word->text.f_len] = 0; + + /* Convert word from index charset to system_charset_info */ + if (index_charset->cset != system_charset_info->cset) { + conv_str->f_n_char = my_convert( + reinterpret_cast<char*>(conv_str->f_str), + static_cast<uint32>(conv_str->f_len), + system_charset_info, + reinterpret_cast<const char*>(word->text.f_str), + static_cast<uint32>(word->text.f_len), + index_charset, &dummy_errors); + ut_ad(conv_str->f_n_char <= conv_str->f_len); + conv_str->f_str[conv_str->f_n_char] = 0; + word_str = reinterpret_cast<char*>(conv_str->f_str); + } else { + word_str = reinterpret_cast<char*>(word->text.f_str); + } + + /* Decode the ilist, and display Doc ID and word position */ + for (ulint i = 0; i < ib_vector_size(word->nodes); i++) { + fts_node_t* node; + const byte* ptr; + ulint decoded = 0; + doc_id_t doc_id = 0; + + node = static_cast<fts_node_t*> (ib_vector_get( + word->nodes, i)); + + ptr = node->ilist; + + while (decoded < node->ilist_size) { + doc_id += fts_decode_vlc(&ptr); + + /* Get position info */ + while (*ptr) { + + OK(field_store_string( + fields[I_S_FTS_WORD], + word_str)); + + OK(fields[I_S_FTS_FIRST_DOC_ID]->store( + longlong(node->first_doc_id), true)); + + OK(fields[I_S_FTS_LAST_DOC_ID]->store( + longlong(node->last_doc_id), true)); + + OK(fields[I_S_FTS_DOC_COUNT]->store( + node->doc_count, true)); + + OK(fields[I_S_FTS_ILIST_DOC_ID]->store( + longlong(doc_id), true)); + + OK(fields[I_S_FTS_ILIST_DOC_POS]->store( + fts_decode_vlc(&ptr), true)); + + OK(schema_table_store_record( + thd, table)); + } + + ++ptr; + + decoded = ptr - (byte*) node->ilist; + } + } + } + + DBUG_RETURN(ret); +} + +/*******************************************************************//** +Go through a FTS index and its auxiliary tables, fetch rows in each table +and fill INFORMATION_SCHEMA.INNODB_FT_INDEX_TABLE. 
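+Rows are fetched in batches bounded by fts_result_cache_limit; a +DB_FTS_EXCEED_RESULT_CACHE_LIMIT status means more rows remain, and the +scan resumes from the last word fetched.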
+@return 0 on success, 1 on failure */ +static +int +i_s_fts_index_table_fill_one_index( +/*===============================*/ + dict_index_t* index, /*!< in: FTS index */ + THD* thd, /*!< in: thread */ + fts_string_t* conv_str, /*!< in/out: buffer */ + TABLE_LIST* tables) /*!< in/out: tables to fill */ +{ + ib_vector_t* words; + mem_heap_t* heap; + CHARSET_INFO* index_charset; + dberr_t error; + int ret = 0; + + DBUG_ENTER("i_s_fts_index_table_fill_one_index"); + DBUG_ASSERT(!dict_index_is_online_ddl(index)); + + heap = mem_heap_create(1024); + + words = ib_vector_create(ib_heap_allocator_create(heap), + sizeof(fts_word_t), 256); + + index_charset = fts_index_get_charset(index); + + /* Iterate through each auxiliary table as described in + fts_index_selector */ + for (ulint selected = 0; selected < FTS_NUM_AUX_INDEX; selected++) { + fts_string_t word; + bool has_more = false; + + word.f_str = NULL; + word.f_len = 0; + word.f_n_char = 0; + + do { + /* Fetch from index */ + error = i_s_fts_index_table_fill_selected( + index, words, selected, &word); + + if (error == DB_SUCCESS) { + has_more = false; + } else if (error == DB_FTS_EXCEED_RESULT_CACHE_LIMIT) { + has_more = true; + } else { + i_s_fts_index_table_free_one_fetch(words); + ret = 1; + goto func_exit; + } + + if (has_more) { + fts_word_t* last_word; + + /* Prepare start point for next fetch */ + last_word = static_cast<fts_word_t*>(ib_vector_last(words)); + ut_ad(last_word != NULL); + fts_string_dup(&word, &last_word->text, heap); + } + + /* Fill into tables */ + ret = i_s_fts_index_table_fill_one_fetch( + index_charset, thd, tables, words, conv_str, + has_more); + i_s_fts_index_table_free_one_fetch(words); + + if (ret != 0) { + goto func_exit; + } + } while (has_more); + } + +func_exit: + mem_heap_free(heap); + + DBUG_RETURN(ret); +} +/*******************************************************************//** +Fill the dynamic table INFORMATION_SCHEMA.INNODB_FT_INDEX_TABLE +@return 0 on success, 1 on failure */ +static +int +i_s_fts_index_table_fill( +/*=====================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* ) /*!< in: condition (ignored) */ +{ + dict_table_t* user_table; + dict_index_t* index; + + DBUG_ENTER("i_s_fts_index_table_fill"); + + /* deny access to non-superusers */ + if (check_global_access(thd, PROCESS_ACL)) { + DBUG_RETURN(0); + } + + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); + + MDL_ticket* mdl_ticket = nullptr; + user_table = dict_table_open_on_id( + innodb_ft_aux_table_id, false, DICT_TABLE_OP_NORMAL, + thd, &mdl_ticket); + + if (!user_table) { + DBUG_RETURN(0); + } + + int ret = 0; + fts_string_t conv_str; + conv_str.f_len = system_charset_info->mbmaxlen + * FTS_MAX_WORD_LEN_IN_CHAR; + conv_str.f_str = static_cast<byte*>(ut_malloc_nokey(conv_str.f_len)); + + for (index = dict_table_get_first_index(user_table); + index; index = dict_table_get_next_index(index)) { + if (index->type & DICT_FTS) { + BREAK_IF(ret = i_s_fts_index_table_fill_one_index( + index, thd, &conv_str, tables)); + } + } + + dict_table_close(user_table, false, thd, mdl_ticket); + + ut_free(conv_str.f_str); + + DBUG_RETURN(ret); +} + +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.INNODB_FT_INDEX_TABLE +@return 0 on success */ +static +int +i_s_fts_index_table_init( +/*=====================*/ + void* p) /*!< in/out: table schema object */ +{ + DBUG_ENTER("i_s_fts_index_table_init"); + ST_SCHEMA_TABLE* schema = 
(ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::i_s_fts_index_fields_info; + schema->fill_table = i_s_fts_index_table_fill; + + DBUG_RETURN(0); +} + +struct st_maria_plugin i_s_innodb_ft_index_table = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_FT_INDEX_TABLE", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "INNODB AUXILIARY FTS INDEX TABLE", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + i_s_fts_index_table_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + + +namespace Show { +/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_FT_CONFIG */ +static ST_FIELD_INFO i_s_fts_config_fields_info[]= +{ +#define FTS_CONFIG_KEY 0 + Column("KEY", Varchar(NAME_LEN + 1), NOT_NULL), + +#define FTS_CONFIG_VALUE 1 + Column("VALUE", Varchar(NAME_LEN + 1), NOT_NULL), + + CEnd() +}; +} // namespace Show + +static const char* fts_config_key[] = { + FTS_OPTIMIZE_LIMIT_IN_SECS, + FTS_SYNCED_DOC_ID, + FTS_STOPWORD_TABLE_NAME, + FTS_USE_STOPWORD, + NULL +}; + +/*******************************************************************//** +Fill the dynamic table INFORMATION_SCHEMA.INNODB_FT_CONFIG +@return 0 on success, 1 on failure */ +static +int +i_s_fts_config_fill( +/*================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* ) /*!< in: condition (ignored) */ +{ + Field** fields; + TABLE* table = (TABLE*) tables->table; + trx_t* trx; + fts_table_t fts_table; + dict_table_t* user_table; + ulint i = 0; + dict_index_t* index = NULL; + unsigned char str[FTS_MAX_CONFIG_VALUE_LEN + 1]; + + DBUG_ENTER("i_s_fts_config_fill"); + + /* deny access to non-superusers */ + if (check_global_access(thd, PROCESS_ACL)) { + DBUG_RETURN(0); + } + + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); + + MDL_ticket* mdl_ticket = nullptr; + user_table = dict_table_open_on_id( + innodb_ft_aux_table_id, false, DICT_TABLE_OP_NORMAL, + thd, &mdl_ticket); + + if (!user_table) { + DBUG_RETURN(0); + } + + if (!dict_table_has_fts_index(user_table)) { + dict_table_close(user_table, false, thd, mdl_ticket); + DBUG_RETURN(0); + } + + fields = table->field; + + trx = trx_create(); + trx->op_info = "Select for FTS CONFIG TABLE"; + + FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE, user_table); + + if (!ib_vector_is_empty(user_table->fts->indexes)) { + index = (dict_index_t*) ib_vector_getp_const( + user_table->fts->indexes, 0); + DBUG_ASSERT(!dict_index_is_online_ddl(index)); + } + + int ret = 0; + + while (fts_config_key[i]) { + fts_string_t value; + char* key_name; + ulint allocated = FALSE; + + value.f_len = FTS_MAX_CONFIG_VALUE_LEN; + + value.f_str = str; + + if (index + && strcmp(fts_config_key[i], FTS_TOTAL_WORD_COUNT) == 0) { + key_name = fts_config_create_index_param_name( + fts_config_key[i], index); + allocated = TRUE; + } else { + key_name = (char*) fts_config_key[i]; + } + + fts_config_get_value(trx, &fts_table, key_name, &value); + + if (allocated) { + ut_free(key_name); + } + + BREAK_IF(ret = 
field_store_string( + fields[FTS_CONFIG_KEY], fts_config_key[i])); + + BREAK_IF(ret = field_store_string( + fields[FTS_CONFIG_VALUE], + reinterpret_cast<const char*>(value.f_str))); + + BREAK_IF(ret = schema_table_store_record(thd, table)); + + i++; + } + + fts_sql_commit(trx); + + dict_table_close(user_table, false, thd, mdl_ticket); + + trx->free(); + + DBUG_RETURN(ret); +} + +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.INNODB_FT_CONFIG +@return 0 on success */ +static +int +i_s_fts_config_init( +/*=================*/ + void* p) /*!< in/out: table schema object */ +{ + DBUG_ENTER("i_s_fts_config_init"); + ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::i_s_fts_config_fields_info; + schema->fill_table = i_s_fts_config_fill; + + DBUG_RETURN(0); +} + +struct st_maria_plugin i_s_innodb_ft_config = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_FT_CONFIG", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "INNODB AUXILIARY FTS CONFIG TABLE", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + i_s_fts_config_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + +namespace Show { +/* Fields of the dynamic table INNODB_BUFFER_POOL_STATS. 
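Each query returns a single row; POOL_ID is stored as a constant 0, since this server has a single buffer pool instance.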
*/ +static ST_FIELD_INFO i_s_innodb_buffer_stats_fields_info[]= +{ +#define IDX_BUF_STATS_POOL_ID 0 + Column("POOL_ID", ULong(), NOT_NULL), + +#define IDX_BUF_STATS_POOL_SIZE 1 + Column("POOL_SIZE", ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_FREE_BUFFERS 2 + Column("FREE_BUFFERS", ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_LRU_LEN 3 + Column("DATABASE_PAGES", ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_OLD_LRU_LEN 4 + Column("OLD_DATABASE_PAGES", ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_FLUSH_LIST_LEN 5 + Column("MODIFIED_DATABASE_PAGES", ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_PENDING_ZIP 6 + Column("PENDING_DECOMPRESS", ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_PENDING_READ 7 + Column("PENDING_READS",ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_FLUSH_LRU 8 + Column("PENDING_FLUSH_LRU",ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_FLUSH_LIST 9 + Column("PENDING_FLUSH_LIST", ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_PAGE_YOUNG 10 + Column("PAGES_MADE_YOUNG",ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_PAGE_NOT_YOUNG 11 + Column("PAGES_NOT_MADE_YOUNG",ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_PAGE_YOUNG_RATE 12 + Column("PAGES_MADE_YOUNG_RATE", Float(MAX_FLOAT_STR_LENGTH), NOT_NULL), + +#define IDX_BUF_STATS_PAGE_NOT_YOUNG_RATE 13 + Column("PAGES_MADE_NOT_YOUNG_RATE", Float(MAX_FLOAT_STR_LENGTH), NOT_NULL), + +#define IDX_BUF_STATS_PAGE_READ 14 + Column("NUMBER_PAGES_READ",ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_PAGE_CREATED 15 + Column("NUMBER_PAGES_CREATED",ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_PAGE_WRITTEN 16 + Column("NUMBER_PAGES_WRITTEN",ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_PAGE_READ_RATE 17 + Column("PAGES_READ_RATE", Float(MAX_FLOAT_STR_LENGTH), NOT_NULL), + +#define IDX_BUF_STATS_PAGE_CREATE_RATE 18 + Column("PAGES_CREATE_RATE", Float(MAX_FLOAT_STR_LENGTH), NOT_NULL), + +#define IDX_BUF_STATS_PAGE_WRITTEN_RATE 19 + Column("PAGES_WRITTEN_RATE",Float(MAX_FLOAT_STR_LENGTH), NOT_NULL), + +#define IDX_BUF_STATS_GET 20 + Column("NUMBER_PAGES_GET", ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_HIT_RATE 21 + Column("HIT_RATE", ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_MADE_YOUNG_PCT 22 + Column("YOUNG_MAKE_PER_THOUSAND_GETS", ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_NOT_MADE_YOUNG_PCT 23 + Column("NOT_YOUNG_MAKE_PER_THOUSAND_GETS", ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_READ_AHEAD 24 + Column("NUMBER_PAGES_READ_AHEAD", ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_READ_AHEAD_EVICTED 25 + Column("NUMBER_READ_AHEAD_EVICTED", ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_READ_AHEAD_RATE 26 + Column("READ_AHEAD_RATE", Float(MAX_FLOAT_STR_LENGTH), NOT_NULL), + +#define IDX_BUF_STATS_READ_AHEAD_EVICT_RATE 27 + Column("READ_AHEAD_EVICTED_RATE",Float(MAX_FLOAT_STR_LENGTH), NOT_NULL), + +#define IDX_BUF_STATS_LRU_IO_SUM 28 + Column("LRU_IO_TOTAL", ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_LRU_IO_CUR 29 + Column("LRU_IO_CURRENT", ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_UNZIP_SUM 30 + Column("UNCOMPRESS_TOTAL",ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_UNZIP_CUR 31 + Column("UNCOMPRESS_CURRENT", ULonglong(), NOT_NULL), + + CEnd() +}; +} // namespace Show + +/** Fill INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS +@param[in,out] thd connection +@param[in,out] tables tables to fill +@return 0 on success, 1 on failure */ +static int i_s_innodb_stats_fill(THD *thd, TABLE_LIST * tables, Item *) +{ + TABLE* table; + Field** fields; + buf_pool_info_t info; + 
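+ /* One buf_stats_get_pool_info() snapshot fills every column of the single result row; note that HIT_RATE below is per thousand page gets, not a percentage. */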
+ DBUG_ENTER("i_s_innodb_stats_fill"); + + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); + + /* Only allow the PROCESS privilege holder to access the stats */ + if (check_global_access(thd, PROCESS_ACL)) { + DBUG_RETURN(0); + } + + buf_stats_get_pool_info(&info); + + table = tables->table; + + fields = table->field; + + OK(fields[IDX_BUF_STATS_POOL_ID]->store(0, true)); + + OK(fields[IDX_BUF_STATS_POOL_SIZE]->store(info.pool_size, true)); + + OK(fields[IDX_BUF_STATS_LRU_LEN]->store(info.lru_len, true)); + + OK(fields[IDX_BUF_STATS_OLD_LRU_LEN]->store(info.old_lru_len, true)); + + OK(fields[IDX_BUF_STATS_FREE_BUFFERS]->store( + info.free_list_len, true)); + + OK(fields[IDX_BUF_STATS_FLUSH_LIST_LEN]->store( + info.flush_list_len, true)); + + OK(fields[IDX_BUF_STATS_PENDING_ZIP]->store(info.n_pend_unzip, true)); + + OK(fields[IDX_BUF_STATS_PENDING_READ]->store(info.n_pend_reads, true)); + + OK(fields[IDX_BUF_STATS_FLUSH_LRU]->store( + info.n_pending_flush_lru, true)); + + OK(fields[IDX_BUF_STATS_FLUSH_LIST]->store( + info.n_pending_flush_list, true)); + + OK(fields[IDX_BUF_STATS_PAGE_YOUNG]->store( + info.n_pages_made_young, true)); + + OK(fields[IDX_BUF_STATS_PAGE_NOT_YOUNG]->store( + info.n_pages_not_made_young, true)); + + OK(fields[IDX_BUF_STATS_PAGE_YOUNG_RATE]->store( + info.page_made_young_rate)); + + OK(fields[IDX_BUF_STATS_PAGE_NOT_YOUNG_RATE]->store( + info.page_not_made_young_rate)); + + OK(fields[IDX_BUF_STATS_PAGE_READ]->store(info.n_pages_read, true)); + + OK(fields[IDX_BUF_STATS_PAGE_CREATED]->store( + info.n_pages_created, true)); + + OK(fields[IDX_BUF_STATS_PAGE_WRITTEN]->store( + info.n_pages_written, true)); + + OK(fields[IDX_BUF_STATS_GET]->store(info.n_page_gets, true)); + + OK(fields[IDX_BUF_STATS_PAGE_READ_RATE]->store( + info.pages_read_rate)); + + OK(fields[IDX_BUF_STATS_PAGE_CREATE_RATE]->store( + info.pages_created_rate)); + + OK(fields[IDX_BUF_STATS_PAGE_WRITTEN_RATE]->store( + info.pages_written_rate)); + + if (info.n_page_get_delta) { + if (info.page_read_delta <= info.n_page_get_delta) { + OK(fields[IDX_BUF_STATS_HIT_RATE]->store( + static_cast<double>( + 1000 - (1000 * info.page_read_delta + / info.n_page_get_delta)))); + } else { + OK(fields[IDX_BUF_STATS_HIT_RATE]->store(0)); + } + + OK(fields[IDX_BUF_STATS_MADE_YOUNG_PCT]->store( + 1000 * info.young_making_delta + / info.n_page_get_delta, true)); + + OK(fields[IDX_BUF_STATS_NOT_MADE_YOUNG_PCT]->store( + 1000 * info.not_young_making_delta + / info.n_page_get_delta, true)); + } else { + OK(fields[IDX_BUF_STATS_HIT_RATE]->store(0, true)); + OK(fields[IDX_BUF_STATS_MADE_YOUNG_PCT]->store(0, true)); + OK(fields[IDX_BUF_STATS_NOT_MADE_YOUNG_PCT]->store(0, true)); + } + + OK(fields[IDX_BUF_STATS_READ_AHEAD]->store( + info.n_ra_pages_read, true)); + + OK(fields[IDX_BUF_STATS_READ_AHEAD_EVICTED]->store( + info.n_ra_pages_evicted, true)); + + OK(fields[IDX_BUF_STATS_READ_AHEAD_RATE]->store( + info.pages_readahead_rate)); + + OK(fields[IDX_BUF_STATS_READ_AHEAD_EVICT_RATE]->store( + info.pages_evicted_rate)); + + OK(fields[IDX_BUF_STATS_LRU_IO_SUM]->store(info.io_sum, true)); + + OK(fields[IDX_BUF_STATS_LRU_IO_CUR]->store(info.io_cur, true)); + + OK(fields[IDX_BUF_STATS_UNZIP_SUM]->store(info.unzip_sum, true)); + + OK(fields[IDX_BUF_STATS_UNZIP_CUR]->store(info.unzip_cur, true)); + + DBUG_RETURN(schema_table_store_record(thd, table)); +} + +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS. 
+@return 0 on success, 1 on failure */ +static +int +i_s_innodb_buffer_pool_stats_init( +/*==============================*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("i_s_innodb_buffer_pool_stats_init"); + + schema = reinterpret_cast<ST_SCHEMA_TABLE*>(p); + + schema->fields_info = Show::i_s_innodb_buffer_stats_fields_info; + schema->fill_table = i_s_innodb_stats_fill; + + DBUG_RETURN(0); +} + +struct st_maria_plugin i_s_innodb_buffer_stats = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_BUFFER_POOL_STATS", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "InnoDB Buffer Pool Statistics Information ", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + i_s_innodb_buffer_pool_stats_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + +/** These must correspond to the first values of buf_page_state */ +static const LEX_CSTRING page_state_values[] = +{ + { STRING_WITH_LEN("NOT_USED") }, + { STRING_WITH_LEN("MEMORY") }, + { STRING_WITH_LEN("REMOVE_HASH") }, + { STRING_WITH_LEN("FILE_PAGE") }, +}; + +static const TypelibBuffer<4> page_state_values_typelib(page_state_values); + +static const LEX_CSTRING io_values[] = +{ + { STRING_WITH_LEN("IO_NONE") }, + { STRING_WITH_LEN("IO_READ") }, + { STRING_WITH_LEN("IO_WRITE") } +}; + + +static TypelibBuffer<3> io_values_typelib(io_values); + +namespace Show { +/* Fields of the dynamic table INNODB_BUFFER_POOL_PAGE. 
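The IS_HASHED column exists only in builds with BTR_CUR_HASH_ADAPT; the I_S_AHI term in the #defines below keeps the column offsets right in either build.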
*/ +static ST_FIELD_INFO i_s_innodb_buffer_page_fields_info[]= +{ +#define IDX_BUFFER_POOL_ID 0 + Column("POOL_ID", ULong(), NOT_NULL), + +#define IDX_BUFFER_BLOCK_ID 1 + Column("BLOCK_ID", ULonglong(), NOT_NULL), + +#define IDX_BUFFER_PAGE_SPACE 2 + Column("SPACE", ULong(), NOT_NULL), + +#define IDX_BUFFER_PAGE_NUM 3 + Column("PAGE_NUMBER", ULong(), NOT_NULL), + +#define IDX_BUFFER_PAGE_TYPE 4 + Column("PAGE_TYPE", Varchar(64), NULLABLE), + +#define IDX_BUFFER_PAGE_FLUSH_TYPE 5 + Column("FLUSH_TYPE", ULong(), NOT_NULL), + +#define IDX_BUFFER_PAGE_FIX_COUNT 6 + Column("FIX_COUNT", ULong(), NOT_NULL), + +#ifdef BTR_CUR_HASH_ADAPT +#define IDX_BUFFER_PAGE_HASHED 7 + Column("IS_HASHED", SLong(1), NOT_NULL), +#endif /* BTR_CUR_HASH_ADAPT */ +#define IDX_BUFFER_PAGE_NEWEST_MOD 7 + I_S_AHI + Column("NEWEST_MODIFICATION", ULonglong(), NOT_NULL), + +#define IDX_BUFFER_PAGE_OLDEST_MOD 8 + I_S_AHI + Column("OLDEST_MODIFICATION", ULonglong(), NOT_NULL), + +#define IDX_BUFFER_PAGE_ACCESS_TIME 9 + I_S_AHI + Column("ACCESS_TIME", ULonglong(), NOT_NULL), + +#define IDX_BUFFER_PAGE_TABLE_NAME 10 + I_S_AHI + Column("TABLE_NAME", Varchar(1024), NULLABLE), + +#define IDX_BUFFER_PAGE_INDEX_NAME 11 + I_S_AHI + Column("INDEX_NAME", Varchar(NAME_CHAR_LEN), NULLABLE), + +#define IDX_BUFFER_PAGE_NUM_RECS 12 + I_S_AHI + Column("NUMBER_RECORDS", ULonglong(), NOT_NULL), + +#define IDX_BUFFER_PAGE_DATA_SIZE 13 + I_S_AHI + Column("DATA_SIZE", ULonglong(), NOT_NULL), + +#define IDX_BUFFER_PAGE_ZIP_SIZE 14 + I_S_AHI + Column("COMPRESSED_SIZE", ULonglong(), NOT_NULL), + +#define IDX_BUFFER_PAGE_STATE 15 + I_S_AHI + Column("PAGE_STATE", Enum(&page_state_values_typelib), NOT_NULL), + +#define IDX_BUFFER_PAGE_IO_FIX 16 + I_S_AHI + Column("IO_FIX", Enum(&io_values_typelib), NOT_NULL), + +#define IDX_BUFFER_PAGE_IS_OLD 17 + I_S_AHI + Column("IS_OLD", SLong(1), NOT_NULL), + +#define IDX_BUFFER_PAGE_FREE_CLOCK 18 + I_S_AHI + Column("FREE_PAGE_CLOCK", ULonglong(), NOT_NULL), + + CEnd() +}; +} // namespace Show + +/*******************************************************************//** +Fill Information Schema table INNODB_BUFFER_PAGE with information +cached in the buf_page_info_t array +@return 0 on success, 1 on failure */ +static +int +i_s_innodb_buffer_page_fill( +/*========================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + const buf_page_info_t* info_array, /*!< in: array cached page + info */ + ulint num_page) /*!< in: number of page info + cached */ +{ + TABLE* table; + Field** fields; + + compile_time_assert(I_S_PAGE_TYPE_LAST < 1 << I_S_PAGE_TYPE_BITS); + + DBUG_ENTER("i_s_innodb_buffer_page_fill"); + + table = tables->table; + + fields = table->field; + + /* Iterate through the cached array and fill the I_S table rows */ + for (ulint i = 0; i < num_page; i++) { + const buf_page_info_t* page_info; + char table_name[MAX_FULL_NAME_LEN + 1]; + const char* table_name_end = NULL; + + page_info = info_array + i; + + OK(fields[IDX_BUFFER_POOL_ID]->store(0, true)); + + OK(fields[IDX_BUFFER_BLOCK_ID]->store( + page_info->block_id, true)); + + OK(fields[IDX_BUFFER_PAGE_SPACE]->store( + page_info->id.space(), true)); + + OK(fields[IDX_BUFFER_PAGE_NUM]->store( + page_info->id.page_no(), true)); + + OK(field_store_string( + fields[IDX_BUFFER_PAGE_TYPE], + i_s_page_type[page_info->page_type].type_str)); + + OK(fields[IDX_BUFFER_PAGE_FLUSH_TYPE]->store(0, true)); + + OK(fields[IDX_BUFFER_PAGE_FIX_COUNT]->store( + ~buf_page_t::LRU_MASK & page_info->state, true)); + +#ifdef 
BTR_CUR_HASH_ADAPT + OK(fields[IDX_BUFFER_PAGE_HASHED]->store( + page_info->hashed, true)); +#endif /* BTR_CUR_HASH_ADAPT */ + + OK(fields[IDX_BUFFER_PAGE_NEWEST_MOD]->store( + page_info->newest_mod, true)); + + OK(fields[IDX_BUFFER_PAGE_OLDEST_MOD]->store( + page_info->oldest_mod, true)); + + OK(fields[IDX_BUFFER_PAGE_ACCESS_TIME]->store( + page_info->access_time, true)); + + fields[IDX_BUFFER_PAGE_TABLE_NAME]->set_null(); + + fields[IDX_BUFFER_PAGE_INDEX_NAME]->set_null(); + + /* If this is an index page, fetch the index name + and table name */ + if (page_info->page_type == I_S_PAGE_TYPE_INDEX) { + bool ret = false; + + dict_sys.freeze(SRW_LOCK_CALL); + + const dict_index_t* index = + dict_index_get_if_in_cache_low( + page_info->index_id); + + if (index) { + table_name_end = innobase_convert_name( + table_name, sizeof(table_name), + index->table->name.m_name, + strlen(index->table->name.m_name), + thd); + + ret = fields[IDX_BUFFER_PAGE_TABLE_NAME] + ->store(table_name, + static_cast<uint>( + table_name_end + - table_name), + system_charset_info) + || fields[IDX_BUFFER_PAGE_INDEX_NAME] + ->store(index->name, + uint(strlen(index->name)), + system_charset_info); + } + + dict_sys.unfreeze(); + + OK(ret); + + if (index) { + fields[IDX_BUFFER_PAGE_TABLE_NAME] + ->set_notnull(); + fields[IDX_BUFFER_PAGE_INDEX_NAME] + ->set_notnull(); + } + } + + OK(fields[IDX_BUFFER_PAGE_NUM_RECS]->store( + page_info->num_recs, true)); + + OK(fields[IDX_BUFFER_PAGE_DATA_SIZE]->store( + page_info->data_size, true)); + + OK(fields[IDX_BUFFER_PAGE_ZIP_SIZE]->store( + page_info->zip_ssize + ? (UNIV_ZIP_SIZE_MIN >> 1) << page_info->zip_ssize + : 0, true)); + + static_assert(buf_page_t::NOT_USED == 0, "compatibility"); + static_assert(buf_page_t::MEMORY == 1, "compatibility"); + static_assert(buf_page_t::REMOVE_HASH == 2, "compatibility"); + + OK(fields[IDX_BUFFER_PAGE_STATE]->store( + std::min<uint32_t>(3, page_info->state) + 1, true)); + + static_assert(buf_page_t::UNFIXED == 1U << 29, "comp."); + static_assert(buf_page_t::READ_FIX == 4U << 29, "comp."); + static_assert(buf_page_t::WRITE_FIX == 5U << 29, "comp."); + + unsigned io_fix = page_info->state >> 29; + if (io_fix < 4) { + io_fix = 1; + } else if (io_fix > 5) { + io_fix = 3; + } else { + io_fix -= 2; + } + + OK(fields[IDX_BUFFER_PAGE_IO_FIX]->store(io_fix, true)); + + OK(fields[IDX_BUFFER_PAGE_IS_OLD]->store( + page_info->is_old, true)); + + OK(fields[IDX_BUFFER_PAGE_FREE_CLOCK]->store( + page_info->freed_page_clock, true)); + + OK(schema_table_store_record(thd, table)); + } + + DBUG_RETURN(0); +} + +/*******************************************************************//** +Set the appropriate page type in a buf_page_info_t structure */ +static +void +i_s_innodb_set_page_type( +/*=====================*/ + buf_page_info_t*page_info, /*!< in/out: structure to fill with + scanned info */ + const byte* frame) /*!< in: buffer frame */ +{ + uint16_t page_type = fil_page_get_type(frame); + + if (fil_page_type_is_index(page_type)) { + const page_t* page = (const page_t*) frame; + + page_info->index_id = btr_page_get_index_id(page); + + /* FIL_PAGE_INDEX and FIL_PAGE_RTREE are a bit special: + their values are defined as 17855 and 17854, so we cannot + use them to index into the i_s_page_type[] array; their + entry in i_s_page_type[] is I_S_PAGE_TYPE_INDEX (1) for + index pages or I_S_PAGE_TYPE_IBUF for + change buffer index pages */ + if (page_type == FIL_PAGE_RTREE) { + page_info->page_type = I_S_PAGE_TYPE_RTREE; + } else if (page_info->index_id + == 
static_cast<index_id_t>(DICT_IBUF_ID_MIN + + IBUF_SPACE_ID)) { + page_info->page_type = I_S_PAGE_TYPE_IBUF; + } else { + ut_ad(page_type == FIL_PAGE_INDEX + || page_type == FIL_PAGE_TYPE_INSTANT); + page_info->page_type = I_S_PAGE_TYPE_INDEX; + } + + page_info->data_size = uint16_t(page_header_get_field( + page, PAGE_HEAP_TOP) - (page_is_comp(page) + ? PAGE_NEW_SUPREMUM_END + : PAGE_OLD_SUPREMUM_END) + - page_header_get_field(page, PAGE_GARBAGE)); + + page_info->num_recs = page_get_n_recs(page) & ((1U << 14) - 1); + } else if (page_type > FIL_PAGE_TYPE_LAST) { + /* Encountered an unknown page type */ + page_info->page_type = I_S_PAGE_TYPE_UNKNOWN; + } else { + /* Make sure we get the right index into the + i_s_page_type[] array */ + ut_a(page_type == i_s_page_type[page_type].type_value); + + page_info->page_type = page_type & 0xf; + } +} +/*******************************************************************//** +Scan pages in the buffer pool and collect their general information +into the buf_page_info_t array, which is zero-filled beforehand, so any +field not initialized in this function defaults to 0 */ +static +void +i_s_innodb_buffer_page_get_info( +/*============================*/ + const buf_page_t*bpage, /*!< in: buffer pool page to scan */ + ulint pos, /*!< in: buffer block position in + buffer pool or in the LRU list */ + buf_page_info_t*page_info) /*!< in: zero filled info structure; + out: structure filled with scanned + info */ +{ + page_info->block_id = pos; + + static_assert(buf_page_t::NOT_USED == 0, "compatibility"); + static_assert(buf_page_t::MEMORY == 1, "compatibility"); + static_assert(buf_page_t::REMOVE_HASH == 2, "compatibility"); + static_assert(buf_page_t::UNFIXED == 1U << 29, "compatibility"); + static_assert(buf_page_t::READ_FIX == 4U << 29, "compatibility"); + static_assert(buf_page_t::WRITE_FIX == 5U << 29, "compatibility"); + + page_info->state = bpage->state(); + + if (page_info->state < buf_page_t::UNFIXED) { + page_info->page_type = I_S_PAGE_TYPE_UNKNOWN; + page_info->compressed_only = false; + } else { + const byte* frame; + + page_info->id = bpage->id(); + + page_info->oldest_mod = bpage->oldest_modification(); + + page_info->access_time = bpage->access_time; + + page_info->zip_ssize = bpage->zip.ssize; + + page_info->is_old = bpage->old; + + page_info->freed_page_clock = bpage->freed_page_clock; + + if (page_info->state >= buf_page_t::READ_FIX + && page_info->state < buf_page_t::WRITE_FIX) { + page_info->page_type = I_S_PAGE_TYPE_UNKNOWN; + page_info->newest_mod = 0; + return; + } + + page_info->compressed_only = !bpage->frame, + frame = bpage->frame; + if (UNIV_LIKELY(frame != nullptr)) { +#ifdef BTR_CUR_HASH_ADAPT + /* Note: this may be a false positive, that + is, block->index will not always be set to + NULL when the last adaptive hash index + reference is dropped. */ + page_info->hashed = + reinterpret_cast<const buf_block_t*>(bpage) + ->index != nullptr; +#endif /* BTR_CUR_HASH_ADAPT */ + } else { + ut_ad(page_info->zip_ssize); + frame = bpage->zip.data; + } + + page_info->newest_mod = mach_read_from_8(FIL_PAGE_LSN + frame); + i_s_innodb_set_page_type(page_info, frame); + } +} + +/*******************************************************************//** +This is the function that goes through each block of the buffer pool +and fetches information for the INFORMATION_SCHEMA.INNODB_BUFFER_PAGE table. 
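+Blocks are scanned chunk by chunk in batches of at most MAX_BUF_INFO_CACHED +entries, and buf_pool.mutex is released between batches, so the result is a +diagnostic view rather than a consistent snapshot.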
+@param[in,out] thd connection +@param[in,out] tables tables to fill +@return 0 on success, 1 on failure */ +static int i_s_innodb_buffer_page_fill(THD *thd, TABLE_LIST *tables, Item *) +{ + int status = 0; + mem_heap_t* heap; + + DBUG_ENTER("i_s_innodb_buffer_page_fill"); + + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); + + /* deny access to user without PROCESS privilege */ + if (check_global_access(thd, PROCESS_ACL)) { + DBUG_RETURN(0); + } + + heap = mem_heap_create(10000); + + for (ulint n = 0; + n < ut_min(buf_pool.n_chunks, buf_pool.n_chunks_new); n++) { + const buf_block_t* block; + ulint n_blocks; + buf_page_info_t* info_buffer; + ulint num_page; + ulint mem_size; + ulint chunk_size; + ulint num_to_process = 0; + ulint block_id = 0; + + /* Get buffer block of the nth chunk */ + block = buf_pool.chunks[n].blocks; + chunk_size = buf_pool.chunks[n].size; + num_page = 0; + + while (chunk_size > 0) { + /* we cache at most MAX_BUF_INFO_CACHED + buffer page info entries */ + num_to_process = ut_min(chunk_size, + (ulint)MAX_BUF_INFO_CACHED); + + mem_size = num_to_process * sizeof(buf_page_info_t); + + /* For each chunk, we'll pre-allocate information + structures to cache the page information read from + the buffer pool. Doing so before obtaining any mutex */ + info_buffer = (buf_page_info_t*) mem_heap_zalloc( + heap, mem_size); + + /* Obtain appropriate mutexes. Since this is diagnostic + buffer pool info printout, we are not required to + preserve the overall consistency, so we can + release the mutex periodically */ + mysql_mutex_lock(&buf_pool.mutex); + + /* Go through each block in the chunk */ + for (n_blocks = num_to_process; n_blocks--; block++) { + i_s_innodb_buffer_page_get_info( + &block->page, block_id, + info_buffer + num_page); + block_id++; + num_page++; + } + + mysql_mutex_unlock(&buf_pool.mutex); + + /* Fill in information schema table with information + just collected from the buffer chunk scan */ + status = i_s_innodb_buffer_page_fill( + thd, tables, info_buffer, + num_page); + + /* If something goes wrong, break and return */ + if (status) { + break; + } + + mem_heap_empty(heap); + chunk_size -= num_to_process; + num_page = 0; + } + } + + mem_heap_free(heap); + + DBUG_RETURN(status); +} + +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.INNODB_BUFFER_PAGE. 
+@return 0 on success, 1 on failure */ +static +int +i_s_innodb_buffer_page_init( +/*========================*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("i_s_innodb_buffer_page_init"); + + schema = reinterpret_cast<ST_SCHEMA_TABLE*>(p); + + schema->fields_info = Show::i_s_innodb_buffer_page_fields_info; + schema->fill_table = i_s_innodb_buffer_page_fill; + + DBUG_RETURN(0); +} + +struct st_maria_plugin i_s_innodb_buffer_page = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_BUFFER_PAGE", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "InnoDB Buffer Page Information", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + i_s_innodb_buffer_page_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + +namespace Show { +static ST_FIELD_INFO i_s_innodb_buf_page_lru_fields_info[] = +{ +#define IDX_BUF_LRU_POOL_ID 0 + Column("POOL_ID", ULong(), NOT_NULL), + +#define IDX_BUF_LRU_POS 1 + Column("LRU_POSITION", ULonglong(), NOT_NULL), + +#define IDX_BUF_LRU_PAGE_SPACE 2 + Column("SPACE", ULong(), NOT_NULL), + +#define IDX_BUF_LRU_PAGE_NUM 3 + Column("PAGE_NUMBER", ULong(), NOT_NULL), + +#define IDX_BUF_LRU_PAGE_TYPE 4 + Column("PAGE_TYPE", Varchar(64), NULLABLE), + +#define IDX_BUF_LRU_PAGE_FLUSH_TYPE 5 + Column("FLUSH_TYPE", ULong(), NOT_NULL), + +#define IDX_BUF_LRU_PAGE_FIX_COUNT 6 + Column("FIX_COUNT", ULong(), NOT_NULL), + +#ifdef BTR_CUR_HASH_ADAPT +#define IDX_BUF_LRU_PAGE_HASHED 7 + Column("IS_HASHED", SLong(1), NOT_NULL), +#endif /* BTR_CUR_HASH_ADAPT */ +#define IDX_BUF_LRU_PAGE_NEWEST_MOD 7 + I_S_AHI + Column("NEWEST_MODIFICATION",ULonglong(), NOT_NULL), + +#define IDX_BUF_LRU_PAGE_OLDEST_MOD 8 + I_S_AHI + Column("OLDEST_MODIFICATION",ULonglong(), NOT_NULL), + +#define IDX_BUF_LRU_PAGE_ACCESS_TIME 9 + I_S_AHI + Column("ACCESS_TIME",ULonglong(), NOT_NULL), + +#define IDX_BUF_LRU_PAGE_TABLE_NAME 10 + I_S_AHI + Column("TABLE_NAME", Varchar(1024), NULLABLE), + +#define IDX_BUF_LRU_PAGE_INDEX_NAME 11 + I_S_AHI + Column("INDEX_NAME", Varchar(NAME_CHAR_LEN), NULLABLE), + +#define IDX_BUF_LRU_PAGE_NUM_RECS 12 + I_S_AHI + Column("NUMBER_RECORDS", ULonglong(), NOT_NULL), + +#define IDX_BUF_LRU_PAGE_DATA_SIZE 13 + I_S_AHI + Column("DATA_SIZE", ULonglong(), NOT_NULL), + +#define IDX_BUF_LRU_PAGE_ZIP_SIZE 14 + I_S_AHI + Column("COMPRESSED_SIZE",ULonglong(), NOT_NULL), + +#define IDX_BUF_LRU_PAGE_STATE 15 + I_S_AHI + Column("COMPRESSED", SLong(1), NOT_NULL), + +#define IDX_BUF_LRU_PAGE_IO_FIX 16 + I_S_AHI + Column("IO_FIX", Enum(&io_values_typelib), NOT_NULL), + +#define IDX_BUF_LRU_PAGE_IS_OLD 17 + I_S_AHI + Column("IS_OLD", SLong(1), NULLABLE), + +#define IDX_BUF_LRU_PAGE_FREE_CLOCK 18 + I_S_AHI + Column("FREE_PAGE_CLOCK", ULonglong(), NOT_NULL), + + CEnd() +}; +} // namespace Show + +/*******************************************************************//** +Fill Information Schema table INNODB_BUFFER_PAGE_LRU with information +cached in the buf_page_info_t array +@return 0 on success, 1 on failure */ +static +int 
+i_s_innodb_buf_page_lru_fill( +/*=========================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + const buf_page_info_t* info_array, /*!< in: array cached page + info */ + ulint num_page) /*!< in: number of page info + cached */ +{ + DBUG_ENTER("i_s_innodb_buf_page_lru_fill"); + + TABLE* table = tables->table; + Field** fields = table->field; + + /* Iterate through the cached array and fill the I_S table rows */ + for (ulint i = 0; i < num_page; i++) { + const buf_page_info_t* page_info; + char table_name[MAX_FULL_NAME_LEN + 1]; + const char* table_name_end = NULL; + + page_info = info_array + i; + + OK(fields[IDX_BUF_LRU_POOL_ID]->store(0, true)); + + OK(fields[IDX_BUF_LRU_POS]->store( + page_info->block_id, true)); + + OK(fields[IDX_BUF_LRU_PAGE_SPACE]->store( + page_info->id.space(), true)); + + OK(fields[IDX_BUF_LRU_PAGE_NUM]->store( + page_info->id.page_no(), true)); + + OK(field_store_string( + fields[IDX_BUF_LRU_PAGE_TYPE], + i_s_page_type[page_info->page_type].type_str)); + + OK(fields[IDX_BUF_LRU_PAGE_FLUSH_TYPE]->store(0, true)); + + OK(fields[IDX_BUF_LRU_PAGE_FIX_COUNT]->store( + ~buf_page_t::LRU_MASK & page_info->state, true)); + +#ifdef BTR_CUR_HASH_ADAPT + OK(fields[IDX_BUF_LRU_PAGE_HASHED]->store( + page_info->hashed, true)); +#endif /* BTR_CUR_HASH_ADAPT */ + + OK(fields[IDX_BUF_LRU_PAGE_NEWEST_MOD]->store( + page_info->newest_mod, true)); + + OK(fields[IDX_BUF_LRU_PAGE_OLDEST_MOD]->store( + page_info->oldest_mod, true)); + + OK(fields[IDX_BUF_LRU_PAGE_ACCESS_TIME]->store( + page_info->access_time, true)); + + fields[IDX_BUF_LRU_PAGE_TABLE_NAME]->set_null(); + + fields[IDX_BUF_LRU_PAGE_INDEX_NAME]->set_null(); + + /* If this is an index page, fetch the index name + and table name */ + if (page_info->page_type == I_S_PAGE_TYPE_INDEX) { + bool ret = false; + + dict_sys.freeze(SRW_LOCK_CALL); + + const dict_index_t* index = + dict_index_get_if_in_cache_low( + page_info->index_id); + + if (index) { + table_name_end = innobase_convert_name( + table_name, sizeof(table_name), + index->table->name.m_name, + strlen(index->table->name.m_name), + thd); + + ret = fields[IDX_BUF_LRU_PAGE_TABLE_NAME] + ->store(table_name, + static_cast<uint>( + table_name_end + - table_name), + system_charset_info) + || fields[IDX_BUF_LRU_PAGE_INDEX_NAME] + ->store(index->name, + uint(strlen(index->name)), + system_charset_info); + } + + dict_sys.unfreeze(); + + OK(ret); + + if (index) { + fields[IDX_BUF_LRU_PAGE_TABLE_NAME] + ->set_notnull(); + fields[IDX_BUF_LRU_PAGE_INDEX_NAME] + ->set_notnull(); + } + } + + OK(fields[IDX_BUF_LRU_PAGE_NUM_RECS]->store( + page_info->num_recs, true)); + + OK(fields[IDX_BUF_LRU_PAGE_DATA_SIZE]->store( + page_info->data_size, true)); + + OK(fields[IDX_BUF_LRU_PAGE_ZIP_SIZE]->store( + page_info->zip_ssize + ? 
512 << page_info->zip_ssize : 0, true)); + + OK(fields[IDX_BUF_LRU_PAGE_STATE]->store( + page_info->compressed_only, true)); + + static_assert(buf_page_t::UNFIXED == 1U << 29, "comp."); + static_assert(buf_page_t::READ_FIX == 4U << 29, "comp."); + static_assert(buf_page_t::WRITE_FIX == 5U << 29, "comp."); + + unsigned io_fix = page_info->state >> 29; + if (io_fix < 4) { + io_fix = 1; + } else if (io_fix > 5) { + io_fix = 3; + } else { + io_fix -= 2; + } + + OK(fields[IDX_BUF_LRU_PAGE_IO_FIX]->store(io_fix, true)); + + OK(fields[IDX_BUF_LRU_PAGE_IS_OLD]->store( + page_info->is_old, true)); + + OK(fields[IDX_BUF_LRU_PAGE_FREE_CLOCK]->store( + page_info->freed_page_clock, true)); + + OK(schema_table_store_record(thd, table)); + } + + DBUG_RETURN(0); +} + +/** Fill the table INFORMATION_SCHEMA.INNODB_BUFFER_PAGE_LRU. +@param[in] thd thread +@param[in,out] tables tables to fill +@return 0 on success, 1 on failure */ +static int i_s_innodb_fill_buffer_lru(THD *thd, TABLE_LIST *tables, Item *) +{ + int status = 0; + buf_page_info_t* info_buffer; + ulint lru_pos = 0; + const buf_page_t* bpage; + ulint lru_len; + + DBUG_ENTER("i_s_innodb_fill_buffer_lru"); + + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); + + /* deny access to any users that do not hold PROCESS_ACL */ + if (check_global_access(thd, PROCESS_ACL)) { + DBUG_RETURN(0); + } + + /* Acquire the mutex before allocating info_buffer, since + UT_LIST_GET_LEN(buf_pool.LRU) could change */ + mysql_mutex_lock(&buf_pool.mutex); + + lru_len = UT_LIST_GET_LEN(buf_pool.LRU); + + /* MY_WME makes my_malloc() print an error message if the + allocation fails */ + info_buffer = (buf_page_info_t*) my_malloc(PSI_INSTRUMENT_ME, + lru_len * sizeof *info_buffer, MYF(MY_WME | MY_ZEROFILL)); + + if (!info_buffer) { + status = 1; + goto exit; + } + + /* Walk through the buffer pool's LRU list and collect the buffer + page information */ + bpage = UT_LIST_GET_LAST(buf_pool.LRU); + + while (bpage != NULL) { + /* Use the same function that collects buffer info for + INNODB_BUFFER_PAGE to get buffer page info */ + i_s_innodb_buffer_page_get_info(bpage, lru_pos, + (info_buffer + lru_pos)); + + bpage = UT_LIST_GET_PREV(LRU, bpage); + + lru_pos++; + } + + ut_ad(lru_pos == lru_len); + ut_ad(lru_pos == UT_LIST_GET_LEN(buf_pool.LRU)); + +exit: + mysql_mutex_unlock(&buf_pool.mutex); + + if (info_buffer) { + status = i_s_innodb_buf_page_lru_fill( + thd, tables, info_buffer, lru_len); + + my_free(info_buffer); + } + + DBUG_RETURN(status); +} + +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.INNODB_BUFFER_PAGE_LRU. 
+@return 0 on success, 1 on failure */ +static +int +i_s_innodb_buffer_page_lru_init( +/*============================*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("i_s_innodb_buffer_page_lru_init"); + + schema = reinterpret_cast<ST_SCHEMA_TABLE*>(p); + + schema->fields_info = Show::i_s_innodb_buf_page_lru_fields_info; + schema->fill_table = i_s_innodb_fill_buffer_lru; + + DBUG_RETURN(0); +} + +struct st_maria_plugin i_s_innodb_buffer_page_lru = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_BUFFER_PAGE_LRU", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "InnoDB Buffer Page in LRU", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + i_s_innodb_buffer_page_lru_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + +/*******************************************************************//** +Unbind a dynamic INFORMATION_SCHEMA table. +@return 0 */ +static int i_s_common_deinit(void*) +{ + DBUG_ENTER("i_s_common_deinit"); + + /* Do nothing */ + + DBUG_RETURN(0); +} + +static const LEX_CSTRING row_format_values[] = +{ + { STRING_WITH_LEN("Redundant") }, + { STRING_WITH_LEN("Compact") }, + { STRING_WITH_LEN("Compressed") }, + { STRING_WITH_LEN("Dynamic") } +}; + +static TypelibBuffer<4> row_format_values_typelib(row_format_values); + +static const LEX_CSTRING space_type_values[] = +{ + { STRING_WITH_LEN("Single") }, + { STRING_WITH_LEN("System") } +}; + +static TypelibBuffer<2> space_type_values_typelib(space_type_values); + +namespace Show { +/** SYS_TABLES ***************************************************/ +/* Fields of the dynamic table INFORMATION_SCHEMA.SYS_TABLES */ +static ST_FIELD_INFO innodb_sys_tables_fields_info[]= +{ +#define SYS_TABLES_ID 0 + Column("TABLE_ID", ULonglong(), NOT_NULL), + +#define SYS_TABLES_NAME 1 + Column("NAME", Varchar(MAX_FULL_NAME_LEN + 1), NOT_NULL), + +#define SYS_TABLES_FLAG 2 + Column("FLAG", SLong(), NOT_NULL), + +#define SYS_TABLES_NUM_COLUMN 3 + Column("N_COLS", ULong(), NOT_NULL), + +#define SYS_TABLES_SPACE 4 + Column("SPACE", ULong(), NOT_NULL), + +#define SYS_TABLES_ROW_FORMAT 5 + Column("ROW_FORMAT", Enum(&row_format_values_typelib), NULLABLE), + +#define SYS_TABLES_ZIP_PAGE_SIZE 6 + Column("ZIP_PAGE_SIZE", ULong(), NOT_NULL), + +#define SYS_TABLES_SPACE_TYPE 7 + Column("SPACE_TYPE", Enum(&space_type_values_typelib), NULLABLE), + + CEnd() +}; +} // namespace Show + +/**********************************************************************//** +Populate information_schema.innodb_sys_tables table with information +from SYS_TABLES. 
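+ROW_FORMAT is derived from the table flags: Redundant when the COMPACT flag +is clear, Compact without atomic BLOBs, Compressed when ZIP_SSIZE is nonzero, +and Dynamic otherwise.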
+@return 0 on success */ +static +int +i_s_dict_fill_sys_tables( +/*=====================*/ + THD* thd, /*!< in: thread */ + dict_table_t* table, /*!< in: table */ + TABLE* table_to_fill) /*!< in/out: fill this table */ +{ + Field** fields; + ulint compact = DICT_TF_GET_COMPACT(table->flags); + ulint atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS( + table->flags); + const ulint zip_size = dict_tf_get_zip_size(table->flags); + const char* row_format; + + if (!compact) { + row_format = "Redundant"; + } else if (!atomic_blobs) { + row_format = "Compact"; + } else if (DICT_TF_GET_ZIP_SSIZE(table->flags)) { + row_format = "Compressed"; + } else { + row_format = "Dynamic"; + } + + DBUG_ENTER("i_s_dict_fill_sys_tables"); + + fields = table_to_fill->field; + + OK(fields[SYS_TABLES_ID]->store(longlong(table->id), TRUE)); + + OK(field_store_string(fields[SYS_TABLES_NAME], table->name.m_name)); + + OK(fields[SYS_TABLES_FLAG]->store(table->flags)); + + OK(fields[SYS_TABLES_NUM_COLUMN]->store(table->n_cols)); + + OK(fields[SYS_TABLES_SPACE]->store(table->space_id, true)); + + OK(field_store_string(fields[SYS_TABLES_ROW_FORMAT], row_format)); + + OK(fields[SYS_TABLES_ZIP_PAGE_SIZE]->store(zip_size, true)); + + OK(field_store_string(fields[SYS_TABLES_SPACE_TYPE], + table->space_id ? "Single" : "System")); + + OK(schema_table_store_record(thd, table_to_fill)); + + DBUG_RETURN(0); +} + +/** Convert one SYS_TABLES record to dict_table_t. +@param pcur persistent cursor position on SYS_TABLES record +@param mtr mini-transaction (nullptr=use the dict_sys cache) +@param rec record to read from (nullptr=use the dict_sys cache) +@param table the converted dict_table_t +@return error message +@retval nullptr on success */ +static const char *i_s_sys_tables_rec(const btr_pcur_t &pcur, mtr_t *mtr, + const rec_t *rec, dict_table_t **table) +{ + static_assert(DICT_FLD__SYS_TABLES__NAME == 0, "compatibility"); + size_t len; + if (rec_get_1byte_offs_flag(pcur.old_rec)) + { + len= rec_1_get_field_end_info(pcur.old_rec, 0); + if (len & REC_1BYTE_SQL_NULL_MASK) + return "corrupted SYS_TABLES.NAME"; + } + else + { + len= rec_2_get_field_end_info(pcur.old_rec, 0); + static_assert(REC_2BYTE_EXTERN_MASK == 16384, "compatibility"); + if (len >= REC_2BYTE_EXTERN_MASK) + return "corrupted SYS_TABLES.NAME"; + } + + if (rec) + return dict_load_table_low(mtr, false, rec, table); + + *table= dict_sys.load_table + (span<const char>{reinterpret_cast<const char*>(pcur.old_rec), len}); + return *table ? 
nullptr : "Table not found in cache"; +} + +/*******************************************************************//** +Function to go through each record in SYS_TABLES table, and fill the +information_schema.innodb_sys_tables table with related table information +@return 0 on success */ +static +int +i_s_sys_tables_fill_table( +/*======================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* ) /*!< in: condition (not used) */ +{ + btr_pcur_t pcur; + mtr_t mtr; + + DBUG_ENTER("i_s_sys_tables_fill_table"); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); + + /* deny access to user without PROCESS_ACL privilege */ + if (check_global_access(thd, PROCESS_ACL)) { + DBUG_RETURN(0); + } + + mtr.start(); + dict_sys.lock(SRW_LOCK_CALL); + + for (const rec_t *rec = dict_startscan_system(&pcur, &mtr, + dict_sys.sys_tables); + rec; rec = dict_getnext_system(&pcur, &mtr)) { + if (rec_get_deleted_flag(rec, 0)) { + continue; + } + + const char* err_msg; + dict_table_t* table_rec; + + /* Create and populate a dict_table_t structure with + information from SYS_TABLES row */ + err_msg = i_s_sys_tables_rec(pcur, &mtr, rec, &table_rec); + mtr.commit(); + dict_sys.unlock(); + + if (!err_msg) { + i_s_dict_fill_sys_tables(thd, table_rec, + tables->table); + } else { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_CANT_FIND_SYSTEM_REC, "%s", + err_msg); + } + + if (table_rec) { + dict_mem_table_free(table_rec); + } + + /* Get the next record */ + mtr.start(); + dict_sys.lock(SRW_LOCK_CALL); + } + + mtr.commit(); + dict_sys.unlock(); + + DBUG_RETURN(0); +} + +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_tables +@return 0 on success */ +static +int +innodb_sys_tables_init( +/*===================*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("innodb_sys_tables_init"); + + schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::innodb_sys_tables_fields_info; + schema->fill_table = i_s_sys_tables_fill_table; + + DBUG_RETURN(0); +} + +struct st_maria_plugin i_s_innodb_sys_tables = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_SYS_TABLES", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "InnoDB SYS_TABLES", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + innodb_sys_tables_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + +namespace Show { +/** SYS_TABLESTATS ***********************************************/ +/* Fields of the dynamic table INFORMATION_SCHEMA.SYS_TABLESTATS */ +static ST_FIELD_INFO innodb_sys_tablestats_fields_info[]= +{ +#define SYS_TABLESTATS_ID 0 + Column("TABLE_ID", ULonglong(), NOT_NULL), + +#define SYS_TABLESTATS_NAME 1 + Column("NAME", Varchar(NAME_CHAR_LEN), NOT_NULL), + +#define SYS_TABLESTATS_INIT 2 + Column("STATS_INITIALIZED", SLong(1), NOT_NULL), + +#define SYS_TABLESTATS_NROW 3 + Column("NUM_ROWS", ULonglong(), NOT_NULL), + 
+#define SYS_TABLESTATS_CLUST_SIZE 4 + Column("CLUST_INDEX_SIZE", ULonglong(), NOT_NULL), + +#define SYS_TABLESTATS_INDEX_SIZE 5 + Column("OTHER_INDEX_SIZE", ULonglong(), NOT_NULL), + +#define SYS_TABLESTATS_MODIFIED 6 + Column("MODIFIED_COUNTER", ULonglong(), NOT_NULL), + +#define SYS_TABLESTATS_AUTONINC 7 + Column("AUTOINC", ULonglong(), NOT_NULL), + +#define SYS_TABLESTATS_TABLE_REF_COUNT 8 + Column("REF_COUNT", SLong(), NOT_NULL), + + CEnd() +}; +} // namespace Show + +/** Populate information_schema.innodb_sys_tablestats table with a table, +and release exclusive dict_sys.latch. +@param[in] thd connection +@param[in,out] table InnoDB table metadata +@param[in,out] table_to_fill INFORMATION_SCHEMA.INNODB_SYS_TABLESTATS +@return 0 on success */ +static +int +i_s_dict_fill_sys_tablestats(THD* thd, dict_table_t *table, + TABLE* table_to_fill) +{ + DBUG_ENTER("i_s_dict_fill_sys_tablestats"); + + Field **fields= table_to_fill->field; + + { + table->stats_mutex_lock(); + auto _ = make_scope_exit([table]() { + table->stats_mutex_unlock(); dict_sys.unlock(); }); + + OK(fields[SYS_TABLESTATS_ID]->store(longlong(table->id), TRUE)); + + OK(field_store_string(fields[SYS_TABLESTATS_NAME], + table->name.m_name)); + OK(fields[SYS_TABLESTATS_INIT]->store(table->stat_initialized, true)); + + if (table->stat_initialized) + { + OK(fields[SYS_TABLESTATS_NROW]->store(table->stat_n_rows, true)); + + OK(fields[SYS_TABLESTATS_CLUST_SIZE]-> + store(table->stat_clustered_index_size, true)); + + OK(fields[SYS_TABLESTATS_INDEX_SIZE]-> + store(table->stat_sum_of_other_index_sizes, true)); + + OK(fields[SYS_TABLESTATS_MODIFIED]-> + store(table->stat_modified_counter, true)); + } + else + { + OK(fields[SYS_TABLESTATS_NROW]->store(0, true)); + OK(fields[SYS_TABLESTATS_CLUST_SIZE]->store(0, true)); + OK(fields[SYS_TABLESTATS_INDEX_SIZE]->store(0, true)); + OK(fields[SYS_TABLESTATS_MODIFIED]->store(0, true)); + } + + OK(fields[SYS_TABLESTATS_AUTONINC]->store(table->autoinc, true)); + + OK(fields[SYS_TABLESTATS_TABLE_REF_COUNT]-> + store(table->get_ref_count(), true)); + } + + OK(schema_table_store_record(thd, table_to_fill)); + DBUG_RETURN(0); +} + +/*******************************************************************//** +Function to go through each record in SYS_TABLES table, and fill the +information_schema.innodb_sys_tablestats table with table statistics +related information +@return 0 on success */ +static +int +i_s_sys_tables_fill_table_stats( +/*============================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* ) /*!< in: condition (not used) */ +{ + btr_pcur_t pcur; + const rec_t* rec; + mtr_t mtr; + + DBUG_ENTER("i_s_sys_tables_fill_table_stats"); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); + + /* deny access to user without PROCESS_ACL privilege */ + if (check_global_access(thd, PROCESS_ACL)) { + DBUG_RETURN(0); + } + + mtr.start(); + dict_sys.lock(SRW_LOCK_CALL); + + rec = dict_startscan_system(&pcur, &mtr, dict_sys.sys_tables); + + while (rec) { + const char* err_msg; + dict_table_t* table_rec = nullptr; + + mtr.commit(); + /* Fetch the dict_table_t structure corresponding to + this SYS_TABLES record */ + err_msg = i_s_sys_tables_rec(pcur, nullptr, nullptr, + &table_rec); + + if (UNIV_LIKELY(!err_msg)) { + i_s_dict_fill_sys_tablestats(thd, table_rec, + tables->table); + } else { + ut_ad(!table_rec); + dict_sys.unlock(); + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_CANT_FIND_SYSTEM_REC, "%s", + err_msg); + } + + /* 
Get the next record */ + mtr.start(); + dict_sys.lock(SRW_LOCK_CALL); + + rec = dict_getnext_system(&pcur, &mtr); + } + + mtr.commit(); + dict_sys.unlock(); + + DBUG_RETURN(0); +} + +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_tablestats +@return 0 on success */ +static +int +innodb_sys_tablestats_init( +/*=======================*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("innodb_sys_tablestats_init"); + + schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::innodb_sys_tablestats_fields_info; + schema->fill_table = i_s_sys_tables_fill_table_stats; + + DBUG_RETURN(0); +} + +struct st_maria_plugin i_s_innodb_sys_tablestats = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_SYS_TABLESTATS", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "InnoDB SYS_TABLESTATS", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + innodb_sys_tablestats_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + +namespace Show { +/** SYS_INDEXES **************************************************/ +/* Fields of the dynamic table INFORMATION_SCHEMA.SYS_INDEXES */ +static ST_FIELD_INFO innodb_sysindex_fields_info[]= +{ +#define SYS_INDEX_ID 0 + Column("INDEX_ID", ULonglong(), NOT_NULL), + +#define SYS_INDEX_NAME 1 + Column("NAME", Varchar(NAME_CHAR_LEN), NOT_NULL), + +#define SYS_INDEX_TABLE_ID 2 + Column("TABLE_ID", ULonglong(), NOT_NULL), + +#define SYS_INDEX_TYPE 3 + Column("TYPE", SLong(), NOT_NULL), + +#define SYS_INDEX_NUM_FIELDS 4 + Column("N_FIELDS", SLong(), NOT_NULL), + +#define SYS_INDEX_PAGE_NO 5 + Column("PAGE_NO", SLong(), NULLABLE), + +#define SYS_INDEX_SPACE 6 + Column("SPACE", SLong(), NULLABLE), + +#define SYS_INDEX_MERGE_THRESHOLD 7 + Column("MERGE_THRESHOLD", SLong(), NOT_NULL), + + CEnd() +}; +} // namespace Show + +/**********************************************************************//** +Function to populate the information_schema.innodb_sys_indexes table with +collected index information +@return 0 on success */ +static +int +i_s_dict_fill_sys_indexes( +/*======================*/ + THD* thd, /*!< in: thread */ + table_id_t table_id, /*!< in: table id */ + ulint space_id, /*!< in: tablespace id */ + dict_index_t* index, /*!< in: populated dict_index_t + struct with index info */ + TABLE* table_to_fill) /*!< in/out: fill this table */ +{ + Field** fields; + + DBUG_ENTER("i_s_dict_fill_sys_indexes"); + + fields = table_to_fill->field; + + if (*index->name == *TEMP_INDEX_PREFIX_STR) { + /* Since TEMP_INDEX_PREFIX_STR is not valid UTF-8, we + need to convert it to something else. 
*/
+		*const_cast<char*>(index->name()) = '?';
+	}
+
+	OK(fields[SYS_INDEX_NAME]->store(index->name,
+					 uint(strlen(index->name)),
+					 system_charset_info));
+
+	OK(fields[SYS_INDEX_ID]->store(longlong(index->id), true));
+
+	OK(fields[SYS_INDEX_TABLE_ID]->store(longlong(table_id), true));
+
+	OK(fields[SYS_INDEX_TYPE]->store(index->type, true));
+
+	OK(fields[SYS_INDEX_NUM_FIELDS]->store(index->n_fields));
+
+	/* FIL_NULL is ULINT32_UNDEFINED */
+	if (index->page == FIL_NULL) {
+		fields[SYS_INDEX_PAGE_NO]->set_null();
+	} else {
+		fields[SYS_INDEX_PAGE_NO]->set_notnull();
+		OK(fields[SYS_INDEX_PAGE_NO]->store(index->page, true));
+	}
+
+	if (space_id == FIL_NULL) {
+		fields[SYS_INDEX_SPACE]->set_null();
+	} else {
+		fields[SYS_INDEX_SPACE]->set_notnull();
+		OK(fields[SYS_INDEX_SPACE]->store(space_id, true));
+	}
+
+	OK(fields[SYS_INDEX_MERGE_THRESHOLD]->store(index->merge_threshold,
+						    true));
+
+	OK(schema_table_store_record(thd, table_to_fill));
+
+	DBUG_RETURN(0);
+}
+/*******************************************************************//**
+Function to go through each record in the SYS_INDEXES table, and fill the
+information_schema.innodb_sys_indexes table with related index information
+@return 0 on success */
+static
+int
+i_s_sys_indexes_fill_table(
+/*=======================*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		)	/*!< in: condition (not used) */
+{
+	btr_pcur_t	pcur;
+	const rec_t*	rec;
+	mem_heap_t*	heap;
+	mtr_t		mtr;
+
+	DBUG_ENTER("i_s_sys_indexes_fill_table");
+	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
+
+	/* deny access to user without PROCESS_ACL privilege */
+	if (check_global_access(thd, PROCESS_ACL)) {
+		DBUG_RETURN(0);
+	}
+
+	heap = mem_heap_create(1000);
+	dict_sys.lock(SRW_LOCK_CALL);
+	mtr_start(&mtr);
+
+	/* Start the scan of the SYS_INDEXES table */
+	rec = dict_startscan_system(&pcur, &mtr, dict_sys.sys_indexes);
+
+	/* Process each record in the table */
+	while (rec) {
+		const char*	err_msg;
+		table_id_t	table_id;
+		ulint		space_id;
+		dict_index_t	index_rec;
+
+		/* Populate a dict_index_t structure with information from
+		a SYS_INDEXES row */
+		err_msg = dict_process_sys_indexes_rec(heap, rec, &index_rec,
+						       &table_id);
+		const byte* field = rec_get_nth_field_old(
+			rec, DICT_FLD__SYS_INDEXES__SPACE, &space_id);
+		space_id = space_id == 4 ?
mach_read_from_4(field) + : ULINT_UNDEFINED; + mtr.commit(); + dict_sys.unlock(); + + if (!err_msg) { + if (int err = i_s_dict_fill_sys_indexes( + thd, table_id, space_id, &index_rec, + tables->table)) { + mem_heap_free(heap); + DBUG_RETURN(err); + } + } else { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_CANT_FIND_SYSTEM_REC, "%s", + err_msg); + } + + mem_heap_empty(heap); + + /* Get the next record */ + mtr.start(); + dict_sys.lock(SRW_LOCK_CALL); + rec = dict_getnext_system(&pcur, &mtr); + } + + mtr.commit(); + dict_sys.unlock(); + mem_heap_free(heap); + + DBUG_RETURN(0); +} +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_indexes +@return 0 on success */ +static +int +innodb_sys_indexes_init( +/*====================*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("innodb_sys_indexes_init"); + + schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::innodb_sysindex_fields_info; + schema->fill_table = i_s_sys_indexes_fill_table; + + DBUG_RETURN(0); +} + +struct st_maria_plugin i_s_innodb_sys_indexes = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_SYS_INDEXES", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "InnoDB SYS_INDEXES", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + innodb_sys_indexes_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + +namespace Show { +/** SYS_COLUMNS **************************************************/ +/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_COLUMNS */ +static ST_FIELD_INFO innodb_sys_columns_fields_info[]= +{ +#define SYS_COLUMN_TABLE_ID 0 + Column("TABLE_ID", ULonglong(), NOT_NULL), + +#define SYS_COLUMN_NAME 1 + Column("NAME", Varchar(NAME_CHAR_LEN), NOT_NULL), + +#define SYS_COLUMN_POSITION 2 + Column("POS", ULonglong(), NOT_NULL), + +#define SYS_COLUMN_MTYPE 3 + Column("MTYPE", SLong(), NOT_NULL), + +#define SYS_COLUMN__PRTYPE 4 + Column("PRTYPE", SLong(), NOT_NULL), + +#define SYS_COLUMN_COLUMN_LEN 5 + Column("LEN", SLong(), NOT_NULL), + + CEnd() +}; +} // namespace Show + +/**********************************************************************//** +Function to populate the information_schema.innodb_sys_columns with +related column information +@return 0 on success */ +static +int +i_s_dict_fill_sys_columns( +/*======================*/ + THD* thd, /*!< in: thread */ + table_id_t table_id, /*!< in: table ID */ + const char* col_name, /*!< in: column name */ + dict_col_t* column, /*!< in: dict_col_t struct holding + more column information */ + ulint nth_v_col, /*!< in: virtual column, its + sequence number (nth virtual col) */ + TABLE* table_to_fill) /*!< in/out: fill this table */ +{ + Field** fields; + + DBUG_ENTER("i_s_dict_fill_sys_columns"); + + fields = table_to_fill->field; + + OK(fields[SYS_COLUMN_TABLE_ID]->store((longlong) table_id, TRUE)); + + OK(field_store_string(fields[SYS_COLUMN_NAME], col_name)); + + if 
(column->is_virtual()) { + ulint pos = dict_create_v_col_pos(nth_v_col, column->ind); + OK(fields[SYS_COLUMN_POSITION]->store(pos, true)); + } else { + OK(fields[SYS_COLUMN_POSITION]->store(column->ind, true)); + } + + OK(fields[SYS_COLUMN_MTYPE]->store(column->mtype)); + + OK(fields[SYS_COLUMN__PRTYPE]->store(column->prtype)); + + OK(fields[SYS_COLUMN_COLUMN_LEN]->store(column->len)); + + OK(schema_table_store_record(thd, table_to_fill)); + + DBUG_RETURN(0); +} +/*******************************************************************//** +Function to fill information_schema.innodb_sys_columns with information +collected by scanning SYS_COLUMNS table. +@return 0 on success */ +static +int +i_s_sys_columns_fill_table( +/*=======================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* ) /*!< in: condition (not used) */ +{ + btr_pcur_t pcur; + const rec_t* rec; + const char* col_name; + mem_heap_t* heap; + mtr_t mtr; + + DBUG_ENTER("i_s_sys_columns_fill_table"); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); + + /* deny access to user without PROCESS_ACL privilege */ + if (check_global_access(thd, PROCESS_ACL)) { + DBUG_RETURN(0); + } + + heap = mem_heap_create(1000); + mtr.start(); + dict_sys.lock(SRW_LOCK_CALL); + + rec = dict_startscan_system(&pcur, &mtr, dict_sys.sys_columns); + + while (rec) { + const char* err_msg; + dict_col_t column_rec; + table_id_t table_id; + ulint nth_v_col; + + /* populate a dict_col_t structure with information from + a SYS_COLUMNS row */ + err_msg = dict_process_sys_columns_rec(heap, rec, &column_rec, + &table_id, &col_name, + &nth_v_col); + + mtr.commit(); + dict_sys.unlock(); + + if (!err_msg) { + i_s_dict_fill_sys_columns(thd, table_id, col_name, + &column_rec, nth_v_col, + tables->table); + } else { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_CANT_FIND_SYSTEM_REC, "%s", + err_msg); + } + + mem_heap_empty(heap); + + /* Get the next record */ + mtr.start(); + dict_sys.lock(SRW_LOCK_CALL); + rec = dict_getnext_system(&pcur, &mtr); + } + + mtr.commit(); + dict_sys.unlock(); + mem_heap_free(heap); + + DBUG_RETURN(0); +} + +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_columns +@return 0 on success */ +static +int +innodb_sys_columns_init( +/*====================*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("innodb_sys_columns_init"); + + schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::innodb_sys_columns_fields_info; + schema->fill_table = i_s_sys_columns_fill_table; + + DBUG_RETURN(0); +} + +struct st_maria_plugin i_s_innodb_sys_columns = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_SYS_COLUMNS", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "InnoDB SYS_COLUMNS", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + innodb_sys_columns_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + 
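+/* The dictionary scans in this file (the i_s_sys_*_fill_table()
+functions reading the SYS_* tables) all follow the same latching
+discipline: a SYS_* record is decoded while dict_sys.latch and the
+mini-transaction still protect it, both are released before the decoded
+copy is handed to schema_table_store_record(), and both are re-acquired
+before the persistent cursor is advanced. A minimal sketch of that
+shared pattern follows; the function name and the numbered placeholder
+comments are illustrative only, not part of the dictionary API: */
+#if 0
+static int i_s_sys_scan_outline(THD* thd, TABLE* table_to_fill)
+{
+	btr_pcur_t	pcur;
+	mtr_t		mtr;
+
+	mtr.start();
+	dict_sys.lock(SRW_LOCK_CALL);
+
+	for (const rec_t* rec = dict_startscan_system(&pcur, &mtr,
+						      dict_sys.sys_columns);
+	     rec; rec = dict_getnext_system(&pcur, &mtr)) {
+		/* 1. Decode rec into a private copy while the page is
+		still latched. */
+
+		mtr.commit();
+		dict_sys.unlock();
+
+		/* 2. Store the decoded copy into table_to_fill with
+		schema_table_store_record(), or push a warning if the
+		record could not be decoded. */
+
+		/* 3. Re-latch before advancing the cursor. */
+		mtr.start();
+		dict_sys.lock(SRW_LOCK_CALL);
+	}
+
+	mtr.commit();
+	dict_sys.unlock();
+	return 0;
+}
+#endif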
+namespace Show {
+/** SYS_VIRTUAL **************************************************/
+/** Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_VIRTUAL */
+static ST_FIELD_INFO innodb_sys_virtual_fields_info[]=
+{
+#define SYS_VIRTUAL_TABLE_ID 0
+  Column("TABLE_ID", ULonglong(), NOT_NULL),
+
+#define SYS_VIRTUAL_POS 1
+  Column("POS", ULong(), NOT_NULL),
+
+#define SYS_VIRTUAL_BASE_POS 2
+  Column("BASE_POS", ULong(), NOT_NULL),
+
+  CEnd()
+};
+} // namespace Show
+
+/** Function to populate the information_schema.innodb_sys_virtual with
+related information
+@param[in]	thd		thread
+@param[in]	table_id	table ID
+@param[in]	pos		virtual column position
+@param[in]	base_pos	base column position
+@param[in,out]	table_to_fill	fill this table
+@return 0 on success */
+static
+int
+i_s_dict_fill_sys_virtual(
+	THD*		thd,
+	table_id_t	table_id,
+	ulint		pos,
+	ulint		base_pos,
+	TABLE*		table_to_fill)
+{
+	Field**	fields;
+
+	DBUG_ENTER("i_s_dict_fill_sys_virtual");
+
+	fields = table_to_fill->field;
+
+	OK(fields[SYS_VIRTUAL_TABLE_ID]->store(table_id, true));
+
+	OK(fields[SYS_VIRTUAL_POS]->store(pos, true));
+
+	OK(fields[SYS_VIRTUAL_BASE_POS]->store(base_pos, true));
+
+	OK(schema_table_store_record(thd, table_to_fill));
+
+	DBUG_RETURN(0);
+}
+
+/** Function to fill information_schema.innodb_sys_virtual with information
+collected by scanning the SYS_VIRTUAL table.
+@param[in]	thd	thread
+@param[in,out]	tables	tables to fill
+@param[in]	item	condition (not used)
+@return 0 on success */
+static
+int
+i_s_sys_virtual_fill_table(
+	THD*		thd,
+	TABLE_LIST*	tables,
+	Item*	)
+{
+	btr_pcur_t	pcur;
+	const rec_t*	rec;
+	ulint		pos;
+	ulint		base_pos;
+	mtr_t		mtr;
+
+	DBUG_ENTER("i_s_sys_virtual_fill_table");
+	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str);
+
+	/* deny access to user without PROCESS_ACL privilege */
+	if (check_global_access(thd, PROCESS_ACL) || !dict_sys.sys_virtual) {
+		DBUG_RETURN(0);
+	}
+
+	mtr.start();
+	dict_sys.lock(SRW_LOCK_CALL);
+
+	rec = dict_startscan_system(&pcur, &mtr, dict_sys.sys_virtual);
+
+	while (rec) {
+		const char*	err_msg;
+		table_id_t	table_id;
+
+		/* extract the virtual column position and its base
+		column position from a SYS_VIRTUAL row */
+		err_msg = dict_process_sys_virtual_rec(rec,
+						       &table_id, &pos,
+						       &base_pos);
+
+		mtr.commit();
+		dict_sys.unlock();
+
+		if (!err_msg) {
+			i_s_dict_fill_sys_virtual(thd, table_id, pos, base_pos,
+						  tables->table);
+		} else {
+			push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+					    ER_CANT_FIND_SYSTEM_REC, "%s",
+					    err_msg);
+		}
+
+		/* Get the next record */
+		mtr.start();
+		dict_sys.lock(SRW_LOCK_CALL);
+		rec = dict_getnext_system(&pcur, &mtr);
+	}
+
+	mtr.commit();
+	dict_sys.unlock();
+
+	DBUG_RETURN(0);
+}
+
+/** Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_virtual
+@param[in,out]	p	table schema object
+@return 0 on success */
+static
+int
+innodb_sys_virtual_init(
+	void*	p)
+{
+	ST_SCHEMA_TABLE*	schema;
+
+	DBUG_ENTER("innodb_sys_virtual_init");
+
+	schema = (ST_SCHEMA_TABLE*) p;
+
+	schema->fields_info = Show::innodb_sys_virtual_fields_info;
+	schema->fill_table = i_s_sys_virtual_fill_table;
+
+	DBUG_RETURN(0);
+}
+
+struct st_maria_plugin	i_s_innodb_sys_virtual =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	MYSQL_INFORMATION_SCHEMA_PLUGIN,
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	&i_s_info,
+
+	/* plugin name */
+	/* const char* */
+	"INNODB_SYS_VIRTUAL",
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	plugin_author,
+
+	/* general descriptive text (for
SHOW PLUGINS) */ + /* const char* */ + "InnoDB SYS_VIRTUAL", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + innodb_sys_virtual_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + + +namespace Show { +/** SYS_FIELDS ***************************************************/ +/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_FIELDS */ +static ST_FIELD_INFO innodb_sys_fields_fields_info[]= +{ +#define SYS_FIELD_INDEX_ID 0 + Column("INDEX_ID", ULonglong(), NOT_NULL), + +#define SYS_FIELD_NAME 1 + Column("NAME", Varchar(NAME_CHAR_LEN), NOT_NULL), + +#define SYS_FIELD_POS 2 + Column("POS", ULong(), NOT_NULL), + + CEnd() +}; +} // namespace Show + +/**********************************************************************//** +Function to fill information_schema.innodb_sys_fields with information +collected by scanning SYS_FIELDS table. +@return 0 on success */ +static +int +i_s_dict_fill_sys_fields( +/*=====================*/ + THD* thd, /*!< in: thread */ + index_id_t index_id, /*!< in: index id for the field */ + dict_field_t* field, /*!< in: table */ + ulint pos, /*!< in: Field position */ + TABLE* table_to_fill) /*!< in/out: fill this table */ +{ + Field** fields; + + DBUG_ENTER("i_s_dict_fill_sys_fields"); + + fields = table_to_fill->field; + + OK(fields[SYS_FIELD_INDEX_ID]->store(index_id, true)); + + OK(field_store_string(fields[SYS_FIELD_NAME], field->name)); + + OK(fields[SYS_FIELD_POS]->store(pos, true)); + + OK(schema_table_store_record(thd, table_to_fill)); + + DBUG_RETURN(0); +} +/*******************************************************************//** +Function to go through each record in SYS_FIELDS table, and fill the +information_schema.innodb_sys_fields table with related index field +information +@return 0 on success */ +static +int +i_s_sys_fields_fill_table( +/*======================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* ) /*!< in: condition (not used) */ +{ + btr_pcur_t pcur; + const rec_t* rec; + mem_heap_t* heap; + index_id_t last_id; + mtr_t mtr; + + DBUG_ENTER("i_s_sys_fields_fill_table"); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); + + /* deny access to user without PROCESS_ACL privilege */ + if (check_global_access(thd, PROCESS_ACL)) { + + DBUG_RETURN(0); + } + + heap = mem_heap_create(1000); + mtr.start(); + + /* will save last index id so that we know whether we move to + the next index. 
This is used to calculate prefix length */ + last_id = 0; + + dict_sys.lock(SRW_LOCK_CALL); + rec = dict_startscan_system(&pcur, &mtr, dict_sys.sys_fields); + + while (rec) { + ulint pos; + const char* err_msg; + index_id_t index_id; + dict_field_t field_rec; + + /* Populate a dict_field_t structure with information from + a SYS_FIELDS row */ + err_msg = dict_process_sys_fields_rec(heap, rec, &field_rec, + &pos, &index_id, last_id); + + mtr.commit(); + dict_sys.unlock(); + + if (!err_msg) { + i_s_dict_fill_sys_fields(thd, index_id, &field_rec, + pos, tables->table); + last_id = index_id; + } else { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_CANT_FIND_SYSTEM_REC, "%s", + err_msg); + } + + mem_heap_empty(heap); + + /* Get the next record */ + mtr.start(); + dict_sys.lock(SRW_LOCK_CALL); + rec = dict_getnext_system(&pcur, &mtr); + } + + mtr.commit(); + dict_sys.unlock(); + mem_heap_free(heap); + + DBUG_RETURN(0); +} +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_fields +@return 0 on success */ +static +int +innodb_sys_fields_init( +/*===================*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("innodb_sys_field_init"); + + schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::innodb_sys_fields_fields_info; + schema->fill_table = i_s_sys_fields_fill_table; + + DBUG_RETURN(0); +} + +struct st_maria_plugin i_s_innodb_sys_fields = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_SYS_FIELDS", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "InnoDB SYS_FIELDS", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + innodb_sys_fields_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + +namespace Show { +/** SYS_FOREIGN ********************************************/ +/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_FOREIGN */ +static ST_FIELD_INFO innodb_sys_foreign_fields_info[]= +{ +#define SYS_FOREIGN_ID 0 + Column("ID", Varchar(NAME_LEN + 1), NOT_NULL), + +#define SYS_FOREIGN_FOR_NAME 1 + Column("FOR_NAME", Varchar(NAME_LEN + 1), NOT_NULL), + +#define SYS_FOREIGN_REF_NAME 2 + Column("REF_NAME", Varchar(NAME_LEN + 1), NOT_NULL), + +#define SYS_FOREIGN_NUM_COL 3 + Column("N_COLS", ULong(), NOT_NULL), + +#define SYS_FOREIGN_TYPE 4 + Column("TYPE", ULong(), NOT_NULL), + + CEnd() +}; +} // namespace Show + +/**********************************************************************//** +Function to fill information_schema.innodb_sys_foreign with information +collected by scanning SYS_FOREIGN table. 
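+The N_COLS column is the number of columns in the foreign key, and TYPE is
+the dict_foreign_t::type bitmask of referential action flags (for example,
+1 = ON DELETE CASCADE, 2 = ON DELETE SET NULL, 4 = ON UPDATE CASCADE,
+8 = ON UPDATE SET NULL).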
+@return 0 on success */ +static +int +i_s_dict_fill_sys_foreign( +/*======================*/ + THD* thd, /*!< in: thread */ + dict_foreign_t* foreign, /*!< in: table */ + TABLE* table_to_fill) /*!< in/out: fill this table */ +{ + Field** fields; + + DBUG_ENTER("i_s_dict_fill_sys_foreign"); + + fields = table_to_fill->field; + + OK(field_store_string(fields[SYS_FOREIGN_ID], foreign->id)); + + OK(field_store_string(fields[SYS_FOREIGN_FOR_NAME], + foreign->foreign_table_name)); + + OK(field_store_string(fields[SYS_FOREIGN_REF_NAME], + foreign->referenced_table_name)); + + OK(fields[SYS_FOREIGN_NUM_COL]->store(foreign->n_fields)); + + OK(fields[SYS_FOREIGN_TYPE]->store(foreign->type)); + + OK(schema_table_store_record(thd, table_to_fill)); + + DBUG_RETURN(0); +} + +/*******************************************************************//** +Function to populate INFORMATION_SCHEMA.innodb_sys_foreign table. Loop +through each record in SYS_FOREIGN, and extract the foreign key +information. +@return 0 on success */ +static +int +i_s_sys_foreign_fill_table( +/*=======================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* ) /*!< in: condition (not used) */ +{ + btr_pcur_t pcur; + const rec_t* rec; + mem_heap_t* heap; + mtr_t mtr; + + DBUG_ENTER("i_s_sys_foreign_fill_table"); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); + + /* deny access to user without PROCESS_ACL privilege */ + if (check_global_access(thd, PROCESS_ACL) || !dict_sys.sys_foreign) { + DBUG_RETURN(0); + } + + heap = mem_heap_create(1000); + mtr.start(); + dict_sys.lock(SRW_LOCK_CALL); + + rec = dict_startscan_system(&pcur, &mtr, dict_sys.sys_foreign); + + while (rec) { + const char* err_msg; + dict_foreign_t foreign_rec; + + /* Populate a dict_foreign_t structure with information from + a SYS_FOREIGN row */ + err_msg = dict_process_sys_foreign_rec(heap, rec, &foreign_rec); + + mtr.commit(); + dict_sys.unlock(); + + if (!err_msg) { + i_s_dict_fill_sys_foreign(thd, &foreign_rec, + tables->table); + } else { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_CANT_FIND_SYSTEM_REC, "%s", + err_msg); + } + + mem_heap_empty(heap); + + /* Get the next record */ + mtr.start(); + dict_sys.lock(SRW_LOCK_CALL); + rec = dict_getnext_system(&pcur, &mtr); + } + + mtr.commit(); + dict_sys.unlock(); + mem_heap_free(heap); + + DBUG_RETURN(0); +} + +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_foreign +@return 0 on success */ +static +int +innodb_sys_foreign_init( +/*====================*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("innodb_sys_foreign_init"); + + schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::innodb_sys_foreign_fields_info; + schema->fill_table = i_s_sys_foreign_fill_table; + + DBUG_RETURN(0); +} + +struct st_maria_plugin i_s_innodb_sys_foreign = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_SYS_FOREIGN", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "InnoDB SYS_FOREIGN", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int 
(*)(void*); */ + innodb_sys_foreign_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + +namespace Show { +/** SYS_FOREIGN_COLS ********************************************/ +/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_FOREIGN_COLS */ +static ST_FIELD_INFO innodb_sys_foreign_cols_fields_info[]= +{ +#define SYS_FOREIGN_COL_ID 0 + Column("ID", Varchar(NAME_LEN + 1), NOT_NULL), + +#define SYS_FOREIGN_COL_FOR_NAME 1 + Column("FOR_COL_NAME", Varchar(NAME_CHAR_LEN), NOT_NULL), + +#define SYS_FOREIGN_COL_REF_NAME 2 + Column("REF_COL_NAME", Varchar(NAME_CHAR_LEN), NOT_NULL), + +#define SYS_FOREIGN_COL_POS 3 + Column("POS", ULong(), NOT_NULL), + + CEnd() +}; +} // namespace Show + +/**********************************************************************//** +Function to fill information_schema.innodb_sys_foreign_cols with information +collected by scanning SYS_FOREIGN_COLS table. +@return 0 on success */ +static +int +i_s_dict_fill_sys_foreign_cols( +/*==========================*/ + THD* thd, /*!< in: thread */ + const char* name, /*!< in: foreign key constraint name */ + const char* for_col_name, /*!< in: referencing column name*/ + const char* ref_col_name, /*!< in: referenced column + name */ + ulint pos, /*!< in: column position */ + TABLE* table_to_fill) /*!< in/out: fill this table */ +{ + Field** fields; + + DBUG_ENTER("i_s_dict_fill_sys_foreign_cols"); + + fields = table_to_fill->field; + + OK(field_store_string(fields[SYS_FOREIGN_COL_ID], name)); + + OK(field_store_string(fields[SYS_FOREIGN_COL_FOR_NAME], for_col_name)); + + OK(field_store_string(fields[SYS_FOREIGN_COL_REF_NAME], ref_col_name)); + + OK(fields[SYS_FOREIGN_COL_POS]->store(pos, true)); + + OK(schema_table_store_record(thd, table_to_fill)); + + DBUG_RETURN(0); +} +/*******************************************************************//** +Function to populate INFORMATION_SCHEMA.innodb_sys_foreign_cols table. Loop +through each record in SYS_FOREIGN_COLS, and extract the foreign key column +information and fill the INFORMATION_SCHEMA.innodb_sys_foreign_cols table. 
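+One row is produced for each column of a constraint; POS is the 0-based
+position of the column pair within its foreign key.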
+@return 0 on success */ +static +int +i_s_sys_foreign_cols_fill_table( +/*============================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* ) /*!< in: condition (not used) */ +{ + btr_pcur_t pcur; + const rec_t* rec; + mem_heap_t* heap; + mtr_t mtr; + + DBUG_ENTER("i_s_sys_foreign_cols_fill_table"); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); + + /* deny access to user without PROCESS_ACL privilege */ + if (check_global_access(thd, PROCESS_ACL) + || !dict_sys.sys_foreign_cols) { + DBUG_RETURN(0); + } + + heap = mem_heap_create(1000); + mtr.start(); + dict_sys.lock(SRW_LOCK_CALL); + + rec = dict_startscan_system(&pcur, &mtr, dict_sys.sys_foreign_cols); + + while (rec) { + const char* err_msg; + const char* name; + const char* for_col_name; + const char* ref_col_name; + ulint pos; + + /* Extract necessary information from a SYS_FOREIGN_COLS row */ + err_msg = dict_process_sys_foreign_col_rec( + heap, rec, &name, &for_col_name, &ref_col_name, &pos); + + mtr.commit(); + dict_sys.unlock(); + + if (!err_msg) { + i_s_dict_fill_sys_foreign_cols( + thd, name, for_col_name, ref_col_name, pos, + tables->table); + } else { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + ER_CANT_FIND_SYSTEM_REC, "%s", + err_msg); + } + + mem_heap_empty(heap); + + /* Get the next record */ + mtr.start(); + dict_sys.lock(SRW_LOCK_CALL); + rec = dict_getnext_system(&pcur, &mtr); + } + + mtr.commit(); + dict_sys.unlock(); + mem_heap_free(heap); + + DBUG_RETURN(0); +} +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.innodb_sys_foreign_cols +@return 0 on success */ +static +int +innodb_sys_foreign_cols_init( +/*========================*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("innodb_sys_foreign_cols_init"); + + schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::innodb_sys_foreign_cols_fields_info; + schema->fill_table = i_s_sys_foreign_cols_fill_table; + + DBUG_RETURN(0); +} + +struct st_maria_plugin i_s_innodb_sys_foreign_cols = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_SYS_FOREIGN_COLS", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "InnoDB SYS_FOREIGN_COLS", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + innodb_sys_foreign_cols_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + +namespace Show { +/** SYS_TABLESPACES ********************************************/ +/* Fields of the dynamic table INFORMATION_SCHEMA.INNODB_SYS_TABLESPACES */ +static ST_FIELD_INFO innodb_sys_tablespaces_fields_info[]= +{ +#define SYS_TABLESPACES_SPACE 0 + Column("SPACE", ULong(), NOT_NULL), + +#define SYS_TABLESPACES_NAME 1 + Column("NAME", Varchar(MAX_FULL_NAME_LEN + 1), NOT_NULL), + +#define SYS_TABLESPACES_FLAGS 2 + Column("FLAG", ULong(), NOT_NULL), + +#define SYS_TABLESPACES_ROW_FORMAT 3 + Column("ROW_FORMAT", Varchar(22), NULLABLE), + +#define 
SYS_TABLESPACES_PAGE_SIZE 4
+  Column("PAGE_SIZE", ULong(), NOT_NULL),
+
+#define SYS_TABLESPACES_FILENAME 5
+  Column("FILENAME", Varchar(FN_REFLEN), NOT_NULL),
+
+#define SYS_TABLESPACES_FS_BLOCK_SIZE 6
+  Column("FS_BLOCK_SIZE", ULong(), NOT_NULL),
+
+#define SYS_TABLESPACES_FILE_SIZE 7
+  Column("FILE_SIZE", ULonglong(), NOT_NULL),
+
+#define SYS_TABLESPACES_ALLOC_SIZE 8
+  Column("ALLOCATED_SIZE", ULonglong(), NOT_NULL),
+
+  CEnd()
+};
+} // namespace Show
+
+extern size_t os_file_get_fs_block_size(const char *path);
+
+/** Produce one row of INFORMATION_SCHEMA.INNODB_SYS_TABLESPACES.
+@param thd connection
+@param s tablespace
+@param t output table
+@return 0 on success */
+static int i_s_sys_tablespaces_fill(THD *thd, const fil_space_t &s, TABLE *t)
+{
+  DBUG_ENTER("i_s_sys_tablespaces_fill");
+  const char *row_format;
+
+  if (s.full_crc32() || is_system_tablespace(s.id))
+    row_format= nullptr;
+  else if (FSP_FLAGS_GET_ZIP_SSIZE(s.flags))
+    row_format= "Compressed";
+  else if (FSP_FLAGS_HAS_ATOMIC_BLOBS(s.flags))
+    row_format= "Dynamic";
+  else
+    row_format= "Compact or Redundant";
+
+  Field **fields= t->field;
+
+  OK(fields[SYS_TABLESPACES_SPACE]->store(s.id, true));
+  {
+    Field *f= fields[SYS_TABLESPACES_NAME];
+    const auto name= s.name();
+    if (name.data())
+    {
+      OK(f->store(name.data(), name.size(), system_charset_info));
+      f->set_notnull();
+    }
+    else if (srv_is_undo_tablespace(s.id))
+    {
+      char name[15];
+      snprintf(name, sizeof name, "innodb_undo%03u",
+               (s.id - srv_undo_space_id_start + 1));
+      OK(f->store(name, strlen(name), system_charset_info));
+      f->set_notnull();
+    }
+    else
+      f->set_null();
+  }
+
+  OK(fields[SYS_TABLESPACES_FLAGS]->store(s.flags, true));
+  OK(field_store_string(fields[SYS_TABLESPACES_ROW_FORMAT], row_format));
+  const char *filepath= s.chain.start->name;
+  OK(field_store_string(fields[SYS_TABLESPACES_FILENAME], filepath));
+
+  OK(fields[SYS_TABLESPACES_PAGE_SIZE]->store(s.physical_size(), true));
+  size_t fs_block_size;
+  os_file_size_t file= os_file_get_size(filepath);
+  if (file.m_total_size == os_offset_t(~0))
+  {
+    file.m_total_size= 0;
+    file.m_alloc_size= 0;
+    fs_block_size= 0;
+  }
+  else
+    fs_block_size= os_file_get_fs_block_size(filepath);
+
+  OK(fields[SYS_TABLESPACES_FS_BLOCK_SIZE]->store(fs_block_size, true));
+  OK(fields[SYS_TABLESPACES_FILE_SIZE]->store(file.m_total_size, true));
+  OK(fields[SYS_TABLESPACES_ALLOC_SIZE]->store(file.m_alloc_size, true));
+
+  OK(schema_table_store_record(thd, t));
+
+  DBUG_RETURN(0);
+}
+
+/** Populate INFORMATION_SCHEMA.INNODB_SYS_TABLESPACES.
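+Each tablespace is visited while fil_system.freeze_space_list pins the
+space list; fil_system.mutex is released and only a shared latch on the
+tablespace is held while its row is produced.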
+@param thd connection +@param tables table to fill +@return 0 on success */ +static int i_s_sys_tablespaces_fill_table(THD *thd, TABLE_LIST *tables, Item*) +{ + DBUG_ENTER("i_s_sys_tablespaces_fill_table"); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); + + if (check_global_access(thd, PROCESS_ACL)) + DBUG_RETURN(0); + + int err= 0; + + mysql_mutex_lock(&fil_system.mutex); + fil_system.freeze_space_list++; + + for (fil_space_t &space : fil_system.space_list) + { + if (space.purpose == FIL_TYPE_TABLESPACE && !space.is_stopping() && + space.chain.start) + { + space.reacquire(); + mysql_mutex_unlock(&fil_system.mutex); + space.s_lock(); + err= i_s_sys_tablespaces_fill(thd, space, tables->table); + space.s_unlock(); + mysql_mutex_lock(&fil_system.mutex); + space.release(); + if (err) + break; + } + } + + fil_system.freeze_space_list--; + mysql_mutex_unlock(&fil_system.mutex); + if (err == DB_SUCCESS) + err= i_s_sys_tablespaces_fill(thd, *fil_system.temp_space, tables->table); + DBUG_RETURN(err); +} + +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.INNODB_SYS_TABLESPACES +@return 0 on success */ +static +int +innodb_sys_tablespaces_init( +/*========================*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("innodb_sys_tablespaces_init"); + + schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::innodb_sys_tablespaces_fields_info; + schema->fill_table = i_s_sys_tablespaces_fill_table; + + DBUG_RETURN(0); +} + +struct st_maria_plugin i_s_innodb_sys_tablespaces = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_SYS_TABLESPACES", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + plugin_author, + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "InnoDB tablespaces", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_GPL, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + innodb_sys_tablespaces_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; + +namespace Show { +/** TABLESPACES_ENCRYPTION ********************************************/ +/* Fields of the table INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION */ +static ST_FIELD_INFO innodb_tablespaces_encryption_fields_info[]= +{ +#define TABLESPACES_ENCRYPTION_SPACE 0 + Column("SPACE", ULong(), NOT_NULL), + +#define TABLESPACES_ENCRYPTION_NAME 1 + Column("NAME", Varchar(MAX_FULL_NAME_LEN + 1), NULLABLE), + +#define TABLESPACES_ENCRYPTION_ENCRYPTION_SCHEME 2 + Column("ENCRYPTION_SCHEME", ULong(), NOT_NULL), + +#define TABLESPACES_ENCRYPTION_KEYSERVER_REQUESTS 3 + Column("KEYSERVER_REQUESTS", ULong(), NOT_NULL), + +#define TABLESPACES_ENCRYPTION_MIN_KEY_VERSION 4 + Column("MIN_KEY_VERSION", ULong(), NOT_NULL), + +#define TABLESPACES_ENCRYPTION_CURRENT_KEY_VERSION 5 + Column("CURRENT_KEY_VERSION", ULong(), NOT_NULL), + +#define TABLESPACES_ENCRYPTION_KEY_ROTATION_PAGE_NUMBER 6 + Column("KEY_ROTATION_PAGE_NUMBER", ULonglong(), NULLABLE), + +#define TABLESPACES_ENCRYPTION_KEY_ROTATION_MAX_PAGE_NUMBER 7 + Column("KEY_ROTATION_MAX_PAGE_NUMBER", ULonglong(), NULLABLE), + +#define 
TABLESPACES_ENCRYPTION_CURRENT_KEY_ID 8 + Column("CURRENT_KEY_ID", ULong(), NOT_NULL), + +#define TABLESPACES_ENCRYPTION_ROTATING_OR_FLUSHING 9 + Column("ROTATING_OR_FLUSHING", SLong(1), NOT_NULL), + + CEnd() +}; +} // namespace Show + +/**********************************************************************//** +Function to fill INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION. +@param[in] thd thread handle +@param[in] space Tablespace +@param[in] table_to_fill I_S table to fill +@return 0 on success */ +static +int +i_s_dict_fill_tablespaces_encryption( + THD* thd, + fil_space_t* space, + TABLE* table_to_fill) +{ + Field** fields; + struct fil_space_crypt_status_t status; + DBUG_ENTER("i_s_dict_fill_tablespaces_encryption"); + + fields = table_to_fill->field; + + fil_space_crypt_get_status(space, &status); + + /* If tablespace id does not match, we did not find + encryption information for this tablespace. */ + if (!space->crypt_data || space->id != status.space) { + goto skip; + } + + OK(fields[TABLESPACES_ENCRYPTION_SPACE]->store(space->id, true)); + + { + const auto name = space->name(); + if (name.data()) { + OK(fields[TABLESPACES_ENCRYPTION_NAME]->store( + name.data(), name.size(), + system_charset_info)); + fields[TABLESPACES_ENCRYPTION_NAME]->set_notnull(); + } else if (srv_is_undo_tablespace(space->id)) { + char undo_name[sizeof "innodb_undo000"]; + snprintf(undo_name, sizeof undo_name, + "innodb_undo%03" PRIu32, space->id); + OK(fields[TABLESPACES_ENCRYPTION_NAME]->store( + undo_name, strlen(undo_name), + system_charset_info)); + fields[TABLESPACES_ENCRYPTION_NAME]->set_notnull(); + } else { + fields[TABLESPACES_ENCRYPTION_NAME]->set_null(); + } + } + + OK(fields[TABLESPACES_ENCRYPTION_ENCRYPTION_SCHEME]->store( + status.scheme, true)); + OK(fields[TABLESPACES_ENCRYPTION_KEYSERVER_REQUESTS]->store( + status.keyserver_requests, true)); + OK(fields[TABLESPACES_ENCRYPTION_MIN_KEY_VERSION]->store( + status.min_key_version, true)); + OK(fields[TABLESPACES_ENCRYPTION_CURRENT_KEY_VERSION]->store( + status.current_key_version, true)); + OK(fields[TABLESPACES_ENCRYPTION_CURRENT_KEY_ID]->store( + status.key_id, true)); + OK(fields[TABLESPACES_ENCRYPTION_ROTATING_OR_FLUSHING]->store( + status.rotating || status.flushing, true)); + + if (status.rotating) { + fields[TABLESPACES_ENCRYPTION_KEY_ROTATION_PAGE_NUMBER]->set_notnull(); + OK(fields[TABLESPACES_ENCRYPTION_KEY_ROTATION_PAGE_NUMBER]->store( + status.rotate_next_page_number, true)); + fields[TABLESPACES_ENCRYPTION_KEY_ROTATION_MAX_PAGE_NUMBER]->set_notnull(); + OK(fields[TABLESPACES_ENCRYPTION_KEY_ROTATION_MAX_PAGE_NUMBER]->store( + status.rotate_max_page_number, true)); + } else { + fields[TABLESPACES_ENCRYPTION_KEY_ROTATION_PAGE_NUMBER] + ->set_null(); + fields[TABLESPACES_ENCRYPTION_KEY_ROTATION_MAX_PAGE_NUMBER] + ->set_null(); + } + + OK(schema_table_store_record(thd, table_to_fill)); + +skip: + DBUG_RETURN(0); +} +/*******************************************************************//** +Function to populate INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION table. +Loop through each record in TABLESPACES_ENCRYPTION, and extract the column +information and fill the INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION table. 
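+Tablespaces for which no crypt_data exists are skipped rather than
+reported.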
+@return 0 on success */ +static +int +i_s_tablespaces_encryption_fill_table( +/*===========================*/ + THD* thd, /*!< in: thread */ + TABLE_LIST* tables, /*!< in/out: tables to fill */ + Item* ) /*!< in: condition (not used) */ +{ + DBUG_ENTER("i_s_tablespaces_encryption_fill_table"); + RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name.str); + + /* deny access to user without PROCESS_ACL privilege */ + if (check_global_access(thd, PROCESS_ACL)) { + DBUG_RETURN(0); + } + + int err = 0; + mysql_mutex_lock(&fil_system.mutex); + fil_system.freeze_space_list++; + + for (fil_space_t& space : fil_system.space_list) { + if (space.purpose == FIL_TYPE_TABLESPACE + && !space.is_stopping()) { + space.reacquire(); + mysql_mutex_unlock(&fil_system.mutex); + space.s_lock(); + err = i_s_dict_fill_tablespaces_encryption( + thd, &space, tables->table); + space.s_unlock(); + mysql_mutex_lock(&fil_system.mutex); + space.release(); + if (err) { + break; + } + } + } + + fil_system.freeze_space_list--; + mysql_mutex_unlock(&fil_system.mutex); + DBUG_RETURN(err); +} +/*******************************************************************//** +Bind the dynamic table INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION +@return 0 on success */ +static +int +innodb_tablespaces_encryption_init( +/*========================*/ + void* p) /*!< in/out: table schema object */ +{ + ST_SCHEMA_TABLE* schema; + + DBUG_ENTER("innodb_tablespaces_encryption_init"); + + schema = (ST_SCHEMA_TABLE*) p; + + schema->fields_info = Show::innodb_tablespaces_encryption_fields_info; + schema->fill_table = i_s_tablespaces_encryption_fill_table; + + DBUG_RETURN(0); +} + +struct st_maria_plugin i_s_innodb_tablespaces_encryption = +{ + /* the plugin type (a MYSQL_XXX_PLUGIN value) */ + /* int */ + MYSQL_INFORMATION_SCHEMA_PLUGIN, + + /* pointer to type-specific plugin descriptor */ + /* void* */ + &i_s_info, + + /* plugin name */ + /* const char* */ + "INNODB_TABLESPACES_ENCRYPTION", + + /* plugin author (for SHOW PLUGINS) */ + /* const char* */ + "Google Inc", + + /* general descriptive text (for SHOW PLUGINS) */ + /* const char* */ + "InnoDB TABLESPACES_ENCRYPTION", + + /* the plugin license (PLUGIN_LICENSE_XXX) */ + /* int */ + PLUGIN_LICENSE_BSD, + + /* the function to invoke when plugin is loaded */ + /* int (*)(void*); */ + innodb_tablespaces_encryption_init, + + /* the function to invoke when plugin is unloaded */ + /* int (*)(void*); */ + i_s_common_deinit, + + i_s_version, nullptr, nullptr, PACKAGE_VERSION, + MariaDB_PLUGIN_MATURITY_STABLE +}; diff --git a/storage/innobase/handler/i_s.h b/storage/innobase/handler/i_s.h new file mode 100644 index 00000000..c8190a41 --- /dev/null +++ b/storage/innobase/handler/i_s.h @@ -0,0 +1,91 @@ +/***************************************************************************** + +Copyright (c) 2007, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2014, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file handler/i_s.h +InnoDB INFORMATION SCHEMA tables interface to MySQL. + +Created July 18, 2007 Vasil Dimov +Modified Dec 29, 2014 Jan Lindström +*******************************************************/ + +#ifndef i_s_h +#define i_s_h +#include "dict0types.h" + +const char plugin_author[] = "Oracle Corporation"; +const char maria_plugin_author[] = "MariaDB Corporation"; + +extern struct st_maria_plugin i_s_innodb_trx; +extern struct st_maria_plugin i_s_innodb_locks; +extern struct st_maria_plugin i_s_innodb_lock_waits; +extern struct st_maria_plugin i_s_innodb_cmp; +extern struct st_maria_plugin i_s_innodb_cmp_reset; +extern struct st_maria_plugin i_s_innodb_cmp_per_index; +extern struct st_maria_plugin i_s_innodb_cmp_per_index_reset; +extern struct st_maria_plugin i_s_innodb_cmpmem; +extern struct st_maria_plugin i_s_innodb_cmpmem_reset; +extern struct st_maria_plugin i_s_innodb_metrics; +extern struct st_maria_plugin i_s_innodb_ft_default_stopword; +extern struct st_maria_plugin i_s_innodb_ft_deleted; +extern struct st_maria_plugin i_s_innodb_ft_being_deleted; +extern struct st_maria_plugin i_s_innodb_ft_index_cache; +extern struct st_maria_plugin i_s_innodb_ft_index_table; +extern struct st_maria_plugin i_s_innodb_ft_config; +extern struct st_maria_plugin i_s_innodb_buffer_page; +extern struct st_maria_plugin i_s_innodb_buffer_page_lru; +extern struct st_maria_plugin i_s_innodb_buffer_stats; +extern struct st_maria_plugin i_s_innodb_sys_tables; +extern struct st_maria_plugin i_s_innodb_sys_tablestats; +extern struct st_maria_plugin i_s_innodb_sys_indexes; +extern struct st_maria_plugin i_s_innodb_sys_columns; +extern struct st_maria_plugin i_s_innodb_sys_fields; +extern struct st_maria_plugin i_s_innodb_sys_foreign; +extern struct st_maria_plugin i_s_innodb_sys_foreign_cols; +extern struct st_maria_plugin i_s_innodb_sys_tablespaces; +extern struct st_maria_plugin i_s_innodb_sys_virtual; +extern struct st_maria_plugin i_s_innodb_tablespaces_encryption; + +/** The latest successfully looked up innodb_fts_aux_table */ +extern table_id_t innodb_ft_aux_table_id; + +/** maximum number of buffer page info we would cache. */ +#define MAX_BUF_INFO_CACHED 10000 + +#define OK(expr) \ + if ((expr) != 0) { \ + DBUG_RETURN(1); \ + } + +#define BREAK_IF(expr) if ((expr)) break + +#define RETURN_IF_INNODB_NOT_STARTED(plugin_name) \ +do { \ + if (!srv_was_started) { \ + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, \ + ER_CANT_FIND_SYSTEM_REC, \ + "InnoDB: SELECTing from " \ + "INFORMATION_SCHEMA.%s but " \ + "the InnoDB storage engine " \ + "is not installed", plugin_name); \ + DBUG_RETURN(0); \ + } \ +} while (0) + +#endif /* i_s_h */ diff --git a/storage/innobase/ibuf/ibuf0ibuf.cc b/storage/innobase/ibuf/ibuf0ibuf.cc new file mode 100644 index 00000000..b9e94a67 --- /dev/null +++ b/storage/innobase/ibuf/ibuf0ibuf.cc @@ -0,0 +1,4617 @@ +/***************************************************************************** + +Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2016, 2023, MariaDB Corporation. 
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file ibuf/ibuf0ibuf.cc
+Insert buffer
+
+Created 7/19/1997 Heikki Tuuri
+*******************************************************/
+
+#include "ibuf0ibuf.h"
+#include "btr0sea.h"
+
+/** Number of bits describing a single page */
+#define IBUF_BITS_PER_PAGE 4
+/** The start address for an insert buffer bitmap page bitmap */
+#define IBUF_BITMAP PAGE_DATA
+
+#include "buf0buf.h"
+#include "buf0rea.h"
+#include "fsp0fsp.h"
+#include "trx0sys.h"
+#include "fil0fil.h"
+#include "rem0rec.h"
+#include "btr0cur.h"
+#include "btr0pcur.h"
+#include "btr0btr.h"
+#include "row0upd.h"
+#include "dict0boot.h"
+#include "fut0lst.h"
+#include "lock0lock.h"
+#include "log0recv.h"
+#include "que0que.h"
+#include "srv0start.h" /* srv_shutdown_state */
+#include "rem0cmp.h"
+#include "log.h"
+
+/* STRUCTURE OF AN INSERT BUFFER RECORD
+
+In versions < 4.1.x:
+
+1. The first field is the page number.
+2. The second field is an array which stores type info for each subsequent
+   field. We store the information which affects the ordering of records, and
+   also the physical storage size of an SQL NULL value. E.g., for CHAR(10) it
+   is 10 bytes.
+3. Next we have the fields of the actual index record.
+
+In versions >= 4.1.x:
+
+Note that contrary to what we planned in the 1990's, there will only be one
+insert buffer tree, and that is in the system tablespace of InnoDB.
+
+1. The first field is the space id.
+2. The second field is a one-byte marker (0) which differentiates records from
+   the < 4.1.x storage format.
+3. The third field is the page number.
+4. The fourth field contains the type info, where we have also added 2 bytes to
+   store the charset. In the compressed table format of 5.0.x we must add more
+   information here so that we can build a dummy 'index' struct which 5.0.x
+   can use in the binary search on the index page in the ibuf merge phase.
+5. The rest of the fields contain the fields of the actual index record.
+
+In versions >= 5.0.3:
+
+The first byte of the fourth field is an additional marker (0) if the record
+is in the compact format. The presence of this marker can be detected by
+looking at the length of the field modulo DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE.
+
+The high-order bit of the character set field in the type info is the
+"nullable" flag for the field.
+
+In versions >= 5.5:
+
+The optional marker byte at the start of the fourth field is replaced by
+three mandatory fields, totaling 4 bytes:
+
+  1. 2 bytes: Counter field, used to sort records within a (space id, page
+  no) in the order they were added. This is needed so that for example the
+  sequence of operations "INSERT x, DEL MARK x, INSERT x" is handled
+  correctly.
+
+  2. 1 byte: Operation type (see ibuf_op_t).
+
+  3. 1 byte: Flags.
Currently only one flag exists, IBUF_REC_COMPACT.
+
+To ensure older records, which do not have counters to enforce correct
+sorting, are merged before any new records, ibuf_insert checks if we're
+trying to insert to a position that contains old-style records, and if so,
+refuses the insert. Thus, ibuf pages are gradually converted to the new
+format as their corresponding buffer pool pages are read into memory.
+*/
+
+
+/* PREVENTING DEADLOCKS IN THE INSERT BUFFER SYSTEM
+
+If an OS thread performs any operation that brings in disk pages from
+non-system tablespaces into the buffer pool, or creates such a page there,
+then the operation may have as a side effect an insert buffer index tree
+compression. Thus, the tree latch of the insert buffer tree may be acquired
+in the x-mode, and also the file space latch of the system tablespace may
+be acquired in the x-mode.
+
+Also, an insert to an index in a non-system tablespace can have the same
+effect. How do we know this cannot lead to a deadlock of OS threads? There
+is a problem with the i/o-handler threads: they break the latching order
+because they own x-latches to pages which are on a lower level than the
+insert buffer tree latch, its page latches, and the tablespace latch an
+insert buffer operation can reserve.
+
+The solution is the following: Let all the tree and page latches connected
+with the insert buffer be later in the latching order than the fsp latch and
+fsp page latches.
+
+Insert buffer pages must be such that the insert buffer is never invoked
+when these pages are accessed as this would result in a recursion violating
+the latching order. We let a special i/o-handler thread take care of i/o to
+the insert buffer pages and the ibuf bitmap pages, as well as the fsp bitmap
+pages and the first inode page, which contains the inode of the ibuf tree: let
+us call all these ibuf pages. To prevent deadlocks, we do not let a read-ahead
+access both non-ibuf and ibuf pages.
+
+Then an i/o-handler for the insert buffer never needs to access the insert
+buffer tree recursively and thus obeys the latching order. On the other hand,
+other i/o-handlers for other tablespaces may require access to the insert
+buffer, but because all kinds of latches they need to access there are later
+in the latching order, no violation of the latching order occurs in this case,
+either.
+
+A problem is how to grow and contract an insert buffer tree. As it is later
+in the latching order than the fsp management, we have to reserve the fsp
+latch first, before adding or removing pages from the insert buffer tree.
+We let the insert buffer tree have its own file space management: a free
+list of pages linked to the tree root. To prevent recursive use of the
+insert buffer when adding pages to the tree, we must first load these pages
+to memory, obtaining a latch on them, and only after that add them to the
+free list of the insert buffer tree. Removing pages from the free list is
+more difficult. If there is an excess of pages in the free list of the
+ibuf tree, they might be needed if some thread reserves the fsp latch,
+intending to allocate more file space. So we do the following: if a thread
+reserves the fsp latch, we check the writer count field of the latch. If
+this field has value 1, it means that the thread did not own the latch
+before entering the fsp system, and the mtr of the thread contains no
+modifications to the fsp pages. Now we are free to reserve the ibuf latch,
+and check if there is an excess of pages in the free list.
We can then, in a +separate mini-transaction, take them out of the free list and free them to +the fsp system. + +To avoid deadlocks in the ibuf system, we divide file pages into three levels: + +(1) non-ibuf pages, +(2) ibuf tree pages and the pages in the ibuf tree free list, and +(3) ibuf bitmap pages. + +No OS thread is allowed to access higher level pages if it has latches to +lower level pages; even if the thread owns a B-tree latch it must not access +the B-tree non-leaf pages if it has latches on lower level pages. Read-ahead +is only allowed for level 1 and 2 pages. Dedicated i/o-handler threads handle +exclusively level 1 i/o. A dedicated i/o handler thread handles exclusively +level 2 i/o. However, if an OS thread does the i/o handling for itself, i.e., +it uses synchronous aio, it can access any pages, as long as it obeys the +access order rules. */ + +/** Operations that can currently be buffered. */ +ulong innodb_change_buffering; + +#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG +/** Dump the change buffer at startup */ +my_bool ibuf_dump; +/** Flag to control insert buffer debugging. */ +uint ibuf_debug; +#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ + +/** The insert buffer control structure */ +ibuf_t ibuf; + +/** @name Offsets to the per-page bits in the insert buffer bitmap */ +/* @{ */ +#define IBUF_BITMAP_FREE 0 /*!< Bits indicating the + amount of free space */ +#define IBUF_BITMAP_BUFFERED 2 /*!< TRUE if there are buffered + changes for the page */ +#define IBUF_BITMAP_IBUF 3 /*!< TRUE if page is a part of + the ibuf tree, excluding the + root page, or is in the free + list of the ibuf */ +/* @} */ + +#define IBUF_REC_FIELD_SPACE 0 /*!< in the pre-4.1 format, + the page number. later, the space_id */ +#define IBUF_REC_FIELD_MARKER 1 /*!< starting with 4.1, a marker + consisting of 1 byte that is 0 */ +#define IBUF_REC_FIELD_PAGE 2 /*!< starting with 4.1, the + page number */ +#define IBUF_REC_FIELD_METADATA 3 /* the metadata field */ +#define IBUF_REC_FIELD_USER 4 /* first user field */ + +/* Various constants for checking the type of an ibuf record and extracting +data from it. For details, see the description of the record format at the +top of this file. */ + +/** @name Format of the IBUF_REC_FIELD_METADATA of an insert buffer record +The fourth column in the MySQL 5.5 format contains an operation +type, counter, and some flags. 
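+For example, with the IBUF_REC_OFFSET_* layout defined below, a buffered
+operation with counter 5, operation type IBUF_OP_INSERT (0) and the
+IBUF_REC_COMPACT flag set begins with the four bytes 00 05 00 01
+(big-endian counter, then the type byte, then the flags byte).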
*/ +/* @{ */ +#define IBUF_REC_INFO_SIZE 4 /*!< Combined size of info fields at + the beginning of the fourth field */ + +/* Offsets for the fields at the beginning of the fourth field */ +#define IBUF_REC_OFFSET_COUNTER 0 /*!< Operation counter */ +#define IBUF_REC_OFFSET_TYPE 2 /*!< Type of operation */ +#define IBUF_REC_OFFSET_FLAGS 3 /*!< Additional flags */ + +/* Record flag masks */ +#define IBUF_REC_COMPACT 0x1 /*!< Set in + IBUF_REC_OFFSET_FLAGS if the + user index is in COMPACT + format or later */ + + +#ifndef SAFE_MUTEX +static +#endif /* SAFE_MUTEX */ +/** The mutex protecting the insert buffer */ +mysql_mutex_t ibuf_mutex, + /** The mutex covering pessimistic inserts into the change buffer */ + ibuf_pessimistic_insert_mutex; + +/** The area in pages from which contract looks for page numbers for merge */ +constexpr ulint IBUF_MERGE_AREA = 8; + +/** In ibuf_contract() at most this number of pages is read to memory in one +batch, in order to merge the entries for them in the change buffer */ +constexpr ulint IBUF_MAX_N_PAGES_MERGED = IBUF_MERGE_AREA; + +/* TODO: how to cope with drop table if there are records in the insert +buffer for the indexes of the table? Is there actually any problem, +because ibuf merge is done to a page when it is read in, and it is +still physically like the index page even if the index would have been +dropped! So, there seems to be no problem. */ + +/******************************************************************//** +Sets the flag in the current mini-transaction record indicating we're +inside an insert buffer routine. */ +UNIV_INLINE +void +ibuf_enter( +/*=======*/ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ut_ad(!mtr->is_inside_ibuf()); + mtr->enter_ibuf(); +} + +/******************************************************************//** +Sets the flag in the current mini-transaction record indicating we're +exiting an insert buffer routine. */ +UNIV_INLINE +void +ibuf_exit( +/*======*/ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ut_ad(mtr->is_inside_ibuf()); + mtr->exit_ibuf(); +} + +/**************************************************************//** +Commits an insert buffer mini-transaction and sets the persistent +cursor latch mode to BTR_NO_LATCHES, that is, detaches the cursor. */ +UNIV_INLINE +void +ibuf_btr_pcur_commit_specify_mtr( +/*=============================*/ + btr_pcur_t* pcur, /*!< in/out: persistent cursor */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ut_d(ibuf_exit(mtr)); + btr_pcur_commit_specify_mtr(pcur, mtr); +} + +/******************************************************************//** +Gets the ibuf header page and x-latches it. +@return insert buffer header page */ +static +page_t* +ibuf_header_page_get( +/*=================*/ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ut_ad(!ibuf_inside(mtr)); + + buf_block_t* block = buf_page_get( + page_id_t(IBUF_SPACE_ID, FSP_IBUF_HEADER_PAGE_NO), + 0, RW_X_LATCH, mtr); + + return block ? block->page.frame : nullptr; +} + +/** Acquire the change buffer root page. 
+@param[in,out]	mtr	mini-transaction
+@return change buffer root page, SX-latched */
+static buf_block_t *ibuf_tree_root_get(mtr_t *mtr, dberr_t *err= nullptr)
+{
+  ut_ad(ibuf_inside(mtr));
+  mysql_mutex_assert_owner(&ibuf_mutex);
+
+  mtr_sx_lock_index(ibuf.index, mtr);
+
+  buf_block_t *block=
+    buf_page_get_gen(page_id_t{IBUF_SPACE_ID, FSP_IBUF_TREE_ROOT_PAGE_NO},
+                     0, RW_SX_LATCH, nullptr, BUF_GET, mtr, err);
+  ut_ad(!block || ibuf.empty == page_is_empty(block->page.frame));
+  return block;
+}
+
+/******************************************************************//**
+Closes insert buffer and frees the data structures. */
+void
+ibuf_close(void)
+/*============*/
+{
+	if (!ibuf.index) {
+		return;
+	}
+
+	mysql_mutex_destroy(&ibuf_pessimistic_insert_mutex);
+	mysql_mutex_destroy(&ibuf_mutex);
+
+	dict_table_t*	ibuf_table = ibuf.index->table;
+	ibuf.index->lock.free();
+	dict_mem_index_free(ibuf.index);
+	dict_mem_table_free(ibuf_table);
+	ibuf.index = NULL;
+}
+
+/******************************************************************//**
+Updates the size information of the ibuf, assuming the segment size has not
+changed. */
+static
+void
+ibuf_size_update(
+/*=============*/
+	const page_t*	root)	/*!< in: ibuf tree root */
+{
+	mysql_mutex_assert_owner(&ibuf_mutex);
+
+	ibuf.free_list_len = flst_get_len(root + PAGE_HEADER
+					  + PAGE_BTR_IBUF_FREE_LIST);
+
+	ibuf.height = 1 + btr_page_get_level(root);
+
+	/* the '1 +' is the ibuf header page */
+	ibuf.size = ibuf.seg_size - (1 + ibuf.free_list_len);
+}
+
+/******************************************************************//**
+Creates the insert buffer data structure at a database startup and initializes
+the data structures for the insert buffer.
+@return DB_SUCCESS or failure */
+dberr_t
+ibuf_init_at_db_start(void)
+/*=======================*/
+{
+	page_t*		root;
+
+	ut_ad(!ibuf.index);
+	mtr_t	mtr;
+	mtr.start();
+	compile_time_assert(IBUF_SPACE_ID == TRX_SYS_SPACE);
+	compile_time_assert(IBUF_SPACE_ID == 0);
+	mtr.x_lock_space(fil_system.sys_space);
+	dberr_t err;
+	buf_block_t* header_page = buf_page_get_gen(
+		page_id_t(IBUF_SPACE_ID, FSP_IBUF_HEADER_PAGE_NO),
+		0, RW_X_LATCH, nullptr, BUF_GET, &mtr, &err);
+
+	if (!header_page) {
+err_exit:
+		sql_print_error("InnoDB: The change buffer is corrupted"
+				" or has been removed on upgrade"
+				" to MariaDB 11.0 or later");
+		mtr.commit();
+		if (innodb_change_buffering == IBUF_USE_NONE) {
+			err = DB_SUCCESS;
+		}
+		return err;
+	}
+
+	fseg_n_reserved_pages(*header_page,
+			      IBUF_HEADER + IBUF_TREE_SEG_HEADER
+			      + header_page->page.frame, &ibuf.seg_size, &mtr);
+
+	do {
+		DBUG_EXECUTE_IF("intermittent_read_failure", continue;);
+		ut_ad(ibuf.seg_size >= 2);
+	} while (0);
+
+	if (buf_block_t* block =
+	    buf_page_get_gen(page_id_t(IBUF_SPACE_ID,
+				       FSP_IBUF_TREE_ROOT_PAGE_NO),
+			     0, RW_X_LATCH, nullptr, BUF_GET, &mtr, &err)) {
+		root = buf_block_get_frame(block);
+	} else {
+		goto err_exit;
+	}
+
+	DBUG_EXECUTE_IF("ibuf_init_corrupt",
+			err = DB_CORRUPTION;
+			goto err_exit;);
+
+	if (page_is_comp(root) || fil_page_get_type(root) != FIL_PAGE_INDEX
+	    || btr_page_get_index_id(root) != DICT_IBUF_ID_MIN) {
+		err = DB_CORRUPTION;
+		goto err_exit;
+	}
+
+	/* At startup we initialize ibuf to have a maximum of
+	CHANGE_BUFFER_DEFAULT_SIZE in terms of percentage of the
+	buffer pool size. Once the ibuf struct is initialized this
+	value is updated with the user-supplied size by calling
+	ibuf_max_size_update().
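+
+	For example (a sketch, assuming the default value 25 of
+	CHANGE_BUFFER_DEFAULT_SIZE): with a 128 MiB buffer pool and
+	innodb_page_size=16k, the buffer pool holds 8192 pages, so the
+	initial limit computed below is 8192 * 25 / 100 = 2048 pages.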
*/ + ibuf.max_size = ((buf_pool_get_curr_size() >> srv_page_size_shift) + * CHANGE_BUFFER_DEFAULT_SIZE) / 100; + + mysql_mutex_init(ibuf_mutex_key, &ibuf_mutex, nullptr); + mysql_mutex_init(ibuf_pessimistic_insert_mutex_key, + &ibuf_pessimistic_insert_mutex, nullptr); + + mysql_mutex_lock(&ibuf_mutex); + ibuf_size_update(root); + mysql_mutex_unlock(&ibuf_mutex); + + ibuf.empty = page_is_empty(root); + mtr.commit(); + + ibuf.index = dict_mem_index_create( + dict_table_t::create( + {C_STRING_WITH_LEN("innodb_change_buffer")}, + fil_system.sys_space, 1, 0, 0, 0), + "CLUST_IND", + DICT_CLUSTERED | DICT_IBUF, 1); + ibuf.index->id = DICT_IBUF_ID_MIN + IBUF_SPACE_ID; + ibuf.index->n_uniq = REC_MAX_N_FIELDS; + ibuf.index->lock.SRW_LOCK_INIT(index_tree_rw_lock_key); +#ifdef BTR_CUR_ADAPT + ibuf.index->search_info = btr_search_info_create(ibuf.index->heap); +#endif /* BTR_CUR_ADAPT */ + ibuf.index->page = FSP_IBUF_TREE_ROOT_PAGE_NO; + ut_d(ibuf.index->cached = TRUE); + +#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG + if (!ibuf_dump) { + return DB_SUCCESS; + } + ib::info() << "Dumping the change buffer"; + ibuf_mtr_start(&mtr); + btr_pcur_t pcur; + if (DB_SUCCESS + == pcur.open_leaf(true, ibuf.index, BTR_SEARCH_LEAF, &mtr)) { + while (btr_pcur_move_to_next_user_rec(&pcur, &mtr)) { + rec_print_old(stderr, btr_pcur_get_rec(&pcur)); + } + } + ibuf_mtr_commit(&mtr); + ib::info() << "Dumped the change buffer"; +#endif + + return DB_SUCCESS; +} + +/*********************************************************************//** +Updates the max_size value for ibuf. */ +void +ibuf_max_size_update( +/*=================*/ + ulint new_val) /*!< in: new value in terms of + percentage of the buffer pool size */ +{ + if (UNIV_UNLIKELY(!ibuf.index)) return; + ulint new_size = ((buf_pool_get_curr_size() >> srv_page_size_shift) + * new_val) / 100; + mysql_mutex_lock(&ibuf_mutex); + ibuf.max_size = new_size; + mysql_mutex_unlock(&ibuf_mutex); +} + +# ifdef UNIV_DEBUG +/** Gets the desired bits for a given page from a bitmap page. +@param[in] page bitmap page +@param[in] page_id page id whose bits to get +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] bit IBUF_BITMAP_FREE, IBUF_BITMAP_BUFFERED, ... +@param[in,out] mtr mini-transaction holding an x-latch on the +bitmap page +@return value of bits */ +# define ibuf_bitmap_page_get_bits(page, page_id, zip_size, bit, mtr) \ + ibuf_bitmap_page_get_bits_low(page, page_id, zip_size, \ + MTR_MEMO_PAGE_X_FIX, mtr, bit) +# else /* UNIV_DEBUG */ +/** Gets the desired bits for a given page from a bitmap page. +@param[in] page bitmap page +@param[in] page_id page id whose bits to get +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] bit IBUF_BITMAP_FREE, IBUF_BITMAP_BUFFERED, ... +@param[in,out] mtr mini-transaction holding an x-latch on the +bitmap page +@return value of bits */ +# define ibuf_bitmap_page_get_bits(page, page_id, zip_size, bit, mtr) \ + ibuf_bitmap_page_get_bits_low(page, page_id, zip_size, bit) +# endif /* UNIV_DEBUG */ + +/** Gets the desired bits for a given page from a bitmap page. +@param[in] page bitmap page +@param[in] page_id page id whose bits to get +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] latch_type MTR_MEMO_PAGE_X_FIX, MTR_MEMO_BUF_FIX, ... +@param[in,out] mtr mini-transaction holding latch_type on the +bitmap page +@param[in] bit IBUF_BITMAP_FREE, IBUF_BITMAP_BUFFERED, ... 
+@return value of bits */
+UNIV_INLINE
+ulint
+ibuf_bitmap_page_get_bits_low(
+	const page_t*		page,
+	const page_id_t		page_id,
+	ulint			zip_size,
+#ifdef UNIV_DEBUG
+	ulint			latch_type,
+	mtr_t*			mtr,
+#endif /* UNIV_DEBUG */
+	ulint			bit)
+{
+	ulint	byte_offset;
+	ulint	bit_offset;
+	ulint	map_byte;
+	ulint	value;
+	const ulint size = zip_size ? zip_size : srv_page_size;
+
+	ut_ad(ut_is_2pow(zip_size));
+	ut_ad(bit < IBUF_BITS_PER_PAGE);
+	compile_time_assert(!(IBUF_BITS_PER_PAGE % 2));
+	ut_ad(mtr->memo_contains_page_flagged(page, latch_type));
+
+	bit_offset = (page_id.page_no() & (size - 1))
+		* IBUF_BITS_PER_PAGE + bit;
+
+	byte_offset = bit_offset / 8;
+	bit_offset = bit_offset % 8;
+
+	ut_ad(byte_offset + IBUF_BITMAP < srv_page_size);
+
+	map_byte = mach_read_from_1(page + IBUF_BITMAP + byte_offset);
+
+	value = ut_bit_get_nth(map_byte, bit_offset);
+
+	if (bit == IBUF_BITMAP_FREE) {
+		ut_ad(bit_offset + 1 < 8);
+
+		value = value * 2 + ut_bit_get_nth(map_byte, bit_offset + 1);
+	}
+
+	return(value);
+}
+
+/** Sets the desired bit for a given page in a bitmap page.
+@tparam bit IBUF_BITMAP_FREE, IBUF_BITMAP_BUFFERED, ...
+@param[in,out]	block		bitmap page
+@param[in]	page_id		page id whose bits to set
+@param[in]	physical_size	page size
+@param[in]	val		value to set
+@param[in,out]	mtr		mtr containing an x-latch to the bitmap page */
+template<ulint bit>
+static void
+ibuf_bitmap_page_set_bits(
+	buf_block_t*	block,
+	const page_id_t	page_id,
+	ulint		physical_size,
+	ulint		val,
+	mtr_t*		mtr)
+{
+	ulint	byte_offset;
+	ulint	bit_offset;
+
+	static_assert(bit < IBUF_BITS_PER_PAGE, "wrong bit");
+	compile_time_assert(!(IBUF_BITS_PER_PAGE % 2));
+	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr->is_named_space(page_id.space()));
+
+	bit_offset = (page_id.page_no() % physical_size)
+		* IBUF_BITS_PER_PAGE + bit;
+
+	byte_offset = bit_offset / 8;
+	bit_offset = bit_offset % 8;
+
+	ut_ad(byte_offset + IBUF_BITMAP < srv_page_size);
+
+	byte* map_byte = &block->page.frame[IBUF_BITMAP + byte_offset];
+	byte b = *map_byte;
+
+	if (bit == IBUF_BITMAP_FREE) {
+		ut_ad(bit_offset + 1 < 8);
+		ut_ad(val <= 3);
+		b &= static_cast<byte>(~(3U << bit_offset));
+		b |= static_cast<byte>(((val & 2) >> 1) << bit_offset
+				       | (val & 1) << (bit_offset + 1));
+	} else {
+		ut_ad(val <= 1);
+		b &= static_cast<byte>(~(1U << bit_offset));
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wconversion" /* GCC 5 may need this here */
+#endif
+		b |= static_cast<byte>(val << bit_offset);
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic pop
+#endif
+	}
+
+	mtr->write<1,mtr_t::MAYBE_NOP>(*block, map_byte, b);
+}
+
+/** Calculates the bitmap page number for a given page number.
+@param[in]	page_id		page id
+@param[in]	size		page size
+@return the bitmap page id where the file page is mapped */
+inline page_id_t ibuf_bitmap_page_no_calc(const page_id_t page_id, ulint size)
+{
+  if (!size)
+    size= srv_page_size;
+
+  return page_id_t(page_id.space(), FSP_IBUF_BITMAP_OFFSET
+                   + uint32_t(page_id.page_no() & ~(size - 1)));
+}
+
+/** Gets the ibuf bitmap page where the bits describing a given file page are
+stored.
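+For example (a sketch, with srv_page_size=16384 and zip_size=0):
+ibuf_bitmap_page_no_calc() above rounds the page number down to a
+multiple of 16384 and adds FSP_IBUF_BITMAP_OFFSET, so the bits for page
+20000 of a tablespace live on bitmap page 16384 + 1 = 16385 of the same
+tablespace.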
+@param[in]	page_id		page id of the file page
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in,out]	mtr		mini-transaction
+@return bitmap page where the file page is mapped, that is, the bitmap
+page containing the descriptor bits for the file page; the bitmap page
+is x-latched */
+static
+buf_block_t*
+ibuf_bitmap_get_map_page(
+	const page_id_t		page_id,
+	ulint			zip_size,
+	mtr_t*			mtr)
+{
+	return buf_page_get_gen(ibuf_bitmap_page_no_calc(page_id, zip_size),
+				zip_size, RW_X_LATCH, nullptr,
+				BUF_GET_POSSIBLY_FREED, mtr);
+}
+
+/************************************************************************//**
+Sets the free bits of the page in the ibuf bitmap. This is done in a separate
+mini-transaction, hence this operation does not restrict further work to only
+ibuf bitmap operations, which would result if the latch to the bitmap page
+were kept. */
+UNIV_INLINE
+void
+ibuf_set_free_bits_low(
+/*===================*/
+	const buf_block_t*	block,	/*!< in: index page; free bits are set if
+					the index is non-clustered and page
+					level is 0 */
+	ulint			val,	/*!< in: value to set: < 4 */
+	mtr_t*			mtr)	/*!< in/out: mtr */
+{
+	ut_ad(mtr->is_named_space(block->page.id().space()));
+	if (!page_is_leaf(block->page.frame)) {
+		return;
+	}
+
+#ifdef UNIV_IBUF_DEBUG
+	ut_a(val <= ibuf_index_page_calc_free(block));
+#endif /* UNIV_IBUF_DEBUG */
+	const page_id_t id(block->page.id());
+
+	if (buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(
+		    id, block->zip_size(), mtr)) {
+		ibuf_bitmap_page_set_bits<IBUF_BITMAP_FREE>(
+			bitmap_page, id, block->physical_size(),
+			val, mtr);
+	}
+}
+
+/************************************************************************//**
+Sets the free bit of the page in the ibuf bitmap. This is done in a separate
+mini-transaction, hence this operation does not restrict further work to only
+ibuf bitmap operations, which would result if the latch to the bitmap page
+were kept. */
+void
+ibuf_set_free_bits_func(
+/*====================*/
+	buf_block_t*	block,	/*!< in: index page of a non-clustered index;
+				free bit is reset if page level is 0 */
+#ifdef UNIV_IBUF_DEBUG
+	ulint		max_val,/*!< in: ULINT_UNDEFINED or a maximum
+				value which the bits must have before
+				setting; this is for debugging */
+#endif /* UNIV_IBUF_DEBUG */
+	ulint		val)	/*!< in: value to set: < 4 */
+{
+  if (!page_is_leaf(block->page.frame))
+    return;
+
+  mtr_t mtr;
+  mtr.start();
+  const page_id_t id(block->page.id());
+  const fil_space_t *space= mtr.set_named_space_id(id.space());
+
+  if (buf_block_t *bitmap_page=
+      ibuf_bitmap_get_map_page(id, block->zip_size(), &mtr))
+  {
+    if (space->purpose != FIL_TYPE_TABLESPACE)
+      mtr.set_log_mode(MTR_LOG_NO_REDO);
+
+#ifdef UNIV_IBUF_DEBUG
+    if (max_val != ULINT_UNDEFINED)
+    {
+      ulint old_val= ibuf_bitmap_page_get_bits(bitmap_page->page.frame, id,
+                                               block->zip_size(),
+                                               IBUF_BITMAP_FREE, &mtr);
+      ut_a(old_val <= max_val);
+    }
+
+    ut_a(val <= ibuf_index_page_calc_free(block));
+#endif /* UNIV_IBUF_DEBUG */
+
+    ibuf_bitmap_page_set_bits<IBUF_BITMAP_FREE>
+      (bitmap_page, id, block->physical_size(), val, &mtr);
+  }
+
+  mtr.commit();
+}
+
+/************************************************************************//**
+Resets the free bits of the page in the ibuf bitmap. This is done in a
+separate mini-transaction, hence this operation does not restrict
+further work to only ibuf bitmap operations, which would result if the
+latch to the bitmap page were kept. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page.
It is safe
+to decrement or reset the bits in the bitmap in a mini-transaction
+that is committed before the mini-transaction that affects the free
+space. */
+void
+ibuf_reset_free_bits(
+/*=================*/
+	buf_block_t*	block)	/*!< in: index page; free bits are set to 0
+				if the index is a non-clustered
+				non-unique, and page level is 0 */
+{
+	ibuf_set_free_bits(block, 0, ULINT_UNDEFINED);
+}
+
+/**********************************************************************//**
+Updates the free bits for an uncompressed page to reflect the present
+state. Does this in the mtr given, which means that the latching
+order rules virtually prevent any further operations for this OS
+thread until mtr is committed. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is safe
+to set the free bits in the same mini-transaction that updated the
+page. */
+void
+ibuf_update_free_bits_low(
+/*======================*/
+	const buf_block_t*	block,		/*!< in: index page */
+	ulint			max_ins_size,	/*!< in: value of
+						maximum insert size
+						with reorganize before
+						the latest operation
+						performed to the page */
+	mtr_t*			mtr)		/*!< in/out: mtr */
+{
+	ulint	before;
+	ulint	after;
+
+	ut_a(!is_buf_block_get_page_zip(block));
+	ut_ad(mtr->is_named_space(block->page.id().space()));
+
+	before = ibuf_index_page_calc_free_bits(srv_page_size,
+						max_ins_size);
+
+	after = ibuf_index_page_calc_free(block);
+
+	/* This approach cannot be used on compressed pages, since the
+	computed value of "before" often does not match the current
+	state of the bitmap. This is because the free space may
+	increase or decrease when a compressed page is reorganized. */
+	if (before != after) {
+		ibuf_set_free_bits_low(block, after, mtr);
+	}
+}
+
+/**********************************************************************//**
+Updates the free bits for a compressed page to reflect the present
+state. Does this in the mtr given, which means that the latching
+order rules virtually prevent any further operations for this OS
+thread until mtr is committed. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is safe
+to set the free bits in the same mini-transaction that updated the
+page. */
+void
+ibuf_update_free_bits_zip(
+/*======================*/
+	buf_block_t*	block,	/*!< in/out: index page */
+	mtr_t*		mtr)	/*!< in/out: mtr */
+{
+	ut_ad(page_is_leaf(block->page.frame));
+	ut_ad(block->zip_size());
+
+	ulint after = ibuf_index_page_calc_free_zip(block);
+
+	if (after == 0) {
+		/* We move the page to the front of the buffer pool LRU list:
+		the purpose of this is to prevent those pages to which we
+		cannot make inserts using the insert buffer from slipping
+		out of the buffer pool */
+
+		buf_page_make_young(&block->page);
+	}
+
+	if (buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(
+		    block->page.id(), block->zip_size(), mtr)) {
+
+		ibuf_bitmap_page_set_bits<IBUF_BITMAP_FREE>(
+			bitmap_page, block->page.id(),
+			block->physical_size(), after, mtr);
+	}
+}
+
+/**********************************************************************//**
+Updates the free bits for the two pages to reflect the present state.
+Does this in the mtr given, which means that the latching order rules
+virtually prevent any further operations until mtr is committed.
+NOTE: The free bits in the insert buffer bitmap must never exceed the
+free space on a page. It is safe to set the free bits in the same
+mini-transaction that updated the pages.
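+
+As a sketch of the 2-bit IBUF_BITMAP_FREE encoding that
+ibuf_index_page_calc_free() produces: the reorganized free space is
+divided by 1/32 of the page size, so on a 16KiB page the stored value
+roughly means 0 = less than 512 bytes free, 1 = at least 512,
+2 = at least 1024, and 3 = at least 2048 bytes.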
*/ +void +ibuf_update_free_bits_for_two_pages_low( +/*====================================*/ + buf_block_t* block1, /*!< in: index page */ + buf_block_t* block2, /*!< in: index page */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(mtr->is_named_space(block1->page.id().space())); + ut_ad(block1->page.id().space() == block2->page.id().space()); + + /* Avoid deadlocks by acquiring multiple bitmap page latches in + a consistent order (smaller pointer first). */ + if (block1 > block2) + std::swap(block1, block2); + + ibuf_set_free_bits_low(block1, ibuf_index_page_calc_free(block1), mtr); + ibuf_set_free_bits_low(block2, ibuf_index_page_calc_free(block2), mtr); +} + +/** Returns TRUE if the page is one of the fixed address ibuf pages. +@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@return TRUE if a fixed address ibuf i/o page */ +inline bool ibuf_fixed_addr_page(const page_id_t page_id, ulint zip_size) +{ + return(page_id == page_id_t(IBUF_SPACE_ID, IBUF_TREE_ROOT_PAGE_NO) + || ibuf_bitmap_page(page_id, zip_size)); +} + +/** Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages. +Must not be called when recv_no_ibuf_operations==true. +@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] x_latch FALSE if relaxed check (avoid latching the +bitmap page) +@param[in,out] mtr mtr which will contain an x-latch to the +bitmap page if the page is not one of the fixed address ibuf pages, or NULL, +in which case a new transaction is created. +@return TRUE if level 2 or level 3 page */ +bool +ibuf_page_low( + const page_id_t page_id, + ulint zip_size, +#ifdef UNIV_DEBUG + bool x_latch, +#endif /* UNIV_DEBUG */ + mtr_t* mtr) +{ + ibool ret; + mtr_t local_mtr; + + ut_ad(!recv_no_ibuf_operations); + ut_ad(x_latch || mtr == NULL); + + if (ibuf_fixed_addr_page(page_id, zip_size)) { + return(true); + } else if (page_id.space() != IBUF_SPACE_ID) { + return(false); + } + + compile_time_assert(IBUF_SPACE_ID == 0); + ut_ad(fil_system.sys_space->purpose == FIL_TYPE_TABLESPACE); + +#ifdef UNIV_DEBUG + if (!x_latch) { + mtr_start(&local_mtr); + + /* Get the bitmap page without a page latch, so that + we will not be violating the latching order when + another bitmap page has already been latched by this + thread. The page will be buffer-fixed, and thus it + cannot be removed or relocated while we are looking at + it. The contents of the page could change, but the + IBUF_BITMAP_IBUF bit that we are interested in should + not be modified by any other thread. Nobody should be + calling ibuf_add_free_page() or ibuf_remove_free_page() + while the page is linked to the insert buffer b-tree. 
*/ + buf_block_t* block = buf_page_get_gen( + ibuf_bitmap_page_no_calc(page_id, zip_size), + zip_size, RW_NO_LATCH, nullptr, BUF_GET, &local_mtr); + + ret = block + && ibuf_bitmap_page_get_bits_low( + block->page.frame, page_id, zip_size, + MTR_MEMO_BUF_FIX, &local_mtr, IBUF_BITMAP_IBUF); + + mtr_commit(&local_mtr); + return(ret); + } +#endif /* UNIV_DEBUG */ + + if (mtr == NULL) { + mtr = &local_mtr; + mtr_start(mtr); + } + + buf_block_t *block = ibuf_bitmap_get_map_page(page_id, zip_size, + mtr); + ret = block + && ibuf_bitmap_page_get_bits(block->page.frame, + page_id, zip_size, + IBUF_BITMAP_IBUF, mtr); + + if (mtr == &local_mtr) { + mtr_commit(mtr); + } + + return(ret); +} + +#ifdef UNIV_DEBUG +# define ibuf_rec_get_page_no(mtr,rec) ibuf_rec_get_page_no_func(mtr,rec) +#else /* UNIV_DEBUG */ +# define ibuf_rec_get_page_no(mtr,rec) ibuf_rec_get_page_no_func(rec) +#endif /* UNIV_DEBUG */ + +/********************************************************************//** +Returns the page number field of an ibuf record. +@return page number */ +static +uint32_t +ibuf_rec_get_page_no_func( +/*======================*/ +#ifdef UNIV_DEBUG + mtr_t* mtr, /*!< in: mini-transaction owning rec */ +#endif /* UNIV_DEBUG */ + const rec_t* rec) /*!< in: ibuf record */ +{ + const byte* field; + ulint len; + + ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX + | MTR_MEMO_PAGE_S_FIX)); + ut_ad(ibuf_inside(mtr)); + ut_ad(rec_get_n_fields_old(rec) > 2); + + field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_MARKER, &len); + + ut_a(len == 1); + + field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_PAGE, &len); + + ut_a(len == 4); + + return(mach_read_from_4(field)); +} + +#ifdef UNIV_DEBUG +# define ibuf_rec_get_space(mtr,rec) ibuf_rec_get_space_func(mtr,rec) +#else /* UNIV_DEBUG */ +# define ibuf_rec_get_space(mtr,rec) ibuf_rec_get_space_func(rec) +#endif /* UNIV_DEBUG */ + +/********************************************************************//** +Returns the space id field of an ibuf record. For < 4.1.x format records +returns 0. +@return space id */ +static +uint32_t +ibuf_rec_get_space_func( +/*====================*/ +#ifdef UNIV_DEBUG + mtr_t* mtr, /*!< in: mini-transaction owning rec */ +#endif /* UNIV_DEBUG */ + const rec_t* rec) /*!< in: ibuf record */ +{ + const byte* field; + ulint len; + + ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX + | MTR_MEMO_PAGE_S_FIX)); + ut_ad(ibuf_inside(mtr)); + ut_ad(rec_get_n_fields_old(rec) > 2); + + field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_MARKER, &len); + + ut_a(len == 1); + + field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_SPACE, &len); + + ut_a(len == 4); + + return(mach_read_from_4(field)); +} + +#ifdef UNIV_DEBUG +# define ibuf_rec_get_info(mtr,rec,op,comp,info_len,counter) \ + ibuf_rec_get_info_func(mtr,rec,op,comp,info_len,counter) +#else /* UNIV_DEBUG */ +# define ibuf_rec_get_info(mtr,rec,op,comp,info_len,counter) \ + ibuf_rec_get_info_func(rec,op,comp,info_len,counter) +#endif +/****************************************************************//** +Get various information about an ibuf record in >= 4.1.x format. 
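+
+For example (a sketch): a buffered delete-mark with counter 5 against a
+ROW_FORMAT=COMPACT index starts its metadata field with the four bytes
+00 05 01 01 (counter 5, type IBUF_OP_DELETE_MARK, flags
+IBUF_REC_COMPACT), followed by the per-column type information.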
*/
+static
+void
+ibuf_rec_get_info_func(
+/*===================*/
+#ifdef UNIV_DEBUG
+	mtr_t*		mtr,	/*!< in: mini-transaction owning rec */
+#endif /* UNIV_DEBUG */
+	const rec_t*	rec,		/*!< in: ibuf record */
+	ibuf_op_t*	op,		/*!< out: operation type, or NULL */
+	ibool*		comp,		/*!< out: compact flag, or NULL */
+	ulint*		info_len,	/*!< out: length of info fields at the
+					start of the fourth field, or
+					NULL */
+	ulint*		counter)	/*!< out: counter value, or NULL */
+{
+	const byte*	types;
+	ulint		fields;
+	ulint		len;
+
+	/* Local variables to shadow arguments. */
+	ibuf_op_t	op_local;
+	ibool		comp_local;
+	ulint		info_len_local;
+	ulint		counter_local;
+
+	ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX
+					      | MTR_MEMO_PAGE_S_FIX));
+	ut_ad(ibuf_inside(mtr));
+	fields = rec_get_n_fields_old(rec);
+	ut_a(fields > IBUF_REC_FIELD_USER);
+
+	types = rec_get_nth_field_old(rec, IBUF_REC_FIELD_METADATA, &len);
+
+	info_len_local = len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE;
+	compile_time_assert(IBUF_REC_INFO_SIZE
+			    < DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
+
+	switch (info_len_local) {
+	case 0:
+	case 1:
+		op_local = IBUF_OP_INSERT;
+		comp_local = info_len_local;
+		ut_ad(!counter);
+		counter_local = ULINT_UNDEFINED;
+		break;
+
+	case IBUF_REC_INFO_SIZE:
+		op_local = (ibuf_op_t) types[IBUF_REC_OFFSET_TYPE];
+		comp_local = types[IBUF_REC_OFFSET_FLAGS] & IBUF_REC_COMPACT;
+		counter_local = mach_read_from_2(
+			types + IBUF_REC_OFFSET_COUNTER);
+		break;
+
+	default:
+		ut_error;
+	}
+
+	ut_a(op_local < IBUF_OP_COUNT);
+	ut_a((len - info_len_local) ==
+	     (fields - IBUF_REC_FIELD_USER)
+	     * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
+
+	if (op) {
+		*op = op_local;
+	}
+
+	if (comp) {
+		*comp = comp_local;
+	}
+
+	if (info_len) {
+		*info_len = info_len_local;
+	}
+
+	if (counter) {
+		*counter = counter_local;
+	}
+}
+
+#ifdef UNIV_DEBUG
+# define ibuf_rec_get_op_type(mtr,rec) ibuf_rec_get_op_type_func(mtr,rec)
+#else /* UNIV_DEBUG */
+# define ibuf_rec_get_op_type(mtr,rec) ibuf_rec_get_op_type_func(rec)
+#endif
+
+/****************************************************************//**
+Returns the operation type field of an ibuf record.
+@return operation type */
+static
+ibuf_op_t
+ibuf_rec_get_op_type_func(
+/*======================*/
+#ifdef UNIV_DEBUG
+	mtr_t*		mtr,	/*!< in: mini-transaction owning rec */
+#endif /* UNIV_DEBUG */
+	const rec_t*	rec)	/*!< in: ibuf record */
+{
+	ulint		len;
+
+	ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX
+					      | MTR_MEMO_PAGE_S_FIX));
+	ut_ad(ibuf_inside(mtr));
+	ut_ad(rec_get_n_fields_old(rec) > 2);
+
+	(void) rec_get_nth_field_old(rec, IBUF_REC_FIELD_MARKER, &len);
+
+	if (len > 1) {
+		/* This is a < 4.1.x format record */
+
+		return(IBUF_OP_INSERT);
+	} else {
+		ibuf_op_t	op;
+
+		ibuf_rec_get_info(mtr, rec, &op, NULL, NULL, NULL);
+
+		return(op);
+	}
+}
+
+/****************************************************************//**
+Read the first two bytes from a record's fourth field (counter field in new
+records; something else in older records).
+@return "counter" field, or ULINT_UNDEFINED if for some reason it +can't be read */ +ulint +ibuf_rec_get_counter( +/*=================*/ + const rec_t* rec) /*!< in: ibuf record */ +{ + const byte* ptr; + ulint len; + + if (rec_get_n_fields_old(rec) <= IBUF_REC_FIELD_METADATA) { + + return(ULINT_UNDEFINED); + } + + ptr = rec_get_nth_field_old(rec, IBUF_REC_FIELD_METADATA, &len); + + if (len >= 2) { + + return(mach_read_from_2(ptr)); + } else { + + return(ULINT_UNDEFINED); + } +} + + +/** + Add accumulated operation counts to a permanent array. + Both arrays must be of size IBUF_OP_COUNT. +*/ +static void ibuf_add_ops(Atomic_counter *out, const ulint *in) +{ + for (auto i = 0; i < IBUF_OP_COUNT; i++) + out[i]+= in[i]; +} + + +/****************************************************************//** +Print operation counts. The array must be of size IBUF_OP_COUNT. */ +static +void +ibuf_print_ops( +/*===========*/ + const char* op_name,/*!< in: operation name */ + const Atomic_counter* ops, /*!< in: operation counts */ + FILE* file) /*!< in: file where to print */ +{ + static const char* op_names[] = { + "insert", + "delete mark", + "delete" + }; + + static_assert(array_elements(op_names) == IBUF_OP_COUNT, ""); + fputs(op_name, file); + + for (ulint i = 0; i < IBUF_OP_COUNT; i++) { + fprintf(file, "%s " ULINTPF "%s", op_names[i], + ulint{ops[i]}, (i < (IBUF_OP_COUNT - 1)) ? ", " : ""); + } + + putc('\n', file); +} + +/********************************************************************//** +Creates a dummy index for inserting a record to a non-clustered index. +@return dummy index */ +static +dict_index_t* +ibuf_dummy_index_create( +/*====================*/ + ulint n, /*!< in: number of fields */ + ibool comp) /*!< in: TRUE=use compact record format */ +{ + dict_table_t* table; + dict_index_t* index; + + table = dict_table_t::create({C_STRING_WITH_LEN("IBUF_DUMMY")}, + nullptr, n, 0, + comp ? DICT_TF_COMPACT : 0, 0); + + index = dict_mem_index_create(table, "IBUF_DUMMY", 0, n); + + /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */ + index->cached = TRUE; + ut_d(index->is_dummy = true); + + return(index); +} +/********************************************************************//** +Add a column to the dummy index */ +static +void +ibuf_dummy_index_add_col( +/*=====================*/ + dict_index_t* index, /*!< in: dummy index */ + const dtype_t* type, /*!< in: the data type of the column */ + ulint len) /*!< in: length of the column */ +{ + ulint i = index->table->n_def; + dict_mem_table_add_col(index->table, NULL, NULL, + dtype_get_mtype(type), + dtype_get_prtype(type), + dtype_get_len(type)); + dict_index_add_col(index, index->table, + dict_table_get_nth_col(index->table, i), len); +} +/********************************************************************//** +Deallocates a dummy index for inserting a record to a non-clustered index. 
*/
+static
+void
+ibuf_dummy_index_free(
+/*==================*/
+	dict_index_t*	index)	/*!< in, own: dummy index */
+{
+	dict_table_t*	table = index->table;
+
+	dict_mem_index_free(index);
+	dict_mem_table_free(table);
+}
+
+#ifdef UNIV_DEBUG
+# define ibuf_build_entry_from_ibuf_rec(mtr,ibuf_rec,heap,pindex)	\
+	ibuf_build_entry_from_ibuf_rec_func(mtr,ibuf_rec,heap,pindex)
+#else /* UNIV_DEBUG */
+# define ibuf_build_entry_from_ibuf_rec(mtr,ibuf_rec,heap,pindex)	\
+	ibuf_build_entry_from_ibuf_rec_func(ibuf_rec,heap,pindex)
+#endif
+
+/*********************************************************************//**
+Builds the entry used to
+
+1) IBUF_OP_INSERT: insert into a non-clustered index
+
+2) IBUF_OP_DELETE_MARK: find the record whose delete-mark flag we need to
+   activate
+
+3) IBUF_OP_DELETE: find the record we need to delete
+
+when we have the corresponding record in an ibuf index.
+
+NOTE that as we copy pointers to fields in ibuf_rec, the caller must
+hold a latch to the ibuf_rec page as long as the entry is used!
+
+@return own: entry to insert to a non-clustered index */
+static
+dtuple_t*
+ibuf_build_entry_from_ibuf_rec_func(
+/*================================*/
+#ifdef UNIV_DEBUG
+	mtr_t*		mtr,	/*!< in: mini-transaction owning rec */
+#endif /* UNIV_DEBUG */
+	const rec_t*	ibuf_rec,	/*!< in: record in an insert buffer */
+	mem_heap_t*	heap,		/*!< in: heap where built */
+	dict_index_t**	pindex)		/*!< out, own: dummy index that
+					describes the entry */
+{
+	dtuple_t*	tuple;
+	dfield_t*	field;
+	ulint		n_fields;
+	const byte*	types;
+	const byte*	data;
+	ulint		len;
+	ulint		info_len;
+	ulint		i;
+	ulint		comp;
+	dict_index_t*	index;
+
+	ut_ad(mtr->memo_contains_page_flagged(ibuf_rec, MTR_MEMO_PAGE_X_FIX
+					      | MTR_MEMO_PAGE_S_FIX));
+	ut_ad(ibuf_inside(mtr));
+
+	data = rec_get_nth_field_old(ibuf_rec, IBUF_REC_FIELD_MARKER, &len);
+
+	ut_a(len == 1);
+	ut_a(*data == 0);
+	ut_a(rec_get_n_fields_old(ibuf_rec) > IBUF_REC_FIELD_USER);
+
+	n_fields = rec_get_n_fields_old(ibuf_rec) - IBUF_REC_FIELD_USER;
+
+	tuple = dtuple_create(heap, n_fields);
+
+	types = rec_get_nth_field_old(ibuf_rec, IBUF_REC_FIELD_METADATA, &len);
+
+	ibuf_rec_get_info(mtr, ibuf_rec, NULL, &comp, &info_len, NULL);
+
+	index = ibuf_dummy_index_create(n_fields, comp);
+
+	len -= info_len;
+	types += info_len;
+
+	ut_a(len == n_fields * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
+
+	for (i = 0; i < n_fields; i++) {
+		field = dtuple_get_nth_field(tuple, i);
+
+		data = rec_get_nth_field_old(
+			ibuf_rec, i + IBUF_REC_FIELD_USER, &len);
+
+		dfield_set_data(field, data, len);
+
+		dtype_new_read_for_order_and_null_size(
+			dfield_get_type(field),
+			types + i * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
+
+		ibuf_dummy_index_add_col(index, dfield_get_type(field), len);
+	}
+
+	index->n_core_null_bytes = static_cast<uint8_t>(
+		UT_BITS_IN_BYTES(unsigned(index->n_nullable)));
+
+	/* Prevent an ut_ad() failure in page_zip_write_rec() by
+	adding system columns to the dummy table pointed to by the
+	dummy secondary index. The insert buffer is only used for
+	secondary indexes, whose records never contain any system
+	columns, such as DB_TRX_ID. */
+	ut_d(dict_table_add_system_columns(index->table, index->table->heap));
+
+	*pindex = index;
+
+	return(tuple);
+}
+
+/******************************************************************//**
+Get the data size.
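+(i.e., the sum of the stored lengths of the user fields; SQL NULL
+fields contribute their type-dependent NULL size)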
+@return size of fields */ +UNIV_INLINE +ulint +ibuf_rec_get_size( +/*==============*/ + const rec_t* rec, /*!< in: ibuf record */ + const byte* types, /*!< in: fields */ + ulint n_fields, /*!< in: number of fields */ + ulint comp) /*!< in: 0=ROW_FORMAT=REDUNDANT, + nonzero=ROW_FORMAT=COMPACT */ +{ + ulint i; + ulint field_offset; + ulint types_offset; + ulint size = 0; + + field_offset = IBUF_REC_FIELD_USER; + types_offset = DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE; + + for (i = 0; i < n_fields; i++) { + ulint len; + dtype_t dtype; + + rec_get_nth_field_offs_old(rec, i + field_offset, &len); + + if (len != UNIV_SQL_NULL) { + size += len; + } else { + dtype_new_read_for_order_and_null_size(&dtype, types); + + size += dtype_get_sql_null_size(&dtype, comp); + } + + types += types_offset; + } + + return(size); +} + +#ifdef UNIV_DEBUG +# define ibuf_rec_get_volume(mtr,rec) ibuf_rec_get_volume_func(mtr,rec) +#else /* UNIV_DEBUG */ +# define ibuf_rec_get_volume(mtr,rec) ibuf_rec_get_volume_func(rec) +#endif + +/********************************************************************//** +Returns the space taken by a stored non-clustered index entry if converted to +an index record. +@return size of index record in bytes + an upper limit of the space +taken in the page directory */ +static +ulint +ibuf_rec_get_volume_func( +/*=====================*/ +#ifdef UNIV_DEBUG + mtr_t* mtr, /*!< in: mini-transaction owning rec */ +#endif /* UNIV_DEBUG */ + const rec_t* ibuf_rec)/*!< in: ibuf record */ +{ + ulint len; + const byte* data; + const byte* types; + ulint n_fields; + ulint data_size; + ulint comp; + ibuf_op_t op; + ulint info_len; + + ut_ad(mtr->memo_contains_page_flagged(ibuf_rec, MTR_MEMO_PAGE_X_FIX + | MTR_MEMO_PAGE_S_FIX)); + ut_ad(ibuf_inside(mtr)); + ut_ad(rec_get_n_fields_old(ibuf_rec) > 2); + + data = rec_get_nth_field_old(ibuf_rec, IBUF_REC_FIELD_MARKER, &len); + ut_a(len == 1); + ut_a(*data == 0); + + types = rec_get_nth_field_old( + ibuf_rec, IBUF_REC_FIELD_METADATA, &len); + + ibuf_rec_get_info(mtr, ibuf_rec, &op, &comp, &info_len, NULL); + + if (op == IBUF_OP_DELETE_MARK || op == IBUF_OP_DELETE) { + /* Delete-marking a record doesn't take any + additional space, and while deleting a record + actually frees up space, we have to play it safe and + pretend it takes no additional space (the record + might not exist, etc.). */ + + return(0); + } else if (comp) { + dtuple_t* entry; + ulint volume; + dict_index_t* dummy_index; + mem_heap_t* heap = mem_heap_create(500); + + entry = ibuf_build_entry_from_ibuf_rec(mtr, ibuf_rec, + heap, &dummy_index); + + volume = rec_get_converted_size(dummy_index, entry, 0); + + ibuf_dummy_index_free(dummy_index); + mem_heap_free(heap); + + return(volume + page_dir_calc_reserved_space(1)); + } + + types += info_len; + n_fields = rec_get_n_fields_old(ibuf_rec) + - IBUF_REC_FIELD_USER; + + data_size = ibuf_rec_get_size(ibuf_rec, types, n_fields, comp); + + return(data_size + rec_get_converted_extra_size(data_size, n_fields, 0) + + page_dir_calc_reserved_space(1)); +} + +/*********************************************************************//** +Builds the tuple to insert to an ibuf tree when we have an entry for a +non-clustered index. + +NOTE that the original entry must be kept because we copy pointers to +its fields. 
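+
+A sketch of the tuple layout built below, for an index entry of n user
+fields:
+
+ field 0: space id (4 bytes)
+ field 1: the marker byte 0x00
+ field 2: page number (4 bytes)
+ field 3: counter/type/flags info (when buffering is counter-based),
+	   followed by DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE bytes of type
+	   information per user field
+ fields 4 .. 4+n-1: the user fields copied from the entry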
+
+@return own: entry to insert into an ibuf index tree */
+static
+dtuple_t*
+ibuf_entry_build(
+/*=============*/
+	ibuf_op_t	op,	/*!< in: operation type */
+	dict_index_t*	index,	/*!< in: non-clustered index */
+	const dtuple_t*	entry,	/*!< in: entry for a non-clustered index */
+	ulint		space,	/*!< in: space id */
+	ulint		page_no,/*!< in: index page number where entry should
+				be inserted */
+	ulint		counter,/*!< in: counter value;
+				ULINT_UNDEFINED=not used */
+	mem_heap_t*	heap)	/*!< in: heap into which to build */
+{
+	dtuple_t*	tuple;
+	dfield_t*	field;
+	const dfield_t*	entry_field;
+	ulint		n_fields;
+	byte*		buf;
+	byte*		ti;
+	byte*		type_info;
+	ulint		i;
+
+	ut_ad(counter != ULINT_UNDEFINED || op == IBUF_OP_INSERT);
+	ut_ad(counter == ULINT_UNDEFINED || counter <= 0xFFFF);
+	ut_ad(op < IBUF_OP_COUNT);
+
+	/* We have to build a tuple with the following fields:
+
+	1-4) These are described at the top of this file.
+
+	5) The rest of the fields are copied from the entry.
+
+	All fields in the tuple are ordered like the type binary in our
+	insert buffer tree. */
+
+	n_fields = dtuple_get_n_fields(entry);
+
+	tuple = dtuple_create(heap, n_fields + IBUF_REC_FIELD_USER);
+
+	/* 1) Space Id */
+
+	field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_SPACE);
+
+	buf = static_cast<byte*>(mem_heap_alloc(heap, 4));
+
+	mach_write_to_4(buf, space);
+
+	dfield_set_data(field, buf, 4);
+
+	/* 2) Marker byte */
+
+	field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_MARKER);
+
+	buf = static_cast<byte*>(mem_heap_alloc(heap, 1));
+
+	/* We set the marker byte zero */
+
+	mach_write_to_1(buf, 0);
+
+	dfield_set_data(field, buf, 1);
+
+	/* 3) Page number */
+
+	field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_PAGE);
+
+	buf = static_cast<byte*>(mem_heap_alloc(heap, 4));
+
+	mach_write_to_4(buf, page_no);
+
+	dfield_set_data(field, buf, 4);
+
+	/* 4) Type info, part #1 */
+
+	if (counter == ULINT_UNDEFINED) {
+		i = dict_table_is_comp(index->table) ? 1 : 0;
+	} else {
+		ut_ad(counter <= 0xFFFF);
+		i = IBUF_REC_INFO_SIZE;
+	}
+
+	ti = type_info = static_cast<byte*>(
+		mem_heap_alloc(
+			heap,
+			i + n_fields * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE));
+
+	switch (i) {
+	default:
+		ut_error;
+		break;
+	case 1:
+		/* set the flag for ROW_FORMAT=COMPACT */
+		*ti++ = 0;
+		/* fall through */
+	case 0:
+		/* the old format does not allow delete buffering */
+		ut_ad(op == IBUF_OP_INSERT);
+		break;
+	case IBUF_REC_INFO_SIZE:
+		mach_write_to_2(ti + IBUF_REC_OFFSET_COUNTER, counter);
+
+		ti[IBUF_REC_OFFSET_TYPE] = (byte) op;
+		ti[IBUF_REC_OFFSET_FLAGS] = dict_table_is_comp(index->table)
+			? IBUF_REC_COMPACT : 0;
+		ti += IBUF_REC_INFO_SIZE;
+		break;
+	}
+
+	/* 5+) Fields from the entry */
+
+	for (i = 0; i < n_fields; i++) {
+		ulint			fixed_len;
+		const dict_field_t*	ifield;
+
+		field = dtuple_get_nth_field(tuple, i + IBUF_REC_FIELD_USER);
+		entry_field = dtuple_get_nth_field(entry, i);
+		dfield_copy(field, entry_field);
+
+		ifield = dict_index_get_nth_field(index, i);
+		ut_ad(!ifield->descending);
+		/* Prefix index columns of fixed-length columns are of
+		fixed length. However, in the function call below,
+		dfield_get_type(entry_field) contains the fixed length
+		of the column in the clustered index. Replace it with
+		the fixed length of the secondary index column.
*/
+		fixed_len = ifield->fixed_len;
+
+#ifdef UNIV_DEBUG
+		if (fixed_len) {
+			/* dict_index_add_col() should guarantee these */
+			ut_ad(fixed_len <= (ulint)
+			      dfield_get_type(entry_field)->len);
+			if (ifield->prefix_len) {
+				ut_ad(ifield->prefix_len == fixed_len);
+			} else {
+				ut_ad(fixed_len == (ulint)
+				      dfield_get_type(entry_field)->len);
+			}
+		}
+#endif /* UNIV_DEBUG */
+
+		dtype_new_store_for_order_and_null_size(
+			ti, dfield_get_type(entry_field), fixed_len);
+		ti += DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE;
+	}
+
+	/* 4) Type info, part #2 */
+
+	field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_METADATA);
+
+	dfield_set_data(field, type_info, ulint(ti - type_info));
+
+	/* Set all the types in the new tuple binary */
+
+	dtuple_set_types_binary(tuple, n_fields + IBUF_REC_FIELD_USER);
+
+	return(tuple);
+}
+
+/*********************************************************************//**
+Builds a search tuple used to search buffered inserts for an index page.
+This is for >= 4.1.x format records.
+@return own: search tuple */
+static
+dtuple_t*
+ibuf_search_tuple_build(
+/*====================*/
+	ulint		space,	/*!< in: space id */
+	ulint		page_no,/*!< in: index page number */
+	mem_heap_t*	heap)	/*!< in: heap into which to build */
+{
+	dtuple_t*	tuple;
+	dfield_t*	field;
+	byte*		buf;
+
+	tuple = dtuple_create(heap, IBUF_REC_FIELD_METADATA);
+
+	/* Store the space id in tuple */
+
+	field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_SPACE);
+
+	buf = static_cast<byte*>(mem_heap_alloc(heap, 4));
+
+	mach_write_to_4(buf, space);
+
+	dfield_set_data(field, buf, 4);
+
+	/* Store the new format record marker byte */
+
+	field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_MARKER);
+
+	buf = static_cast<byte*>(mem_heap_alloc(heap, 1));
+
+	mach_write_to_1(buf, 0);
+
+	dfield_set_data(field, buf, 1);
+
+	/* Store the page number in tuple */
+
+	field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_PAGE);
+
+	buf = static_cast<byte*>(mem_heap_alloc(heap, 4));
+
+	mach_write_to_4(buf, page_no);
+
+	dfield_set_data(field, buf, 4);
+
+	dtuple_set_types_binary(tuple, IBUF_REC_FIELD_METADATA);
+
+	return(tuple);
+}
+
+/*********************************************************************//**
+Checks if there are enough pages in the free list of the ibuf tree that we
+dare to start a pessimistic insert to the insert buffer.
+@return whether enough free pages in list */
+static inline bool ibuf_data_enough_free_for_insert()
+{
+	mysql_mutex_assert_owner(&ibuf_mutex);
+
+	/* We want a big margin of free pages, because a B-tree can sometimes
+	grow in size also if records are deleted from it, as the node pointers
+	can change, and we must make sure that we are able to delete the
+	inserts buffered for pages that we read to the buffer pool, without
+	any risk of running out of free space in the insert buffer. */
+
+	return(ibuf.free_list_len >= (ibuf.size / 2) + 3 * ibuf.height);
+}
+
+/*********************************************************************//**
+Checks if there are enough pages in the free list of the ibuf tree that we
+should remove them and free to the file space management.
+@return TRUE if enough free pages in list */
+UNIV_INLINE
+ibool
+ibuf_data_too_much_free(void)
+/*=========================*/
+{
+	mysql_mutex_assert_owner(&ibuf_mutex);
+
+	return(ibuf.free_list_len >= 3 + (ibuf.size / 2) + 3 * ibuf.height);
+}
+
+/** Allocate a change buffer page.
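+A worked example of the free-list thresholds above: with ibuf.size =
+2048 pages and ibuf.height = 3, ibuf_data_enough_free_for_insert()
+requires at least 2048/2 + 3*3 = 1033 free pages, while
+ibuf_data_too_much_free() starts returning pages to the tablespace at
+3 more, i.e. 1036; the gap keeps the two operations from flapping.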
+@retval true on success
+@retval false if no space left */
+static bool ibuf_add_free_page()
+{
+	mtr_t	mtr;
+	page_t*	header_page;
+	buf_block_t* block;
+
+	mtr.start();
+	/* Acquire the fsp latch before the ibuf header, obeying the latching
+	order */
+	mtr.x_lock_space(fil_system.sys_space);
+	header_page = ibuf_header_page_get(&mtr);
+	if (!header_page) {
+		mtr.commit();
+		return false;
+	}
+
+	/* Allocate a new page: NOTE that if the page has been a part of a
+	non-clustered index which has subsequently been dropped, then the
+	page may have buffered inserts in the insert buffer, and these
+	should be deleted from there. These get deleted when the page
+	allocation creates the page in buffer. Thus the call below may end
+	up calling the insert buffer routines and, as we yet have no latches
+	to insert buffer tree pages, these routines can run without a risk
+	of a deadlock. This is the reason why we created a special ibuf
+	header page apart from the ibuf tree. */
+
+	dberr_t err;
+	block = fseg_alloc_free_page_general(
+		header_page + IBUF_HEADER + IBUF_TREE_SEG_HEADER, 0, FSP_UP,
+		false, &mtr, &mtr, &err);
+
+	if (!block) {
+		mtr.commit();
+		return false;
+	}
+
+	ut_ad(block->page.lock.not_recursive());
+	ibuf_enter(&mtr);
+	mysql_mutex_lock(&ibuf_mutex);
+
+	mtr.write<2>(*block, block->page.frame + FIL_PAGE_TYPE,
+		     FIL_PAGE_IBUF_FREE_LIST);
+	buf_block_t* ibuf_root = ibuf_tree_root_get(&mtr);
+	if (UNIV_UNLIKELY(!ibuf_root)) {
+corrupted:
+		/* Do not bother to try to free the allocated block, because
+		the change buffer is seriously corrupted already. */
+		mysql_mutex_unlock(&ibuf_mutex);
+		ibuf_mtr_commit(&mtr);
+		return false;
+	}
+
+	/* Add the page to the free list and update the ibuf size data */
+
+	err = flst_add_last(ibuf_root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
+			    block, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE,
+			    &mtr);
+	if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+		goto corrupted;
+	}
+
+	/* Set the bit indicating that this page is now an ibuf tree page
+	(level 2 page) */
+
+	const page_id_t page_id(block->page.id());
+	buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(page_id, 0, &mtr);
+
+	if (UNIV_UNLIKELY(!bitmap_page)) {
+		goto corrupted;
+	}
+
+	ibuf.seg_size++;
+	ibuf.free_list_len++;
+
+	mysql_mutex_unlock(&ibuf_mutex);
+
+	ibuf_bitmap_page_set_bits<IBUF_BITMAP_IBUF>(bitmap_page, page_id,
+						    srv_page_size, true, &mtr);
+	ibuf_mtr_commit(&mtr);
+	return true;
+}
+
+/*********************************************************************//**
+Removes a page from the free list and frees it to the fsp system.
*/
+static void ibuf_remove_free_page()
+{
+	mtr_t	mtr;
+	mtr_t	mtr2;
+	page_t*	header_page;
+
+	log_free_check();
+
+	mtr_start(&mtr);
+	/* Acquire the fsp latch before the ibuf header, obeying the latching
+	order */
+
+	mtr.x_lock_space(fil_system.sys_space);
+	header_page = ibuf_header_page_get(&mtr);
+
+	/* Prevent pessimistic inserts to insert buffer trees for a while */
+	ibuf_enter(&mtr);
+	mysql_mutex_lock(&ibuf_pessimistic_insert_mutex);
+	mysql_mutex_lock(&ibuf_mutex);
+
+	if (!header_page || !ibuf_data_too_much_free()) {
+early_exit:
+		mysql_mutex_unlock(&ibuf_mutex);
+		mysql_mutex_unlock(&ibuf_pessimistic_insert_mutex);
+
+		ibuf_mtr_commit(&mtr);
+
+		return;
+	}
+
+	ibuf_mtr_start(&mtr2);
+
+	buf_block_t* root = ibuf_tree_root_get(&mtr2);
+
+	if (UNIV_UNLIKELY(!root)) {
+		ibuf_mtr_commit(&mtr2);
+		goto early_exit;
+	}
+
+	mysql_mutex_unlock(&ibuf_mutex);
+
+	const uint32_t page_no = flst_get_last(PAGE_HEADER
+					       + PAGE_BTR_IBUF_FREE_LIST
+					       + root->page.frame).page;
+
+	/* NOTE that we must release the latch on the ibuf tree root
+	because in fseg_free_page we access level 1 pages, and the root
+	is a level 2 page. */
+
+	ibuf_mtr_commit(&mtr2);
+	ibuf_exit(&mtr);
+
+	/* Since pessimistic inserts were prevented, we know that the
+	page is still in the free list. NOTE that also deletes may take
+	pages from the free list, but they take them from the start, and
+	the free list was so long that they cannot have taken the last
+	page from it. */
+
+	compile_time_assert(IBUF_SPACE_ID == 0);
+	const page_id_t	page_id{IBUF_SPACE_ID, page_no};
+	buf_block_t*	bitmap_page = nullptr;
+	dberr_t		err = fseg_free_page(
+		header_page + IBUF_HEADER + IBUF_TREE_SEG_HEADER,
+		fil_system.sys_space, page_no, &mtr);
+
+	if (err != DB_SUCCESS) {
+		goto func_exit;
+	}
+
+	ibuf_enter(&mtr);
+
+	mysql_mutex_lock(&ibuf_mutex);
+
+	root = ibuf_tree_root_get(&mtr, &err);
+	if (UNIV_UNLIKELY(!root)) {
+		mysql_mutex_unlock(&ibuf_pessimistic_insert_mutex);
+		goto func_exit;
+	}
+
+	ut_ad(page_no == flst_get_last(PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST
+				       + root->page.frame).page);
+
+	/* Remove the page from the free list and update the ibuf size data */
+	if (buf_block_t* block =
+	    buf_page_get_gen(page_id, 0, RW_X_LATCH, nullptr, BUF_GET,
+			     &mtr, &err)) {
+		err = flst_remove(root, PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
+				  block,
+				  PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE,
+				  &mtr);
+	}
+
+	mysql_mutex_unlock(&ibuf_pessimistic_insert_mutex);
+
+	if (err == DB_SUCCESS) {
+		ibuf.seg_size--;
+		ibuf.free_list_len--;
+		bitmap_page = ibuf_bitmap_get_map_page(page_id, 0, &mtr);
+	}
+
+func_exit:
+	mysql_mutex_unlock(&ibuf_mutex);
+
+	if (bitmap_page) {
+		/* Set the bit indicating that this page is no more an
+		ibuf tree page (level 2 page) */
+		ibuf_bitmap_page_set_bits<IBUF_BITMAP_IBUF>(
+			bitmap_page, page_id, srv_page_size, false, &mtr);
+	}
+
+	if (err == DB_SUCCESS) {
+		buf_page_free(fil_system.sys_space, page_no, &mtr);
+	}
+
+	ibuf_mtr_commit(&mtr);
+}
+
+/***********************************************************************//**
+Frees excess pages from the ibuf free list. This function is called when an OS
+thread calls fsp services to allocate a new file segment, or a new page to a
+file segment, and the thread did not own the fsp latch before this call.
*/ +void +ibuf_free_excess_pages(void) +/*========================*/ +{ + if (UNIV_UNLIKELY(!ibuf.index)) return; + /* Free at most a few pages at a time, so that we do not delay the + requested service too much */ + + for (ulint i = 0; i < 4; i++) { + + ibool too_much_free; + + mysql_mutex_lock(&ibuf_mutex); + too_much_free = ibuf_data_too_much_free(); + mysql_mutex_unlock(&ibuf_mutex); + + if (!too_much_free) { + return; + } + + ibuf_remove_free_page(); + } +} + +#ifdef UNIV_DEBUG +# define ibuf_get_merge_page_nos(rec,mtr,ids,pages,n_stored) \ + ibuf_get_merge_page_nos_func(rec,mtr,ids,pages,n_stored) +#else /* UNIV_DEBUG */ +# define ibuf_get_merge_page_nos(rec,mtr,ids,pages,n_stored) \ + ibuf_get_merge_page_nos_func(rec,ids,pages,n_stored) +#endif /* UNIV_DEBUG */ + +/*********************************************************************//** +Reads page numbers from a leaf in an ibuf tree. +@return a lower limit for the combined volume of records which will be +merged */ +static +ulint +ibuf_get_merge_page_nos_func( +/*=========================*/ + const rec_t* rec, /*!< in: insert buffer record */ +#ifdef UNIV_DEBUG + mtr_t* mtr, /*!< in: mini-transaction holding rec */ +#endif /* UNIV_DEBUG */ + uint32_t* space_ids,/*!< in/out: space id's of the pages */ + uint32_t* page_nos,/*!< in/out: buffer for at least + IBUF_MAX_N_PAGES_MERGED many page numbers; + the page numbers are in an ascending order */ + ulint* n_stored)/*!< out: number of page numbers stored to + page_nos in this function */ +{ + uint32_t prev_page_no; + uint32_t prev_space_id; + uint32_t first_page_no; + uint32_t first_space_id; + uint32_t rec_page_no; + uint32_t rec_space_id; + ulint sum_volumes; + ulint volume_for_page; + ulint rec_volume; + ulint limit; + ulint n_pages; + + ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX + | MTR_MEMO_PAGE_S_FIX)); + ut_ad(ibuf_inside(mtr)); + + *n_stored = 0; + + if (page_rec_is_supremum(rec)) { + + rec = page_rec_get_prev_const(rec); + if (UNIV_UNLIKELY(!rec)) { +corruption: + ut_ad("corrupted page" == 0); + return 0; + } + } + + if (page_rec_is_infimum(rec)) { + rec = page_rec_get_next_const(rec); + if (!rec || page_rec_is_supremum(rec)) { + return 0; + } + } + + limit = ut_min(IBUF_MAX_N_PAGES_MERGED, + buf_pool_get_curr_size() / 4); + + first_page_no = ibuf_rec_get_page_no(mtr, rec); + first_space_id = ibuf_rec_get_space(mtr, rec); + n_pages = 0; + prev_page_no = 0; + prev_space_id = 0; + + /* Go backwards from the first rec until we reach the border of the + 'merge area', or the page start or the limit of storeable pages is + reached */ + + while (!page_rec_is_infimum(rec) && UNIV_LIKELY(n_pages < limit)) { + + rec_page_no = ibuf_rec_get_page_no(mtr, rec); + rec_space_id = ibuf_rec_get_space(mtr, rec); + + if (rec_space_id != first_space_id + || (rec_page_no / IBUF_MERGE_AREA) + != (first_page_no / IBUF_MERGE_AREA)) { + + break; + } + + if (rec_page_no != prev_page_no + || rec_space_id != prev_space_id) { + n_pages++; + } + + prev_page_no = rec_page_no; + prev_space_id = rec_space_id; + + if (UNIV_UNLIKELY(!(rec = page_rec_get_prev_const(rec)))) { + goto corruption; + } + } + + rec = page_rec_get_next_const(rec); + + /* At the loop start there is no prev page; we mark this with a pair + of space id, page no (0, 0) for which there can never be entries in + the insert buffer */ + + prev_page_no = 0; + prev_space_id = 0; + sum_volumes = 0; + volume_for_page = 0; + + while (*n_stored < limit && rec) { + if (page_rec_is_supremum(rec)) { + /* When no more records 
available, mark this with + another 'impossible' pair of space id, page no */ + rec_page_no = 1; + rec_space_id = 0; + } else { + rec_page_no = ibuf_rec_get_page_no(mtr, rec); + rec_space_id = ibuf_rec_get_space(mtr, rec); + /* In the system tablespace the smallest + possible secondary index leaf page number is + bigger than FSP_DICT_HDR_PAGE_NO (7). + In all tablespaces, pages 0 and 1 are reserved + for the allocation bitmap and the change + buffer bitmap. In file-per-table tablespaces, + a file segment inode page will be created at + page 2 and the clustered index tree is created + at page 3. So for file-per-table tablespaces, + page 4 is the smallest possible secondary + index leaf page. CREATE TABLESPACE also initially + uses pages 2 and 3 for the first created table, + but that table may be dropped, allowing page 2 + to be reused for a secondary index leaf page. + To keep this assertion simple, just + make sure the page is >= 2. */ + ut_ad(rec_page_no >= FSP_FIRST_INODE_PAGE_NO); + } + +#ifdef UNIV_IBUF_DEBUG + ut_a(*n_stored < IBUF_MAX_N_PAGES_MERGED); +#endif + if ((rec_space_id != prev_space_id + || rec_page_no != prev_page_no) + && (prev_space_id != 0 || prev_page_no != 0)) { + + space_ids[*n_stored] = prev_space_id; + page_nos[*n_stored] = prev_page_no; + (*n_stored)++; + sum_volumes += volume_for_page; + + if (rec_space_id != first_space_id + || rec_page_no / IBUF_MERGE_AREA + != first_page_no / IBUF_MERGE_AREA) { + + break; + } + + volume_for_page = 0; + } + + if (rec_page_no == 1 && rec_space_id == 0) { + /* Supremum record */ + + break; + } + + rec_volume = ibuf_rec_get_volume(mtr, rec); + + volume_for_page += rec_volume; + + prev_page_no = rec_page_no; + prev_space_id = rec_space_id; + + rec = page_rec_get_next_const(rec); + } + +#ifdef UNIV_IBUF_DEBUG + ut_a(*n_stored <= IBUF_MAX_N_PAGES_MERGED); +#endif +#if 0 + fprintf(stderr, "Ibuf merge batch %lu pages %lu volume\n", + *n_stored, sum_volumes); +#endif + return(sum_volumes); +} + +/*******************************************************************//** +Get the matching records for space id. +@return current rec or NULL */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +const rec_t* +ibuf_get_user_rec( +/*===============*/ + btr_pcur_t* pcur, /*!< in: the current cursor */ + mtr_t* mtr) /*!< in: mini transaction */ +{ + do { + const rec_t* rec = btr_pcur_get_rec(pcur); + + if (page_rec_is_user_rec(rec)) { + return(rec); + } + } while (btr_pcur_move_to_next(pcur, mtr)); + + return(NULL); +} + +/*********************************************************************//** +Reads page numbers for a space id from an ibuf tree. 
+@return a lower limit for the combined volume of records which will be +merged */ +static MY_ATTRIBUTE((nonnull, warn_unused_result)) +ulint +ibuf_get_merge_pages( +/*=================*/ + btr_pcur_t* pcur, /*!< in/out: cursor */ + uint32_t space, /*!< in: space for which to merge */ + ulint limit, /*!< in: max page numbers to read */ + uint32_t* pages, /*!< out: pages read */ + uint32_t* spaces, /*!< out: spaces read */ + ulint* n_pages,/*!< out: number of pages read */ + mtr_t* mtr) /*!< in: mini transaction */ +{ + const rec_t* rec; + ulint volume = 0; + + *n_pages = 0; + + while ((rec = ibuf_get_user_rec(pcur, mtr)) != 0 + && ibuf_rec_get_space(mtr, rec) == space + && *n_pages < limit) { + + uint32_t page_no = ibuf_rec_get_page_no(mtr, rec); + + if (*n_pages == 0 || pages[*n_pages - 1] != page_no) { + spaces[*n_pages] = space; + pages[*n_pages] = page_no; + ++*n_pages; + } + + volume += ibuf_rec_get_volume(mtr, rec); + + btr_pcur_move_to_next(pcur, mtr); + } + + return(volume); +} + +/** +Delete a change buffer record. +@param[in] page_id page identifier +@param[in,out] pcur persistent cursor positioned on the record +@param[in] search_tuple search key for (space,page_no) +@param[in,out] mtr mini-transaction +@return whether mtr was committed (due to pessimistic operation) */ +static MY_ATTRIBUTE((warn_unused_result, nonnull)) +bool ibuf_delete_rec(const page_id_t page_id, btr_pcur_t* pcur, + const dtuple_t* search_tuple, mtr_t* mtr); + +/** Delete the change buffer records for the given page id +@param page_id page identifier */ +static void ibuf_delete_recs(const page_id_t page_id) +{ + if (!ibuf.index || srv_read_only_mode) + return; + dfield_t dfield[IBUF_REC_FIELD_METADATA]; + dtuple_t tuple {0,IBUF_REC_FIELD_METADATA,IBUF_REC_FIELD_METADATA, + dfield,0,nullptr +#ifdef UNIV_DEBUG + ,DATA_TUPLE_MAGIC_N +#endif + }; + byte space_id[4], page_no[4]; + + mach_write_to_4(space_id, page_id.space()); + mach_write_to_4(page_no, page_id.page_no()); + + dfield_set_data(&dfield[0], space_id, 4); + dfield_set_data(&dfield[1], field_ref_zero, 1); + dfield_set_data(&dfield[2], page_no, 4); + dtuple_set_types_binary(&tuple, IBUF_REC_FIELD_METADATA); + + mtr_t mtr; +loop: + btr_pcur_t pcur; + pcur.btr_cur.page_cur.index= ibuf.index; + ibuf_mtr_start(&mtr); + if (btr_pcur_open(&tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF, &pcur, &mtr)) + goto func_exit; + if (!btr_pcur_is_on_user_rec(&pcur)) + { + ut_ad(btr_pcur_is_after_last_on_page(&pcur)); + goto func_exit; + } + + for (;;) + { + ut_ad(btr_pcur_is_on_user_rec(&pcur)); + const rec_t* ibuf_rec = btr_pcur_get_rec(&pcur); + if (ibuf_rec_get_space(&mtr, ibuf_rec) != page_id.space() + || ibuf_rec_get_page_no(&mtr, ibuf_rec) != page_id.page_no()) + break; + /* Delete the record from ibuf */ + if (ibuf_delete_rec(page_id, &pcur, &tuple, &mtr)) + { + /* Deletion was pessimistic and mtr was committed: + we start from the beginning again */ + ut_ad(mtr.has_committed()); + goto loop; + } + + if (btr_pcur_is_after_last_on_page(&pcur)) + { + ibuf_mtr_commit(&mtr); + btr_pcur_close(&pcur); + goto loop; + } + } +func_exit: + ibuf_mtr_commit(&mtr); + btr_pcur_close(&pcur); +} + +/** Merge the change buffer to some pages. 
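+The pages are read in through the buffer pool; applying the buffered
+changes is a side effect of completing the read (see the scenario
+comment inside the function). A minimal caller sketch, with
+hypothetical space_id/page_no values:
+
+  uint32_t spaces[1] = {space_id};
+  uint32_t pages[1] = {page_no};
+  ibuf_read_merge_pages(spaces, pages, 1);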
*/ +static void ibuf_read_merge_pages(const uint32_t* space_ids, + const uint32_t* page_nos, ulint n_stored) +{ + for (ulint i = 0; i < n_stored; i++) { + const uint32_t space_id = space_ids[i]; + fil_space_t* s = fil_space_t::get(space_id); + if (!s) { +tablespace_deleted: + /* The tablespace was not found: remove all + entries for it */ + ibuf_delete_for_discarded_space(space_id); + while (i + 1 < n_stored + && space_ids[i + 1] == space_id) { + i++; + } + continue; + } + + const ulint zip_size = s->zip_size(), size = s->size; + s->x_lock(); + s->release(); + mtr_t mtr; + + if (UNIV_LIKELY(page_nos[i] < size)) { + mtr.start(); + dberr_t err; + buf_block_t *block = + buf_page_get_gen(page_id_t(space_id, page_nos[i]), + zip_size, RW_X_LATCH, nullptr, + BUF_GET_POSSIBLY_FREED, + &mtr, &err, true); + bool remove = !block + || fil_page_get_type(block->page.frame) + != FIL_PAGE_INDEX + || !page_is_leaf(block->page.frame); + mtr.commit(); + if (err == DB_TABLESPACE_DELETED) { + s->x_unlock(); + goto tablespace_deleted; + } + if (!remove) { + s->x_unlock(); + continue; + } + } + + s->x_unlock(); + + if (srv_shutdown_state == SRV_SHUTDOWN_NONE + || srv_fast_shutdown) { + continue; + } + + /* The following code works around a hang when the + change buffer is corrupted, likely due to the + failure of ibuf_merge_or_delete_for_page() to + invoke ibuf_delete_recs() if (!bitmap_bits). + + It also introduced corruption by itself in the + following scenario: + + (1) We merged buffered changes in buf_page_get_gen() + (2) We committed the mini-transaction + (3) Redo log and the page with the merged changes is written + (4) A write completion callback thread evicts the page. + (5) Other threads buffer changes for that page. + (6) We will wrongly discard those newly buffered changes below. + + To prevent this scenario, we will only invoke this code + on shutdown. A call to ibuf_max_size_update(0) will cause + ibuf_insert_low() to refuse to insert anything into the + change buffer. */ + + /* Prevent an infinite loop, by removing entries from + the change buffer in the case the bitmap bits were + wrongly clear even though buffered changes exist. */ + ibuf_delete_recs(page_id_t(space_id, page_nos[i])); + } +} + +/** Contract the change buffer by reading pages to the buffer pool. +@return a lower limit for the combined size in bytes of entries which +will be merged from ibuf trees to the pages read +@retval 0 if ibuf.empty */ +ATTRIBUTE_COLD ulint ibuf_contract() +{ + if (UNIV_UNLIKELY(!ibuf.index)) return 0; + mtr_t mtr; + btr_cur_t cur; + ulint sum_sizes; + uint32_t page_nos[IBUF_MAX_N_PAGES_MERGED]; + uint32_t space_ids[IBUF_MAX_N_PAGES_MERGED]; + + ibuf_mtr_start(&mtr); + + if (cur.open_leaf(true, ibuf.index, BTR_SEARCH_LEAF, &mtr) != + DB_SUCCESS) { + return 0; + } + + ut_ad(page_validate(btr_cur_get_page(&cur), ibuf.index)); + + if (page_is_empty(btr_cur_get_page(&cur))) { + /* If a B-tree page is empty, it must be the root page + and the whole B-tree must be empty. InnoDB does not + allow empty B-tree pages other than the root. 
*/
+	ut_ad(ibuf.empty);
+	ut_ad(btr_cur_get_block(&cur)->page.id()
+	      == page_id_t(IBUF_SPACE_ID, FSP_IBUF_TREE_ROOT_PAGE_NO));
+
+	ibuf_mtr_commit(&mtr);
+
+	return(0);
+	}
+
+	ulint n_pages = 0;
+	sum_sizes = ibuf_get_merge_page_nos(btr_cur_get_rec(&cur), &mtr,
+					    space_ids, page_nos, &n_pages);
+	ibuf_mtr_commit(&mtr);
+
+	ibuf_read_merge_pages(space_ids, page_nos, n_pages);
+
+	return(sum_sizes + 1);
+}
+
+/*********************************************************************//**
+Contracts insert buffer trees by reading pages referring to space_id
+to the buffer pool.
+@returns number of pages merged.*/
+ulint
+ibuf_merge_space(
+/*=============*/
+	ulint	space)	/*!< in: tablespace id to merge */
+{
+	if (UNIV_UNLIKELY(!ibuf.index)) return 0;
+	mtr_t		mtr;
+	btr_pcur_t	pcur;
+
+	dfield_t	dfield[IBUF_REC_FIELD_METADATA];
+	dtuple_t	tuple {0, IBUF_REC_FIELD_METADATA,
+		IBUF_REC_FIELD_METADATA,dfield,0,nullptr
+#ifdef UNIV_DEBUG
+		, DATA_TUPLE_MAGIC_N
+#endif
+	};
+	byte		space_id[4];
+
+	mach_write_to_4(space_id, space);
+
+	dfield_set_data(&dfield[0], space_id, 4);
+	dfield_set_data(&dfield[1], field_ref_zero, 1);
+	dfield_set_data(&dfield[2], field_ref_zero, 4);
+
+	dtuple_set_types_binary(&tuple, IBUF_REC_FIELD_METADATA);
+	ulint		n_pages = 0;
+
+	ut_ad(space < SRV_SPACE_ID_UPPER_BOUND);
+
+	log_free_check();
+	ibuf_mtr_start(&mtr);
+
+	/* Position the cursor on the first matching record. */
+
+	pcur.btr_cur.page_cur.index = ibuf.index;
+	dberr_t err = btr_pcur_open(&tuple, PAGE_CUR_GE, BTR_SEARCH_LEAF,
+				    &pcur, &mtr);
+	ut_ad(err != DB_SUCCESS || page_validate(btr_pcur_get_page(&pcur),
+						 ibuf.index));
+
+	ulint		sum_sizes = 0;
+	uint32_t	pages[IBUF_MAX_N_PAGES_MERGED];
+	uint32_t	spaces[IBUF_MAX_N_PAGES_MERGED];
+
+	if (err != DB_SUCCESS) {
+	} else if (page_is_empty(btr_pcur_get_page(&pcur))) {
+		/* If a B-tree page is empty, it must be the root page
+		and the whole B-tree must be empty. InnoDB does not
+		allow empty B-tree pages other than the root. */
+		ut_ad(ibuf.empty);
+		ut_ad(btr_pcur_get_block(&pcur)->page.id()
+		      == page_id_t(IBUF_SPACE_ID, FSP_IBUF_TREE_ROOT_PAGE_NO));
+	} else {
+
+		sum_sizes = ibuf_get_merge_pages(
+			&pcur, uint32_t(space), IBUF_MAX_N_PAGES_MERGED,
+			&pages[0], &spaces[0], &n_pages,
+			&mtr);
+		ib::info() << "Size of pages merged " << sum_sizes;
+	}
+
+	ibuf_mtr_commit(&mtr);
+
+	if (n_pages > 0) {
+		ut_ad(n_pages <= UT_ARR_SIZE(pages));
+
+#ifdef UNIV_DEBUG
+		for (ulint i = 0; i < n_pages; ++i) {
+			ut_ad(spaces[i] == space);
+		}
+#endif /* UNIV_DEBUG */
+
+		ibuf_read_merge_pages(spaces, pages, n_pages);
+	}
+
+	return(n_pages);
+}
+
+/** Determine if a change buffer record has been encountered already.
+@param rec change buffer record in the MySQL 5.5 format
+@param hash hash table of encountered records
+@param size number of elements in hash
+@retval true if a distinct record
+@retval false if this may be duplicating an earlier record */
+static bool ibuf_get_volume_buffered_hash(const rec_t *rec, ulint *hash,
+                                          ulint size)
+{
+  ut_ad(rec_get_n_fields_old(rec) > IBUF_REC_FIELD_USER);
+  const ulint start= rec_get_field_start_offs(rec, IBUF_REC_FIELD_USER);
+  const ulint len= rec_get_data_size_old(rec) - start;
+  const uint32_t fold= my_crc32c(0, rec + start, len);
+  hash+= (fold / (CHAR_BIT * sizeof *hash)) % size;
+  ulint bitmask= static_cast<ulint>(1) << (fold % (CHAR_BIT * sizeof(*hash)));
+
+  if (*hash & bitmask)
+    return false;
+
+  /* We have not seen this record yet. Remember it.
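+     Note that this works like a small Bloom filter: each record is
+     folded to a single bit, so a fresh record may be misreported as
+     already seen (and the caller then skips the *n_recs increment),
+     but a remembered record is never reported as new, which keeps
+     the result a valid lower bound. A standalone sketch of the same
+     idea (illustrative only):
+
+       bool seen_before(uint32_t fold, ulint* bits, ulint n_words)
+       {
+         ulint* word = bits + (fold / (CHAR_BIT * sizeof *bits)) % n_words;
+         ulint mask = static_cast<ulint>(1)
+                      << (fold % (CHAR_BIT * sizeof *bits));
+         if (*word & mask) return true;
+         *word |= mask;
+         return false;
+       }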
*/ + *hash|= bitmask; + return true; +} + +#ifdef UNIV_DEBUG +# define ibuf_get_volume_buffered_count(mtr,rec,hash,size,n_recs) \ + ibuf_get_volume_buffered_count_func(mtr,rec,hash,size,n_recs) +#else /* UNIV_DEBUG */ +# define ibuf_get_volume_buffered_count(mtr,rec,hash,size,n_recs) \ + ibuf_get_volume_buffered_count_func(rec,hash,size,n_recs) +#endif /* UNIV_DEBUG */ + +/*********************************************************************//** +Update the estimate of the number of records on a page, and +get the space taken by merging the buffered record to the index page. +@return size of index record in bytes + an upper limit of the space +taken in the page directory */ +static +ulint +ibuf_get_volume_buffered_count_func( +/*================================*/ +#ifdef UNIV_DEBUG + mtr_t* mtr, /*!< in: mini-transaction owning rec */ +#endif /* UNIV_DEBUG */ + const rec_t* rec, /*!< in: insert buffer record */ + ulint* hash, /*!< in/out: hash array */ + ulint size, /*!< in: number of elements in hash array */ + lint* n_recs) /*!< in/out: estimated number of records + on the page that rec points to */ +{ + ulint len; + ibuf_op_t ibuf_op; + const byte* types; + ulint n_fields; + + ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX + | MTR_MEMO_PAGE_S_FIX)); + ut_ad(ibuf_inside(mtr)); + + n_fields = rec_get_n_fields_old(rec); + ut_ad(n_fields > IBUF_REC_FIELD_USER); + n_fields -= IBUF_REC_FIELD_USER; + + rec_get_nth_field_offs_old(rec, 1, &len); + /* This function is only invoked when buffering new + operations. All pre-4.1 records should have been merged + when the database was started up. */ + ut_a(len == 1); + + if (rec_get_deleted_flag(rec, 0)) { + /* This record has been merged already, + but apparently the system crashed before + the change was discarded from the buffer. + Pretend that the record does not exist. */ + return(0); + } + + types = rec_get_nth_field_old(rec, IBUF_REC_FIELD_METADATA, &len); + + switch (UNIV_EXPECT(int(len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE), + IBUF_REC_INFO_SIZE)) { + default: + ut_error; + case 0: + /* This ROW_TYPE=REDUNDANT record does not include an + operation counter. Exclude it from the *n_recs, + because deletes cannot be buffered if there are + old-style inserts buffered for the page. */ + + len = ibuf_rec_get_size(rec, types, n_fields, 0); + + return(len + + rec_get_converted_extra_size(len, n_fields, 0) + + page_dir_calc_reserved_space(1)); + case 1: + /* This ROW_TYPE=COMPACT record does not include an + operation counter. Exclude it from the *n_recs, + because deletes cannot be buffered if there are + old-style inserts buffered for the page. */ + goto get_volume_comp; + + case IBUF_REC_INFO_SIZE: + ibuf_op = (ibuf_op_t) types[IBUF_REC_OFFSET_TYPE]; + break; + } + + switch (ibuf_op) { + case IBUF_OP_INSERT: + /* Inserts can be done by updating a delete-marked record. + Because delete-mark and insert operations can be pointing to + the same records, we must not count duplicates. */ + case IBUF_OP_DELETE_MARK: + /* There must be a record to delete-mark. + See if this record has been already buffered. */ + if (n_recs && ibuf_get_volume_buffered_hash(rec, hash, size)) { + (*n_recs)++; + } + + if (ibuf_op == IBUF_OP_DELETE_MARK) { + /* Setting the delete-mark flag does not + affect the available space on the page. */ + return(0); + } + break; + case IBUF_OP_DELETE: + /* A record will be removed from the page. 
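+	   This undoes the increment that the matching buffered
+	   IBUF_OP_INSERT or IBUF_OP_DELETE_MARK contributed above; for
+	   example, two distinct buffered inserts followed by one
+	   buffered delete leave *n_recs == 1, i.e. at least one record
+	   is still expected on the page after the merge.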
*/ + if (n_recs) { + (*n_recs)--; + } + /* While deleting a record actually frees up space, + we have to play it safe and pretend that it takes no + additional space (the record might not exist, etc.). */ + return(0); + default: + ut_error; + } + + ut_ad(ibuf_op == IBUF_OP_INSERT); + +get_volume_comp: + { + dtuple_t* entry; + ulint volume; + dict_index_t* dummy_index; + mem_heap_t* heap = mem_heap_create(500); + + entry = ibuf_build_entry_from_ibuf_rec( + mtr, rec, heap, &dummy_index); + + volume = rec_get_converted_size(dummy_index, entry, 0); + + ibuf_dummy_index_free(dummy_index); + mem_heap_free(heap); + + return(volume + page_dir_calc_reserved_space(1)); + } +} + +/*********************************************************************//** +Gets an upper limit for the combined size of entries buffered in the insert +buffer for a given page. +@return upper limit for the volume of buffered inserts for the index +page, in bytes; srv_page_size, if the entries for the index page span +several pages in the insert buffer */ +static +ulint +ibuf_get_volume_buffered( +/*=====================*/ + const btr_pcur_t*pcur, /*!< in: pcur positioned at a place in an + insert buffer tree where we would insert an + entry for the index page whose number is + page_no, latch mode has to be BTR_MODIFY_PREV + or BTR_MODIFY_TREE */ + ulint space, /*!< in: space id */ + ulint page_no,/*!< in: page number of an index page */ + lint* n_recs, /*!< in/out: minimum number of records on the + page after the buffered changes have been + applied, or NULL to disable the counting */ + mtr_t* mtr) /*!< in: mini-transaction of pcur */ +{ + ulint volume; + const rec_t* rec; + const page_t* page; + const page_t* prev_page; + const page_t* next_page; + /* bitmap of buffered recs */ + ulint hash_bitmap[128 / sizeof(ulint)]; + + ut_ad((pcur->latch_mode == BTR_MODIFY_PREV) + || (pcur->latch_mode == BTR_MODIFY_TREE)); + + /* Count the volume of inserts earlier in the alphabetical order than + pcur */ + + volume = 0; + + if (n_recs) { + memset(hash_bitmap, 0, sizeof hash_bitmap); + } + + rec = btr_pcur_get_rec(pcur); + page = page_align(rec); + ut_ad(page_validate(page, ibuf.index)); + + if (page_rec_is_supremum(rec) + && UNIV_UNLIKELY(!(rec = page_rec_get_prev_const(rec)))) { +corruption: + ut_ad("corrupted page" == 0); + return srv_page_size; + } + + uint32_t prev_page_no; + + for (; !page_rec_is_infimum(rec); ) { + ut_ad(page_align(rec) == page); + + if (page_no != ibuf_rec_get_page_no(mtr, rec) + || space != ibuf_rec_get_space(mtr, rec)) { + + goto count_later; + } + + volume += ibuf_get_volume_buffered_count( + mtr, rec, + hash_bitmap, UT_ARR_SIZE(hash_bitmap), n_recs); + + if (UNIV_UNLIKELY(!(rec = page_rec_get_prev_const(rec)))) { + goto corruption; + } + } + + /* Look at the previous page */ + + prev_page_no = btr_page_get_prev(page); + + if (prev_page_no == FIL_NULL) { + + goto count_later; + } + + if (buf_block_t* block = + buf_page_get(page_id_t(IBUF_SPACE_ID, prev_page_no), + 0, RW_X_LATCH, mtr)) { + prev_page = buf_block_get_frame(block); + ut_ad(page_validate(prev_page, ibuf.index)); + } else { + return srv_page_size; + } + + static_assert(FIL_PAGE_NEXT % 4 == 0, "alignment"); + static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment"); + + if (UNIV_UNLIKELY(memcmp_aligned<4>(prev_page + FIL_PAGE_NEXT, + page + FIL_PAGE_OFFSET, 4))) { + return srv_page_size; + } + + rec = page_rec_get_prev_const(page_get_supremum_rec(prev_page)); + + if (UNIV_UNLIKELY(!rec)) { + goto corruption; + } + + for (;;) { + ut_ad(page_align(rec) 
== prev_page); + + if (page_rec_is_infimum(rec)) { + + /* We cannot go to yet a previous page, because we + do not have the x-latch on it, and cannot acquire one + because of the latching order: we have to give up */ + + return(srv_page_size); + } + + if (page_no != ibuf_rec_get_page_no(mtr, rec) + || space != ibuf_rec_get_space(mtr, rec)) { + + goto count_later; + } + + volume += ibuf_get_volume_buffered_count( + mtr, rec, + hash_bitmap, UT_ARR_SIZE(hash_bitmap), n_recs); + + if (UNIV_UNLIKELY(!(rec = page_rec_get_prev_const(rec)))) { + goto corruption; + } + } + +count_later: + rec = btr_pcur_get_rec(pcur); + + if (!page_rec_is_supremum(rec)) { + rec = page_rec_get_next_const(rec); + } + + for (; !page_rec_is_supremum(rec); + rec = page_rec_get_next_const(rec)) { + if (UNIV_UNLIKELY(!rec)) { + return srv_page_size; + } + if (page_no != ibuf_rec_get_page_no(mtr, rec) + || space != ibuf_rec_get_space(mtr, rec)) { + + return(volume); + } + + volume += ibuf_get_volume_buffered_count( + mtr, rec, + hash_bitmap, UT_ARR_SIZE(hash_bitmap), n_recs); + } + + /* Look at the next page */ + + uint32_t next_page_no = btr_page_get_next(page); + + if (next_page_no == FIL_NULL) { + + return(volume); + } + + if (buf_block_t* block = + buf_page_get(page_id_t(IBUF_SPACE_ID, next_page_no), + 0, RW_X_LATCH, mtr)) { + next_page = buf_block_get_frame(block); + ut_ad(page_validate(next_page, ibuf.index)); + } else { + return srv_page_size; + } + + static_assert(FIL_PAGE_PREV % 4 == 0, "alignment"); + static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment"); + + if (UNIV_UNLIKELY(memcmp_aligned<4>(next_page + FIL_PAGE_PREV, + page + FIL_PAGE_OFFSET, 4))) { + return 0; + } + + rec = page_get_infimum_rec(next_page); + rec = page_rec_get_next_const(rec); + + for (; ; rec = page_rec_get_next_const(rec)) { + if (!rec || page_rec_is_supremum(rec)) { + /* We give up */ + return(srv_page_size); + } + + ut_ad(page_align(rec) == next_page); + + if (page_no != ibuf_rec_get_page_no(mtr, rec) + || space != ibuf_rec_get_space(mtr, rec)) { + + return(volume); + } + + volume += ibuf_get_volume_buffered_count( + mtr, rec, + hash_bitmap, UT_ARR_SIZE(hash_bitmap), n_recs); + } +} + +/*********************************************************************//** +Reads the biggest tablespace id from the high end of the insert buffer +tree and updates the counter in fil_system. 
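+Because the ibuf tree is keyed on (space id, marker, page number), the
+last user record in key order carries the largest buffered space id;
+the function opens the rightmost leaf and steps back once, roughly:
+
+  pcur.open_leaf(false, ibuf.index, BTR_SEARCH_LEAF, &mtr);
+  btr_pcur_move_to_prev(&pcur, &mtr);
+
+and then hands the value to fil_set_max_space_id_if_bigger(), so that
+tablespace ids assigned after a restart cannot collide with ids that
+still have buffered entries.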
*/ +void +ibuf_update_max_tablespace_id(void) +/*===============================*/ +{ + if (UNIV_UNLIKELY(!ibuf.index)) return; + const rec_t* rec; + const byte* field; + ulint len; + btr_pcur_t pcur; + mtr_t mtr; + + ut_ad(!ibuf.index->table->not_redundant()); + + ibuf_mtr_start(&mtr); + + if (pcur.open_leaf(false, ibuf.index, BTR_SEARCH_LEAF, &mtr) + != DB_SUCCESS) { +func_exit: + ibuf_mtr_commit(&mtr); + return; + } + + ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf.index)); + + if (!btr_pcur_move_to_prev(&pcur, &mtr) + || btr_pcur_is_before_first_on_page(&pcur)) { + goto func_exit; + } + + rec = btr_pcur_get_rec(&pcur); + + field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_SPACE, &len); + + ut_a(len == 4); + + const uint32_t max_space_id = mach_read_from_4(field); + + ibuf_mtr_commit(&mtr); + + /* printf("Maximum space id in insert buffer %lu\n", max_space_id); */ + + fil_set_max_space_id_if_bigger(max_space_id); +} + +#ifdef UNIV_DEBUG +# define ibuf_get_entry_counter_low(mtr,rec,space,page_no) \ + ibuf_get_entry_counter_low_func(mtr,rec,space,page_no) +#else /* UNIV_DEBUG */ +# define ibuf_get_entry_counter_low(mtr,rec,space,page_no) \ + ibuf_get_entry_counter_low_func(rec,space,page_no) +#endif +/****************************************************************//** +Helper function for ibuf_get_entry_counter_func. Checks if rec is for +(space, page_no), and if so, reads counter value from it and returns +that + 1. +@retval ULINT_UNDEFINED if the record does not contain any counter +@retval 0 if the record is not for (space, page_no) +@retval 1 + previous counter value, otherwise */ +static +ulint +ibuf_get_entry_counter_low_func( +/*============================*/ +#ifdef UNIV_DEBUG + mtr_t* mtr, /*!< in: mini-transaction of rec */ +#endif /* UNIV_DEBUG */ + const rec_t* rec, /*!< in: insert buffer record */ + ulint space, /*!< in: space id */ + ulint page_no) /*!< in: page number */ +{ + ulint counter; + const byte* field; + ulint len; + + ut_ad(ibuf_inside(mtr)); + ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX + | MTR_MEMO_PAGE_S_FIX)); + ut_ad(rec_get_n_fields_old(rec) > 2); + + field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_MARKER, &len); + + ut_a(len == 1); + + /* Check the tablespace identifier. */ + field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_SPACE, &len); + + ut_a(len == 4); + + if (mach_read_from_4(field) != space) { + + return(0); + } + + /* Check the page offset. */ + field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_PAGE, &len); + ut_a(len == 4); + + if (mach_read_from_4(field) != page_no) { + + return(0); + } + + /* Check if the record contains a counter field. 
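+	   The field length encodes this: each indexed column takes
+	   DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE (6) bytes of type info,
+	   and newer records prepend IBUF_REC_INFO_SIZE (4) bytes
+	   (2-byte counter, operation type, flags). Worked example,
+	   assuming those constant values: 3 columns with the info
+	   bytes give len = 4 + 3 * 6 = 22, and 22 % 6 == 4 ==
+	   IBUF_REC_INFO_SIZE, selecting the counter-bearing case
+	   below; remainders 0 and 1 mark the older counter-less
+	   formats.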
*/ + field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_METADATA, &len); + + switch (len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE) { + default: + ut_error; + case 0: /* ROW_FORMAT=REDUNDANT */ + case 1: /* ROW_FORMAT=COMPACT */ + return(ULINT_UNDEFINED); + + case IBUF_REC_INFO_SIZE: + counter = mach_read_from_2(field + IBUF_REC_OFFSET_COUNTER); + ut_a(counter < 0xFFFF); + return(counter + 1); + } +} + +#ifdef UNIV_DEBUG +# define ibuf_get_entry_counter(space,page_no,rec,mtr,exact_leaf) \ + ibuf_get_entry_counter_func(space,page_no,rec,mtr,exact_leaf) +#else /* UNIV_DEBUG */ +# define ibuf_get_entry_counter(space,page_no,rec,mtr,exact_leaf) \ + ibuf_get_entry_counter_func(space,page_no,rec,exact_leaf) +#endif /* UNIV_DEBUG */ + +/****************************************************************//** +Calculate the counter field for an entry based on the current +last record in ibuf for (space, page_no). +@return the counter field, or ULINT_UNDEFINED +if we should abort this insertion to ibuf */ +static +ulint +ibuf_get_entry_counter_func( +/*========================*/ + ulint space, /*!< in: space id of entry */ + ulint page_no, /*!< in: page number of entry */ + const rec_t* rec, /*!< in: the record preceding the + insertion point */ +#ifdef UNIV_DEBUG + mtr_t* mtr, /*!< in: mini-transaction */ +#endif /* UNIV_DEBUG */ + ibool only_leaf) /*!< in: TRUE if this is the only + leaf page that can contain entries + for (space,page_no), that is, there + was no exact match for (space,page_no) + in the node pointer */ +{ + ut_ad(ibuf_inside(mtr)); + ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX)); + ut_ad(page_validate(page_align(rec), ibuf.index)); + + if (page_rec_is_supremum(rec)) { + /* This is just for safety. The record should be a + page infimum or a user record. */ + ut_ad(0); + return(ULINT_UNDEFINED); + } else if (!page_rec_is_infimum(rec)) { + return(ibuf_get_entry_counter_low(mtr, rec, space, page_no)); + } else if (only_leaf || !page_has_prev(page_align(rec))) { + /* The parent node pointer did not contain the + searched for (space, page_no), which means that the + search ended on the correct page regardless of the + counter value, and since we're at the infimum record, + there are no existing records. */ + return(0); + } else { + /* We used to read the previous page here. It would + break the latching order, because the caller has + buffer-fixed an insert buffer bitmap page. */ + return(ULINT_UNDEFINED); + } +} + + +/** Translates the ibuf free bits to the free space on a page in bytes. +@param[in] physical_size page_size +@param[in] bits value for ibuf bitmap bits +@return maximum insert size after reorganize for the page */ +inline ulint +ibuf_index_page_calc_free_from_bits(ulint physical_size, ulint bits) +{ + ut_ad(bits < 4); + ut_ad(physical_size > IBUF_PAGE_SIZE_PER_FREE_SPACE); + + if (bits == 3) { + bits = 4; + } + + return bits * physical_size / IBUF_PAGE_SIZE_PER_FREE_SPACE; +} + +/** Buffer an operation in the insert/delete buffer, instead of doing it +directly to the disk page, if this is possible. 
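+Whether a buffered insert can fit is estimated from the change buffer
+bitmap free bits through ibuf_index_page_calc_free_from_bits() above.
+Worked example for the default 16KiB page: one bit step corresponds to
+16384 / IBUF_PAGE_SIZE_PER_FREE_SPACE (32) = 512 bytes, so bits == 2
+promises at least 1024 free bytes after reorganization, and bits == 3
+(counted as 4) at least 2048 bytes.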
+@param[in] mode BTR_MODIFY_PREV or BTR_INSERT_TREE +@param[in] op operation type +@param[in] no_counter TRUE=use 5.0.3 format; FALSE=allow delete +buffering +@param[in] entry index entry to insert +@param[in] entry_size rec_get_converted_size(index, entry) +@param[in,out] index index where to insert; must not be unique +or clustered +@param[in] page_id page id where to insert +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in,out] thr query thread +@return DB_SUCCESS, DB_STRONG_FAIL or other error */ +static TRANSACTIONAL_TARGET MY_ATTRIBUTE((warn_unused_result)) +dberr_t +ibuf_insert_low( + btr_latch_mode mode, + ibuf_op_t op, + ibool no_counter, + const dtuple_t* entry, + ulint entry_size, + dict_index_t* index, + const page_id_t page_id, + ulint zip_size, + que_thr_t* thr) +{ + big_rec_t* dummy_big_rec; + btr_pcur_t pcur; + btr_cur_t* cursor; + dtuple_t* ibuf_entry; + mem_heap_t* offsets_heap = NULL; + mem_heap_t* heap; + rec_offs* offsets = NULL; + ulint buffered; + lint min_n_recs; + rec_t* ins_rec; + buf_block_t* bitmap_page; + buf_block_t* block = NULL; + page_t* root; + dberr_t err; + mtr_t mtr; + mtr_t bitmap_mtr; + + ut_a(!dict_index_is_clust(index)); + ut_ad(!dict_index_is_spatial(index)); + ut_ad(dtuple_check_typed(entry)); + ut_ad(!no_counter || op == IBUF_OP_INSERT); + ut_ad(page_id.space() == index->table->space_id); + ut_a(op < IBUF_OP_COUNT); + + /* Perform dirty comparison of ibuf.max_size and ibuf.size to + reduce ibuf_mutex contention. */ + if (ibuf.size >= ibuf.max_size) { + return(DB_STRONG_FAIL); + } + + heap = mem_heap_create(1024); + + /* Build the entry which contains the space id and the page number + as the first fields and the type information for other fields, and + which will be inserted to the insert buffer. Using a counter value + of 0xFFFF we find the last record for (space, page_no), from which + we can then read the counter value N and use N + 1 in the record we + insert. (We patch the ibuf_entry's counter field to the correct + value just before actually inserting the entry.) */ + + ibuf_entry = ibuf_entry_build( + op, index, entry, page_id.space(), page_id.page_no(), + no_counter ? ULINT_UNDEFINED : 0xFFFF, heap); + + /* Open a cursor to the insert buffer tree to calculate if we can add + the new entry to it without exceeding the free space limit for the + page. */ + + if (mode == BTR_INSERT_TREE) { + for (;;) { + mysql_mutex_lock(&ibuf_pessimistic_insert_mutex); + mysql_mutex_lock(&ibuf_mutex); + + if (UNIV_LIKELY(ibuf_data_enough_free_for_insert())) { + + break; + } + + mysql_mutex_unlock(&ibuf_mutex); + mysql_mutex_unlock(&ibuf_pessimistic_insert_mutex); + + if (!ibuf_add_free_page()) { + + mem_heap_free(heap); + return(DB_STRONG_FAIL); + } + } + } + + ibuf_mtr_start(&mtr); + pcur.btr_cur.page_cur.index = ibuf.index; + + err = btr_pcur_open(ibuf_entry, PAGE_CUR_LE, mode, &pcur, &mtr); + if (err != DB_SUCCESS) { +func_exit: + ibuf_mtr_commit(&mtr); + ut_free(pcur.old_rec_buf); + mem_heap_free(heap); + return err; + } + + ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf.index)); + + /* Find out the volume of already buffered inserts for the same index + page */ + min_n_recs = 0; + buffered = ibuf_get_volume_buffered(&pcur, + page_id.space(), + page_id.page_no(), + op == IBUF_OP_DELETE + ? &min_n_recs + : NULL, &mtr); + + const ulint physical_size = zip_size ? 
zip_size : srv_page_size; + + if (op == IBUF_OP_DELETE + && (min_n_recs < 2 || buf_pool.watch_occurred(page_id))) { + /* The page could become empty after the record is + deleted, or the page has been read in to the buffer + pool. Refuse to buffer the operation. */ + + /* The buffer pool watch is needed for IBUF_OP_DELETE + because of latching order considerations. We can + check buf_pool_watch_occurred() only after latching + the insert buffer B-tree pages that contain buffered + changes for the page. We never buffer IBUF_OP_DELETE, + unless some IBUF_OP_INSERT or IBUF_OP_DELETE_MARK have + been previously buffered for the page. Because there + are buffered operations for the page, the insert + buffer B-tree page latches held by mtr will guarantee + that no changes for the user page will be merged + before mtr_commit(&mtr). We must not mtr_commit(&mtr) + until after the IBUF_OP_DELETE has been buffered. */ + +fail_exit: + if (mode == BTR_INSERT_TREE) { + mysql_mutex_unlock(&ibuf_mutex); + mysql_mutex_unlock(&ibuf_pessimistic_insert_mutex); + } + + err = DB_STRONG_FAIL; + goto func_exit; + } + + /* After this point, the page could still be loaded to the + buffer pool, but we do not have to care about it, since we are + holding a latch on the insert buffer leaf page that contains + buffered changes for (space, page_no). If the page enters the + buffer pool, buf_page_t::read_complete() for (space, page_no) will + have to acquire a latch on the same insert buffer leaf page, + which it cannot do until we have buffered the IBUF_OP_DELETE + and done mtr_commit(&mtr) to release the latch. */ + + ibuf_mtr_start(&bitmap_mtr); + + bitmap_page = ibuf_bitmap_get_map_page(page_id, zip_size, &bitmap_mtr); + + /* We check if the index page is suitable for buffered entries */ + + if (!bitmap_page || buf_pool.page_hash_contains( + page_id, buf_pool.page_hash.cell_get(page_id.fold()))) { +commit_exit: + ibuf_mtr_commit(&bitmap_mtr); + goto fail_exit; + } else if (!lock_sys.rd_lock_try()) { + goto commit_exit; + } else { + hash_cell_t* cell = lock_sys.rec_hash.cell_get(page_id.fold()); + lock_sys.rec_hash.latch(cell)->acquire(); + const lock_t* lock = lock_sys_t::get_first(*cell, page_id); + lock_sys.rec_hash.latch(cell)->release(); + lock_sys.rd_unlock(); + if (lock) { + goto commit_exit; + } + } + + if (op == IBUF_OP_INSERT) { + ulint bits = ibuf_bitmap_page_get_bits( + bitmap_page->page.frame, page_id, physical_size, + IBUF_BITMAP_FREE, &bitmap_mtr); + + if (buffered + entry_size + page_dir_calc_reserved_space(1) + > ibuf_index_page_calc_free_from_bits(physical_size, + bits)) { + /* Release the bitmap page latch early. */ + ibuf_mtr_commit(&bitmap_mtr); + goto fail_exit; + } + } + + if (!no_counter) { + /* Patch correct counter value to the entry to + insert. This can change the insert position, which can + result in the need to abort in some cases. 
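+	   The entry was built with the maximum counter 0xFFFF, so the
+	   PAGE_CUR_LE search above lands after every existing entry
+	   for (space, page_no); the real value is the last buffered
+	   counter plus one. For example, if the preceding record for
+	   this page carries counter 41, the bytes below are rewritten
+	   so that the entry is inserted with counter 42.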
*/ + ulint counter = ibuf_get_entry_counter( + page_id.space(), page_id.page_no(), + btr_pcur_get_rec(&pcur), &mtr, + btr_pcur_get_btr_cur(&pcur)->low_match + < IBUF_REC_FIELD_METADATA); + dfield_t* field; + + if (counter == ULINT_UNDEFINED) { + goto commit_exit; + } + + field = dtuple_get_nth_field( + ibuf_entry, IBUF_REC_FIELD_METADATA); + mach_write_to_2( + (byte*) dfield_get_data(field) + + IBUF_REC_OFFSET_COUNTER, counter); + } + + /* Set the bitmap bit denoting that the insert buffer contains + buffered entries for this index page, if the bit is not set yet */ + index->set_modified(bitmap_mtr); + ibuf_bitmap_page_set_bits( + bitmap_page, page_id, physical_size, true, &bitmap_mtr); + ibuf_mtr_commit(&bitmap_mtr); + + cursor = btr_pcur_get_btr_cur(&pcur); + + if (mode == BTR_MODIFY_PREV) { + err = btr_cur_optimistic_insert( + BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG, + cursor, &offsets, &offsets_heap, + ibuf_entry, &ins_rec, + &dummy_big_rec, 0, thr, &mtr); + block = btr_cur_get_block(cursor); + ut_ad(block->page.id().space() == IBUF_SPACE_ID); + + /* If this is the root page, update ibuf.empty. */ + if (block->page.id().page_no() == FSP_IBUF_TREE_ROOT_PAGE_NO) { + const page_t* root = buf_block_get_frame(block); + + ut_ad(page_get_space_id(root) == IBUF_SPACE_ID); + ut_ad(page_get_page_no(root) + == FSP_IBUF_TREE_ROOT_PAGE_NO); + + ibuf.empty = page_is_empty(root); + } + } else { + ut_ad(mode == BTR_INSERT_TREE); + + /* We acquire an sx-latch to the root page before the insert, + because a pessimistic insert releases the tree x-latch, + which would cause the sx-latching of the root after that to + break the latching order. */ + if (buf_block_t* ibuf_root = ibuf_tree_root_get(&mtr)) { + root = ibuf_root->page.frame; + } else { + err = DB_CORRUPTION; + mysql_mutex_unlock(&ibuf_pessimistic_insert_mutex); + mysql_mutex_unlock(&ibuf_mutex); + goto ibuf_insert_done; + } + + err = btr_cur_optimistic_insert( + BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG, + cursor, &offsets, &offsets_heap, + ibuf_entry, &ins_rec, + &dummy_big_rec, 0, thr, &mtr); + + if (err == DB_FAIL) { + err = btr_cur_pessimistic_insert( + BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG, + cursor, &offsets, &offsets_heap, + ibuf_entry, &ins_rec, + &dummy_big_rec, 0, thr, &mtr); + } + + mysql_mutex_unlock(&ibuf_pessimistic_insert_mutex); + ibuf_size_update(root); + mysql_mutex_unlock(&ibuf_mutex); + ibuf.empty = page_is_empty(root); + + block = btr_cur_get_block(cursor); + ut_ad(block->page.id().space() == IBUF_SPACE_ID); + } + +ibuf_insert_done: + if (offsets_heap) { + mem_heap_free(offsets_heap); + } + + if (err == DB_SUCCESS && op != IBUF_OP_DELETE) { + /* Update the page max trx id field */ + page_update_max_trx_id(block, NULL, + thr_get_trx(thr)->id, &mtr); + } + + goto func_exit; +} + +/** Buffer an operation in the change buffer, instead of applying it +directly to the file page, if this is possible. Does not do it if the index +is clustered or unique. 
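+A caller-side sketch (hypothetical identifiers; the real callers are in
+the B-tree cursor code, which consults this when a secondary index leaf
+page is not in the buffer pool):
+
+  if (ibuf_insert(IBUF_OP_INSERT, entry, index,
+                  page_id_t(space_id, page_no), zip_size, thr)) {
+    // buffered; the secondary index leaf page is left untouched
+  }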
+@param[in] op operation type +@param[in] entry index entry to insert +@param[in,out] index index where to insert +@param[in] page_id page id where to insert +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in,out] thr query thread +@return true if success */ +TRANSACTIONAL_TARGET +bool +ibuf_insert( + ibuf_op_t op, + const dtuple_t* entry, + dict_index_t* index, + const page_id_t page_id, + ulint zip_size, + que_thr_t* thr) +{ + if (!index->is_committed()) { + return false; + } + + dberr_t err; + ulint entry_size; + ibool no_counter; + /* Read the settable global variable only once in + this function, so that we will have a consistent view of it. */ + ibuf_use_t use = ibuf_use_t(innodb_change_buffering); + DBUG_ENTER("ibuf_insert"); + + DBUG_PRINT("ibuf", ("op: %d, space: " UINT32PF ", page_no: " UINT32PF, + op, page_id.space(), page_id.page_no())); + + ut_ad(dtuple_check_typed(entry)); + ut_ad(page_id.space() != SRV_TMP_SPACE_ID); + ut_ad(index->is_btree()); + ut_a(!dict_index_is_clust(index)); + ut_ad(!index->table->is_temporary()); + + no_counter = use <= IBUF_USE_INSERT; + + switch (op) { + case IBUF_OP_INSERT: + switch (use) { + case IBUF_USE_NONE: + case IBUF_USE_DELETE: + case IBUF_USE_DELETE_MARK: + DBUG_RETURN(false); + case IBUF_USE_INSERT: + case IBUF_USE_INSERT_DELETE_MARK: + case IBUF_USE_ALL: + goto check_watch; + } + break; + case IBUF_OP_DELETE_MARK: + switch (use) { + case IBUF_USE_NONE: + case IBUF_USE_INSERT: + DBUG_RETURN(false); + case IBUF_USE_DELETE_MARK: + case IBUF_USE_DELETE: + case IBUF_USE_INSERT_DELETE_MARK: + case IBUF_USE_ALL: + ut_ad(!no_counter); + goto check_watch; + } + break; + case IBUF_OP_DELETE: + switch (use) { + case IBUF_USE_NONE: + case IBUF_USE_INSERT: + case IBUF_USE_INSERT_DELETE_MARK: + DBUG_RETURN(false); + case IBUF_USE_DELETE_MARK: + case IBUF_USE_DELETE: + case IBUF_USE_ALL: + ut_ad(!no_counter); + goto skip_watch; + } + break; + case IBUF_OP_COUNT: + break; + } + + /* unknown op or use */ + ut_error; + +check_watch: + /* If a thread attempts to buffer an insert on a page while a + purge is in progress on the same page, the purge must not be + buffered, because it could remove a record that was + re-inserted later. For simplicity, we block the buffering of + all operations on a page that has a purge pending. + + We do not check this in the IBUF_OP_DELETE case, because that + would always trigger the buffer pool watch during purge and + thus prevent the buffering of delete operations. We assume + that the issuer of IBUF_OP_DELETE has called + buf_pool_t::watch_set(). */ + + if (buf_pool.page_hash_contains( + page_id, buf_pool.page_hash.cell_get(page_id.fold()))) { + /* A buffer pool watch has been set or the + page has been read into the buffer pool. + Do not buffer the request. If a purge operation + is being buffered, have this request executed + directly on the page in the buffer pool after the + buffered entries for this page have been merged. 
*/ + DBUG_RETURN(false); + } + +skip_watch: + entry_size = rec_get_converted_size(index, entry, 0); + + if (entry_size + >= page_get_free_space_of_empty(dict_table_is_comp(index->table)) + / 2) { + + DBUG_RETURN(false); + } + + err = ibuf_insert_low(BTR_MODIFY_PREV, op, no_counter, + entry, entry_size, + index, page_id, zip_size, thr); + if (err == DB_FAIL) { + err = ibuf_insert_low(BTR_INSERT_TREE, + op, no_counter, entry, entry_size, + index, page_id, zip_size, thr); + } + + ut_a(err == DB_SUCCESS || err == DB_STRONG_FAIL + || err == DB_TOO_BIG_RECORD); + + DBUG_RETURN(err == DB_SUCCESS); +} + +MY_ATTRIBUTE((nonnull, warn_unused_result)) +/********************************************************************//** +During merge, inserts to an index page a secondary index entry extracted +from the insert buffer. +@return error code */ +static +dberr_t +ibuf_insert_to_index_page_low( +/*==========================*/ + const dtuple_t* entry, /*!< in: buffered entry to insert */ + rec_offs** offsets,/*!< out: offsets on *rec */ + mem_heap_t* heap, /*!< in/out: memory heap */ + mtr_t* mtr, /*!< in/out: mtr */ + page_cur_t* page_cur)/*!< in/out: cursor positioned on the record + after which to insert the buffered entry */ +{ + if (page_cur_tuple_insert(page_cur, entry, offsets, &heap, 0, mtr)) + return DB_SUCCESS; + + /* Page reorganization or recompression should already have been + attempted by page_cur_tuple_insert(). Besides, per + ibuf_index_page_calc_free_zip() the page should not have been + recompressed or reorganized. */ + ut_ad(!is_buf_block_get_page_zip(page_cur->block)); + + /* If the record did not fit, reorganize */ + if (dberr_t err= btr_page_reorganize(page_cur, mtr)) + return err; + + /* This time the record must fit */ + if (page_cur_tuple_insert(page_cur, entry, offsets, &heap, 0, mtr)) + return DB_SUCCESS; + + return DB_CORRUPTION; +} + +/************************************************************************ +During merge, inserts to an index page a secondary index entry extracted +from the insert buffer. */ +static +dberr_t +ibuf_insert_to_index_page( +/*======================*/ + const dtuple_t* entry, /*!< in: buffered entry to insert */ + buf_block_t* block, /*!< in/out: index page where the buffered entry + should be placed */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_cur_t page_cur; + page_t* page = buf_block_get_frame(block); + rec_t* rec; + rec_offs* offsets; + mem_heap_t* heap; + + DBUG_PRINT("ibuf", ("page " UINT32PF ":" UINT32PF, + block->page.id().space(), + block->page.id().page_no())); + + ut_ad(!dict_index_is_online_ddl(index));// this is an ibuf_dummy index + ut_ad(ibuf_inside(mtr)); + ut_ad(dtuple_check_typed(entry)); +#ifdef BTR_CUR_HASH_ADAPT + /* A change buffer merge must occur before users are granted + any access to the page. No adaptive hash index entries may + point to a freshly read page. 
*/ + ut_ad(!block->index); + assert_block_ahi_empty(block); +#endif /* BTR_CUR_HASH_ADAPT */ + ut_ad(mtr->is_named_space(block->page.id().space())); + + if (UNIV_UNLIKELY(dict_table_is_comp(index->table) + != (ibool)!!page_is_comp(page))) { + return DB_CORRUPTION; + } + + rec = page_rec_get_next(page_get_infimum_rec(page)); + + if (!rec || page_rec_is_supremum(rec)) { + return DB_CORRUPTION; + } + + if (!rec_n_fields_is_sane(index, rec, entry)) { + return DB_CORRUPTION; + } + + ulint up_match = 0, low_match = 0; + page_cur.index = index; + page_cur.block = block; + + if (page_cur_search_with_match(entry, PAGE_CUR_LE, + &up_match, &low_match, &page_cur, + nullptr)) { + return DB_CORRUPTION; + } + + dberr_t err = DB_SUCCESS; + + heap = mem_heap_create( + sizeof(upd_t) + + REC_OFFS_HEADER_SIZE * sizeof(*offsets) + + dtuple_get_n_fields(entry) + * (sizeof(upd_field_t) + sizeof *offsets)); + + if (UNIV_UNLIKELY(low_match == dtuple_get_n_fields(entry))) { + upd_t* update; + + rec = page_cur_get_rec(&page_cur); + + /* This is based on + row_ins_sec_index_entry_by_modify(BTR_MODIFY_LEAF). */ + ut_ad(rec_get_deleted_flag(rec, page_is_comp(page))); + + offsets = rec_get_offsets(rec, index, NULL, index->n_fields, + ULINT_UNDEFINED, &heap); + update = row_upd_build_sec_rec_difference_binary( + rec, index, offsets, entry, heap); + + if (update->n_fields == 0) { + /* The records only differ in the delete-mark. + Clear the delete-mark, like we did before + Bug #56680 was fixed. */ + btr_rec_set_deleted(block, rec, mtr); + goto updated_in_place; + } + + /* Copy the info bits. Clear the delete-mark. */ + update->info_bits = rec_get_info_bits(rec, page_is_comp(page)); + update->info_bits &= byte(~REC_INFO_DELETED_FLAG); + page_zip_des_t* page_zip = buf_block_get_page_zip(block); + + /* We cannot invoke btr_cur_optimistic_update() here, + because we do not have a btr_cur_t or que_thr_t, + as the insert buffer merge occurs at a very low level. */ + if (!row_upd_changes_field_size_or_external(index, offsets, + update) + && (!page_zip || btr_cur_update_alloc_zip( + page_zip, &page_cur, offsets, + rec_offs_size(offsets), false, mtr))) { + /* This is the easy case. Do something similar + to btr_cur_update_in_place(). */ + rec = page_cur_get_rec(&page_cur); + btr_cur_upd_rec_in_place(rec, index, offsets, + update, block, mtr); + + DBUG_EXECUTE_IF( + "crash_after_log_ibuf_upd_inplace", + log_buffer_flush_to_disk(); + ib::info() << "Wrote log record for ibuf" + " update in place operation"; + DBUG_SUICIDE(); + ); + + goto updated_in_place; + } + + /* btr_cur_update_alloc_zip() may have changed this */ + rec = page_cur_get_rec(&page_cur); + + /* A collation may identify values that differ in + storage length. + Some examples (1 or 2 bytes): + utf8_turkish_ci: I = U+0131 LATIN SMALL LETTER DOTLESS I + utf8_general_ci: S = U+00DF LATIN SMALL LETTER SHARP S + utf8_general_ci: A = U+00E4 LATIN SMALL LETTER A WITH DIAERESIS + + latin1_german2_ci: SS = U+00DF LATIN SMALL LETTER SHARP S + + Examples of a character (3-byte UTF-8 sequence) + identified with 2 or 4 characters (1-byte UTF-8 sequences): + + utf8_unicode_ci: 'II' = U+2171 SMALL ROMAN NUMERAL TWO + utf8_unicode_ci: '(10)' = U+247D PARENTHESIZED NUMBER TEN + */ + + /* Delete the different-length record, and insert the + buffered one. 
*/ + + page_cur_delete_rec(&page_cur, offsets, mtr); + if (!(page_cur_move_to_prev(&page_cur))) { + err = DB_CORRUPTION; + goto updated_in_place; + } + } else { + offsets = NULL; + } + + err = ibuf_insert_to_index_page_low(entry, &offsets, heap, mtr, + &page_cur); +updated_in_place: + mem_heap_free(heap); + + return err; +} + +/****************************************************************//** +During merge, sets the delete mark on a record for a secondary index +entry. */ +static +void +ibuf_set_del_mark( +/*==============*/ + const dtuple_t* entry, /*!< in: entry */ + buf_block_t* block, /*!< in/out: block */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ +{ + page_cur_t page_cur; + page_cur.block = block; + page_cur.index = index; + ulint up_match = 0, low_match = 0; + + ut_ad(ibuf_inside(mtr)); + ut_ad(dtuple_check_typed(entry)); + + if (!page_cur_search_with_match(entry, PAGE_CUR_LE, + &up_match, &low_match, &page_cur, + nullptr) + && low_match == dtuple_get_n_fields(entry)) { + rec_t* rec = page_cur_get_rec(&page_cur); + + /* Delete mark the old index record. According to a + comment in row_upd_sec_index_entry(), it can already + have been delete marked if a lock wait occurred in + row_ins_sec_index_entry() in a previous invocation of + row_upd_sec_index_entry(). */ + + if (UNIV_LIKELY + (!rec_get_deleted_flag( + rec, dict_table_is_comp(index->table)))) { + btr_rec_set_deleted(block, rec, mtr); + } + } else { + const page_t* page + = page_cur_get_page(&page_cur); + const buf_block_t* block + = page_cur_get_block(&page_cur); + + ib::error() << "Unable to find a record to delete-mark"; + fputs("InnoDB: tuple ", stderr); + dtuple_print(stderr, entry); + fputs("\n" + "InnoDB: record ", stderr); + rec_print(stderr, page_cur_get_rec(&page_cur), index); + + ib::error() << "page " << block->page.id() << " (" + << page_get_n_recs(page) << " records, index id " + << btr_page_get_index_id(page) << ")."; + + ib::error() << BUG_REPORT_MSG; + ut_ad(0); + } +} + +/****************************************************************//** +During merge, delete a record for a secondary index entry. */ +static +void +ibuf_delete( +/*========*/ + const dtuple_t* entry, /*!< in: entry */ + buf_block_t* block, /*!< in/out: block */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in/out: mtr; must be committed + before latching any further pages */ +{ + page_cur_t page_cur; + page_cur.block = block; + page_cur.index = index; + ulint up_match = 0, low_match = 0; + + ut_ad(ibuf_inside(mtr)); + ut_ad(dtuple_check_typed(entry)); + ut_ad(!index->is_spatial()); + ut_ad(!index->is_clust()); + + if (!page_cur_search_with_match(entry, PAGE_CUR_LE, + &up_match, &low_match, &page_cur, + nullptr) + && low_match == dtuple_get_n_fields(entry)) { + page_zip_des_t* page_zip= buf_block_get_page_zip(block); + page_t* page = buf_block_get_frame(block); + rec_t* rec = page_cur_get_rec(&page_cur); + + /* TODO: the below should probably be a separate function, + it's a bastardized version of btr_cur_optimistic_delete. */ + + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + mem_heap_t* heap = NULL; + ulint max_ins_size = 0; + + rec_offs_init(offsets_); + + offsets = rec_get_offsets(rec, index, offsets, index->n_fields, + ULINT_UNDEFINED, &heap); + + if (page_get_n_recs(page) <= 1 + || !(REC_INFO_DELETED_FLAG + & rec_get_info_bits(rec, page_is_comp(page)))) { + /* Refuse to purge the last record or a + record that has not been marked for deletion. 
*/ + ib::error() << "Unable to purge a record"; + fputs("InnoDB: tuple ", stderr); + dtuple_print(stderr, entry); + fputs("\n" + "InnoDB: record ", stderr); + rec_print_new(stderr, rec, offsets); + fprintf(stderr, "\nspace " UINT32PF " offset " UINT32PF + " (%u records, index id %llu)\n" + "InnoDB: Submit a detailed bug report" + " to https://jira.mariadb.org/\n", + block->page.id().space(), + block->page.id().page_no(), + (unsigned) page_get_n_recs(page), + (ulonglong) btr_page_get_index_id(page)); + + ut_ad(0); + return; + } + + if (!page_zip) { + max_ins_size + = page_get_max_insert_size_after_reorganize( + page, 1); + } +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + page_cur_delete_rec(&page_cur, offsets, mtr); +#ifdef UNIV_ZIP_DEBUG + ut_a(!page_zip || page_zip_validate(page_zip, page, index)); +#endif /* UNIV_ZIP_DEBUG */ + + if (page_zip) { + ibuf_update_free_bits_zip(block, mtr); + } else { + ibuf_update_free_bits_low(block, max_ins_size, mtr); + } + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + } +} + +/*********************************************************************//** +Restores insert buffer tree cursor position +@return whether the position was restored */ +static MY_ATTRIBUTE((nonnull)) +bool +ibuf_restore_pos( +/*=============*/ + const page_id_t page_id,/*!< in: page identifier */ + const dtuple_t* search_tuple, + /*!< in: search tuple for entries of page_no */ + btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF or BTR_PURGE_TREE */ + btr_pcur_t* pcur, /*!< in/out: persistent cursor whose + position is to be restored */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + if (UNIV_LIKELY(pcur->restore_position(mode, mtr) == + btr_pcur_t::SAME_ALL)) { + return true; + } + + if (fil_space_t* s = fil_space_t::get(page_id.space())) { + ib::error() << "ibuf cursor restoration fails!" + " ibuf record inserted to page " + << page_id + << " in file " << s->chain.start->name; + s->release(); + + ib::error() << BUG_REPORT_MSG; + + rec_print_old(stderr, btr_pcur_get_rec(pcur)); + rec_print_old(stderr, pcur->old_rec); + dtuple_print(stderr, search_tuple); + } + + ibuf_btr_pcur_commit_specify_mtr(pcur, mtr); + return false; +} + +/** +Delete a change buffer record. +@param[in] page_id page identifier +@param[in,out] pcur persistent cursor positioned on the record +@param[in] search_tuple search key for (space,page_no) +@param[in,out] mtr mini-transaction +@return whether mtr was committed (due to pessimistic operation) */ +static MY_ATTRIBUTE((warn_unused_result, nonnull)) +bool ibuf_delete_rec(const page_id_t page_id, btr_pcur_t* pcur, + const dtuple_t* search_tuple, mtr_t* mtr) +{ + dberr_t err; + + ut_ad(ibuf_inside(mtr)); + ut_ad(page_rec_is_user_rec(btr_pcur_get_rec(pcur))); + ut_ad(ibuf_rec_get_page_no(mtr, btr_pcur_get_rec(pcur)) + == page_id.page_no()); + ut_ad(ibuf_rec_get_space(mtr, btr_pcur_get_rec(pcur)) + == page_id.space()); + + switch (btr_cur_optimistic_delete(btr_pcur_get_btr_cur(pcur), + BTR_CREATE_FLAG, mtr)) { + case DB_FAIL: + break; + case DB_SUCCESS: + if (page_is_empty(btr_pcur_get_page(pcur))) { + /* If a B-tree page is empty, it must be the root page + and the whole B-tree must be empty. InnoDB does not + allow empty B-tree pages other than the root. */ + ut_d(const page_t* root = btr_pcur_get_page(pcur)); + + ut_ad(page_get_space_id(root) == IBUF_SPACE_ID); + ut_ad(page_get_page_no(root) + == FSP_IBUF_TREE_ROOT_PAGE_NO); + + /* ibuf.empty is protected by the root page latch. 
+ Before the deletion, it had to be FALSE. */ + ut_ad(!ibuf.empty); + ibuf.empty = true; + } + /* fall through */ + default: + return(FALSE); + } + + /* We have to resort to a pessimistic delete from ibuf. + Delete-mark the record so that it will not be applied again, + in case the server crashes before the pessimistic delete is + made persistent. */ + btr_rec_set_deleted(btr_pcur_get_block(pcur), + btr_pcur_get_rec(pcur), mtr); + + btr_pcur_store_position(pcur, mtr); + ibuf_btr_pcur_commit_specify_mtr(pcur, mtr); + + ibuf_mtr_start(mtr); + mysql_mutex_lock(&ibuf_mutex); + mtr_x_lock_index(ibuf.index, mtr); + + if (!ibuf_restore_pos(page_id, search_tuple, + BTR_PURGE_TREE_ALREADY_LATCHED, pcur, mtr)) { + mysql_mutex_unlock(&ibuf_mutex); + goto func_exit; + } + + if (buf_block_t* ibuf_root = ibuf_tree_root_get(mtr)) { + btr_cur_pessimistic_delete(&err, TRUE, + btr_pcur_get_btr_cur(pcur), + BTR_CREATE_FLAG, false, mtr); + ut_a(err == DB_SUCCESS); + + ibuf_size_update(ibuf_root->page.frame); + ibuf.empty = page_is_empty(ibuf_root->page.frame); + } + + mysql_mutex_unlock(&ibuf_mutex); + ibuf_btr_pcur_commit_specify_mtr(pcur, mtr); + +func_exit: + ut_ad(mtr->has_committed()); + btr_pcur_close(pcur); + + return(TRUE); +} + +/** Check whether buffered changes exist for a page. +@param[in] id page identifier +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@return whether buffered changes exist */ +bool ibuf_page_exists(const page_id_t id, ulint zip_size) +{ + ut_ad(!fsp_is_system_temporary(id.space())); + + const ulint physical_size = zip_size ? zip_size : srv_page_size; + + if (ibuf_fixed_addr_page(id, physical_size) + || fsp_descr_page(id, physical_size)) { + return false; + } + + mtr_t mtr; + bool bitmap_bits = false; + + ibuf_mtr_start(&mtr); + if (const buf_block_t* bitmap_page = ibuf_bitmap_get_map_page( + id, zip_size, &mtr)) { + bitmap_bits = ibuf_bitmap_page_get_bits( + bitmap_page->page.frame, id, zip_size, + IBUF_BITMAP_BUFFERED, &mtr) != 0; + } + ibuf_mtr_commit(&mtr); + return bitmap_bits; +} + +/** Reset the bits in the bitmap page for the given block and page id. +@param b X-latched secondary index page (nullptr to discard changes) +@param page_id page identifier +@param zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param mtr mini-transaction */ +static void ibuf_reset_bitmap(buf_block_t *b, page_id_t page_id, + ulint zip_size, mtr_t *mtr) +{ + buf_block_t *bitmap= ibuf_bitmap_get_map_page(page_id, zip_size, mtr); + if (!bitmap) + return; + + const ulint physical_size = zip_size ? zip_size : srv_page_size; + /* FIXME: update the bitmap byte only once! */ + ibuf_bitmap_page_set_bits(bitmap, page_id, + physical_size, false, mtr); + + if (b) + ibuf_bitmap_page_set_bits(bitmap, page_id, physical_size, + ibuf_index_page_calc_free(b), + mtr); +} + +/** When an index page is read from a disk to the buffer pool, this function +applies any buffered operations to the page and deletes the entries from the +insert buffer. If the page is not read, but created in the buffer pool, this +function deletes its buffered entries from the insert buffer; there can +exist entries for such a page if the page belonged to an index which +subsequently was dropped. 
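+A typical invocation is from the page read completion path, roughly:
+
+  dberr_t err = ibuf_merge_or_delete_for_page(block, block->page.id(),
+                                              block->zip_size());
+
+(sketch only; the real caller also decides between merging and
+discarding by passing the block or a null pointer).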
+@param block X-latched page to try to apply changes to, or NULL to discard +@param page_id page identifier +@param zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@return error code */ +dberr_t ibuf_merge_or_delete_for_page(buf_block_t *block, + const page_id_t page_id, + ulint zip_size) +{ + if (trx_sys_hdr_page(page_id)) { + return DB_SUCCESS; + } + + ut_ad(!block || page_id == block->page.id()); + ut_ad(!block || block->page.frame); + ut_ad(!block || !block->page.is_ibuf_exist()); + ut_ad(!block || !block->page.is_reinit()); + ut_ad(!trx_sys_hdr_page(page_id)); + ut_ad(page_id < page_id_t(SRV_SPACE_ID_UPPER_BOUND, 0)); + + const ulint physical_size = zip_size ? zip_size : srv_page_size; + + if (ibuf_fixed_addr_page(page_id, physical_size) + || fsp_descr_page(page_id, physical_size)) { + return DB_SUCCESS; + } + + btr_pcur_t pcur; +#ifdef UNIV_IBUF_DEBUG + ulint volume = 0; +#endif /* UNIV_IBUF_DEBUG */ + dberr_t err = DB_SUCCESS; + mtr_t mtr; + + fil_space_t* space = fil_space_t::get(page_id.space()); + + if (UNIV_UNLIKELY(!space)) { + block = nullptr; + } else { + ulint bitmap_bits = 0; + + ibuf_mtr_start(&mtr); + + buf_block_t* bitmap_page = ibuf_bitmap_get_map_page( + page_id, zip_size, &mtr); + + if (bitmap_page + && fil_page_get_type(bitmap_page->page.frame) + != FIL_PAGE_TYPE_ALLOCATED) { + bitmap_bits = ibuf_bitmap_page_get_bits( + bitmap_page->page.frame, page_id, zip_size, + IBUF_BITMAP_BUFFERED, &mtr); + } + + ibuf_mtr_commit(&mtr); + + if (!bitmap_bits) { + done: + /* No changes are buffered for this page. */ + space->release(); + return DB_SUCCESS; + } + + if (!block + || DB_SUCCESS + == fseg_page_is_allocated(space, page_id.page_no())) { + ibuf_mtr_start(&mtr); + mtr.set_named_space(space); + ibuf_reset_bitmap(block, page_id, zip_size, &mtr); + ibuf_mtr_commit(&mtr); + if (!block + || btr_page_get_index_id(block->page.frame) + != DICT_IBUF_ID_MIN + IBUF_SPACE_ID) { + ibuf_delete_recs(page_id); + } + goto done; + } + } + + if (!block) { + } else if (!fil_page_index_page_check(block->page.frame) + || !page_is_leaf(block->page.frame)) { + space->set_corrupted(); + err = DB_CORRUPTION; + block = nullptr; + } else { + /* Move the ownership of the x-latch on the page to this OS + thread, so that we can acquire a second x-latch on it. This + is needed for the insert operations to the index page to pass + the debug checks. */ + + block->page.lock.claim_ownership(); + } + + mem_heap_t* heap = mem_heap_create(512); + + const dtuple_t* search_tuple = ibuf_search_tuple_build( + page_id.space(), page_id.page_no(), heap); + + /* Counts for merged & discarded operations. 
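+	   Both arrays are indexed by ibuf_op_t, e.g. mops[IBUF_OP_INSERT]
+	   counts buffered inserts that were applied to the page, while
+	   dops[IBUF_OP_INSERT] counts those discarded without applying
+	   (no target page, or the entry was already delete-marked); both
+	   are folded into ibuf.n_merged_ops and ibuf.n_discarded_ops by
+	   ibuf_add_ops() at the end of this function.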
*/ + ulint mops[IBUF_OP_COUNT]; + ulint dops[IBUF_OP_COUNT]; + + memset(mops, 0, sizeof(mops)); + memset(dops, 0, sizeof(dops)); + pcur.btr_cur.page_cur.index = ibuf.index; + +loop: + ibuf_mtr_start(&mtr); + + /* Position pcur in the insert buffer at the first entry for this + index page */ + if (btr_pcur_open_on_user_rec(search_tuple, + BTR_MODIFY_LEAF, &pcur, &mtr) + != DB_SUCCESS) { + err = DB_CORRUPTION; + goto reset_bit; + } + + if (block) { + block->page.fix(); + block->page.lock.x_lock_recursive(); + mtr.memo_push(block, MTR_MEMO_PAGE_X_FIX); + } + + if (space) { + mtr.set_named_space(space); + } + + if (!btr_pcur_is_on_user_rec(&pcur)) { + ut_ad(btr_pcur_is_after_last_on_page(&pcur)); + goto reset_bit; + } + + for (;;) { + rec_t* rec; + + ut_ad(btr_pcur_is_on_user_rec(&pcur)); + + rec = btr_pcur_get_rec(&pcur); + + /* Check if the entry is for this index page */ + if (ibuf_rec_get_page_no(&mtr, rec) != page_id.page_no() + || ibuf_rec_get_space(&mtr, rec) != page_id.space()) { + + if (block != NULL) { + page_header_reset_last_insert(block, &mtr); + } + + goto reset_bit; + } + + if (err) { + fputs("InnoDB: Discarding record\n ", stderr); + rec_print_old(stderr, rec); + fputs("\nInnoDB: from the insert buffer!\n\n", stderr); + } else if (block != NULL && !rec_get_deleted_flag(rec, 0)) { + /* Now we have at pcur a record which should be + applied on the index page; NOTE that the call below + copies pointers to fields in rec, and we must + keep the latch to the rec page until the + insertion is finished! */ + dtuple_t* entry; + trx_id_t max_trx_id; + dict_index_t* dummy_index; + ibuf_op_t op = ibuf_rec_get_op_type(&mtr, rec); + + max_trx_id = page_get_max_trx_id(page_align(rec)); + page_update_max_trx_id(block, + buf_block_get_page_zip(block), + max_trx_id, &mtr); + + ut_ad(page_validate(page_align(rec), ibuf.index)); + + entry = ibuf_build_entry_from_ibuf_rec( + &mtr, rec, heap, &dummy_index); + ut_ad(!dummy_index->table->space); + dummy_index->table->space = space; + dummy_index->table->space_id = space->id; + + ut_ad(page_validate(block->page.frame, dummy_index)); + + switch (op) { + case IBUF_OP_INSERT: +#ifdef UNIV_IBUF_DEBUG + volume += rec_get_converted_size( + dummy_index, entry, 0); + + volume += page_dir_calc_reserved_space(1); + + ut_a(volume <= (4U << srv_page_size_shift) + / IBUF_PAGE_SIZE_PER_FREE_SPACE); +#endif + ibuf_insert_to_index_page( + entry, block, dummy_index, &mtr); + break; + + case IBUF_OP_DELETE_MARK: + ibuf_set_del_mark( + entry, block, dummy_index, &mtr); + break; + + case IBUF_OP_DELETE: + ibuf_delete(entry, block, dummy_index, &mtr); + /* Because ibuf_delete() will latch an + insert buffer bitmap page, commit mtr + before latching any further pages. + Store and restore the cursor position. */ + ut_ad(rec == btr_pcur_get_rec(&pcur)); + ut_ad(page_rec_is_user_rec(rec)); + ut_ad(ibuf_rec_get_page_no(&mtr, rec) + == page_id.page_no()); + ut_ad(ibuf_rec_get_space(&mtr, rec) + == page_id.space()); + + /* Mark the change buffer record processed, + so that it will not be merged again in case + the server crashes between the following + mtr_commit() and the subsequent mtr_commit() + of deleting the change buffer record. 
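+	   The delete-mark is redo-logged by this mini-transaction, so
+	   after crash recovery the merge loop will see the mark and
+	   take the discard branch instead of applying the entry twice:
+	   the test
+
+	     else if (block != NULL && !rec_get_deleted_flag(rec, 0))
+
+	   above fails for the marked record, which is then only counted
+	   in dops[] and removed.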
*/
+ btr_rec_set_deleted<true>(
+ btr_pcur_get_block(&pcur),
+ btr_pcur_get_rec(&pcur), &mtr);
+
+ btr_pcur_store_position(&pcur, &mtr);
+ ibuf_btr_pcur_commit_specify_mtr(&pcur, &mtr);
+
+ ibuf_mtr_start(&mtr);
+ mtr.set_named_space(space);
+
+ block->page.lock.x_lock_recursive();
+ block->fix();
+ mtr.memo_push(block, MTR_MEMO_PAGE_X_FIX);
+
+ if (!ibuf_restore_pos(page_id, search_tuple,
+ BTR_MODIFY_LEAF,
+ &pcur, &mtr)) {
+
+ ut_ad(mtr.has_committed());
+ mops[op]++;
+ ibuf_dummy_index_free(dummy_index);
+ goto loop;
+ }
+
+ break;
+ default:
+ ut_error;
+ }
+
+ mops[op]++;
+
+ ibuf_dummy_index_free(dummy_index);
+ } else {
+ dops[ibuf_rec_get_op_type(&mtr, rec)]++;
+ }
+
+ /* Delete the record from ibuf */
+ if (ibuf_delete_rec(page_id, &pcur, search_tuple, &mtr)) {
+ /* Deletion was pessimistic and mtr was committed:
+ we start from the beginning again */
+
+ ut_ad(mtr.has_committed());
+ goto loop;
+ } else if (btr_pcur_is_after_last_on_page(&pcur)) {
+ ibuf_mtr_commit(&mtr);
+ goto loop;
+ }
+ }
+
+reset_bit:
+ if (space) {
+ ibuf_reset_bitmap(block, page_id, zip_size, &mtr);
+ }
+
+ ibuf_mtr_commit(&mtr);
+ ut_free(pcur.old_rec_buf);
+
+ if (space) {
+ space->release();
+ }
+
+ mem_heap_free(heap);
+
+ ibuf.n_merges++;
+ ibuf_add_ops(ibuf.n_merged_ops, mops);
+ ibuf_add_ops(ibuf.n_discarded_ops, dops);
+
+ return err;
+}
+
+/** Delete all change buffer entries for a tablespace,
+in DISCARD TABLESPACE, IMPORT TABLESPACE, or read-ahead.
+@param[in] space missing or to-be-discarded tablespace */
+void ibuf_delete_for_discarded_space(uint32_t space)
+{
+ if (UNIV_UNLIKELY(!ibuf.index)) return;
+
+ btr_pcur_t pcur;
+ const rec_t* ibuf_rec;
+ mtr_t mtr;
+
+ /* Counts for discarded operations. */
+ ulint dops[IBUF_OP_COUNT];
+
+ dfield_t dfield[IBUF_REC_FIELD_METADATA];
+ dtuple_t search_tuple {0,IBUF_REC_FIELD_METADATA,
+ IBUF_REC_FIELD_METADATA,dfield,0
+ ,nullptr
+#ifdef UNIV_DEBUG
+ ,DATA_TUPLE_MAGIC_N
+#endif /* UNIV_DEBUG */
+ };
+ byte space_id[4];
+ mach_write_to_4(space_id, space);
+
+ dfield_set_data(&dfield[0], space_id, 4);
+ dfield_set_data(&dfield[1], field_ref_zero, 1);
+ dfield_set_data(&dfield[2], field_ref_zero, 4);
+ dtuple_set_types_binary(&search_tuple, IBUF_REC_FIELD_METADATA);
+ /* Use page number 0 to build the search tuple so that we get the
+ cursor positioned at the first entry for this space id */
+
+ memset(dops, 0, sizeof(dops));
+ pcur.btr_cur.page_cur.index = ibuf.index;
+
+loop:
+ log_free_check();
+ ibuf_mtr_start(&mtr);
+
+ /* Position pcur in the insert buffer at the first entry for the
+ space */
+ if (btr_pcur_open_on_user_rec(&search_tuple,
+ BTR_MODIFY_LEAF, &pcur, &mtr)
+ != DB_SUCCESS) {
+ goto leave_loop;
+ }
+
+ if (!btr_pcur_is_on_user_rec(&pcur)) {
+ ut_ad(btr_pcur_is_after_last_on_page(&pcur));
+ goto leave_loop;
+ }
+
+ for (;;) {
+ ut_ad(btr_pcur_is_on_user_rec(&pcur));
+
+ ibuf_rec = btr_pcur_get_rec(&pcur);
+
+ /* Check if the entry is for this space */
+ if (ibuf_rec_get_space(&mtr, ibuf_rec) != space) {
+
+ goto leave_loop;
+ }
+
+ uint32_t page_no = ibuf_rec_get_page_no(&mtr, ibuf_rec);
+
+ dops[ibuf_rec_get_op_type(&mtr, ibuf_rec)]++;
+
+ /* Delete the record from ibuf */
+ if (ibuf_delete_rec(page_id_t(space, page_no),
+ &pcur, &search_tuple, &mtr)) {
+ /* Deletion was pessimistic and mtr was committed:
+ we start from the beginning again */
+
+ ut_ad(mtr.has_committed());
+clear:
+ ut_free(pcur.old_rec_buf);
+ goto loop;
+ }
+
+ if (btr_pcur_is_after_last_on_page(&pcur)) {
+ ibuf_mtr_commit(&mtr);
+ goto clear;
+ }
+ }
+
+leave_loop:
+ ibuf_mtr_commit(&mtr);
+ ut_free(pcur.old_rec_buf);
+
+ ibuf_add_ops(ibuf.n_discarded_ops, dops);
+}
+
+/******************************************************************//**
+Checks whether the insert buffer is empty.
+@return true if empty */
+bool
+ibuf_is_empty(void)
+/*===============*/
+{
+ mtr_t mtr;
+
+ ibuf_mtr_start(&mtr);
+
+ ut_d(mysql_mutex_lock(&ibuf_mutex));
+ const buf_block_t* root = ibuf_tree_root_get(&mtr);
+ bool is_empty = root && page_is_empty(root->page.frame);
+ ut_ad(!root || is_empty == ibuf.empty);
+ ut_d(mysql_mutex_unlock(&ibuf_mutex));
+ ibuf_mtr_commit(&mtr);
+
+ return(is_empty);
+}
+
+/******************************************************************//**
+Prints info of ibuf. */
+void
+ibuf_print(
+/*=======*/
+ FILE* file) /*!< in: file where to print */
+{
+ if (UNIV_UNLIKELY(!ibuf.index)) return;
+
+ mysql_mutex_lock(&ibuf_mutex);
+ if (ibuf.empty)
+ {
+ mysql_mutex_unlock(&ibuf_mutex);
+ return;
+ }
+
+ const ulint size= ibuf.size;
+ const ulint free_list_len= ibuf.free_list_len;
+ const ulint seg_size= ibuf.seg_size;
+ mysql_mutex_unlock(&ibuf_mutex);
+
+ fprintf(file,
+ "-------------\n"
+ "INSERT BUFFER\n"
+ "-------------\n"
+ "size " ULINTPF ", free list len " ULINTPF ","
+ " seg size " ULINTPF ", " ULINTPF " merges\n",
+ size, free_list_len, seg_size, ulint{ibuf.n_merges});
+ ibuf_print_ops("merged operations:\n", ibuf.n_merged_ops, file);
+ ibuf_print_ops("discarded operations:\n", ibuf.n_discarded_ops, file);
+}
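+
+/* A minimal usage sketch, illustrative only and not part of the
+upstream sources; the two functions above are typically consumed
+together by the server's monitor output, roughly as follows:
+
+  if (!ibuf_is_empty()) {
+    // Dump size, free list length, segment size,
+    // and the merge/discard counters.
+    ibuf_print(stderr);
+  }
+*/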
+
+/** Check the insert buffer bitmaps on IMPORT TABLESPACE.
+@param[in] trx transaction
+@param[in,out] space tablespace being imported
+@return DB_SUCCESS or error code */
+dberr_t ibuf_check_bitmap_on_import(const trx_t* trx, fil_space_t* space)
+{
+ ut_ad(trx->mysql_thd);
+ ut_ad(space->purpose == FIL_TYPE_IMPORT);
+
+ const unsigned zip_size = space->zip_size();
+ const unsigned physical_size = space->physical_size();
+
+ uint32_t size= std::min(space->free_limit, space->size);
+
+ if (size == 0) {
+ return(DB_TABLE_NOT_FOUND);
+ }
+
+ mtr_t mtr;
+
+ /* The two bitmap pages (allocation bitmap and ibuf bitmap) repeat
+ every page_size pages. For example if page_size is 16 KiB, then the
+ two bitmap pages repeat every 16 KiB * 16384 = 256 MiB. In the loop
+ below page_no is measured in number of pages since the beginning of
+ the space, as usual. */
+
+ for (uint32_t page_no = 0; page_no < size; page_no += physical_size) {
+ if (trx_is_interrupted(trx)) {
+ return(DB_INTERRUPTED);
+ }
+
+ mtr_start(&mtr);
+
+ buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(
+ page_id_t(space->id, page_no), zip_size, &mtr);
+ if (!bitmap_page) {
+ mtr.commit();
+ return DB_CORRUPTION;
+ }
+
+ if (buf_is_zeroes(span<const byte>(bitmap_page->page.frame,
+ physical_size))) {
+ /* This means that we got an all-zero page instead
+ of an ibuf bitmap page. The subsequent pages should
+ be all-zero too. */
+#ifdef UNIV_DEBUG
+ for (uint32_t curr_page = page_no + 1;
+ curr_page < physical_size; curr_page++) {
+
+ buf_block_t* block = buf_page_get(
+ page_id_t(space->id, curr_page),
+ zip_size, RW_S_LATCH, &mtr);
+ page_t* page = buf_block_get_frame(block);
+ ut_ad(buf_is_zeroes(span<const byte>(
+ page,
+ physical_size)));
+ }
+#endif /* UNIV_DEBUG */
+ mtr_commit(&mtr);
+ continue;
+ }
+
+ for (uint32_t i = FSP_IBUF_BITMAP_OFFSET + 1; i < physical_size;
+ i++) {
+ const uint32_t offset = page_no + i;
+ const page_id_t cur_page_id(space->id, offset);
+
+ if (ibuf_bitmap_page_get_bits(
+ bitmap_page->page.frame,
+ cur_page_id, zip_size,
+ IBUF_BITMAP_IBUF, &mtr)) {
+
+ mtr_commit(&mtr);
+
+ ib_errf(trx->mysql_thd,
+ IB_LOG_LEVEL_ERROR,
+ ER_INNODB_INDEX_CORRUPT,
+ "File %s page %u"
+ " is wrongly flagged to belong to the"
+ " insert buffer",
+ space->chain.start->name, offset);
+ return(DB_CORRUPTION);
+ }
+
+ if (ibuf_bitmap_page_get_bits(
+ bitmap_page->page.frame,
+ cur_page_id, zip_size,
+ IBUF_BITMAP_BUFFERED, &mtr)) {
+
+ ib_errf(trx->mysql_thd,
+ IB_LOG_LEVEL_WARN,
+ ER_INNODB_INDEX_CORRUPT,
+ "Buffered changes"
+ " for file %s page %u are lost",
+ space->chain.start->name, offset);
+
+ /* Tolerate this error, so that
+ slightly corrupted tables can be
+ imported and dumped. Clear the bit. */
+ ibuf_bitmap_page_set_bits<IBUF_BITMAP_BUFFERED>(
+ bitmap_page, cur_page_id,
+ physical_size, false, &mtr);
+ }
+ }
+
+ mtr_commit(&mtr);
+ }
+
+ return(DB_SUCCESS);
+}
+
+void ibuf_set_bitmap_for_bulk_load(buf_block_t *block, mtr_t *mtr, bool reset)
+{
+ ut_a(page_is_leaf(block->page.frame));
+ const page_id_t id{block->page.id()};
+ const auto zip_size= block->zip_size();
+
+ if (buf_block_t *bitmap_page= ibuf_bitmap_get_map_page(id, zip_size, mtr))
+ {
+ if (ibuf_bitmap_page_get_bits(bitmap_page->page.frame, id, zip_size,
+ IBUF_BITMAP_BUFFERED, mtr))
+ ibuf_delete_recs(id);
+
+ ulint free_val= reset ? 0 : ibuf_index_page_calc_free(block);
+ /* FIXME: update the bitmap byte only once! */
+ ibuf_bitmap_page_set_bits<IBUF_BITMAP_FREE>
+ (bitmap_page, id, block->physical_size(), free_val, mtr);
+ ibuf_bitmap_page_set_bits<IBUF_BITMAP_BUFFERED>
+ (bitmap_page, id, block->physical_size(), false, mtr);
+ }
+}
diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h
new file mode 100644
index 00000000..5a0401fa
--- /dev/null
+++ b/storage/innobase/include/btr0btr.h
@@ -0,0 +1,543 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2014, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/btr0btr.h +The B-tree + +Created 6/2/1994 Heikki Tuuri +*******************************************************/ + +#pragma once + +#include "dict0dict.h" +#include "data0data.h" +#include "rem0types.h" +#include "page0cur.h" +#include "btr0types.h" +#include "gis0type.h" + +#define BTR_MAX_NODE_LEVEL 50 /*!< Maximum B-tree page level + (not really a hard limit). + Used in debug assertions + in btr_page_set_level and + btr_page_get_level */ + +/** Maximum record size which can be stored on a page, without using the +special big record storage structure */ +#define BTR_PAGE_MAX_REC_SIZE (srv_page_size / 2 - 200) + +/** @brief Maximum depth of a B-tree in InnoDB. + +Note that this isn't a maximum as such; none of the tree operations +avoid producing trees bigger than this. It is instead a "max depth +that other code must work with", useful for e.g. fixed-size arrays +that must store some information about each level in a tree. In other +words: if a B-tree with bigger depth than this is encountered, it is +not acceptable for it to lead to mysterious memory corruption, but it +is acceptable for the program to die with a clear assert failure. */ +#define BTR_MAX_LEVELS 100 + +#define BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode) \ + btr_latch_mode((latch_mode) & ~(BTR_INSERT \ + | BTR_DELETE_MARK \ + | BTR_RTREE_UNDO_INS \ + | BTR_RTREE_DELETE_MARK \ + | BTR_DELETE \ + | BTR_IGNORE_SEC_UNIQUE \ + | BTR_ALREADY_S_LATCHED \ + | BTR_LATCH_FOR_INSERT \ + | BTR_LATCH_FOR_DELETE)) + +#define BTR_LATCH_MODE_WITHOUT_INTENTION(latch_mode) \ + btr_latch_mode((latch_mode) \ + & ~(BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE)) + +/**************************************************************//** +Checks and adjusts the root node of a tree during IMPORT TABLESPACE. +@return error code, or DB_SUCCESS */ +dberr_t +btr_root_adjust_on_import( +/*======================*/ + const dict_index_t* index) /*!< in: index tree */ + MY_ATTRIBUTE((warn_unused_result)); + +/** Report a decryption failure. */ +ATTRIBUTE_COLD void btr_decryption_failed(const dict_index_t &index); + +/** Get an index page and declare its latching order level. +@param[in] index index tree +@param[in] page page number +@param[in] mode latch mode +@param[in] merge whether change buffer merge should be attempted +@param[in,out] mtr mini-transaction +@param[out] err error code +@return block */ +buf_block_t *btr_block_get(const dict_index_t &index, + uint32_t page, rw_lock_type_t mode, bool merge, + mtr_t *mtr, dberr_t *err= nullptr); + +/**************************************************************//** +Gets the index id field of a page. +@return index id */ +UNIV_INLINE +index_id_t +btr_page_get_index_id( +/*==================*/ + const page_t* page) /*!< in: index page */ + MY_ATTRIBUTE((warn_unused_result)); +/** Read the B-tree or R-tree PAGE_LEVEL. 
+@param page B-tree or R-tree page
+@return number of child page links to reach the leaf level
+@retval 0 for leaf pages */
+inline uint16_t btr_page_get_level(const page_t *page)
+{
+ uint16_t level= mach_read_from_2(my_assume_aligned<2>
+ (PAGE_HEADER + PAGE_LEVEL + page));
+ ut_ad(level <= BTR_MAX_NODE_LEVEL);
+ return level;
+} MY_ATTRIBUTE((warn_unused_result))
+
+/** Read FIL_PAGE_NEXT.
+@param page buffer pool page
+@return next page number */
+inline uint32_t btr_page_get_next(const page_t* page)
+{
+ return mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_NEXT));
+}
+
+/** Read FIL_PAGE_PREV.
+@param page buffer pool page
+@return previous page number */
+inline uint32_t btr_page_get_prev(const page_t* page)
+{
+ return mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_PREV));
+}
+
+/**************************************************************//**
+Gets the child node file address in a node pointer.
+NOTE: the offsets array must contain all offsets for the record since
+we read the last field according to offsets and assume that it contains
+the child page number. In other words offsets must have been retrieved
+with rec_get_offsets(n_fields=ULINT_UNDEFINED).
+@return child node address */
+UNIV_INLINE
+uint32_t
+btr_node_ptr_get_child_page_no(
+/*===========================*/
+ const rec_t* rec, /*!< in: node pointer record */
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Create the root node for a new index tree.
+@param[in] type type of the index
+@param[in,out] space tablespace where created
+@param[in] index_id index id
+@param[in] index index, or NULL to create a system table
+@param[in,out] mtr mini-transaction
+@param[out] err error code
+@return page number of the created root
+@retval FIL_NULL if did not succeed */
+uint32_t
+btr_create(
+ ulint type,
+ fil_space_t* space,
+ index_id_t index_id,
+ dict_index_t* index,
+ mtr_t* mtr,
+ dberr_t* err)
+ MY_ATTRIBUTE((nonnull(2,5,6), warn_unused_result));
+
+/** Free a persistent index tree if it exists.
+@param[in,out] space tablespace
+@param[in] page root page number
+@param[in] index_id PAGE_INDEX_ID contents
+@param[in,out] mtr mini-transaction */
+void btr_free_if_exists(fil_space_t *space, uint32_t page,
+ index_id_t index_id, mtr_t *mtr);
+
+/** Drop a temporary table
+@param table temporary table */
+void btr_drop_temporary_table(const dict_table_t &table);
+
+/** Read the last used AUTO_INCREMENT value from PAGE_ROOT_AUTO_INC.
+@param[in,out] index clustered index
+@return the last used AUTO_INCREMENT value
+@retval 0 on error or if no AUTO_INCREMENT value was used yet */
+ib_uint64_t
+btr_read_autoinc(dict_index_t* index)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Read the last used AUTO_INCREMENT value from PAGE_ROOT_AUTO_INC,
+or fall back to MAX(auto_increment_column).
+@param[in] table table containing an AUTO_INCREMENT column
+@param[in] col_no index of the AUTO_INCREMENT column
+@return the AUTO_INCREMENT value
+@retval 0 on error or if no AUTO_INCREMENT value was used yet */
+ib_uint64_t
+btr_read_autoinc_with_fallback(const dict_table_t* table, unsigned col_no)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
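+
+/* A minimal usage sketch, illustrative only and not part of the
+upstream sources; "index" stands for a clustered index pointer and the
+column number 0 is a placeholder:
+
+  ib_uint64_t autoinc = btr_read_autoinc(index);
+  if (autoinc == 0) {
+    // 0 may simply mean that no value was used yet; fall back to
+    // MAX(auto_increment_column) on the table.
+    autoinc = btr_read_autoinc_with_fallback(index->table, 0);
+  }
+  btr_write_autoinc(index, autoinc + 1);
+*/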
+
+/** Write the next available AUTO_INCREMENT value to PAGE_ROOT_AUTO_INC.
+@param[in,out] index clustered index
+@param[in] autoinc the AUTO_INCREMENT value
+@param[in] reset whether to reset the AUTO_INCREMENT
+ to a possibly smaller value than currently
+ exists in the page */
+void
+btr_write_autoinc(dict_index_t* index, ib_uint64_t autoinc, bool reset = false)
+ MY_ATTRIBUTE((nonnull));
+
+/** Write instant ALTER TABLE metadata to a root page.
+@param[in,out] root clustered index root page
+@param[in] index clustered index with instant ALTER TABLE
+@param[in,out] mtr mini-transaction */
+void btr_set_instant(buf_block_t* root, const dict_index_t& index, mtr_t* mtr);
+
+ATTRIBUTE_COLD __attribute__((nonnull))
+/** Reset the table to the canonical format on ROLLBACK of instant ALTER TABLE.
+@param[in] index clustered index with instant ALTER TABLE
+@param[in] all whether to reset FIL_PAGE_TYPE as well
+@param[in,out] mtr mini-transaction */
+void btr_reset_instant(const dict_index_t &index, bool all, mtr_t *mtr);
+
+/*************************************************************//**
+Makes the tree one level higher by splitting the root, and inserts
+the tuple. It is assumed that mtr contains an x-latch on the tree.
+NOTE that the operation of this function must always succeed; we
+cannot reverse it. Therefore enough free disk space must be
+guaranteed to be available before this function is called.
+@return inserted record */
+rec_t*
+btr_root_raise_and_insert(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: cursor at which to insert: must be
+ on the root page; when the function returns,
+ the cursor is positioned on the predecessor
+ of the inserted record */
+ rec_offs** offsets,/*!< out: offsets on inserted record */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap
+ that can be emptied, or NULL */
+ const dtuple_t* tuple, /*!< in: tuple to insert */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mtr_t* mtr, /*!< in: mtr */
+ dberr_t* err) /*!< out: error code */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*************************************************************//**
+Reorganizes an index page.
+
+IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index. This has to
+be done either within the same mini-transaction, or by invoking
+ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
+IBUF_BITMAP_FREE is unaffected by reorganization.
+
+@param cursor page cursor
+@param mtr mini-transaction
+@return error code
+@retval DB_FAIL if reorganizing a ROW_FORMAT=COMPRESSED page failed */
+dberr_t btr_page_reorganize(page_cur_t *cursor, mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Decide if the page should be split at the convergence point of inserts
+converging to the left.
+@param[in] cursor insert position
+@return the first record to be moved to the right half page
+@retval NULL if no split is recommended */
+rec_t* btr_page_get_split_rec_to_left(const btr_cur_t* cursor);
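+
+/* A minimal sketch, illustrative only and not part of the upstream
+sources, of how an insert path could consult the two split-point
+heuristics; "cursor" is a btr_cur_t* positioned at the insert point:
+
+  rec_t* split_rec = btr_page_get_split_rec_to_left(cursor);
+  if (!split_rec
+      && !btr_page_get_split_rec_to_right(cursor, &split_rec)) {
+    // Neither heuristic applies: split at the middle record instead.
+  }
+*/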
+
+/** Decide if the page should be split at the convergence point of inserts
+converging to the right.
+@param[in] cursor insert position
+@param[out] split_rec if split recommended, the first record
+ on the right half page, or
+ NULL if the to-be-inserted record
+ should be first
+@return whether split is recommended */
+bool
+btr_page_get_split_rec_to_right(const btr_cur_t* cursor, rec_t** split_rec);
+
+/*************************************************************//**
+Splits an index page to halves and inserts the tuple. It is assumed
+that mtr holds an x-latch to the index tree. NOTE: the tree x-latch is
+released within this function! NOTE that the operation of this
+function must always succeed; we cannot reverse it. Therefore enough
+free disk space (2 pages) must be guaranteed to be available before
+this function is called.
+
+@return inserted record */
+rec_t*
+btr_page_split_and_insert(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: cursor at which to insert; when the
+ function returns, the cursor is positioned
+ on the predecessor of the inserted record */
+ rec_offs** offsets,/*!< out: offsets on inserted record */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap
+ that can be emptied, or NULL */
+ const dtuple_t* tuple, /*!< in: tuple to insert */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mtr_t* mtr, /*!< in: mtr */
+ dberr_t* err) /*!< out: error code */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*******************************************************//**
+Inserts a data tuple to a tree on a non-leaf level. It is assumed
+that mtr holds an x-latch on the tree. */
+dberr_t
+btr_insert_on_non_leaf_level(
+ ulint flags, /*!< in: undo logging and locking flags */
+ dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: level, must be > 0 */
+ dtuple_t* tuple, /*!< in: the record to be inserted */
+ mtr_t* mtr) /*!< in: mtr */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Set a child page pointer record as the predefined minimum record.
+@tparam has_prev whether the page is supposed to have a left sibling
+@param[in,out] rec leftmost record on a leftmost non-leaf page
+@param[in,out] block buffer pool block
+@param[in,out] mtr mini-transaction */
+template<bool has_prev= false>
+inline void btr_set_min_rec_mark(rec_t *rec, const buf_block_t &block,
+ mtr_t *mtr)
+{
+ ut_ad(block.page.frame == page_align(rec));
+ ut_ad(!page_is_leaf(block.page.frame));
+ ut_ad(has_prev == page_has_prev(block.page.frame));
+
+ rec-= page_rec_is_comp(rec) ? REC_NEW_INFO_BITS : REC_OLD_INFO_BITS;
+
+ if (block.page.zip.data)
+ /* This flag is computed from other contents on a ROW_FORMAT=COMPRESSED
+ page. We are not modifying the compressed page frame at all. */
+ *rec|= REC_INFO_MIN_REC_FLAG;
+ else
+ mtr->write<1>(block, rec, *rec | REC_INFO_MIN_REC_FLAG);
+}
+
+/** Seek to the parent page of a B-tree page.
+@param[in,out] mtr mini-transaction
+@param[in,out] cursor cursor pointing to the x-latched parent page
+@return whether the cursor was successfully positioned */
+bool btr_page_get_father(mtr_t* mtr, btr_cur_t* cursor)
+ MY_ATTRIBUTE((nonnull,warn_unused_result));
+#ifdef UNIV_DEBUG
+/************************************************************//**
+Checks that the node pointer to a page is appropriate.
+@return TRUE */ +ibool +btr_check_node_ptr( +/*===============*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: index page */ + mtr_t* mtr) /*!< in: mtr */ + MY_ATTRIBUTE((warn_unused_result)); +#endif /* UNIV_DEBUG */ +/*************************************************************//** +Tries to merge the page first to the left immediate brother if such a +brother exists, and the node pointers to the current page and to the +brother reside on the same page. If the left brother does not satisfy these +conditions, looks at the right brother. If the page is the only one on that +level lifts the records of the page to the father page, thus reducing the +tree height. It is assumed that mtr holds an x-latch on the tree and on the +page. If cursor is on the leaf level, mtr must also hold x-latches to +the brothers, if they exist. +@return error code +@retval DB_FAIL if the tree could not be merged */ +dberr_t +btr_compress( +/*=========*/ + btr_cur_t* cursor, /*!< in/out: cursor on the page to merge + or lift; the page must not be empty: + when deleting records, use btr_discard_page() + if the page would become empty */ + bool adjust, /*!< in: whether the cursor position should be + adjusted even when compression occurs */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*************************************************************//** +Discards a page from a B-tree. This is used to remove the last record from +a B-tree page: the whole page must be removed at the same time. This cannot +be used for the root page, which is allowed to be empty. */ +dberr_t +btr_discard_page( +/*=============*/ + btr_cur_t* cursor, /*!< in: cursor on the page to discard: not on + the root page */ + mtr_t* mtr); /*!< in: mtr */ + +/**************************************************************//** +Allocates a new file page to be used in an index tree. NOTE: we assume +that the caller has made the reservation for free extents! +@retval NULL if no page could be allocated */ +buf_block_t* +btr_page_alloc( +/*===========*/ + dict_index_t* index, /*!< in: index tree */ + uint32_t hint_page_no, /*!< in: hint of a good page */ + byte file_direction, /*!< in: direction where a possible + page split is made */ + ulint level, /*!< in: level where the page is placed + in the tree */ + mtr_t* mtr, /*!< in/out: mini-transaction + for the allocation */ + mtr_t* init_mtr, /*!< in/out: mini-transaction + for x-latching and initializing + the page */ + dberr_t* err) /*!< out: error code */ + MY_ATTRIBUTE((warn_unused_result)); +/** Empty an index page (possibly the root page). @see btr_page_create(). +@param[in,out] block page to be emptied +@param[in,out] page_zip compressed page frame, or NULL +@param[in] index index of the page +@param[in] level B-tree level of the page (0=leaf) +@param[in,out] mtr mini-transaction */ +void +btr_page_empty( + buf_block_t* block, + page_zip_des_t* page_zip, + dict_index_t* index, + ulint level, + mtr_t* mtr) + MY_ATTRIBUTE((nonnull(1, 3, 5))); +/**************************************************************//** +Creates a new index page (not the root, and also not +used in page reorganization). @see btr_page_empty(). 
*/
+void
+btr_page_create(
+/*============*/
+ buf_block_t* block, /*!< in/out: page to be created */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: the B-tree level of the page */
+ mtr_t* mtr); /*!< in: mtr */
+
+/** Free an index page.
+@param[in,out] index index tree
+@param[in,out] block block to be freed
+@param[in,out] mtr mini-transaction
+@param[in] blob whether this is freeing a BLOB page
+@param[in] space_latched whether index->table->space->x_lock() was called */
+MY_ATTRIBUTE((nonnull))
+dberr_t btr_page_free(dict_index_t *index, buf_block_t *block, mtr_t *mtr,
+ bool blob= false, bool space_latched= false);
+
+/**************************************************************//**
+Gets the root node of a tree and x- or s-latches it.
+@return root page, x- or s-latched */
+buf_block_t*
+btr_root_block_get(
+/*===============*/
+ dict_index_t* index, /*!< in: index tree */
+ rw_lock_type_t mode, /*!< in: either RW_S_LATCH
+ or RW_X_LATCH */
+ mtr_t* mtr, /*!< in: mtr */
+ dberr_t* err); /*!< out: error code */
+/*************************************************************//**
+Reorganizes an index page.
+
+IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index. This has to
+be done either within the same mini-transaction, or by invoking
+ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
+IBUF_BITMAP_FREE is unaffected by reorganization.
+
+@return error code
+@retval DB_FAIL if reorganizing a ROW_FORMAT=COMPRESSED page failed */
+dberr_t btr_page_reorganize_block(
+ ulint z_level,/*!< in: compression level to be used
+ if dealing with compressed page */
+ buf_block_t* block, /*!< in/out: B-tree page */
+ dict_index_t* index, /*!< in: the index tree of the page */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+ __attribute__((nonnull, warn_unused_result));
+
+#ifdef UNIV_BTR_PRINT
+/*************************************************************//**
+Prints size info of a B-tree. */
+void
+btr_print_size(
+/*===========*/
+ dict_index_t* index) /*!< in: index tree */
+ MY_ATTRIBUTE((nonnull));
+/**************************************************************//**
+Prints directories and other info of all nodes in the index. */
+void
+btr_print_index(
+/*============*/
+ dict_index_t* index, /*!< in: index */
+ ulint width) /*!< in: print this many entries from start
+ and end */
+ MY_ATTRIBUTE((nonnull));
+#endif /* UNIV_BTR_PRINT */
+/************************************************************//**
+Checks the size and number of fields in a record based on the definition of
+the index.
+@return TRUE if ok */
+ibool
+btr_index_rec_validate(
+/*===================*/
+ const rec_t* rec, /*!< in: index record */
+ const dict_index_t* index, /*!< in: index */
+ ibool dump_on_error) /*!< in: TRUE if the function
+ should print hex dump of record
+ and page on error */
+ MY_ATTRIBUTE((warn_unused_result));
+/**************************************************************//**
+Checks the consistency of an index tree.
+@return DB_SUCCESS if ok, error code if not */
+dberr_t
+btr_validate_index(
+/*===============*/
+ dict_index_t* index, /*!< in: index */
+ const trx_t* trx) /*!< in: transaction or 0 */
+ MY_ATTRIBUTE((warn_unused_result));
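+
+/* A minimal usage sketch, illustrative only and not part of the
+upstream sources: a CHECK TABLE style consistency check of one index,
+with no enclosing transaction (trx may be 0 per the declaration above):
+
+  if (btr_validate_index(index, 0) != DB_SUCCESS) {
+    // The tree is inconsistent; the caller would report corruption.
+  }
+*/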
+
+/** Remove a page from the level list of pages.
+@param[in] block page to remove
+@param[in] index index tree
+@param[in,out] mtr mini-transaction */
+dberr_t btr_level_list_remove(const buf_block_t& block,
+ const dict_index_t& index, mtr_t* mtr)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/*************************************************************//**
+If the page is the only one on its level, this function moves its
+records to the father page, thus reducing the tree height.
+@return father block */
+buf_block_t*
+btr_lift_page_up(
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* block, /*!< in: page which is the only one on its
+ level; must not be empty: use
+ btr_discard_only_page_on_level if the last
+ record from the page should be removed */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ dberr_t* err) /*!< out: error code */
+ __attribute__((nonnull));
+
+#define BTR_N_LEAF_PAGES 1
+#define BTR_TOTAL_SIZE 2
+
+#include "btr0btr.inl"
+
+/****************************************************************
+Global variable controlling whether scrubbing should be performed */
+extern my_bool srv_immediate_scrub_data_uncompressed;
diff --git a/storage/innobase/include/btr0btr.inl b/storage/innobase/include/btr0btr.inl
new file mode 100644
index 00000000..9a9e39b6
--- /dev/null
+++ b/storage/innobase/include/btr0btr.inl
@@ -0,0 +1,111 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0btr.ic
+The B-tree
+
+Created 6/2/1994 Heikki Tuuri
+*******************************************************/
+
+#include "mtr0log.h"
+
+/**************************************************************//**
+Gets the index id field of a page.
+@return index id */
+UNIV_INLINE
+index_id_t
+btr_page_get_index_id(
+/*==================*/
+ const page_t* page) /*!< in: index page */
+{
+ return(mach_read_from_8(page + PAGE_HEADER + PAGE_INDEX_ID));
+}
+
+/** Set PAGE_LEVEL.
+@param[in,out] block buffer block
+@param[in] level page level
+@param[in,out] mtr mini-transaction */
+inline
+void btr_page_set_level(buf_block_t *block, ulint level, mtr_t *mtr)
+{
+ ut_ad(level <= BTR_MAX_NODE_LEVEL);
+ constexpr uint16_t field= PAGE_HEADER + PAGE_LEVEL;
+ byte *b= my_assume_aligned<2>(&block->page.frame[field]);
+ if (mtr->write<2,mtr_t::MAYBE_NOP>(*block, b, level) &&
+ UNIV_LIKELY_NULL(block->page.zip.data))
+ memcpy_aligned<2>(&block->page.zip.data[field], b, 2);
+}
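+
+/* The setters in this file all follow the pattern of btr_page_set_level()
+above: the field is written to the uncompressed frame through the
+mini-transaction, and only if the MAYBE_NOP write changed anything and
+the block has a ROW_FORMAT=COMPRESSED copy is the field mirrored into
+page_zip. A minimal sketch, illustrative only and not part of the
+upstream sources, marking a page as a leaf:
+
+  btr_page_set_level(block, 0, mtr);
+*/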
+
+/** Set FIL_PAGE_NEXT.
+@param[in,out] block buffer block
+@param[in] next number of successor page
+@param[in,out] mtr mini-transaction */
+inline void btr_page_set_next(buf_block_t *block, ulint next, mtr_t *mtr)
+{
+ constexpr uint16_t field= FIL_PAGE_NEXT;
+ byte *b= my_assume_aligned<4>(&block->page.frame[field]);
+ if (mtr->write<4,mtr_t::MAYBE_NOP>(*block, b, next) &&
+ UNIV_LIKELY_NULL(block->page.zip.data))
+ memcpy_aligned<4>(&block->page.zip.data[field], b, 4);
+}
+
+/** Set FIL_PAGE_PREV.
+@param[in,out] block buffer block
+@param[in] prev number of predecessor page
+@param[in,out] mtr mini-transaction */
+inline void btr_page_set_prev(buf_block_t *block, ulint prev, mtr_t *mtr)
+{
+ constexpr uint16_t field= FIL_PAGE_PREV;
+ byte *b= my_assume_aligned<4>(&block->page.frame[field]);
+ if (mtr->write<4,mtr_t::MAYBE_NOP>(*block, b, prev) &&
+ UNIV_LIKELY_NULL(block->page.zip.data))
+ memcpy_aligned<4>(&block->page.zip.data[field], b, 4);
+}
+
+/**************************************************************//**
+Gets the child node file address in a node pointer.
+NOTE: the offsets array must contain all offsets for the record since
+we read the last field according to offsets and assume that it contains
+the child page number. In other words offsets must have been retrieved
+with rec_get_offsets(n_fields=ULINT_UNDEFINED).
+@return child node address */
+UNIV_INLINE
+uint32_t
+btr_node_ptr_get_child_page_no(
+/*===========================*/
+ const rec_t* rec, /*!< in: node pointer record */
+ const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ const byte* field;
+ ulint len;
+
+ ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec));
+
+ /* The child address is in the last field */
+ field = rec_get_nth_field(rec, offsets,
+ rec_offs_n_fields(offsets) - 1, &len);
+
+ ut_ad(len == 4);
+
+ uint32_t page_no = mach_read_from_4(field);
+ ut_ad(page_no > 1);
+
+ return(page_no);
+}
diff --git a/storage/innobase/include/btr0bulk.h b/storage/innobase/include/btr0bulk.h
new file mode 100644
index 00000000..9fcea86d
--- /dev/null
+++ b/storage/innobase/include/btr0bulk.h
@@ -0,0 +1,371 @@
+/*****************************************************************************
+
+Copyright (c) 2014, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/btr0bulk.h
+The B-tree bulk load
+
+Created 03/11/2014 Shaohua Wang
+*************************************************************************/
+
+#ifndef btr0bulk_h
+#define btr0bulk_h
+
+#include "dict0dict.h"
+#include "rem0types.h"
+#include "page0cur.h"
+
+#include <vector>
+
+/** Innodb B-tree index fill factor for bulk load. */
+extern uint innobase_fill_factor;
+
+/*
+The proper function call sequence of PageBulk is as follows:
+-- PageBulk::init
+-- PageBulk::insert
+-- PageBulk::finish
+-- PageBulk::compress(COMPRESSED table only)
+-- PageBulk::pageSplit(COMPRESSED table only)
+-- PageBulk::commit
+*/
+
+class PageBulk
+{
+public:
+ /** Constructor
+ @param[in] index B-tree index
+ @param[in] trx_id transaction id
+ @param[in] page_no page number
+ @param[in] level page level */
+ PageBulk(
+ dict_index_t* index,
+ trx_id_t trx_id,
+ uint32_t page_no,
+ ulint level)
+ :
+ m_heap(NULL),
+ m_index(index),
+ m_mtr(),
+ m_trx_id(trx_id),
+ m_block(NULL),
+ m_page(NULL),
+ m_page_zip(NULL),
+ m_cur_rec(NULL),
+ m_page_no(page_no),
+ m_level(level),
+ m_is_comp(dict_table_is_comp(index->table)),
+ m_heap_top(NULL),
+ m_rec_no(0),
+ m_free_space(0),
+ m_reserved_space(0),
+#ifdef UNIV_DEBUG
+ m_total_data(0),
+#endif /* UNIV_DEBUG */
+ m_modify_clock(0),
+ m_err(DB_SUCCESS)
+ {
+ ut_ad(!dict_index_is_spatial(m_index));
+ ut_ad(!m_index->table->is_temporary());
+ }
+
+ /** Destructor */
+ ~PageBulk()
+ {
+ mem_heap_free(m_heap);
+ }
+
+ /** Initialize members, allocate the page if needed, and start mtr.
+ Note: must be called once and only once, right after the constructor.
+ @return error code */
+ dberr_t init();
+
+ /** Insert a record in the page.
+ @param[in] rec record
+ @param[in] offsets record offsets */
+ inline void insert(const rec_t* rec, rec_offs* offsets);
+private:
+ /** Page format */
+ enum format { REDUNDANT, DYNAMIC, COMPRESSED };
+ /** Mark end of insertion to the page. Scan all records to set page
+ dirs, and set page header members.
+ @tparam fmt the page format */
+ template<format fmt> inline void finishPage();
+ /** Insert a record in the page.
+ @tparam fmt the page format
+ @param[in,out] rec record
+ @param[in] offsets record offsets */
+ template<format fmt> inline void insertPage(rec_t* rec, rec_offs* offsets);
+
+public:
+ /** Mark end of insertion to the page. Scan all records to set page
+ dirs, and set page header members. */
+ inline void finish();
+
+ /** @return whether finish() actually needs to do something */
+ inline bool needs_finish() const;
+
+ /** Commit mtr for a page
+ @param[in] success whether all inserts succeeded */
+ void commit(bool success);
+
+ /** Compress if it is a compressed table
+ @return true if compression succeeded or was not needed
+ @return false if compression failed */
+ bool compress();
+
+ /** Check whether the record needs to be stored externally.
+ @return true if the record needs external storage
+ @return false otherwise */
+ bool needExt(const dtuple_t* tuple, ulint rec_size);
+
+ /** Store external record
+ @param[in] big_rec external record
+ @param[in] offsets record offsets
+ @return error code */
+ dberr_t storeExt(const big_rec_t* big_rec, rec_offs* offsets);
+
+ /** Get node pointer
+ @return node pointer */
+ dtuple_t* getNodePtr();
+
+ /** Get split rec in the page. We split a page in half when compression
+ fails, and the split rec should be copied to the new page.
+ @return split rec */
+ rec_t* getSplitRec();
+
+ /** Copy all records after split rec including itself.
+ @param[in] rec split rec */
+ void copyIn(rec_t* split_rec);
+
+ /** Remove all records after split rec including itself.
+ @param[in] rec split rec */
+ void copyOut(rec_t* split_rec);
+
+ /** Set next page
+ @param[in] next_page_no next page no */
+ inline void setNext(ulint next_page_no);
+
+ /** Set previous page
+ @param[in] prev_page_no previous page no */
+ inline void setPrev(ulint prev_page_no);
+
+ /** Release block by committing mtr */
+ inline void release();
+
+ /** Start mtr and latch block */
+ inline void latch();
+
+ /** Check if required space is available in the page for the rec
+ to be inserted. We check fill factor & padding here.
+ @param[in] rec_size required length
+ @return true if space is available */
+ inline bool isSpaceAvailable(ulint rec_size);
+
+ /** Get page no */
+ uint32_t getPageNo() const { return m_page_no; }
+
+ /** Get page level */
+ ulint getLevel()
+ {
+ return(m_level);
+ }
+
+ /** Get record no */
+ ulint getRecNo()
+ {
+ return(m_rec_no);
+ }
+
+ /** Get page */
+ page_t* getPage()
+ {
+ return(m_page);
+ }
+
+ /** Get page zip */
+ page_zip_des_t* getPageZip()
+ {
+ return(m_page_zip);
+ }
+
+ dberr_t getError()
+ {
+ return(m_err);
+ }
+
+ void set_modified() { m_mtr.set_modified(*m_block); }
+
+ /* Memory heap for internal allocation */
+ mem_heap_t* m_heap;
+
+private:
+ /** The index B-tree */
+ dict_index_t* m_index;
+
+ /** The mini-transaction */
+ mtr_t m_mtr;
+
+ /** The transaction id */
+ trx_id_t m_trx_id;
+
+ /** The buffer block */
+ buf_block_t* m_block;
+
+ /** The page */
+ page_t* m_page;
+
+ /** The page zip descriptor */
+ page_zip_des_t* m_page_zip;
+
+ /** The current rec, just before the next insert rec */
+ rec_t* m_cur_rec;
+
+ /** The page no */
+ uint32_t m_page_no;
+
+ /** The page level in B-tree */
+ ulint m_level;
+
+ /** Flag: is page in compact format */
+ const bool m_is_comp;
+
+ /** The heap top in page for next insert */
+ byte* m_heap_top;
+
+ /** User record no */
+ ulint m_rec_no;
+
+ /** The free space left in the page */
+ ulint m_free_space;
+
+ /** The reserved space for fill factor */
+ ulint m_reserved_space;
+
+ /** The padding space for compressed page */
+ ulint m_padding_space;
+
+#ifdef UNIV_DEBUG
+ /** Total data in the page */
+ ulint m_total_data;
+#endif /* UNIV_DEBUG */
+
+ /** The modify clock value of the buffer block
+ when the block is re-pinned */
+ ib_uint64_t m_modify_clock;
+
+ /** Operation result DB_SUCCESS or error code */
+ dberr_t m_err;
+};
+
+typedef std::vector<PageBulk*, ut_allocator<PageBulk*> >
+ page_bulk_vector;
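+
+/* A minimal sketch, illustrative only and not part of the upstream
+sources, of the PageBulk call sequence documented near the top of this
+file, for an uncompressed page; "index", "trx_id", "page_no", "rec" and
+"offsets" are placeholders supplied by the caller:
+
+  PageBulk page_bulk(index, trx_id, page_no, 0);
+  dberr_t err = page_bulk.init();
+  if (err == DB_SUCCESS) {
+    page_bulk.insert(rec, offsets);
+    page_bulk.finish();
+    page_bulk.commit(true);  // commits the mini-transaction
+  }
+*/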
+
+class BtrBulk
+{
+public:
+ /** Constructor
+ @param[in] index B-tree index
+ @param[in] trx transaction */
+ BtrBulk(
+ dict_index_t* index,
+ const trx_t* trx)
+ :
+ m_index(index),
+ m_trx(trx)
+ {
+ ut_ad(!dict_index_is_spatial(index));
+ }
+
+ /** Insert a tuple
+ @param[in] tuple tuple to insert
+ @return error code */
+ dberr_t insert(dtuple_t* tuple)
+ {
+ return(insert(tuple, 0));
+ }
+
+ /** Finish the B-tree bulk load. We commit the last page in each level
+ and copy the last page in the top level to the root page of the index
+ if no error occurs.
+ @param[in] err whether bulk load was successful until now
+ @return error code */
+ dberr_t finish(dberr_t err);
+
+ /** Release all latches */
+ void release();
+
+ /** Re-latch all latches */
+ void latch();
+
+ table_name_t table_name() { return m_index->table->name; }
+
+private:
+ /** Insert a tuple to a page in a level
+ @param[in] tuple tuple to insert
+ @param[in] level B-tree level
+ @return error code */
+ dberr_t insert(dtuple_t* tuple, ulint level);
+
+ /** Split a page
+ @param[in] page_bulk page to split
+ @param[in] next_page_bulk next page
+ @return error code */
+ dberr_t pageSplit(PageBulk* page_bulk,
+ PageBulk* next_page_bulk);
+
+ /** Commit (finish) a page. We set the next/prev page no, compress a
+ page of a compressed table and split the page if compression fails,
+ insert a node pointer to the father page if needed, and commit the
+ mini-transaction.
+ @param[in] page_bulk page to commit
+ @param[in] next_page_bulk next page
+ @param[in] insert_father whether a node pointer needs to be
+ inserted into the father page
+ @return error code */
+ dberr_t pageCommit(PageBulk* page_bulk,
+ PageBulk* next_page_bulk,
+ bool insert_father);
+
+ /** Abort a page when an error occurs
+ @param[in] page_bulk page bulk object
+ Note: pageAbort() should be called only for a PageBulk object that is
+ no longer in m_page_bulks after pageCommit(); the objects remaining in
+ m_page_bulks are committed or aborted in finish(). */
+ void pageAbort(PageBulk* page_bulk)
+ {
+ page_bulk->commit(false);
+ }
+
+ /** Log free check */
+ inline void logFreeCheck();
+
+private:
+ /** B-tree index */
+ dict_index_t*const m_index;
+
+ /** Transaction */
+ const trx_t*const m_trx;
+
+ /** Root page level */
+ ulint m_root_level;
+
+ /** Page cursor vector for all levels */
+ page_bulk_vector m_page_bulks;
+};
+
+#endif
diff --git a/storage/innobase/include/btr0cur.h b/storage/innobase/include/btr0cur.h
new file mode 100644
index 00000000..f6abc9f5
--- /dev/null
+++ b/storage/innobase/include/btr0cur.h
@@ -0,0 +1,855 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/btr0cur.h +The index tree cursor + +Created 10/16/1994 Heikki Tuuri +*******************************************************/ + +#ifndef btr0cur_h +#define btr0cur_h + +#include "dict0dict.h" +#include "page0cur.h" +#include "btr0types.h" +#include "rem0types.h" +#include "gis0type.h" +#include "my_base.h" +#ifdef BTR_CUR_HASH_ADAPT +# include "srw_lock.h" +#endif + +/** Mode flags for btr_cur operations; these can be ORed */ +enum { + /** do no undo logging */ + BTR_NO_UNDO_LOG_FLAG = 1, + /** do no record lock checking */ + BTR_NO_LOCKING_FLAG = 2, + /** sys fields will be found in the update vector or inserted + entry */ + BTR_KEEP_SYS_FLAG = 4, + + /** no rollback */ + BTR_NO_ROLLBACK = BTR_NO_UNDO_LOG_FLAG + | BTR_NO_LOCKING_FLAG | BTR_KEEP_SYS_FLAG, + + /** btr_cur_pessimistic_update() must keep cursor position + when moving columns to big_rec */ + BTR_KEEP_POS_FLAG = 8, + /** the caller is creating the index or wants to bypass the + index->info.online creation log */ + BTR_CREATE_FLAG = 16, + /** the caller of btr_cur_optimistic_update() or + btr_cur_update_in_place() will take care of + updating IBUF_BITMAP_FREE */ + BTR_KEEP_IBUF_BITMAP = 32 +}; + +#include "que0types.h" +#include "row0types.h" + +#define btr_cur_get_page_cur(cursor) (&(cursor)->page_cur) +#define btr_cur_get_block(cursor) ((cursor)->page_cur.block) +#define btr_cur_get_rec(cursor) ((cursor)->page_cur.rec) + +/*********************************************************//** +Returns the compressed page on which the tree cursor is positioned. +@return pointer to compressed page, or NULL if the page is not compressed */ +UNIV_INLINE +page_zip_des_t* +btr_cur_get_page_zip( +/*=================*/ + btr_cur_t* cursor);/*!< in: tree cursor */ +/*********************************************************//** +Returns the page of a tree cursor. +@return pointer to page */ +UNIV_INLINE +page_t* +btr_cur_get_page( +/*=============*/ + btr_cur_t* cursor);/*!< in: tree cursor */ +/*********************************************************//** +Returns the index of a cursor. +@param cursor b-tree cursor +@return index */ +#define btr_cur_get_index(cursor) ((cursor)->index()) +/*********************************************************//** +Positions a tree cursor at a given record. */ +UNIV_INLINE +void +btr_cur_position( +/*=============*/ + dict_index_t* index, /*!< in: index */ + rec_t* rec, /*!< in: record in tree */ + buf_block_t* block, /*!< in: buffer block of rec */ + btr_cur_t* cursor);/*!< in: cursor */ + +/** Load the instant ALTER TABLE metadata from the clustered index +when loading a table definition. +@param[in,out] table table definition from the data dictionary +@return error code +@retval DB_SUCCESS if no error occurred */ +dberr_t +btr_cur_instant_init(dict_table_t* table) + ATTRIBUTE_COLD __attribute__((nonnull, warn_unused_result)); + +/** Initialize the n_core_null_bytes on first access to a clustered +index root page. 
+@param[in] index clustered index that is on its first access +@param[in] page clustered index root page +@return whether the page is corrupted */ +bool +btr_cur_instant_root_init(dict_index_t* index, const page_t* page) + ATTRIBUTE_COLD __attribute__((nonnull, warn_unused_result)); + +MY_ATTRIBUTE((warn_unused_result)) +/********************************************************************//** +Searches an index tree and positions a tree cursor on a given non-leaf level. +NOTE: n_fields_cmp in tuple must be set so that it cannot be compared +to node pointer page number fields on the upper levels of the tree! +cursor->up_match and cursor->low_match both will have sensible values. +Cursor is left at the place where an insert of the +search tuple should be performed in the B-tree. InnoDB does an insert +immediately after the cursor. Thus, the cursor may end up on a user record, +or on a page infimum record. +@param level the tree level of search +@param tuple data tuple; NOTE: n_fields_cmp in tuple must be set so that + it cannot get compared to the node ptr page number field! +@param latch RW_S_LATCH or RW_X_LATCH +@param cursor tree cursor; the cursor page is s- or x-latched, but see also + above! +@param mtr mini-transaction +@return DB_SUCCESS on success or error code otherwise */ +dberr_t btr_cur_search_to_nth_level(ulint level, + const dtuple_t *tuple, + rw_lock_type_t rw_latch, + btr_cur_t *cursor, mtr_t *mtr); + +/*************************************************************//** +Tries to perform an insert to a page in an index tree, next to cursor. +It is assumed that mtr holds an x-latch on the page. The operation does +not succeed if there is too little space on the page. If there is just +one record on the page, the insert will always succeed; this is to +prevent trying to split a page with just one record. +@return DB_SUCCESS, DB_LOCK_WAIT, DB_FAIL, or error number */ +dberr_t +btr_cur_optimistic_insert( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags: if not + zero, the parameters index and thr should be + specified */ + btr_cur_t* cursor, /*!< in: cursor on page after which to insert; + cursor stays valid */ + rec_offs** offsets,/*!< out: offsets on *rec */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap */ + dtuple_t* entry, /*!< in/out: entry to insert */ + rec_t** rec, /*!< out: pointer to inserted record if + succeed */ + big_rec_t** big_rec,/*!< out: big rec vector whose fields have to + be stored externally by the caller */ + ulint n_ext, /*!< in: number of externally stored columns */ + que_thr_t* thr, /*!< in/out: query thread; can be NULL if + !(~flags + & (BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG)) */ + mtr_t* mtr) /*!< in/out: mini-transaction; + if this function returns DB_SUCCESS on + a leaf page of a secondary index in a + compressed tablespace, the caller must + mtr_commit(mtr) before latching + any further pages */ + MY_ATTRIBUTE((nonnull(2,3,4,5,6,7,10), warn_unused_result)); +/*************************************************************//** +Performs an insert on a page of an index tree. It is assumed that mtr +holds an x-latch on the tree and on the cursor page. If the insert is +made on the leaf level, to avoid deadlocks, mtr must also own x-latches +to brothers of page, if those brothers exist. 
+@return DB_SUCCESS or error number */ +dberr_t +btr_cur_pessimistic_insert( +/*=======================*/ + ulint flags, /*!< in: undo logging and locking flags: if not + zero, the parameter thr should be + specified; if no undo logging is specified, + then the caller must have reserved enough + free extents in the file space so that the + insertion will certainly succeed */ + btr_cur_t* cursor, /*!< in: cursor after which to insert; + cursor stays valid */ + rec_offs** offsets,/*!< out: offsets on *rec */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap + that can be emptied */ + dtuple_t* entry, /*!< in/out: entry to insert */ + rec_t** rec, /*!< out: pointer to inserted record if + succeed */ + big_rec_t** big_rec,/*!< out: big rec vector whose fields have to + be stored externally by the caller */ + ulint n_ext, /*!< in: number of externally stored columns */ + que_thr_t* thr, /*!< in/out: query thread; can be NULL if + !(~flags + & (BTR_NO_LOCKING_FLAG + | BTR_NO_UNDO_LOG_FLAG)) */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + MY_ATTRIBUTE((nonnull(2,3,4,5,6,7,10), warn_unused_result)); +/*************************************************************//** +See if there is enough place in the page modification log to log +an update-in-place. + +@retval false if out of space; IBUF_BITMAP_FREE will be reset +outside mtr if the page was recompressed +@retval true if enough place; + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE if this is +a secondary index leaf page. This has to be done either within the +same mini-transaction, or by invoking ibuf_reset_free_bits() before +mtr_commit(mtr). */ +bool +btr_cur_update_alloc_zip_func( +/*==========================*/ + page_zip_des_t* page_zip,/*!< in/out: compressed page */ + page_cur_t* cursor, /*!< in/out: B-tree page cursor */ +#ifdef UNIV_DEBUG + rec_offs* offsets,/*!< in/out: offsets of the cursor record */ +#endif /* UNIV_DEBUG */ + ulint length, /*!< in: size needed */ + bool create, /*!< in: true=delete-and-insert, + false=update-in-place */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +#ifdef UNIV_DEBUG +# define btr_cur_update_alloc_zip(page_zip,cursor,offsets,len,cr,mtr) \ + btr_cur_update_alloc_zip_func(page_zip,cursor,offsets,len,cr,mtr) +#else /* UNIV_DEBUG */ +# define btr_cur_update_alloc_zip(page_zip,cursor,offsets,len,cr,mtr) \ + btr_cur_update_alloc_zip_func(page_zip,cursor,len,cr,mtr) +#endif /* UNIV_DEBUG */ + +/** Apply an update vector to a record. No field size changes are allowed. + +This is usually invoked on a clustered index. The only use case for a +secondary index is row_ins_sec_index_entry_by_modify() or its +counterpart in ibuf_insert_to_index_page(). +@param[in,out] rec index record +@param[in] index the index of the record +@param[in] offsets rec_get_offsets(rec, index) +@param[in] update update vector +@param[in,out] block index page +@param[in,out] mtr mini-transaction */ +void btr_cur_upd_rec_in_place(rec_t *rec, const dict_index_t *index, + const rec_offs *offsets, const upd_t *update, + buf_block_t *block, mtr_t *mtr) + MY_ATTRIBUTE((nonnull)); +/*************************************************************//** +Updates a record when the update causes no size changes in its fields. 
+@return locking or undo log related error code, or +@retval DB_SUCCESS on success +@retval DB_ZIP_OVERFLOW if there is not enough space left +on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */ +dberr_t +btr_cur_update_in_place( +/*====================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor on the record to update; + cursor stays valid and positioned on the + same record */ + rec_offs* offsets,/*!< in/out: offsets on cursor->page_cur.rec */ + const upd_t* update, /*!< in: update vector */ + ulint cmpl_info,/*!< in: compiler info on secondary index + updates */ + que_thr_t* thr, /*!< in: query thread */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in/out: mini-transaction; if this + is a secondary index, the caller must + mtr_commit(mtr) before latching any + further pages */ + MY_ATTRIBUTE((warn_unused_result, nonnull)); +/*************************************************************//** +Tries to update a record on a page in an index tree. It is assumed that mtr +holds an x-latch on the page. The operation does not succeed if there is too +little space on the page or if the update would result in too empty a page, +so that tree compression is recommended. +@return error code, including +@retval DB_SUCCESS on success +@retval DB_OVERFLOW if the updated record does not fit +@retval DB_UNDERFLOW if the page would become too empty +@retval DB_ZIP_OVERFLOW if there is not enough space left +on the compressed page */ +dberr_t +btr_cur_optimistic_update( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in: cursor on the record to update; + cursor stays valid and positioned on the + same record */ + rec_offs** offsets,/*!< out: offsets on cursor->page_cur.rec */ + mem_heap_t** heap, /*!< in/out: pointer to NULL or memory heap */ + const upd_t* update, /*!< in: update vector; this must also + contain trx id and roll ptr fields */ + ulint cmpl_info,/*!< in: compiler info on secondary index + updates */ + que_thr_t* thr, /*!< in: query thread */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in/out: mini-transaction; if this + is a secondary index, the caller must + mtr_commit(mtr) before latching any + further pages */ + MY_ATTRIBUTE((warn_unused_result, nonnull)); +/*************************************************************//** +Performs an update of a record on a page of a tree. It is assumed +that mtr holds an x-latch on the tree and on the cursor page. If the +update is made on the leaf level, to avoid deadlocks, mtr must also +own x-latches to brothers of page, if those brothers exist. +@return DB_SUCCESS or error code */ +dberr_t +btr_cur_pessimistic_update( +/*=======================*/ + ulint flags, /*!< in: undo logging, locking, and rollback + flags */ + btr_cur_t* cursor, /*!< in/out: cursor on the record to update; + cursor may become invalid if *big_rec == NULL + || !(flags & BTR_KEEP_POS_FLAG) */ + rec_offs** offsets,/*!< out: offsets on cursor->page_cur.rec */ + mem_heap_t** offsets_heap, + /*!< in/out: pointer to memory heap + that can be emptied */ + mem_heap_t* entry_heap, + /*!< in/out: memory heap for allocating + big_rec and the index tuple */ + big_rec_t** big_rec,/*!< out: big rec vector whose fields have to + be stored externally by the caller */ + upd_t* update, /*!< in/out: update vector; this is allowed to + also contain trx id and roll ptr fields. 
+ Non-updated columns that are moved offpage will + be appended to this. */ + ulint cmpl_info,/*!< in: compiler info on secondary index + updates */ + que_thr_t* thr, /*!< in: query thread */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in/out: mini-transaction; must be committed + before latching any further pages */ + MY_ATTRIBUTE((warn_unused_result, nonnull)); +/***********************************************************//** +Marks a clustered index record deleted. Writes an undo log record to +undo log on this delete marking. Writes in the trx id field the id +of the deleting transaction, and in the roll ptr field pointer to the +undo log record created. +@return DB_SUCCESS, DB_LOCK_WAIT, or error number */ +dberr_t +btr_cur_del_mark_set_clust_rec( +/*===========================*/ + buf_block_t* block, /*!< in/out: buffer block of the record */ + rec_t* rec, /*!< in/out: record */ + dict_index_t* index, /*!< in: clustered index of the record */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec) */ + que_thr_t* thr, /*!< in: query thread */ + const dtuple_t* entry, /*!< in: dtuple for the deleting record */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*************************************************************//** +Tries to compress a page of the tree if it seems useful. It is assumed +that mtr holds an x-latch on the tree and on the cursor page. To avoid +deadlocks, mtr must also own x-latches to brothers of page, if those +brothers exist. NOTE: it is assumed that the caller has reserved enough +free extents so that the compression will always succeed if done! +@return whether compression occurred */ +bool +btr_cur_compress_if_useful( +/*=======================*/ + btr_cur_t* cursor, /*!< in/out: cursor on the page to compress; + cursor does not stay valid if !adjust and + compression occurs */ + bool adjust, /*!< in: whether the cursor position should be + adjusted even when compression occurs */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + MY_ATTRIBUTE((nonnull)); +/*******************************************************//** +Removes the record on which the tree cursor is positioned. It is assumed +that the mtr has an x-latch on the page where the cursor is positioned, +but no latch on the whole tree. +@return error code +@retval DB_FAIL if the page would become too empty */ +dberr_t +btr_cur_optimistic_delete( +/*======================*/ + btr_cur_t* cursor, /*!< in: cursor on the record to delete; + cursor stays valid: if deletion succeeds, + on function exit it points to the successor + of the deleted record */ + ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */ + mtr_t* mtr) /*!< in: mtr; if this function returns + TRUE on a leaf page of a secondary + index, the mtr must be committed + before latching any further pages */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*************************************************************//** +Removes the record on which the tree cursor is positioned. Tries +to compress the page if its fillfactor drops below a threshold +or if it is the only page on the level. It is assumed that mtr holds +an x-latch on the tree and on the cursor page. To avoid deadlocks, +mtr must also own x-latches to brothers of page, if those brothers +exist. 
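+
+A minimal sketch of an invocation (assuming that no extents were reserved,
+so the caller must be prepared to retry on DB_OUT_OF_FILE_SPACE; compare
+BTR_CUR_RETRY_DELETE_N_TIMES below):
+@code
+	dberr_t	err;
+	if (btr_cur_pessimistic_delete(&err, FALSE, cursor, 0, false, mtr)) {
+		// compression occurred; the cursor may no longer be valid
+	}
+	if (err == DB_OUT_OF_FILE_SPACE) {
+		// commit the mini-transaction, wait, and retry
+	}
+@endcode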
+@return TRUE if compression occurred */
+ibool
+btr_cur_pessimistic_delete(
+/*=======================*/
+	dberr_t*	err,	/*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE;
+				the latter may occur because we may have
+				to update node pointers on upper levels,
+				and in the case of variable length keys
+				these may actually grow in size */
+	ibool		has_reserved_extents, /*!< in: TRUE if the
+				caller has already reserved enough free
+				extents so that the operation is known
+				to succeed */
+	btr_cur_t*	cursor,	/*!< in: cursor on the record to delete;
+				if compression does not occur, the cursor
+				stays valid: it points to successor of
+				deleted record on function exit */
+	ulint		flags,	/*!< in: BTR_CREATE_FLAG or 0 */
+	bool		rollback,/*!< in: performing rollback? */
+	mtr_t*		mtr)	/*!< in: mtr */
+	MY_ATTRIBUTE((nonnull));
+/** Delete the node pointer in a parent page.
+@param[in,out]	parent	cursor pointing to parent record
+@param[in,out]	mtr	mini-transaction */
+dberr_t btr_cur_node_ptr_delete(btr_cur_t* parent, mtr_t* mtr)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/***********************************************************//**
+Parses a redo log record of updating a record in-place.
+@return end of log record or NULL */
+byte*
+btr_cur_parse_update_in_place(
+/*==========================*/
+	byte*		ptr,	/*!< in: buffer */
+	byte*		end_ptr,/*!< in: buffer end */
+	page_t*		page,	/*!< in/out: page or NULL */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
+	dict_index_t*	index);	/*!< in: index corresponding to page */
+/** Arguments to btr_estimate_n_rows_in_range */
+struct btr_pos_t
+{
+  btr_pos_t(dtuple_t *arg_tuple,
+            page_cur_mode_t arg_mode,
+            page_id_t arg_page_id)
+    :tuple(arg_tuple), mode(arg_mode), page_id(arg_page_id)
+  {}
+
+  dtuple_t*       tuple;   /* Range start or end. May be NULL */
+  page_cur_mode_t mode;    /* search mode for range */
+  page_id_t       page_id; /* Out: Page where we found the tuple */
+};
+
+/** Estimates the number of rows in a given index range. Search the left
+page first; if there are pages between the left and right ones, read a few
+pages to the right. If the right page is reached, fetch it and count the
+exact number of rows; otherwise, count the estimated (see
+btr_estimate_n_rows_in_range_on_level() for details) number of rows, and
+fetch the right page. When the leaves are reached, unlatch non-leaf pages
+except the right leaf parent. After the right leaf page is fetched, commit
+the mtr.
+@param[in]	index		index
+@param[in]	range_start	range start
+@param[in]	range_end	range end
+@return estimated number of rows */
+ha_rows btr_estimate_n_rows_in_range(dict_index_t *index,
+                                     btr_pos_t *range_start,
+                                     btr_pos_t *range_end);
+
+/** Gets the externally stored size of a record, in units of a database page.
+@param[in]	rec	record
+@param[in]	offsets	array returned by rec_get_offsets()
+@return externally stored part, in units of a database page */
+ulint
+btr_rec_get_externally_stored_len(
+	const rec_t*	rec,
+	const rec_offs*	offsets);
+
+/*******************************************************************//**
+Marks non-updated off-page fields as disowned by this record. The ownership
+must be transferred to the updated record which is inserted elsewhere in the
+index tree. In purge only the owner of an externally stored field is allowed
+to free the field. */
+void
+btr_cur_disown_inherited_fields(
+/*============================*/
+	buf_block_t*	block,	/*!< in/out: index page */
+	rec_t*		rec,	/*!< in/out: record in a clustered index */
+	dict_index_t*	index,	/*!< in: index of the page */
+	const rec_offs*	offsets,/*!< in: array returned by rec_get_offsets() */
+	const upd_t*	update,	/*!< in: update vector */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull(2,3,4,5,6)));
+
+/** Operation code for btr_store_big_rec_extern_fields(). */
+enum blob_op {
+	/** Store off-page columns for a freshly inserted record */
+	BTR_STORE_INSERT = 0,
+	/** Store off-page columns for an insert by update */
+	BTR_STORE_INSERT_UPDATE,
+	/** Store off-page columns for an update */
+	BTR_STORE_UPDATE,
+	/** Store off-page columns for a freshly inserted record by bulk */
+	BTR_STORE_INSERT_BULK
+};
+
+/*******************************************************************//**
+Determine if an operation on off-page columns is an update.
+@return TRUE if op != BTR_STORE_INSERT */
+UNIV_INLINE
+ibool
+btr_blob_op_is_update(
+/*==================*/
+	enum blob_op	op)	/*!< in: operation */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/*******************************************************************//**
+Stores the fields in big_rec_vec to the tablespace and puts pointers to
+them in rec. The extern flags in rec will have to be set beforehand.
+The fields are stored on pages allocated from the leaf node
+file segment of the index tree.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+dberr_t
+btr_store_big_rec_extern_fields(
+/*============================*/
+	btr_pcur_t*	pcur,		/*!< in: a persistent cursor */
+	rec_offs*	offsets,	/*!< in/out: rec_get_offsets() on
+					pcur. The "external storage" flags
+					in offsets will correctly correspond
+					to rec when this function returns */
+	const big_rec_t*big_rec_vec,	/*!< in: vector containing fields
+					to be stored externally */
+	mtr_t*		btr_mtr,	/*!< in/out: mtr containing the
+					latches to the clustered index. Can be
+					committed and restarted. */
+	enum blob_op	op)	/*!< in: operation code */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/*******************************************************************//**
+Frees the space in an externally stored field to the file space
+management, if the field in data owns the externally stored field.
+In a rollback we may have the additional condition that the field
+must not be inherited. */
+void
+btr_free_externally_stored_field(
+/*=============================*/
+	dict_index_t*	index,		/*!< in: index of the data, the index
+					tree MUST be X-latched; if the tree
+					height is 1, then also the root page
+					must be X-latched! (this is relevant
+					in the case this function is called
+					from purge where 'data' is located on
+					an undo log page, not an index
+					page) */
+	byte*		field_ref,	/*!< in/out: field reference */
+	const rec_t*	rec,		/*!< in: record containing field_ref, for
+					page_zip_write_blob_ptr(), or NULL */
+	const rec_offs*	offsets,	/*!< in: rec_get_offsets(rec, index),
+					or NULL */
+	buf_block_t*	block,		/*!< in/out: page of field_ref */
+	ulint		i,		/*!< in: field number of field_ref;
+					ignored if rec == NULL */
+	bool		rollback,	/*!< in: performing rollback? */
+	mtr_t*		local_mtr)	/*!< in: mtr containing the latch */
+	MY_ATTRIBUTE((nonnull(1,2,5,8)));
+
+/** Copies the prefix of an externally stored field of a record.
+The clustered index record must be protected by a lock or a page latch.
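+
+A minimal sketch of a call (the buffer size is an arbitrary assumption;
+data and local_len are as described by the parameters below):
+@code
+	byte	buf[REC_VERSION_56_MAX_INDEX_COL_LEN];
+	ulint	copied = btr_copy_externally_stored_field_prefix(
+		buf, sizeof buf, 0, data, local_len);
+@endcode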
+@param[out]	buf		the field, or a prefix of it
+@param[in]	len		length of buf, in bytes
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	data		'internally' stored part of the field
+containing also the reference to the external part; must be protected by
+a lock or a page latch
+@param[in]	local_len	length of data, in bytes
+@return the length of the copied field, or 0 if the column was being
+or has been deleted */
+ulint
+btr_copy_externally_stored_field_prefix(
+	byte*		buf,
+	ulint		len,
+	ulint		zip_size,
+	const byte*	data,
+	ulint		local_len);
+
+/** Copies an externally stored field of a record to mem heap.
+The clustered index record must be protected by a lock or a page latch.
+@param[out]	len		length of the whole field
+@param[in]	data		'internally' stored part of the field
+containing also the reference to the external part; must be protected by
+a lock or a page latch
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	local_len	length of data
+@param[in,out]	heap		mem heap
+@return the whole field copied to heap */
+byte*
+btr_copy_externally_stored_field(
+	ulint*		len,
+	const byte*	data,
+	ulint		zip_size,
+	ulint		local_len,
+	mem_heap_t*	heap);
+
+/** Copies an externally stored field of a record to mem heap.
+@param[in]	rec		record in a clustered index; must be
+protected by a lock or a page latch
+@param[in]	offsets		array returned by rec_get_offsets()
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	no		field number
+@param[out]	len		length of the field
+@param[in,out]	heap		mem heap
+@return the field copied to heap, or NULL if the field is incomplete */
+byte*
+btr_rec_copy_externally_stored_field(
+	const rec_t*	rec,
+	const rec_offs*	offsets,
+	ulint		zip_size,
+	ulint		no,
+	ulint*		len,
+	mem_heap_t*	heap);
+
+/*######################################################################*/
+
+/** In the pessimistic delete, if the page data size drops below this
+limit, merging it to a neighbor is tried */
+#define BTR_CUR_PAGE_COMPRESS_LIMIT(index) \
+	((srv_page_size * (ulint)((index)->merge_threshold)) / 100)
+
+/** A slot in the path array. We store here info on a search path down the
+tree. Each slot contains data on a single level of the tree. */
+struct btr_path_t {
+	/* Assume a page like:
+	records:		(inf, a, b, c, d, sup)
+	index of the record:	0, 1, 2, 3, 4, 5
+	*/
+
+	/** Index of the record where the page cursor stopped on this level
+	(index in alphabetical order). Value ULINT_UNDEFINED denotes array
+	end. In the above example, if the search stopped on record 'c', then
+	nth_rec will be 3. */
+	ulint	nth_rec;
+
+	/** Number of records on the page, not counting inf and sup.
+	In the above example n_recs will be 4. */
+	ulint	n_recs;
+
+	/** Number of the page containing the record. */
+	uint32_t	page_no;
+
+	/** Level of the page. If we later fetch the page under page_no
+	and it is on a different level, then we know that the tree has been
+	reorganized. */
+	ulint	page_level;
+};
+
+#define BTR_PATH_ARRAY_N_SLOTS	250	/*!< size of path array (in slots) */
+
+/** Values for the flag documenting the used search method */
+enum btr_cur_method {
+	BTR_CUR_HASH = 1,	/*!< successful shortcut using
+				the hash index */
+	BTR_CUR_HASH_FAIL,	/*!< failure using hash, success using
+				binary search: the misleading hash
+				reference is stored in the field
+				hash_node, and it might be necessary
+				to update it */
+	BTR_CUR_BINARY,		/*!< success using the binary search */
+	BTR_CUR_INSERT_TO_IBUF,	/*!< performed the intended insert to
+				the insert buffer */
+	BTR_CUR_DEL_MARK_IBUF,	/*!< performed the intended delete
+				mark in the insert/delete buffer */
+	BTR_CUR_DELETE_IBUF,	/*!< performed the intended delete in
+				the insert/delete buffer */
+	BTR_CUR_DELETE_REF	/*!< row_purge_poss_sec() failed */
+};
+
+/** The tree cursor: the definition appears here only for the compiler
+to know struct size! */
+struct btr_cur_t {
+	page_cur_t	page_cur;	/*!< page cursor */
+	purge_node_t*	purge_node;	/*!< purge node, for BTR_DELETE */
+	/*------------------------------*/
+	que_thr_t*	thr;		/*!< this field is only used
+					when search_leaf()
+					is called for an index entry
+					insertion: the calling query
+					thread is passed here to be
+					used in the insert buffer */
+	/*------------------------------*/
+	/** The following fields are used in
+	search_leaf() to pass information: */
+	/* @{ */
+	enum btr_cur_method	flag;	/*!< Search method used */
+	ulint		tree_height;	/*!< Tree height if the search is done
+					for a pessimistic insert or update
+					operation */
+	ulint		up_match;	/*!< If the search mode was PAGE_CUR_LE,
+					the number of matched fields to
+					the first user record to the right of
+					the cursor record after search_leaf();
+					for the mode PAGE_CUR_GE, the matched
+					fields to the first user record AT THE
+					CURSOR or to the right of it;
+					NOTE that the up_match and low_match
+					values may exceed the correct values
+					for comparison to the adjacent user
+					record if that record is on a
+					different leaf page! (See the note in
+					row_ins_duplicate_error_in_clust.) */
+	ulint		up_bytes;	/*!< number of matched bytes to the
+					right at the time cursor positioned;
+					only used internally in searches: not
+					defined after the search */
+	ulint		low_match;	/*!< if search mode was PAGE_CUR_LE,
+					the number of matched fields to the
+					first user record AT THE CURSOR or
+					to the left of it after search_leaf();
+					NOT defined for PAGE_CUR_GE or any
+					other search modes; see also the NOTE
+					in up_match! */
+	ulint		low_bytes;	/*!< number of matched bytes to the
+					left at the time cursor positioned;
+					only used internally in searches: not
+					defined after the search */
+	ulint		n_fields;	/*!< prefix length used in a hash
+					search if hash_node != NULL */
+	ulint		n_bytes;	/*!< hash prefix bytes if hash_node !=
+					NULL */
+	ulint		fold;		/*!< fold value used in the search if
+					flag is BTR_CUR_HASH */
+	/* @} */
+	btr_path_t*	path_arr;	/*!< in estimating the number of
+					rows in range, we store in this array
+					information about the path through
+					the tree */
+	rtr_info_t*	rtr_info;	/*!< rtree search info */
+	btr_cur_t() { memset((void*) this, 0, sizeof *this); }
+
+  dict_index_t *index() const { return page_cur.index; }
+  buf_block_t *block() const { return page_cur.block; }
+
+  /** Open the cursor on the first or last record.
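+  A minimal usage sketch (assuming index and mtr have been set up by the
+  caller):
+  @code
+	btr_cur_t	cur;
+	dberr_t	err = cur.open_leaf(true, index, BTR_SEARCH_LEAF, mtr);
+  @endcode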
+  @param first true=first record, false=last record
+  @param index B-tree
+  @param latch_mode which latches to acquire
+  @param mtr mini-transaction
+  @return error code */
+  dberr_t open_leaf(bool first, dict_index_t *index, btr_latch_mode latch_mode,
+                    mtr_t *mtr);
+
+  /** Search the leaf page record corresponding to a key.
+  @param tuple key to search for, with correct n_fields_cmp
+  @param mode search mode; PAGE_CUR_LE for unique prefix or for inserting
+  @param latch_mode latch mode
+  @param mtr mini-transaction
+  @return error code */
+  dberr_t search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,
+                      btr_latch_mode latch_mode, mtr_t *mtr);
+
+  /** Search the leaf page record corresponding to a key, exclusively latching
+  all sibling pages on the way.
+  @param tuple key to search for, with correct n_fields_cmp
+  @param mode search mode; PAGE_CUR_LE for unique prefix or for inserting
+  @param mtr mini-transaction
+  @return error code */
+  dberr_t pessimistic_search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,
+                                  mtr_t *mtr);
+
+  /** Open the cursor at a random leaf page record.
+  @param offsets temporary memory for rec_get_offsets()
+  @param heap memory heap for rec_get_offsets()
+  @param mtr mini-transaction
+  @return error code */
+  inline dberr_t open_random_leaf(rec_offs *&offsets, mem_heap_t *& heap,
+                                  mtr_t &mtr);
+};
+
+/** Modify the delete-mark flag of a record.
+@tparam flag	the value of the delete-mark flag
+@param[in,out]	block	buffer block
+@param[in,out]	rec	record on a physical index page
+@param[in,out]	mtr	mini-transaction */
+template<bool flag>
+void btr_rec_set_deleted(buf_block_t *block, rec_t *rec, mtr_t *mtr)
+	MY_ATTRIBUTE((nonnull));
+
+/** If pessimistic delete fails because of lack of file space, there
+is still a good chance of success a little later. Try this many
+times. */
+#define BTR_CUR_RETRY_DELETE_N_TIMES	100
+/** If pessimistic delete fails because of lack of file space, there
+is still a good chance of success a little later. Sleep this time
+between retries. */
+static const std::chrono::milliseconds BTR_CUR_RETRY_SLEEP_TIME(50);
+
+/** The reference in a field for which data is stored on a different page.
+The reference is at the end of the 'locally' stored part of the field.
+'Locally' means storage in the index record.
+We store locally a long enough prefix of each column so that we can determine
+the ordering parts of each index record without looking into the externally
+stored part. */
+/*-------------------------------------- @{ */
+#define BTR_EXTERN_SPACE_ID		0U	/*!< space id where stored */
+#define BTR_EXTERN_PAGE_NO		4U	/*!< page no where stored */
+#define BTR_EXTERN_OFFSET		8U	/*!< offset of BLOB header
+						on that page */
+#define BTR_EXTERN_LEN			12U	/*!< 8 bytes containing the
+						length of the externally
+						stored part of the BLOB.
+						The 2 highest bits are
+						reserved for the flags below. */
+/*-------------------------------------- @} */
+/* #define BTR_EXTERN_FIELD_REF_SIZE	20	// moved to btr0types.h */
+
+/** The most significant bit of BTR_EXTERN_LEN (i.e., the most
+significant bit of the byte at smallest address) is set to 1 if this
+field does not 'own' the externally stored field; only the owner field
+is allowed to free the field in purge! */
+#define BTR_EXTERN_OWNER_FLAG		128U
+/** If the second most significant bit of BTR_EXTERN_LEN (i.e., the
+second most significant bit of the byte at smallest address) is 1 then
+it means that the externally stored field was inherited from an
+earlier version of the row.
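+
+A minimal sketch of testing these flags (assuming field_ref points to the
+20-byte reference described above):
+@code
+	byte	flag_byte = mach_read_from_1(field_ref + BTR_EXTERN_LEN);
+	bool	owned = !(flag_byte & BTR_EXTERN_OWNER_FLAG);
+	bool	inherited = !!(flag_byte & BTR_EXTERN_INHERITED_FLAG);
+@endcode
+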
In rollback we are not allowed to free an +inherited external field. */ +#define BTR_EXTERN_INHERITED_FLAG 64U + +#ifdef BTR_CUR_HASH_ADAPT +/** Number of searches down the B-tree in btr_cur_t::search_leaf(). */ +extern ib_counter_t btr_cur_n_non_sea; +/** Old value of btr_cur_n_non_sea. Copied by +srv_refresh_innodb_monitor_stats(). Referenced by +srv_printf_innodb_monitor(). */ +extern ulint btr_cur_n_non_sea_old; +/** Number of successful adaptive hash index lookups in +btr_cur_t::search_leaf(). */ +extern ib_counter_t btr_cur_n_sea; +/** Old value of btr_cur_n_sea. Copied by +srv_refresh_innodb_monitor_stats(). Referenced by +srv_printf_innodb_monitor(). */ +extern ulint btr_cur_n_sea_old; +#endif /* BTR_CUR_HASH_ADAPT */ + +#ifdef UNIV_DEBUG +/* Flag to limit optimistic insert records */ +extern uint btr_cur_limit_optimistic_insert_debug; +#endif /* UNIV_DEBUG */ + +#include "btr0cur.inl" + +#endif diff --git a/storage/innobase/include/btr0cur.inl b/storage/innobase/include/btr0cur.inl new file mode 100644 index 00000000..955cf342 --- /dev/null +++ b/storage/innobase/include/btr0cur.inl @@ -0,0 +1,170 @@ +/***************************************************************************** + +Copyright (c) 1994, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/btr0cur.ic +The index tree cursor + +Created 10/16/1994 Heikki Tuuri +*******************************************************/ + +#include "btr0btr.h" + +#ifdef UNIV_DEBUG +# define LIMIT_OPTIMISTIC_INSERT_DEBUG(NREC, CODE)\ +if (btr_cur_limit_optimistic_insert_debug > 1\ + && (NREC) >= btr_cur_limit_optimistic_insert_debug) {\ + CODE;\ +} +#else +# define LIMIT_OPTIMISTIC_INSERT_DEBUG(NREC, CODE) +#endif /* UNIV_DEBUG */ + +/*********************************************************//** +Returns the compressed page on which the tree cursor is positioned. +@return pointer to compressed page, or NULL if the page is not compressed */ +UNIV_INLINE +page_zip_des_t* +btr_cur_get_page_zip( +/*=================*/ + btr_cur_t* cursor) /*!< in: tree cursor */ +{ + return(buf_block_get_page_zip(btr_cur_get_block(cursor))); +} + +/*********************************************************//** +Returns the page of a tree cursor. +@return pointer to page */ +UNIV_INLINE +page_t* +btr_cur_get_page( +/*=============*/ + btr_cur_t* cursor) /*!< in: tree cursor */ +{ + return(page_align(page_cur_get_rec(&(cursor->page_cur)))); +} + +/*********************************************************//** +Positions a tree cursor at a given record. 
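+For example (a sketch; rec and block must refer to a record in index):
+@code
+	btr_cur_t	cursor;
+	btr_cur_position(index, rec, block, &cursor);
+@endcode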
*/ +UNIV_INLINE +void +btr_cur_position( +/*=============*/ + dict_index_t* index, /*!< in: index */ + rec_t* rec, /*!< in: record in tree */ + buf_block_t* block, /*!< in: buffer block of rec */ + btr_cur_t* cursor) /*!< out: cursor */ +{ + page_cur_position(rec, block, btr_cur_get_page_cur(cursor)); + cursor->page_cur.index = index; +} + +/*********************************************************************//** +Checks if compressing an index page where a btr cursor is placed makes +sense. +@return TRUE if compression is recommended */ +UNIV_INLINE +ibool +btr_cur_compress_recommendation( +/*============================*/ + btr_cur_t* cursor, /*!< in: btr cursor */ + mtr_t* mtr) /*!< in: mtr */ +{ + const page_t* page; + + ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor), + MTR_MEMO_PAGE_X_FIX)); + + page = btr_cur_get_page(cursor); + + LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page) * 2U, + return(FALSE)); + + if (!page_has_siblings(page) + || page_get_data_size(page) + < BTR_CUR_PAGE_COMPRESS_LIMIT(cursor->index())) { + + /* The page fillfactor has dropped below a predefined + minimum value OR the level in the B-tree contains just + one page: we recommend compression if this is not the + root page. */ + + return cursor->index()->page + != btr_cur_get_block(cursor)->page.id().page_no(); + } + + return(FALSE); +} + +/*********************************************************************//** +Checks if the record on which the cursor is placed can be deleted without +making tree compression necessary (or, recommended). +@return TRUE if can be deleted without recommended compression */ +UNIV_INLINE +ibool +btr_cur_can_delete_without_compress( +/*================================*/ + btr_cur_t* cursor, /*!< in: btr cursor */ + ulint rec_size,/*!< in: rec_get_size(btr_cur_get_rec(cursor))*/ + mtr_t* mtr) /*!< in: mtr */ +{ + page_t* page; + + ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor), + MTR_MEMO_PAGE_X_FIX)); + + page = btr_cur_get_page(cursor); + + if (!page_has_siblings(page) || page_get_n_recs(page) < 2 + || page_get_data_size(page) - rec_size + < BTR_CUR_PAGE_COMPRESS_LIMIT(cursor->index())) { + + /* The page fillfactor will drop below a predefined + minimum value, OR the level in the B-tree contains just + one page, OR the page will become empty: we recommend + compression if this is not the root page. */ + + return cursor->index()->page + == btr_cur_get_block(cursor)->page.id().page_no(); + } + + return(TRUE); +} + +/*******************************************************************//** +Determine if an operation on off-page columns is an update. +@return TRUE if op != BTR_STORE_INSERT */ +UNIV_INLINE +ibool +btr_blob_op_is_update( +/*==================*/ + enum blob_op op) /*!< in: operation */ +{ + switch (op) { + case BTR_STORE_INSERT: + case BTR_STORE_INSERT_BULK: + return(FALSE); + case BTR_STORE_INSERT_UPDATE: + case BTR_STORE_UPDATE: + return(TRUE); + } + + ut_ad(0); + return(FALSE); +} diff --git a/storage/innobase/include/btr0defragment.h b/storage/innobase/include/btr0defragment.h new file mode 100644 index 00000000..0523829b --- /dev/null +++ b/storage/innobase/include/btr0defragment.h @@ -0,0 +1,65 @@ +/***************************************************************************** + +Copyright (C) 2013, 2014 Facebook, Inc. All Rights Reserved. +Copyright (C) 2014, 2021, MariaDB Corporation. 
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#ifndef btr0defragment_h
+#define btr0defragment_h
+
+#include "btr0pcur.h"
+
+/* Max number of pages to consider at once during defragmentation. */
+#define BTR_DEFRAGMENT_MAX_N_PAGES	32
+
+/** stats in btr_defragment */
+extern Atomic_counter<ulint> btr_defragment_compression_failures;
+extern Atomic_counter<ulint> btr_defragment_failures;
+extern Atomic_counter<ulint> btr_defragment_count;
+
+/******************************************************************//**
+Initialize defragmentation. */
+void
+btr_defragment_init(void);
+/******************************************************************//**
+Shutdown defragmentation. */
+void
+btr_defragment_shutdown();
+/******************************************************************//**
+Check whether the given index is in btr_defragment_wq. */
+bool
+btr_defragment_find_index(
+	dict_index_t*	index);	/*!< Index to find. */
+/** Defragment an index.
+@param pcur  persistent cursor
+@param thd   current session, for checking thd_killed()
+@return whether the operation was interrupted */
+bool btr_defragment_add_index(btr_pcur_t *pcur, THD *thd);
+/******************************************************************//**
+When a table is dropped, this function is called to mark a table as removed in
+btr_defragment_wq. The difference between this function and the remove_index
+function is that this one will not NULL the event. */
+void
+btr_defragment_remove_table(
+	dict_table_t*	table);	/*!< Table to be removed. */
+/*********************************************************************//**
+Check whether we should save defragmentation statistics to persistent
+storage. */
+void btr_defragment_save_defrag_stats_if_needed(dict_index_t *index);
+
+/* Stop defragmentation. */
+void btr_defragment_end();
+extern bool btr_defragment_active;
+#endif
diff --git a/storage/innobase/include/btr0pcur.h b/storage/innobase/include/btr0pcur.h
new file mode 100644
index 00000000..c66a3bfa
--- /dev/null
+++ b/storage/innobase/include/btr0pcur.h
@@ -0,0 +1,459 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0pcur.h
+The index tree persistent cursor
+
+Created 2/23/1996 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "dict0dict.h"
+#include "btr0cur.h"
+#include "buf0block_hint.h"
+#include "btr0btr.h"
+#include "gis0rtree.h"
+
+/* Relative positions for a stored cursor position */
+enum btr_pcur_pos_t {
+	BTR_PCUR_ON		= 1,
+	BTR_PCUR_BEFORE		= 2,
+	BTR_PCUR_AFTER		= 3,
+/* Note that if the tree is not empty, btr_pcur_store_position does not
+use the following, but only uses the above three alternatives, where the
+position is stored relative to a specific record: this makes implementation
+of a scroll cursor easier */
+	BTR_PCUR_BEFORE_FIRST_IN_TREE	= 4,	/* in an empty tree */
+	BTR_PCUR_AFTER_LAST_IN_TREE	= 5	/* in an empty tree */
+};
+
+/**************************************************************//**
+Resets a persistent cursor object, freeing ::old_rec_buf if it is
+allocated and resetting the other members to their initial values. */
+void
+btr_pcur_reset(
+/*===========*/
+	btr_pcur_t*	cursor);/*!< in, out: persistent cursor */
+
+/**************************************************************//**
+Copies the stored position of a pcur to another pcur. */
+void
+btr_pcur_copy_stored_position(
+/*==========================*/
+	btr_pcur_t*	pcur_receive,	/*!< in: pcur which will receive the
+					position info */
+	btr_pcur_t*	pcur_donate);	/*!< in: pcur from which the info is
+					copied */
+/**************************************************************//**
+Sets the old_rec_buf field to NULL. */
+UNIV_INLINE
+void
+btr_pcur_init(
+/*==========*/
+	btr_pcur_t*	pcur);	/*!< in: persistent cursor */
+
+/** Opens a persistent cursor to an index tree without initializing the
+cursor.
+@param tuple      tuple on which search done
+@param mode       PAGE_CUR_L, ...; NOTE that if the search is made using a
+                  unique prefix of a record, mode should be PAGE_CUR_LE, not
+                  PAGE_CUR_GE, as the latter may end up on the previous page of
+                  the record!
+@param latch_mode BTR_SEARCH_LEAF, ...
+@param cursor     memory buffer for persistent cursor
+@param mtr        mini-transaction
+@return DB_SUCCESS on success or error code otherwise. */
+inline
+dberr_t btr_pcur_open_with_no_init(const dtuple_t *tuple, page_cur_mode_t mode,
+                                   btr_latch_mode latch_mode,
+                                   btr_pcur_t *cursor, mtr_t *mtr);
+
+/**************************************************************//**
+Gets the up_match value for a pcur after a search.
+@return number of matched fields at the cursor or to the right if
+search mode was PAGE_CUR_GE, otherwise undefined */
+UNIV_INLINE
+ulint
+btr_pcur_get_up_match(
+/*==================*/
+	const btr_pcur_t*	cursor); /*!< in: persistent cursor */
+/**************************************************************//**
+Gets the low_match value for a pcur after a search.
+@return number of matched fields at the cursor or to the left if
+search mode was PAGE_CUR_LE, otherwise undefined */
+UNIV_INLINE
+ulint
+btr_pcur_get_low_match(
+/*===================*/
+	const btr_pcur_t*	cursor); /*!< in: persistent cursor */
+
+/**************************************************************//**
+Frees the possible memory heap of a persistent cursor and sets the latch
+mode of the persistent cursor to BTR_NO_LATCHES.
+WARNING: this function does not release the latch on the page where the
+cursor is currently positioned. The latch is acquired by the
+"move to next/previous" family of functions. Since recursive shared locks
+are not allowed, you must take care (if using the cursor in S-mode) to
+manually release the latch by either calling
+btr_leaf_page_release(btr_pcur_get_block(&pcur), pcur.latch_mode, mtr)
+or by mtr_t::commit(). */
+UNIV_INLINE
+void
+btr_pcur_close(
+/*===========*/
+	btr_pcur_t*	cursor);	/*!< in: persistent cursor */
+/**************************************************************//**
+The position of the cursor is stored by taking an initial segment of the
+record the cursor is positioned on, before, or after, and copying it to the
+cursor data structure, or just setting a flag if the cursor is before the
+first in an EMPTY tree, or after the last in an EMPTY tree. NOTE that the
+page where the cursor is positioned must not be empty if the index tree is
+not totally empty! */
+void
+btr_pcur_store_position(
+/*====================*/
+	btr_pcur_t*	cursor,	/*!< in: persistent cursor */
+	mtr_t*		mtr);	/*!< in: mtr */
+/*********************************************************//**
+Gets the rel_pos field for a cursor whose position has been stored.
+@return BTR_PCUR_ON, ... */
+UNIV_INLINE
+ulint
+btr_pcur_get_rel_pos(
+/*=================*/
+	const btr_pcur_t*	cursor);/*!< in: persistent cursor */
+/**************************************************************//**
+Commits the mtr and sets the pcur latch mode to BTR_NO_LATCHES,
+that is, the cursor becomes detached.
+Function btr_pcur_store_position should be used before calling this,
+if restoration of the cursor is wanted later. */
+UNIV_INLINE
+void
+btr_pcur_commit_specify_mtr(
+/*========================*/
+	btr_pcur_t*	pcur,	/*!< in: persistent cursor */
+	mtr_t*		mtr);	/*!< in: mtr to commit */
+
+/** Commits the mtr and sets the clustered index pcur and secondary index
+pcur latch mode to BTR_NO_LATCHES, that is, the cursor becomes detached.
+Function btr_pcur_store_position should be used for both cursors before
+calling this, if restoration of the cursors is wanted later.
+@param[in]	pcur		persistent cursor
+@param[in]	sec_pcur	secondary index persistent cursor
+@param[in]	mtr		mtr to commit */
+UNIV_INLINE
+void
+btr_pcurs_commit_specify_mtr(
+	btr_pcur_t*	pcur,
+	btr_pcur_t*	sec_pcur,
+	mtr_t*		mtr);
+
+/*********************************************************//**
+Moves the persistent cursor to the next record in the tree. If no records are
+left, the cursor stays 'after last in tree'.
+@return TRUE if the cursor was not after last in tree */
+UNIV_INLINE
+ibool
+btr_pcur_move_to_next(
+/*==================*/
+	btr_pcur_t*	cursor,	/*!< in: persistent cursor; NOTE that the
+				function may release the page latch */
+	mtr_t*		mtr);	/*!< in: mtr */
+/*********************************************************//**
+Moves the persistent cursor to the previous record in the tree. If no records
+are left, the cursor stays 'before first in tree'.
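+
+A minimal sketch of one backward step (the cursor is assumed to be
+positioned and latched):
+@code
+	if (!btr_pcur_move_to_prev(cursor, mtr)) {
+		// no records are left; the cursor is before first in tree
+	}
+@endcode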
+@return true if the cursor was not before first in tree */
+bool
+btr_pcur_move_to_prev(
+/*==================*/
+	btr_pcur_t*	cursor,	/*!< in: persistent cursor; NOTE that the
+				function may release the page latch */
+	mtr_t*		mtr)	/*!< in: mtr */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************//**
+Moves the persistent cursor to the next user record in the tree. If no user
+records are left, the cursor ends up 'after last in tree'.
+@return TRUE if the cursor moved forward, ending on a user record */
+UNIV_INLINE
+ibool
+btr_pcur_move_to_next_user_rec(
+/*===========================*/
+	btr_pcur_t*	cursor,	/*!< in: persistent cursor; NOTE that the
+				function may release the page latch */
+	mtr_t*		mtr);	/*!< in: mtr */
+/*********************************************************//**
+Moves the persistent cursor to the first record on the next page.
+Releases the latch on the current page, and buffer-unfixes it.
+Note that there must not be modifications on the current page,
+as then the x-latch can be released only in mtr_commit. */
+dberr_t
+btr_pcur_move_to_next_page(
+/*=======================*/
+	btr_pcur_t*	cursor,	/*!< in: persistent cursor; must be on the
+				last record of the current page */
+	mtr_t*		mtr)	/*!< in: mtr */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+#define btr_pcur_get_btr_cur(cursor) (&(cursor)->btr_cur)
+#define btr_pcur_get_page_cur(cursor) (&(cursor)->btr_cur.page_cur)
+#define btr_pcur_get_page(cursor) btr_pcur_get_block(cursor)->page.frame
+
+/*********************************************************//**
+Checks if the persistent cursor is on a user record. */
+UNIV_INLINE
+ibool
+btr_pcur_is_on_user_rec(
+/*====================*/
+	const btr_pcur_t*	cursor);/*!< in: persistent cursor */
+/*********************************************************//**
+Checks if the persistent cursor is after the last user record on
+a page. */
+UNIV_INLINE
+ibool
+btr_pcur_is_after_last_on_page(
+/*===========================*/
+	const btr_pcur_t*	cursor);/*!< in: persistent cursor */
+/*********************************************************//**
+Checks if the persistent cursor is before the first user record on
+a page. */
+UNIV_INLINE
+ibool
+btr_pcur_is_before_first_on_page(
+/*=============================*/
+	const btr_pcur_t*	cursor);/*!< in: persistent cursor */
+/*********************************************************//**
+Checks if the persistent cursor is before the first user record in
+the index tree. */
+static inline bool btr_pcur_is_before_first_in_tree(btr_pcur_t* cursor);
+/*********************************************************//**
+Checks if the persistent cursor is after the last user record in
+the index tree. */
+static inline bool btr_pcur_is_after_last_in_tree(btr_pcur_t* cursor);
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+/*********************************************************//**
+Moves the persistent cursor to the next record on the same page. */
+UNIV_INLINE
+rec_t*
+btr_pcur_move_to_next_on_page(
+/*==========================*/
+	btr_pcur_t*	cursor);/*!< in/out: persistent cursor */
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+/*********************************************************//**
+Moves the persistent cursor to the previous record on the same page. */
+UNIV_INLINE
+rec_t*
+btr_pcur_move_to_prev_on_page(
+/*==========================*/
+	btr_pcur_t*	cursor);/*!< in/out: persistent cursor */
+/*********************************************************//**
+Moves the persistent cursor to the infimum record on the same page. */
+UNIV_INLINE
+void
+btr_pcur_move_before_first_on_page(
+/*===============================*/
+	btr_pcur_t*	cursor); /*!< in/out: persistent cursor */
+
+/** Position state of persistent B-tree cursor. */
+enum pcur_pos_t {
+	/** The persistent cursor is not positioned. */
+	BTR_PCUR_NOT_POSITIONED = 0,
+	/** The persistent cursor was previously positioned.
+	TODO: currently, the state can be BTR_PCUR_IS_POSITIONED,
+	though it really should be BTR_PCUR_WAS_POSITIONED,
+	because we have no obligation to commit the cursor with
+	mtr; similarly latch_mode may be out of date. This can
+	lead to problems if btr_pcur is not used the right way;
+	all current code should be ok. */
+	BTR_PCUR_WAS_POSITIONED,
+	/** The persistent cursor is positioned by optimistic get to the same
+	record as it was positioned at. Not used for rel_pos == BTR_PCUR_ON.
+	It may need adjustment depending on previous/current search direction
+	and rel_pos. */
+	BTR_PCUR_IS_POSITIONED_OPTIMISTIC,
+	/** The persistent cursor is positioned by index search.
+	Or optimistic get for rel_pos == BTR_PCUR_ON. */
+	BTR_PCUR_IS_POSITIONED
+};
+
+/* The persistent B-tree cursor structure. This is used mainly for SQL
+selects, updates, and deletes. */
+
+struct btr_pcur_t
+{
+  /** Return value of restore_position() */
+  enum restore_status {
+    /** cursor position is on user rec and points on the record with
+    the same field values as in the stored record */
+    SAME_ALL,
+    /** cursor position is on user rec and points on the record with
+    the same unique field values as in the stored record */
+    SAME_UNIQ,
+    /** cursor position is not on user rec or points on a record
+    whose unique field values differ from those in the stored record */
+    NOT_SAME,
+    /** the index tree is corrupted */
+    CORRUPTED
+  };
+  /** a B-tree cursor */
+  btr_cur_t btr_cur;
+  /** @see BTR_PCUR_WAS_POSITIONED
+  BTR_SEARCH_LEAF, BTR_MODIFY_LEAF, BTR_MODIFY_TREE or BTR_NO_LATCHES,
+  depending on the latching state of the page and tree where the cursor
+  is positioned; BTR_NO_LATCHES means that the cursor is not currently
+  positioned:
+  we say then that the cursor is detached; it can be restored to
+  attached if the old position was stored in old_rec */
+  btr_latch_mode latch_mode= BTR_NO_LATCHES;
+  /** if cursor position is stored, contains an initial segment of the
+  latest record cursor was positioned either on, before or after */
+  rec_t *old_rec= nullptr;
+  /** btr_cur.index()->n_core_fields when old_rec was copied */
+  uint16 old_n_core_fields= 0;
+  /** number of fields in old_rec */
+  uint16 old_n_fields= 0;
+  /** BTR_PCUR_ON, BTR_PCUR_BEFORE, or BTR_PCUR_AFTER, depending on
+  whether cursor was on, before, or after the old_rec record */
+  btr_pcur_pos_t rel_pos= btr_pcur_pos_t(0);
+  /** buffer block when the position was stored */
+  buf::Block_hint block_when_stored;
+  /** the modify clock value of the buffer block when the cursor position
+  was stored */
+  ib_uint64_t modify_clock= 0;
+  /** btr_pcur_store_position() and restore_position() state. */
+  enum pcur_pos_t pos_state= BTR_PCUR_NOT_POSITIONED;
+  page_cur_mode_t search_mode= PAGE_CUR_UNSUPP;
+  /** the transaction, if we know it; otherwise this field is not defined;
+  can ONLY BE USED in error prints in fatal assertion failures! */
+  trx_t *trx_if_known= nullptr;
+  /** a dynamically allocated buffer for old_rec */
+  byte *old_rec_buf= nullptr;
+  /** old_rec_buf size if old_rec_buf is not NULL */
+  ulint buf_size= 0;
+
+  /** Return the index of this persistent cursor */
+  dict_index_t *index() const { return(btr_cur.index()); }
+  MY_ATTRIBUTE((nonnull, warn_unused_result))
+  /** Restores the stored position of a persistent cursor buffer-fixing
+  the page and obtaining the specified latches. If the cursor position
+  was saved when the
+  (1) cursor was positioned on a user record: this function restores the
+  position to the last record LESS OR EQUAL to the stored record;
+  (2) cursor was positioned on a page infimum record: restores the
+  position to the last record LESS than the user record which was the
+  successor of the page infimum;
+  (3) cursor was positioned on the page supremum: restores to the first
+  record GREATER than the user record which was the predecessor of the
+  supremum.
+  (4) cursor was positioned before the first or after the last in an
+  empty tree: restores to before first or after the last in the tree.
+  @param latch_mode BTR_SEARCH_LEAF, ...
+  @param mtr mini-transaction
+  @retval SAME_ALL cursor position is on user rec and points on
+  the record with the same field values as in the stored record,
+  @retval SAME_UNIQ cursor position is on user rec and points on the
+  record with the same unique field values as in the stored record,
+  @retval NOT_SAME cursor position is not on user rec or points on a
+  record whose unique field values differ from those in the stored record
+  @retval CORRUPTED if the index is corrupted */
+  restore_status restore_position(btr_latch_mode latch_mode, mtr_t *mtr);
+
+  /** Open the cursor on the first or last record.
+  @param first true=first record, false=last record
+  @param index B-tree
+  @param latch_mode which latches to acquire
+  @param mtr mini-transaction
+  @return error code */
+  dberr_t open_leaf(bool first, dict_index_t *index, btr_latch_mode latch_mode,
+                    mtr_t *mtr)
+  {
+    this->latch_mode= BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
+    search_mode= first ? PAGE_CUR_G : PAGE_CUR_L;
+    pos_state= BTR_PCUR_IS_POSITIONED;
+    old_rec= nullptr;
+
+    return btr_cur.open_leaf(first, index, this->latch_mode, mtr);
+  }
+};
+
+inline buf_block_t *btr_pcur_get_block(btr_pcur_t *cursor)
+{
+  ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+  return cursor->btr_cur.page_cur.block;
+}
+
+inline const buf_block_t *btr_pcur_get_block(const btr_pcur_t *cursor)
+{
+  ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+  return cursor->btr_cur.page_cur.block;
+}
+
+inline rec_t *btr_pcur_get_rec(const btr_pcur_t *cursor)
+{
+  ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+  ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+  return cursor->btr_cur.page_cur.rec;
+}
+
+/**************************************************************//**
+Initializes and opens a persistent cursor to an index tree. */
+inline
+dberr_t
+btr_pcur_open(
+	const dtuple_t*	tuple,	/*!< in: tuple on which search done */
+	page_cur_mode_t	mode,	/*!< in: PAGE_CUR_LE, ... */
+	btr_latch_mode	latch_mode,/*!< in: BTR_SEARCH_LEAF, ...
*/ + btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */ + mtr_t* mtr) /*!< in: mtr */ +{ + cursor->latch_mode= BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode); + cursor->search_mode= mode; + cursor->pos_state= BTR_PCUR_IS_POSITIONED; + cursor->trx_if_known= nullptr; + return cursor->btr_cur.search_leaf(tuple, mode, latch_mode, mtr); +} + +/** Open a cursor on the first user record satisfying the search condition; +in case of no match, after the last index record. */ +MY_ATTRIBUTE((nonnull, warn_unused_result)) +inline +dberr_t +btr_pcur_open_on_user_rec( + const dtuple_t* tuple, /*!< in: tuple on which search done */ + btr_latch_mode latch_mode, /*!< in: BTR_SEARCH_LEAF or + BTR_MODIFY_LEAF */ + btr_pcur_t* cursor, /*!< in: memory buffer for persistent + cursor */ + mtr_t* mtr) /*!< in: mtr */ +{ + ut_ad(latch_mode == BTR_SEARCH_LEAF || latch_mode == BTR_MODIFY_LEAF); + if (dberr_t err= + btr_pcur_open(tuple, PAGE_CUR_GE, latch_mode, cursor, mtr)) + return err; + if (!btr_pcur_is_after_last_on_page(cursor) || + btr_pcur_is_after_last_in_tree(cursor)) + return DB_SUCCESS; + if (dberr_t err= btr_pcur_move_to_next_page(cursor, mtr)) + return err; + return btr_pcur_move_to_next_on_page(cursor) ? DB_SUCCESS : DB_CORRUPTION; +} + +#include "btr0pcur.inl" diff --git a/storage/innobase/include/btr0pcur.inl b/storage/innobase/include/btr0pcur.inl new file mode 100644 index 00000000..b827d70d --- /dev/null +++ b/storage/innobase/include/btr0pcur.inl @@ -0,0 +1,372 @@ +/***************************************************************************** + +Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2015, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/btr0pcur.ic +The index tree persistent cursor + +Created 2/23/1996 Heikki Tuuri +*******************************************************/ + + +/*********************************************************//** +Gets the rel_pos field for a cursor whose position has been stored. +@return BTR_PCUR_ON, ... */ +UNIV_INLINE +ulint +btr_pcur_get_rel_pos( +/*=================*/ + const btr_pcur_t* cursor) /*!< in: persistent cursor */ +{ + ut_ad(cursor); + ut_ad(cursor->old_rec); + ut_ad(cursor->pos_state == BTR_PCUR_WAS_POSITIONED + || cursor->pos_state == BTR_PCUR_IS_POSITIONED); + + return(cursor->rel_pos); +} + +/**************************************************************//** +Gets the up_match value for a pcur after a search. 
+@return number of matched fields at the cursor or to the right if
+search mode was PAGE_CUR_GE, otherwise undefined */
+UNIV_INLINE
+ulint
+btr_pcur_get_up_match(
+/*==================*/
+	const btr_pcur_t*	cursor) /*!< in: persistent cursor */
+{
+	const btr_cur_t*	btr_cursor;
+
+	ut_ad((cursor->pos_state == BTR_PCUR_WAS_POSITIONED)
+	      || (cursor->pos_state == BTR_PCUR_IS_POSITIONED));
+
+	btr_cursor = btr_pcur_get_btr_cur(cursor);
+
+	ut_ad(btr_cursor->up_match != ULINT_UNDEFINED);
+
+	return(btr_cursor->up_match);
+}
+
+/**************************************************************//**
+Gets the low_match value for a pcur after a search.
+@return number of matched fields at the cursor or to the left if
+search mode was PAGE_CUR_LE, otherwise undefined */
+UNIV_INLINE
+ulint
+btr_pcur_get_low_match(
+/*===================*/
+	const btr_pcur_t*	cursor) /*!< in: persistent cursor */
+{
+	const btr_cur_t*	btr_cursor;
+
+	ut_ad((cursor->pos_state == BTR_PCUR_WAS_POSITIONED)
+	      || (cursor->pos_state == BTR_PCUR_IS_POSITIONED));
+
+	btr_cursor = btr_pcur_get_btr_cur(cursor);
+	ut_ad(btr_cursor->low_match != ULINT_UNDEFINED);
+
+	return(btr_cursor->low_match);
+}
+
+/*********************************************************//**
+Checks if the persistent cursor is after the last user record on
+a page. */
+UNIV_INLINE
+ibool
+btr_pcur_is_after_last_on_page(
+/*===========================*/
+	const btr_pcur_t*	cursor) /*!< in: persistent cursor */
+{
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+	return(page_cur_is_after_last(btr_pcur_get_page_cur(cursor)));
+}
+
+/*********************************************************//**
+Checks if the persistent cursor is before the first user record on
+a page. */
+UNIV_INLINE
+ibool
+btr_pcur_is_before_first_on_page(
+/*=============================*/
+	const btr_pcur_t*	cursor) /*!< in: persistent cursor */
+{
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+	return(page_cur_is_before_first(btr_pcur_get_page_cur(cursor)));
+}
+
+/*********************************************************//**
+Checks if the persistent cursor is on a user record. */
+UNIV_INLINE
+ibool
+btr_pcur_is_on_user_rec(
+/*====================*/
+	const btr_pcur_t*	cursor) /*!< in: persistent cursor */
+{
+	return !btr_pcur_is_before_first_on_page(cursor) &&
+	  !btr_pcur_is_after_last_on_page(cursor);
+}
+
+/*********************************************************//**
+Checks if the persistent cursor is before the first user record in
+the index tree. */
+static inline bool btr_pcur_is_before_first_in_tree(btr_pcur_t* cursor)
+{
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+	return !page_has_prev(btr_pcur_get_page(cursor))
+	  && page_cur_is_before_first(btr_pcur_get_page_cur(cursor));
+}
+
+/*********************************************************//**
+Checks if the persistent cursor is after the last user record in
+the index tree. */
+static inline bool btr_pcur_is_after_last_in_tree(btr_pcur_t* cursor)
+{
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+	return !page_has_next(btr_pcur_get_page(cursor))
+	  && page_cur_is_after_last(btr_pcur_get_page_cur(cursor));
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the next record on the same page. */
+UNIV_INLINE
+rec_t*
+btr_pcur_move_to_next_on_page(
+/*==========================*/
+	btr_pcur_t*	cursor)	/*!< in/out: persistent cursor */
+{
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+	cursor->old_rec = nullptr;
+	return page_cur_move_to_next(btr_pcur_get_page_cur(cursor));
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the previous record on the same page. */
+UNIV_INLINE
+rec_t*
+btr_pcur_move_to_prev_on_page(
+/*==========================*/
+	btr_pcur_t*	cursor)	/*!< in/out: persistent cursor */
+{
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+	cursor->old_rec = nullptr;
+
+	return page_cur_move_to_prev(btr_pcur_get_page_cur(cursor));
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the next user record in the tree. If no user
+records are left, the cursor ends up 'after last in tree'.
+@return TRUE if the cursor moved forward, ending on a user record */
+UNIV_INLINE
+ibool
+btr_pcur_move_to_next_user_rec(
+/*===========================*/
+	btr_pcur_t*	cursor,	/*!< in: persistent cursor; NOTE that the
+				function may release the page latch */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+	cursor->old_rec = nullptr;
+loop:
+	if (btr_pcur_is_after_last_on_page(cursor)) {
+		if (btr_pcur_is_after_last_in_tree(cursor)
+		    || btr_pcur_move_to_next_page(cursor, mtr) != DB_SUCCESS) {
+			return(FALSE);
+		}
+	} else if (UNIV_UNLIKELY(!btr_pcur_move_to_next_on_page(cursor))) {
+		return false;
+	}
+
+	if (btr_pcur_is_on_user_rec(cursor)) {
+
+		return(TRUE);
+	}
+
+	goto loop;
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the next record in the tree. If no records are
+left, the cursor stays 'after last in tree'.
+@return TRUE if the cursor was not after last in tree */
+UNIV_INLINE
+ibool
+btr_pcur_move_to_next(
+/*==================*/
+	btr_pcur_t*	cursor,	/*!< in: persistent cursor; NOTE that the
+				function may release the page latch */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+  ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+  ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+  cursor->old_rec= nullptr;
+
+  if (btr_pcur_is_after_last_on_page(cursor))
+    return !btr_pcur_is_after_last_in_tree(cursor) &&
+      btr_pcur_move_to_next_page(cursor, mtr) == DB_SUCCESS;
+  else
+    return !!btr_pcur_move_to_next_on_page(cursor);
+}
+
+/**************************************************************//**
+Commits the mtr and sets the pcur latch mode to BTR_NO_LATCHES,
+that is, the cursor becomes detached.
+Function btr_pcur_store_position should be used before calling this,
+if restoration of the cursor is wanted later. */
+UNIV_INLINE
+void
+btr_pcur_commit_specify_mtr(
+/*========================*/
+	btr_pcur_t*	pcur,	/*!< in: persistent cursor */
+	mtr_t*		mtr)	/*!< in: mtr to commit */
+{
+	ut_ad(pcur->pos_state == BTR_PCUR_IS_POSITIONED);
+
+	pcur->latch_mode = BTR_NO_LATCHES;
+
+	mtr_commit(mtr);
+
+	pcur->pos_state = BTR_PCUR_WAS_POSITIONED;
+}
+
+/** Commits the mtr and sets the clustered index pcur and secondary index
+pcur latch mode to BTR_NO_LATCHES, that is, the cursor becomes detached.
+Function btr_pcur_store_position should be used for both cursors before
+calling this, if restoration of the cursors is wanted later.
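+
+A minimal usage sketch (both positions are stored first so that the
+cursors can be restored in a later mini-transaction):
+@code
+	btr_pcur_store_position(pcur, mtr);
+	btr_pcur_store_position(sec_pcur, mtr);
+	btr_pcurs_commit_specify_mtr(pcur, sec_pcur, mtr);
+	// ... later:
+	mtr->start();
+	pcur->restore_position(BTR_SEARCH_LEAF, mtr);
+@endcode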
+
+/** Commits the mtr and sets the clustered index pcur and secondary index
+pcur latch mode to BTR_NO_LATCHES, that is, the cursors become detached.
+Function btr_pcur_store_position should be used for both cursors before
+calling this, if restoration of the cursors is wanted later.
+@param[in]	pcur		persistent cursor
+@param[in]	sec_pcur	secondary index persistent cursor
+@param[in]	mtr		mtr to commit */
+UNIV_INLINE
+void
+btr_pcurs_commit_specify_mtr(
+	btr_pcur_t*	pcur,
+	btr_pcur_t*	sec_pcur,
+	mtr_t*		mtr)
+{
+	ut_ad(pcur->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(sec_pcur->pos_state == BTR_PCUR_IS_POSITIONED);
+
+	pcur->latch_mode = BTR_NO_LATCHES;
+	sec_pcur->latch_mode = BTR_NO_LATCHES;
+
+	mtr_commit(mtr);
+
+	pcur->pos_state = BTR_PCUR_WAS_POSITIONED;
+	sec_pcur->pos_state = BTR_PCUR_WAS_POSITIONED;
+}
+
+/**************************************************************//**
+Sets the old_rec_buf, old_rec and rtr_info fields to NULL. */
+UNIV_INLINE
+void
+btr_pcur_init(
+/*==========*/
+	btr_pcur_t*	pcur)	/*!< in: persistent cursor */
+{
+	pcur->old_rec_buf = NULL;
+	pcur->old_rec = NULL;
+
+	pcur->btr_cur.rtr_info = NULL;
+}
+
+/** Opens a persistent cursor to an index tree without initializing the
+cursor.
+@param tuple      tuple on which the search is done
+@param mode       search mode; NOTE that if the search is made using a
+                  unique prefix of a record, mode should be PAGE_CUR_LE, not
+                  PAGE_CUR_GE, as the latter may end up on the previous page of
+                  the record!
+@param latch_mode BTR_SEARCH_LEAF, ...
+@param cursor     memory buffer for persistent cursor
+@param mtr        mini-transaction
+@return DB_SUCCESS on success or error code otherwise. */
+inline
+dberr_t btr_pcur_open_with_no_init(const dtuple_t *tuple, page_cur_mode_t mode,
+                                   btr_latch_mode latch_mode,
+                                   btr_pcur_t *cursor, mtr_t *mtr)
+{
+  cursor->latch_mode= BTR_LATCH_MODE_WITHOUT_INTENTION(latch_mode);
+  cursor->search_mode= mode;
+  cursor->pos_state= BTR_PCUR_IS_POSITIONED;
+  cursor->trx_if_known= nullptr;
+  return cursor->btr_cur.search_leaf(tuple, mode, latch_mode, mtr);
+}
+
+/**************************************************************//**
+Frees the possible memory heap of a persistent cursor and sets the latch
+mode of the persistent cursor to BTR_NO_LATCHES.
+WARNING: this function does not release the latch on the page where the
+cursor is currently positioned. The latch is acquired by the
+"move to next/previous" family of functions. Since recursive shared locks
+are not allowed, you must take care (if using the cursor in S-mode) to
+manually release the latch by either calling
+btr_leaf_page_release(btr_pcur_get_block(&pcur), pcur.latch_mode, mtr)
+or by mtr_t::commit(). */
+UNIV_INLINE
+void
+btr_pcur_close(
+/*===========*/
+	btr_pcur_t*	cursor)	/*!< in: persistent cursor */
+{
+	ut_free(cursor->old_rec_buf);
+
+	if (cursor->btr_cur.rtr_info)
+		rtr_clean_rtr_info(cursor->btr_cur.rtr_info, true);
+
+	cursor->btr_cur.rtr_info= nullptr;
+	cursor->old_rec = nullptr;
+	cursor->old_rec_buf = nullptr;
+	cursor->btr_cur.page_cur.rec = nullptr;
+	cursor->btr_cur.page_cur.block = nullptr;
+
+	cursor->latch_mode = BTR_NO_LATCHES;
+	cursor->pos_state = BTR_PCUR_NOT_POSITIONED;
+
+	cursor->trx_if_known = nullptr;
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the infimum record on the same page.
*/ +UNIV_INLINE +void +btr_pcur_move_before_first_on_page( +/*===============================*/ + btr_pcur_t* cursor) /*!< in/out: persistent cursor */ +{ + ut_ad(cursor->latch_mode != BTR_NO_LATCHES); + + page_cur_set_before_first(btr_pcur_get_block(cursor), + btr_pcur_get_page_cur(cursor)); + + cursor->old_rec = nullptr; +} diff --git a/storage/innobase/include/btr0sea.h b/storage/innobase/include/btr0sea.h new file mode 100644 index 00000000..b75cad10 --- /dev/null +++ b/storage/innobase/include/btr0sea.h @@ -0,0 +1,403 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/btr0sea.h +The index tree adaptive search + +Created 2/17/1996 Heikki Tuuri +*************************************************************************/ + +#ifndef btr0sea_h +#define btr0sea_h + +#include "dict0dict.h" +#ifdef BTR_CUR_HASH_ADAPT +#include "ha0ha.h" +#include "srw_lock.h" + +#ifdef UNIV_PFS_RWLOCK +extern mysql_pfs_key_t btr_search_latch_key; +#endif /* UNIV_PFS_RWLOCK */ + +#define btr_search_sys_create() btr_search_sys.create() +#define btr_search_sys_free() btr_search_sys.free() + +/** Disable the adaptive hash search system and empty the index. */ +void btr_search_disable(); + +/** Enable the adaptive hash search system. +@param resize whether buf_pool_t::resize() is the caller */ +void btr_search_enable(bool resize= false); + +/*********************************************************************//** +Updates the search info. */ +UNIV_INLINE +void +btr_search_info_update( +/*===================*/ + dict_index_t* index, /*!< in: index of the cursor */ + btr_cur_t* cursor);/*!< in: cursor which was just positioned */ + +/** Tries to guess the right search position based on the hash search info +of the index. Note that if mode is PAGE_CUR_LE, which is used in inserts, +and the function returns TRUE, then cursor->up_match and cursor->low_match +both have sensible values. +@param[in,out] index index +@param[in,out] info index search info +@param[in] tuple logical record +@param[in] mode PAGE_CUR_L, .... +@param[in] latch_mode BTR_SEARCH_LEAF, ... +@param[out] cursor tree cursor +@param[in] mtr mini-transaction +@return whether the search succeeded */ +bool +btr_search_guess_on_hash( + dict_index_t* index, + btr_search_t* info, + const dtuple_t* tuple, + ulint mode, + ulint latch_mode, + btr_cur_t* cursor, + mtr_t* mtr); + +/** Move or delete hash entries for moved records, usually in a page split. +If new_block is already hashed, then any hash index for block is dropped. 
+If new_block is not hashed, and block is hashed, then a new hash index is +built to new_block with the same parameters as block. +@param[in,out] new_block destination page +@param[in,out] block source page (subject to deletion later) */ +void +btr_search_move_or_delete_hash_entries( + buf_block_t* new_block, + buf_block_t* block); + +/** Drop any adaptive hash index entries that point to an index page. +@param[in,out] block block containing index page, s- or x-latched, or an + index page for which we know that + block->buf_fix_count == 0 or it is an index page which + has already been removed from the buf_pool.page_hash + i.e.: it is in state BUF_BLOCK_REMOVE_HASH +@param[in] garbage_collect drop ahi only if the index is marked + as freed */ +void btr_search_drop_page_hash_index(buf_block_t* block, + bool garbage_collect); + +/** Drop possible adaptive hash index entries when a page is evicted +from the buffer pool or freed in a file, or the index is being dropped. +@param[in] page_id page id */ +void btr_search_drop_page_hash_when_freed(const page_id_t page_id); + +/** Updates the page hash index when a single record is inserted on a page. +@param[in] cursor cursor which was positioned to the place to insert + using btr_cur_search_, and the new record has been + inserted next to the cursor. +@param[in] ahi_latch the adaptive hash index latch */ +void btr_search_update_hash_node_on_insert(btr_cur_t *cursor, + srw_spin_lock *ahi_latch); + +/** Updates the page hash index when a single record is inserted on a page. +@param[in,out] cursor cursor which was positioned to the + place to insert using btr_cur_search_..., + and the new record has been inserted next + to the cursor +@param[in] ahi_latch the adaptive hash index latch */ +void btr_search_update_hash_on_insert(btr_cur_t *cursor, + srw_spin_lock *ahi_latch); + +/** Updates the page hash index when a single record is deleted from a page. +@param[in] cursor cursor which was positioned on the record to delete + using btr_cur_search_, the record is not yet deleted.*/ +void btr_search_update_hash_on_delete(btr_cur_t *cursor); + +/** Validates the search system. +@param thd connection, for checking if CHECK TABLE has been killed +@return true if ok */ +bool btr_search_validate(THD *thd); + +/** Lock all search latches in exclusive mode. */ +static inline void btr_search_x_lock_all(); + +/** Unlock all search latches from exclusive mode. */ +static inline void btr_search_x_unlock_all(); + +/** Lock all search latches in shared mode. */ +static inline void btr_search_s_lock_all(); + +/** Unlock all search latches from shared mode. */ +static inline void btr_search_s_unlock_all(); + +# ifdef UNIV_DEBUG +/** @return if the index is marked as freed */ +bool btr_search_check_marked_free_index(const buf_block_t *block); +# endif /* UNIV_DEBUG */ +#else /* BTR_CUR_HASH_ADAPT */ +# define btr_search_sys_create() +# define btr_search_sys_free() +# define btr_search_drop_page_hash_index(block, garbage_collect) +# define btr_search_s_lock_all(index) +# define btr_search_s_unlock_all(index) +# define btr_search_info_update(index, cursor) +# define btr_search_move_or_delete_hash_entries(new_block, block) +# define btr_search_update_hash_on_insert(cursor, ahi_latch) +# define btr_search_update_hash_on_delete(cursor) +# ifdef UNIV_DEBUG +# define btr_search_check_marked_free_index(block) +# endif /* UNIV_DEBUG */ +#endif /* BTR_CUR_HASH_ADAPT */ + +#ifdef BTR_CUR_ADAPT +/** Create and initialize search info. 
+@param[in,out]	heap		heap where created
+@return own: search info struct */
+static inline btr_search_t* btr_search_info_create(mem_heap_t* heap)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** @return the search info of an index */
+static inline btr_search_t* btr_search_get_info(dict_index_t* index)
+{
+	return(index->search_info);
+}
+#endif /* BTR_CUR_ADAPT */
+
+/** The search info struct in an index */
+struct btr_search_t{
+	/* @{ The following fields are not protected by any latch.
+	Unfortunately, this means that they must be aligned to
+	the machine word, i.e., they cannot be turned into bit-fields. */
+	buf_block_t* root_guess;/*!< the root page frame when it was last time
+				fetched, or NULL */
+#ifdef BTR_CUR_HASH_ADAPT
+	ulint	hash_analysis;	/*!< when this exceeds
+				BTR_SEARCH_HASH_ANALYSIS, the hash
+				analysis starts; this is reset if no
+				success noticed */
+	ibool	last_hash_succ;	/*!< TRUE if the last search would have
+				succeeded, or did succeed, using the hash
+				index; NOTE that the value here is not exact:
+				it is not calculated for every search, and the
+				calculation itself is not always accurate! */
+	ulint	n_hash_potential;
+				/*!< number of consecutive searches
+				which would have succeeded, or did succeed,
+				using the hash index;
+				the range is 0 .. BTR_SEARCH_BUILD_LIMIT + 5 */
+	/* @} */
+	ulint	ref_count;	/*!< Number of blocks in this index tree
+				that have search index built
+				i.e. block->index points to this index.
+				Protected by search latch except
+				when during initialization in
+				btr_search_info_create(). */
+
+	/*---------------------- @{ */
+	uint16_t n_fields;	/*!< recommended prefix length for hash search:
+				number of full fields */
+	uint16_t n_bytes;	/*!< recommended prefix: number of bytes in
+				an incomplete field
+				@see BTR_PAGE_MAX_REC_SIZE */
+	bool	left_side;	/*!< true or false, depending on whether
+				the leftmost record of several records with
+				the same prefix should be indexed in the
+				hash index */
+	/*---------------------- @} */
+#ifdef UNIV_SEARCH_PERF_STAT
+	ulint	n_hash_succ;	/*!< number of successful hash searches thus
+				far */
+	ulint	n_hash_fail;	/*!< number of failed hash searches */
+	ulint	n_patt_succ;	/*!< number of successful pattern searches thus
+				far */
+	ulint	n_searches;	/*!< number of searches */
+#endif /* UNIV_SEARCH_PERF_STAT */
+#endif /* BTR_CUR_HASH_ADAPT */
+#ifdef UNIV_DEBUG
+	ulint	magic_n;	/*!< magic number @see BTR_SEARCH_MAGIC_N */
+/** value of btr_search_t::magic_n, used in assertions */
+# define BTR_SEARCH_MAGIC_N	1112765
+#endif /* UNIV_DEBUG */
+};
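+
+/* An illustrative sketch (not upstream code) of how the recommendation in
+btr_search_t is consumed: the adaptive hash index hashes only the recommended
+record prefix. It assumes the historical dtuple_fold() interface
+(tuple, n_fields, n_bytes, index_id) that the partition comment below refers
+to:
+@code
+	const btr_search_t*	info = btr_search_get_info(index);
+	// fold info->n_fields complete fields plus info->n_bytes bytes
+	// of the next field of the search tuple
+	ulint	fold = dtuple_fold(tuple, info->n_fields, info->n_bytes,
+				   index->id);
+	// 'fold' selects a cell in one btr_search_sys_t partition (below)
+@endcode */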
+
+#ifdef BTR_CUR_HASH_ADAPT
+/** The hash index system */
+struct btr_search_sys_t
+{
+  /** Partition of the hash table */
+  struct partition
+  {
+    /** latch protecting table */
+    srw_spin_lock latch;
+    /** mapping of dtuple_fold() to rec_t* in buf_block_t::frame */
+    hash_table_t table;
+    /** memory heap for table */
+    mem_heap_t *heap;
+
+#ifdef _MSC_VER
+#pragma warning(push)
+// nonstandard extension - zero sized array, if perfschema is not compiled
+#pragma warning(disable : 4200)
+#endif
+
+    char pad[(CPU_LEVEL1_DCACHE_LINESIZE - sizeof latch -
+              sizeof table - sizeof heap) &
+             (CPU_LEVEL1_DCACHE_LINESIZE - 1)];
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+    void init()
+    {
+      memset((void*) this, 0, sizeof *this);
+      latch.SRW_LOCK_INIT(btr_search_latch_key);
+    }
+
+    void alloc(ulint hash_size)
+    {
+      table.create(hash_size);
+      heap= mem_heap_create_typed(std::min<ulong>(4096,
+                                                  MEM_MAX_ALLOC_IN_BUF / 2
+                                                  - MEM_BLOCK_HEADER_SIZE
+                                                  - MEM_SPACE_NEEDED(0)),
+                                  MEM_HEAP_FOR_BTR_SEARCH);
+    }
+
+    void clear()
+    {
+      mem_heap_free(heap);
+      heap= nullptr;
+      ut_free(table.array);
+    }
+
+    void free()
+    {
+      latch.destroy();
+      if (heap)
+        clear();
+    }
+  };
+
+  /** Partitions of the adaptive hash index */
+  partition *parts;
+
+  /** Get an adaptive hash index partition */
+  partition *get_part(index_id_t id, ulint space_id) const
+  {
+    return parts + ut_fold_ulint_pair(ulint(id), space_id) % btr_ahi_parts;
+  }
+
+  /** Get an adaptive hash index partition */
+  partition *get_part(const dict_index_t &index) const
+  {
+    ut_ad(!index.table->space ||
+          index.table->space->id == index.table->space_id);
+    return get_part(ulint(index.id), index.table->space_id);
+  }
+
+  /** Get the search latch for the adaptive hash index partition */
+  srw_spin_lock *get_latch(const dict_index_t &index) const
+  { return &get_part(index)->latch; }
+
+  /** Create and initialize at startup */
+  void create()
+  {
+    parts= static_cast<partition*>(ut_malloc(btr_ahi_parts * sizeof *parts,
+                                             mem_key_ahi));
+    for (ulong i= 0; i < btr_ahi_parts; ++i)
+      parts[i].init();
+    if (btr_search_enabled)
+      btr_search_enable();
+  }
+
+  void alloc(ulint hash_size)
+  {
+    hash_size/= btr_ahi_parts;
+    for (ulong i= 0; i < btr_ahi_parts; ++i)
+      parts[i].alloc(hash_size);
+  }
+
+  /** Clear when disabling the adaptive hash index */
+  void clear() { for (ulong i= 0; i < btr_ahi_parts; ++i) parts[i].clear(); }
+
+  /** Free at shutdown */
+  void free()
+  {
+    if (parts)
+    {
+      for (ulong i= 0; i < btr_ahi_parts; ++i)
+        parts[i].free();
+      ut_free(parts);
+      parts= nullptr;
+    }
+  }
+};
+
+/** The adaptive hash index */
+extern btr_search_sys_t btr_search_sys;
+
+/** @return number of leaf pages pointed to by the adaptive hash index */
+TRANSACTIONAL_INLINE inline ulint dict_index_t::n_ahi_pages() const
+{
+  if (!btr_search_enabled)
+    return 0;
+  srw_spin_lock *latch= &btr_search_sys.get_part(*this)->latch;
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+  if (xbegin())
+  {
+    if (latch->is_locked())
+      xabort();
+    ulint ref_count= search_info->ref_count;
+    xend();
+    return ref_count;
+  }
+#endif
+  latch->rd_lock(SRW_LOCK_CALL);
+  ulint ref_count= search_info->ref_count;
+  latch->rd_unlock();
+  return ref_count;
+}
+
+#ifdef UNIV_SEARCH_PERF_STAT
+/** Number of successful adaptive hash index lookups */
+extern ulint btr_search_n_succ;
+/** Number of failed adaptive hash index lookups */
+extern ulint btr_search_n_hash_fail;
+#endif /* UNIV_SEARCH_PERF_STAT */
+
+/** After change in n_fields or n_bytes in info, this many rounds are waited
+before starting the hash analysis again: this is to save CPU time when there
+is no hope in building a hash index. */
+#define BTR_SEARCH_HASH_ANALYSIS	17
+
+/** Limit of consecutive searches for trying a search shortcut on the search
+pattern */
+#define BTR_SEARCH_ON_PATTERN_LIMIT	3
+
+/** Limit of consecutive searches for trying a search shortcut using
+the hash index */
+#define BTR_SEARCH_ON_HASH_LIMIT	3
+
+/** We do this many searches before trying to keep the search latch
+over calls from MySQL. If we notice someone waiting for the latch, we
+again set this much timeout. This is to reduce contention. */
+#define BTR_SEA_TIMEOUT			10000
+#endif /* BTR_CUR_HASH_ADAPT */
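+
+/* An illustrative sketch (not upstream code) of how a lookup picks its
+adaptive hash index partition and latch via btr_search_sys_t above; each
+(index id, space id) pair is folded onto one of the btr_ahi_parts
+partitions, so unrelated indexes contend on different latches:
+@code
+	btr_search_sys_t::partition*	part= btr_search_sys.get_part(*index);
+	part->latch.rd_lock(SRW_LOCK_CALL);
+	// ... probe part->table with the fold value of the search tuple ...
+	part->latch.rd_unlock();
+@endcode */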
+
+#include "btr0sea.inl"
+
+#endif
diff --git a/storage/innobase/include/btr0sea.inl b/storage/innobase/include/btr0sea.inl
new file mode 100644
index 00000000..5a8d6480
--- /dev/null
+++ b/storage/innobase/include/btr0sea.inl
@@ -0,0 +1,117 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/btr0sea.ic
+The index tree adaptive search
+
+Created 2/17/1996 Heikki Tuuri
+*************************************************************************/
+
+#include "dict0mem.h"
+#include "btr0cur.h"
+#include "buf0buf.h"
+
+/** Create and initialize search info.
+@param[in,out]	heap		heap where created
+@return own: search info struct */
+static inline btr_search_t* btr_search_info_create(mem_heap_t* heap)
+{
+	btr_search_t*	info = static_cast<btr_search_t*>(
+		mem_heap_zalloc(heap, sizeof(btr_search_t)));
+	ut_d(info->magic_n = BTR_SEARCH_MAGIC_N);
+#ifdef BTR_CUR_HASH_ADAPT
+	info->n_fields = 1;
+	info->left_side = TRUE;
+#endif /* BTR_CUR_HASH_ADAPT */
+	return(info);
+}
+
+#ifdef BTR_CUR_HASH_ADAPT
+/** Updates the search info.
+@param[in,out]	info	search info
+@param[in,out]	cursor	cursor which was just positioned */
+void btr_search_info_update_slow(btr_search_t *info, btr_cur_t *cursor);
+
+/*********************************************************************//**
+Updates the search info. */
+static inline
+void
+btr_search_info_update(
+/*===================*/
+	dict_index_t*	index,	/*!< in: index of the cursor */
+	btr_cur_t*	cursor)	/*!< in: cursor which was just positioned */
+{
+	ut_ad(!index->is_spatial());
+	ut_ad(!index->table->is_temporary());
+
+	if (!btr_search_enabled) {
+		return;
+	}
+
+	btr_search_t*	info;
+	info = btr_search_get_info(index);
+
+	info->hash_analysis++;
+
+	if (info->hash_analysis < BTR_SEARCH_HASH_ANALYSIS) {
+
+		/* Do nothing */
+
+		return;
+
+	}
+
+	ut_ad(cursor->flag != BTR_CUR_HASH);
+
+	btr_search_info_update_slow(info, cursor);
+}
+
+/** Lock all search latches in exclusive mode. */
+static inline void btr_search_x_lock_all()
+{
+	for (ulint i = 0; i < btr_ahi_parts; ++i) {
+		btr_search_sys.parts[i].latch.wr_lock(SRW_LOCK_CALL);
+	}
+}
+
+/** Unlock all search latches from exclusive mode. */
+static inline void btr_search_x_unlock_all()
+{
+	for (ulint i = 0; i < btr_ahi_parts; ++i) {
+		btr_search_sys.parts[i].latch.wr_unlock();
+	}
+}
+
+/** Lock all search latches in shared mode. */
+static inline void btr_search_s_lock_all()
+{
+	for (ulint i = 0; i < btr_ahi_parts; ++i) {
+		btr_search_sys.parts[i].latch.rd_lock(SRW_LOCK_CALL);
+	}
+}
+
+/** Unlock all search latches from shared mode. */
+static inline void btr_search_s_unlock_all()
+{
+	for (ulint i = 0; i < btr_ahi_parts; ++i) {
+		btr_search_sys.parts[i].latch.rd_unlock();
+	}
+}
+#endif /* BTR_CUR_HASH_ADAPT */
diff --git a/storage/innobase/include/btr0types.h b/storage/innobase/include/btr0types.h
new file mode 100644
index 00000000..fc829e78
--- /dev/null
+++ b/storage/innobase/include/btr0types.h
@@ -0,0 +1,154 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/btr0types.h
+The index tree general types
+
+Created 2/17/1996 Heikki Tuuri
+*************************************************************************/
+
+#pragma once
+
+#include "page0types.h"
+#include "rem0types.h"
+
+/** Persistent cursor */
+struct btr_pcur_t;
+/** B-tree cursor */
+struct btr_cur_t;
+/** B-tree search information for the adaptive hash index */
+struct btr_search_t;
+
+#ifdef BTR_CUR_HASH_ADAPT
+/** Whether the adaptive hash index search system is enabled.
+The search system is protected by an array of latches. */
+extern char btr_search_enabled;
+
+/** Number of adaptive hash index partitions. */
+extern ulong btr_ahi_parts;
+#endif /* BTR_CUR_HASH_ADAPT */
+
+/** The size of a reference to data stored on a different page.
+The reference is stored at the end of the prefix of the field
+in the index record. */
+#define FIELD_REF_SIZE			20U
+#define BTR_EXTERN_FIELD_REF_SIZE	FIELD_REF_SIZE
+
+/** If the data do not exceed this size, they are stored locally. */
+#define BTR_EXTERN_LOCAL_STORED_MAX_SIZE	\
+	(BTR_EXTERN_FIELD_REF_SIZE * 2)
+
+/** Latching modes for btr_cur_t::search_leaf(). */
+enum btr_latch_mode {
+	/** Search a record on a leaf page and S-latch it. */
+	BTR_SEARCH_LEAF = RW_S_LATCH,
+	/** (Prepare to) modify a record on a leaf page and X-latch it. */
+	BTR_MODIFY_LEAF = RW_X_LATCH,
+	/** U-latch root and X-latch a leaf page */
+	BTR_MODIFY_ROOT_AND_LEAF = RW_SX_LATCH,
+	/** Obtain no latches. */
+	BTR_NO_LATCHES = RW_NO_LATCH,
+	/** Search the previous record.
+	Used in btr_pcur_move_backward_from_page(). */
+	BTR_SEARCH_PREV = 4 | BTR_SEARCH_LEAF,
+	/** Modify the previous record.
+	Used in btr_pcur_move_backward_from_page() and ibuf_insert(). */
+	BTR_MODIFY_PREV = 4 | BTR_MODIFY_LEAF,
+	/** Start modifying the entire B-tree. */
+	BTR_MODIFY_TREE = 8 | BTR_MODIFY_LEAF,
+	/** Continue modifying the entire R-tree.
+	Only used by rtr_search_to_nth_level(). */
+	BTR_CONT_MODIFY_TREE = 4 | BTR_MODIFY_TREE,
+
+	/* BTR_INSERT, BTR_DELETE and BTR_DELETE_MARK are mutually
+	exclusive. */
+	/** The search tuple will be inserted to the secondary index
+	at the searched position. When the leaf page is not in the
+	buffer pool, try to use the change buffer. */
+	BTR_INSERT = 64,
+
+	/** Try to delete mark a secondary index leaf page record at
+	the searched position using the change buffer when the page is
+	not in the buffer pool. */
+	BTR_DELETE_MARK = 128,
+
+	/** Try to purge the record using the change buffer when the
+	secondary index leaf page is not in the buffer pool. */
+	BTR_DELETE = BTR_INSERT | BTR_DELETE_MARK,
+
+	/** The caller is already holding dict_index_t::lock S-latch. */
+	BTR_ALREADY_S_LATCHED = 256,
+	/** Search and S-latch a leaf page, assuming that the
+	dict_index_t::lock S-latch is being held. */
+	BTR_SEARCH_LEAF_ALREADY_S_LATCHED = BTR_SEARCH_LEAF
+	| BTR_ALREADY_S_LATCHED,
+	/** Search and X-latch a leaf page, assuming that the
+	dict_index_t::lock is being held in non-exclusive mode. */
+	BTR_MODIFY_LEAF_ALREADY_LATCHED = BTR_MODIFY_LEAF
+	| BTR_ALREADY_S_LATCHED,
+	/** Attempt to modify records in an x-latched tree. */
+	BTR_MODIFY_TREE_ALREADY_LATCHED = BTR_MODIFY_TREE
+	| BTR_ALREADY_S_LATCHED,
+	/** U-latch root and X-latch a leaf page, assuming that
+	dict_index_t::lock is being held in U mode. */
+	BTR_MODIFY_ROOT_AND_LEAF_ALREADY_LATCHED = BTR_MODIFY_ROOT_AND_LEAF
+	| BTR_ALREADY_S_LATCHED,
+
+	/** Attempt to delete-mark a secondary index record. */
+	BTR_DELETE_MARK_LEAF = BTR_MODIFY_LEAF | BTR_DELETE_MARK,
+	/** Attempt to delete-mark a secondary index record
+	while holding the dict_index_t::lock S-latch. */
+	BTR_DELETE_MARK_LEAF_ALREADY_S_LATCHED = BTR_DELETE_MARK_LEAF
+	| BTR_ALREADY_S_LATCHED,
+	/** Attempt to purge a secondary index record. */
+	BTR_PURGE_LEAF = BTR_MODIFY_LEAF | BTR_DELETE,
+	/** Attempt to purge a secondary index record
+	while holding the dict_index_t::lock S-latch. */
+	BTR_PURGE_LEAF_ALREADY_S_LATCHED = BTR_PURGE_LEAF
+	| BTR_ALREADY_S_LATCHED,
+
+	/** In the case of BTR_MODIFY_TREE, the caller specifies
+	the intention to delete record only. It is used to optimize
+	block->lock range.*/
+	BTR_LATCH_FOR_DELETE = 512,
+
+	/** In the case of BTR_MODIFY_TREE, the caller specifies
+	the intention to insert record only. It is used to optimize
+	block->lock range.*/
+	BTR_LATCH_FOR_INSERT = 1024,
+
+	/** Attempt to delete a record in the tree. */
+	BTR_PURGE_TREE = BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE,
+	/** Attempt to delete a record in an x-latched tree. */
+	BTR_PURGE_TREE_ALREADY_LATCHED = BTR_PURGE_TREE
+	| BTR_ALREADY_S_LATCHED,
+
+	/** Attempt to insert a record into the tree. */
+	BTR_INSERT_TREE = BTR_MODIFY_TREE | BTR_LATCH_FOR_INSERT,
+
+	/** This flag ORed to BTR_INSERT says that we can ignore possible
+	UNIQUE definition on secondary indexes when we decide if we can use
+	the insert buffer to speed up inserts */
+	BTR_IGNORE_SEC_UNIQUE = 2048,
+	/** Rollback in spatial index */
+	BTR_RTREE_UNDO_INS = 4096,
+	/** Try to delete mark a spatial index record */
+	BTR_RTREE_DELETE_MARK = 8192
+};
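+
+/* A worked example (illustrative only) of how these latch-mode bits compose:
+@code
+	BTR_PURGE_LEAF == BTR_MODIFY_LEAF | BTR_DELETE
+	               == RW_X_LATCH | BTR_INSERT | BTR_DELETE_MARK
+@endcode
+The low bits carry the base latching mode, and helpers such as
+BTR_LATCH_MODE_WITHOUT_INTENTION() (used in btr0pcur.inl above) strip the
+intention flags to recover it. */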
diff --git a/storage/innobase/include/buf0block_hint.h b/storage/innobase/include/buf0block_hint.h
new file mode 100644
index 00000000..d4fee7c1
--- /dev/null
+++ b/storage/innobase/include/buf0block_hint.h
@@ -0,0 +1,76 @@
+/*****************************************************************************
+
+Copyright (c) 2020, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2020, MariaDB Corporation.
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License, version 2.0, as published by the
+Free Software Foundation.
+
+This program is also distributed with certain software (including but not
+limited to OpenSSL) that is licensed under separate terms, as designated in a
+particular file or component or in included license documentation. The authors
+of MySQL hereby grant you an additional permission to link the program and
+your derivative works with the separately licensed software that they have
+included with MySQL.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0,
+for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+*****************************************************************************/
+#pragma once
+#include "buf0buf.h"
+
+namespace buf {
+class Block_hint {
+public:
+  /** Stores the pointer to the block, which is currently buffer-fixed.
+  @param block a pointer to a buffer-fixed block to be stored */
+  inline void store(buf_block_t *block)
+  {
+    ut_ad(block->page.buf_fix_count());
+    m_block= block;
+    m_page_id= block->page.id();
+  }
+
+  /** Clears currently stored pointer. */
+  inline void clear() { m_block= nullptr; }
+
+  /** Invoke f on m_block (which may be null)
+  @param f The function to be executed. It will be passed the pointer.
+  If you wish to use the block pointer subsequently,
+  you need to ensure you buffer-fix it before returning from f.
+  @return the return value of f
+  */
+  template <typename F>
+  bool run_with_hint(const F &f)
+  {
+    buffer_fix_block_if_still_valid();
+    /* m_block could be changed during f() call, so we use local
+    variable to remember which block we need to unfix */
+    buf_block_t *block= m_block;
+    bool res= f(block);
+    if (block)
+      block->page.unfix();
+    return res;
+  }
+
+  buf_block_t *block() const { return m_block; }
+
+ private:
+  /** The block pointer stored by store(). */
+  buf_block_t *m_block= nullptr;
+  /** If m_block is non-null, the m_block->page.id at time it was stored. */
+  page_id_t m_page_id{0, 0};
+
+  /** A helper function which checks if m_block is not a dangling pointer and
+  still points to block with page with m_page_id and if so, buffer-fixes it,
+  otherwise clear()s it */
+  void buffer_fix_block_if_still_valid();
+};
+} // namespace buf
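+
+/* An illustrative sketch (not upstream code) of the intended use of
+buf::Block_hint: cache a pointer to a buffer-fixed block and revalidate it
+on the next access, falling back to a normal lookup when it went stale:
+@code
+	buf::Block_hint	hint;
+	hint.store(block);		// block is buffer-fixed here
+	// ... later, after the block may have been evicted ...
+	hint.run_with_hint([&](buf_block_t *b) {
+		if (!b)
+			return false;	// stale hint: do a regular page get
+		// b is buffer-fixed for the duration of this callback
+		return true;
+	});
+@endcode */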
diff --git a/storage/innobase/include/buf0buddy.h b/storage/innobase/include/buf0buddy.h
new file mode 100644
index 00000000..bb999420
--- /dev/null
+++ b/storage/innobase/include/buf0buddy.h
@@ -0,0 +1,91 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0buddy.h
+Binary buddy allocator for compressed pages
+
+Created December 2006 by Marko Makela
+*******************************************************/
+
+#ifndef buf0buddy_h
+#define buf0buddy_h
+
+#include "buf0types.h"
+
+/**
+@param[in]	size	block size in bytes
+@return index of buf_pool.zip_free[], or BUF_BUDDY_SIZES */
+inline
+ulint
+buf_buddy_get_slot(ulint size)
+{
+	ulint	i;
+	ulint	s;
+
+	ut_ad(ut_is_2pow(size));
+	ut_ad(size >= UNIV_ZIP_SIZE_MIN);
+	ut_ad(size <= srv_page_size);
+
+	for (i = 0, s = BUF_BUDDY_LOW; s < size; i++, s <<= 1) {
+	}
+	ut_ad(i <= BUF_BUDDY_SIZES);
+	return i;
+}
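+
+/* A worked example (illustrative only), assuming the smallest buddy block
+size BUF_BUDDY_LOW is 1024 bytes: the loop above doubles s until it reaches
+the requested size, so
+@code
+	buf_buddy_get_slot(1024) == 0	// s: 1024
+	buf_buddy_get_slot(2048) == 1	// s: 1024 -> 2048
+	buf_buddy_get_slot(4096) == 2	// s: 1024 -> 2048 -> 4096
+@endcode
+and the result indexes buf_pool.zip_free[], the per-size free list of the
+binary buddy allocator. */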
+
+/** Allocate a ROW_FORMAT=COMPRESSED block.
+@param i      index of buf_pool.zip_free[] or BUF_BUDDY_SIZES
+@param lru    assigned to true if buf_pool.mutex was temporarily released
+@return allocated block, never NULL */
+byte *buf_buddy_alloc_low(ulint i, bool *lru) MY_ATTRIBUTE((malloc));
+
+/** Allocate a ROW_FORMAT=COMPRESSED block.
+@param size   compressed page size in bytes
+@param lru    assigned to true if buf_pool.mutex was temporarily released
+@return allocated block, never NULL */
+inline byte *buf_buddy_alloc(ulint size, bool *lru= nullptr)
+{
+  return buf_buddy_alloc_low(buf_buddy_get_slot(size), lru);
+}
+
+/** Deallocate a block.
+@param[in]	buf	block to be freed, must not be pointed to
+			by the buffer pool
+@param[in]	i	index of buf_pool.zip_free[], or BUF_BUDDY_SIZES */
+void buf_buddy_free_low(void* buf, ulint i);
+
+/** Deallocate a block.
+@param[in]	buf	block to be freed, must not be pointed to
+			by the buffer pool
+@param[in]	size	block size in bytes */
+inline void buf_buddy_free(void* buf, ulint size)
+{
+	buf_buddy_free_low(buf, buf_buddy_get_slot(size));
+}
+
+/** Try to reallocate a block.
+@param[in]	buf	block to be reallocated, must be pointed
+to by the buffer pool
+@param[in]	size	block size, up to srv_page_size
+@retval false	if failed because of no free blocks. */
+bool buf_buddy_realloc(void* buf, ulint size);
+
+/** Combine all pairs of free buddies. */
+void buf_buddy_condense_free();
+#endif /* buf0buddy_h */
diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h
new file mode 100644
index 00000000..332b2039
--- /dev/null
+++ b/storage/innobase/include/buf0buf.h
@@ -0,0 +1,2190 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0buf.h
+The database buffer pool high-level routines
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+/** Magic value to use instead of checksums when they are disabled */
+#define BUF_NO_CHECKSUM_MAGIC	0xDEADBEEFUL
+
+#include "fil0fil.h"
+#include "mtr0types.h"
+#include "span.h"
+#include "assume_aligned.h"
+#include "buf0types.h"
+#ifndef UNIV_INNOCHECKSUM
+#include "ut0byte.h"
+#include "page0types.h"
+#include "log0log.h"
+#include "srv0srv.h"
+#include "transactional_lock_guard.h"
+#include <ostream>
+
+/** @name Modes for buf_page_get_gen */
+/* @{ */
+#define BUF_GET			10	/*!< get always */
+#define BUF_GET_IF_IN_POOL	11	/*!< get if in pool */
+#define BUF_PEEK_IF_IN_POOL	12	/*!< get if in pool, do not make
+					the block young in the LRU list */
+#define BUF_GET_IF_IN_POOL_OR_WATCH	15
+					/*!< Get the page only if it's in the
+					buffer pool, if not then set a watch
+					on the page. */
+#define BUF_GET_POSSIBLY_FREED	16
+					/*!< Like BUF_GET, but do not mind
+					if the file page has been freed. */
+/* @} */
+
+/** If LRU list of a buf_pool is less than this size then LRU eviction
+should not happen. This is because when we do LRU flushing we also put
+the blocks on free list. If LRU list is very small then we can end up
+in thrashing. */
+#define BUF_LRU_MIN_LEN		256
+
+/** This structure defines information we will fetch from each buffer pool.
It +will be used to print table IO stats */ +struct buf_pool_info_t +{ + /* General buffer pool info */ + ulint pool_size; /*!< Buffer Pool size in pages */ + ulint lru_len; /*!< Length of buf_pool.LRU */ + ulint old_lru_len; /*!< buf_pool.LRU_old_len */ + ulint free_list_len; /*!< Length of buf_pool.free list */ + ulint flush_list_len; /*!< Length of buf_pool.flush_list */ + ulint n_pend_unzip; /*!< buf_pool.n_pend_unzip, pages + pending decompress */ + ulint n_pend_reads; /*!< os_aio_pending_reads() */ + ulint n_pending_flush_lru; /*!< Pages pending flush in LRU */ + ulint n_pending_flush_list; /*!< Pages pending flush in FLUSH + LIST */ + ulint n_pages_made_young; /*!< number of pages made young */ + ulint n_pages_not_made_young; /*!< number of pages not made young */ + ulint n_pages_read; /*!< buf_pool.n_pages_read */ + ulint n_pages_created; /*!< buf_pool.n_pages_created */ + ulint n_pages_written; /*!< buf_pool.n_pages_written */ + ulint n_page_gets; /*!< buf_pool.n_page_gets */ + ulint n_ra_pages_read_rnd; /*!< buf_pool.n_ra_pages_read_rnd, + number of pages readahead */ + ulint n_ra_pages_read; /*!< buf_pool.n_ra_pages_read, number + of pages readahead */ + ulint n_ra_pages_evicted; /*!< buf_pool.n_ra_pages_evicted, + number of readahead pages evicted + without access */ + ulint n_page_get_delta; /*!< num of buffer pool page gets since + last printout */ + + /* Buffer pool access stats */ + double page_made_young_rate; /*!< page made young rate in pages + per second */ + double page_not_made_young_rate;/*!< page not made young rate + in pages per second */ + double pages_read_rate; /*!< num of pages read per second */ + double pages_created_rate; /*!< num of pages create per second */ + double pages_written_rate; /*!< num of pages written per second */ + ulint page_read_delta; /*!< num of pages read since last + printout */ + ulint young_making_delta; /*!< num of pages made young since + last printout */ + ulint not_young_making_delta; /*!< num of pages not make young since + last printout */ + + /* Statistics about read ahead algorithm. */ + double pages_readahead_rnd_rate;/*!< random readahead rate in pages per + second */ + double pages_readahead_rate; /*!< readahead rate in pages per + second */ + double pages_evicted_rate; /*!< rate of readahead page evicted + without access, in pages per second */ + + /* Stats about LRU eviction */ + ulint unzip_lru_len; /*!< length of buf_pool.unzip_LRU + list */ + /* Counters for LRU policy */ + ulint io_sum; /*!< buf_LRU_stat_sum.io */ + ulint io_cur; /*!< buf_LRU_stat_cur.io, num of IO + for current interval */ + ulint unzip_sum; /*!< buf_LRU_stat_sum.unzip */ + ulint unzip_cur; /*!< buf_LRU_stat_cur.unzip, num + pages decompressed in current + interval */ +}; +#endif /* !UNIV_INNOCHECKSUM */ + +/** Print the given page_id_t object. +@param[in,out] out the output stream +@param[in] page_id the page_id_t object to be printed +@return the output stream */ +std::ostream& +operator<<( + std::ostream& out, + const page_id_t page_id); + +#ifndef UNIV_INNOCHECKSUM +# define buf_pool_get_curr_size() srv_buf_pool_curr_size + +/** Allocate a buffer block. +@return own: the allocated block, state()==MEMORY */ +inline buf_block_t *buf_block_alloc(); +/********************************************************************//** +Frees a buffer block which does not contain a file page. 
*/ +UNIV_INLINE +void +buf_block_free( +/*===========*/ + buf_block_t* block); /*!< in, own: block to be freed */ + +#define buf_page_get(ID, SIZE, LA, MTR) \ + buf_page_get_gen(ID, SIZE, LA, NULL, BUF_GET, MTR) + +/** Try to acquire a page latch. +@param rw_latch RW_S_LATCH or RW_X_LATCH +@param block guessed block +@param modify_clock expected value of block->modify_clock +@param mtr mini-transaction +@return whether the latch was acquired (the page is an allocated file page) */ +bool buf_page_optimistic_get(ulint rw_latch, buf_block_t *block, + uint64_t modify_clock, mtr_t *mtr); + +/** Try to S-latch a page. +Suitable for using when holding the lock_sys latches (as it avoids deadlock). +@param[in] page_id page identifier +@param[in,out] mtr mini-transaction +@return the block +@retval nullptr if an S-latch cannot be granted immediately */ +buf_block_t *buf_page_try_get(const page_id_t page_id, mtr_t *mtr); + +/** Get read access to a compressed page (usually of type +FIL_PAGE_TYPE_ZBLOB or FIL_PAGE_TYPE_ZBLOB2). +The page must be released with unfix(). +NOTE: the page is not protected by any latch. Mutual exclusion has to +be implemented at a higher level. In other words, all possible +accesses to a given page through this function must be protected by +the same set of mutexes or latches. +@param page_id page identifier +@param zip_size ROW_FORMAT=COMPRESSED page size in bytes +@return pointer to the block, s-latched */ +buf_page_t *buf_page_get_zip(const page_id_t page_id, ulint zip_size); + +/** Get access to a database page. Buffered redo log may be applied. +@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH +@param[in] guess guessed block or NULL +@param[in] mode BUF_GET, BUF_GET_IF_IN_POOL, +BUF_PEEK_IF_IN_POOL, or BUF_GET_IF_IN_POOL_OR_WATCH +@param[in,out] mtr mini-transaction +@param[out] err DB_SUCCESS or error code +@param[in] allow_ibuf_merge Allow change buffer merge while +reading the pages from file. +@return pointer to the block or NULL */ +buf_block_t* +buf_page_get_gen( + const page_id_t page_id, + ulint zip_size, + ulint rw_latch, + buf_block_t* guess, + ulint mode, + mtr_t* mtr, + dberr_t* err = NULL, + bool allow_ibuf_merge = false) + MY_ATTRIBUTE((nonnull(6), warn_unused_result)); + +/** This is the low level function used to get access to a database page. +@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] rw_latch RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH +@param[in] guess guessed block or NULL +@param[in] mode BUF_GET, BUF_GET_IF_IN_POOL, +BUF_PEEK_IF_IN_POOL, or BUF_GET_IF_IN_POOL_OR_WATCH +@param[in,out] mtr mini-transaction, or NULL if a + block with page_id is to be evicted +@param[out] err DB_SUCCESS or error code +@param[in] allow_ibuf_merge Allow change buffer merge to happen +while reading the page from file +then it makes sure that it does merging of change buffer changes while +reading the page from file. +@return pointer to the block or NULL */ +buf_block_t* +buf_page_get_low( + const page_id_t page_id, + ulint zip_size, + ulint rw_latch, + buf_block_t* guess, + ulint mode, + mtr_t* mtr, + dberr_t* err, + bool allow_ibuf_merge); + +/** Initialize a page in the buffer pool. The page is usually not read +from a file even if it cannot be found in the buffer buf_pool. This is one +of the functions which perform to a block a state transition NOT_USED => LRU +(the other is buf_page_get_low()). 
+@param[in,out]	space		space object
+@param[in]	offset		page number within the tablespace
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in,out]	mtr		mini-transaction
+@param[in,out]	free_block	pre-allocated buffer block
+@return pointer to the block, page bufferfixed */
+buf_block_t*
+buf_page_create(fil_space_t *space, uint32_t offset,
+                ulint zip_size, mtr_t *mtr, buf_block_t *free_block);
+
+/** Initialize a page in buffer pool while initializing the
+deferred tablespace
+@param space_id		space identifier
+@param zip_size		ROW_FORMAT=COMPRESSED page size or 0
+@param mtr		mini-transaction
+@param free_block	pre-allocated buffer block
+@return pointer to the block, page bufferfixed */
+buf_block_t*
+buf_page_create_deferred(uint32_t space_id, ulint zip_size, mtr_t *mtr,
+                         buf_block_t *free_block);
+
+/** Move a block to the start of the LRU list. */
+void buf_page_make_young(buf_page_t *bpage);
+/** Mark the page status as FREED for the given tablespace and page number.
+@param[in,out]	space	tablespace
+@param[in]	page	page number
+@param[in,out]	mtr	mini-transaction */
+void buf_page_free(fil_space_t *space, uint32_t page, mtr_t *mtr);
+
+/** Determine if a block is still close enough to the MRU end of the LRU list
+meaning that it is not in danger of getting evicted and also implying
+that it has been accessed recently.
+Note that this is for heuristics only and does not reserve buffer pool
+mutex.
+@param[in]	bpage		buffer pool page
+@return whether bpage is close to MRU end of LRU */
+inline bool buf_page_peek_if_young(const buf_page_t *bpage);
+
+/** Determine if a block should be moved to the start of the LRU list if
+there is danger of dropping from the buffer pool.
+@param[in]	bpage		buffer pool page
+@return true if bpage should be made younger */
+inline bool buf_page_peek_if_too_old(const buf_page_t *bpage);
+
+/** Move a page to the start of the buffer pool LRU list if it is too old.
+@param[in,out]	bpage		buffer pool page */
+inline void buf_page_make_young_if_needed(buf_page_t *bpage)
+{
+	if (UNIV_UNLIKELY(buf_page_peek_if_too_old(bpage))) {
+		buf_page_make_young(bpage);
+	}
+}
+
+/********************************************************************//**
+Increments the modify clock of a frame by 1. The caller must (1) own the
+buf_pool.mutex and block bufferfix count has to be zero, (2) or own an x-lock
+on the block. */
+UNIV_INLINE
+void
+buf_block_modify_clock_inc(
+/*=======================*/
+	buf_block_t*	block);	/*!< in: block */
+/********************************************************************//**
+Returns the value of the modify clock. The caller must have an s-lock
+or x-lock on the block.
+@return value */
+UNIV_INLINE
+ib_uint64_t
+buf_block_get_modify_clock(
+/*=======================*/
+	buf_block_t*	block);	/*!< in: block */
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** Check if a buffer is all zeroes.
+@param[in]	buf	data to check
+@return whether the buffer is all zeroes */
+bool buf_is_zeroes(st_::span<const byte> buf);
+
+/** Check if a page is corrupt.
+@param check_lsn	whether FIL_PAGE_LSN should be checked
+@param read_buf		database page
+@param fsp_flags	contents of FIL_SPACE_FLAGS
+@return whether the page is corrupted */
+bool buf_page_is_corrupted(bool check_lsn, const byte *read_buf,
+                           uint32_t fsp_flags)
+  MY_ATTRIBUTE((warn_unused_result));
+
+/** Read the key version from the page. In full crc32 format,
+the key version is stored in bytes 0-3. In other formats, it is
+stored at byte offset 26 (FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION).
+@param[in] read_buf database page +@param[in] fsp_flags tablespace flags +@return key version of the page. */ +inline uint32_t buf_page_get_key_version(const byte* read_buf, + uint32_t fsp_flags) +{ + static_assert(FIL_PAGE_FCRC32_KEY_VERSION == 0, "compatibility"); + return fil_space_t::full_crc32(fsp_flags) + ? mach_read_from_4(my_assume_aligned<4>(read_buf)) + : mach_read_from_4(my_assume_aligned<2> + (read_buf + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION)); +} + +/** Read the compression info from the page. In full crc32 format, +compression info is at MSB of page type. In other format, it is +stored in page type. +@param[in] read_buf database page +@param[in] fsp_flags tablespace flags +@return true if page is compressed. */ +inline bool buf_page_is_compressed(const byte* read_buf, uint32_t fsp_flags) +{ + uint16_t page_type= fil_page_get_type(read_buf); + return fil_space_t::full_crc32(fsp_flags) + ? !!(page_type & 1U << FIL_PAGE_COMPRESS_FCRC32_MARKER) + : page_type == FIL_PAGE_PAGE_COMPRESSED; +} + +/** Get the compressed or uncompressed size of a full_crc32 page. +@param[in] buf page_compressed or uncompressed page +@param[out] comp whether the page could be compressed +@param[out] cr whether the page could be corrupted +@return the payload size in the file page */ +inline uint buf_page_full_crc32_size(const byte* buf, bool* comp, bool* cr) +{ + uint t = fil_page_get_type(buf); + uint page_size = uint(srv_page_size); + + if (!(t & 1U << FIL_PAGE_COMPRESS_FCRC32_MARKER)) { + return page_size; + } + + t &= ~(1U << FIL_PAGE_COMPRESS_FCRC32_MARKER); + t <<= 8; + + if (t < page_size) { + page_size = t; + if (comp) { + *comp = true; + } + } else if (cr) { + *cr = true; + } + + return page_size; +} + +#ifndef UNIV_INNOCHECKSUM +/** Dump a page to stderr. +@param[in] read_buf database page +@param[in] zip_size compressed page size, or 0 */ +void buf_page_print(const byte* read_buf, ulint zip_size = 0) + ATTRIBUTE_COLD __attribute__((nonnull)); +/********************************************************************//** +Decompress a block. +@return TRUE if successful */ +ibool +buf_zip_decompress( +/*===============*/ + buf_block_t* block, /*!< in/out: block */ + ibool check); /*!< in: TRUE=verify the page checksum */ + +#ifdef UNIV_DEBUG +/** @return the number of latched pages in the buffer pool */ +ulint buf_get_latched_pages_number(); +#endif /* UNIV_DEBUG */ +/*********************************************************************//** +Prints info of the buffer i/o. */ +void +buf_print_io( +/*=========*/ + FILE* file); /*!< in: file where to print */ +/** Collect buffer pool metadata. +@param[out] pool_info buffer pool metadata */ +void buf_stats_get_pool_info(buf_pool_info_t *pool_info); + +/** Refresh the statistics used to print per-second averages. */ +void buf_refresh_io_stats(); + +/** Invalidate all pages in the buffer pool. +All pages must be in a replaceable state (not modified or latched). */ +void buf_pool_invalidate(); + +/*======================================================================== +--------------------------- LOWER LEVEL ROUTINES ------------------------- +=========================================================================*/ + +#define buf_block_get_frame(block) (block)->page.frame + +/*********************************************************************//** +Gets the compressed page descriptor corresponding to an uncompressed page +if applicable. */ +#define buf_block_get_page_zip(block) \ + (UNIV_LIKELY_NULL((block)->page.zip.data) ? 
&(block)->page.zip : NULL)
+#define is_buf_block_get_page_zip(block) \
+	UNIV_LIKELY_NULL((block)->page.zip.data)
+
+/** Monitor the buffer page read/write activity, and increment corresponding
+counter value in MONITOR_MODULE_BUF_PAGE.
+@param bpage	buffer page whose read or write was completed
+@param read	true=read, false=write */
+ATTRIBUTE_COLD void buf_page_monitor(const buf_page_t &bpage, bool read);
+
+/** Calculate aligned buffer pool size based on srv_buf_pool_chunk_unit,
+if needed.
+@param[in]	size	size in bytes
+@return aligned size */
+ulint
+buf_pool_size_align(
+	ulint	size);
+
+/** Verify that the post-encryption checksum matches the calculated checksum.
+This function should be called only if the tablespace contains crypt data
+metadata.
+@param page		page frame
+@param fsp_flags	contents of FSP_SPACE_FLAGS
+@return whether the page is encrypted and valid */
+bool buf_page_verify_crypt_checksum(const byte *page, uint32_t fsp_flags);
+
+/** Calculate a ROW_FORMAT=COMPRESSED page checksum and update the page.
+@param[in,out]	page	page to update
+@param[in]	size	compressed page size */
+void buf_flush_update_zip_checksum(buf_frame_t* page, ulint size);
+
+/** @brief The temporary memory structure.
+
+NOTE! The definition appears here only for other modules of this
+directory (buf) to see it. Do not use from outside! */
+
+class buf_tmp_buffer_t
+{
+  /** whether this slot is reserved */
+  std::atomic<bool> reserved;
+public:
+  /** For encryption, the data needs to be copied to a separate buffer
+  before it's encrypted&written. The buffer block itself can be replaced
+  while a write of crypt_buf to file is in progress. */
+  byte *crypt_buf;
+  /** buffer for fil_page_compress(), for flushing page_compressed pages */
+  byte *comp_buf;
+  /** pointer to resulting buffer after encryption or compression;
+  not separately allocated memory */
+  byte *out_buf;
+
+  /** Release the slot */
+  void release() { reserved.store(false, std::memory_order_relaxed); }
+
+  /** Acquire the slot
+  @return whether the slot was acquired */
+  bool acquire() { return !reserved.exchange(true, std::memory_order_relaxed);}
+
+  /** Allocate a buffer for encryption, decryption or decompression. */
+  void allocate()
+  {
+    if (!crypt_buf)
+      crypt_buf= static_cast<byte*>
+        (aligned_malloc(srv_page_size, srv_page_size));
+  }
+};
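+
+/* An illustrative sketch (not upstream code) of the buf_tmp_buffer_t slot
+protocol above, assuming the caller scans some array of slots for a free
+one:
+@code
+	buf_tmp_buffer_t*	slot= ...;	// some candidate slot
+	if (slot->acquire()) {			// atomically reserve it
+		slot->allocate();		// lazily allocate crypt_buf
+		// ... encrypt or compress a page; the result ends up
+		// in slot->out_buf ...
+		slot->release();		// make the slot reusable
+	}
+@endcode */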
+
+/** The common buffer control block structure
+for compressed and uncompressed frames */
+
+class buf_pool_t;
+
+class buf_page_t
+{
+  friend buf_pool_t;
+  friend buf_block_t;
+
+  /** @name General fields */
+  /* @{ */
+
+public: // FIXME: fix fil_iterate()
+  /** Page id. Protected by buf_pool.page_hash.lock_get() when
+  the page is in buf_pool.page_hash. */
+  page_id_t id_;
+  /** buf_pool.page_hash link; protected by buf_pool.page_hash.lock_get() */
+  buf_page_t *hash;
+private:
+  /** log sequence number of the START of the log entry written of the
+  oldest modification to this block which has not yet been written
+  to the data file;
+
+  0 if no modifications are pending;
+  1 if no modifications are pending, but the block is in buf_pool.flush_list;
+  2 if modifications are pending, but the block is not in buf_pool.flush_list
+  (because id().space() is the temporary tablespace). */
+  Atomic_relaxed<lsn_t> oldest_modification_;
+
+public:
+  /** state() of unused block (in buf_pool.free list) */
+  static constexpr uint32_t NOT_USED= 0;
+  /** state() of block allocated as general-purpose memory */
+  static constexpr uint32_t MEMORY= 1;
+  /** state() of block that is being freed */
+  static constexpr uint32_t REMOVE_HASH= 2;
+  /** smallest state() of a buffer page that is freed in the tablespace */
+  static constexpr uint32_t FREED= 3;
+  /** smallest state() for a block that belongs to buf_pool.LRU */
+  static constexpr uint32_t UNFIXED= 1U << 29;
+  /** smallest state() of a block for which buffered changes may exist */
+  static constexpr uint32_t IBUF_EXIST= 2U << 29;
+  /** smallest state() of a (re)initialized page (no doublewrite needed) */
+  static constexpr uint32_t REINIT= 3U << 29;
+  /** smallest state() for an io-fixed block */
+  static constexpr uint32_t READ_FIX= 4U << 29;
+  /** smallest state() for a write-fixed block */
+  static constexpr uint32_t WRITE_FIX= 5U << 29;
+  /** smallest state() for a write-fixed block with buffered changes */
+  static constexpr uint32_t WRITE_FIX_IBUF= 6U << 29;
+  /** smallest state() for a write-fixed block (no doublewrite was used) */
+  static constexpr uint32_t WRITE_FIX_REINIT= 7U << 29;
+  /** buf_pool.LRU status mask in state() */
+  static constexpr uint32_t LRU_MASK= 7U << 29;
+
+  /** lock covering the contents of frame */
+  block_lock lock;
+  /** pointer to aligned, uncompressed page frame of innodb_page_size */
+  byte *frame;
+  /* @} */
+  /** ROW_FORMAT=COMPRESSED page; zip.data (but not the data it points to)
+  is also protected by buf_pool.mutex;
+  !frame && !zip.data means an active buf_pool.watch */
+  page_zip_des_t zip;
+#ifdef UNIV_DEBUG
+  /** whether this->list is in buf_pool.zip_hash; protected by buf_pool.mutex */
+  bool in_zip_hash;
+  /** whether this->LRU is in buf_pool.LRU (in_file());
+  protected by buf_pool.mutex */
+  bool in_LRU_list;
+  /** whether this is in buf_pool.page_hash (in_file());
+  protected by buf_pool.mutex */
+  bool in_page_hash;
+  /** whether this->list is in buf_pool.free (state() == NOT_USED);
+  protected by buf_pool.flush_list_mutex */
+  bool in_free_list;
+#endif /* UNIV_DEBUG */
+  /** list member in one of the lists of buf_pool; protected by
+  buf_pool.mutex or buf_pool.flush_list_mutex
+
+  state() == NOT_USED: buf_pool.free or buf_pool.withdraw
+
+  in_file() && oldest_modification():
+  buf_pool.flush_list (protected by buf_pool.flush_list_mutex)
+
+  The contents are undefined if in_file() && !oldest_modification(),
+  or if state() == MEMORY or state() == REMOVE_HASH. */
+  UT_LIST_NODE_T(buf_page_t) list;
+
+  /** @name LRU replacement algorithm fields.
+  Protected by buf_pool.mutex. */
+  /* @{ */
+
+  UT_LIST_NODE_T(buf_page_t) LRU;
+					/*!< node of the LRU list */
+	unsigned	old:1;		/*!< TRUE if the block is in the old
+					blocks in buf_pool.LRU_old */
+	unsigned	freed_page_clock:31;/*!< the value of
+					buf_pool.freed_page_clock
+					when this block was the last
+					time put to the head of the
+					LRU list; a thread is allowed
+					to read this for heuristic
+					purposes without holding any
+					mutex or latch */
+	/* @} */
+	Atomic_counter<unsigned> access_time;	/*!< time of first access, or
+					0 if the block was never accessed
+					in the buffer pool.
+
+					For state() == MEMORY
+					blocks, this field can be repurposed
+					for something else.
+
+					When this field counts log records
+					and bytes allocated for recv_sys.pages,
+					the field is protected by
+					recv_sys_t::mutex.
*/ + buf_page_t() : id_{0} + { + static_assert(NOT_USED == 0, "compatibility"); + memset((void*) this, 0, sizeof *this); + } + + buf_page_t(const buf_page_t &b) : + id_(b.id_), hash(b.hash), + oldest_modification_(b.oldest_modification_), + lock() /* not copied */, + frame(b.frame), zip(b.zip), +#ifdef UNIV_DEBUG + in_zip_hash(b.in_zip_hash), in_LRU_list(b.in_LRU_list), + in_page_hash(b.in_page_hash), in_free_list(b.in_free_list), +#endif /* UNIV_DEBUG */ + list(b.list), LRU(b.LRU), old(b.old), freed_page_clock(b.freed_page_clock), + access_time(b.access_time) + { + lock.init(); + } + + /** Initialize some more fields */ + void init(uint32_t state, page_id_t id) + { + ut_ad(state < REMOVE_HASH || state >= UNFIXED); + id_= id; + zip.fix= state; + oldest_modification_= 0; + lock.init(); + ut_d(in_zip_hash= false); + ut_d(in_free_list= false); + ut_d(in_LRU_list= false); + ut_d(in_page_hash= false); + old= 0; + freed_page_clock= 0; + access_time= 0; + } + + void set_os_unused() + { + MEM_NOACCESS(frame, srv_page_size); +#ifdef MADV_FREE + madvise(frame, srv_page_size, MADV_FREE); +#endif + } + + void set_os_used() const + { + MEM_MAKE_ADDRESSABLE(frame, srv_page_size); + } +public: + const page_id_t &id() const { return id_; } + uint32_t state() const { return zip.fix; } + uint32_t buf_fix_count() const + { + uint32_t f= state(); + ut_ad(f >= FREED); + return f < UNFIXED ? (f - FREED) : (~LRU_MASK & f); + } + /** @return whether this block is read or write fixed; + read_complete() or write_complete() will always release + the io-fix before releasing U-lock or X-lock */ + bool is_io_fixed() const + { const auto s= state(); ut_ad(s >= FREED); return s >= READ_FIX; } + /** @return whether this block is write fixed; + write_complete() will always release the write-fix before releasing U-lock */ + bool is_write_fixed() const { return state() >= WRITE_FIX; } + /** @return whether this block is read fixed; this should never hold + when a thread is holding the block lock in any mode */ + bool is_read_fixed() const { return is_io_fixed() && !is_write_fixed(); } + + /** @return if this belongs to buf_pool.unzip_LRU */ + bool belongs_to_unzip_LRU() const + { return UNIV_LIKELY_NULL(zip.data) && frame; } + + bool is_freed() const + { const auto s= state(); ut_ad(s >= FREED); return s < UNFIXED; } + bool is_ibuf_exist() const + { + const auto s= state(); + ut_ad(s >= UNFIXED); + ut_ad(s < READ_FIX); + return (s & LRU_MASK) == IBUF_EXIST; + } + bool is_reinit() const { return !(~state() & REINIT); } + + void set_reinit(uint32_t prev_state) + { + ut_ad(prev_state < READ_FIX); + ut_d(const auto s=) zip.fix.fetch_add(REINIT - prev_state); + ut_ad(s > prev_state); + ut_ad(s < prev_state + UNFIXED); + } + + void set_ibuf_exist() + { + ut_ad(lock.is_write_locked()); + ut_ad(id() < page_id_t(SRV_SPACE_ID_UPPER_BOUND, 0)); + const auto s= state(); + ut_ad(s >= UNFIXED); + ut_ad(s < READ_FIX); + ut_ad(s < IBUF_EXIST || s >= REINIT); + zip.fix.fetch_add(IBUF_EXIST - (LRU_MASK & s)); + } + void clear_ibuf_exist() + { + ut_ad(lock.is_write_locked()); + ut_ad(id() < page_id_t(SRV_SPACE_ID_UPPER_BOUND, 0)); + ut_d(const auto s=) zip.fix.fetch_sub(IBUF_EXIST - UNFIXED); + ut_ad(s >= IBUF_EXIST); + ut_ad(s < REINIT); + } + + uint32_t read_unfix(uint32_t s) + { + ut_ad(lock.is_write_locked()); + ut_ad(s == UNFIXED + 1 || s == IBUF_EXIST + 1 || s == REINIT + 1); + uint32_t old_state= zip.fix.fetch_add(s - READ_FIX); + ut_ad(old_state >= READ_FIX); + ut_ad(old_state < WRITE_FIX); + return old_state + (s - READ_FIX); + } + + 
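+
+  /* An illustrative note (not upstream code) on the state word manipulated
+  by the members above and below: the three most significant bits of state()
+  hold the buf_pool.LRU status and the low 29 bits count buffer-fixes. For a
+  page in the ordinary UNFIXED range:
+  @code
+	uint32_t s= bpage.state();
+	uint32_t status= s & buf_page_t::LRU_MASK;	// e.g. UNFIXED
+	uint32_t fixes= s & ~buf_page_t::LRU_MASK;	// buffer-fix count
+	// fix() adds 1 and unfix() subtracts 1, leaving the status bits alone
+  @endcode */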
void set_freed(uint32_t prev_state, uint32_t count= 0) + { + ut_ad(lock.is_write_locked()); + ut_ad(prev_state >= UNFIXED); + ut_ad(prev_state < READ_FIX); + ut_d(auto s=) zip.fix.fetch_sub((prev_state & LRU_MASK) - FREED - count); + ut_ad(!((prev_state ^ s) & LRU_MASK)); + } + + inline void set_state(uint32_t s); + inline void set_corrupt_id(); + + /** @return the log sequence number of the oldest pending modification + @retval 0 if the block is being removed from (or not in) buf_pool.flush_list + @retval 1 if the block is in buf_pool.flush_list but not modified + @retval 2 if the block belongs to the temporary tablespace and + has unwritten changes */ + lsn_t oldest_modification() const { return oldest_modification_; } + /** @return the log sequence number of the oldest pending modification, + @retval 0 if the block is definitely not in buf_pool.flush_list + @retval 1 if the block is in buf_pool.flush_list but not modified + @retval 2 if the block belongs to the temporary tablespace and + has unwritten changes */ + lsn_t oldest_modification_acquire() const + { return oldest_modification_.load(std::memory_order_acquire); } + /** Set oldest_modification when adding to buf_pool.flush_list */ + inline void set_oldest_modification(lsn_t lsn); + /** Clear oldest_modification after removing from buf_pool.flush_list */ + inline void clear_oldest_modification(); + /** Reset the oldest_modification when marking a persistent page freed */ + void reset_oldest_modification() + { + ut_ad(oldest_modification() > 2); + oldest_modification_.store(1, std::memory_order_release); + } + + /** Complete a read of a page. + @param node data file + @return whether the operation succeeded + @retval DB_PAGE_CORRUPTED if the checksum fails + @retval DB_DECRYPTION_FAILED if the page cannot be decrypted + @retval DB_FAIL if the page contains the wrong ID */ + dberr_t read_complete(const fil_node_t &node); + + /** Note that a block is no longer dirty, while not removing + it from buf_pool.flush_list + @param temporary whether the page belongs to the temporary tablespace + @param error whether an error may have occurred while writing */ + inline void write_complete(bool temporary, bool error); + + /** Write a flushable page to a file or free a freeable block. + @param evict whether to evict the page on write completion + @param space tablespace + @return whether a page write was initiated and buf_pool.mutex released */ + bool flush(bool evict, fil_space_t *space); + + /** Notify that a page in a temporary tablespace has been modified. */ + void set_temp_modified() + { + ut_ad(fsp_is_system_temporary(id().space())); + ut_ad(in_file()); + ut_ad((oldest_modification() | 2) == 2); + oldest_modification_= 2; + } + + /** Prepare to release a file page to buf_pool.free. 
*/
+  void free_file_page()
+  {
+    ut_ad((zip.fix.fetch_sub(REMOVE_HASH - MEMORY)) == REMOVE_HASH);
+    /* buf_LRU_block_free_non_file_page() asserts !oldest_modification() */
+    ut_d(oldest_modification_= 0;)
+    id_= page_id_t(~0ULL);
+  }
+
+  void fix_on_recovery()
+  {
+    ut_d(const auto f=) zip.fix.fetch_sub(READ_FIX - UNFIXED - 1);
+    ut_ad(f >= READ_FIX);
+    ut_ad(f < WRITE_FIX);
+  }
+
+  uint32_t fix(uint32_t count= 1)
+  {
+    ut_ad(count);
+    ut_ad(count < IBUF_EXIST);
+    uint32_t f= zip.fix.fetch_add(count);
+    ut_ad(f >= FREED);
+    ut_ad(!((f ^ (f + 1)) & LRU_MASK));
+    return f;
+  }
+
+  uint32_t unfix()
+  {
+    uint32_t f= zip.fix.fetch_sub(1);
+    ut_ad(f > FREED);
+    ut_ad(!((f ^ (f - 1)) & LRU_MASK));
+    return f - 1;
+  }
+
+  /** @return the physical size, in bytes */
+  ulint physical_size() const
+  {
+    return zip.ssize ? (UNIV_ZIP_SIZE_MIN >> 1) << zip.ssize : srv_page_size;
+  }
+
+  /** @return the ROW_FORMAT=COMPRESSED physical size, in bytes
+  @retval 0 if not compressed */
+  ulint zip_size() const
+  {
+    return zip.ssize ? (UNIV_ZIP_SIZE_MIN >> 1) << zip.ssize : 0;
+  }
+
+  /** @return the byte offset of the page within a file */
+  os_offset_t physical_offset() const
+  {
+    os_offset_t o= id().page_no();
+    return zip.ssize
+      ? o << (zip.ssize + (UNIV_ZIP_SIZE_SHIFT_MIN - 1))
+      : o << srv_page_size_shift;
+  }
+
+  /** @return whether the block is mapped to a data file */
+  bool in_file() const { return state() >= FREED; }
+
+  /** @return whether the block can be relocated in memory.
+  The block can be dirty, but it must not be I/O-fixed or bufferfixed. */
+  inline bool can_relocate() const;
+  /** @return whether the block has been flagged old in buf_pool.LRU */
+  inline bool is_old() const;
+  /** Set whether a block is old in buf_pool.LRU */
+  inline void set_old(bool old);
+  /** Flag a page accessed in buf_pool
+  @return whether this is not the first access */
+  bool set_accessed()
+  {
+    if (is_accessed()) return true;
+    access_time= static_cast<uint32_t>(ut_time_ms());
+    return false;
+  }
+  /** @return ut_time_ms() at the time of first access of a block in buf_pool
+  @retval 0 if not accessed */
+  unsigned is_accessed() const { ut_ad(in_file()); return access_time; }
+};
+
+/** The buffer control block structure */
+
+struct buf_block_t{
+
+  /** @name General fields */
+  /* @{ */
+
+  buf_page_t page;      /*!< page information; this must
+                        be the first field, so that
+                        buf_pool.page_hash can point
+                        to buf_page_t or buf_block_t */
+#ifdef UNIV_DEBUG
+  /** whether page.list is in buf_pool.withdraw
+  ((state() == NOT_USED)) and the buffer pool is being shrunk;
+  protected by buf_pool.mutex */
+  bool in_withdraw_list;
+  /** whether unzip_LRU is in buf_pool.unzip_LRU
+  (in_file() && frame && zip.data);
+  protected by buf_pool.mutex */
+  bool in_unzip_LRU_list;
+#endif
+  /** member of buf_pool.unzip_LRU (if belongs_to_unzip_LRU()) */
+  UT_LIST_NODE_T(buf_block_t) unzip_LRU;
+  /* @} */
+  /** @name Optimistic search field */
+  /* @{ */
+
+  ib_uint64_t modify_clock; /*!< this clock is incremented every
+                            time a pointer to a record on the
+                            page may become obsolete; this is
+                            used in the optimistic cursor
+                            positioning: if the modify clock has
+                            not changed, we know that the pointer
+                            is still valid; this field may be
+                            changed if the thread (1) owns the
+                            pool mutex and the page is not
+                            bufferfixed, or (2) the thread has an
+                            x-latch on the block */
+  /* @} */
+#ifdef BTR_CUR_HASH_ADAPT
+  /** @name Hash search fields (unprotected)
+  NOTE that these fields are NOT protected by any semaphore!
*/
+  /* @{ */
+
+  volatile uint16_t n_bytes;  /*!< recommended prefix length for hash
+                              search: number of bytes in
+                              an incomplete last field */
+  volatile uint16_t n_fields; /*!< recommended prefix length for hash
+                              search: number of full fields */
+  uint16_t n_hash_helps;      /*!< counter which controls building
+                              of a new hash index for the page */
+  volatile bool left_side;    /*!< true or false, depending on
+                              whether the leftmost record of several
+                              records with the same prefix should be
+                              indexed in the hash index */
+  /* @} */
+
+  /** @name Hash search fields
+  These 5 fields may only be modified when:
+  we are holding the appropriate x-latch in btr_search_latches[], and
+  one of the following holds:
+  (1) in_file(), and we are holding lock in any mode, or
+  (2) !is_read_fixed()&&(state()>=UNFIXED||state()==REMOVE_HASH).
+
+  An exception to this is when we init or create a page
+  in the buffer pool in buf0buf.cc.
+
+  Another exception for buf_pool_t::clear_hash_index() is that
+  assigning block->index = NULL (and block->n_pointers = 0)
+  is allowed whenever all AHI latches are exclusively locked.
+
+  Another exception is that ha_insert_for_fold() may
+  decrement n_pointers without holding the appropriate latch
+  in btr_search_latches[]. Thus, n_pointers must be
+  protected by atomic memory access.
+
+  This implies that the fields may be read without race
+  condition whenever any of the following hold:
+  - the btr_search_sys.partition[].latch is being held, or
+  - state() == NOT_USED || state() == MEMORY,
+  and holding some latch prevents the state from changing to that.
+
+  Some use of assert_block_ahi_empty() or assert_block_ahi_valid()
+  is prone to race conditions while buf_pool_t::clear_hash_index() is
+  executing (the adaptive hash index is being disabled). Such use
+  is explicitly commented. */
+
+  /* @{ */
+
+# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+  Atomic_counter<ulint>
+    n_pointers;               /*!< used in debugging: the number of
+                              pointers in the adaptive hash index
+                              pointing to this frame */
+# define assert_block_ahi_empty(block) \
+  ut_a((block)->n_pointers == 0)
+# define assert_block_ahi_empty_on_init(block) do { \
+  MEM_MAKE_DEFINED(&(block)->n_pointers, sizeof (block)->n_pointers); \
+  assert_block_ahi_empty(block); \
+} while (0)
+# define assert_block_ahi_valid(block) \
+  ut_a((block)->index || (block)->n_pointers == 0)
+# else /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+# define assert_block_ahi_empty(block) /* nothing */
+# define assert_block_ahi_empty_on_init(block) /* nothing */
+# define assert_block_ahi_valid(block) /* nothing */
+# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+  unsigned curr_n_fields:10;/*!< prefix length for hash indexing:
+                            number of full fields */
+  unsigned curr_n_bytes:15;/*!< number of bytes in hash
+                           indexing */
+  unsigned curr_left_side:1;/*!< TRUE or FALSE in hash indexing */
+  dict_index_t* index;      /*!< Index for which the
+                            adaptive hash index has been
+                            created, or NULL if the page
+                            does not exist in the
+                            index. Note that it does not
+                            guarantee that the index is
+                            complete, though: there may
+                            have been hash collisions,
+                            record deletions, etc.
*/
+  /* @} */
+#else /* BTR_CUR_HASH_ADAPT */
+# define assert_block_ahi_empty(block) /* nothing */
+# define assert_block_ahi_empty_on_init(block) /* nothing */
+# define assert_block_ahi_valid(block) /* nothing */
+#endif /* BTR_CUR_HASH_ADAPT */
+  void fix() { page.fix(); }
+  uint32_t unfix() { return page.unfix(); }
+
+  /** @return the physical size, in bytes */
+  ulint physical_size() const { return page.physical_size(); }
+
+  /** @return the ROW_FORMAT=COMPRESSED physical size, in bytes
+  @retval 0 if not compressed */
+  ulint zip_size() const { return page.zip_size(); }
+
+  /** Initialize the block.
+  @param page_id page identifier
+  @param zip_size ROW_FORMAT=COMPRESSED page size, or 0
+  @param state initial state() */
+  void initialise(const page_id_t page_id, ulint zip_size, uint32_t state);
+};
+
+/**********************************************************************//**
+Compute the hash fold value for blocks in buf_pool.zip_hash. */
+/* @{ */
+#define BUF_POOL_ZIP_FOLD_PTR(ptr) (ulint(ptr) >> srv_page_size_shift)
+#define BUF_POOL_ZIP_FOLD(b) BUF_POOL_ZIP_FOLD_PTR((b)->page.frame)
+#define BUF_POOL_ZIP_FOLD_BPAGE(b) BUF_POOL_ZIP_FOLD((buf_block_t*) (b))
+/* @} */
+
+/** A "Hazard Pointer" class used to iterate over buf_pool.LRU or
+buf_pool.flush_list. A hazard pointer is a buf_page_t pointer
+which we intend to iterate over next and we want it to remain valid
+even after we release the mutex that protects the list. */
+class HazardPointer
+{
+public:
+  virtual ~HazardPointer() = default;
+
+  /** @return current value */
+  buf_page_t *get() const { mysql_mutex_assert_owner(m_mutex); return m_hp; }
+
+  /** Set current value
+  @param bpage buffer block to be set as hp */
+  void set(buf_page_t *bpage)
+  {
+    mysql_mutex_assert_owner(m_mutex);
+    ut_ad(!bpage || bpage->in_file());
+    m_hp= bpage;
+  }
+
+  /** Checks if a bpage is the hp
+  @param bpage buffer block to be compared
+  @return true if it is hp */
+  bool is_hp(const buf_page_t *bpage) const
+  { mysql_mutex_assert_owner(m_mutex); return bpage == m_hp; }
+
+  /** Adjust the value of hp. This happens when some
+  other thread working on the same list attempts to
+  remove the hp from the list. */
+  virtual void adjust(const buf_page_t*) = 0;
+
+#ifdef UNIV_DEBUG
+  /** mutex that protects access to the m_hp. */
+  const mysql_mutex_t *m_mutex= nullptr;
+#endif /* UNIV_DEBUG */
+
+protected:
+  /** hazard pointer */
+  buf_page_t *m_hp= nullptr;
+};
+
+/** Class implementing buf_pool.flush_list hazard pointer */
+class FlushHp : public HazardPointer
+{
+public:
+  ~FlushHp() override = default;
+
+  /** Adjust the value of hp. This happens when some
+  other thread working on the same list attempts to
+  remove the hp from the list.
+  @param bpage buffer block to be compared */
+  MY_ATTRIBUTE((nonnull))
+  void adjust(const buf_page_t *bpage) override
+  {
+    /* We only support reverse traversal for now. */
+    if (is_hp(bpage))
+      m_hp= UT_LIST_GET_PREV(list, m_hp);
+
+    ut_ad(!m_hp || m_hp->oldest_modification());
+  }
+};
+
+/** Class implementing buf_pool.LRU hazard pointer */
+class LRUHp : public HazardPointer {
+public:
+  ~LRUHp() override = default;
+
+  /** Adjust the value of hp. This happens when some
+  other thread working on the same list attempts to
+  remove the hp from the list.
+  @param bpage buffer block to be compared */
+  MY_ATTRIBUTE((nonnull))
+  void adjust(const buf_page_t *bpage) override
+  {
+    /** We only support reverse traversal for now.
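+    That is, list scans run from the tail towards the head; when the
+    block that the hazard pointer refers to is removed, stepping to its
+    predecessor keeps the scan position valid (editorial note: the same
+    convention applies to FlushHp::adjust() above).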
*/
+    if (is_hp(bpage))
+      m_hp= UT_LIST_GET_PREV(LRU, m_hp);
+
+    ut_ad(!m_hp || m_hp->in_LRU_list);
+  }
+};
+
+/** Special purpose iterators to be used when scanning the LRU list.
+The idea is that when one thread finishes the scan, it leaves the
+iterator in that position and another thread can resume the scan from
+there */
+class LRUItr : public LRUHp {
+public:
+  ~LRUItr() override = default;
+
+  /** Select from where to start a scan. If we have scanned
+  too deep into the LRU list, it resets the value to the tail
+  of the LRU list.
+  @return buf_page_t from where to start the scan. */
+  inline buf_page_t *start();
+};
+
+/** Struct that is embedded in the free zip blocks */
+struct buf_buddy_free_t {
+	union {
+		ulint	size;	/*!< size of the block */
+		byte	bytes[FIL_PAGE_DATA];
+			/*!< stamp[FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID]
+			== BUF_BUDDY_FREE_STAMP denotes a free
+			block. If the space_id field of buddy
+			block != BUF_BUDDY_FREE_STAMP, the block
+			is not in any zip_free list. If the
+			space_id is BUF_BUDDY_FREE_STAMP then
+			stamp[0] will contain the
+			buddy block size. */
+	} stamp;
+
+	buf_page_t	bpage;	/*!< Embedded bpage descriptor */
+	UT_LIST_NODE_T(buf_buddy_free_t) list;
+			/*!< Node of zip_free list */
+};
+
+/** @brief The buffer pool statistics structure;
+protected by buf_pool.mutex unless otherwise noted. */
+struct buf_pool_stat_t{
+	/** Initialize the counters */
+	void init() { memset((void*) this, 0, sizeof *this); }
+
+	ib_counter_t<ulint, 64>	n_page_gets;
+				/*!< number of page gets performed;
+				also successful searches through
+				the adaptive hash index are
+				counted as page gets;
+				NOT protected by buf_pool.mutex */
+	ulint	n_pages_read;	/*!< number of read operations */
+	ulint	n_pages_written;/*!< number of write operations */
+	ulint	n_pages_created;/*!< number of pages created
+				in the pool with no read */
+	ulint	n_ra_pages_read_rnd;/*!< number of pages read in
+				as part of random read ahead */
+	ulint	n_ra_pages_read;/*!< number of pages read in
+				as part of read ahead */
+	ulint	n_ra_pages_evicted;/*!< number of read ahead
+				pages that are evicted without
+				being accessed */
+	ulint	n_pages_made_young; /*!< number of pages made young, in
+				buf_page_make_young() */
+	ulint	n_pages_not_made_young; /*!< number of pages not made
+				young because the first access
+				was not long enough ago, in
+				buf_page_peek_if_too_old() */
+	/** number of waits for eviction */
+	ulint	LRU_waits;
+	ulint	LRU_bytes;	/*!< LRU size in bytes */
+};
+
+/** Statistics of buddy blocks of a given size. */
+struct buf_buddy_stat_t {
+	/** Number of blocks allocated from the buddy system. */
+	ulint		used;
+	/** Number of blocks relocated by the buddy system. */
+	ib_uint64_t	relocated;
+	/** Total duration of block relocations, in microseconds. */
+	ib_uint64_t	relocated_usec;
+};
+
+/** The buffer pool */
+class buf_pool_t
+{
+  /** A chunk of buffers */
+  struct chunk_t
+  {
+    /** number of elements in blocks[] */
+    size_t size;
+    /** memory allocated for the page frames */
+    unsigned char *mem;
+    /** descriptor of mem */
+    ut_new_pfx_t mem_pfx;
+    /** array of buffer control blocks */
+    buf_block_t *blocks;
+
+    /** Map of first page frame address to chunks[] */
+    using map= std::map<const byte*, chunk_t*, std::less<const byte*>,
+                        ut_allocator<std::pair<const byte* const,chunk_t*>>>;
+    /** Chunk map that may be under construction by buf_resize_thread() */
+    static map *map_reg;
+    /** Current chunk map for lookup only */
+    static map *map_ref;
+
+    /** @return the memory size, in bytes
*/
+    size_t mem_size() const { return mem_pfx.m_size; }
+
+    /** Register the chunk */
+    void reg() { map_reg->emplace(map::value_type(blocks->page.frame, this)); }
+
+    /** Allocate a chunk of buffer frames.
+    @param bytes requested size
+    @return whether the allocation succeeded */
+    inline bool create(size_t bytes);
+
+#ifdef UNIV_DEBUG
+    /** Find a block that points to a ROW_FORMAT=COMPRESSED page
+    @param data pointer to the start of a ROW_FORMAT=COMPRESSED page frame
+    @return the block
+    @retval nullptr if not found */
+    const buf_block_t *contains_zip(const void *data) const
+    {
+      const buf_block_t *block= blocks;
+      for (auto i= size; i--; block++)
+        if (block->page.zip.data == data)
+          return block;
+      return nullptr;
+    }
+
+    /** Check that all blocks are in a replaceable state.
+    @return address of a non-free block
+    @retval nullptr if all freed */
+    inline const buf_block_t *not_freed() const;
+#endif /* UNIV_DEBUG */
+  };
+public:
+  /** Hash cell chain in page_hash_table */
+  struct hash_chain
+  {
+    /** pointer to the first block */
+    buf_page_t *first;
+  };
+private:
+  /** Withdraw blocks from the buffer pool until meeting withdraw_target.
+  @return whether retry is needed */
+  inline bool withdraw_blocks();
+
+  /** Determine if a pointer belongs to a buf_block_t. It can be a pointer to
+  the buf_block_t itself or a member of it.
+  @param ptr a pointer that will not be dereferenced
+  @return whether the ptr belongs to a buf_block_t struct */
+  bool is_block_field(const void *ptr) const
+  {
+    const chunk_t *chunk= chunks;
+    const chunk_t *const echunk= chunk + ut_min(n_chunks, n_chunks_new);
+
+    /* TODO: protect chunks with a mutex (the older pointer will
+    currently remain during resize()) */
+    for (; chunk < echunk; chunk++)
+      if (ptr >= reinterpret_cast<const void*>(chunk->blocks) &&
+          ptr < reinterpret_cast<const void*>(chunk->blocks + chunk->size))
+        return true;
+    return false;
+  }
+
+  /** Try to reallocate a control block.
+  @param block control block to reallocate
+  @return whether the reallocation succeeded */
+  inline bool realloc(buf_block_t *block);
+
+public:
+  bool is_initialised() const { return chunks != nullptr; }
+
+  /** Create the buffer pool.
+  @return whether the creation failed */
+  bool create();
+
+  /** Clean up after successful create() */
+  void close();
+
+  /** Resize from srv_buf_pool_old_size to srv_buf_pool_size. */
+  inline void resize();
+
+  /** @return whether resize() is in progress */
+  bool resize_in_progress() const
+  {
+    return UNIV_UNLIKELY(resizing.load(std::memory_order_relaxed));
+  }
+
+  /** @return the current size in blocks */
+  size_t get_n_pages() const
+  {
+    ut_ad(is_initialised());
+    size_t size= 0;
+    for (auto j= ut_min(n_chunks_new, n_chunks); j--; )
+      size+= chunks[j].size;
+    return size;
+  }
+
+  /** Determine whether a frame is intended to be withdrawn during resize().
+  @param ptr pointer within a buf_page_t::frame
+  @return whether the frame will be withdrawn */
+  bool will_be_withdrawn(const byte *ptr) const
+  {
+    ut_ad(n_chunks_new < n_chunks);
+#ifdef SAFE_MUTEX
+    if (resize_in_progress())
+      mysql_mutex_assert_owner(&mutex);
+#endif /* SAFE_MUTEX */
+
+    for (const chunk_t *chunk= chunks + n_chunks_new,
+         * const echunk= chunks + n_chunks;
+         chunk != echunk; chunk++)
+      if (ptr >= chunk->blocks->page.frame &&
+          ptr < (chunk->blocks + chunk->size - 1)->page.frame + srv_page_size)
+        return true;
+    return false;
+  }
+
+  /** Determine whether a block is intended to be withdrawn during resize().
+  @param bpage buffer pool block
+  @return whether the frame will be withdrawn */
+  bool will_be_withdrawn(const buf_page_t &bpage) const
+  {
+    ut_ad(n_chunks_new < n_chunks);
+#ifdef SAFE_MUTEX
+    if (resize_in_progress())
+      mysql_mutex_assert_owner(&mutex);
+#endif /* SAFE_MUTEX */
+
+    for (const chunk_t *chunk= chunks + n_chunks_new,
+         * const echunk= chunks + n_chunks;
+         chunk != echunk; chunk++)
+      if (&bpage >= &chunk->blocks->page &&
+          &bpage < &chunk->blocks[chunk->size].page)
+        return true;
+    return false;
+  }
+
+  /** Release and evict a corrupted page.
+  @param bpage x-latched page that was found corrupted
+  @param state expected current state of the page */
+  ATTRIBUTE_COLD void corrupted_evict(buf_page_t *bpage, uint32_t state);
+
+  /** Release a memory block to the buffer pool. */
+  ATTRIBUTE_COLD void free_block(buf_block_t *block);
+
+#ifdef UNIV_DEBUG
+  /** Find a block that points to a ROW_FORMAT=COMPRESSED page
+  @param data pointer to the start of a ROW_FORMAT=COMPRESSED page frame
+  @return the block
+  @retval nullptr if not found */
+  const buf_block_t *contains_zip(const void *data) const
+  {
+    mysql_mutex_assert_owner(&mutex);
+    for (const chunk_t *chunk= chunks, * const end= chunks + n_chunks;
+         chunk != end; chunk++)
+      if (const buf_block_t *block= chunk->contains_zip(data))
+        return block;
+    return nullptr;
+  }
+
+  /** Assert that all buffer pool pages are in a replaceable state */
+  void assert_all_freed();
+#endif /* UNIV_DEBUG */
+
+#ifdef BTR_CUR_HASH_ADAPT
+  /** Clear the adaptive hash index on all pages in the buffer pool. */
+  inline void clear_hash_index();
+
+  /** Get a buffer block from an adaptive hash index pointer.
+  This function does not return if the block is not identified.
+  @param ptr pointer to within a page frame
+  @return pointer to block, never NULL */
+  inline buf_block_t *block_from_ahi(const byte *ptr) const;
+#endif /* BTR_CUR_HASH_ADAPT */
+
+  /**
+  @return the smallest oldest_modification lsn for any page
+  @retval empty_lsn if all modified persistent pages have been flushed */
+  lsn_t get_oldest_modification(lsn_t empty_lsn)
+  {
+    mysql_mutex_assert_owner(&flush_list_mutex);
+    while (buf_page_t *bpage= UT_LIST_GET_LAST(flush_list))
+    {
+      ut_ad(!fsp_is_system_temporary(bpage->id().space()));
+      lsn_t lsn= bpage->oldest_modification();
+      if (lsn != 1)
+      {
+        ut_ad(lsn > 2);
+        return lsn;
+      }
+      delete_from_flush_list(bpage);
+    }
+    return empty_lsn;
+  }
+
+  /** Determine if a buffer block was created by chunk_t::create().
+  @param block block descriptor (not dereferenced)
+  @return whether block has been created by chunk_t::create() */
+  bool is_uncompressed(const buf_block_t *block) const
+  {
+    return is_block_field(reinterpret_cast<const void*>(block));
+  }
+
+public:
+  /** @return whether the buffer pool contains a page
+  @tparam allow_watch whether to allow watch_is_sentinel()
+  @param page_id page identifier
+  @param chain hash table chain for page_id.fold() */
+  template<bool allow_watch= false>
+  TRANSACTIONAL_INLINE
+  bool page_hash_contains(const page_id_t page_id, hash_chain &chain)
+  {
+    transactional_shared_lock_guard<page_hash_latch> g
+      {page_hash.lock_get(chain)};
+    buf_page_t *bpage= page_hash.get(page_id, chain);
+    if (bpage >= &watch[0] && bpage < &watch[UT_ARR_SIZE(watch)])
+    {
+      ut_ad(!bpage->in_zip_hash);
+      ut_ad(!bpage->zip.data);
+      if (!allow_watch)
+        bpage= nullptr;
+    }
+    return bpage;
+  }
+
+  /** Determine if a block is a sentinel for a buffer pool watch.
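+  An illustrative sketch (editorial, not upstream code) of the watch
+  lifecycle that these sentinels support, using only watch_set(),
+  watch_occurred() and watch_unset() as declared below:
+  @code
+  buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(id.fold());
+  if (!buf_pool.watch_set(id, chain))
+  {
+    // ... the page may be read into the buffer pool meanwhile ...
+    const bool read_in= buf_pool.watch_occurred(id); // hypothetical use
+    buf_pool.watch_unset(id, chain);
+  }
+  @endcode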
+  @param bpage page descriptor
+  @return whether bpage is a sentinel for a buffer pool watch */
+  bool watch_is_sentinel(const buf_page_t &bpage)
+  {
+#ifdef SAFE_MUTEX
+    DBUG_ASSERT(mysql_mutex_is_owner(&mutex) ||
+                page_hash.lock_get(page_hash.cell_get(bpage.id().fold())).
+                is_locked());
+#endif /* SAFE_MUTEX */
+    ut_ad(bpage.in_file());
+    if (&bpage < &watch[0] || &bpage >= &watch[array_elements(watch)])
+      return false;
+    ut_ad(!bpage.in_zip_hash);
+    ut_ad(!bpage.zip.data);
+    return true;
+  }
+
+  /** Check if a watched page has been read.
+  This may only be called after !watch_set() and before invoking watch_unset().
+  @param id page identifier
+  @return whether the page was read to the buffer pool */
+  TRANSACTIONAL_INLINE
+  bool watch_occurred(const page_id_t id)
+  {
+    hash_chain &chain= page_hash.cell_get(id.fold());
+    transactional_shared_lock_guard<page_hash_latch> g
+      {page_hash.lock_get(chain)};
+    /* The page must exist because watch_set() increments buf_fix_count. */
+    return !watch_is_sentinel(*page_hash.get(id, chain));
+  }
+
+  /** Register a watch for a page identifier.
+  @param id page identifier
+  @param chain page_hash.cell_get(id.fold())
+  @return a buffer page corresponding to id
+  @retval nullptr if the block was not present in page_hash */
+  buf_page_t *watch_set(const page_id_t id, hash_chain &chain);
+
+  /** Stop watching whether a page has been read in.
+  watch_set(id) must have returned nullptr before.
+  @param id page identifier
+  @param chain unlocked hash table chain */
+  void watch_unset(const page_id_t id, hash_chain &chain);
+
+  /** Remove the sentinel block for the watch before replacing it with a
+  real block. watch_unset() or watch_occurred() will notice
+  that the block has been replaced with the real block.
+  @param w sentinel
+  @param chain locked hash table chain
+  @return w->state() */
+  inline uint32_t watch_remove(buf_page_t *w, hash_chain &chain);
+
+  /** @return whether less than 1/4 of the buffer pool is available */
+  TPOOL_SUPPRESS_TSAN
+  bool running_out() const
+  {
+    return !recv_recovery_is_on() &&
+      UT_LIST_GET_LEN(free) + UT_LIST_GET_LEN(LRU) <
+      n_chunks_new / 4 * chunks->size;
+  }
+
+  /** @return whether the buffer pool has run out */
+  TPOOL_SUPPRESS_TSAN
+  bool ran_out() const
+  { return UNIV_UNLIKELY(!try_LRU_scan || !UT_LIST_GET_LEN(free)); }
+
+  /** @return whether the buffer pool is shrinking */
+  inline bool is_shrinking() const
+  {
+    return n_chunks_new < n_chunks;
+  }
+
+#ifdef UNIV_DEBUG
+  /** Validate the buffer pool. */
+  void validate();
+#endif /* UNIV_DEBUG */
+#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG
+  /** Write information of the buf_pool to the error log. */
+  void print();
+#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG */
+
+  /** Remove a block from the LRU list.
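+  For illustration (editorial sketch, not upstream code; assumes bpage is
+  a valid LRU-resident page and that the caller holds buf_pool.mutex):
+  @code
+  mysql_mutex_lock(&buf_pool.mutex);
+  buf_page_t *prev= buf_pool.LRU_remove(bpage);
+  // ... free or relink bpage; a tail-first scan may continue from prev ...
+  mysql_mutex_unlock(&buf_pool.mutex);
+  @endcode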
+  @return the predecessor in the LRU list */
+  buf_page_t *LRU_remove(buf_page_t *bpage)
+  {
+    mysql_mutex_assert_owner(&mutex);
+    ut_ad(bpage->in_LRU_list);
+    ut_ad(bpage->in_page_hash);
+    ut_ad(!bpage->in_zip_hash);
+    ut_ad(bpage->in_file());
+    lru_hp.adjust(bpage);
+    lru_scan_itr.adjust(bpage);
+    ut_d(bpage->in_LRU_list= false);
+    buf_page_t *prev= UT_LIST_GET_PREV(LRU, bpage);
+    UT_LIST_REMOVE(LRU, bpage);
+    return prev;
+  }
+
+  /** Number of pages to read ahead */
+  static constexpr uint32_t READ_AHEAD_PAGES= 64;
+
+  /** Buffer pool mutex */
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t mutex;
+  /** current statistics; protected by mutex */
+  buf_pool_stat_t stat;
+  /** old statistics; protected by mutex */
+  buf_pool_stat_t old_stat;
+
+  /** @name General fields */
+  /* @{ */
+  ulint curr_pool_size;  /*!< Current pool size in bytes */
+  ulint LRU_old_ratio;   /*!< Reserve this much of the buffer
+                         pool for "old" blocks */
+#ifdef UNIV_DEBUG
+  ulint buddy_n_frames;  /*!< Number of frames allocated from
+                         the buffer pool to the buddy system */
+  ulint mutex_exit_forbidden; /*!< Forbid release mutex */
+#endif
+  ut_allocator<unsigned char> allocator; /*!< Allocator used for
+                         allocating memory for the "chunks"
+                         member. */
+  ulint n_chunks;        /*!< number of buffer pool chunks */
+  ulint n_chunks_new;    /*!< new number of buffer pool chunks.
+                         both n_chunks{,new} are protected under
+                         mutex */
+  chunk_t* chunks;       /*!< buffer pool chunks */
+  chunk_t* chunks_old;   /*!< old buffer pool chunks to be freed
+                         after resizing buffer pool */
+  /** current pool size in pages */
+  Atomic_counter<ulint> curr_size;
+  /** read-ahead request size in pages */
+  Atomic_counter<uint32_t> read_ahead_area;
+
+  /** Hash table with singly-linked overflow lists */
+  struct page_hash_table
+  {
+    static_assert(CPU_LEVEL1_DCACHE_LINESIZE >= 64, "less than 64 bytes");
+    static_assert(!(CPU_LEVEL1_DCACHE_LINESIZE & 63),
+                  "not a multiple of 64 bytes");
+
+    /** Number of array[] elements per page_hash_latch.
+    Must be one less than a power of 2. */
+    static constexpr size_t ELEMENTS_PER_LATCH= 64 / sizeof(void*) - 1;
+    static constexpr size_t EMPTY_SLOTS_PER_LATCH=
+      ((CPU_LEVEL1_DCACHE_LINESIZE / 64) - 1) * (64 / sizeof(void*));
+
+    /** number of payload elements in array[] */
+    Atomic_relaxed<ulint> n_cells;
+    /** the hash table, with pad(n_cells) elements, aligned to L1 cache size */
+    hash_chain *array;
+
+    /** Create the hash table.
+    @param n the lower bound of n_cells */
+    void create(ulint n);
+
+    /** Free the hash table.
*/
+    void free() { aligned_free(array); array= nullptr; }
+
+    /** @return the index of an array element */
+    ulint calc_hash(ulint fold) const { return calc_hash(fold, n_cells); }
+    /** @return raw array index converted to padded index */
+    static ulint pad(ulint h)
+    {
+      ulint latches= h / ELEMENTS_PER_LATCH;
+      ulint empty_slots= latches * EMPTY_SLOTS_PER_LATCH;
+      return 1 + latches + empty_slots + h;
+    }
+  private:
+    /** @return the hash value before any ELEMENTS_PER_LATCH padding */
+    static ulint hash(ulint fold, ulint n) { return ut_hash_ulint(fold, n); }
+
+    /** @return the index of an array element */
+    static ulint calc_hash(ulint fold, ulint n_cells)
+    {
+      return pad(hash(fold, n_cells));
+    }
+  public:
+    /** @return the latch covering a hash table chain */
+    static page_hash_latch &lock_get(hash_chain &chain)
+    {
+      static_assert(!((ELEMENTS_PER_LATCH + 1) & ELEMENTS_PER_LATCH),
+                    "must be one less than a power of 2");
+      const size_t addr= reinterpret_cast<size_t>(&chain);
+      ut_ad(addr & (ELEMENTS_PER_LATCH * sizeof chain));
+      return *reinterpret_cast<page_hash_latch*>
+        (addr & ~(ELEMENTS_PER_LATCH * sizeof chain));
+    }
+
+    /** Get a hash table slot. */
+    hash_chain &cell_get(ulint fold) const
+    { return array[calc_hash(fold, n_cells)]; }
+
+    /** Append a block descriptor to a hash bucket chain. */
+    void append(hash_chain &chain, buf_page_t *bpage)
+    {
+      ut_ad(!bpage->in_page_hash);
+      ut_ad(!bpage->hash);
+      ut_d(bpage->in_page_hash= true);
+      buf_page_t **prev= &chain.first;
+      while (*prev)
+      {
+        ut_ad((*prev)->in_page_hash);
+        prev= &(*prev)->hash;
+      }
+      *prev= bpage;
+    }
+
+    /** Remove a block descriptor from a hash bucket chain. */
+    void remove(hash_chain &chain, buf_page_t *bpage)
+    {
+      ut_ad(bpage->in_page_hash);
+      buf_page_t **prev= &chain.first;
+      while (*prev != bpage)
+      {
+        ut_ad((*prev)->in_page_hash);
+        prev= &(*prev)->hash;
+      }
+      *prev= bpage->hash;
+      ut_d(bpage->in_page_hash= false);
+      bpage->hash= nullptr;
+    }
+
+    /** Replace a block descriptor with another. */
+    void replace(hash_chain &chain, buf_page_t *old, buf_page_t *bpage)
+    {
+      ut_ad(old->in_page_hash);
+      ut_ad(bpage->in_page_hash);
+      ut_d(old->in_page_hash= false);
+      ut_ad(bpage->hash == old->hash);
+      old->hash= nullptr;
+      buf_page_t **prev= &chain.first;
+      while (*prev != old)
+      {
+        ut_ad((*prev)->in_page_hash);
+        prev= &(*prev)->hash;
+      }
+      *prev= bpage;
+    }
+
+    /** Look up a page in a hash bucket chain. */
+    inline buf_page_t *get(const page_id_t id, const hash_chain &chain) const;
+
+    /** Exclusively acquire all latches */
+    inline void write_lock_all();
+
+    /** Release all latches */
+    inline void write_unlock_all();
+  };
+
+  /** Hash table of file pages (buf_page_t::in_file() holds),
+  indexed by page_id_t. Protected by both mutex and page_hash.lock_get().
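+  An illustrative shared-latched lookup (editorial sketch; it mirrors the
+  pattern used by page_hash_contains() and watch_occurred() above; the
+  identifiers space_id and page_no are hypothetical):
+  @code
+  const page_id_t id{space_id, page_no};
+  buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(id.fold());
+  transactional_shared_lock_guard<page_hash_latch> g
+    {buf_pool.page_hash.lock_get(chain)};
+  buf_page_t *bpage= buf_pool.page_hash.get(id, chain);
+  @endcode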
*/
+  page_hash_table page_hash;
+
+  /** map of block->frame to buf_block_t blocks that belong
+  to buf_buddy_alloc(); protected by buf_pool.mutex */
+  hash_table_t zip_hash;
+  Atomic_counter<ulint>
+        n_pend_unzip;   /*!< number of pending decompressions */
+
+  time_t last_printout_time;
+                        /*!< when buf_print_io was
+                        last called */
+  buf_buddy_stat_t buddy_stat[BUF_BUDDY_SIZES_MAX + 1];
+                        /*!< Statistics of buddy system,
+                        indexed by block size */
+
+  /* @} */
+
+  /** number of index page splits */
+  Atomic_counter<ulint> pages_split;
+
+  /** @name Page flushing algorithm fields */
+  /* @{ */
+
+  /** mutex protecting flush_list, buf_page_t::set_oldest_modification()
+  and buf_page_t::list pointers when !oldest_modification() */
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t flush_list_mutex;
+  /** "hazard pointer" for flush_list scans; protected by flush_list_mutex */
+  FlushHp flush_hp;
+  /** flush_list size in bytes; protected by flush_list_mutex */
+  ulint flush_list_bytes;
+  /** possibly modified persistent pages (a subset of LRU);
+  os_aio_pending_writes() is approximately COUNT(is_write_fixed()) */
+  UT_LIST_BASE_NODE_T(buf_page_t) flush_list;
+  /** number of blocks ever added to flush_list;
+  sometimes protected by flush_list_mutex */
+  size_t flush_list_requests;
+
+  TPOOL_SUPPRESS_TSAN void add_flush_list_requests(size_t size)
+  { ut_ad(size); flush_list_requests+= size; }
+private:
+  static constexpr unsigned PAGE_CLEANER_IDLE= 1;
+  static constexpr unsigned FLUSH_LIST_ACTIVE= 2;
+  static constexpr unsigned LRU_FLUSH= 4;
+
+  /** Number of pending LRU flush * LRU_FLUSH +
+  PAGE_CLEANER_IDLE + FLUSH_LIST_ACTIVE flags */
+  unsigned page_cleaner_status;
+  /** track server activity count for signaling idle flushing */
+  ulint last_activity_count;
+public:
+  /** signalled to wake up the page_cleaner; protected by flush_list_mutex */
+  pthread_cond_t do_flush_list;
+  /** broadcast when !n_flush(); protected by flush_list_mutex */
+  pthread_cond_t done_flush_LRU;
+  /** broadcast when a batch completes; protected by flush_list_mutex */
+  pthread_cond_t done_flush_list;
+
+  /** @return number of pending LRU flush */
+  unsigned n_flush() const
+  {
+    mysql_mutex_assert_owner(&flush_list_mutex);
+    return page_cleaner_status / LRU_FLUSH;
+  }
+
+  /** Increment the number of pending LRU flush */
+  inline void n_flush_inc();
+
+  /** Decrement the number of pending LRU flush */
+  inline void n_flush_dec();
+
+  /** Decrement the number of pending LRU flush
+  while holding flush_list_mutex */
+  inline void n_flush_dec_holding_mutex();
+
+  /** @return whether flush_list flushing is active */
+  bool flush_list_active() const
+  {
+    mysql_mutex_assert_owner(&flush_list_mutex);
+    return page_cleaner_status & FLUSH_LIST_ACTIVE;
+  }
+
+  void flush_list_set_active()
+  {
+    ut_ad(!flush_list_active());
+    page_cleaner_status+= FLUSH_LIST_ACTIVE;
+  }
+  void flush_list_set_inactive()
+  {
+    ut_ad(flush_list_active());
+    page_cleaner_status-= FLUSH_LIST_ACTIVE;
+  }
+
+  /** @return whether the page cleaner must sleep due to being idle */
+  bool page_cleaner_idle() const noexcept
+  {
+    mysql_mutex_assert_owner(&flush_list_mutex);
+    return page_cleaner_status & PAGE_CLEANER_IDLE;
+  }
+
+  /** @return whether the page cleaner may be initiating writes */
+  bool page_cleaner_active() const
+  {
+    mysql_mutex_assert_owner(&flush_list_mutex);
+    static_assert(PAGE_CLEANER_IDLE == 1, "efficiency");
+    return page_cleaner_status > PAGE_CLEANER_IDLE;
+  }
+
+  /** Wake up the page cleaner if needed.
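+  For illustration (editorial sketch, not upstream code): a caller that
+  has just added dirty pages might nudge an idle cleaner; this assumes the
+  caller holds flush_list_mutex, as page_cleaner_idle() requires:
+  @code
+  mysql_mutex_lock(&buf_pool.flush_list_mutex);
+  if (buf_pool.page_cleaner_idle())
+    buf_pool.page_cleaner_wakeup();
+  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+  @endcode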
+  @param for_LRU whether to wake up for LRU eviction */
+  void page_cleaner_wakeup(bool for_LRU= false);
+
+  /** Register whether an explicit wakeup of the page cleaner is needed */
+  void page_cleaner_set_idle(bool deep_sleep)
+  {
+    mysql_mutex_assert_owner(&flush_list_mutex);
+    page_cleaner_status= (page_cleaner_status & ~PAGE_CLEANER_IDLE) |
+      (PAGE_CLEANER_IDLE * deep_sleep);
+  }
+
+  /** Update server last activity count */
+  void update_last_activity_count(ulint activity_count)
+  {
+    mysql_mutex_assert_owner(&flush_list_mutex);
+    last_activity_count= activity_count;
+  }
+
+  unsigned freed_page_clock;/*!< a sequence number used
+                        to count the number of buffer
+                        blocks removed from the end of
+                        the LRU list; NOTE that this
+                        counter may wrap around at 4
+                        billion! A thread is allowed
+                        to read this for heuristic
+                        purposes without holding any
+                        mutex or latch */
+  /** Cleared when buf_LRU_get_free_block() fails.
+  Set whenever the free list grows, along with a broadcast of done_free.
+  Protected by buf_pool.mutex. */
+  Atomic_relaxed<bool> try_LRU_scan;
+  /* @} */
+
+  /** @name LRU replacement algorithm fields */
+  /* @{ */
+
+  UT_LIST_BASE_NODE_T(buf_page_t) free;
+                        /*!< base node of the free
+                        block list */
+  /** broadcast each time when the free list grows or try_LRU_scan is set;
+  protected by mutex */
+  pthread_cond_t done_free;
+
+  UT_LIST_BASE_NODE_T(buf_page_t) withdraw;
+                        /*!< base node of the withdraw
+                        block list. It is only used while
+                        the buffer pool is being shrunk;
+                        the withdrawn blocks will be
+                        removed, not reused */
+
+  ulint withdraw_target;/*!< target length of withdraw
+                        block list, when withdrawing */
+
+  /** "hazard pointer" used during scan of LRU while doing
+  LRU list batch. Protected by buf_pool_t::mutex. */
+  LRUHp lru_hp;
+
+  /** Iterator used to scan the LRU list when searching for
+  a replaceable victim. Protected by buf_pool_t::mutex. */
+  LRUItr lru_scan_itr;
+
+  UT_LIST_BASE_NODE_T(buf_page_t) LRU;
+                        /*!< base node of the LRU list */
+
+  buf_page_t* LRU_old;  /*!< pointer to the about
+                        LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV
+                        oldest blocks in the LRU list;
+                        NULL if LRU length less than
+                        BUF_LRU_OLD_MIN_LEN;
+                        NOTE: when LRU_old != NULL, its length
+                        should always equal LRU_old_len */
+  ulint LRU_old_len;    /*!< length of the LRU list from
+                        the block to which LRU_old points
+                        onward, including that block;
+                        see buf0lru.cc for the restrictions
+                        on this value; 0 if LRU_old == NULL;
+                        NOTE: LRU_old_len must be adjusted
+                        whenever LRU_old shrinks or grows! */
+
+  UT_LIST_BASE_NODE_T(buf_block_t) unzip_LRU;
+                        /*!< base node of the
+                        unzip_LRU list */
+
+  /* @} */
+  /** free ROW_FORMAT=COMPRESSED page frames */
+  UT_LIST_BASE_NODE_T(buf_buddy_free_t) zip_free[BUF_BUDDY_SIZES_MAX];
+#if BUF_BUDDY_LOW > UNIV_ZIP_SIZE_MIN
+# error "BUF_BUDDY_LOW > UNIV_ZIP_SIZE_MIN"
+#endif
+
+  /** Sentinels to detect if pages are read into the buffer pool while
+  a delete-buffering operation is pending. Protected by mutex. */
+  buf_page_t watch[innodb_purge_threads_MAX + 1];
+  /** Reserve a buffer. */
+  buf_tmp_buffer_t *io_buf_reserve() { return io_buf.reserve(); }
+
+  /** Remove a block from flush_list.
+  @param bpage buffer pool page */
+  void delete_from_flush_list(buf_page_t *bpage) noexcept;
+
+  /** Prepare to insert a modified block into flush_list.
+  @param lsn start LSN of the mini-transaction
+  @return insert position for insert_into_flush_list() */
+  inline buf_page_t *prepare_insert_into_flush_list(lsn_t lsn) noexcept;
+
+  /** Insert a modified block into the flush list.
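+  Illustrative call sequence for the two-step interface (editorial sketch;
+  it assumes the caller serializes on flush_list_mutex, and start_lsn and
+  block stand for a hypothetical mini-transaction's values):
+  @code
+  mysql_mutex_lock(&buf_pool.flush_list_mutex);
+  buf_page_t *prev= buf_pool.prepare_insert_into_flush_list(start_lsn);
+  buf_pool.insert_into_flush_list(prev, block, start_lsn);
+  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+  @endcode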
+  @param prev insert position (from prepare_insert_into_flush_list())
+  @param block modified block
+  @param lsn start LSN of the mini-transaction that modified the block */
+  inline void insert_into_flush_list(buf_page_t *prev, buf_block_t *block,
+                                     lsn_t lsn) noexcept;
+
+  /** Free a page whose underlying file page has been freed. */
+  ATTRIBUTE_COLD void release_freed_page(buf_page_t *bpage) noexcept;
+
+private:
+  /** Temporary memory for page_compressed and encrypted I/O */
+  struct io_buf_t
+  {
+    /** number of elements in slots[] */
+    ulint n_slots;
+    /** array of slots */
+    buf_tmp_buffer_t *slots;
+
+    void create(ulint n_slots);
+
+    void close();
+
+    /** Reserve a buffer */
+    buf_tmp_buffer_t *reserve();
+  } io_buf;
+
+  /** whether resize() is in the critical path */
+  std::atomic<bool> resizing;
+};
+
+/** The InnoDB buffer pool */
+extern buf_pool_t buf_pool;
+
+inline buf_page_t *buf_pool_t::page_hash_table::get(const page_id_t id,
+                                                    const hash_chain &chain)
+  const
+{
+#ifdef SAFE_MUTEX
+  DBUG_ASSERT(mysql_mutex_is_owner(&buf_pool.mutex) ||
+              lock_get(const_cast<hash_chain&>(chain)).is_locked());
+#endif /* SAFE_MUTEX */
+  for (buf_page_t *bpage= chain.first; bpage; bpage= bpage->hash)
+  {
+    ut_ad(bpage->in_page_hash);
+    ut_ad(bpage->in_file());
+    if (bpage->id() == id)
+      return bpage;
+  }
+  return nullptr;
+}
+
+#ifdef SUX_LOCK_GENERIC
+inline void page_hash_latch::lock_shared()
+{
+  mysql_mutex_assert_not_owner(&buf_pool.mutex);
+  if (!read_trylock())
+    read_lock_wait();
+}
+
+inline void page_hash_latch::lock()
+{
+  if (!write_trylock())
+    write_lock_wait();
+}
+#endif /* SUX_LOCK_GENERIC */
+
+inline void buf_page_t::set_state(uint32_t s)
+{
+  mysql_mutex_assert_owner(&buf_pool.mutex);
+  ut_ad(s <= REMOVE_HASH || s >= UNFIXED);
+  ut_ad(s < WRITE_FIX);
+  ut_ad(s <= READ_FIX || zip.fix == READ_FIX);
+  zip.fix= s;
+}
+
+inline void buf_page_t::set_corrupt_id()
+{
+#ifdef UNIV_DEBUG
+  switch (oldest_modification()) {
+  case 0:
+    break;
+  case 2:
+    ut_ad(fsp_is_system_temporary(id().space()));
+    /* buf_LRU_block_free_non_file_page() asserts !oldest_modification() */
+    ut_d(oldest_modification_= 0;)
+    break;
+  default:
+    ut_ad("block is dirty" == 0);
+  }
+  const auto f= state();
+  if (f != REMOVE_HASH)
+  {
+    ut_ad(f >= UNFIXED);
+    ut_ad(buf_pool.page_hash.lock_get(buf_pool.page_hash.cell_get(id_.fold())).
+          is_write_locked());
+  }
+#endif
+  id_.set_corrupted();
+}
+
+/** Set oldest_modification when adding to buf_pool.flush_list */
+inline void buf_page_t::set_oldest_modification(lsn_t lsn)
+{
+  mysql_mutex_assert_owner(&buf_pool.flush_list_mutex);
+  ut_ad(oldest_modification() <= 1);
+  ut_ad(lsn > 2);
+  oldest_modification_= lsn;
+}
+
+/** Clear oldest_modification after removing from buf_pool.flush_list */
+inline void buf_page_t::clear_oldest_modification()
+{
+#ifdef SAFE_MUTEX
+  if (oldest_modification() != 2)
+    mysql_mutex_assert_owner(&buf_pool.flush_list_mutex);
+#endif /* SAFE_MUTEX */
+  ut_d(const auto s= state());
+  ut_ad(s >= REMOVE_HASH);
+  ut_ad(oldest_modification());
+  ut_ad(!list.prev);
+  ut_ad(!list.next);
+  /* We must use release memory order to guarantee that callers of
+  oldest_modification_acquire() will observe the block as
+  being detached from buf_pool.flush_list, after reading the value 0. */
+  oldest_modification_.store(0, std::memory_order_release);
+}
+
+/** @return whether the block can be relocated in memory.
+The block can be dirty, but it must not be I/O-fixed or bufferfixed.
*/
+inline bool buf_page_t::can_relocate() const
+{
+  mysql_mutex_assert_owner(&buf_pool.mutex);
+  const auto f= state();
+  ut_ad(f >= FREED);
+  ut_ad(in_LRU_list);
+  return (f == FREED || (f < READ_FIX && !(f & ~LRU_MASK))) &&
+    !lock.is_locked_or_waiting();
+}
+
+/** @return whether the block has been flagged old in buf_pool.LRU */
+inline bool buf_page_t::is_old() const
+{
+  mysql_mutex_assert_owner(&buf_pool.mutex);
+  ut_ad(in_file());
+  ut_ad(in_LRU_list);
+  return old;
+}
+
+/** Set whether a block is old in buf_pool.LRU */
+inline void buf_page_t::set_old(bool old)
+{
+  mysql_mutex_assert_owner(&buf_pool.mutex);
+  ut_ad(in_LRU_list);
+
+#ifdef UNIV_LRU_DEBUG
+  ut_a((buf_pool.LRU_old_len == 0) == (buf_pool.LRU_old == nullptr));
+  /* If a block is flagged "old", the LRU_old list must exist. */
+  ut_a(!old || buf_pool.LRU_old);
+
+  if (UT_LIST_GET_PREV(LRU, this) && UT_LIST_GET_NEXT(LRU, this))
+  {
+    const buf_page_t *prev= UT_LIST_GET_PREV(LRU, this);
+    const buf_page_t *next = UT_LIST_GET_NEXT(LRU, this);
+    if (prev->old == next->old)
+      ut_a(prev->old == old);
+    else
+    {
+      ut_a(!prev->old);
+      ut_a(buf_pool.LRU_old == (old ? this : next));
+    }
+  }
+#endif /* UNIV_LRU_DEBUG */
+
+  this->old= old;
+}
+
+#ifdef UNIV_DEBUG
+/** Forbid the release of the buffer pool mutex. */
+# define buf_pool_mutex_exit_forbid() do { \
+  mysql_mutex_assert_owner(&buf_pool.mutex); \
+  buf_pool.mutex_exit_forbidden++; \
+} while (0)
+/** Allow the release of the buffer pool mutex. */
+# define buf_pool_mutex_exit_allow() do { \
+  mysql_mutex_assert_owner(&buf_pool.mutex); \
+  ut_ad(buf_pool.mutex_exit_forbidden--); \
+} while (0)
+#else
+/** Forbid the release of the buffer pool mutex. */
+# define buf_pool_mutex_exit_forbid() ((void) 0)
+/** Allow the release of the buffer pool mutex. */
+# define buf_pool_mutex_exit_allow() ((void) 0)
+#endif
+
+/**********************************************************************
+Let us list the consistency conditions for different control block states.
+
+NOT_USED: is in free list, not LRU, not flush_list, nor page_hash
+MEMORY: is not in any of free, LRU, flush_list, page_hash
+in_file(): is not in free list, is in LRU list, id() is defined,
+	is in page_hash (not necessarily if is_read_fixed())
+
+	is in buf_pool.flush_list, if and only
+	if oldest_modification == 1 || oldest_modification > 2
+
+	(1) if is_write_fixed(): is u-locked
+	(2) if is_read_fixed(): is x-locked
+
+State transitions:
+
+NOT_USED => MEMORY
+MEMORY => NOT_USED
+MEMORY => UNFIXED
+UNFIXED => in_file()
+in_file() => UNFIXED or FREED
+UNFIXED or FREED => REMOVE_HASH
+REMOVE_HASH => NOT_USED (if and only if !oldest_modification())
+*/
+
+/** Select from where to start a scan. If we have scanned
+too deep into the LRU list, it resets the value to the tail
+of the LRU list.
+@return buf_page_t from where to start the scan. */
+inline buf_page_t *LRUItr::start()
+{
+  mysql_mutex_assert_owner(m_mutex);
+
+  if (!m_hp || m_hp->old)
+    m_hp= UT_LIST_GET_LAST(buf_pool.LRU);
+
+  return m_hp;
+}
+
+#ifdef UNIV_DEBUG
+/** Functor to validate the LRU list. */
+struct CheckInLRUList {
+  void operator()(const buf_page_t* elem) const
+  {
+    ut_a(elem->in_LRU_list);
+  }
+
+  static void validate()
+  {
+    ut_list_validate(buf_pool.LRU, CheckInLRUList());
+  }
+};
+
+/** Functor to validate the free list.
*/
+struct CheckInFreeList {
+  void operator()(const buf_page_t* elem) const
+  {
+    ut_a(elem->in_free_list);
+  }
+
+  static void validate()
+  {
+    ut_list_validate(buf_pool.free, CheckInFreeList());
+  }
+};
+
+struct CheckUnzipLRUAndLRUList {
+  void operator()(const buf_block_t* elem) const
+  {
+    ut_a(elem->page.in_LRU_list);
+    ut_a(elem->in_unzip_LRU_list);
+  }
+
+  static void validate()
+  {
+    ut_list_validate(buf_pool.unzip_LRU,
+                     CheckUnzipLRUAndLRUList());
+  }
+};
+#endif /* UNIV_DEBUG */
+
+#include "buf0buf.inl"
+
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/include/buf0buf.inl b/storage/innobase/include/buf0buf.inl
new file mode 100644
index 00000000..b3158cf1
--- /dev/null
+++ b/storage/innobase/include/buf0buf.inl
@@ -0,0 +1,132 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2014, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0buf.inl
+The database buffer buf_pool
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "buf0lru.h"
+
+/** Determine if a block is still close enough to the MRU end of the LRU list,
+meaning that it is not in danger of getting evicted and also implying
+that it has been accessed recently.
+The page must be either buffer-fixed, or its page hash must be locked.
+@param[in]	bpage		buffer pool page
+@return whether bpage is close to MRU end of LRU */
+inline bool buf_page_peek_if_young(const buf_page_t *bpage)
+{
+	/* FIXME: bpage->freed_page_clock is 31 bits */
+	return((buf_pool.freed_page_clock & ((1UL << 31) - 1))
+	       < (bpage->freed_page_clock
+		  + (buf_pool.curr_size
+		     * (BUF_LRU_OLD_RATIO_DIV - buf_pool.LRU_old_ratio)
+		     / (BUF_LRU_OLD_RATIO_DIV * 4))));
+}
+
+/** Determine if a block should be moved to the start of the LRU list if
+there is danger of dropping from the buffer pool.
+@param[in]	bpage		buffer pool page
+@return true if bpage should be made younger */
+inline bool buf_page_peek_if_too_old(const buf_page_t *bpage)
+{
+	if (buf_pool.freed_page_clock == 0) {
+		/* If eviction has not started yet, do not update the
+		statistics or move blocks in the LRU list.  This is
+		either the warm-up phase or an in-memory workload. */
+		return(FALSE);
+	} else if (buf_LRU_old_threshold_ms && bpage->old) {
+		uint32_t access_time = bpage->is_accessed();
+
+		/* It is possible that the below comparison returns an
+		unexpected result. 2^32 milliseconds pass in about 50 days,
+		so if the difference between ut_time_ms() and access_time
+		is e.g. 50 days + 15 ms, then the below will behave as if
+		it is 15 ms.
This is known and fixing it would require
+		increasing buf_page_t::access_time from 32 to 64 bits. */
+		if (access_time
+		    && ((ib_uint32_t) (ut_time_ms() - access_time))
+		    >= buf_LRU_old_threshold_ms) {
+			return(TRUE);
+		}
+
+		buf_pool.stat.n_pages_not_made_young++;
+		return false;
+	} else {
+		return !buf_page_peek_if_young(bpage);
+	}
+}
+
+/** Allocate a buffer block.
+@return own: the allocated block, in state BUF_BLOCK_MEMORY */
+inline buf_block_t *buf_block_alloc()
+{
+	return buf_LRU_get_free_block(false);
+}
+
+/********************************************************************//**
+Frees a buffer block which does not contain a file page. */
+UNIV_INLINE
+void
+buf_block_free(
+/*===========*/
+	buf_block_t*	block)	/*!< in, own: block to be freed */
+{
+	mysql_mutex_lock(&buf_pool.mutex);
+	buf_LRU_block_free_non_file_page(block);
+	mysql_mutex_unlock(&buf_pool.mutex);
+}
+
+/********************************************************************//**
+Increments the modify clock of a frame by 1. The caller must (1) own the
+buf_pool mutex and block bufferfix count has to be zero, (2) or own an x-lock
+on the block. */
+UNIV_INLINE
+void
+buf_block_modify_clock_inc(
+/*=======================*/
+	buf_block_t*	block)	/*!< in: block */
+{
+#ifdef SAFE_MUTEX
+	ut_ad((mysql_mutex_is_owner(&buf_pool.mutex)
+	       && !block->page.buf_fix_count())
+	      || block->page.lock.have_u_or_x());
+#else /* SAFE_MUTEX */
+	ut_ad(!block->page.buf_fix_count() || block->page.lock.have_u_or_x());
+#endif /* SAFE_MUTEX */
+	assert_block_ahi_valid(block);
+
+	block->modify_clock++;
+}
+
+/********************************************************************//**
+Returns the value of the modify clock. The caller must have an s-lock
+or x-lock on the block.
+@return value */
+UNIV_INLINE
+ib_uint64_t
+buf_block_get_modify_clock(
+/*=======================*/
+	buf_block_t*	block)	/*!< in: block */
+{
+	ut_ad(block->page.lock.have_any());
+	return(block->modify_clock);
+}
diff --git a/storage/innobase/include/buf0checksum.h b/storage/innobase/include/buf0checksum.h
new file mode 100644
index 00000000..d9f03177
--- /dev/null
+++ b/storage/innobase/include/buf0checksum.h
@@ -0,0 +1,57 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0checksum.h
+Buffer pool checksum functions, also linked from /extra/innochecksum.cc
+
+Created Aug 11, 2011 Vasil Dimov
+*******************************************************/
+
+#pragma once
+#include "buf0types.h"
+
+/** Calculate the CRC32 checksum of a page.
The value is stored to the page +when it is written to a file and also checked for a match when reading from +the file. Note that we must be careful to calculate the same value on all +architectures. +@param[in] page buffer page (srv_page_size bytes) +@return CRC-32C */ +uint32_t buf_calc_page_crc32(const byte* page); + +#ifndef UNIV_INNOCHECKSUM +/** Calculate a checksum which is stored to the page when it is written +to a file. Note that we must be careful to calculate the same value on +32-bit and 64-bit architectures. +@param[in] page file page (srv_page_size bytes) +@return checksum */ +uint32_t +buf_calc_page_new_checksum(const byte* page); + +/** In MySQL before 4.0.14 or 4.1.1 there was an InnoDB bug that +the checksum only looked at the first few bytes of the page. +This calculates that old checksum. +NOTE: we must first store the new formula checksum to +FIL_PAGE_SPACE_OR_CHKSUM before calculating and storing this old checksum +because this takes that field as an input! +@param[in] page file page (srv_page_size bytes) +@return checksum */ +uint32_t +buf_calc_page_old_checksum(const byte* page); +#endif /* !UNIV_INNOCHECKSUM */ diff --git a/storage/innobase/include/buf0dblwr.h b/storage/innobase/include/buf0dblwr.h new file mode 100644 index 00000000..9932b0e5 --- /dev/null +++ b/storage/innobase/include/buf0dblwr.h @@ -0,0 +1,164 @@ +/***************************************************************************** + +Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/buf0dblwr.h +Doublewrite buffer module + +Created 2011/12/19 Inaam Rana +*******************************************************/ + +#pragma once + +#include "os0file.h" +#include "buf0types.h" + +/** Doublewrite control struct */ +class buf_dblwr_t +{ + struct element + { + /** asynchronous write request */ + IORequest request; + /** payload size in bytes */ + size_t size; + }; + + struct slot + { + /** first free position in write_buf measured in units of + * srv_page_size */ + ulint first_free; + /** number of slots reserved for the current write batch */ + ulint reserved; + /** the doublewrite buffer, aligned to srv_page_size */ + byte* write_buf; + /** buffer blocks to be written via write_buf */ + element* buf_block_arr; + }; + + /** the page number of the first doublewrite block (block_size() pages) */ + page_id_t block1{0, 0}; + /** the page number of the second doublewrite block (block_size() pages) */ + page_id_t block2{0, 0}; + + /** mutex protecting the data members below */ + mysql_mutex_t mutex; + /** condition variable for !batch_running */ + pthread_cond_t cond; + /** whether a batch is being written from the doublewrite buffer */ + bool batch_running; + /** number of expected flush_buffered_writes_completed() calls */ + unsigned flushing_buffered_writes; + /** number of flush_buffered_writes_completed() calls */ + ulint writes_completed; + /** number of pages written by flush_buffered_writes_completed() */ + ulint pages_written; + + slot slots[2]; + slot *active_slot; + + /** Initialise the persistent storage of the doublewrite buffer. + @param header doublewrite page header in the TRX_SYS page */ + inline void init(const byte *header); + + /** Flush possible buffered writes to persistent storage. */ + bool flush_buffered_writes(const ulint size); + +public: + /** Initialise the doublewrite buffer data structures. */ + void init(); + /** Create or restore the doublewrite buffer in the TRX_SYS page. + @return whether the operation succeeded */ + bool create(); + /** Free the doublewrite buffer. */ + void close(); + + /** Acquire the mutex */ + void lock() { mysql_mutex_lock(&mutex); } + /** @return the number of completed batches */ + ulint batches() const + { mysql_mutex_assert_owner(&mutex); return writes_completed; } + /** @return the number of final pages written */ + ulint written() const + { mysql_mutex_assert_owner(&mutex); return pages_written; } + /** Release the mutex */ + void unlock() { mysql_mutex_unlock(&mutex); } + + /** Initialize the doublewrite buffer memory structure on recovery. + If we are upgrading from a version before MySQL 4.1, then this + function performs the necessary update operations to support + innodb_file_per_table. If we are in a crash recovery, this function + loads the pages from double write buffer into memory. + @param file File handle + @param path Path name of file + @return DB_SUCCESS or error code */ + dberr_t init_or_load_pages(pfs_os_file_t file, const char *path); + + /** Process and remove the double write buffer pages for all tablespaces. */ + void recover(); + + /** Update the doublewrite buffer on data page write completion. 
*/
+  void write_completed();
+  /** Flush possible buffered writes to persistent storage.
+  It is very important to call this function after a batch of writes has been
+  posted, and also when we may have to wait for a page latch!
+  Otherwise a deadlock of threads can occur. */
+  void flush_buffered_writes();
+  /** Update the doublewrite buffer on write batch completion
+  @param request the completed batch write request */
+  void flush_buffered_writes_completed(const IORequest &request);
+
+  /** Size of the doublewrite block in pages */
+  uint32_t block_size() const { return FSP_EXTENT_SIZE; }
+
+  /** Schedule a page write. If the doublewrite memory buffer is full,
+  flush_buffered_writes() will be invoked to make space.
+  @param request asynchronous write request
+  @param size payload size in bytes */
+  void add_to_batch(const IORequest &request, size_t size);
+
+  /** Determine whether the doublewrite buffer has been created */
+  bool is_created() const
+  { return UNIV_LIKELY(block1 != page_id_t(0, 0)); }
+
+  /** @return whether a page identifier is part of the doublewrite buffer */
+  bool is_inside(const page_id_t id) const
+  {
+    if (!is_created())
+      return false;
+    ut_ad(block1 < block2);
+    if (id < block1)
+      return false;
+    const uint32_t size= block_size();
+    return id < block1 + size || (id >= block2 && id < block2 + size);
+  }
+
+  /** Wait for flush_buffered_writes() to be fully completed */
+  void wait_flush_buffered_writes()
+  {
+    mysql_mutex_lock(&mutex);
+    while (batch_running)
+      my_cond_wait(&cond, &mutex.m_mutex);
+    mysql_mutex_unlock(&mutex);
+  }
+};
+
+/** The doublewrite buffer */
+extern buf_dblwr_t buf_dblwr;
diff --git a/storage/innobase/include/buf0dump.h b/storage/innobase/include/buf0dump.h
new file mode 100644
index 00000000..48586900
--- /dev/null
+++ b/storage/innobase/include/buf0dump.h
@@ -0,0 +1,44 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0dump.h
+Implements a buffer pool dump/load.
+
+Created April 08, 2011 Vasil Dimov
+*******************************************************/
+
+#ifndef buf0dump_h
+#define buf0dump_h
+
+/** Start the buffer pool dump/load task and instruct it to start a dump. */
+void buf_dump_start();
+/** Start the buffer pool dump/load task and instruct it to start a load. */
+void buf_load_start();
+
+/** Abort a currently running buffer pool load.
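+(Editorial illustration, not part of the upstream patch: these entry points
+are driven by the server layer, roughly
+    buf_dump_start();  // e.g. SET GLOBAL innodb_buffer_pool_dump_now=ON
+    buf_load_start();  // e.g. SET GLOBAL innodb_buffer_pool_load_now=ON
+    buf_load_abort();  // e.g. SET GLOBAL innodb_buffer_pool_load_abort=ON
+the system-variable mapping shown is an assumption for illustration.)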
*/
+void buf_load_abort();
+
+/** Start async buffer pool load, if srv_buffer_pool_load_at_startup was set. */
+void buf_load_at_startup();
+
+/** Wait for currently running loads/dumps to finish. */
+void buf_load_dump_end();
+
+#endif /* buf0dump_h */
diff --git a/storage/innobase/include/buf0flu.h b/storage/innobase/include/buf0flu.h
new file mode 100644
index 00000000..0cce514b
--- /dev/null
+++ b/storage/innobase/include/buf0flu.h
@@ -0,0 +1,125 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2014, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0flu.h
+The database buffer pool flush algorithm
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "ut0byte.h"
+#include "log0log.h"
+#include "buf0buf.h"
+
+/** Number of pages flushed via LRU. Protected by buf_pool.mutex.
+Also included in buf_pool.stat.n_pages_written. */
+extern ulint buf_lru_flush_page_count;
+/** Number of pages freed without flushing. Protected by buf_pool.mutex. */
+extern ulint buf_lru_freed_page_count;
+
+/** Flag indicating if the page_cleaner is in active state. */
+extern Atomic_relaxed<bool> buf_page_cleaner_is_active;
+
+/** Remove all dirty pages belonging to a given tablespace when we are
+deleting the data file of that tablespace.
+The pages still remain a part of LRU and are evicted from
+the list as they age towards the tail of the LRU.
+@param id tablespace identifier */
+void buf_flush_remove_pages(uint32_t id);
+
+/*******************************************************************//**
+Relocates a buffer control block on the flush_list.
+Note that it is assumed that the contents of bpage have already been
+copied to dpage. */
+ATTRIBUTE_COLD
+void
+buf_flush_relocate_on_flush_list(
+/*=============================*/
+	buf_page_t*	bpage,	/*!< in/out: control block being moved */
+	buf_page_t*	dpage);	/*!< in/out: destination block */
+
+/** Complete write of a file page from buf_pool.
+@param request write request
+@param error whether the write may have failed */
+void buf_page_write_complete(const IORequest &request, bool error);
+
+/** Assign the full crc32 checksum for non-compressed page.
+@param[in,out]	page	page to be updated */
+void buf_flush_assign_full_crc32_checksum(byte* page);
+
+/** Initialize a page for writing to the tablespace.
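+(Editorial illustration, not part of the upstream patch: a writer would
+typically prepare the frame just before submitting the I/O, e.g.
+    buf_flush_init_for_writing(block, block->page.frame, nullptr,
+                               space->full_crc32());
+where space->full_crc32() is assumed to report whether the tablespace
+uses the full_crc32 checksum format.)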
+@param[in]	block		buffer block; NULL if bypassing the buffer pool
+@param[in,out]	page		page frame
+@param[in,out]	page_zip_	compressed page, or NULL if uncompressed
+@param[in]	use_full_checksum	whether tablespace uses full checksum */
+void
+buf_flush_init_for_writing(
+	const buf_block_t*	block,
+	byte*			page,
+	void*			page_zip_,
+	bool			use_full_checksum);
+
+/** Try to flush dirty pages that belong to a given tablespace.
+@param space tablespace
+@param n_flushed number of pages written
+@return whether the flush for some pages might not have been initiated */
+bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed= nullptr)
+  MY_ATTRIBUTE((warn_unused_result));
+
+/** Write out dirty blocks from buf_pool.LRU,
+and move clean blocks to buf_pool.free.
+The caller must invoke buf_dblwr.flush_buffered_writes()
+after releasing buf_pool.mutex.
+@param max_n wished maximum number of blocks flushed
+@param evict whether to evict pages after flushing
+@return evict ? number of processed pages : number of pages written
+@retval 0 if a buf_pool.LRU batch is already running */
+ulint buf_flush_LRU(ulint max_n, bool evict);
+
+/** Wait until an LRU flush batch ends. */
+void buf_flush_wait_LRU_batch_end();
+/** Wait until all persistent pages are flushed up to a limit.
+@param sync_lsn buf_pool.get_oldest_modification(LSN_MAX) to wait for */
+ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn);
+/** Initiate more eager page flushing if the log checkpoint age is too old.
+@param lsn buf_pool.get_oldest_modification(LSN_MAX) target
+@param furious true=furious flushing, false=limit to innodb_io_capacity */
+ATTRIBUTE_COLD void buf_flush_ahead(lsn_t lsn, bool furious);
+
+/** Initialize page_cleaner. */
+ATTRIBUTE_COLD void buf_flush_page_cleaner_init();
+
+/** Flush the buffer pool on shutdown. */
+ATTRIBUTE_COLD void buf_flush_buffer_pool();
+
+#ifdef UNIV_DEBUG
+/** Validate the flush list. */
+void buf_flush_validate();
+#endif /* UNIV_DEBUG */
+
+/** Synchronously flush dirty blocks during recv_sys_t::apply().
+NOTE: The calling thread is not allowed to hold any buffer page latches! */
+void buf_flush_sync_batch(lsn_t lsn);
+
+/** Synchronously flush dirty blocks.
+NOTE: The calling thread is not allowed to hold any buffer page latches! */
+void buf_flush_sync();
diff --git a/storage/innobase/include/buf0lru.h b/storage/innobase/include/buf0lru.h
new file mode 100644
index 00000000..aec08e77
--- /dev/null
+++ b/storage/innobase/include/buf0lru.h
@@ -0,0 +1,193 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0lru.h
+The database buffer pool LRU replacement algorithm
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "buf0types.h"
+#include "hash0hash.h"
+
+// Forward declaration
+struct trx_t;
+struct fil_space_t;
+
+/** Flush this many pages in buf_LRU_get_free_block() */
+extern size_t innodb_lru_flush_size;
+
+/*#######################################################################
+These are low-level functions
+#########################################################################*/
+
+/** Minimum LRU list length for which the LRU_old pointer is defined */
+#define BUF_LRU_OLD_MIN_LEN	512	/* 8 megabytes of 16k pages */
+
+/** Try to free a block. If bpage is a descriptor of a compressed-only
+ROW_FORMAT=COMPRESSED page, the buf_page_t object will be freed as well.
+The caller must hold buf_pool.mutex.
+@param bpage block to be freed
+@param zip whether to remove both copies of a ROW_FORMAT=COMPRESSED page
+@retval true if freed and buf_pool.mutex may have been temporarily released
+@retval false if the page was not freed */
+bool buf_LRU_free_page(buf_page_t *bpage, bool zip)
+  MY_ATTRIBUTE((nonnull));
+
+/** Try to free a replaceable block.
+@param limit maximum number of blocks to scan
+@return true if found and freed */
+bool buf_LRU_scan_and_free_block(ulint limit= ULINT_UNDEFINED);
+
+/** @return a buffer block from the buf_pool.free list
+@retval NULL if the free list is empty */
+buf_block_t* buf_LRU_get_free_only();
+
+/** Get a block from the buf_pool.free list.
+If the list is empty, blocks will be moved from the end of buf_pool.LRU
+to buf_pool.free.
+
+This function is called from a user thread when it needs a clean
+block to read in a page. Note that we only ever get a block from
+the free list. Even when we flush a page or find a page in LRU scan
+we put it to free list to be used.
+* iteration 0:
+  * get a block from the buf_pool.free list, success:done
+  * if buf_pool.try_LRU_scan is set
+    * scan LRU up to 100 pages to free a clean block
+    * success:retry the free list
+  * flush up to innodb_lru_flush_size LRU blocks to data files
+    (until UT_LIST_GET_LEN(buf_pool.free) < innodb_lru_scan_depth)
+    * on buf_page_write_complete() the blocks will be put on the
+      buf_pool.free list
+    * success: retry the free list
+* subsequent iterations: same as iteration 0 except:
+  * scan whole LRU list
+  * scan LRU list even if buf_pool.try_LRU_scan is not set
+
+@param have_mutex whether buf_pool.mutex is already being held
+@return the free control block, in state BUF_BLOCK_MEMORY */
+buf_block_t* buf_LRU_get_free_block(bool have_mutex)
+  MY_ATTRIBUTE((malloc,warn_unused_result));
+
+/** @return whether the unzip_LRU list should be used for evicting a victim
+instead of the general LRU list */
+bool buf_LRU_evict_from_unzip_LRU();
+
+/** Puts a block back to the free list.
+@param[in]	block	block; not containing a file page */
+void
+buf_LRU_block_free_non_file_page(buf_block_t* block);
+/******************************************************************//**
+Adds a block to the LRU list.
Please make sure that the page_size is +already set when invoking the function, so that we can get correct +page_size from the buffer page when adding a block into LRU */ +void +buf_LRU_add_block( +/*==============*/ + buf_page_t* bpage, /*!< in: control block */ + bool old); /*!< in: true if should be put to the old + blocks in the LRU list, else put to the + start; if the LRU list is very short, added to + the start regardless of this parameter */ +/******************************************************************//** +Adds a block to the LRU list of decompressed zip pages. */ +void +buf_unzip_LRU_add_block( +/*====================*/ + buf_block_t* block, /*!< in: control block */ + ibool old); /*!< in: TRUE if should be put to the end + of the list, else put to the start */ + +/** Update buf_pool.LRU_old_ratio. +@param[in] old_pct Reserve this percentage of + the buffer pool for "old" blocks +@param[in] adjust true=adjust the LRU list; + false=just assign buf_pool.LRU_old_ratio + during the initialization of InnoDB +@return updated old_pct */ +uint buf_LRU_old_ratio_update(uint old_pct, bool adjust); +/********************************************************************//** +Update the historical stats that we are collecting for LRU eviction +policy at the end of each interval. */ +void +buf_LRU_stat_update(); + +#ifdef UNIV_DEBUG +/** Validate the LRU list. */ +void buf_LRU_validate(); +#endif /* UNIV_DEBUG */ +#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG +/** Dump the LRU list to stderr. */ +void buf_LRU_print(); +#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG */ + +/** @name Heuristics for detecting index scan @{ */ +/** The denominator of buf_pool.LRU_old_ratio. */ +#define BUF_LRU_OLD_RATIO_DIV 1024 +/** Maximum value of buf_pool.LRU_old_ratio. +@see buf_LRU_old_adjust_len +@see buf_pool.LRU_old_ratio_update */ +#define BUF_LRU_OLD_RATIO_MAX BUF_LRU_OLD_RATIO_DIV +/** Minimum value of buf_pool.LRU_old_ratio. +@see buf_LRU_old_adjust_len +@see buf_pool.LRU_old_ratio_update +The minimum must exceed +(BUF_LRU_OLD_TOLERANCE + 5) * BUF_LRU_OLD_RATIO_DIV / BUF_LRU_OLD_MIN_LEN. */ +#define BUF_LRU_OLD_RATIO_MIN 51 + +#if BUF_LRU_OLD_RATIO_MIN >= BUF_LRU_OLD_RATIO_MAX +# error "BUF_LRU_OLD_RATIO_MIN >= BUF_LRU_OLD_RATIO_MAX" +#endif +#if BUF_LRU_OLD_RATIO_MAX > BUF_LRU_OLD_RATIO_DIV +# error "BUF_LRU_OLD_RATIO_MAX > BUF_LRU_OLD_RATIO_DIV" +#endif + +/** Move blocks to "new" LRU list only if the first access was at +least this many milliseconds ago. Not protected by any mutex or latch. */ +extern uint buf_LRU_old_threshold_ms; +/* @} */ + +/** @brief Statistics for selecting the LRU list for eviction. + +These statistics are not 'of' LRU but 'for' LRU. We keep count of I/O +and page_zip_decompress() operations. Based on the statistics we decide +if we want to evict from buf_pool.unzip_LRU or buf_pool.LRU. */ +struct buf_LRU_stat_t +{ + ulint io; /**< Counter of buffer pool I/O operations. */ + ulint unzip; /**< Counter of page_zip_decompress operations. */ +}; + +/** Current operation counters. Not protected by any mutex. +Cleared by buf_LRU_stat_update(). */ +extern buf_LRU_stat_t buf_LRU_stat_cur; + +/** Running sum of past values of buf_LRU_stat_cur. +Updated by buf_LRU_stat_update(). Protected by buf_pool.mutex. */ +extern buf_LRU_stat_t buf_LRU_stat_sum; + +/********************************************************************//** +Increments the I/O counter in buf_LRU_stat_cur. 
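+(Editorial sketch, not part of the upstream patch: these counters feed the
+unzip_LRU eviction heuristic, roughly
+    buf_LRU_stat_inc_io();     // after a buffer pool I/O operation
+    buf_LRU_stat_inc_unzip();  // after a page_zip_decompress()
+    buf_LRU_stat_update();     // per interval: fold _cur into _sum
+so that buf_LRU_evict_from_unzip_LRU() can weigh recent decompression
+activity against I/O when choosing a victim list.)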
*/ +#define buf_LRU_stat_inc_io() buf_LRU_stat_cur.io++ +/********************************************************************//** +Increments the page_zip_decompress() counter in buf_LRU_stat_cur. */ +#define buf_LRU_stat_inc_unzip() buf_LRU_stat_cur.unzip++ diff --git a/storage/innobase/include/buf0rea.h b/storage/innobase/include/buf0rea.h new file mode 100644 index 00000000..3dd085dd --- /dev/null +++ b/storage/innobase/include/buf0rea.h @@ -0,0 +1,120 @@ +/***************************************************************************** + +Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2015, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/buf0rea.h +The database buffer read + +Created 11/5/1995 Heikki Tuuri +*******************************************************/ + +#ifndef buf0rea_h +#define buf0rea_h + +#include "buf0buf.h" + +/** High-level function which reads a page asynchronously from a file to the +buffer buf_pool if it is not already there. Sets the io_fix flag and sets +an exclusive lock on the buffer frame. The flag is cleared and the x-lock +released by the i/o-handler thread. +@param page_id page id +@param zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@retval DB_SUCCESS if the page was read and is not corrupted +@retval DB_SUCCESS_LOCKED_REC if the page was not read +@retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted +@retval DB_DECRYPTION_FAILED if page post encryption checksum matches but +after decryption normal page checksum does not match. +@retval DB_TABLESPACE_DELETED if tablespace .ibd file is missing */ +dberr_t buf_read_page(const page_id_t page_id, ulint zip_size); + +/** High-level function which reads a page asynchronously from a file to the +buffer buf_pool if it is not already there. Sets the io_fix flag and sets +an exclusive lock on the buffer frame. The flag is cleared and the x-lock +released by the i/o-handler thread. +@param[in,out] space tablespace +@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 */ +void buf_read_page_background(fil_space_t *space, const page_id_t page_id, + ulint zip_size) + MY_ATTRIBUTE((nonnull)); + +/** Applies a random read-ahead in buf_pool if there are at least a threshold +value of accessed pages from the random read-ahead area. Does not read any +page, not even the one at the position (space, offset), if the read-ahead +mechanism is not activated. NOTE 1: the calling thread may own latches on +pages: to avoid deadlocks this function must be written such that it cannot +end up waiting for these latches! 
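+(Editorial illustration, not part of the upstream patch: a page-fetch path
+would issue
+    buf_read_ahead_random(page_id, zip_size, false);
+only at a point where it holds no page latches, for the reason above.)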
NOTE 2: the calling thread must want
+access to the page given: this rule is set to prevent unintended read-aheads
+performed by ibuf routines, a situation which could result in a deadlock if
+the OS does not support asynchronous i/o.
+@param[in]	page_id		page id of a page which the current thread
+wants to access
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	ibuf		whether we are inside ibuf routine
+@return number of page read requests issued; NOTE that if we read ibuf
+pages, it may happen that the page at the given page number does not
+get read even if we return a positive value! */
+ulint
+buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf);
+
+/** Applies linear read-ahead if in the buf_pool the page is a border page of
+a linear read-ahead area and all the pages in the area have been accessed.
+Does not read any page if the read-ahead mechanism is not activated. Note
+that the algorithm looks at the 'natural' adjacent successor and
+predecessor of the page, which on the leaf level of a B-tree are the next
+and previous page in the chain of leaves. To know these, the page specified
+in (space, offset) must already be present in the buf_pool. Thus, the
+natural way to use this function is to call it when a page in the buf_pool
+is accessed the first time, calling this function just after it has been
+bufferfixed.
+NOTE 1: as this function looks at the natural predecessor and successor
+fields on the page, what happens if these are not initialized to any
+sensible value? No problem: before applying read-ahead we check that the
+area to read is within the span of the space; if it is not, read-ahead is
+not applied. An uninitialized value may result in a useless read operation,
+but only with very low probability.
+NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this
+function must be written such that it cannot end up waiting for these
+latches!
+NOTE 3: the calling thread must want access to the page given: this rule is
+set to prevent unintended read-aheads performed by ibuf routines, a situation
+which could result in a deadlock if the OS does not support asynchronous io.
+@param[in]	page_id		page id; see NOTE 3 above
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	ibuf		whether we are inside ibuf routine
+@return number of page read requests issued */
+ulint
+buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf);
+
+/** Schedule a page for recovery.
+@param space    tablespace
+@param page_id  page identifier
+@param recs     log records
+@param init     page initialization, or nullptr if the page needs to be read */
+void buf_read_recover(fil_space_t *space, const page_id_t page_id,
+                      page_recv_t &recs, recv_init *init);
+
+/** @name Modes used in read-ahead @{ */
+/** read only pages belonging to the insert buffer tree */
+#define BUF_READ_IBUF_PAGES_ONLY	131
+/** read any page */
+#define BUF_READ_ANY_PAGE		132
+/* @} */
+
+#endif
diff --git a/storage/innobase/include/buf0types.h b/storage/innobase/include/buf0types.h
new file mode 100644
index 00000000..6c13f5ee
--- /dev/null
+++ b/storage/innobase/include/buf0types.h
@@ -0,0 +1,235 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2015, Oracle and/or its affiliates. All rights reserved.
+Copyright (c) 2019, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0types.h
+The database buffer pool global types for the directory
+
+Created 11/17/1995 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+#include "univ.i"
+
+/** Buffer page (uncompressed or compressed) */
+class buf_page_t;
+/** Buffer block for which an uncompressed page exists */
+struct buf_block_t;
+/** Buffer pool statistics struct */
+struct buf_pool_stat_t;
+/** Buffer pool buddy statistics struct */
+struct buf_buddy_stat_t;
+
+/** A buffer frame. @see page_t */
+typedef	byte	buf_frame_t;
+
+/** Alternatives for srv_checksum_algorithm, which can be changed by
+setting innodb_checksum_algorithm */
+enum srv_checksum_algorithm_t {
+	/** Write crc32; allow full_crc32,crc32,innodb,none when reading */
+	SRV_CHECKSUM_ALGORITHM_CRC32,
+	/** Write crc32; allow full_crc32,crc32 when reading */
+	SRV_CHECKSUM_ALGORITHM_STRICT_CRC32,
+	/** For new files, always compute CRC-32C for the whole page.
+	For old files, allow crc32, innodb or none when reading. */
+	SRV_CHECKSUM_ALGORITHM_FULL_CRC32,
+	/** For new files, always compute CRC-32C for the whole page.
+	For old files, allow crc32 when reading. */
+	SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32
+};
+
+inline bool is_checksum_strict(srv_checksum_algorithm_t algo)
+{
+  return algo == SRV_CHECKSUM_ALGORITHM_STRICT_CRC32;
+}
+
+inline bool is_checksum_strict(ulint algo)
+{
+  return algo == SRV_CHECKSUM_ALGORITHM_STRICT_CRC32;
+}
+
+/** Parameters of binary buddy system for compressed pages (buf0buddy.h) */
+/* @{ */
+/** Zip shift value for the smallest page size */
+#define BUF_BUDDY_LOW_SHIFT	UNIV_ZIP_SIZE_SHIFT_MIN
+
+/** Smallest buddy page size */
+#define BUF_BUDDY_LOW		(1U << BUF_BUDDY_LOW_SHIFT)
+
+/** Actual number of buddy sizes based on current page size */
+#define BUF_BUDDY_SIZES		(srv_page_size_shift - BUF_BUDDY_LOW_SHIFT)
+
+/** Maximum number of buddy sizes based on the max page size */
+#define BUF_BUDDY_SIZES_MAX	(UNIV_PAGE_SIZE_SHIFT_MAX	\
+				- BUF_BUDDY_LOW_SHIFT)
+
+/** twice the maximum block size of the buddy system;
+the underlying memory is aligned by this amount:
+this must be equal to srv_page_size */
+#define BUF_BUDDY_HIGH	(BUF_BUDDY_LOW << BUF_BUDDY_SIZES)
+/* @} */
+
+/** Page identifier. */
+class page_id_t
+{
+public:
+  /** Constructor from (space, page_no).
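+  For example (editorial note): page_id_t(5, 42).raw() is
+  (uint64_t{5} << 32 | 42), so identifiers compare first by tablespace id
+  and then by page number, matching the operators below.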
+  @param space tablespace id
+  @param page_no page number */
+  constexpr page_id_t(uint32_t space, uint32_t page_no) :
+    m_id(uint64_t{space} << 32 | page_no) {}
+
+  constexpr page_id_t(uint64_t id) : m_id(id) {}
+  constexpr bool operator==(const page_id_t& rhs) const
+  { return m_id == rhs.m_id; }
+  constexpr bool operator!=(const page_id_t& rhs) const
+  { return m_id != rhs.m_id; }
+  constexpr bool operator<(const page_id_t& rhs) const
+  { return m_id < rhs.m_id; }
+  constexpr bool operator>(const page_id_t& rhs) const
+  { return m_id > rhs.m_id; }
+  constexpr bool operator<=(const page_id_t& rhs) const
+  { return m_id <= rhs.m_id; }
+  constexpr bool operator>=(const page_id_t& rhs) const
+  { return m_id >= rhs.m_id; }
+  page_id_t &operator--() { ut_ad(page_no()); m_id--; return *this; }
+  page_id_t &operator++()
+  {
+    ut_ad(page_no() < 0xFFFFFFFFU);
+    m_id++;
+    return *this;
+  }
+  page_id_t operator-(uint32_t i) const
+  {
+    ut_ad(page_no() >= i);
+    return page_id_t(m_id - i);
+  }
+  page_id_t operator+(uint32_t i) const
+  {
+    ut_ad(page_no() < ~i);
+    return page_id_t(m_id + i);
+  }
+
+  /** Retrieve the tablespace id.
+  @return tablespace id */
+  constexpr uint32_t space() const { return static_cast<uint32_t>(m_id >> 32); }
+
+  /** Retrieve the page number.
+  @return page number */
+  constexpr uint32_t page_no() const { return static_cast<uint32_t>(m_id); }
+
+  /** Retrieve the fold value.
+  @return fold value */
+  constexpr ulint fold() const
+  { return (ulint{space()} << 20) + space() + page_no(); }
+
+  /** Reset the page number only.
+  @param[in]	page_no	page number */
+  void set_page_no(uint32_t page_no)
+  {
+    m_id= (m_id & ~uint64_t{0} << 32) | page_no;
+  }
+
+  constexpr ulonglong raw() const { return m_id; }
+
+  /** Flag the page identifier as corrupted. */
+  void set_corrupted() { m_id= ~0ULL; }
+
+  /** @return whether the page identifier belongs to a corrupted page */
+  constexpr bool is_corrupted() const { return m_id == ~0ULL; }
+
+private:
+  /** The page identifier */
+  uint64_t m_id;
+};
+
+/** A 64KiB buffer of NUL bytes, for use in assertions and checks,
+and dummy default values of instantly dropped columns.
+Initially, BLOB field references are set to NUL bytes, in
+dtuple_convert_big_rec().
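+(Editorial illustration, not part of the upstream patch: a typical check
+compares a not-yet-written BLOB pointer against this buffer, e.g.
+    !memcmp(ref, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE)
+where ref points at the 20-byte external field reference.)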
*/ +extern const byte *field_ref_zero; + +#ifndef UNIV_INNOCHECKSUM + +/** Latch types */ +enum rw_lock_type_t +{ + RW_S_LATCH= 1 << 0, + RW_X_LATCH= 1 << 1, + RW_SX_LATCH= 1 << 2, + RW_NO_LATCH= 1 << 3 +}; + +#include "sux_lock.h" + +#ifdef SUX_LOCK_GENERIC +class page_hash_latch : private rw_lock +{ + /** Wait for a shared lock */ + void read_lock_wait(); + /** Wait for an exclusive lock */ + void write_lock_wait(); +public: + /** Acquire a shared lock */ + inline void lock_shared(); + /** Acquire an exclusive lock */ + inline void lock(); + + /** @return whether an exclusive lock is being held by any thread */ + bool is_write_locked() const { return rw_lock::is_write_locked(); } + + /** @return whether any lock is being held by any thread */ + bool is_locked() const { return rw_lock::is_locked(); } + /** @return whether any lock is being held or waited for by any thread */ + bool is_locked_or_waiting() const { return rw_lock::is_locked_or_waiting(); } + + /** Release a shared lock */ + void unlock_shared() { read_unlock(); } + /** Release an exclusive lock */ + void unlock() { write_unlock(); } +}; +#elif defined _WIN32 || SIZEOF_SIZE_T >= 8 +class page_hash_latch +{ + srw_spin_lock_low lk; +public: + void lock_shared() { lk.rd_lock(); } + void unlock_shared() { lk.rd_unlock(); } + void lock() { lk.wr_lock(); } + void unlock() { lk.wr_unlock(); } + bool is_write_locked() const { return lk.is_write_locked(); } + bool is_locked() const { return lk.is_locked(); } + bool is_locked_or_waiting() const { return lk.is_locked_or_waiting(); } +}; +#else +class page_hash_latch +{ + srw_spin_mutex lk; +public: + void lock_shared() { lock(); } + void unlock_shared() { unlock(); } + void lock() { lk.wr_lock(); } + void unlock() { lk.wr_unlock(); } + bool is_locked() const { return lk.is_locked(); } + bool is_write_locked() const { return is_locked(); } + bool is_locked_or_waiting() const { return is_locked(); } +}; +#endif + +#endif /* !UNIV_INNOCHECKSUM */ diff --git a/storage/innobase/include/data0data.h b/storage/innobase/include/data0data.h new file mode 100644 index 00000000..a5356e0d --- /dev/null +++ b/storage/innobase/include/data0data.h @@ -0,0 +1,704 @@ +/***************************************************************************** + +Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/data0data.h
+SQL data field and tuple
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifndef data0data_h
+#define data0data_h
+
+#include "data0types.h"
+#include "data0type.h"
+#include "mem0mem.h"
+#include "dict0types.h"
+#include "btr0types.h"
+#include <vector>
+
+#include <ostream>
+
+/** Storage for overflow data in a big record, that is, a clustered
+index record which needs external storage of data fields */
+struct big_rec_t;
+struct upd_t;
+
+/** Dummy variable to catch access to uninitialized fields. In the
+debug version, dtuple_create() will make all fields of dtuple_t point
+to data_error. */
+ut_d(extern byte data_error);
+
+/*********************************************************************//**
+Sets the type struct of SQL data field. */
+UNIV_INLINE
+void
+dfield_set_type(
+/*============*/
+	dfield_t*	field,	/*!< in: SQL data field */
+	const dtype_t*	type);	/*!< in: pointer to data type struct */
+
+/*********************************************************************//**
+Sets length in a field. */
+UNIV_INLINE
+void
+dfield_set_len(
+/*===========*/
+	dfield_t*	field,	/*!< in: field */
+	ulint		len)	/*!< in: length or UNIV_SQL_NULL */
+	MY_ATTRIBUTE((nonnull));
+
+/** Gets spatial status for "external storage"
+@param[in,out]	field		field */
+UNIV_INLINE
+spatial_status_t
+dfield_get_spatial_status(
+	const dfield_t*	field);
+
+/** Sets spatial status for "external storage"
+@param[in,out]	field		field
+@param[in]	spatial_status	spatial status */
+UNIV_INLINE
+void
+dfield_set_spatial_status(
+	dfield_t*	field,
+	spatial_status_t spatial_status);
+
+/*********************************************************************//**
+Sets pointer to the data and length in a field. */
+UNIV_INLINE
+void
+dfield_set_data(
+/*============*/
+	dfield_t*	field,	/*!< in: field */
+	const void*	data,	/*!< in: data */
+	ulint		len)	/*!< in: length or UNIV_SQL_NULL */
+	MY_ATTRIBUTE((nonnull(1)));
+/*********************************************************************//**
+Writes an MBR (minimum bounding rectangle) to a field. */
+UNIV_INLINE
+void
+dfield_write_mbr(
+/*=============*/
+	dfield_t*	field,	/*!< in: field */
+	const double*	mbr)	/*!< in: data */
+	MY_ATTRIBUTE((nonnull(1)));
+/*********************************************************************//**
+Sets a data field to SQL NULL. */
+UNIV_INLINE
+void
+dfield_set_null(
+/*============*/
+	dfield_t*	field)	/*!< in/out: field */
+	MY_ATTRIBUTE((nonnull));
+/**********************************************************************//**
+Writes an SQL null field full of zeros. */
+UNIV_INLINE
+void
+data_write_sql_null(
+/*================*/
+	byte*	data,	/*!< in: pointer to a buffer of size len */
+	ulint	len)	/*!< in: SQL null size in bytes */
+	MY_ATTRIBUTE((nonnull));
+/*********************************************************************//**
+Copies the data and len fields.
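+This is a shallow copy: the pointer, length and flags are copied and the
+bytes stay shared. (Editorial note, not part of the upstream patch: combine
+with dfield_dup() when the copy must own its data, e.g.
+    dfield_copy_data(dst, src);
+    dfield_dup(dst, heap);  // dst now points at a heap-allocated copy
+)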
*/
+UNIV_INLINE
+void
+dfield_copy_data(
+/*=============*/
+	dfield_t*	field1,	/*!< out: field to copy to */
+	const dfield_t*	field2);	/*!< in: field to copy from */
+
+/*********************************************************************//**
+Copies a data field to another. */
+UNIV_INLINE
+void
+dfield_copy(
+/*========*/
+	dfield_t*	field1,	/*!< out: field to copy to */
+	const dfield_t*	field2)	/*!< in: field to copy from */
+	MY_ATTRIBUTE((nonnull));
+/*********************************************************************//**
+Copies the data pointed to by a data field. */
+UNIV_INLINE
+void
+dfield_dup(
+/*=======*/
+	dfield_t*	field,	/*!< in/out: data field */
+	mem_heap_t*	heap)	/*!< in: memory heap where allocated */
+	MY_ATTRIBUTE((nonnull));
+
+/*********************************************************************//**
+Tests if two data fields are equal.
+If len==0, tests the data length and content for equality.
+If len>0, tests the first len bytes of the content for equality.
+@return TRUE if both fields are NULL or if they are equal */
+UNIV_INLINE
+ibool
+dfield_datas_are_binary_equal(
+/*==========================*/
+	const dfield_t*	field1,	/*!< in: field */
+	const dfield_t*	field2,	/*!< in: field */
+	ulint		len)	/*!< in: maximum prefix to compare,
+				or 0 to compare the whole field length */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************************//**
+Tests if dfield data length and content is equal to the given.
+@return TRUE if equal */
+UNIV_INLINE
+ibool
+dfield_data_is_binary_equal(
+/*========================*/
+	const dfield_t*	field,	/*!< in: field */
+	ulint		len,	/*!< in: data length or UNIV_SQL_NULL */
+	const byte*	data)	/*!< in: data */
+	MY_ATTRIBUTE((nonnull(1), warn_unused_result));
+
+/*********************************************************************//**
+Gets info bits in a data tuple.
+@return info bits */
+UNIV_INLINE
+ulint
+dtuple_get_info_bits(
+/*=================*/
+	const dtuple_t*	tuple)	/*!< in: tuple */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************************//**
+Sets info bits in a data tuple. */
+UNIV_INLINE
+void
+dtuple_set_info_bits(
+/*=================*/
+	dtuple_t*	tuple,		/*!< in: tuple */
+	ulint		info_bits)	/*!< in: info bits */
+	MY_ATTRIBUTE((nonnull));
+/*********************************************************************//**
+Gets number of fields used in record comparisons.
+@return number of fields used in comparisons in rem0cmp.* */
+UNIV_INLINE
+ulint
+dtuple_get_n_fields_cmp(
+/*====================*/
+	const dtuple_t*	tuple)	/*!< in: tuple */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************************//**
+Sets number of fields used in record comparisons. */
+UNIV_INLINE
+void
+dtuple_set_n_fields_cmp(
+/*====================*/
+	dtuple_t*	tuple,		/*!< in: tuple */
+	ulint		n_fields_cmp)	/*!< in: number of fields used in
+					comparisons in rem0cmp.* */
+	MY_ATTRIBUTE((nonnull));
+
+/* Estimate the number of bytes that are going to be allocated when
+creating a new dtuple_t object */
+#define DTUPLE_EST_ALLOC(n_fields)	\
+	(sizeof(dtuple_t) + (n_fields) * sizeof(dfield_t))
+
+/** Creates a data tuple from an already allocated chunk of memory.
+The size of the chunk must be at least DTUPLE_EST_ALLOC(n_fields).
+The default value for number of fields used in record comparisons
+for this tuple is n_fields.
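+(Editorial illustration, not part of the upstream patch: this permits
+building a tuple in stack or otherwise preallocated memory, e.g.
+    byte buf[DTUPLE_EST_ALLOC(2)];
+    dtuple_t* tuple = dtuple_create_from_mem(buf, sizeof buf, 2, 0);
+)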
+@param[in,out]	buf		buffer to use
+@param[in]	buf_size	buffer size
+@param[in]	n_fields	number of fields
+@param[in]	n_v_fields	number of virtual column fields
+@return created tuple (inside buf) */
+UNIV_INLINE
+dtuple_t*
+dtuple_create_from_mem(
+	void*	buf,
+	ulint	buf_size,
+	ulint	n_fields,
+	ulint	n_v_fields)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/**********************************************************//**
+Creates a data tuple to a memory heap. The default value for number
+of fields used in record comparisons for this tuple is n_fields.
+@return own: created tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_create(
+/*==========*/
+	mem_heap_t*	heap,	/*!< in: memory heap where the tuple
+				is created, DTUPLE_EST_ALLOC(n_fields)
+				bytes will be allocated from this heap */
+	ulint		n_fields)/*!< in: number of fields */
+	MY_ATTRIBUTE((nonnull, malloc));
+
+/** Initialize the virtual field data in a dtuple_t
+@param[in,out]	vrow	dtuple contains the virtual fields */
+UNIV_INLINE void dtuple_init_v_fld(dtuple_t* vrow);
+
+/** Duplicate the virtual field data in a dtuple_t
+@param[in,out]	vrow	dtuple contains the virtual fields
+@param[in]	heap	heap memory to use */
+UNIV_INLINE void dtuple_dup_v_fld(dtuple_t* vrow, mem_heap_t* heap);
+
+/** Creates a data tuple with possible virtual columns to a memory heap.
+@param[in]	heap		memory heap where the tuple is created
+@param[in]	n_fields	number of fields
+@param[in]	n_v_fields	number of virtual column fields
+@return own: created tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_create_with_vcol(
+	mem_heap_t*	heap,
+	ulint		n_fields,
+	ulint		n_v_fields);
+
+/*********************************************************************//**
+Sets number of fields used in a tuple. Normally this is set in
+dtuple_create, but if you later want to set it smaller, you can use this. */
+void
+dtuple_set_n_fields(
+/*================*/
+	dtuple_t*	tuple,		/*!< in: tuple */
+	ulint		n_fields)	/*!< in: number of fields */
+	MY_ATTRIBUTE((nonnull));
+/** Copies a data tuple's virtual fields to another. This is a shallow copy.
+@param[in,out]	d_tuple		destination tuple
+@param[in]	s_tuple		source tuple */
+UNIV_INLINE
+void
+dtuple_copy_v_fields(
+	dtuple_t*	d_tuple,
+	const dtuple_t*	s_tuple);
+/*********************************************************************//**
+Copies a data tuple to another. This is a shallow copy; if a deep copy
+is desired, dfield_dup() will have to be invoked on each field.
+@return own: copy of tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_copy(
+/*========*/
+	const dtuple_t*	tuple,	/*!< in: tuple to copy from */
+	mem_heap_t*	heap)	/*!< in: memory heap
+				where the tuple is created */
+	MY_ATTRIBUTE((nonnull, malloc));
+/**********************************************************//**
+The following function returns the sum of data lengths of a tuple. The space
+occupied by the field structs or the tuple struct is not counted.
+@return sum of data lens */
+UNIV_INLINE
+ulint
+dtuple_get_data_size(
+/*=================*/
+	const dtuple_t*	tuple,	/*!< in: typed data tuple */
+	ulint		comp)	/*!< in: nonzero=ROW_FORMAT=COMPACT */
+	MY_ATTRIBUTE((nonnull));
+/*********************************************************************//**
+Computes the number of externally stored fields in a data tuple.
+@return number of fields */
+UNIV_INLINE
+ulint
+dtuple_get_n_ext(
+/*=============*/
+	const dtuple_t*	tuple)	/*!< in: tuple */
+	MY_ATTRIBUTE((nonnull));
+/** Fold a prefix given as the number of fields of a tuple.
+@param[in]	tuple		index record
+@param[in]	n_fields	number of complete fields to fold
+@param[in]	n_bytes		number of bytes to fold in the last field
+@param[in]	tree_id		index tree ID
+@return the folded value */
+UNIV_INLINE
+ulint
+dtuple_fold(
+	const dtuple_t*	tuple,
+	ulint		n_fields,
+	ulint		n_bytes,
+	index_id_t	tree_id)
+	MY_ATTRIBUTE((warn_unused_result));
+/*******************************************************************//**
+Sets types of fields binary in a tuple. */
+UNIV_INLINE
+void
+dtuple_set_types_binary(
+/*====================*/
+	dtuple_t*	tuple,	/*!< in: data tuple */
+	ulint		n)	/*!< in: number of fields to set */
+	MY_ATTRIBUTE((nonnull));
+/**********************************************************************//**
+Checks if a dtuple contains an SQL null value.
+@return TRUE if some field is SQL null */
+UNIV_INLINE
+ibool
+dtuple_contains_null(
+/*=================*/
+	const dtuple_t*	tuple)	/*!< in: dtuple */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/**********************************************************//**
+Checks that a data field is typed. Asserts an error if not.
+@return TRUE if ok */
+ibool
+dfield_check_typed(
+/*===============*/
+	const dfield_t*	field)	/*!< in: data field */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/**********************************************************//**
+Checks that a data tuple is typed. Asserts an error if not.
+@return TRUE if ok */
+ibool
+dtuple_check_typed(
+/*===============*/
+	const dtuple_t*	tuple)	/*!< in: tuple */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+#ifdef UNIV_DEBUG
+/**********************************************************//**
+Validates the consistency of a tuple which must be complete, i.e.,
+all fields must have been set.
+@return TRUE if ok */
+ibool
+dtuple_validate(
+/*============*/
+	const dtuple_t*	tuple)	/*!< in: tuple */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+#endif /* UNIV_DEBUG */
+/*************************************************************//**
+Pretty prints a dfield value according to its data type. */
+void
+dfield_print(
+/*=========*/
+	const dfield_t*	dfield)	/*!< in: dfield */
+	MY_ATTRIBUTE((nonnull));
+/*************************************************************//**
+Pretty prints a dfield value according to its data type. Also the hex string
+is printed if a string contains non-printable characters. */
+void
+dfield_print_also_hex(
+/*==================*/
+	const dfield_t*	dfield)	/*!< in: dfield */
+	MY_ATTRIBUTE((nonnull));
+/**********************************************************//**
+The following function prints the contents of a tuple. */
+void
+dtuple_print(
+/*=========*/
+	FILE*		f,	/*!< in: output stream */
+	const dtuple_t*	tuple)	/*!< in: tuple */
+	MY_ATTRIBUTE((nonnull));
+
+/** Print the contents of a tuple.
+@param[out]	o	output stream
+@param[in]	field	array of data fields
+@param[in]	n	number of data fields */
+void
+dfield_print(
+	std::ostream&	o,
+	const dfield_t*	field,
+	ulint		n);
+/** Print the contents of a tuple.
+@param[out]	o	output stream
+@param[in]	tuple	data tuple */
+void
+dtuple_print(
+	std::ostream&	o,
+	const dtuple_t*	tuple);
+
+/** Print the contents of a tuple.
+@param[out] o output stream +@param[in] tuple data tuple */ +inline +std::ostream& +operator<<(std::ostream& o, const dtuple_t& tuple) +{ + dtuple_print(o, &tuple); + return(o); +} + +/**************************************************************//** +Moves parts of long fields in entry to the big record vector so that +the size of tuple drops below the maximum record size allowed in the +database. Moves data only from those fields which are not necessary +to determine uniquely the insertion place of the tuple in the index. +@return own: created big record vector, NULL if we are not able to +shorten the entry enough, i.e., if there are too many fixed-length or +short fields in entry or the index is clustered */ +big_rec_t* +dtuple_convert_big_rec( +/*===================*/ + dict_index_t* index, /*!< in: index */ + upd_t* upd, /*!< in/out: update vector */ + dtuple_t* entry, /*!< in/out: index entry */ + ulint* n_ext) /*!< in/out: number of + externally stored columns */ + MY_ATTRIBUTE((malloc, warn_unused_result)); +/**************************************************************//** +Puts back to entry the data stored in vector. Note that to ensure the +fields in entry can accommodate the data, vector must have been created +from entry with dtuple_convert_big_rec. */ +void +dtuple_convert_back_big_rec( +/*========================*/ + dict_index_t* index, /*!< in: index */ + dtuple_t* entry, /*!< in: entry whose data was put to vector */ + big_rec_t* vector) /*!< in, own: big rec vector; it is + freed in this function */ + MY_ATTRIBUTE((nonnull)); +/**************************************************************//** +Frees the memory in a big rec vector. */ +UNIV_INLINE +void +dtuple_big_rec_free( +/*================*/ + big_rec_t* vector) /*!< in, own: big rec vector; it is + freed in this function */ + MY_ATTRIBUTE((nonnull)); + +/*######################################################################*/ + +/** Structure for an SQL data field */ +struct dfield_t{ + void* data; /*!< pointer to data */ + unsigned ext:1; /*!< TRUE=externally stored, FALSE=local */ + unsigned spatial_status:2; + /*!< spatial status of externally stored field + in undo log for purge */ + unsigned len; /*!< data length; UNIV_SQL_NULL if SQL null */ + dtype_t type; /*!< type of data */ + + /** Create a deep copy of this object. 
+ @param[in,out] heap memory heap in which the clone will be created + @return the cloned object */ + dfield_t* clone(mem_heap_t* heap) const; + + /** @return system field indicates history row */ + bool vers_history_row() const + { + ut_ad(type.vers_sys_end()); + if (type.mtype == DATA_FIXBINARY) { + ut_ad(len == sizeof timestamp_max_bytes); + return 0 != memcmp(data, timestamp_max_bytes, len); + } else { + ut_ad(type.mtype == DATA_INT); + ut_ad(len == sizeof trx_id_max_bytes); + return 0 != memcmp(data, trx_id_max_bytes, len); + } + ut_ad(0); + return false; + } +}; + +/** Structure for an SQL data tuple of fields (logical record) */ +struct dtuple_t { + ulint info_bits; /*!< info bits of an index record: + the default is 0; this field is used + if an index record is built from + a data tuple */ + ulint n_fields; /*!< number of fields in dtuple */ + ulint n_fields_cmp; /*!< number of fields which should + be used in comparison services + of rem0cmp.*; the index search + is performed by comparing only these + fields, others are ignored; the + default value in dtuple creation is + the same value as n_fields */ + dfield_t* fields; /*!< fields */ + ulint n_v_fields; /*!< number of virtual fields */ + dfield_t* v_fields; /*!< fields on virtual column */ +#ifdef UNIV_DEBUG + ulint magic_n; /*!< magic number, used in + debug assertions */ +/** Value of dtuple_t::magic_n */ +# define DATA_TUPLE_MAGIC_N 65478679 +#endif /* UNIV_DEBUG */ + + /** Trim the tail of an index tuple before insert or update. + After instant ADD COLUMN, if the last fields of a clustered index tuple + match the default values that were explicitly specified or implied + during ADD COLUMN, there will be no need to store them. + NOTE: A page latch in the index must be held, so that the index + may not lose 'instantness' before the trimmed tuple has been + inserted or updated. + @param[in] index index possibly with instantly added columns */ + void trim(const dict_index_t& index); + + bool vers_history_row() const + { + for (ulint i = 0; i < n_fields; i++) { + const dfield_t* field = &fields[i]; + if (field->type.vers_sys_end()) { + return field->vers_history_row(); + } + } + return false; + } + + /** + @param info_bits the info_bits of a data tuple + @return whether this is a hidden metadata record + for instant ADD COLUMN or ALTER TABLE */ + static bool is_alter_metadata(ulint info_bits) + { + return UNIV_UNLIKELY(info_bits == REC_INFO_METADATA_ALTER); + } + + /** + @param info_bits the info_bits of a data tuple + @return whether this is a hidden metadata record + for instant ADD COLUMN or ALTER TABLE */ + static bool is_metadata(ulint info_bits) + { + return UNIV_UNLIKELY((info_bits & ~REC_INFO_DELETED_FLAG) + == REC_INFO_METADATA_ADD); + } + + /** @return whether this is a hidden metadata record + for instant ALTER TABLE (not only ADD COLUMN) */ + bool is_alter_metadata() const { return is_alter_metadata(info_bits); } + + /** @return whether this is a hidden metadata record + for instant ADD COLUMN or ALTER TABLE */ + bool is_metadata() const { return is_metadata(info_bits); } + + /** Copy type information from index fields. 
+	@param index	index field to be copied */
+	inline void copy_field_types(const dict_index_t &index);
+};
+
+inline ulint dtuple_get_n_fields(const dtuple_t* tuple)
+{ return tuple->n_fields; }
+inline dtype_t* dfield_get_type(dfield_t* field) { return &field->type; }
+inline const dtype_t* dfield_get_type(const dfield_t* field)
+{ return &field->type; }
+inline void* dfield_get_data(dfield_t* field)
+{
+  ut_ad(field->len == UNIV_SQL_NULL || field->data != &data_error);
+  return field->data;
+}
+inline const void* dfield_get_data(const dfield_t* field)
+{
+  ut_ad(field->len == UNIV_SQL_NULL || field->data != &data_error);
+  return field->data;
+}
+inline ulint dfield_get_len(const dfield_t* field) {
+  ut_ad(field->len == UNIV_SQL_NULL || field->data != &data_error);
+  ut_ad(field->len != UNIV_SQL_DEFAULT);
+  return field->len;
+}
+inline bool dfield_is_null(const dfield_t* field)
+{ return field->len == UNIV_SQL_NULL; }
+/** @return whether a column is to be stored off-page */
+inline bool dfield_is_ext(const dfield_t* field)
+{
+  ut_ad(!field->ext || field->len >= BTR_EXTERN_FIELD_REF_SIZE);
+  return static_cast<bool>(field->ext);
+}
+/** Set the "external storage" flag */
+inline void dfield_set_ext(dfield_t* field) { field->ext = 1; }
+
+/** Gets number of virtual fields in a data tuple.
+@param[in]	tuple	dtuple to check
+@return number of fields */
+inline ulint
+dtuple_get_n_v_fields(const dtuple_t* tuple) { return tuple->n_v_fields; }
+
+inline const dfield_t* dtuple_get_nth_field(const dtuple_t* tuple, ulint n)
+{
+  ut_ad(n < tuple->n_fields);
+  return &tuple->fields[n];
+}
+inline dfield_t* dtuple_get_nth_field(dtuple_t* tuple, ulint n)
+{
+  ut_ad(n < tuple->n_fields);
+  return &tuple->fields[n];
+}
+
+/** Get a virtual column in a table row or an extended clustered index record.
+@param[in]	tuple	tuple
+@param[in]	n	the nth virtual field to get
+@return nth virtual field */
+inline const dfield_t* dtuple_get_nth_v_field(const dtuple_t* tuple, ulint n)
+{
+  ut_ad(n < tuple->n_v_fields);
+  return &tuple->v_fields[n];
+}
+/** Get a virtual column in a table row or an extended clustered index record.
+@param[in]	tuple	tuple
+@param[in]	n	the nth virtual field to get
+@return nth virtual field */
+inline dfield_t* dtuple_get_nth_v_field(dtuple_t* tuple, ulint n)
+{
+  ut_ad(n < tuple->n_v_fields);
+  return &tuple->v_fields[n];
+}
+
+/** A slot for a field in a big rec vector */
+struct big_rec_field_t {
+
+	/** Constructor.
+	@param[in]	field_no_	the field number
+	@param[in]	len_		the data length
+	@param[in]	data_		the data */
+	big_rec_field_t(ulint field_no_, ulint len_, const void* data_)
+		: field_no(field_no_),
+		  len(len_),
+		  data(data_)
+	{}
+
+	ulint		field_no;	/*!< field number in record */
+	ulint		len;		/*!< stored data length, in bytes */
+	const void*	data;		/*!< stored data */
+};
+
+/** Storage format for overflow data in a big record, that is, a
+clustered index record which needs external storage of data fields */
+struct big_rec_t {
+	mem_heap_t*	heap;		/*!< memory heap from which
+					allocated */
+	const ulint	capacity;	/*!< fields array size */
+	ulint		n_fields;	/*!< number of stored fields */
+	big_rec_field_t*fields;		/*!< stored fields */
+
+	/** Constructor.
+	@param[in]	max	the capacity of the array of fields.
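+	(Editorial illustration, not part of the upstream patch: callers
+	normally go through big_rec_t::alloc(), which sizes both the object
+	and its fields array from one heap, e.g.
+	    big_rec_t* vec = big_rec_t::alloc(heap, 3);
+	    vec->append(big_rec_field_t(0, len, data));
+	with heap, len and data supplied by the caller.)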
*/
+	explicit big_rec_t(const ulint max)
+		: heap(0),
+		  capacity(max),
+		  n_fields(0),
+		  fields(0)
+	{}
+
+	/** Append one big_rec_field_t object to the end of array of fields */
+	void append(const big_rec_field_t& field)
+	{
+		ut_ad(n_fields < capacity);
+		fields[n_fields] = field;
+		n_fields++;
+	}
+
+	/** Allocate a big_rec_t object in the given memory heap, and for
+	storing n_fld number of fields.
+	@param[in]	heap	memory heap in which this object is allocated
+	@param[in]	n_fld	maximum number of fields that can be stored in
+			this object
+	@return the allocated object */
+	static big_rec_t* alloc(
+		mem_heap_t*	heap,
+		ulint		n_fld);
+};
+
+#include "data0data.inl"
+
+#endif
diff --git a/storage/innobase/include/data0data.inl b/storage/innobase/include/data0data.inl
new file mode 100644
index 00000000..2d1bf5a2
--- /dev/null
+++ b/storage/innobase/include/data0data.inl
@@ -0,0 +1,633 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/data0data.inl
+SQL data field and tuple
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "ut0rnd.h"
+
+/*********************************************************************//**
+Sets the type struct of SQL data field. */
+UNIV_INLINE
+void
+dfield_set_type(
+/*============*/
+	dfield_t*	field,	/*!< in: SQL data field */
+	const dtype_t*	type)	/*!< in: pointer to data type struct */
+{
+	ut_ad(field != NULL);
+	ut_ad(type != NULL);
+
+	field->type = *type;
+}
+
+/*********************************************************************//**
+Sets length in a field. */
+UNIV_INLINE
+void
+dfield_set_len(
+/*===========*/
+	dfield_t*	field,	/*!< in: field */
+	ulint		len)	/*!< in: length or UNIV_SQL_NULL */
+{
+	ut_ad(len != UNIV_SQL_DEFAULT);
+	field->ext = 0;
+	field->len = static_cast<unsigned>(len);
+}
+
+/** Gets spatial status for "external storage"
+@param[in,out]	field		field */
+UNIV_INLINE
+spatial_status_t
+dfield_get_spatial_status(
+	const dfield_t*	field)
+{
+	ut_ad(dfield_is_ext(field));
+
+	return(static_cast<spatial_status_t>(field->spatial_status));
+}
+
+/** Sets spatial status for "external storage"
+@param[in,out]	field		field
+@param[in]	spatial_status	spatial status */
+UNIV_INLINE
+void
+dfield_set_spatial_status(
+	dfield_t*	field,
+	spatial_status_t	spatial_status)
+{
+	field->spatial_status = spatial_status & 3;
+	ut_ad(dfield_get_spatial_status(field) == spatial_status);
+}
+
+/*********************************************************************//**
+Sets pointer to the data and length in a field.
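+(Editorial illustration, not part of the upstream patch:
+    dfield_set_data(field, "abc", 3);             // assign a value
+    dfield_set_data(field, NULL, UNIV_SQL_NULL);  // same as dfield_set_null()
+)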
+
+/*********************************************************************//**
+Sets pointer to the data and length in a field. */
+UNIV_INLINE
+void
+dfield_set_data(
+/*============*/
+ dfield_t* field, /*!< in: field */
+ const void* data, /*!< in: data */
+ ulint len) /*!< in: length or UNIV_SQL_NULL */
+{
+ field->data = (void*) data;
+ field->ext = 0;
+ field->len = static_cast<unsigned>(len);
+}
+
+/*********************************************************************//**
+Writes a minimum bounding rectangle (MBR) to a data field. */
+UNIV_INLINE
+void
+dfield_write_mbr(
+/*=============*/
+ dfield_t* field, /*!< in: field */
+ const double* mbr) /*!< in: data */
+{
+ MEM_CHECK_DEFINED(mbr, sizeof *mbr);
+ field->ext = 0;
+
+ for (unsigned i = 0; i < SPDIMS * 2; i++) {
+ mach_double_write(static_cast<byte*>(field->data)
+ + i * sizeof(double), mbr[i]);
+ }
+
+ field->len = DATA_MBR_LEN;
+}
+
+/*********************************************************************//**
+Sets a data field to SQL NULL. */
+UNIV_INLINE
+void
+dfield_set_null(
+/*============*/
+ dfield_t* field) /*!< in/out: field */
+{
+ dfield_set_data(field, NULL, UNIV_SQL_NULL);
+}
+
+/*********************************************************************//**
+Copies the data and len fields. */
+UNIV_INLINE
+void
+dfield_copy_data(
+/*=============*/
+ dfield_t* field1, /*!< out: field to copy to */
+ const dfield_t* field2) /*!< in: field to copy from */
+{
+ ut_ad(field1 != NULL);
+ ut_ad(field2 != NULL);
+
+ field1->data = field2->data;
+ field1->len = field2->len;
+ field1->ext = field2->ext;
+ field1->spatial_status = field2->spatial_status;
+}
+
+/*********************************************************************//**
+Copies a data field to another. */
+UNIV_INLINE
+void
+dfield_copy(
+/*========*/
+ dfield_t* field1, /*!< out: field to copy to */
+ const dfield_t* field2) /*!< in: field to copy from */
+{
+ *field1 = *field2;
+}
+
+/*********************************************************************//**
+Copies the data pointed to by a data field. */
+UNIV_INLINE
+void
+dfield_dup(
+/*=======*/
+ dfield_t* field, /*!< in/out: data field */
+ mem_heap_t* heap) /*!< in: memory heap where allocated */
+{
+ if (!dfield_is_null(field)) {
+ MEM_CHECK_DEFINED(field->data, field->len);
+ field->data = mem_heap_dup(heap, field->data, field->len);
+ }
+}
+
+/*********************************************************************//**
+Tests if two data fields are equal.
+If len==0, tests the data length and content for equality.
+If len>0, tests the first len bytes of the content for equality.
+@return TRUE if both fields are NULL or if they are equal */
+UNIV_INLINE
+ibool
+dfield_datas_are_binary_equal(
+/*==========================*/
+ const dfield_t* field1, /*!< in: field */
+ const dfield_t* field2, /*!< in: field */
+ ulint len) /*!< in: maximum prefix to compare,
+ or 0 to compare the whole field length */
+{
+ ulint len2 = len;
+
+ if (field1->len == UNIV_SQL_NULL || len == 0 || field1->len < len) {
+ len = field1->len;
+ }
+
+ if (field2->len == UNIV_SQL_NULL || len2 == 0 || field2->len < len2) {
+ len2 = field2->len;
+ }
+
+ return(len == len2
+ && (len == UNIV_SQL_NULL
+ || !memcmp(field1->data, field2->data, len)));
+}
+
+/*********************************************************************//**
+Tests if dfield data length and content is equal to the given.
+@return TRUE if equal */ +UNIV_INLINE +ibool +dfield_data_is_binary_equal( +/*========================*/ + const dfield_t* field, /*!< in: field */ + ulint len, /*!< in: data length or UNIV_SQL_NULL */ + const byte* data) /*!< in: data */ +{ + ut_ad(len != UNIV_SQL_DEFAULT); + return(len == dfield_get_len(field) + && (!len || len == UNIV_SQL_NULL + || !memcmp(dfield_get_data(field), data, len))); +} + +/*********************************************************************//** +Gets info bits in a data tuple. +@return info bits */ +UNIV_INLINE +ulint +dtuple_get_info_bits( +/*=================*/ + const dtuple_t* tuple) /*!< in: tuple */ +{ + return(tuple->info_bits); +} + +/*********************************************************************//** +Sets info bits in a data tuple. */ +UNIV_INLINE +void +dtuple_set_info_bits( +/*=================*/ + dtuple_t* tuple, /*!< in: tuple */ + ulint info_bits) /*!< in: info bits */ +{ + tuple->info_bits = info_bits; +} + +/*********************************************************************//** +Gets number of fields used in record comparisons. +@return number of fields used in comparisons in rem0cmp.* */ +UNIV_INLINE +ulint +dtuple_get_n_fields_cmp( +/*====================*/ + const dtuple_t* tuple) /*!< in: tuple */ +{ + return(tuple->n_fields_cmp); +} + +/*********************************************************************//** +Sets number of fields used in record comparisons. */ +UNIV_INLINE +void +dtuple_set_n_fields_cmp( +/*====================*/ + dtuple_t* tuple, /*!< in: tuple */ + ulint n_fields_cmp) /*!< in: number of fields used in + comparisons in rem0cmp.* */ +{ + ut_ad(n_fields_cmp <= tuple->n_fields); + tuple->n_fields_cmp = n_fields_cmp; +} + +/** Creates a data tuple from an already allocated chunk of memory. +The size of the chunk must be at least DTUPLE_EST_ALLOC(n_fields). +The default value for number of fields used in record comparisons +for this tuple is n_fields. 
+@param[in,out] buf buffer to use
+@param[in] buf_size buffer size
+@param[in] n_fields number of fields
+@param[in] n_v_fields number of fields on virtual columns
+@return created tuple (inside buf) */
+UNIV_INLINE
+dtuple_t*
+dtuple_create_from_mem(
+ void* buf,
+ ulint buf_size,
+ ulint n_fields,
+ ulint n_v_fields)
+{
+ dtuple_t* tuple;
+ ulint n_t_fields = n_fields + n_v_fields;
+
+ ut_a(buf_size >= DTUPLE_EST_ALLOC(n_t_fields));
+
+ tuple = (dtuple_t*) buf;
+ tuple->info_bits = 0;
+ tuple->n_fields = n_fields;
+ tuple->n_v_fields = n_v_fields;
+ tuple->n_fields_cmp = n_fields;
+ tuple->fields = (dfield_t*) &tuple[1];
+ if (n_v_fields > 0) {
+ tuple->v_fields = &tuple->fields[n_fields];
+ } else {
+ tuple->v_fields = NULL;
+ }
+
+#ifdef UNIV_DEBUG
+ tuple->magic_n = DATA_TUPLE_MAGIC_N;
+
+ { /* In the debug version, initialize fields to an error value */
+ ulint i;
+
+ for (i = 0; i < n_t_fields; i++) {
+ dfield_t* field;
+
+ if (i >= n_fields) {
+ field = dtuple_get_nth_v_field(
+ tuple, i - n_fields);
+ } else {
+ field = dtuple_get_nth_field(tuple, i);
+ }
+
+ dfield_set_len(field, UNIV_SQL_NULL);
+ field->data = &data_error;
+ dfield_get_type(field)->mtype = DATA_ERROR;
+ dfield_get_type(field)->prtype = DATA_ERROR;
+ }
+ }
+#endif
+ MEM_CHECK_ADDRESSABLE(tuple->fields, n_t_fields
+ * sizeof *tuple->fields);
+ MEM_UNDEFINED(tuple->fields, n_t_fields * sizeof *tuple->fields);
+ return(tuple);
+}
+
+/** Duplicate the virtual field data in a dtuple_t
+@param[in,out] vrow dtuple contains the virtual fields
+@param[in,out] heap heap memory to use */
+UNIV_INLINE
+void
+dtuple_dup_v_fld(dtuple_t* vrow, mem_heap_t* heap)
+{
+ for (ulint i = 0; i < vrow->n_v_fields; i++) {
+ dfield_t* dfield = dtuple_get_nth_v_field(vrow, i);
+ dfield_dup(dfield, heap);
+ }
+}
+
+/** Initialize the virtual field data in a dtuple_t
+@param[in,out] vrow dtuple contains the virtual fields */
+UNIV_INLINE
+void
+dtuple_init_v_fld(dtuple_t* vrow)
+{
+ for (ulint i = 0; i < vrow->n_v_fields; i++) {
+ dfield_t* dfield = dtuple_get_nth_v_field(vrow, i);
+ dfield_get_type(dfield)->mtype = DATA_MISSING;
+ dfield_set_len(dfield, UNIV_SQL_NULL);
+ }
+}
+
+/**********************************************************//**
+Creates a data tuple to a memory heap. The default value for number
+of fields used in record comparisons for this tuple is n_fields.
+@return own: created tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_create(
+/*==========*/
+ mem_heap_t* heap, /*!< in: memory heap where the tuple
+ is created, DTUPLE_EST_ALLOC(n_fields)
+ bytes will be allocated from this heap */
+ ulint n_fields) /*!< in: number of fields */
+{
+ return(dtuple_create_with_vcol(heap, n_fields, 0));
+}
+
+/** Creates a data tuple with virtual columns to a memory heap.
+@param[in] heap memory heap where the tuple is created
+@param[in] n_fields number of fields
+@param[in] n_v_fields number of fields on virtual col
+@return own: created tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_create_with_vcol(
+ mem_heap_t* heap,
+ ulint n_fields,
+ ulint n_v_fields)
+{
+ void* buf;
+ ulint buf_size;
+ dtuple_t* tuple;
+
+ ut_ad(heap);
+
+ buf_size = DTUPLE_EST_ALLOC(n_fields + n_v_fields);
+ buf = mem_heap_alloc(heap, buf_size);
+
+ tuple = dtuple_create_from_mem(buf, buf_size, n_fields, n_v_fields);
+
+ return(tuple);
+}
+
+/** Copies a data tuple's virtual fields to another.
This is a shallow copy; +@param[in,out] d_tuple destination tuple +@param[in] s_tuple source tuple */ +UNIV_INLINE +void +dtuple_copy_v_fields( + dtuple_t* d_tuple, + const dtuple_t* s_tuple) +{ + + ulint n_v_fields = dtuple_get_n_v_fields(d_tuple); + ut_ad(n_v_fields == dtuple_get_n_v_fields(s_tuple)); + + for (ulint i = 0; i < n_v_fields; i++) { + dfield_copy(dtuple_get_nth_v_field(d_tuple, i), + dtuple_get_nth_v_field(s_tuple, i)); + } +} + +/*********************************************************************//** +Copies a data tuple to another. This is a shallow copy; if a deep copy +is desired, dfield_dup() will have to be invoked on each field. +@return own: copy of tuple */ +UNIV_INLINE +dtuple_t* +dtuple_copy( +/*========*/ + const dtuple_t* tuple, /*!< in: tuple to copy from */ + mem_heap_t* heap) /*!< in: memory heap + where the tuple is created */ +{ + ulint n_fields = dtuple_get_n_fields(tuple); + ulint n_v_fields = dtuple_get_n_v_fields(tuple); + dtuple_t* new_tuple = dtuple_create_with_vcol( + heap, n_fields, n_v_fields); + ulint i; + + for (i = 0; i < n_fields; i++) { + dfield_copy(dtuple_get_nth_field(new_tuple, i), + dtuple_get_nth_field(tuple, i)); + } + + for (i = 0; i < n_v_fields; i++) { + dfield_copy(dtuple_get_nth_v_field(new_tuple, i), + dtuple_get_nth_v_field(tuple, i)); + } + + return(new_tuple); +} + +/**********************************************************//** +The following function returns the sum of data lengths of a tuple. The space +occupied by the field structs or the tuple struct is not counted. Neither +is possible space in externally stored parts of the field. +@return sum of data lengths */ +UNIV_INLINE +ulint +dtuple_get_data_size( +/*=================*/ + const dtuple_t* tuple, /*!< in: typed data tuple */ + ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */ +{ + const dfield_t* field; + ulint n_fields; + ulint len; + ulint i; + ulint sum = 0; + + ut_ad(dtuple_check_typed(tuple)); + ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N); + + n_fields = tuple->n_fields; + + for (i = 0; i < n_fields; i++) { + field = dtuple_get_nth_field(tuple, i); + len = dfield_get_len(field); + + if (len == UNIV_SQL_NULL) { + len = dtype_get_sql_null_size(dfield_get_type(field), + comp); + } + + sum += len; + } + + return(sum); +} + +/*********************************************************************//** +Computes the number of externally stored fields in a data tuple. +@return number of externally stored fields */ +UNIV_INLINE +ulint +dtuple_get_n_ext( +/*=============*/ + const dtuple_t* tuple) /*!< in: tuple */ +{ + ulint n_ext = 0; + ulint n_fields = tuple->n_fields; + ulint i; + + ut_ad(dtuple_check_typed(tuple)); + ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N); + + for (i = 0; i < n_fields; i++) { + n_ext += dtuple_get_nth_field(tuple, i)->ext; + } + + return(n_ext); +} + +/*******************************************************************//** +Sets types of fields binary in a tuple. */ +UNIV_INLINE +void +dtuple_set_types_binary( +/*====================*/ + dtuple_t* tuple, /*!< in: data tuple */ + ulint n) /*!< in: number of fields to set */ +{ + dtype_t* dfield_type; + ulint i; + + for (i = 0; i < n; i++) { + dfield_type = dfield_get_type(dtuple_get_nth_field(tuple, i)); + dtype_set(dfield_type, DATA_BINARY, 0, 0); + } +} + +/** Fold a prefix given as the number of fields of a tuple. 
+@param[in] tuple index record
+@param[in] n_fields number of complete fields to fold
+@param[in] n_bytes number of bytes to fold in the last field
+@param[in] tree_id index tree ID
+@return the folded value */
+UNIV_INLINE
+ulint
+dtuple_fold(
+ const dtuple_t* tuple,
+ ulint n_fields,
+ ulint n_bytes,
+ index_id_t tree_id)
+{
+ const dfield_t* field;
+ ulint i;
+ const byte* data;
+ ulint len;
+ ulint fold;
+
+ ut_ad(tuple);
+ ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N);
+ ut_ad(dtuple_check_typed(tuple));
+
+ fold = ut_fold_ull(tree_id);
+
+ for (i = 0; i < n_fields; i++) {
+ field = dtuple_get_nth_field(tuple, i);
+
+ data = (const byte*) dfield_get_data(field);
+ len = dfield_get_len(field);
+
+ if (len != UNIV_SQL_NULL) {
+ fold = ut_fold_ulint_pair(fold,
+ ut_fold_binary(data, len));
+ }
+ }
+
+ if (n_bytes > 0) {
+ field = dtuple_get_nth_field(tuple, i);
+
+ data = (const byte*) dfield_get_data(field);
+ len = dfield_get_len(field);
+
+ if (len != UNIV_SQL_NULL) {
+ if (len > n_bytes) {
+ len = n_bytes;
+ }
+
+ fold = ut_fold_ulint_pair(fold,
+ ut_fold_binary(data, len));
+ }
+ }
+
+ return(fold);
+}
+
+/**********************************************************************//**
+Writes an SQL null field full of zeros. */
+UNIV_INLINE
+void
+data_write_sql_null(
+/*================*/
+ byte* data, /*!< in: pointer to a buffer of size len */
+ ulint len) /*!< in: SQL null size in bytes */
+{
+ memset(data, 0, len);
+}
+
+/**********************************************************************//**
+Checks if a dtuple contains an SQL null value.
+@return TRUE if some field is SQL null */
+UNIV_INLINE
+ibool
+dtuple_contains_null(
+/*=================*/
+ const dtuple_t* tuple) /*!< in: dtuple */
+{
+ ulint n;
+ ulint i;
+
+ n = dtuple_get_n_fields(tuple);
+
+ for (i = 0; i < n; i++) {
+ if (dfield_is_null(dtuple_get_nth_field(tuple, i))) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/**************************************************************//**
+Frees the memory in a big rec vector. */
+UNIV_INLINE
+void
+dtuple_big_rec_free(
+/*================*/
+ big_rec_t* vector) /*!< in, own: big rec vector; it is
+ freed in this function */
+{
+ mem_heap_free(vector->heap);
+}
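dtuple_fold() above is the kind of fold the adaptive hash index code uses: it seeds the hash with the index tree id, folds n_fields complete fields, then at most n_bytes of the following field, and lets SQL NULL fields contribute nothing. A standalone sketch of that shape; the mixing functions here are made-up stand-ins for ut_fold_ull()/ut_fold_ulint_pair()/ut_fold_binary() from ut0rnd.h:

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Toy mixers; the constants are not InnoDB's.
    inline uint64_t fold_pair(uint64_t a, uint64_t b)
    {
        return (a ^ b) * 0x9e3779b97f4a7c15ULL + a;
    }
    inline uint64_t fold_bytes(const unsigned char* p, size_t n)
    {
        uint64_t h = 0;
        while (n--) h = fold_pair(h, *p++);
        return h;
    }

    struct toy_field { const unsigned char* data; size_t len; bool is_null; };

    // Same shape as dtuple_fold(): seed with the tree id, fold n_fields whole
    // fields, then at most n_bytes of the next field (which must exist when
    // n_bytes > 0); NULL fields fold nothing.
    uint64_t toy_tuple_fold(const std::vector<toy_field>& f, size_t n_fields,
                            size_t n_bytes, uint64_t tree_id)
    {
        uint64_t fold = fold_pair(tree_id, 0);
        for (size_t i = 0; i < n_fields; i++)
            if (!f[i].is_null)
                fold = fold_pair(fold, fold_bytes(f[i].data, f[i].len));
        if (n_bytes > 0 && !f[n_fields].is_null)
            fold = fold_pair(fold, fold_bytes(f[n_fields].data,
                                              std::min(n_bytes, f[n_fields].len)));
        return fold;
    }
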
diff --git a/storage/innobase/include/data0type.h b/storage/innobase/include/data0type.h
new file mode 100644
index 00000000..3d63ddb7
--- /dev/null
+++ b/storage/innobase/include/data0type.h
@@ -0,0 +1,591 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/data0type.h
+Data types
+
+Created 1/16/1996 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+#include "univ.i"
+
+/** Special length indicating a missing instantly added column */
+#define UNIV_SQL_DEFAULT (UNIV_SQL_NULL - 1)
+
+/** @return whether a length is actually stored in a field */
+#define len_is_stored(len) (len != UNIV_SQL_NULL && len != UNIV_SQL_DEFAULT)
+
+extern ulint data_mysql_default_charset_coll;
+#define DATA_MYSQL_BINARY_CHARSET_COLL 63
+
+/* SQL data type struct */
+struct dtype_t;
+
+/** SQL Like operator comparison types */
+enum ib_like_t {
+ IB_LIKE_EXACT, /**< e.g. STRING */
+ IB_LIKE_PREFIX /**< e.g., STRING% */
+};
+
+/*-------------------------------------------*/
+/* The 'MAIN TYPE' of a column */
+#define DATA_MISSING 0 /* missing column */
+#define DATA_VARCHAR 1 /* character varying of the
+ latin1_swedish_ci charset-collation; note
+ that the MySQL format for this, DATA_BINARY,
+ DATA_VARMYSQL, is also affected by whether the
+ 'precise type' contains
+ DATA_MYSQL_TRUE_VARCHAR */
+#define DATA_CHAR 2 /* fixed length character of the
+ latin1_swedish_ci charset-collation */
+#define DATA_FIXBINARY 3 /* binary string of fixed length */
+#define DATA_BINARY 4 /* binary string */
+#define DATA_BLOB 5 /* binary large object, or a TEXT type;
+ if prtype & DATA_BINARY_TYPE == 0, then this is
+ actually a TEXT column (or a BLOB created
+ with < 4.0.14; since column prefix indexes
+ came only in 4.0.14, the missing flag in BLOBs
+ created before that does not cause any harm) */
+#define DATA_INT 6 /* integer: can be any size 1 - 8 bytes */
+#define DATA_SYS_CHILD 7 /* address of the child page in node pointer */
+#define DATA_SYS 8 /* system column */
+
+/* Data types >= DATA_FLOAT must be compared using the whole field, not as
+binary strings */
+
+#define DATA_FLOAT 9
+#define DATA_DOUBLE 10
+#define DATA_DECIMAL 11 /* decimal number stored as an ASCII string */
+#define DATA_VARMYSQL 12 /* any charset varying length char */
+#define DATA_MYSQL 13 /* any charset fixed length char */
+ /* NOTE that 4.1.1 used DATA_MYSQL and
+ DATA_VARMYSQL for all character sets, and the
+ charset-collation for tables created with it
+ can also be latin1_swedish_ci */
+
+/* DATA_GEOMETRY includes all standard geometry datatypes as described in
+OGC standard (point, line_string, polygon, multi_point, multi_polygon,
+multi_line_string, geometry_collection, geometry).
+Currently, geometry data is stored in the standard Well-Known Binary (WKB)
+format (http://www.opengeospatial.org/standards/sfa).
+We use BLOB as the underlying datatype. */
+#define DATA_GEOMETRY 14 /* geometry datatype of variable length */
+#define DATA_MTYPE_MAX 63 /* dtype_store_for_order_and_null_size()
+ requires the values are <= 63 */
+
+#define DATA_MTYPE_CURRENT_MIN DATA_VARCHAR /* minimum value of mtype */
+#define DATA_MTYPE_CURRENT_MAX DATA_GEOMETRY /* maximum value of mtype */
+/*-------------------------------------------*/
+/* The 'PRECISE TYPE' of a column */
+/*
+Tables created by a MySQL user have the following convention:
+
+- In the least significant byte in the precise type we store the MySQL type
+code (not applicable for system columns).
+
+- In the second least significant byte we OR flags DATA_NOT_NULL,
+DATA_UNSIGNED, DATA_BINARY_TYPE.
+
+- In the third least significant byte of the precise type of string types we
+store the MySQL charset-collation code. In DATA_BLOB columns created with
+< 4.0.14 we do not actually know if it is a BLOB or a TEXT column. Since there
+are no indexes on prefixes of BLOB or TEXT columns in < 4.0.14, this is no
+problem, though.
+
+Note that versions < 4.1.2 or < 5.0.1 did not store the charset code to the
+precise type, since the charset was always the default charset of the MySQL
+installation. If the stored charset code is 0 in the system table SYS_COLUMNS
+of InnoDB, that means that the default charset of this MySQL installation
+should be used.
+
+When loading a table definition from the system tables to the InnoDB data
+dictionary cache in main memory, InnoDB versions >= 4.1.2 and >= 5.0.1 check
+if the stored charset-collation is 0, and if that is the case and the type is
+a non-binary string, replace that 0 by the default charset-collation code of
+this MySQL installation. In short, in old tables, the charset-collation code
+in the system tables on disk can be 0, but in in-memory data structures
+(dtype_t), the charset-collation code is always != 0 for non-binary string
+types.
+
+In new tables, in binary string types, the charset-collation code is the
+MySQL code for the 'binary charset', that is, != 0.
+
+For binary string types and for DATA_CHAR, DATA_VARCHAR, and for those
+DATA_BLOB which are binary or have the charset-collation latin1_swedish_ci,
+InnoDB performs all comparisons internally, without resorting to the MySQL
+comparison functions. This is to save CPU time.
+
+InnoDB's own internal system tables have different precise types for their
+columns, and for them the precise type is usually not used at all.
+*/
+
+#define DATA_ENGLISH 4 /* English language character string: this
+ is a relic from pre-MySQL time and only used
+ for InnoDB's own system tables */
+#define DATA_ERROR 111 /* another relic from pre-MySQL time */
+
+#define DATA_MYSQL_TYPE_MASK 255U/* AND with this mask to extract the MySQL
+ type from the precise type */
+#define DATA_MYSQL_TRUE_VARCHAR 15 /* MySQL type code for the >= 5.0.3
+ format true VARCHAR */
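The byte layout described above can be read back mechanically. The mask values below are the ones defined in this header (DATA_MYSQL_TYPE_MASK, DATA_NOT_NULL, DATA_UNSIGNED, DATA_BINARY_TYPE, CHAR_COLL_MASK); the struct and function names are illustrative only:

    #include <cstdint>

    struct prtype_fields {
        uint32_t mysql_type;   // least significant byte
        uint32_t charset_coll; // 15 bits starting at bit 16
        bool     not_null, is_unsigned, binary;
    };

    inline prtype_fields decode_prtype(uint32_t prtype)
    {
        return {
            prtype & 255U,           // DATA_MYSQL_TYPE_MASK
            (prtype >> 16) & 32767U, // CHAR_COLL_MASK
            (prtype & 256U) != 0,    // DATA_NOT_NULL
            (prtype & 512U) != 0,    // DATA_UNSIGNED
            (prtype & 1024U) != 0,   // DATA_BINARY_TYPE
        };
    }
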
All codes must +be less than 256 */ +#define DATA_ROW_ID 0 /* row id: a 48-bit integer */ +#define DATA_ROW_ID_LEN 6 /* stored length for row id */ + +#define DATA_TRX_ID 1 /* transaction id: 6 bytes */ +#define DATA_TRX_ID_LEN 6 + +#define DATA_ROLL_PTR 2 /* rollback data pointer: 7 bytes */ +#define DATA_ROLL_PTR_LEN 7 + +#define DATA_N_SYS_COLS 3 /* number of system columns defined above */ + +#define DATA_FTS_DOC_ID 3 /* Used as FTS DOC ID column */ + +#define DATA_SYS_PRTYPE_MASK 0xFU /* mask to extract the above from prtype */ + +/* Flags ORed to the precise data type */ +#define DATA_NOT_NULL 256U /* this is ORed to the precise type when + the column is declared as NOT NULL */ +#define DATA_UNSIGNED 512U /* this id ORed to the precise type when + we have an unsigned integer type */ +#define DATA_BINARY_TYPE 1024U /* if the data type is a binary character + string, this is ORed to the precise type: + this only holds for tables created with + >= MySQL-4.0.14 */ +/* #define DATA_NONLATIN1 2048 This is a relic from < 4.1.2 and < 5.0.1. + In earlier versions this was set for some + BLOB columns. +*/ +#define DATA_GIS_MBR 2048U /* Used as GIS MBR column */ +/** the size of a GIS maximum bounding rectangle */ +constexpr uint8_t DATA_MBR_LEN= uint8_t(SPDIMS * 2 * sizeof(double)); + +#define DATA_LONG_TRUE_VARCHAR 4096U /* this is ORed to the precise data + type when the column is true VARCHAR where + MySQL uses 2 bytes to store the data len; + for shorter VARCHARs MySQL uses only 1 byte */ +#define DATA_VIRTUAL 8192U /* Virtual column */ + +/** System Versioning */ +#define DATA_VERS_START 16384U /* start system field */ +#define DATA_VERS_END 32768U /* end system field */ +/** system-versioned user data column */ +#define DATA_VERSIONED (DATA_VERS_START|DATA_VERS_END) + +/*-------------------------------------------*/ + +/* This many bytes we need to store the type information affecting the +alphabetical order for a single field and decide the storage size of an +SQL null*/ +#define DATA_ORDER_NULL_TYPE_BUF_SIZE 4 +/* In the >= 4.1.x storage format we add 2 bytes more so that we can also +store the charset-collation number; one byte is left unused, though */ +#define DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE 6 + +/* Maximum multi-byte character length in bytes, plus 1 */ +#define DATA_MBMAX 8 + +/* For checking if mtype is GEOMETRY datatype */ +#define DATA_GEOMETRY_MTYPE(mtype) ((mtype) == DATA_GEOMETRY) + +/* For checking if mtype is BLOB or GEOMETRY, since we use BLOB as +the underlying datatype of GEOMETRY data. */ +#define DATA_LARGE_MTYPE(mtype) ((mtype) == DATA_BLOB \ + || (mtype) == DATA_GEOMETRY) + +/* For checking if data type is big length data type. */ +#define DATA_BIG_LEN_MTYPE(len, mtype) ((len) > 255 || DATA_LARGE_MTYPE(mtype)) + +/* For checking if the column is a big length column. */ +#define DATA_BIG_COL(col) DATA_BIG_LEN_MTYPE((col)->len, (col)->mtype) + +/* For checking if data type is large binary data type. */ +#define DATA_LARGE_BINARY(mtype,prtype) ((mtype) == DATA_GEOMETRY || \ + ((mtype) == DATA_BLOB && !((prtype) & DATA_BINARY_TYPE))) + +/* We now support 15 bits (up to 32767) collation number */ +#define MAX_CHAR_COLL_NUM 32767 + +/* Mask to get the Charset Collation number (0x7fff) */ +#define CHAR_COLL_MASK MAX_CHAR_COLL_NUM + +/*********************************************************************//** +Gets the MySQL type code from a dtype. +@return MySQL type code; this is NOT an InnoDB type code! 
*/ +UNIV_INLINE +ulint +dtype_get_mysql_type( +/*=================*/ + const dtype_t* type); /*!< in: type struct */ +/*********************************************************************//** +Determine how many bytes the first n characters of the given string occupy. +If the string is shorter than n characters, returns the number of bytes +the characters in the string occupy. +@return length of the prefix, in bytes */ +ulint +dtype_get_at_most_n_mbchars( +/*========================*/ + ulint prtype, /*!< in: precise type */ + ulint mbminlen, /*!< in: minimum length of + a multi-byte character, in bytes */ + ulint mbmaxlen, /*!< in: maximum length of + a multi-byte character, in bytes */ + ulint prefix_len, /*!< in: length of the requested + prefix, in characters, multiplied by + dtype_get_mbmaxlen(dtype) */ + ulint data_len, /*!< in: length of str (in bytes) */ + const char* str); /*!< in: the string whose prefix + length is being determined */ +/** @return whether main type is a string type */ +inline bool dtype_is_string_type(ulint mtype) +{ + return mtype <= DATA_BLOB + || mtype == DATA_MYSQL || mtype == DATA_VARMYSQL; +} + +/** @return whether a type is a binary string type */ +inline bool dtype_is_binary_string_type(ulint mtype, ulint prtype) +{ + /* Note that for tables created before MySQL 4.0.14, + we do not know if a DATA_BLOB column is a BLOB or a TEXT column. + For those DATA_BLOB columns we return false. */ + + return mtype == DATA_FIXBINARY || mtype == DATA_BINARY + || (mtype == DATA_BLOB && (prtype & DATA_BINARY_TYPE)); +} + +/** @return whether a type is a non-binary string type */ +inline bool dtype_is_non_binary_string_type(ulint mtype, ulint prtype) +{ + return dtype_is_string_type(mtype) + && !dtype_is_binary_string_type(mtype, prtype); +} + +/*********************************************************************//** +Sets a data type structure. */ +UNIV_INLINE +void +dtype_set( +/*======*/ + dtype_t* type, /*!< in: type struct to init */ + ulint mtype, /*!< in: main data type */ + ulint prtype, /*!< in: precise type */ + ulint len); /*!< in: precision of type */ +/*********************************************************************//** +Copies a data type structure. */ +UNIV_INLINE +void +dtype_copy( +/*=======*/ + dtype_t* type1, /*!< in: type struct to copy to */ + const dtype_t* type2); /*!< in: type struct to copy from */ +/*********************************************************************//** +Gets the SQL main data type. +@return SQL main data type */ +UNIV_INLINE +ulint +dtype_get_mtype( +/*============*/ + const dtype_t* type); /*!< in: data type */ +/*********************************************************************//** +Gets the precise data type. +@return precise data type */ +UNIV_INLINE +ulint +dtype_get_prtype( +/*=============*/ + const dtype_t* type); /*!< in: data type */ + +/*********************************************************************//** +Compute the mbminlen and mbmaxlen members of a data type structure. */ +void +dtype_get_mblen( +/*============*/ + ulint mtype, /*!< in: main type */ + ulint prtype, /*!< in: precise type (and collation) */ + unsigned* mbminlen, /*!< out: minimum length of a + multi-byte character */ + unsigned* mbmaxlen); /*!< out: maximum length of a + multi-byte character */ +/** +Get the charset-collation code for string types. 
+@param prtype InnoDB precise type
+@return charset-collation code */
+inline uint16_t dtype_get_charset_coll(ulint prtype)
+{
+ return static_cast<uint16_t>(prtype >> 16) & CHAR_COLL_MASK;
+}
+
+/** Form a precise type from the < 4.1.2 format precise type plus the
+charset-collation code.
+@param[in] old_prtype MySQL type code and the flags
+ DATA_BINARY_TYPE etc.
+@param[in] charset_coll character-set collation code
+@return precise type, including the charset-collation code */
+UNIV_INLINE
+uint32_t
+dtype_form_prtype(ulint old_prtype, ulint charset_coll)
+{
+ ut_ad(old_prtype < 256 * 256);
+ ut_ad(charset_coll <= MAX_CHAR_COLL_NUM);
+ return(uint32_t(old_prtype + (charset_coll << 16)));
+}
+
+/*********************************************************************//**
+Determines if a MySQL string type is a subset of UTF-8. This function
+may return false negatives, in case further character-set collation
+codes are introduced in MySQL later.
+@return whether a subset of UTF-8 */
+UNIV_INLINE
+bool
+dtype_is_utf8(
+/*==========*/
+ ulint prtype);/*!< in: precise data type */
+/*********************************************************************//**
+Gets the type length.
+@return fixed length of the type, in bytes, or 0 if variable-length */
+UNIV_INLINE
+ulint
+dtype_get_len(
+/*==========*/
+ const dtype_t* type); /*!< in: data type */
+
+/*********************************************************************//**
+Gets the minimum length of a character, in bytes.
+@return minimum length of a char, in bytes, or 0 if this is not a
+character type */
+UNIV_INLINE
+ulint
+dtype_get_mbminlen(
+/*===============*/
+ const dtype_t* type); /*!< in: type */
+/*********************************************************************//**
+Gets the maximum length of a character, in bytes.
+@return maximum length of a char, in bytes, or 0 if this is not a
+character type */
+UNIV_INLINE
+ulint
+dtype_get_mbmaxlen(
+/*===============*/
+ const dtype_t* type); /*!< in: type */
+/***********************************************************************//**
+Returns the size of a fixed size data type, 0 if not a fixed size type.
+@return fixed size, or 0 */
+UNIV_INLINE
+unsigned
+dtype_get_fixed_size_low(
+/*=====================*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ ulint len, /*!< in: length */
+ ulint mbminlen, /*!< in: minimum length of a
+ multibyte character, in bytes */
+ ulint mbmaxlen, /*!< in: maximum length of a
+ multibyte character, in bytes */
+ ulint comp); /*!< in: nonzero=ROW_FORMAT=COMPACT */
+
+/***********************************************************************//**
+Returns the minimum size of a data type.
+@return minimum size */
+UNIV_INLINE
+unsigned
+dtype_get_min_size_low(
+/*===================*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ ulint len, /*!< in: length */
+ ulint mbminlen, /*!< in: minimum length of a character */
+ ulint mbmaxlen); /*!< in: maximum length of a character */
+/***********************************************************************//**
+Returns the maximum size of a data type. Note: types in system tables may be
+incomplete and return incorrect information.
+@return maximum size */
+UNIV_INLINE
+ulint
+dtype_get_max_size_low(
+/*===================*/
+ ulint mtype, /*!< in: main type */
+ ulint len); /*!< in: length */
+/***********************************************************************//**
+Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a type.
+For fixed length types it is the fixed length of the type, otherwise 0. +@return SQL null storage size in ROW_FORMAT=REDUNDANT */ +UNIV_INLINE +ulint +dtype_get_sql_null_size( +/*====================*/ + const dtype_t* type, /*!< in: type */ + ulint comp); /*!< in: nonzero=ROW_FORMAT=COMPACT */ + +/**********************************************************************//** +Reads to a type the stored information which determines its alphabetical +ordering and the storage size of an SQL NULL value. */ +UNIV_INLINE +void +dtype_read_for_order_and_null_size( +/*===============================*/ + dtype_t* type, /*!< in: type struct */ + const byte* buf); /*!< in: buffer for the stored order info */ +/**********************************************************************//** +Stores for a type the information which determines its alphabetical ordering +and the storage size of an SQL NULL value. This is the >= 4.1.x storage +format. */ +UNIV_INLINE +void +dtype_new_store_for_order_and_null_size( +/*====================================*/ + byte* buf, /*!< in: buffer for + DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE + bytes where we store the info */ + const dtype_t* type, /*!< in: type struct */ + ulint prefix_len);/*!< in: prefix length to + replace type->len, or 0 */ +/**********************************************************************//** +Reads to a type the stored information which determines its alphabetical +ordering and the storage size of an SQL NULL value. This is the 4.1.x storage +format. */ +UNIV_INLINE +void +dtype_new_read_for_order_and_null_size( +/*===================================*/ + dtype_t* type, /*!< in: type struct */ + const byte* buf); /*!< in: buffer for stored type order info */ + +/*********************************************************************//** +Validates a data type structure. +@return TRUE if ok */ +ibool +dtype_validate( +/*===========*/ + const dtype_t* type); /*!< in: type struct to validate */ +#ifdef UNIV_DEBUG +/** Print a data type structure. +@param[in] type data type */ +void +dtype_print( + const dtype_t* type); +#endif /* UNIV_DEBUG */ + +struct dict_col_t; + +/* Structure for an SQL data type. +If you add fields to this structure, be sure to initialize them everywhere. 
+This structure is initialized in the following functions: +dtype_set() +dtype_read_for_order_and_null_size() +dtype_new_read_for_order_and_null_size() +sym_tab_add_null_lit() */ + +struct dtype_t{ + unsigned prtype:32; /*!< precise type; MySQL data + type, charset code, flags to + indicate nullability, + signedness, whether this is a + binary string, whether this is + a true VARCHAR where MySQL + uses 2 bytes to store the length */ + unsigned mtype:8; /*!< main data type */ + + /* the remaining fields do not affect alphabetical ordering: */ + + unsigned len:16; /*!< length; for MySQL data this + is field->pack_length(), + except that for a >= 5.0.3 + type true VARCHAR this is the + maximum byte length of the + string data (in addition to + the string, MySQL uses 1 or 2 + bytes to store the string length) */ + unsigned mbminlen:3; /*!< minimum length of a character, + in bytes */ + unsigned mbmaxlen:3; /*!< maximum length of a character, + in bytes */ + + /** @return whether this is system versioned user field */ + bool is_versioned() const { return !(~prtype & DATA_VERSIONED); } + /** @return whether this is the system field start */ + bool vers_sys_start() const + { + return (prtype & DATA_VERSIONED) == DATA_VERS_START; + } + /** @return whether this is the system field end */ + bool vers_sys_end() const + { + return (prtype & DATA_VERSIONED) == DATA_VERS_END; + } + + /** Set the type of the BLOB in the hidden metadata record. */ + void metadata_blob_init() + { + prtype = DATA_NOT_NULL; + mtype = DATA_BLOB; + len = 0; + mbminlen = 0; + mbmaxlen = 0; + } + + /** Copy the type information from a column. + @param col column type to be copied */ + void assign(const dict_col_t &col); +}; + +/** The DB_TRX_ID,DB_ROLL_PTR values for "no history is available" */ +extern const byte reset_trx_id[DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN]; + +/** Info bit denoting the predefined minimum record: this bit is set +if and only if the record is the first user record on a non-leaf +B-tree page that is the leftmost page on its level +(PAGE_LEVEL is nonzero and FIL_PAGE_PREV is FIL_NULL). */ +#define REC_INFO_MIN_REC_FLAG 0x10UL +/** The delete-mark flag in info bits */ +#define REC_INFO_DELETED_FLAG 0x20UL + +/** Record status values for ROW_FORMAT=COMPACT,DYNAMIC,COMPRESSED */ +enum rec_comp_status_t { + /** User record (PAGE_LEVEL=0, heap>=PAGE_HEAP_NO_USER_LOW) */ + REC_STATUS_ORDINARY = 0, + /** Node pointer record (PAGE_LEVEL>=0, heap>=PAGE_HEAP_NO_USER_LOW) */ + REC_STATUS_NODE_PTR = 1, + /** The page infimum pseudo-record (heap=PAGE_HEAP_NO_INFIMUM) */ + REC_STATUS_INFIMUM = 2, + /** The page supremum pseudo-record (heap=PAGE_HEAP_NO_SUPREMUM) */ + REC_STATUS_SUPREMUM = 3, + /** Clustered index record that has been inserted or updated + after instant ADD COLUMN (more than dict_index_t::n_core_fields) */ + REC_STATUS_INSTANT = 4 +}; + +/** The dtuple_t::info_bits of the hidden metadata of instant ADD COLUMN. +@see rec_is_metadata() +@see rec_is_alter_metadata() */ +static const byte REC_INFO_METADATA_ADD + = REC_INFO_MIN_REC_FLAG | REC_STATUS_INSTANT; + +/** The dtuple_t::info_bits of the hidden metadata of instant ALTER TABLE. 
+@see rec_is_metadata() */ +static const byte REC_INFO_METADATA_ALTER + = REC_INFO_METADATA_ADD | REC_INFO_DELETED_FLAG; + +#include "data0type.inl" diff --git a/storage/innobase/include/data0type.inl b/storage/innobase/include/data0type.inl new file mode 100644 index 00000000..329cee5d --- /dev/null +++ b/storage/innobase/include/data0type.inl @@ -0,0 +1,487 @@ +/***************************************************************************** + +Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/data0type.ic +Data types + +Created 1/16/1996 Heikki Tuuri +*******************************************************/ + +#include "mach0data.h" +#include "ha_prototypes.h" + +/*********************************************************************//** +Determines if a MySQL string type is a subset of UTF-8. This function +may return false negatives, in case further character-set collation +codes are introduced in MySQL later. +@return whether a subset of UTF-8 */ +UNIV_INLINE +bool +dtype_is_utf8( +/*==========*/ + ulint prtype) /*!< in: precise data type */ +{ + /* These codes have been copied from strings/ctype-extra.c + and strings/ctype-utf8.c. */ + switch (dtype_get_charset_coll(prtype)) { + case 11: /* ascii_general_ci */ + case 65: /* ascii_bin */ + case 33: /* utf8_general_ci */ + case 83: /* utf8_bin */ + case 254: /* utf8_general_cs */ + return true; + } + + return false; +} + +/*********************************************************************//** +Gets the MySQL type code from a dtype. +@return MySQL type code; this is NOT an InnoDB type code! */ +UNIV_INLINE +ulint +dtype_get_mysql_type( +/*=================*/ + const dtype_t* type) /*!< in: type struct */ +{ + return(type->prtype & 0xFFUL); +} + +/*********************************************************************//** +Compute the mbminlen and mbmaxlen members of a data type structure. */ +UNIV_INLINE +void +dtype_set_mblen( +/*============*/ + dtype_t* type) /*!< in/out: type */ +{ + unsigned mbminlen, mbmaxlen; + + dtype_get_mblen(type->mtype, type->prtype, &mbminlen, &mbmaxlen); + type->mbminlen = mbminlen & 7; + type->mbmaxlen = mbmaxlen & 7; + + ut_ad(dtype_validate(type)); +} + +/*********************************************************************//** +Sets a data type structure. 
*/
+UNIV_INLINE
+void
+dtype_set(
+/*======*/
+ dtype_t* type, /*!< in: type struct to init */
+ ulint mtype, /*!< in: main data type */
+ ulint prtype, /*!< in: precise type */
+ ulint len) /*!< in: precision of type */
+{
+ ut_ad(type);
+ ut_ad(mtype <= DATA_MTYPE_MAX);
+
+ type->mtype = static_cast<byte>(mtype);
+ type->prtype = static_cast<unsigned>(prtype);
+ type->len = static_cast<uint16_t>(len);
+
+ dtype_set_mblen(type);
+}
+
+/*********************************************************************//**
+Copies a data type structure. */
+UNIV_INLINE
+void
+dtype_copy(
+/*=======*/
+ dtype_t* type1, /*!< in: type struct to copy to */
+ const dtype_t* type2) /*!< in: type struct to copy from */
+{
+ *type1 = *type2;
+
+ ut_ad(dtype_validate(type1));
+}
+
+/*********************************************************************//**
+Gets the SQL main data type.
+@return SQL main data type */
+UNIV_INLINE
+ulint
+dtype_get_mtype(
+/*============*/
+ const dtype_t* type) /*!< in: data type */
+{
+ ut_ad(type);
+
+ return(type->mtype);
+}
+
+/*********************************************************************//**
+Gets the precise data type.
+@return precise data type */
+UNIV_INLINE
+ulint
+dtype_get_prtype(
+/*=============*/
+ const dtype_t* type) /*!< in: data type */
+{
+ ut_ad(type);
+
+ return(type->prtype);
+}
+
+/*********************************************************************//**
+Gets the type length.
+@return fixed length of the type, in bytes, or 0 if variable-length */
+UNIV_INLINE
+ulint
+dtype_get_len(
+/*==========*/
+ const dtype_t* type) /*!< in: data type */
+{
+ ut_ad(type);
+
+ return(type->len);
+}
+
+/*********************************************************************//**
+Gets the minimum length of a character, in bytes.
+@return minimum length of a char, in bytes, or 0 if this is not a
+character type */
+UNIV_INLINE
+ulint
+dtype_get_mbminlen(
+/*===============*/
+ const dtype_t* type) /*!< in: type */
+{
+ return type->mbminlen;
+}
+/*********************************************************************//**
+Gets the maximum length of a character, in bytes.
+@return maximum length of a char, in bytes, or 0 if this is not a
+character type */
+UNIV_INLINE
+ulint
+dtype_get_mbmaxlen(
+/*===============*/
+ const dtype_t* type) /*!< in: type */
+{
+ return type->mbmaxlen;
+}
+
+/**********************************************************************//**
+Stores for a type the information which determines its alphabetical ordering
+and the storage size of an SQL NULL value. This is the >= 4.1.x storage
+format. */
+UNIV_INLINE
+void
+dtype_new_store_for_order_and_null_size(
+/*====================================*/
+ byte* buf, /*!< in: buffer for
+ DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE
+ bytes where we store the info */
+ const dtype_t* type, /*!< in: type struct */
+ ulint prefix_len)/*!< in: prefix length to
+ replace type->len, or 0 */
+{
+ compile_time_assert(6 == DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
+
+ ulint len;
+
+ ut_ad(type);
+ ut_ad(type->mtype >= DATA_VARCHAR);
+ ut_ad(type->mtype <= DATA_MTYPE_MAX);
+
+ buf[0] = (byte)(type->mtype & 0xFFUL);
+
+ if (type->prtype & DATA_BINARY_TYPE) {
+ buf[0] |= 128;
+ }
+
+ /* In versions < 4.1.2 we had: if (type->prtype & DATA_NONLATIN1) {
+ buf[0] |= 64;
+ }
+ */
+
+ buf[1] = (byte)(type->prtype & 0xFFUL);
+
+ len = prefix_len ? prefix_len : type->len;
+
+ mach_write_to_2(buf + 2, len & 0xFFFFUL);
+
+ ut_ad(dtype_get_charset_coll(type->prtype) <= MAX_CHAR_COLL_NUM);
+ mach_write_to_2(buf + 4, dtype_get_charset_coll(type->prtype));
+
+ if (type->prtype & DATA_NOT_NULL) {
+ buf[4] |= 128;
+ }
+}
+
+/**********************************************************************//**
+Reads to a type the stored information which determines its alphabetical
+ordering and the storage size of an SQL NULL value. This is the < 4.1.x
+storage format. */
+UNIV_INLINE
+void
+dtype_read_for_order_and_null_size(
+/*===============================*/
+ dtype_t* type, /*!< in: type struct */
+ const byte* buf) /*!< in: buffer for stored type order info */
+{
+ compile_time_assert(4 == DATA_ORDER_NULL_TYPE_BUF_SIZE);
+ type->mtype = buf[0] & 63;
+ type->prtype = buf[1];
+
+ if (buf[0] & 128) {
+ type->prtype |= DATA_BINARY_TYPE;
+ }
+
+ type->len = mach_read_from_2(buf + 2);
+
+ type->prtype = dtype_form_prtype(type->prtype,
+ data_mysql_default_charset_coll);
+ dtype_set_mblen(type);
+}
+
+/**********************************************************************//**
+Reads to a type the stored information which determines its alphabetical
+ordering and the storage size of an SQL NULL value. This is the >= 4.1.x
+storage format. */
+UNIV_INLINE
+void
+dtype_new_read_for_order_and_null_size(
+/*===================================*/
+ dtype_t* type, /*!< in: type struct */
+ const byte* buf) /*!< in: buffer for stored type order info */
+{
+ compile_time_assert(6 == DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
+
+ type->mtype = buf[0] & 63;
+ type->prtype = buf[1];
+
+ if (buf[0] & 128) {
+ type->prtype |= DATA_BINARY_TYPE;
+ }
+
+ if (buf[4] & 128) {
+ type->prtype |= DATA_NOT_NULL;
+ }
+
+ type->len = mach_read_from_2(buf + 2);
+
+ ulint charset_coll = mach_read_from_2(buf + 4) & CHAR_COLL_MASK;
+
+ if (dtype_is_string_type(type->mtype)) {
+ ut_a(charset_coll <= MAX_CHAR_COLL_NUM);
+
+ if (charset_coll == 0) {
+ /* This insert buffer record was inserted with MySQL
+ version < 4.1.2, and the charset-collation code was not
+ explicitly stored to dtype->prtype at that time. It
+ must be the default charset-collation of this MySQL
+ installation. */
+
+ charset_coll = data_mysql_default_charset_coll;
+ }
+
+ type->prtype = dtype_form_prtype(type->prtype, charset_coll);
+ }
+ dtype_set_mblen(type);
+}
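The store/read pair above round-trips a type through the 6-byte DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE format. A standalone sketch of the same layout, with stand-ins for mach_write_to_2()/mach_read_from_2() (big-endian, as in InnoDB); the function names are illustrative:

    #include <cassert>
    #include <cstdint>

    inline void put2(unsigned char* b, uint32_t v)
    { b[0] = (unsigned char)(v >> 8); b[1] = (unsigned char)v; }
    inline uint32_t get2(const unsigned char* b)
    { return (uint32_t(b[0]) << 8) | b[1]; }

    // The 6-byte layout used by dtype_new_store_for_order_and_null_size():
    //  byte 0: mtype, with bit 7 doubling as the DATA_BINARY_TYPE flag
    //  byte 1: low byte of prtype
    //  bytes 2-3: length (or prefix length)
    //  bytes 4-5: charset-collation (<= 32767), bit 7 of byte 4 = DATA_NOT_NULL
    void toy_store(unsigned char* buf, uint32_t mtype, bool binary,
                   bool not_null, unsigned char prtype_low,
                   uint16_t len, uint16_t coll)
    {
        buf[0] = (unsigned char)mtype | (binary ? 0x80 : 0);
        buf[1] = prtype_low;
        put2(buf + 2, len);
        put2(buf + 4, coll); // coll <= 32767 leaves bit 7 of buf[4] free
        if (not_null) buf[4] |= 0x80;
    }

    int main()
    {
        unsigned char buf[6];
        toy_store(buf, 12 /* DATA_VARMYSQL */, false, true, 15, 255, 33);
        assert((buf[0] & 63) == 12);            // mtype, read back as above
        assert(get2(buf + 2) == 255);           // length
        assert((get2(buf + 4) & 32767U) == 33); // collation under CHAR_COLL_MASK
        assert(buf[4] & 0x80);                  // NOT NULL flag
    }
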
+
+/***********************************************************************//**
+Returns the size of a fixed size data type, 0 if not a fixed size type.
+@return fixed size, or 0 */
+UNIV_INLINE
+unsigned
+dtype_get_fixed_size_low(
+/*=====================*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ ulint len, /*!< in: length */
+ ulint mbminlen, /*!< in: minimum length of a
+ multibyte character, in bytes */
+ ulint mbmaxlen, /*!< in: maximum length of a
+ multibyte character, in bytes */
+ ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */
+{
+ switch (mtype) {
+ case DATA_SYS:
+#ifdef UNIV_DEBUG
+ switch (prtype & DATA_MYSQL_TYPE_MASK) {
+ case DATA_ROW_ID:
+ ut_ad(len == DATA_ROW_ID_LEN);
+ break;
+ case DATA_TRX_ID:
+ ut_ad(len == DATA_TRX_ID_LEN);
+ break;
+ case DATA_ROLL_PTR:
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+ break;
+ default:
+ ut_ad(0);
+ return(0);
+ }
+#endif /* UNIV_DEBUG */
+ /* fall through */
+ case DATA_CHAR:
+ case DATA_FIXBINARY:
+ case DATA_INT:
+ case DATA_FLOAT:
+ case DATA_DOUBLE:
+ return static_cast<unsigned>(len);
+ case DATA_MYSQL:
+ if (prtype & DATA_BINARY_TYPE) {
+ return static_cast<unsigned>(len);
+ } else if (!comp) {
+ return static_cast<unsigned>(len);
+ } else {
+ if (mbminlen == mbmaxlen) {
+ return static_cast<unsigned>(len);
+ }
+ }
+ /* Treat as variable-length. */
+ /* fall through */
+ case DATA_VARCHAR:
+ case DATA_BINARY:
+ case DATA_DECIMAL:
+ case DATA_VARMYSQL:
+ case DATA_GEOMETRY:
+ case DATA_BLOB:
+ return(0);
+ default:
+ ut_error;
+ }
+
+ return(0);
+}
+
+/***********************************************************************//**
+Returns the minimum size of a data type.
+@return minimum size */
+UNIV_INLINE
+unsigned
+dtype_get_min_size_low(
+/*===================*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ ulint len, /*!< in: length */
+ ulint mbminlen, /*!< in: minimum length of a character */
+ ulint mbmaxlen) /*!< in: maximum length of a character */
+{
+ switch (mtype) {
+ case DATA_SYS:
+#ifdef UNIV_DEBUG
+ switch (prtype & DATA_MYSQL_TYPE_MASK) {
+ case DATA_ROW_ID:
+ ut_ad(len == DATA_ROW_ID_LEN);
+ break;
+ case DATA_TRX_ID:
+ ut_ad(len == DATA_TRX_ID_LEN);
+ break;
+ case DATA_ROLL_PTR:
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+ break;
+ default:
+ ut_ad(0);
+ return(0);
+ }
+#endif /* UNIV_DEBUG */
+ /* fall through */
+ case DATA_CHAR:
+ case DATA_FIXBINARY:
+ case DATA_INT:
+ case DATA_FLOAT:
+ case DATA_DOUBLE:
+ return static_cast<unsigned>(len);
+ case DATA_MYSQL:
+ if (prtype & DATA_BINARY_TYPE) {
+ return static_cast<unsigned>(len);
+ } else {
+ if (mbminlen == mbmaxlen) {
+ return static_cast<unsigned>(len);
+ }
+
+ /* this is a variable-length character set */
+ ut_a(mbminlen > 0);
+ ut_a(mbmaxlen > mbminlen);
+ ut_a(len % mbmaxlen == 0);
+ return static_cast<unsigned>(
+ len * mbminlen / mbmaxlen);
+ }
+ case DATA_VARCHAR:
+ case DATA_BINARY:
+ case DATA_DECIMAL:
+ case DATA_VARMYSQL:
+ case DATA_GEOMETRY:
+ case DATA_BLOB:
+ return(0);
+ default:
+ ut_error;
+ }
+
+ return(0);
+}
+
+/***********************************************************************//**
+Returns the maximum size of a data type. Note: types in system tables may be
+incomplete and return incorrect information.
+@return maximum size */ +UNIV_INLINE +ulint +dtype_get_max_size_low( +/*===================*/ + ulint mtype, /*!< in: main type */ + ulint len) /*!< in: length */ +{ + switch (mtype) { + case DATA_SYS: + case DATA_CHAR: + case DATA_FIXBINARY: + case DATA_INT: + case DATA_FLOAT: + case DATA_DOUBLE: + case DATA_MYSQL: + case DATA_VARCHAR: + case DATA_BINARY: + case DATA_DECIMAL: + case DATA_VARMYSQL: + return(len); + case DATA_GEOMETRY: + case DATA_BLOB: + break; + default: + ut_error; + } + + return(ULINT_MAX); +} + +/***********************************************************************//** +Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a type. +For fixed length types it is the fixed length of the type, otherwise 0. +@return SQL null storage size in ROW_FORMAT=REDUNDANT */ +UNIV_INLINE +ulint +dtype_get_sql_null_size( +/*====================*/ + const dtype_t* type, /*!< in: type */ + ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */ +{ + return(dtype_get_fixed_size_low(type->mtype, type->prtype, type->len, + type->mbminlen, type->mbmaxlen, comp)); +} diff --git a/storage/innobase/include/data0types.h b/storage/innobase/include/data0types.h new file mode 100644 index 00000000..bcd6b8bc --- /dev/null +++ b/storage/innobase/include/data0types.h @@ -0,0 +1,36 @@ +/***************************************************************************** + +Copyright (c) 2000, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/data0types.h +Some type definitions + +Created 9/21/2000 Heikki Tuuri +*************************************************************************/ + +#ifndef data0types_h +#define data0types_h + +/* SQL data field struct */ +struct dfield_t; + +/* SQL data tuple struct */ +struct dtuple_t; + +#endif + diff --git a/storage/innobase/include/db0err.h b/storage/innobase/include/db0err.h new file mode 100644 index 00000000..64182aab --- /dev/null +++ b/storage/innobase/include/db0err.h @@ -0,0 +1,170 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2015, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/db0err.h
+Global error codes for the database
+
+Created 5/24/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef db0err_h
+#define db0err_h
+
+/* Do not include univ.i because univ.i includes this. */
+
+enum dberr_t {
+ DB_SUCCESS,
+
+ DB_SUCCESS_LOCKED_REC = 9, /*!< like DB_SUCCESS, but a new
+ explicit record lock was created */
+
+ /* The following are error codes */
+ DB_ERROR = 11,
+ DB_INTERRUPTED,
+ DB_OUT_OF_MEMORY,
+ DB_OUT_OF_FILE_SPACE,
+ DB_LOCK_WAIT,
+ DB_DEADLOCK,
+ DB_ROLLBACK,
+ DB_DUPLICATE_KEY,
+ DB_MISSING_HISTORY, /*!< required history data has been
+ deleted due to lack of space in
+ rollback segment */
+ DB_CLUSTER_NOT_FOUND = 30,
+ DB_TABLE_NOT_FOUND,
+ DB_TOO_BIG_RECORD, /*!< a record in an index would not fit
+ on a compressed page, or it would
+ become bigger than 1/2 free space in
+ an uncompressed page frame */
+ DB_LOCK_WAIT_TIMEOUT, /*!< lock wait lasted too long */
+ DB_NO_REFERENCED_ROW, /*!< referenced key value not found
+ for a foreign key in an insert or
+ update of a row */
+ DB_ROW_IS_REFERENCED, /*!< cannot delete or update a row
+ because it contains a key value
+ which is referenced */
+ DB_CANNOT_ADD_CONSTRAINT, /*!< adding a foreign key constraint
+ to a table failed */
+ DB_CORRUPTION, /*!< data structure corruption
+ noticed */
+ DB_CANNOT_DROP_CONSTRAINT, /*!< dropping a foreign key constraint
+ from a table failed */
+ DB_NO_SAVEPOINT, /*!< no savepoint exists with the given
+ name */
+ DB_TABLESPACE_EXISTS, /*!< we cannot create a new single-table
+ tablespace because a file of the same
+ name already exists */
+ DB_TABLESPACE_DELETED, /*!< tablespace was deleted or is
+ being dropped right now */
+ DB_TABLESPACE_NOT_FOUND, /*!< tablespace was not found */
+
+ while (id >= sys_id)
+ {
+ if (!row_id.compare_exchange_strong(sys_id, id))
+ continue;
+ if (!(id % ROW_ID_WRITE_MARGIN))
+ dict_hdr_flush_row_id(id);
+ break;
+ }
+}
+
+/**********************************************************************//**
+Writes a row id to a record or other 6-byte stored form. */
+inline void dict_sys_write_row_id(byte *field, row_id_t row_id)
+{
+ static_assert(DATA_ROW_ID_LEN == 6, "compatibility");
+ mach_write_to_6(field, row_id);
+}
+
+/*****************************************************************//**
+Initializes the data dictionary memory structures when the database is
+started. This function is also called when the data dictionary is created.
+@return DB_SUCCESS or error code. */
+dberr_t
+dict_boot(void)
+/*===========*/
+ MY_ATTRIBUTE((warn_unused_result));
+
+/*****************************************************************//**
+Creates and initializes the data dictionary at the server bootstrap.
+@return DB_SUCCESS or error code. */
+dberr_t
+dict_create(void)
+/*=============*/
+ MY_ATTRIBUTE((warn_unused_result));
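The compare_exchange_strong() loop in the hunk above is a lock-free "raise to at least id" update on the dictionary row-id counter; only the thread that wins the exchange performs the follow-up flush. A standalone sketch of the same retry shape (the wrapper name here is illustrative, since the enclosing declaration is not shown in this hunk):

    #include <atomic>
    #include <cstdint>

    std::atomic<uint64_t> row_id{0};

    // Raise the shared counter to at least `id`; exactly one winning thread
    // performs the once-only side effect (in InnoDB, persisting the value
    // to the dictionary header).
    void raise_row_id(uint64_t id)
    {
        uint64_t sys_id = row_id.load();
        while (id >= sys_id) {
            if (!row_id.compare_exchange_strong(sys_id, id))
                continue; // lost the race; sys_id now holds the fresh value
            // winner: id is published; do the follow-up action here
            break;
        }
    }
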
+
+/*********************************************************************//**
+Check if a table id belongs to system table.
+@return true if the table id belongs to a system table. */
+inline bool dict_is_sys_table(table_id_t id) { return id < DICT_HDR_FIRST_ID; }
+
+/* Space id and page no where the dictionary header resides */
+#define DICT_HDR_SPACE 0 /* the SYSTEM tablespace */
+#define DICT_HDR_PAGE_NO FSP_DICT_HDR_PAGE_NO
+
+/* The ids for the basic system tables and their indexes */
+#define DICT_TABLES_ID 1
+#define DICT_COLUMNS_ID 2
+#define DICT_INDEXES_ID dict_index_t::DICT_INDEXES_ID /* 3 */
+#define DICT_FIELDS_ID 4
+/* The following is a secondary index on SYS_TABLES */
+#define DICT_TABLE_IDS_ID 5
+
+/* The offset of the dictionary header on the page */
+#define DICT_HDR FSEG_PAGE_DATA
+
+/*-------------------------------------------------------------*/
+/* Dictionary header offsets */
+#define DICT_HDR_ROW_ID 0 /* The latest assigned row id */
+#define DICT_HDR_TABLE_ID 8 /* The latest assigned table id */
+#define DICT_HDR_INDEX_ID 16 /* The latest assigned index id */
+#define DICT_HDR_MAX_SPACE_ID 24 /* The latest assigned space id, or 0 */
+#define DICT_HDR_MIX_ID_LOW 28 /* Obsolete, always DICT_HDR_FIRST_ID */
+#define DICT_HDR_TABLES 32 /* Root of SYS_TABLES clust index */
+#define DICT_HDR_TABLE_IDS 36 /* Root of SYS_TABLE_IDS sec index */
+#define DICT_HDR_COLUMNS 40 /* Root of SYS_COLUMNS clust index */
+#define DICT_HDR_INDEXES 44 /* Root of SYS_INDEXES clust index */
+#define DICT_HDR_FIELDS 48 /* Root of SYS_FIELDS clust index */
+
+#define DICT_HDR_FSEG_HEADER 56 /* Segment header for the tablespace
+ segment into which the dictionary
+ header is created */
+/*-------------------------------------------------------------*/
+
+/* The columns in SYS_TABLES */
+enum dict_col_sys_tables_enum {
+ DICT_COL__SYS_TABLES__NAME = 0,
+ DICT_COL__SYS_TABLES__ID = 1,
+ DICT_COL__SYS_TABLES__N_COLS = 2,
+ DICT_COL__SYS_TABLES__TYPE = 3,
+ DICT_COL__SYS_TABLES__MIX_ID = 4,
+ DICT_COL__SYS_TABLES__MIX_LEN = 5,
+ DICT_COL__SYS_TABLES__CLUSTER_ID = 6,
+ DICT_COL__SYS_TABLES__SPACE = 7,
+ DICT_NUM_COLS__SYS_TABLES = 8
+};
+/* The field numbers in the SYS_TABLES clustered index */
+enum dict_fld_sys_tables_enum {
+ DICT_FLD__SYS_TABLES__NAME = 0,
+ DICT_FLD__SYS_TABLES__DB_TRX_ID = 1,
+ DICT_FLD__SYS_TABLES__DB_ROLL_PTR = 2,
+ DICT_FLD__SYS_TABLES__ID = 3,
+ DICT_FLD__SYS_TABLES__N_COLS = 4,
+ DICT_FLD__SYS_TABLES__TYPE = 5,
+ DICT_FLD__SYS_TABLES__MIX_ID = 6,
+ DICT_FLD__SYS_TABLES__MIX_LEN = 7,
+ DICT_FLD__SYS_TABLES__CLUSTER_ID = 8,
+ DICT_FLD__SYS_TABLES__SPACE = 9,
+ DICT_NUM_FIELDS__SYS_TABLES = 10
+};
+/* The field numbers in the SYS_TABLE_IDS index */
+enum dict_fld_sys_table_ids_enum {
+ DICT_FLD__SYS_TABLE_IDS__ID = 0,
+ DICT_FLD__SYS_TABLE_IDS__NAME = 1,
+ DICT_NUM_FIELDS__SYS_TABLE_IDS = 2
+};
+/* The columns in SYS_COLUMNS */
+enum dict_col_sys_columns_enum {
+ DICT_COL__SYS_COLUMNS__TABLE_ID = 0,
+ DICT_COL__SYS_COLUMNS__POS = 1,
+ DICT_COL__SYS_COLUMNS__NAME = 2,
+ DICT_COL__SYS_COLUMNS__MTYPE = 3,
+ DICT_COL__SYS_COLUMNS__PRTYPE = 4,
+ DICT_COL__SYS_COLUMNS__LEN = 5,
+ DICT_COL__SYS_COLUMNS__PREC = 6,
+ DICT_NUM_COLS__SYS_COLUMNS = 7
+};
+/* The field numbers in the SYS_COLUMNS clustered index */
+enum dict_fld_sys_columns_enum {
+ DICT_FLD__SYS_COLUMNS__TABLE_ID = 0,
+ DICT_FLD__SYS_COLUMNS__POS = 1,
+ DICT_FLD__SYS_COLUMNS__DB_TRX_ID = 2,
+ DICT_FLD__SYS_COLUMNS__DB_ROLL_PTR = 3,
+ DICT_FLD__SYS_COLUMNS__NAME = 4,
+ DICT_FLD__SYS_COLUMNS__MTYPE = 5,
+ DICT_FLD__SYS_COLUMNS__PRTYPE = 6,
+ DICT_FLD__SYS_COLUMNS__LEN = 7,
+ DICT_FLD__SYS_COLUMNS__PREC = 8,
+ DICT_NUM_FIELDS__SYS_COLUMNS = 9
+};
+/* The columns in SYS_INDEXES */
+enum
+/* The columns in SYS_INDEXES */ +enum dict_col_sys_indexes_enum { + DICT_COL__SYS_INDEXES__TABLE_ID = 0, + DICT_COL__SYS_INDEXES__ID = 1, + DICT_COL__SYS_INDEXES__NAME = 2, + DICT_COL__SYS_INDEXES__N_FIELDS = 3, + DICT_COL__SYS_INDEXES__TYPE = 4, + DICT_COL__SYS_INDEXES__SPACE = 5, + DICT_COL__SYS_INDEXES__PAGE_NO = 6, + DICT_COL__SYS_INDEXES__MERGE_THRESHOLD = 7, + DICT_NUM_COLS__SYS_INDEXES = 8 +}; +/* The field numbers in the SYS_INDEXES clustered index */ +enum dict_fld_sys_indexes_enum { + DICT_FLD__SYS_INDEXES__TABLE_ID = 0, + DICT_FLD__SYS_INDEXES__ID = 1, + DICT_FLD__SYS_INDEXES__DB_TRX_ID = 2, + DICT_FLD__SYS_INDEXES__DB_ROLL_PTR = 3, + DICT_FLD__SYS_INDEXES__NAME = 4, + DICT_FLD__SYS_INDEXES__N_FIELDS = 5, + DICT_FLD__SYS_INDEXES__TYPE = 6, + DICT_FLD__SYS_INDEXES__SPACE = 7, + DICT_FLD__SYS_INDEXES__PAGE_NO = 8, + DICT_FLD__SYS_INDEXES__MERGE_THRESHOLD = 9, + DICT_NUM_FIELDS__SYS_INDEXES = 10 +}; +/* The columns in SYS_FIELDS */ +enum dict_col_sys_fields_enum { + DICT_COL__SYS_FIELDS__INDEX_ID = 0, + DICT_COL__SYS_FIELDS__POS = 1, + DICT_COL__SYS_FIELDS__COL_NAME = 2, + DICT_NUM_COLS__SYS_FIELDS = 3 +}; +/* The field numbers in the SYS_FIELDS clustered index */ +enum dict_fld_sys_fields_enum { + DICT_FLD__SYS_FIELDS__INDEX_ID = 0, + DICT_FLD__SYS_FIELDS__POS = 1, + DICT_FLD__SYS_FIELDS__DB_TRX_ID = 2, + DICT_FLD__SYS_FIELDS__DB_ROLL_PTR = 3, + DICT_FLD__SYS_FIELDS__COL_NAME = 4, + DICT_NUM_FIELDS__SYS_FIELDS = 5 +}; +/* The columns in SYS_FOREIGN */ +enum dict_col_sys_foreign_enum { + DICT_COL__SYS_FOREIGN__ID = 0, + DICT_COL__SYS_FOREIGN__FOR_NAME = 1, + DICT_COL__SYS_FOREIGN__REF_NAME = 2, + DICT_COL__SYS_FOREIGN__N_COLS = 3, + DICT_NUM_COLS__SYS_FOREIGN = 4 +}; +/* The field numbers in the SYS_FOREIGN clustered index */ +enum dict_fld_sys_foreign_enum { + DICT_FLD__SYS_FOREIGN__ID = 0, + DICT_FLD__SYS_FOREIGN__DB_TRX_ID = 1, + DICT_FLD__SYS_FOREIGN__DB_ROLL_PTR = 2, + DICT_FLD__SYS_FOREIGN__FOR_NAME = 3, + DICT_FLD__SYS_FOREIGN__REF_NAME = 4, + DICT_FLD__SYS_FOREIGN__N_COLS = 5, + DICT_NUM_FIELDS__SYS_FOREIGN = 6 +}; +/* The field numbers in the SYS_FOREIGN_FOR_NAME secondary index */ +enum dict_fld_sys_foreign_for_name_enum { + DICT_FLD__SYS_FOREIGN_FOR_NAME__NAME = 0, + DICT_FLD__SYS_FOREIGN_FOR_NAME__ID = 1, + DICT_NUM_FIELDS__SYS_FOREIGN_FOR_NAME = 2 +}; +/* The columns in SYS_FOREIGN_COLS */ +enum dict_col_sys_foreign_cols_enum { + DICT_COL__SYS_FOREIGN_COLS__ID = 0, + DICT_COL__SYS_FOREIGN_COLS__POS = 1, + DICT_COL__SYS_FOREIGN_COLS__FOR_COL_NAME = 2, + DICT_COL__SYS_FOREIGN_COLS__REF_COL_NAME = 3, + DICT_NUM_COLS__SYS_FOREIGN_COLS = 4 +}; +/* The field numbers in the SYS_FOREIGN_COLS clustered index */ +enum dict_fld_sys_foreign_cols_enum { + DICT_FLD__SYS_FOREIGN_COLS__ID = 0, + DICT_FLD__SYS_FOREIGN_COLS__POS = 1, + DICT_FLD__SYS_FOREIGN_COLS__DB_TRX_ID = 2, + DICT_FLD__SYS_FOREIGN_COLS__DB_ROLL_PTR = 3, + DICT_FLD__SYS_FOREIGN_COLS__FOR_COL_NAME = 4, + DICT_FLD__SYS_FOREIGN_COLS__REF_COL_NAME = 5, + DICT_NUM_FIELDS__SYS_FOREIGN_COLS = 6 +}; +/* The columns in SYS_VIRTUAL */ +enum dict_col_sys_virtual_enum { + DICT_COL__SYS_VIRTUAL__TABLE_ID = 0, + DICT_COL__SYS_VIRTUAL__POS = 1, + DICT_COL__SYS_VIRTUAL__BASE_POS = 2, + DICT_NUM_COLS__SYS_VIRTUAL = 3 +}; +/* The field numbers in the SYS_VIRTUAL clustered index */ +enum dict_fld_sys_virtual_enum { + DICT_FLD__SYS_VIRTUAL__TABLE_ID = 0, + DICT_FLD__SYS_VIRTUAL__POS = 1, + DICT_FLD__SYS_VIRTUAL__BASE_POS = 2, + DICT_FLD__SYS_VIRTUAL__DB_TRX_ID = 3, + DICT_FLD__SYS_VIRTUAL__DB_ROLL_PTR = 4, + DICT_NUM_FIELDS__SYS_VIRTUAL = 5 +}; + +/* A number of the columns
above occur in multiple tables. These are the +lengths of those fields. */ +#define DICT_FLD_LEN_SPACE 4 +#define DICT_FLD_LEN_FLAGS 4 + +#endif diff --git a/storage/innobase/include/dict0crea.h b/storage/innobase/include/dict0crea.h new file mode 100644 index 00000000..c40df12b --- /dev/null +++ b/storage/innobase/include/dict0crea.h @@ -0,0 +1,277 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0crea.h +Database object creation + +Created 1/8/1996 Heikki Tuuri +*******************************************************/ + +#ifndef dict0crea_h +#define dict0crea_h + +#include "dict0dict.h" +#include "que0types.h" +#include "row0types.h" +#include "mtr0mtr.h" +#include "fil0crypt.h" + +/*********************************************************************//** +Creates a table create graph. +@return own: table create node */ +tab_node_t* +tab_create_graph_create( +/*====================*/ + dict_table_t* table, /*!< in: table to create, built as + a memory data structure */ + mem_heap_t* heap); /*!< in: heap where created */ + +/** Creates an index create graph. +@param[in] index index to create, built as a memory data structure +@param[in] table table name +@param[in,out] heap heap where created +@param[in] mode encryption mode (for creating a table) +@param[in] key_id encryption key identifier (for creating a table) +@param[in] add_v new virtual columns added in the same clause with + add index +@return own: index create node */ +ind_node_t* +ind_create_graph_create( + dict_index_t* index, + const char* table, + mem_heap_t* heap, + fil_encryption_t mode, + uint32_t key_id, + const dict_add_v_col_t* add_v = NULL); + +/***********************************************************//** +Creates a table. This is a high-level function used in SQL execution graphs. +@return query thread to run next or NULL */ +que_thr_t* +dict_create_table_step( +/*===================*/ + que_thr_t* thr); /*!< in: query thread */ + +/***********************************************************//** +Creates an index. This is a high-level function used in SQL execution +graphs. +@return query thread to run next or NULL */ +que_thr_t* +dict_create_index_step( +/*===================*/ + que_thr_t* thr); /*!< in: query thread */ + +/***************************************************************//** +Builds an index definition but doesn't update sys_table.
*/ +void +dict_build_index_def( +/*=================*/ + const dict_table_t* table, /*!< in: table */ + dict_index_t* index, /*!< in/out: index */ + trx_t* trx); /*!< in/out: InnoDB transaction + handle */ +/***************************************************************//** +Creates an index tree for the index if it is not a member of a cluster. +Don't update SYSTEM TABLES. +@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */ +dberr_t +dict_create_index_tree( +/*===================*/ + dict_index_t* index, /*!< in/out: index */ + const trx_t* trx); /*!< in: InnoDB transaction handle */ + +/** Drop the index tree associated with a row in SYS_INDEXES table. +@param[in,out] pcur persistent cursor on rec +@param[in,out] trx dictionary transaction +@param[in,out] mtr mini-transaction +@return tablespace ID to drop (if this is the clustered index) +@retval 0 if no tablespace is to be dropped */ +uint32_t dict_drop_index_tree(btr_pcur_t *pcur, trx_t *trx, mtr_t *mtr) + MY_ATTRIBUTE((nonnull(1,3), warn_unused_result)); + +/***************************************************************//** +Creates an index tree for the index if it is not a member of a cluster. +Don't update SYSTEM TABLES. +@return error code */ +dberr_t +dict_create_index_tree_in_mem( +/*==========================*/ + dict_index_t* index, /*!< in/out: index */ + const trx_t* trx); /*!< in: InnoDB transaction handle */ + +/********************************************************************//** +Generate a foreign key constraint name when it was not named by the user. +A generated constraint has a name of the format dbname/tablename_ibfk_NUMBER, +where the numbers start from 1, and are given locally for this table, that is, +the number is not global, as it used to be before MySQL 4.0.18. */ +UNIV_INLINE +dberr_t +dict_create_add_foreign_id( +/*=======================*/ + ulint* id_nr, /*!< in/out: number to use in id + generation; incremented if used */ + const char* name, /*!< in: table name */ + dict_foreign_t* foreign); /*!< in/out: foreign key */ + +/** Adds the given set of foreign key objects to the dictionary tables +in the database. This function does not modify the dictionary cache. The +caller must ensure that all foreign key objects contain a valid constraint +name in foreign->id. +@param[in] local_fk_set set of foreign key objects, to be added to +the dictionary tables +@param[in] table table to which the foreign key objects in +local_fk_set belong to +@param[in,out] trx transaction +@return error code or DB_SUCCESS */ +dberr_t +dict_create_add_foreigns_to_dictionary( +/*===================================*/ + const dict_foreign_set& local_fk_set, + const dict_table_t* table, + trx_t* trx) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Check if a foreign constraint is on columns serving as base columns +of any stored column. This is to prevent creating SET NULL or CASCADE +constraint on such columns. +@param[in] local_fk_set set of foreign key objects, to be added to +the dictionary tables +@param[in] table table to which the foreign key objects in +local_fk_set belong to +@return true if yes, otherwise false */ +bool +dict_foreigns_has_s_base_col( + const dict_foreign_set& local_fk_set, + const dict_table_t* table); + +/********************************************************************//** +Add a foreign key definition to the data dictionary tables.
+@return error code or DB_SUCCESS */ +dberr_t +dict_create_add_foreign_to_dictionary( +/*==================================*/ + const char* name, /*!< in: table name */ + const dict_foreign_t* foreign,/*!< in: foreign key */ + trx_t* trx) /*!< in/out: dictionary transaction */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/* Table create node structure */ +struct tab_node_t{ + que_common_t common; /*!< node type: QUE_NODE_TABLE_CREATE */ + dict_table_t* table; /*!< table to create, built as a + memory data structure with + dict_mem_... functions */ + ins_node_t* tab_def; /*!< child node which does the insert of + the table definition; the row to be + inserted is built by the parent node */ + ins_node_t* col_def; /*!< child node which does the inserts + of the column definitions; the row to + be inserted is built by the parent + node */ + ins_node_t* v_col_def; /*!< child node which does the inserts + of the sys_virtual row definitions; + the row to be inserted is built by + the parent node */ + /*----------------------*/ + /* Local storage for this graph node */ + ulint state; /*!< node execution state */ + ulint col_no; /*!< next column definition to insert */ + ulint base_col_no; /*!< next base column to insert */ + mem_heap_t* heap; /*!< memory heap used as auxiliary + storage */ +}; + +/* Table create node states */ +#define TABLE_BUILD_TABLE_DEF 1 +#define TABLE_BUILD_COL_DEF 2 +#define TABLE_BUILD_V_COL_DEF 3 +#define TABLE_ADD_TO_CACHE 4 +#define TABLE_COMPLETED 5 + +/* Index create node struct */ + +struct ind_node_t{ + que_common_t common; /*!< node type: QUE_NODE_INDEX_CREATE */ + dict_index_t* index; /*!< index to create, built as a + memory data structure with + dict_mem_... functions */ + const char* table_name; /*!< table name */ + ins_node_t* ind_def; /*!< child node which does the insert of + the index definition; the row to be + inserted is built by the parent node */ + ins_node_t* field_def; /*!< child node which does the inserts + of the field definitions; the row to + be inserted is built by the parent + node */ + /*----------------------*/ + /* Local storage for this graph node */ + ulint state; /*!< node execution state */ + uint32_t page_no; /* root page number of the index */ + dtuple_t* ind_row; /* index definition row built */ + ulint field_no; /* next field definition to insert */ + mem_heap_t* heap; /*!< memory heap used as auxiliary + storage */ + uint key_id; /*!< encryption key_id */ + fil_encryption_t mode; /*!< encryption mode */ + const dict_add_v_col_t* + add_v; /*!< new virtual columns that are being + added along with an add index call */ +};
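For illustration, the generated constraint id format described for dict_create_add_foreign_id() (declared earlier in this header) can be sketched as follows; make_ibfk_id() is a hypothetical helper, not part of InnoDB:

#include <cstddef>
#include <cstdio>

static void make_ibfk_id(char *buf, std::size_t size,
                         const char *table_name, unsigned long seq)
{
  /* e.g. table_name "db1/child", seq 1 -> "db1/child_ibfk_1" */
  std::snprintf(buf, size, "%s_ibfk_%lu", table_name, seq);
}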
+ +/** Compose a column number for a virtual column, stored in the "POS" field +of Sys_columns. The column number includes both its virtual column sequence +(the "nth" virtual column) and its actual column position in original table +@param[in] v_pos virtual column sequence +@param[in] col_pos column position in original table definition +@return composed column position number */ +UNIV_INLINE +ulint +dict_create_v_col_pos( + ulint v_pos, + ulint col_pos); + +/** Get the column number for a virtual column (the column position in +original table), stored in the "POS" field of Sys_columns +@param[in] pos virtual column position +@return column position in original table */ +UNIV_INLINE +ulint +dict_get_v_col_mysql_pos( + ulint pos); + +/** Get a virtual column sequence (the "nth" virtual column) for a +virtual column, stored in the "POS" field of Sys_columns +@param[in] pos virtual column position +@return virtual column sequence */ +UNIV_INLINE +ulint +dict_get_v_col_pos( + ulint pos); + +/* Index create node states */ +#define INDEX_BUILD_INDEX_DEF 1 +#define INDEX_BUILD_FIELD_DEF 2 +#define INDEX_CREATE_INDEX_TREE 3 +#define INDEX_ADD_TO_CACHE 4 + +#include "dict0crea.inl" + +#endif diff --git a/storage/innobase/include/dict0crea.inl b/storage/innobase/include/dict0crea.inl new file mode 100644 index 00000000..5641206d --- /dev/null +++ b/storage/innobase/include/dict0crea.inl @@ -0,0 +1,136 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0crea.inl +Database object creation + +Created 1/8/1996 Heikki Tuuri +*******************************************************/ + +#include "ha_prototypes.h" + +#include "mem0mem.h" + +/********************************************************************//** +Generate a foreign key constraint name when it was not named by the user. +A generated constraint has a name of the format dbname/tablename_ibfk_NUMBER, +where the numbers start from 1, and are given locally for this table, that is, +the number is not global, as it used to be before MySQL 4.0.18.
*/ +UNIV_INLINE +dberr_t +dict_create_add_foreign_id( +/*=======================*/ + ulint* id_nr, /*!< in/out: number to use in id generation; + incremented if used */ + const char* name, /*!< in: table name */ + dict_foreign_t* foreign)/*!< in/out: foreign key */ +{ + DBUG_ENTER("dict_create_add_foreign_id"); + + if (foreign->id == NULL) { + /* Generate a new constraint id */ + ulint namelen = strlen(name); + char* id = static_cast<char*>( + mem_heap_alloc(foreign->heap, + namelen + 20)); + + if (dict_table_t::is_temporary_name(name)) { + + /* no overflow if number < 1e13 */ + sprintf(id, "%s_ibfk_%lu", name, + (ulong) (*id_nr)++); + } else { + char table_name[MAX_TABLE_NAME_LEN + 21]; + uint errors = 0; + + strncpy(table_name, name, (sizeof table_name) - 1); + table_name[(sizeof table_name) - 1] = '\0'; + + innobase_convert_to_system_charset( + strchr(table_name, '/') + 1, + strchr(name, '/') + 1, + MAX_TABLE_NAME_LEN, &errors); + + if (errors) { + strncpy(table_name, name, + (sizeof table_name) - 1); + table_name[(sizeof table_name) - 1] = '\0'; + } + + /* no overflow if number < 1e13 */ + sprintf(id, "%s_ibfk_%lu", table_name, + (ulong) (*id_nr)++); + + if (innobase_check_identifier_length( + strchr(id,'/') + 1)) { + DBUG_RETURN(DB_IDENTIFIER_TOO_LONG); + } + } + foreign->id = id; + + DBUG_PRINT("dict_create_add_foreign_id", + ("generated foreign id: %s", id)); + } + + DBUG_RETURN(DB_SUCCESS); +} + +/** Compose a column number for a virtual column, stored in the "POS" field +of Sys_columns. The column number includes both its virtual column sequence +(the "nth" virtual column) and its actual column position in original table +@param[in] v_pos virtual column sequence +@param[in] col_pos column position in original table definition +@return composed column position number */ +UNIV_INLINE +ulint +dict_create_v_col_pos( + ulint v_pos, + ulint col_pos) +{ + ut_ad(v_pos <= REC_MAX_N_FIELDS); + ut_ad(col_pos <= REC_MAX_N_FIELDS); + + return(((v_pos + 1) << 16) + col_pos); +} + +/** Get the column number for a virtual column (the column position in +original table), stored in the "POS" field of Sys_columns +@param[in] pos virtual column position +@return column position in original table */ +UNIV_INLINE +ulint +dict_get_v_col_mysql_pos( + ulint pos) +{ + return(pos & 0xFFFF); +} + +/** Get a virtual column sequence (the "nth" virtual column) for a +virtual column, stored in the "POS" field of Sys_columns +@param[in] pos virtual column position +@return virtual column sequence */ +UNIV_INLINE +ulint +dict_get_v_col_pos( + ulint pos) +{ + return((pos >> 16) - 1); +}
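A round-trip through the "POS" encoding implemented above, assuming ulint and the three inline functions are in scope:

#include <cassert>

static void pos_encoding_example()
{
  ulint pos = dict_create_v_col_pos(2, 5); /* 3rd virtual column, column 5 */
  assert(pos == ((2 + 1) << 16) + 5);      /* == 196613 */
  assert(dict_get_v_col_mysql_pos(pos) == 5); /* low 16 bits */
  assert(dict_get_v_col_pos(pos) == 2);       /* (pos >> 16) - 1 */
}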
diff --git a/storage/innobase/include/dict0defrag_bg.h b/storage/innobase/include/dict0defrag_bg.h new file mode 100644 index 00000000..679484ad --- /dev/null +++ b/storage/innobase/include/dict0defrag_bg.h @@ -0,0 +1,101 @@ +/***************************************************************************** + +Copyright (c) 2016, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0defrag_bg.h +Code used for background table and index +defragmentation + +Created 25/08/2016 Jan Lindström +*******************************************************/ + +#ifndef dict0defrag_bg_h +#define dict0defrag_bg_h + +#include "dict0types.h" + +/** Indices whose defrag stats need to be saved to persistent storage.*/ +struct defrag_pool_item_t { + table_id_t table_id; + index_id_t index_id; +}; + +/** Allocator type, used by std::vector */ +typedef ut_allocator<defrag_pool_item_t> + defrag_pool_allocator_t; + +/** The multitude of tables to be defragmented - an STL vector */ +typedef std::vector<defrag_pool_item_t, defrag_pool_allocator_t> + defrag_pool_t; + +/** Pool where we store information on which tables are to be processed +by background defragmentation. */ +extern defrag_pool_t defrag_pool; + +/*****************************************************************//** +Initialize the defrag pool, called once during thread initialization. */ +void +dict_defrag_pool_init(void); +/*========================*/ + +/*****************************************************************//** +Free the resources occupied by the defrag pool, called once during +thread de-initialization. */ +void +dict_defrag_pool_deinit(void); +/*==========================*/ + +/*****************************************************************//** +Add an index in a table to the defrag pool, which is processed by the +background stats gathering thread. Only the table id and index id are +added to the list, so the table can be closed after being enqueued and +it will be opened when needed. If the table or index does not exist later +(has been DROPped), then it will be removed from the pool and skipped. */ +void +dict_stats_defrag_pool_add( +/*=======================*/ + const dict_index_t* index); /*!< in: table to add */ + +/*****************************************************************//** +Delete a given index from the auto defrag pool. */ +void +dict_stats_defrag_pool_del( +/*=======================*/ + const dict_table_t* table, /*!
+@return complete table name with database and table name, allocated from +heap memory passed in */ +char* +dict_get_referenced_table( +/*======================*/ + const char* name, /*!< in: foreign key table name */ + const char* database_name, /*!< in: table db name */ + ulint database_name_len,/*!< in: db name length */ + const char* table_name, /*!< in: table name */ + ulint table_name_len, /*!< in: table name length */ + dict_table_t** table, /*!< out: table object or NULL */ + mem_heap_t* heap, /*!< in: heap memory */ + CHARSET_INFO* from_cs); /*!< in: table name charset */ +/*********************************************************************//** +Frees a foreign key struct. */ +void +dict_foreign_free( +/*==============*/ + dict_foreign_t* foreign); /*!< in, own: foreign key struct */ +/*********************************************************************//** +Finds the highest [number] for foreign key constraints of the table. Looks +only at the >= 4.0.18-format id's, which are of the form +databasename/tablename_ibfk_[number]. +@return highest number, 0 if table has no new format foreign key constraints */ +ulint +dict_table_get_highest_foreign_id( +/*==============================*/ + dict_table_t* table); /*!< in: table in the dictionary + memory cache */ +/** Check whether the dict_table_t is a partition. +A partitioned table on the SQL level is composed of InnoDB tables, +where each InnoDB table is a [sub]partition including its secondary indexes +which belongs to the partition. +@param[in] table Table to check. +@return true if the dict_table_t is a partition else false. */ +UNIV_INLINE +bool +dict_table_is_partition(const dict_table_t* table) +{ + /* Check both P and p on all platforms in case it was moved to/from + WIN. */ + return (strstr(table->name.m_name, "#p#") + || strstr(table->name.m_name, "#P#")); +} +/********************************************************************//** +Return the end of table name where we have removed dbname and '/'. +@return table name */ +const char* +dict_remove_db_name( +/*================*/ + const char* name) /*!< in: table name in the form + dbname '/' tablename */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Operation to perform when opening a table */ +enum dict_table_op_t { + /** Expect the tablespace to exist. */ + DICT_TABLE_OP_NORMAL = 0, + /** Drop any orphan indexes after an aborted online index creation */ + DICT_TABLE_OP_DROP_ORPHAN, + /** Silently load the tablespace if it does not exist, + and do not load the definitions of incomplete indexes. */ + DICT_TABLE_OP_LOAD_TABLESPACE, + /** Open the table only if it's in table cache. */ + DICT_TABLE_OP_OPEN_ONLY_IF_CACHED +}; + +/** Acquire MDL shared for the table name. +@tparam trylock whether to use non-blocking operation +@param[in,out] table table object +@param[in,out] thd background thread +@param[out] mdl mdl ticket +@param[in] table_op operation to perform when opening +@return table object after locking MDL shared +@retval NULL if the table is not readable, or if trylock && MDL blocked */ +template +dict_table_t* +dict_acquire_mdl_shared(dict_table_t *table, + THD *thd, + MDL_ticket **mdl, + dict_table_op_t table_op= DICT_TABLE_OP_NORMAL); + +/** Look up a table by numeric identifier. 
+ +/** Look up a table by numeric identifier. +@param[in] table_id table identifier +@param[in] dict_locked data dictionary locked +@param[in] table_op operation to perform when opening +@param[in,out] thd background thread, or NULL to not acquire MDL +@param[out] mdl mdl ticket, or NULL +@return table, NULL if it does not exist */ +dict_table_t* +dict_table_open_on_id(table_id_t table_id, bool dict_locked, + dict_table_op_t table_op, THD *thd= nullptr, + MDL_ticket **mdl= nullptr) + MY_ATTRIBUTE((warn_unused_result)); + +/** Decrement the count of open handles */ +void dict_table_close(dict_table_t *table); + +/** Decrements the count of open handles of a table. +@param[in,out] table table +@param[in] dict_locked whether dict_sys.latch is being held +@param[in] thd thread to release MDL +@param[in] mdl metadata lock or NULL if the thread is a + foreground one. */ +void +dict_table_close( + dict_table_t* table, + bool dict_locked, + THD* thd = NULL, + MDL_ticket* mdl = NULL); + +/*********************************************************************//** +Gets the minimum number of bytes per character. +@return minimum multi-byte char size, in bytes */ +UNIV_INLINE +unsigned +dict_col_get_mbminlen( +/*==================*/ + const dict_col_t* col) /*!< in: column */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*********************************************************************//** +Gets the maximum number of bytes per character. +@return maximum multi-byte char size, in bytes */ +UNIV_INLINE +unsigned +dict_col_get_mbmaxlen( +/*==================*/ + const dict_col_t* col) /*!< in: column */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*********************************************************************//** +Gets the column data type. */ +UNIV_INLINE +void +dict_col_copy_type( +/*===============*/ + const dict_col_t* col, /*!< in: column */ + dtype_t* type); /*!< out: data type */ + +/**********************************************************************//** +Determine bytes of column prefix to be stored in the undo log. Please +note that if !dict_table_has_atomic_blobs(table), no prefix +needs to be stored in the undo log. +@return bytes of column prefix to be stored in the undo log */ +UNIV_INLINE +ulint +dict_max_field_len_store_undo( +/*==========================*/ + dict_table_t* table, /*!< in: table */ + const dict_col_t* col) /*!< in: column which index prefix + is based on */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Determine maximum bytes of a virtual column that need to be stored +in the undo log. +@param[in] table dict_table_t for the table +@param[in] col_no virtual column number +@return maximum bytes of virtual column to be stored in the undo log */ +UNIV_INLINE +ulint +dict_max_v_field_len_store_undo( + dict_table_t* table, + ulint col_no); + +#ifdef UNIV_DEBUG +/*********************************************************************//** +Assert that a column and a data type match. +@return TRUE */ +UNIV_INLINE +ibool +dict_col_type_assert_equal( +/*=======================*/ + const dict_col_t* col, /*!< in: column */ + const dtype_t* type) /*!< in: data type */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +#endif /* UNIV_DEBUG */ + +/***********************************************************************//** +Returns the minimum size of the column.
+@return minimum size */ +UNIV_INLINE +unsigned +dict_col_get_min_size( +/*==================*/ + const dict_col_t* col) /*!< in: column */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/***********************************************************************//** +Returns the maximum size of the column. +@return maximum size */ +UNIV_INLINE +ulint +dict_col_get_max_size( +/*==================*/ + const dict_col_t* col) /*!< in: column */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/***********************************************************************//** +Returns the size of a fixed size column, 0 if not a fixed size column. +@return fixed size, or 0 */ +UNIV_INLINE +unsigned +dict_col_get_fixed_size( +/*====================*/ + const dict_col_t* col, /*!< in: column */ + ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/***********************************************************************//** +Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a column. +For fixed length types it is the fixed length of the type, otherwise 0. +@return SQL null storage size in ROW_FORMAT=REDUNDANT */ +UNIV_INLINE +unsigned +dict_col_get_sql_null_size( +/*=======================*/ + const dict_col_t* col, /*!< in: column */ + ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*********************************************************************//** +Gets the column number. +@return col->ind, table column position (starting from 0) */ +UNIV_INLINE +unsigned +dict_col_get_no( +/*============*/ + const dict_col_t* col) /*!< in: column */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*********************************************************************//** +Gets the column position in the clustered index. */ +UNIV_INLINE +ulint +dict_col_get_clust_pos( +/*===================*/ + const dict_col_t* col, /*!< in: table column */ + const dict_index_t* clust_index) /*!< in: clustered index */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Gets the column position in the given index. +@param[in] col table column +@param[in] index index to be searched for column +@return position of column in the given index. */ +UNIV_INLINE +ulint +dict_col_get_index_pos( + const dict_col_t* col, + const dict_index_t* index) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/****************************************************************//** +If the given column name is reserved for InnoDB system columns, return +TRUE. +@return TRUE if name is reserved */ +ibool +dict_col_name_is_reserved( +/*======================*/ + const char* name) /*!< in: column name */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/** Unconditionally set the AUTO_INCREMENT counter. +@param[in,out] table table or partition +@param[in] value next available AUTO_INCREMENT value */ +MY_ATTRIBUTE((nonnull)) +UNIV_INLINE +void +dict_table_autoinc_initialize(dict_table_t* table, ib_uint64_t value) +{ + table->autoinc = value; +} + +/** +@param[in] table table or partition +@return the next AUTO_INCREMENT counter value +@retval 0 if AUTO_INCREMENT is not yet initialized */ +MY_ATTRIBUTE((nonnull, warn_unused_result)) +UNIV_INLINE +ib_uint64_t +dict_table_autoinc_read(const dict_table_t* table) +{ + return(table->autoinc); +} + +/** Update the AUTO_INCREMENT sequence if the value supplied is greater +than the current value. 
+@param[in,out] table table or partition +@param[in] value AUTO_INCREMENT value that was assigned to a row +@return whether the AUTO_INCREMENT sequence was updated */ +MY_ATTRIBUTE((nonnull)) +UNIV_INLINE +bool +dict_table_autoinc_update_if_greater(dict_table_t* table, ib_uint64_t value) +{ + if (value > table->autoinc) { + + table->autoinc = value; + return(true); + } + + return(false); +} + +/**********************************************************************//** +Adds system columns to a table object. */ +void +dict_table_add_system_columns( +/*==========================*/ + dict_table_t* table, /*!< in/out: table */ + mem_heap_t* heap) /*!< in: temporary heap */ + MY_ATTRIBUTE((nonnull)); +/**********************************************************************//** +Renames a table object. +@return DB_SUCCESS or error code */ +dberr_t +dict_table_rename_in_cache( +/*=======================*/ + dict_table_t* table, /*!< in/out: table */ + span<const char> new_name, /*!< in: new name */ + bool replace_new_file) + /*!< in: whether to replace the + file with the new name + (as part of rolling back TRUNCATE) */ + MY_ATTRIBUTE((nonnull)); + +/** Removes an index from the dictionary cache. +@param[in,out] table table whose index to remove +@param[in,out] index index to remove, this object is destroyed and must not +be accessed by the caller afterwards */ +void +dict_index_remove_from_cache( + dict_table_t* table, + dict_index_t* index); + +/**********************************************************************//** +Change the id of a table object in the dictionary cache. This is used in +DISCARD TABLESPACE. */ +void +dict_table_change_id_in_cache( +/*==========================*/ + dict_table_t* table, /*!< in/out: table object already in cache */ + table_id_t new_id) /*!< in: new id to set */ + MY_ATTRIBUTE((nonnull)); +/**********************************************************************//** +Removes a foreign constraint struct from the dictionary cache. */ +void +dict_foreign_remove_from_cache( +/*===========================*/ + dict_foreign_t* foreign) /*!< in, own: foreign constraint */ + MY_ATTRIBUTE((nonnull)); +/**********************************************************************//** +Adds a foreign key constraint object to the dictionary cache. May free +the object if there already is an object with the same identifier in the +cache. At least one of foreign table or referenced table must already be in +the dictionary cache! +@return DB_SUCCESS or error code */ +dberr_t +dict_foreign_add_to_cache( +/*======================*/ + dict_foreign_t* foreign, + /*!< in, own: foreign key constraint */ + const char** col_names, + /*!< in: column names, or NULL to use + foreign->foreign_table->col_names */ + bool check_charsets, + /*!< in: whether to check charset + compatibility */ + dict_err_ignore_t ignore_err) + /*!< in: error to be ignored */ + MY_ATTRIBUTE((nonnull(1), warn_unused_result)); +/**********************************************************************//** +Replace the index passed in with another equivalent index in the +foreign key lists of the table.
+@return whether all replacements were found */ +bool +dict_foreign_replace_index( +/*=======================*/ + dict_table_t* table, /*!< in/out: table */ + const char** col_names, + /*!< in: column names, or NULL + to use table->col_names */ + const dict_index_t* index) /*!< in: index to be replaced */ + MY_ATTRIBUTE((nonnull(1,3), warn_unused_result)); +/**********************************************************************//** +Parses the CONSTRAINT id's to be dropped in an ALTER TABLE statement. +@return DB_SUCCESS or DB_CANNOT_DROP_CONSTRAINT if syntax error or the +constraint id does not match */ +dberr_t +dict_foreign_parse_drop_constraints( +/*================================*/ + mem_heap_t* heap, /*!< in: heap from which we can + allocate memory */ + trx_t* trx, /*!< in: transaction */ + dict_table_t* table, /*!< in: table */ + ulint* n, /*!< out: number of constraints + to drop */ + const char*** constraints_to_drop) /*!< out: id's of the + constraints to drop */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/**********************************************************************//** +Returns a table object and increments its open handle count. +NOTE! This is a high-level function to be used mainly from outside the +'dict' directory. Inside this directory dict_table_get_low +is usually the appropriate function. +@param[in] table_name Table name +@param[in] dict_locked whether dict_sys.latch is being held exclusively +@param[in] ignore_err error to be ignored when loading the table +@return table +@retval nullptr if it does not exist */ +dict_table_t* +dict_table_open_on_name( + const char* table_name, + bool dict_locked, + dict_err_ignore_t ignore_err) + MY_ATTRIBUTE((warn_unused_result)); + +/** Outcome of dict_foreign_find_index() or dict_foreign_qualify_index() */ +enum fkerr_t +{ + /** A backing index was found for a FOREIGN KEY constraint */ + FK_SUCCESS = 0, + /** There is no index that covers the columns in the constraint. */ + FK_INDEX_NOT_FOUND, + /** The index is for a prefix index, not a full column. */ + FK_IS_PREFIX_INDEX, + /** A condition of SET NULL conflicts with a NOT NULL column. */ + FK_COL_NOT_NULL, + /** The column types do not match */ + FK_COLS_NOT_EQUAL +}; + +/*********************************************************************//** +Tries to find an index whose first fields are the columns in the array +in the same order, which is not marked for deletion and is not the same +as types_idx. +@return matching index, NULL if not found */ +dict_index_t* +dict_foreign_find_index( +/*====================*/ + const dict_table_t* table, /*!< in: table */ + const char** col_names, + /*!< in: column names, or NULL + to use table->col_names */ + const char** columns,/*!< in: array of column names */ + ulint n_cols, /*!< in: number of columns */ + const dict_index_t* types_idx, + /*!< in: NULL or an index + whose types the column types + must match */ + bool check_charsets, + /*!< in: whether to check + charsets. only has an effect + if types_idx != NULL */ + ulint check_null, + /*!< in: nonzero if none of + the columns must be declared + NOT NULL */ + fkerr_t* error = NULL, /*!< out: error code */ + ulint* err_col_no = NULL, + /*!< out: column number where + error happened */ + dict_index_t** err_index = NULL) + /*!< out: index where error + happened */ + + MY_ATTRIBUTE((nonnull(1,3), warn_unused_result)); + +/** Returns a virtual column's name. +@param[in] table table object +@param[in] col_nr virtual column number (nth virtual column) +@return column name.
*/ +const char* +dict_table_get_v_col_name( + const dict_table_t* table, + ulint col_nr); + +/** Check if the table has a given column. +@param[in] table table object +@param[in] col_name column name +@param[in] col_nr column number guessed, 0 as default +@return column number if the table has the specified column, +otherwise table->n_def */ +ulint +dict_table_has_column( + const dict_table_t* table, + const char* col_name, + ulint col_nr = 0); + +/**********************************************************************//** +Outputs info on foreign keys of a table. */ +std::string +dict_print_info_on_foreign_keys( +/*============================*/ + ibool create_table_format, /*!< in: if TRUE then print in + a format suitable to be inserted into + a CREATE TABLE, otherwise in the format + of SHOW TABLE STATUS */ + trx_t* trx, /*!< in: transaction */ + dict_table_t* table); /*!< in: table */ + +/**********************************************************************//** +Outputs info on a foreign key of a table in a format suitable for +CREATE TABLE. */ +std::string +dict_print_info_on_foreign_key_in_create_format( +/*============================================*/ + trx_t* trx, /*!< in: transaction */ + dict_foreign_t* foreign, /*!< in: foreign key constraint */ + ibool add_newline); /*!< in: whether to add a newline */ + +/*********************************************************************//** +Checks whether an index's first fields are the columns in the array +in the same order, and the index is not marked for deletion and is not +the same as types_idx. +@return whether the index qualifies */ +bool +dict_foreign_qualify_index( +/*====================*/ + const dict_table_t* table, /*!< in: table */ + const char** col_names, + /*!< in: column names, or NULL + to use table->col_names */ + const char** columns,/*!< in: array of column names */ + ulint n_cols, /*!< in: number of columns */ + const dict_index_t* index, /*!< in: index to check */ + const dict_index_t* types_idx, + /*!< in: NULL or an index + whose types the column types + must match */ + bool check_charsets, + /*!< in: whether to check + charsets. only has an effect + if types_idx != NULL */ + ulint check_null, + /*!< in: nonzero if none of + the columns must be declared + NOT NULL */ + fkerr_t* error, /*!< out: error code */ + ulint* err_col_no, + /*!< out: column number where + error happened */ + dict_index_t** err_index) + /*!< out: index where error + happened */ + MY_ATTRIBUTE((nonnull(1,3), warn_unused_result)); +#ifdef UNIV_DEBUG +/********************************************************************//** +Gets the first index on the table (the clustered index). +@return index, NULL if none exists */ +UNIV_INLINE +dict_index_t* +dict_table_get_first_index( +/*=======================*/ + const dict_table_t* table) /*!< in: table */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/********************************************************************//** +Gets the last index on the table. +@return index, NULL if none exists */ +UNIV_INLINE +dict_index_t* +dict_table_get_last_index( +/*=======================*/ + const dict_table_t* table) /*!< in: table */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/********************************************************************//** +Gets the next index on the table.
+@return index, NULL if none left */ +UNIV_INLINE +dict_index_t* +dict_table_get_next_index( +/*======================*/ + const dict_index_t* index) /*!< in: index */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +#else /* UNIV_DEBUG */ +# define dict_table_get_first_index(table) UT_LIST_GET_FIRST((table)->indexes) +# define dict_table_get_last_index(table) UT_LIST_GET_LAST((table)->indexes) +# define dict_table_get_next_index(index) UT_LIST_GET_NEXT(indexes, index) +#endif /* UNIV_DEBUG */ + +#define dict_index_is_clust(index) (index)->is_clust() +#define dict_index_is_auto_gen_clust(index) (index)->is_gen_clust() +#define dict_index_is_unique(index) (index)->is_unique() +#define dict_index_is_spatial(index) (index)->is_spatial() +#define dict_index_is_ibuf(index) (index)->is_ibuf() +#define dict_index_is_sec_or_ibuf(index) !(index)->is_primary() +#define dict_index_has_virtual(index) (index)->has_virtual() + +/** Get all the FTS indexes on a table. +@param[in] table table +@param[out] indexes all FTS indexes on this table +@return number of FTS indexes */ +ulint +dict_table_get_all_fts_indexes( + const dict_table_t* table, + ib_vector_t* indexes); + +/********************************************************************//** +Gets the number of user-defined non-virtual columns in a table in the +dictionary cache. +@return number of user-defined (e.g., not ROW_ID) non-virtual +columns of a table */ +UNIV_INLINE +unsigned +dict_table_get_n_user_cols( +/*=======================*/ + const dict_table_t* table) /*!< in: table */ + MY_ATTRIBUTE((warn_unused_result)); +/********************************************************************//** +Gets the number of all non-virtual columns (also system) in a table +in the dictionary cache. +@return number of columns of a table */ +UNIV_INLINE +unsigned +dict_table_get_n_cols( +/*==================*/ + const dict_table_t* table) /*!< in: table */ + MY_ATTRIBUTE((warn_unused_result)); + +/** Gets the number of virtual columns in a table in the dictionary cache. +@param[in] table the table to check +@return number of virtual columns of a table */ +UNIV_INLINE +unsigned +dict_table_get_n_v_cols( + const dict_table_t* table); + +/** Check if a table has indexed virtual columns +@param[in] table the table to check +@return true if the table has indexed virtual columns */ +UNIV_INLINE +bool +dict_table_has_indexed_v_cols( + const dict_table_t* table); + +/********************************************************************//** +Gets the estimated number of rows in the table. +@return estimated number of rows */ +UNIV_INLINE +ib_uint64_t +dict_table_get_n_rows( +/*==================*/ + const dict_table_t* table) /*!< in: table */ + MY_ATTRIBUTE((warn_unused_result)); +/********************************************************************//** +Increment the number of rows in the table by one. +Notice that this operation is not protected by any latch; the number is +approximate. */ +UNIV_INLINE +void +dict_table_n_rows_inc( +/*==================*/ + dict_table_t* table) /*!< in/out: table */ + MY_ATTRIBUTE((nonnull));
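The first/next accessors above are typically combined into the following iteration pattern over a table's indexes (a sketch, assuming an open dict_table_t* named table):

for (const dict_index_t *index = dict_table_get_first_index(table);
     index != NULL;
     index = dict_table_get_next_index(index)) {
  /* inspect one index per iteration; the clustered index comes first */
}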
+/********************************************************************//** +Decrement the number of rows in the table by one. +Notice that this operation is not protected by any latch; the number is +approximate. */ +UNIV_INLINE +void +dict_table_n_rows_dec( +/*==================*/ + dict_table_t* table) /*!< in/out: table */ + MY_ATTRIBUTE((nonnull)); + +/** Get nth virtual column +@param[in] table target table +@param[in] col_nr column number in MySQL Table definition +@return dict_v_col_t ptr */ +dict_v_col_t* +dict_table_get_nth_v_col_mysql( + const dict_table_t* table, + ulint col_nr); + +#ifdef UNIV_DEBUG +/********************************************************************//** +Gets the nth column of a table. +@return pointer to column object */ +UNIV_INLINE +dict_col_t* +dict_table_get_nth_col( +/*===================*/ + const dict_table_t* table, /*!< in: table */ + ulint pos) /*!< in: position of column */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/** Gets the nth virtual column of a table. +@param[in] table table +@param[in] pos position of virtual column +@return pointer to virtual column object */ +UNIV_INLINE +dict_v_col_t* +dict_table_get_nth_v_col( + const dict_table_t* table, + ulint pos); +/********************************************************************//** +Gets the given system column of a table. +@return pointer to column object */ +UNIV_INLINE +dict_col_t* +dict_table_get_sys_col( +/*===================*/ + const dict_table_t* table, /*!< in: table */ + unsigned sys) /*!< in: DATA_ROW_ID, ... */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +#else /* UNIV_DEBUG */ +#define dict_table_get_nth_col(table, pos) (&(table)->cols[pos]) +#define dict_table_get_sys_col(table, sys) \ + &(table)->cols[(table)->n_cols + (sys) - DATA_N_SYS_COLS] +/* Gets the nth virtual column */ +#define dict_table_get_nth_v_col(table, pos) (&(table)->v_cols[pos]) +#endif /* UNIV_DEBUG */ +/** Wrapper function. +@see dict_col_t::name() +@param[in] table table +@param[in] col_nr column number in table +@return column name */ +inline +const char* +dict_table_get_col_name(const dict_table_t* table, ulint col_nr) +{ + return(dict_table_get_nth_col(table, col_nr)->name(*table)); +} + +/********************************************************************//** +Gets the given system column number of a table. +@return column number */ +UNIV_INLINE +unsigned +dict_table_get_sys_col_no( +/*======================*/ + const dict_table_t* table, /*!< in: table */ + unsigned sys) /*!< in: DATA_ROW_ID, ... */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/********************************************************************//** +Returns the minimum data size of an index record. +@return minimum data size in bytes */ +UNIV_INLINE +unsigned +dict_index_get_min_size( +/*====================*/ + const dict_index_t* index) /*!< in: index */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +#define dict_table_is_comp(table) (table)->not_redundant() + +/** Determine if a table uses atomic BLOBs (no locally stored prefix).
+@param[in] table InnoDB table +@return whether BLOBs are atomic */ +inline +bool +dict_table_has_atomic_blobs(const dict_table_t* table) +{ + return(DICT_TF_HAS_ATOMIC_BLOBS(table->flags)); +} + +/** @return potential max length stored inline for externally stored fields */ +inline size_t dict_table_t::get_overflow_field_local_len() const +{ + if (dict_table_has_atomic_blobs(this)) { + /* ROW_FORMAT=DYNAMIC or ROW_FORMAT=COMPRESSED: do not + store any BLOB prefix locally */ + return BTR_EXTERN_FIELD_REF_SIZE; + } + /* up to MySQL 5.1: store a 768-byte prefix locally */ + return BTR_EXTERN_FIELD_REF_SIZE + DICT_ANTELOPE_MAX_INDEX_COL_LEN; +} + +/** Set the various values in a dict_table_t::flags pointer. +@param[in,out] flags Pointer to a 4 byte Table Flags +@param[in] format File Format +@param[in] zip_ssize Zip Shift Size +@param[in] use_data_dir Table uses DATA DIRECTORY +@param[in] page_compressed Table uses page compression +@param[in] page_compression_level Page compression level */ +UNIV_INLINE +void +dict_tf_set( + ulint* flags, + rec_format_t format, + ulint zip_ssize, + bool use_data_dir, + bool page_compressed, + ulint page_compression_level); + +/** Convert a 32 bit integer table flags to the 32 bit FSP Flags. +Fsp Flags are written into the tablespace header at the offset +FSP_SPACE_FLAGS and are also stored in the fil_space_t::flags field. +The following chart shows the translation of the low order bit. +Other bits are the same. +========================= Low order bit ========================== + | REDUNDANT | COMPACT | COMPRESSED | DYNAMIC +dict_table_t::flags | 0 | 1 | 1 | 1 +fil_space_t::flags | 0 | 0 | 1 | 1 +================================================================== +@param[in] table_flags dict_table_t::flags +@return tablespace flags (fil_space_t::flags) */ +inline uint32_t dict_tf_to_fsp_flags(unsigned table_flags) + MY_ATTRIBUTE((const)); + +/** Extract the ROW_FORMAT=COMPRESSED page size from table flags. +@param[in] flags flags +@return ROW_FORMAT=COMPRESSED page size +@retval 0 if not compressed */ +inline ulint dict_tf_get_zip_size(ulint flags) +{ + flags &= DICT_TF_MASK_ZIP_SSIZE; + return flags + ? (UNIV_ZIP_SIZE_MIN >> 1) + << (FSP_FLAGS_GET_ZIP_SSIZE(flags >> DICT_TF_POS_ZIP_SSIZE + << FSP_FLAGS_POS_ZIP_SSIZE)) + : 0; +} + +/********************************************************************//** +Checks if a column is in the ordering columns of the clustered index of a +table. Column prefixes are treated like whole columns. +@return TRUE if the column, or its prefix, is in the clustered key */ +ibool +dict_table_col_in_clustered_key( +/*============================*/ + const dict_table_t* table, /*!< in: table */ + ulint n) /*!< in: column number */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*******************************************************************//** +Check if the table has an FTS index. +@return TRUE if table has an FTS index */ +UNIV_INLINE +ibool +dict_table_has_fts_index( +/*=====================*/ + dict_table_t* table) /*!< in: table */ + MY_ATTRIBUTE((nonnull, warn_unused_result));
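A worked example of the zip-size arithmetic in dict_tf_get_zip_size() above, assuming UNIV_ZIP_SIZE_MIN == 1024 (so UNIV_ZIP_SIZE_MIN >> 1 == 512); zip_size_of_ssize() is a hypothetical helper that skips the flag extraction:

static unsigned long zip_size_of_ssize(unsigned ssize)
{
  /* ssize 0 means the table is not ROW_FORMAT=COMPRESSED;
     otherwise: 1 -> 1024, 2 -> 2048, 3 -> 4096, 4 -> 8192, 5 -> 16384 */
  return ssize ? 512UL << ssize : 0;
}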
+/** Copies types of virtual columns contained in table to tuple and sets all +fields of the tuple to the SQL NULL value. This function should +be called right after dtuple_create(). +@param[in,out] tuple data tuple +@param[in] table table +*/ +void +dict_table_copy_v_types( + dtuple_t* tuple, + const dict_table_t* table); + +/*******************************************************************//** +Copies types of columns contained in table to tuple and sets all +fields of the tuple to the SQL NULL value. This function should +be called right after dtuple_create(). */ +void +dict_table_copy_types( +/*==================*/ + dtuple_t* tuple, /*!< in/out: data tuple */ + const dict_table_t* table) /*!< in: table */ + MY_ATTRIBUTE((nonnull)); +/** Adds an index to the dictionary cache, with possible indexing of newly +added columns. +@param[in,out] index index; NOTE! The index memory + object is freed in this function! +@param[in] page_no root page number of the index +@param[in] add_v virtual columns being added along with ADD INDEX +@return DB_SUCCESS, or DB_CORRUPTION */ +dberr_t +dict_index_add_to_cache( + dict_index_t*& index, + ulint page_no, + const dict_add_v_col_t* add_v = NULL) + MY_ATTRIBUTE((warn_unused_result)); +/********************************************************************//** +Gets the number of fields in the internal representation of an index, +including fields added by the dictionary system. +@return number of fields */ +UNIV_INLINE +uint16_t +dict_index_get_n_fields( +/*====================*/ + const dict_index_t* index) /*!< in: an internal + representation of index (in + the dictionary cache) */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/********************************************************************//** +Gets the number of fields in the internal representation of an index +that uniquely determine the position of an index entry in the index, if +we do not take multiversioning into account: in the B-tree use the value +returned by dict_index_get_n_unique_in_tree. +@return number of fields */ +UNIV_INLINE +uint16_t +dict_index_get_n_unique( +/*====================*/ + const dict_index_t* index) /*!< in: an internal representation + of index (in the dictionary cache) */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/********************************************************************//** +Gets the number of fields in the internal representation of an index +which uniquely determine the position of an index entry in the index, if +we also take multiversioning into account. +@return number of fields */ +UNIV_INLINE +uint16_t +dict_index_get_n_unique_in_tree( +/*============================*/ + const dict_index_t* index) /*!< in: an internal representation + of index (in the dictionary cache) */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** The number of fields in the nonleaf page of spatial index, except +the page no field. */ +#define DICT_INDEX_SPATIAL_NODEPTR_SIZE 1 +/** +Gets the number of fields on nonleaf page level in the internal representation +of an index which uniquely determine the position of an index entry in the +index, if we also take multiversioning into account. Note, it doesn't +include page no field. +@param[in] index index +@return number of fields */ +UNIV_INLINE +uint16_t +dict_index_get_n_unique_in_tree_nonleaf( + const dict_index_t* index) + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/********************************************************************//** +Gets the number of user-defined ordering fields in the index.
In the internal +representation we add the row id to the ordering fields to make all indexes +unique, but this function returns the number of fields the user defined +in the index as ordering fields. +@return number of fields */ +UNIV_INLINE +uint16_t +dict_index_get_n_ordering_defined_by_user( +/*======================================*/ + const dict_index_t* index) /*!< in: an internal representation + of index (in the dictionary cache) */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +#ifdef UNIV_DEBUG +/********************************************************************//** +Gets the nth field of an index. +@return pointer to field object */ +UNIV_INLINE +dict_field_t* +dict_index_get_nth_field( +/*=====================*/ + const dict_index_t* index, /*!< in: index */ + ulint pos) /*!< in: position of field */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +#else /* UNIV_DEBUG */ +# define dict_index_get_nth_field(index, pos) ((index)->fields + (pos)) +#endif /* UNIV_DEBUG */ +/********************************************************************//** +Gets pointer to the nth column in an index. +@return column */ +UNIV_INLINE +const dict_col_t* +dict_index_get_nth_col( +/*===================*/ + const dict_index_t* index, /*!< in: index */ + ulint pos) /*!< in: position of the field */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/********************************************************************//** +Gets the column number of the nth field in an index. +@return column number */ +UNIV_INLINE +ulint +dict_index_get_nth_col_no( +/*======================*/ + const dict_index_t* index, /*!< in: index */ + ulint pos) /*!< in: position of the field */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/********************************************************************//** +Looks for column n in an index. +@return position in internal representation of the index; +ULINT_UNDEFINED if not contained */ +UNIV_INLINE +ulint +dict_index_get_nth_col_pos( +/*=======================*/ + const dict_index_t* index, /*!< in: index */ + ulint n, /*!< in: column number */ + ulint* prefix_col_pos) /*!< out: col num if prefix */ + MY_ATTRIBUTE((nonnull(1), warn_unused_result)); + +/** Looks for column n in an index. +@param[in] index index +@param[in] n column number +@param[in] inc_prefix true=consider column prefixes too +@param[in] is_virtual true==virtual column +@return position in internal representation of the index; +ULINT_UNDEFINED if not contained */ +ulint +dict_index_get_nth_col_or_prefix_pos( + const dict_index_t* index, /*!< in: index */ + ulint n, /*!< in: column number */ + bool inc_prefix, /*!< in: TRUE=consider + column prefixes too */ + bool is_virtual, /*!< in: is a virtual column + */ + ulint* prefix_col_pos) /*!< out: col num if prefix + */ + __attribute__((warn_unused_result)); +/********************************************************************//** +Looks for a matching field in an index. The column has to be the same. The +column in index must be complete, or must contain a prefix longer than the +column in index2. That is, we must be able to construct the prefix in index2 +from the prefix in index. 
+@return position in internal representation of the index;
+ULINT_UNDEFINED if not contained */
+ulint
+dict_index_get_nth_field_pos(
+/*=========================*/
+ const dict_index_t* index, /*!< in: index from which to search */
+ const dict_index_t* index2, /*!< in: index */
+ ulint n) /*!< in: field number in index2 */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/********************************************************************//**
+Looks for the position of column n in the clustered index.
+@return position in internal representation of the clustered index */
+unsigned
+dict_table_get_nth_col_pos(
+/*=======================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint n, /*!< in: column number */
+ ulint* prefix_col_pos) /*!< out: col num if prefix */
+ MY_ATTRIBUTE((nonnull(1), warn_unused_result));
+/** Add a column to an index.
+@param index index
+@param table table
+@param col column
+@param prefix_len column prefix length
+@param descending whether to use descending order */
+void dict_index_add_col(dict_index_t *index, const dict_table_t *table,
+ dict_col_t *col, ulint prefix_len,
+ bool descending= false)
+ MY_ATTRIBUTE((nonnull));
+
+/*******************************************************************//**
+Copies types of fields contained in index to tuple. */
+void
+dict_index_copy_types(
+/*==================*/
+ dtuple_t* tuple, /*!< in/out: data tuple */
+ const dict_index_t* index, /*!< in: index */
+ ulint n_fields) /*!< in: number of
+ field types to copy */
+ MY_ATTRIBUTE((nonnull));
+/*********************************************************************//**
+Gets the field column.
+@return field->col, pointer to the table column */
+UNIV_INLINE
+const dict_col_t*
+dict_field_get_col(
+/*===============*/
+ const dict_field_t* field) /*!< in: index field */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/**********************************************************************//**
+Returns an index object if it is found in the dictionary cache.
+@return index, NULL if not found */
+dict_index_t*
+dict_index_get_if_in_cache_low(
+/*===========================*/
+ index_id_t index_id) /*!< in: index id */
+ MY_ATTRIBUTE((warn_unused_result));
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Returns an index object if it is found in the dictionary cache.
+@return index, NULL if not found */
+dict_index_t*
+dict_index_get_if_in_cache(
+/*=======================*/
+ index_id_t index_id) /*!< in: index id */
+ MY_ATTRIBUTE((warn_unused_result));
+/**********************************************************************//**
+Checks that a tuple has n_fields_cmp value in a sensible range, so that
+no comparison can occur with the page number field in a node pointer.
+@return TRUE if ok */
+ibool
+dict_index_check_search_tuple(
+/*==========================*/
+ const dict_index_t* index, /*!< in: index tree */
+ const dtuple_t* tuple) /*!< in: tuple used in a search */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Whether and when to allow temporary index names */
+enum check_name {
+ /** Require all indexes to be complete. */
+ CHECK_ALL_COMPLETE,
+ /** Allow aborted online index creation. */
+ CHECK_ABORTED_OK,
+ /** Allow partial indexes to exist. */
+ CHECK_PARTIAL_OK
+};
+/**********************************************************************//**
+Check for duplicate index entries in a table [using the index name] */
+void
+dict_table_check_for_dup_indexes(
+/*=============================*/
+ const dict_table_t* table, /*!< in: Check for dup indexes
+ in this table */
+ enum check_name check) /*!< in: whether and when to allow
+ temporary index names */
+ MY_ATTRIBUTE((nonnull));
+#endif /* UNIV_DEBUG */
+/**********************************************************************//**
+Builds a node pointer out of a physical record and a page number.
+@return own: node pointer */
+dtuple_t*
+dict_index_build_node_ptr(
+/*======================*/
+ const dict_index_t* index, /*!< in: index */
+ const rec_t* rec, /*!< in: record for which to build node
+ pointer */
+ ulint page_no,/*!< in: page number to put in node
+ pointer */
+ mem_heap_t* heap, /*!< in: memory heap where pointer
+ created */
+ ulint level) /*!< in: level of rec in tree:
+ 0 means leaf level */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Convert a physical record into a search tuple.
+@param[in] rec index record (not necessarily in an index page)
+@param[in] index index
+@param[in] leaf whether rec is in a leaf page
+@param[in] n_fields number of data fields
+@param[in,out] heap memory heap for allocation
+@return own: data tuple */
+dtuple_t*
+dict_index_build_data_tuple(
+ const rec_t* rec,
+ const dict_index_t* index,
+ bool leaf,
+ ulint n_fields,
+ mem_heap_t* heap)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/*********************************************************************//**
+Gets the page number of the root of the index tree.
+@return page number */
+UNIV_INLINE
+uint32_t
+dict_index_get_page(
+/*================*/
+ const dict_index_t* index) /*!< in: index */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/********************************************************************//**
+Returns free space reserved for future updates of records. This is
+relevant only in the case of many consecutive inserts, as updates
+which make the records bigger might fragment the index.
+@return number of free bytes on page, reserved for updates */
+UNIV_INLINE
+ulint
+dict_index_get_space_reserve(void);
+/*==============================*/
+
+/* Online index creation @{ */
+/********************************************************************//**
+Gets the status of online index creation.
+@return the status */
+UNIV_INLINE
+enum online_index_status
+dict_index_get_online_status(
+/*=========================*/
+ const dict_index_t* index) /*!< in: secondary index */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/********************************************************************//**
+Sets the status of online index creation. */
+UNIV_INLINE
+void
+dict_index_set_online_status(
+/*=========================*/
+ dict_index_t* index, /*!< in/out: index */
+ enum online_index_status status) /*!< in: status */
+ MY_ATTRIBUTE((nonnull));
+/********************************************************************//**
+Determines if a secondary index is being or has been created online,
+or if the table is being rebuilt online, allowing concurrent modifications
+to the table.
+@retval true if the index is being or has been built online, or
+if this is a clustered index and the table is being or has been rebuilt online
+@retval false if the index has been created or the table has been
+rebuilt completely */
+UNIV_INLINE
+bool
+dict_index_is_online_ddl(
+/*=====================*/
+ const dict_index_t* index) /*!< in: index */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*********************************************************************//**
+Calculates the minimum record length in an index. */
+ulint
+dict_index_calc_min_rec_len(
+/*========================*/
+ const dict_index_t* index) /*!< in: index */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/********************************************************************//**
+Checks if the database name in two table names is the same.
+@return TRUE if same db name */
+ibool
+dict_tables_have_same_db(
+/*=====================*/
+ const char* name1, /*!< in: table name in the form
+ dbname '/' tablename */
+ const char* name2) /*!< in: table name in the form
+ dbname '/' tablename */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Get an index by name.
+@param[in] table the table where to look for the index
+@param[in] name the index name to look for
+@return index, NULL if does not exist */
+dict_index_t*
+dict_table_get_index_on_name(dict_table_t* table, const char* name)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Get an index by name.
+@param[in] table the table where to look for the index
+@param[in] name the index name to look for
+@return index, NULL if does not exist */
+inline
+const dict_index_t*
+dict_table_get_index_on_name(const dict_table_t* table, const char* name)
+{
+ return dict_table_get_index_on_name(const_cast<dict_table_t*>(table),
+ name);
+}
+
+/***************************************************************
+Check whether a column exists in an FTS index. */
+UNIV_INLINE
+ulint
+dict_table_is_fts_column(
+/*=====================*/
+ /* out: ULINT_UNDEFINED if no match else
+ the offset within the vector */
+ ib_vector_t* indexes,/* in: vector containing only FTS indexes */
+ ulint col_no, /* in: col number to search for */
+ bool is_virtual)/*!< in: whether it is a virtual column */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/** Looks for an index with the given id given a table instance.
+@param[in] table table instance
+@param[in] id index id
+@return index or NULL */
+dict_index_t*
+dict_table_find_index_on_id(
+ const dict_table_t* table,
+ index_id_t id)
+ MY_ATTRIBUTE((nonnull(1)));
+
+/** Maximum number of columns in a foreign key constraint. Note that MySQL
+has a much lower limit on the number of columns allowed in a foreign key
+constraint */
+#define MAX_NUM_FK_COLUMNS 500
+
+/* Buffers for storing detailed information about the latest foreign key
+and unique key errors */
+extern FILE* dict_foreign_err_file;
+extern mysql_mutex_t dict_foreign_err_mutex;
+
+/** InnoDB data dictionary cache */
+class dict_sys_t
+{
+ /** The my_hrtime_coarse().val of the oldest lock_wait() start, or 0 */
+ std::atomic<ulonglong> latch_ex_wait_start;
+
+ /** the rw-latch protecting the data dictionary cache */
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) srw_lock latch;
+#ifdef UNIV_DEBUG
+ /** whether latch is being held in exclusive mode (by any thread) */
+ Atomic_relaxed<pthread_t> latch_ex;
+ /** number of S-latch holders */
+ Atomic_counter<uint32_t> latch_readers;
+#endif
+public:
+ /** Indexes of SYS_TABLE[] */
+ enum
+ {
+ SYS_TABLES= 0,
+ SYS_INDEXES,
+ SYS_COLUMNS,
+ SYS_FIELDS,
+ SYS_FOREIGN,
+ SYS_FOREIGN_COLS,
+ SYS_VIRTUAL
+ };
+ /** System table names */
+ static const span<const char> SYS_TABLE[];
+
+ /** all tables (persistent and temporary), hashed by name */
+ hash_table_t table_hash;
+ /** hash table of persistent table IDs */
+ hash_table_t table_id_hash;
+
+ /** the SYS_TABLES table */
+ dict_table_t *sys_tables;
+ /** the SYS_COLUMNS table */
+ dict_table_t *sys_columns;
+ /** the SYS_INDEXES table */
+ dict_table_t *sys_indexes;
+ /** the SYS_FIELDS table */
+ dict_table_t *sys_fields;
+ /** the SYS_FOREIGN table */
+ dict_table_t *sys_foreign;
+ /** the SYS_FOREIGN_COLS table */
+ dict_table_t *sys_foreign_cols;
+ /** the SYS_VIRTUAL table */
+ dict_table_t *sys_virtual;
+
+ /** @return whether all non-hard-coded system tables exist */
+ bool sys_tables_exist() const
+ { return UNIV_LIKELY(sys_foreign && sys_foreign_cols && sys_virtual); }
+
+ /** list of persistent tables that can be evicted */
+ UT_LIST_BASE_NODE_T(dict_table_t) table_LRU;
+ /** list of persistent tables that cannot be evicted */
+ UT_LIST_BASE_NODE_T(dict_table_t) table_non_LRU;
+
+private:
+ bool m_initialised= false;
+ /** the sequence of temporary table IDs */
+ std::atomic<table_id_t> temp_table_id{DICT_HDR_FIRST_ID};
+ /** hash table of temporary table IDs */
+ hash_table_t temp_id_hash;
+ /** the next value of DB_ROW_ID, backed by DICT_HDR_ROW_ID
+ (FIXME: remove this, and move to dict_table_t) */
+ Atomic_relaxed<row_id_t> row_id;
+ /** The synchronization interval of row_id */
+ static constexpr size_t ROW_ID_WRITE_MARGIN= 256;
+public:
+ /** Diagnostic message for exceeding the lock_wait() timeout */
+ static const char fatal_msg[];
+
+ /** @return A new value for GEN_CLUST_INDEX(DB_ROW_ID) */
+ inline row_id_t get_new_row_id();
+
+ /** Ensure that row_id is not smaller than id, on IMPORT TABLESPACE */
+ inline void update_row_id(row_id_t id);
+
+ /** Recover the global DB_ROW_ID sequence on database startup */
+ void recover_row_id(row_id_t id)
+ {
+ row_id= ut_uint64_align_up(id, ROW_ID_WRITE_MARGIN) + ROW_ID_WRITE_MARGIN;
+ }
+
+ /** @return a new temporary table ID */
+ table_id_t acquire_temporary_table_id()
+ {
+ return temp_table_id.fetch_add(1, std::memory_order_relaxed);
+ }
+
+ /** Look up a temporary table.
+ @param id temporary table ID
+ @return temporary table
+ @retval nullptr if the table does not exist
+ (should only happen during the rollback of CREATE...SELECT) */
+ dict_table_t *acquire_temporary_table(table_id_t id)
+ {
+ ut_ad(frozen());
+ dict_table_t *table;
+ ulint fold = ut_fold_ull(id);
+ HASH_SEARCH(id_hash, &temp_id_hash, fold, dict_table_t*, table,
+ ut_ad(table->cached), table->id == id);
+ if (UNIV_LIKELY(table != nullptr))
+ {
+ DBUG_ASSERT(table->is_temporary());
+ DBUG_ASSERT(table->id >= DICT_HDR_FIRST_ID);
+ table->acquire();
+ }
+ return table;
+ }
+
+ /** Look up a persistent table.
+ @param id table ID
+ @return table
+ @retval nullptr if not cached */
+ dict_table_t *find_table(table_id_t id)
+ {
+ ut_ad(frozen());
+ dict_table_t *table;
+ ulint fold= ut_fold_ull(id);
+ HASH_SEARCH(id_hash, &table_id_hash, fold, dict_table_t*, table,
+ ut_ad(table->cached), table->id == id);
+ DBUG_ASSERT(!table || !table->is_temporary());
+ return table;
+ }
+
+ bool is_initialised() const { return m_initialised; }
+
+ /** Initialise the data dictionary cache. */
+ void create();
+
+ /** Close the data dictionary cache on shutdown. */
+ void close();
+
+ /** Resize the hash tables based on the current buffer pool size. */
+ void resize();
+
+ /** Add a table definition to the data dictionary cache */
+ inline void add(dict_table_t* table);
+ /** Remove a table definition from the data dictionary cache.
+ @param[in,out] table cached table definition to be evicted
+ @param[in] lru whether this is part of least-recently-used eviction
+ @param[in] keep whether to keep (not free) the object */
+ void remove(dict_table_t* table, bool lru = false, bool keep = false);
+
+#ifdef UNIV_DEBUG
+ /** Find a table */
+ template<bool in_lru> bool find(const dict_table_t *table)
+ {
+ ut_ad(table);
+ ut_ad(table->can_be_evicted == in_lru);
+ ut_ad(frozen());
+ for (const dict_table_t* t= in_lru ? table_LRU.start : table_non_LRU.start;
+ t; t = UT_LIST_GET_NEXT(table_LRU, t))
+ {
+ if (t == table) return true;
+ ut_ad(t->can_be_evicted == in_lru);
+ }
+ return false;
+ }
+ /** Find a table */
+ bool find(const dict_table_t *table)
+ {
+ return table->can_be_evicted ? find<true>(table) : find<false>(table);
+ }
+#endif
+
+ /** Move a table to the non-LRU list from the LRU list. */
+ void prevent_eviction(dict_table_t *table)
+ {
+ ut_ad(locked());
+ ut_ad(find(table));
+ if (!table->can_be_evicted)
+ return;
+ table->can_be_evicted= false;
+ UT_LIST_REMOVE(table_LRU, table);
+ UT_LIST_ADD_LAST(table_non_LRU, table);
+ }
+
+#ifdef UNIV_DEBUG
+ /** @return whether any thread (not necessarily the current thread)
+ is holding the latch; that is, this check may return false
+ positives */
+ bool frozen() const { return latch_readers || latch_ex; }
+ /** @return whether any thread (not necessarily the current thread)
+ is holding a shared latch */
+ bool frozen_not_locked() const { return latch_readers; }
+ /** @return whether the current thread holds the exclusive latch */
+ bool locked() const { return latch_ex == pthread_self(); }
+#endif
+private:
+ /** Acquire the exclusive latch */
+ ATTRIBUTE_NOINLINE
+ void lock_wait(SRW_LOCK_ARGS(const char *file, unsigned line));
+public:
+ /** @return the my_hrtime_coarse().val of the oldest lock_wait() start,
+ assuming that requests are served on a FIFO basis */
+ ulonglong oldest_wait() const
+ { return latch_ex_wait_start.load(std::memory_order_relaxed); }
+
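+ /* A minimal usage sketch (illustrative only, not part of the upstream
+ header; it assumes a valid table_id_t id is at hand): readers take the
+ shared latch around cache lookups, while DDL takes the exclusive latch
+ via lock()/unlock().
+
+     dict_sys.freeze(SRW_LOCK_CALL);       // shared latch
+     if (dict_table_t *t= dict_sys.find_table(id))
+       t->acquire();                       // pin the table before unlatching
+     dict_sys.unfreeze();
+ */
+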
+ /** Exclusively lock the dictionary cache. */
+ void lock(SRW_LOCK_ARGS(const char *file, unsigned line))
+ {
+ if (latch.wr_lock_try())
+ {
+ ut_ad(!latch_readers);
+ ut_ad(!latch_ex);
+ ut_d(latch_ex= pthread_self());
+ }
+ else
+ lock_wait(SRW_LOCK_ARGS(file, line));
+ }
+
+#ifdef UNIV_PFS_RWLOCK
+ /** Unlock the data dictionary cache. */
+ ATTRIBUTE_NOINLINE void unlock();
+ /** Acquire a shared lock on the dictionary cache. */
+ ATTRIBUTE_NOINLINE void freeze(const char *file, unsigned line);
+ /** Release a shared lock on the dictionary cache. */
+ ATTRIBUTE_NOINLINE void unfreeze();
+#else
+ /** Unlock the data dictionary cache. */
+ void unlock()
+ {
+ ut_ad(latch_ex == pthread_self());
+ ut_ad(!latch_readers);
+ ut_d(latch_ex= 0);
+ latch.wr_unlock();
+ }
+ /** Acquire a shared lock on the dictionary cache. */
+ void freeze()
+ {
+ latch.rd_lock();
+ ut_ad(!latch_ex);
+ ut_d(latch_readers++);
+ }
+ /** Release a shared lock on the dictionary cache. */
+ void unfreeze()
+ {
+ ut_ad(!latch_ex);
+ ut_ad(latch_readers--);
+ latch.rd_unlock();
+ }
+#endif
+
+ /** Estimate the used memory occupied by the data dictionary
+ table and index objects.
+ @return number of bytes occupied */
+ TPOOL_SUPPRESS_TSAN ulint rough_size() const
+ {
+ /* No latch; this is a very crude approximation anyway */
+ ulint size = UT_LIST_GET_LEN(table_LRU) + UT_LIST_GET_LEN(table_non_LRU);
+ size *= sizeof(dict_table_t)
+ + sizeof(dict_index_t) * 2
+ + (sizeof(dict_col_t) + sizeof(dict_field_t)) * 10
+ + sizeof(dict_field_t) * 5 /* total number of key fields */
+ + 200; /* arbitrary, covering names and overhead */
+ size += (table_hash.n_cells + table_id_hash.n_cells
+ + temp_id_hash.n_cells) * sizeof(hash_cell_t);
+ return size;
+ }
+
+ /** Evict unused, unlocked tables from table_LRU.
+ @param half whether to consider half the tables only (instead of all)
+ @return number of tables evicted */
+ ulint evict_table_LRU(bool half);
+
+ /** Look up a table in the dictionary cache.
+ @param name table name
+ @return table handle
+ @retval nullptr if not found */
+ dict_table_t *find_table(const span<const char> &name) const
+ {
+ ut_ad(frozen());
+ for (dict_table_t *table= static_cast<dict_table_t*>
+ (HASH_GET_FIRST(&table_hash, table_hash.calc_hash
+ (my_crc32c(0, name.data(), name.size()))));
+ table; table= table->name_hash)
+ if (strlen(table->name.m_name) == name.size() &&
+ !memcmp(table->name.m_name, name.data(), name.size()))
+ return table;
+ return nullptr;
+ }
+
+ /** Look up or load a table definition
+ @param name table name
+ @param ignore errors to ignore when loading the table definition
+ @return table handle
+ @retval nullptr if not found */
+ dict_table_t *load_table(const span<const char> &name,
+ dict_err_ignore_t ignore= DICT_ERR_IGNORE_NONE);
+
+ /** Attempt to load the system tables on startup
+ @return whether any discrepancy with the expected definition was found */
+ bool load_sys_tables();
+ /** Create or check system tables on startup */
+ dberr_t create_or_check_sys_tables();
+};
+
+/** the data dictionary cache */
+extern dict_sys_t dict_sys;
+
+/*********************************************************************//**
+Converts a database and table name from filesystem encoding
+(e.g. d@i1b/a@q1b@1Kc, same format as used in dict_table_t::name) into two
+strings in UTF-8 encoding (e.g. dцb and aюbØc). The output buffers must be
+at least MAX_DB_UTF8_LEN and MAX_TABLE_UTF8_LEN bytes. */
+void
+dict_fs2utf8(
+/*=========*/
+ const char* db_and_table, /*!< in: database and table names,
+ e.g. d@i1b/a@q1b@1Kc */
+ char* db_utf8, /*!< out: database name, e.g. dцb */
+ size_t db_utf8_size, /*!< in: db_utf8 size */
+ char* table_utf8, /*!< out: table name, e.g. aюbØc */
+ size_t table_utf8_size)/*!< in: table_utf8 size */
+ MY_ATTRIBUTE((nonnull));
+
+/** Flag an index corrupted both in the data dictionary cache
+and in the system table SYS_INDEXES.
+@param index index to be flagged as corrupted
+@param ctx context (for error log reporting) */
+void dict_set_corrupted(dict_index_t *index, const char *ctx)
+ ATTRIBUTE_COLD MY_ATTRIBUTE((nonnull));
+
+/** Sets merge_threshold in the SYS_INDEXES table
+@param[in,out] index index
+@param[in] merge_threshold value to set */
+void
+dict_index_set_merge_threshold(
+ dict_index_t* index,
+ ulint merge_threshold);
+
+#ifdef UNIV_DEBUG
+/** Sets merge_threshold for all indexes in the dictionary cache for debug.
+@param[in] merge_threshold_all value to set for all indexes */
+void
+dict_set_merge_threshold_all_debug(
+ uint merge_threshold_all);
+#endif /* UNIV_DEBUG */
+
+/** Validate the table flags.
+@param[in] flags Table flags
+@return true if valid. */
+UNIV_INLINE
+bool
+dict_tf_is_valid(
+ ulint flags);
+
+/** Validate both table flags and table flags2 and make sure they
+are compatible.
+@param[in] flags Table flags
+@param[in] flags2 Table flags2
+@return true if valid. */
+UNIV_INLINE
+bool
+dict_tf2_is_valid(
+ ulint flags,
+ ulint flags2);
+
+/*********************************************************************//**
+This function should be called whenever a page is successfully
+compressed. Updates the compression padding information. */
+void
+dict_index_zip_success(
+/*===================*/
+ dict_index_t* index) /*!< in/out: index to be updated. */
+ MY_ATTRIBUTE((nonnull));
+/*********************************************************************//**
+This function should be called whenever a page compression attempt
+fails. Updates the compression padding information. */
+void
+dict_index_zip_failure(
+/*===================*/
+ dict_index_t* index) /*!< in/out: index to be updated. */
+ MY_ATTRIBUTE((nonnull));
+/*********************************************************************//**
+Return the optimal page size, for which page will likely compress.
+@return page size beyond which page may not compress */
+ulint
+dict_index_zip_pad_optimal_page_size(
+/*=================================*/
+ dict_index_t* index) /*!< in: index for which page size
+ is requested */
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*************************************************************//**
+Convert table flag to row format string.
+@return row format name */
+const char*
+dict_tf_to_row_format_string(
+/*=========================*/
+ ulint table_flag); /*!< in: row format setting */
+
+/** Encode the number of columns and the number of virtual columns in one
+4-byte value. This is possible because the number of columns in
+InnoDB is limited to 1017.
+@param[in] n_col number of non-virtual columns
+@param[in] n_v_col number of virtual columns
+@return encoded value */
+UNIV_INLINE
+ulint
+dict_table_encode_n_col(
+ ulint n_col,
+ ulint n_v_col);
+
+/** Decode the number of virtual and non-virtual columns in one 4-byte value.
+@param[in] encoded encoded value
+@param[in,out] n_col number of non-virtual columns
+@param[in,out] n_v_col number of virtual columns */
+UNIV_INLINE
+void
+dict_table_decode_n_col(
+ ulint encoded,
+ ulint* n_col,
+ ulint* n_v_col);
+
+/** Free the virtual column template
+@param[in,out] vc_templ virtual column template */
+UNIV_INLINE
+void
+dict_free_vc_templ(
+ dict_vcol_templ_t* vc_templ);
+
+/** Check whether the table has a virtual index.
+@param[in] table InnoDB table
+@return true if the table has a virtual index, false otherwise. */
+UNIV_INLINE
+bool
+dict_table_have_virtual_index(
+ dict_table_t* table);
+
+#include "dict0dict.inl"
+
+#endif
diff --git a/storage/innobase/include/dict0dict.inl b/storage/innobase/include/dict0dict.inl
new file mode 100644
index 00000000..4cc3eae9
--- /dev/null
+++ b/storage/innobase/include/dict0dict.inl
@@ -0,0 +1,1217 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/dict0dict.inl
+Data dictionary system
+
+Created 1/8/1996 Heikki Tuuri
+***********************************************************************/
+
+#include "fsp0sysspace.h"
+#include "dict0pagecompress.h"
+
+/*********************************************************************//**
+Gets the minimum number of bytes per character.
+@return minimum multi-byte char size, in bytes */
+UNIV_INLINE
+unsigned
+dict_col_get_mbminlen(
+/*==================*/
+ const dict_col_t* col) /*!< in: column */
+{
+ return col->mbminlen;
+}
+/*********************************************************************//**
+Gets the maximum number of bytes per character.
+@return maximum multi-byte char size, in bytes */
+UNIV_INLINE
+unsigned
+dict_col_get_mbmaxlen(
+/*==================*/
+ const dict_col_t* col) /*!< in: column */
+{
+ return col->mbmaxlen;
+}
+/*********************************************************************//**
+Gets the column data type. */
+UNIV_INLINE
+void
+dict_col_copy_type(
+/*===============*/
+ const dict_col_t* col, /*!< in: column */
+ dtype_t* type) /*!< out: data type */
+{
+ ut_ad(col != NULL);
+ ut_ad(type != NULL);
+
+ type->mtype = col->mtype;
+ type->prtype = col->prtype;
+ type->len = col->len;
+ type->mbminlen = col->mbminlen;
+ type->mbmaxlen = col->mbmaxlen;
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Assert that a column and a data type match.
+@return TRUE */
+UNIV_INLINE
+ibool
+dict_col_type_assert_equal(
+/*=======================*/
+ const dict_col_t* col, /*!< in: column */
+ const dtype_t* type) /*!< in: data type */
+{
+ ut_ad(col->mtype == type->mtype);
+ ut_ad(col->prtype == type->prtype);
+ //ut_ad(col->len == type->len);
+ ut_ad(col->mbminlen == type->mbminlen);
+ ut_ad(col->mbmaxlen == type->mbmaxlen);
+
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+/***********************************************************************//**
+Returns the minimum size of the column.
+@return minimum size */
+UNIV_INLINE
+unsigned
+dict_col_get_min_size(
+/*==================*/
+ const dict_col_t* col) /*!< in: column */
+{
+ return(dtype_get_min_size_low(col->mtype, col->prtype, col->len,
+ col->mbminlen, col->mbmaxlen));
+}
+/***********************************************************************//**
+Returns the maximum size of the column.
+@return maximum size */
+UNIV_INLINE
+ulint
+dict_col_get_max_size(
+/*==================*/
+ const dict_col_t* col) /*!< in: column */
+{
+ return(dtype_get_max_size_low(col->mtype, col->len));
+}
+/***********************************************************************//**
+Returns the size of a fixed size column, 0 if not a fixed size column.
+@return fixed size, or 0 */
+UNIV_INLINE
+unsigned
+dict_col_get_fixed_size(
+/*====================*/
+ const dict_col_t* col, /*!< in: column */
+ ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */
+{
+ return(dtype_get_fixed_size_low(col->mtype, col->prtype, col->len,
+ col->mbminlen, col->mbmaxlen, comp));
+}
+/***********************************************************************//**
+Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a column.
+For fixed length types it is the fixed length of the type, otherwise 0.
+@return SQL null storage size in ROW_FORMAT=REDUNDANT */
+UNIV_INLINE
+unsigned
+dict_col_get_sql_null_size(
+/*=======================*/
+ const dict_col_t* col, /*!< in: column */
+ ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */
+{
+ return(dict_col_get_fixed_size(col, comp));
+}
+
+/*********************************************************************//**
+Gets the column number.
+@return col->ind, table column position (starting from 0) */
+UNIV_INLINE
+unsigned
+dict_col_get_no(
+/*============*/
+ const dict_col_t* col) /*!< in: column */
+{
+ return(col->ind);
+}
+
+/*********************************************************************//**
+Gets the column position in the clustered index. */
+UNIV_INLINE
+ulint
+dict_col_get_clust_pos(
+/*===================*/
+ const dict_col_t* col, /*!< in: table column */
+ const dict_index_t* clust_index) /*!< in: clustered index */
+{
+ ulint i;
+
+ ut_ad(dict_index_is_clust(clust_index));
+
+ for (i = 0; i < clust_index->n_def; i++) {
+ const dict_field_t* field = &clust_index->fields[i];
+
+ if (!field->prefix_len && field->col == col) {
+ return(i);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/** Gets the column position in the given index.
+@param[in] col table column
+@param[in] index index to be searched for column
+@return position of column in the given index. */
+UNIV_INLINE
+ulint
+dict_col_get_index_pos(
+ const dict_col_t* col,
+ const dict_index_t* index)
+{
+ ulint i;
+
+ for (i = 0; i < index->n_def; i++) {
+ const dict_field_t* field = &index->fields[i];
+
+ if (!field->prefix_len && field->col == col) {
+ return(i);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the first index on the table (the clustered index).
+@return index, NULL if none exists */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_first_index(
+/*=======================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ return(UT_LIST_GET_FIRST(((dict_table_t*) table)->indexes));
+}
+
+/********************************************************************//**
+Gets the last index on the table.
+@return index, NULL if none exists */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_last_index(
+/*=======================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ return(UT_LIST_GET_LAST((const_cast<dict_table_t*>(table))
+ ->indexes));
+}
+
+/********************************************************************//**
+Gets the next index on the table.
+@return index, NULL if none left */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_next_index(
+/*======================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ return(UT_LIST_GET_NEXT(indexes, (dict_index_t*) index));
+}
+#endif /* UNIV_DEBUG */
+
+/********************************************************************//**
+Gets the number of user-defined non-virtual columns in a table in the
+dictionary cache.
+@return number of user-defined (e.g., not ROW_ID) non-virtual
+columns of a table */
+UNIV_INLINE
+unsigned
+dict_table_get_n_user_cols(
+/*=======================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ /* n_cols counts stored columns only. A table may contain
+ virtual columns and no user-specified stored columns at all. */
+ ut_ad(table->n_cols >= DATA_N_SYS_COLS);
+ return unsigned(table->n_cols) - DATA_N_SYS_COLS;
+}
+
+/********************************************************************//**
+Gets the number of all non-virtual columns (also system) in a table
+in the dictionary cache.
+@return number of non-virtual columns of a table */
+UNIV_INLINE
+unsigned
+dict_table_get_n_cols(
+/*==================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ return(table->n_cols);
+}
+
+/** Gets the number of virtual columns in a table in the dictionary cache.
+@param[in] table the table to check
+@return number of virtual columns of a table */
+UNIV_INLINE
+unsigned
+dict_table_get_n_v_cols(
+ const dict_table_t* table)
+{
+ ut_ad(table);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ return(table->n_v_cols);
+}
+
+/** Check if a table has indexed virtual columns
+@param[in] table the table to check
+@return true if the table has indexed virtual columns */
+UNIV_INLINE
+bool
+dict_table_has_indexed_v_cols(
+ const dict_table_t* table)
+{
+
+ for (unsigned i = 0; i < table->n_v_cols; i++) {
+ const dict_v_col_t* col = dict_table_get_nth_v_col(table, i);
+ if (col->m_col.ord_part) {
+ return(true);
+ }
+ }
+
+ return(false);
+}
+
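+/* Worked example (illustrative only, assuming a hypothetical table): for
+CREATE TABLE t(a INT, b INT), dict_table_get_n_cols() returns 5, because
+the three system columns DB_ROW_ID, DB_TRX_ID and DB_ROLL_PTR are counted
+in n_cols, while dict_table_get_n_user_cols() returns
+5 - DATA_N_SYS_COLS = 2. */
+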
+/********************************************************************//**
+Gets the approximate number of rows in the table.
+@return estimated number of rows */
+UNIV_INLINE
+ib_uint64_t
+dict_table_get_n_rows(
+/*==================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table->stat_initialized);
+
+ return(table->stat_n_rows);
+}
+
+/********************************************************************//**
+Increment the number of rows in the table by one.
+Notice that this operation is not protected by any latch; the number is
+approximate. */
+UNIV_INLINE
+void
+dict_table_n_rows_inc(
+/*==================*/
+ dict_table_t* table) /*!< in/out: table */
+{
+ if (table->stat_initialized) {
+ ib_uint64_t n_rows = table->stat_n_rows;
+ if (n_rows < 0xFFFFFFFFFFFFFFFFULL) {
+ table->stat_n_rows = n_rows + 1;
+ }
+ }
+}
+
+/********************************************************************//**
+Decrement the number of rows in the table by one.
+Notice that this operation is not protected by any latch; the number is
+approximate. */
+UNIV_INLINE
+void
+dict_table_n_rows_dec(
+/*==================*/
+ dict_table_t* table) /*!< in/out: table */
+{
+ if (table->stat_initialized) {
+ ib_uint64_t n_rows = table->stat_n_rows;
+ if (n_rows > 0) {
+ table->stat_n_rows = n_rows - 1;
+ }
+ }
+}
+
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the nth column of a table.
+@return pointer to column object */
+UNIV_INLINE
+dict_col_t*
+dict_table_get_nth_col(
+/*===================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint pos) /*!< in: position of column */
+{
+ ut_ad(pos < table->n_def);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ return((dict_col_t*) (table->cols) + pos);
+}
+
+/** Gets the nth virtual column of a table.
+@param[in] table table
+@param[in] pos position of virtual column
+@return pointer to virtual column object */
+UNIV_INLINE
+dict_v_col_t*
+dict_table_get_nth_v_col(
+ const dict_table_t* table,
+ ulint pos)
+{
+ ut_ad(table);
+ ut_ad(pos < table->n_v_def);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ ut_ad(!table->v_cols[pos].m_col.is_added());
+ ut_ad(!table->v_cols[pos].m_col.is_dropped());
+ return &table->v_cols[pos];
+}
+
+/********************************************************************//**
+Gets the given system column of a table.
+@return pointer to column object */
+UNIV_INLINE
+dict_col_t*
+dict_table_get_sys_col(
+/*===================*/
+ const dict_table_t* table, /*!< in: table */
+ unsigned sys) /*!< in: DATA_ROW_ID, ... */
+{
+ dict_col_t* col;
+ col = dict_table_get_nth_col(table,
+ dict_table_get_sys_col_no(table, sys));
+ ut_ad(col->mtype == DATA_SYS);
+ ut_ad(col->prtype == (sys | DATA_NOT_NULL));
+
+ return(col);
+}
+#endif /* UNIV_DEBUG */
+
+/********************************************************************//**
+Gets the given system column number of a table.
+@return column number */
+UNIV_INLINE
+unsigned
+dict_table_get_sys_col_no(
+/*======================*/
+ const dict_table_t* table, /*!< in: table */
+ unsigned sys) /*!< in: DATA_ROW_ID, ... */
+{
+ ut_ad(sys < DATA_N_SYS_COLS);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ return unsigned(table->n_cols) + (sys - DATA_N_SYS_COLS);
+}
+
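+/* Worked example (illustrative only): with n_cols = 5 as in the earlier
+sketch, dict_table_get_sys_col_no() yields n_cols + (sys - DATA_N_SYS_COLS),
+i.e. positions 2, 3 and 4 for DATA_ROW_ID, DATA_TRX_ID and DATA_ROLL_PTR;
+the system columns always occupy the last DATA_N_SYS_COLS positions. */
+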
+/************************************************************************
+Check if the table has an FTS index. */
+UNIV_INLINE
+ibool
+dict_table_has_fts_index(
+/*=====================*/
+ /* out: TRUE if table has an FTS index */
+ dict_table_t* table) /* in: table */
+{
+ return(DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS));
+}
+
+/** Validate the flags for tables that are not ROW_FORMAT=REDUNDANT.
+@param[in] flags table flags
+@return whether the flags are valid */
+inline
+bool
+dict_tf_is_valid_not_redundant(ulint flags)
+{
+ const bool atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(flags);
+
+ ulint zip_ssize = DICT_TF_GET_ZIP_SSIZE(flags);
+
+ if (!zip_ssize) {
+ /* Not ROW_FORMAT=COMPRESSED */
+ } else if (!atomic_blobs) {
+ /* ROW_FORMAT=COMPRESSED implies ROW_FORMAT=DYNAMIC
+ for the uncompressed page format */
+ return(false);
+ } else if (zip_ssize > PAGE_ZIP_SSIZE_MAX
+ || zip_ssize > srv_page_size_shift
+ || srv_page_size_shift > UNIV_ZIP_SIZE_SHIFT_MAX) {
+ /* KEY_BLOCK_SIZE is out of bounds, or
+ ROW_FORMAT=COMPRESSED is not supported with this
+ innodb_page_size (only up to 16KiB) */
+ return(false);
+ }
+
+ switch (DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags)) {
+ case 0:
+ /* PAGE_COMPRESSION_LEVEL=0 should imply PAGE_COMPRESSED=NO */
+ return(!DICT_TF_GET_PAGE_COMPRESSION(flags));
+ case 1: case 2: case 3: case 4: case 5: case 6: case 7: case 8: case 9:
+ /* PAGE_COMPRESSION_LEVEL requires
+ ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC
+ (not ROW_FORMAT=COMPRESSED or ROW_FORMAT=REDUNDANT)
+ and PAGE_COMPRESSED=YES */
+ return(!zip_ssize && DICT_TF_GET_PAGE_COMPRESSION(flags));
+ default:
+ /* Invalid PAGE_COMPRESSION_LEVEL value */
+ return(false);
+ }
+}
+
+/** Validate the table flags.
+@param[in] flags Table flags
+@return true if valid. */
+UNIV_INLINE
+bool
+dict_tf_is_valid(
+ ulint flags)
+{
+ ut_ad(flags < 1U << DICT_TF_BITS);
+ /* The DATA_DIRECTORY flag can be assigned fully independently
+ of all other persistent table flags. */
+ flags &= ~DICT_TF_MASK_DATA_DIR;
+ if (!(flags & 1)) {
+ /* Only ROW_FORMAT=REDUNDANT has 0 in the least significant
+ bit. For ROW_FORMAT=REDUNDANT, only the DATA_DIR flag
+ (which we cleared above) can be set. If any other flags
+ are set, the flags are invalid. */
+ return(flags == 0 || flags == DICT_TF_MASK_NO_ROLLBACK);
+ }
+
+ return(dict_tf_is_valid_not_redundant(flags));
+}
+
+/** Validate both table flags and table flags2 and make sure they
+are compatible.
+@param[in] flags Table flags
+@param[in] flags2 Table flags2
+@return true if valid. */
+UNIV_INLINE
+bool
+dict_tf2_is_valid(
+ ulint flags,
+ ulint flags2)
+{
+ if (!dict_tf_is_valid(flags)) {
+ return(false);
+ }
+
+ if ((flags2 & DICT_TF2_UNUSED_BIT_MASK) != 0) {
+ return(false);
+ }
+
+ return(true);
+}
+
+/********************************************************************//**
+Determine the record format from dict_table_t::flags.
+The low order bit will be zero for REDUNDANT and 1 for COMPACT. For any
+other row format, DICT_TF_COMPACT will also be set.
+@return record format */
+UNIV_INLINE
+rec_format_t
+dict_tf_get_rec_format(
+/*===================*/
+ ulint flags) /*!< in: dict_table_t::flags */
+{
+ ut_a(dict_tf_is_valid(flags));
+
+ if (!DICT_TF_GET_COMPACT(flags)) {
+ return(REC_FORMAT_REDUNDANT);
+ }
+
+ if (!DICT_TF_HAS_ATOMIC_BLOBS(flags)) {
+ return(REC_FORMAT_COMPACT);
+ }
+
+ if (DICT_TF_GET_ZIP_SSIZE(flags)) {
+ return(REC_FORMAT_COMPRESSED);
+ }
+
+ return(REC_FORMAT_DYNAMIC);
+}
+
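+/* For illustration only: ROW_FORMAT=DYNAMIC flags have DICT_TF_COMPACT and
+the ATOMIC_BLOBS bit set but a zero ZIP_SSIZE, so dict_tf_get_rec_format()
+falls through to REC_FORMAT_DYNAMIC; a nonzero ZIP_SSIZE would select
+REC_FORMAT_COMPRESSED instead. */
+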
+/** Set the various values in a dict_table_t::flags pointer.
+@param[in,out] flags Pointer to a 4 byte Table Flags
+@param[in] format File Format
+@param[in] zip_ssize Zip Shift Size
+@param[in] use_data_dir Table uses DATA DIRECTORY
+@param[in] page_compressed Table uses page compression
+@param[in] page_compression_level Page compression level */
+UNIV_INLINE
+void
+dict_tf_set(
+/*========*/
+ ulint* flags,
+ rec_format_t format,
+ ulint zip_ssize,
+ bool use_data_dir,
+ bool page_compressed,
+ ulint page_compression_level)
+{
+ *flags = use_data_dir ? 1 << DICT_TF_POS_DATA_DIR : 0;
+
+ switch (format) {
+ case REC_FORMAT_REDUNDANT:
+ ut_ad(zip_ssize == 0);
+ /* no other options are allowed */
+ ut_ad(!page_compressed);
+ return;
+ case REC_FORMAT_COMPACT:
+ *flags |= DICT_TF_COMPACT;
+ ut_ad(zip_ssize == 0);
+ break;
+ case REC_FORMAT_COMPRESSED:
+ *flags |= DICT_TF_COMPACT
+ | (1 << DICT_TF_POS_ATOMIC_BLOBS)
+ | (zip_ssize << DICT_TF_POS_ZIP_SSIZE);
+ break;
+ case REC_FORMAT_DYNAMIC:
+ *flags |= DICT_TF_COMPACT
+ | (1 << DICT_TF_POS_ATOMIC_BLOBS);
+ ut_ad(zip_ssize == 0);
+ break;
+ }
+
+ if (page_compressed) {
+ *flags |= (1 << DICT_TF_POS_ATOMIC_BLOBS)
+ | (1 << DICT_TF_POS_PAGE_COMPRESSION)
+ | (page_compression_level << DICT_TF_POS_PAGE_COMPRESSION_LEVEL);
+
+ ut_ad(zip_ssize == 0);
+ ut_ad(dict_tf_get_page_compression(*flags) == TRUE);
+ ut_ad(dict_tf_get_page_compression_level(*flags) == page_compression_level);
+ }
+}
+
+/** Convert a 32 bit integer table flags to the 32 bit FSP Flags.
+Fsp Flags are written into the tablespace header at the offset
+FSP_SPACE_FLAGS and are also stored in the fil_space_t::flags field.
+The following chart shows the translation of the low order bit.
+Other bits are the same.
+========================= Low order bit ==========================
+ | REDUNDANT | COMPACT | COMPRESSED | DYNAMIC
+dict_table_t::flags | 0 | 1 | 1 | 1
+fil_space_t::flags | 0 | 0 | 1 | 1
+==================================================================
+@param[in] table_flags dict_table_t::flags
+@return tablespace flags (fil_space_t::flags) */
+inline uint32_t dict_tf_to_fsp_flags(unsigned table_flags)
+{
+ uint32_t fsp_flags;
+ uint32_t page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(
+ table_flags);
+
+ ut_ad((DICT_TF_GET_PAGE_COMPRESSION(table_flags) == 0)
+ == (page_compression_level == 0));
+
+ DBUG_EXECUTE_IF("dict_tf_to_fsp_flags_failure", return UINT32_MAX;);
+
+ /* No ROW_FORMAT=COMPRESSED for innodb_checksum_algorithm=full_crc32 */
+ if ((srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32
+ || srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_FULL_CRC32)
+ && !(table_flags & DICT_TF_MASK_ZIP_SSIZE)) {
+
+ fsp_flags = 1U << FSP_FLAGS_FCRC32_POS_MARKER
+ | FSP_FLAGS_FCRC32_PAGE_SSIZE();
+
+ if (page_compression_level) {
+ fsp_flags |= static_cast<uint32_t>(
+ innodb_compression_algorithm)
+ << FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO;
+ }
+ } else {
+ /* Adjust bit zero. */
+ fsp_flags = DICT_TF_HAS_ATOMIC_BLOBS(table_flags) ? 1 : 0;
+
+ /* ZIP_SSIZE and ATOMIC_BLOBS are at the same position. */
+ fsp_flags |= table_flags
+ & (DICT_TF_MASK_ZIP_SSIZE | DICT_TF_MASK_ATOMIC_BLOBS);
+
+ fsp_flags |= FSP_FLAGS_PAGE_SSIZE();
+
+ if (page_compression_level) {
+ fsp_flags |= FSP_FLAGS_MASK_PAGE_COMPRESSION;
+ }
+ }
+
+ ut_a(fil_space_t::is_valid_flags(fsp_flags, false));
+
+ if (DICT_TF_HAS_DATA_DIR(table_flags)) {
+ fsp_flags |= 1U << FSP_FLAGS_MEM_DATA_DIR;
+ }
+
+ fsp_flags |= page_compression_level << FSP_FLAGS_MEM_COMPRESSION_LEVEL;
+
+ return(fsp_flags);
+}
+
+/********************************************************************//**
+Convert a 32 bit integer table flags to the 32 bit integer that is written
+to a SYS_TABLES.TYPE field. The following chart shows the translation of
+the low order bit. Other bits are the same.
+========================= Low order bit ==========================
+ | REDUNDANT | COMPACT | COMPRESSED and DYNAMIC
+dict_table_t::flags | 0 | 1 | 1
+SYS_TABLES.TYPE | 1 | 1 | 1
+==================================================================
+@return ulint containing SYS_TABLES.TYPE */
+UNIV_INLINE
+ulint
+dict_tf_to_sys_tables_type(
+/*=======================*/
+ ulint flags) /*!< in: dict_table_t::flags */
+{
+ ulint type;
+
+ ut_a(dict_tf_is_valid(flags));
+
+ /* Adjust bit zero. It is always 1 in SYS_TABLES.TYPE */
+ type = 1;
+
+ /* ZIP_SSIZE, ATOMIC_BLOBS, DATA_DIR, PAGE_COMPRESSION,
+ PAGE_COMPRESSION_LEVEL are the same. */
+ type |= flags & (DICT_TF_MASK_ZIP_SSIZE
+ | DICT_TF_MASK_ATOMIC_BLOBS
+ | DICT_TF_MASK_DATA_DIR
+ | DICT_TF_MASK_PAGE_COMPRESSION
+ | DICT_TF_MASK_PAGE_COMPRESSION_LEVEL
+ | DICT_TF_MASK_NO_ROLLBACK);
+
+ return(type);
+}
+
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index,
+including fields added by the dictionary system.
+@return number of fields */
+UNIV_INLINE
+uint16_t
+dict_index_get_n_fields(
+/*====================*/
+ const dict_index_t* index) /*!< in: an internal
+ representation of index (in
+ the dictionary cache) */
+{
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ return(index->n_fields);
+}
+
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index
+that uniquely determine the position of an index entry in the index, if
+we do not take multiversioning into account: in the B-tree use the value
+returned by dict_index_get_n_unique_in_tree.
+@return number of fields */
+UNIV_INLINE
+uint16_t
+dict_index_get_n_unique(
+/*====================*/
+ const dict_index_t* index) /*!< in: an internal representation
+ of index (in the dictionary cache) */
+{
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ ut_ad(index->cached);
+ return(index->n_uniq);
+}
+
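+/* Worked example (illustrative only, assuming the standard clustered index
+layout): for CREATE TABLE t(a INT PRIMARY KEY, b INT), the clustered index
+stores (a, DB_TRX_ID, DB_ROLL_PTR, b), so dict_index_get_n_fields() = 4
+while dict_index_get_n_unique() = 1; only the user-defined PRIMARY KEY
+column determines uniqueness. */
+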
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index
+which uniquely determine the position of an index entry in the index, if
+we also take multiversioning into account.
+@return number of fields */
+UNIV_INLINE
+uint16_t
+dict_index_get_n_unique_in_tree(
+/*============================*/
+ const dict_index_t* index) /*!< in: an internal representation
+ of index (in the dictionary cache) */
+{
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ ut_ad(index->cached);
+
+ if (dict_index_is_clust(index)) {
+
+ return(dict_index_get_n_unique(index));
+ }
+
+ return(dict_index_get_n_fields(index));
+}
+
+/**
+Gets the number of fields on the nonleaf page level in the internal
+representation of an index which uniquely determine the position of an
+index entry in the index, if we also take multiversioning into account.
+Note that it does not include the page number field.
+@param[in] index index
+@return number of fields */
+UNIV_INLINE
+uint16_t
+dict_index_get_n_unique_in_tree_nonleaf(
+ const dict_index_t* index)
+{
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ ut_ad(index->cached);
+
+ if (dict_index_is_spatial(index)) {
+ /* For a spatial index, a non-leaf page has only two
+ fields (mbr + page_no). Excluding the page number
+ field, one field remains. */
+ return(DICT_INDEX_SPATIAL_NODEPTR_SIZE);
+ } else {
+ return(dict_index_get_n_unique_in_tree(index));
+ }
+}
+
+/********************************************************************//**
+Gets the number of user-defined ordering fields in the index. In the internal
+representation of clustered indexes we add the row id to the ordering fields
+to make a clustered index unique, but this function returns the number of
+fields the user defined in the index as ordering fields.
+@return number of fields */
+UNIV_INLINE
+uint16_t
+dict_index_get_n_ordering_defined_by_user(
+/*======================================*/
+ const dict_index_t* index) /*!< in: an internal representation
+ of index (in the dictionary cache) */
+{
+ return(index->n_user_defined_cols);
+}
+
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the nth field of an index.
+@return pointer to field object */
+UNIV_INLINE
+dict_field_t*
+dict_index_get_nth_field(
+/*=====================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint pos) /*!< in: position of field */
+{
+ ut_ad(pos < index->n_def);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ return((dict_field_t*) (index->fields) + pos);
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Gets the field column.
+@return field->col, pointer to the table column */
+UNIV_INLINE
+const dict_col_t*
+dict_field_get_col(
+/*===============*/
+ const dict_field_t* field) /*!< in: index field */
+{
+ return(field->col);
+}
+
+/********************************************************************//**
+Gets pointer to the nth column in an index.
+@return column */
+UNIV_INLINE
+const dict_col_t*
+dict_index_get_nth_col(
+/*===================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint pos) /*!< in: position of the field */
+{
+ return(dict_field_get_col(dict_index_get_nth_field(index, pos)));
+}
+
+/********************************************************************//**
+Gets the column number of the nth field in an index.
+@return column number */
+UNIV_INLINE
+ulint
+dict_index_get_nth_col_no(
+/*======================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint pos) /*!< in: position of the field */
+{
+ return(dict_col_get_no(dict_index_get_nth_col(index, pos)));
+}
+
+/********************************************************************//**
+Looks for column n in an index.
+@return position in internal representation of the index;
+ULINT_UNDEFINED if not contained */
+UNIV_INLINE
+ulint
+dict_index_get_nth_col_pos(
+/*=======================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint n, /*!< in: column number */
+ ulint* prefix_col_pos) /*!< out: col num if prefix */
+{
+ return(dict_index_get_nth_col_or_prefix_pos(index, n, false, false,
+ prefix_col_pos));
+}
+
+/********************************************************************//**
+Returns the minimum data size of an index record.
+@return minimum data size in bytes */
+UNIV_INLINE
+unsigned
+dict_index_get_min_size(
+/*====================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ unsigned n= dict_index_get_n_fields(index);
+ unsigned size= 0;
+
+ while (n--)
+ size+= dict_col_get_min_size(dict_index_get_nth_col(index, n));
+
+ return size;
+}
+
+/*********************************************************************//**
+Gets the page number of the root of the index tree.
+@return page number */
+UNIV_INLINE
+uint32_t
+dict_index_get_page(
+/*================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ return(index->page);
+}
+
+/********************************************************************//**
+Returns free space reserved for future updates of records. This is
+relevant only in the case of many consecutive inserts, as updates
+which make the records bigger might fragment the index.
+@return number of free bytes on page, reserved for updates */
+UNIV_INLINE
+ulint
+dict_index_get_space_reserve(void)
+/*==============================*/
+{
+ return(srv_page_size / 16);
+}
+
+/********************************************************************//**
+Gets the status of online index creation.
+@return the status */
+UNIV_INLINE
+enum online_index_status
+dict_index_get_online_status(
+/*=========================*/
+ const dict_index_t* index) /*!< in: secondary index */
+{
+ enum online_index_status status;
+
+ status = (enum online_index_status) index->online_status;
+
+ /* Without the index->lock protection, the online
+ status can change from ONLINE_INDEX_CREATION to
+ ONLINE_INDEX_COMPLETE (or ONLINE_INDEX_ABORTED) in
+ row_log_apply() once log application is done. So, to make
+ sure the status is ONLINE_INDEX_CREATION or ONLINE_INDEX_COMPLETE,
+ you should always recheck after acquiring index->lock. */
+
+#ifdef UNIV_DEBUG
+ switch (status) {
+ case ONLINE_INDEX_COMPLETE:
+ case ONLINE_INDEX_CREATION:
+ case ONLINE_INDEX_ABORTED:
+ case ONLINE_INDEX_ABORTED_DROPPED:
+ return(status);
+ }
+ ut_error;
+#endif /* UNIV_DEBUG */
+ return(status);
+}
+
+/********************************************************************//**
+Sets the status of online index creation. */
+UNIV_INLINE
+void
+dict_index_set_online_status(
+/*=========================*/
+ dict_index_t* index, /*!< in/out: index */
+ enum online_index_status status) /*!< in: status */
+{
+ ut_ad(!(index->type & DICT_FTS));
+ ut_ad(index->lock.have_x());
+
+#ifdef UNIV_DEBUG
+ switch (dict_index_get_online_status(index)) {
+ case ONLINE_INDEX_COMPLETE:
+ case ONLINE_INDEX_CREATION:
+ break;
+ case ONLINE_INDEX_ABORTED:
+ ut_ad(status == ONLINE_INDEX_ABORTED_DROPPED);
+ break;
+ case ONLINE_INDEX_ABORTED_DROPPED:
+ ut_error;
+ }
+#endif /* UNIV_DEBUG */
+
+ index->online_status = status & 3;
+ ut_ad(dict_index_get_online_status(index) == status);
+}
+
+/********************************************************************//**
+Determines if a secondary index is being or has been created online,
+or if the table is being rebuilt online, allowing concurrent modifications
+to the table.
+@retval true if the index is being or has been built online, or
+if this is a clustered index and the table is being or has been rebuilt online
+@retval false if the index has been created or the table has been
+rebuilt completely */
+UNIV_INLINE
+bool
+dict_index_is_online_ddl(
+/*=====================*/
+ const dict_index_t* index) /*!< in: index */
+{
+#ifdef UNIV_DEBUG
+ if (dict_index_is_clust(index)) {
+ switch (dict_index_get_online_status(index)) {
+ case ONLINE_INDEX_CREATION:
+ return(true);
+ case ONLINE_INDEX_COMPLETE:
+ return(false);
+ case ONLINE_INDEX_ABORTED:
+ case ONLINE_INDEX_ABORTED_DROPPED:
+ break;
+ }
+ ut_ad(0);
+ return(false);
+ }
+#endif /* UNIV_DEBUG */
+
+ return(UNIV_UNLIKELY(dict_index_get_online_status(index)
+ != ONLINE_INDEX_COMPLETE));
+}
+
+/**********************************************************************//**
+Check whether a column exists in an FTS index.
+@return ULINT_UNDEFINED if no match else the offset within the vector */
+UNIV_INLINE
+ulint
+dict_table_is_fts_column(
+/*=====================*/
+ ib_vector_t* indexes,/*!< in: vector containing only FTS indexes */
+ ulint col_no, /*!< in: col number to search for */
+ bool is_virtual) /*!< in: whether it is a virtual column */
+{
+ ulint i;
+
+ for (i = 0; i < ib_vector_size(indexes); ++i) {
+ dict_index_t* index;
+
+ index = (dict_index_t*) ib_vector_getp(indexes, i);
+
+ if (index->contains_col_or_prefix(col_no, is_virtual)) {
+ return(i);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/**********************************************************************//**
+Determine bytes of column prefix to be stored in the undo log. Please
+note that if !dict_table_has_atomic_blobs(table), no prefix
+needs to be stored in the undo log.
+@return bytes of column prefix to be stored in the undo log */
+UNIV_INLINE
+ulint
+dict_max_field_len_store_undo(
+/*==========================*/
+ dict_table_t* table, /*!< in: table */
+ const dict_col_t* col) /*!< in: column which index prefix
+ is based on */
+{
+ if (!dict_table_has_atomic_blobs(table)) {
+ return(0);
+ }
+
+ if (col->max_prefix != 0) {
+ return(col->max_prefix);
+ }
+
+ return(REC_VERSION_56_MAX_INDEX_COL_LEN);
+}
+
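+/* For illustration only: for a ROW_FORMAT=DYNAMIC table (atomic BLOBs)
+with no prefix index on the column, dict_max_field_len_store_undo()
+returns REC_VERSION_56_MAX_INDEX_COL_LEN (3072 bytes); for
+ROW_FORMAT=REDUNDANT or COMPACT it returns 0, because no column prefix
+needs to be stored in the undo log. */
+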
+/** Determine the maximum bytes of a virtual column that need to be stored
+in the undo log.
+@param[in] table dict_table_t for the table
+@param[in] col_no virtual column number
+@return maximum bytes of virtual column to be stored in the undo log */
+UNIV_INLINE
+ulint
+dict_max_v_field_len_store_undo(
+ dict_table_t* table,
+ ulint col_no)
+{
+ const dict_col_t* col
+ = &dict_table_get_nth_v_col(table, col_no)->m_col;
+ ulint max_log_len;
+
+ /* This calculation conforms to the non-virtual column
+ maximum log length calculation:
+ 1) if no atomic BLOBs, up to REC_ANTELOPE_MAX_INDEX_COL_LEN
+ 2) if atomic BLOBs, up to col->max_prefix or
+ REC_VERSION_56_MAX_INDEX_COL_LEN, whichever is less */
+ if (dict_table_has_atomic_blobs(table)) {
+ if (DATA_BIG_COL(col) && col->max_prefix > 0) {
+ max_log_len = col->max_prefix;
+ } else {
+ max_log_len = DICT_MAX_FIELD_LEN_BY_FORMAT(table);
+ }
+ } else {
+ max_log_len = REC_ANTELOPE_MAX_INDEX_COL_LEN;
+ }
+
+ return(max_log_len);
+}
+
+/** Check whether the table resides in a file-per-table tablespace.
+This test does not use table flags2 since some REDUNDANT tables in the
+system tablespace may have garbage in the MIX_LEN field where flags2 is
+stored. These garbage MIX_LEN fields were written before v3.23.52.
+A patch was added to v3.23.52 which initializes the MIX_LEN field to 0.
+Since file-per-table tablespaces were added in 4.1, any SYS_TABLES
+record with a non-zero space ID will have a reliable MIX_LEN field.
+However, this test does not use flags2 from SYS_TABLES.MIX_LEN. Instead,
+assume that if the tablespace is not a predefined system tablespace,
+then it must be file-per-table.
+Also, during ALTER TABLE, the DICT_TF2_USE_FILE_PER_TABLE flag may not be
+set on one of the file-per-table tablespaces.
+This test cannot be done on a table in the process of being created
+because the space_id will be zero until the tablespace is created.
+@param[in] table An existing open table to check
+@return true if this table was created as a file-per-table tablespace. */
+UNIV_INLINE
+bool
+dict_table_is_file_per_table(
+ const dict_table_t* table) /*!< in: table to check */
+{
+ return table->space != fil_system.sys_space
+ && table->space != fil_system.temp_space;
+}
+
+/** Acquire the table handle. */
+inline void dict_table_t::acquire()
+{
+ ut_ad(dict_sys.frozen());
+ n_ref_count++;
+}
+
+/** Release the table handle.
+@return whether the last handle was released */
+inline
+bool
+dict_table_t::release()
+{
+ auto n = n_ref_count--;
+ ut_ad(n > 0);
+ return n == 1;
+}
+
+/** Encode the number of columns and the number of virtual columns in a
+4-byte value. This is possible because the number of columns in
+InnoDB is limited to 1017.
+@param[in] n_col number of non-virtual columns
+@param[in] n_v_col number of virtual columns
+@return encoded value */
+UNIV_INLINE
+ulint
+dict_table_encode_n_col(
+ ulint n_col,
+ ulint n_v_col)
+{
+ return(n_col + (n_v_col << 16));
+}
+
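+/* Worked example (illustrative only): dict_table_encode_n_col(17, 2)
+= 17 + (2 << 16) = 0x20011; dict_table_decode_n_col() below masks off the
+DICT_N_COLS_COMPACT flag bit and then recovers n_v_col = 2 from the high
+16 bits and n_col = 17 from the low 16 bits. */
+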
+@param[in]	encoded	encoded value
+@param[in,out]	n_col	number of non-virtual columns
+@param[in,out]	n_v_col	number of virtual columns */
+UNIV_INLINE
+void
+dict_table_decode_n_col(
+	ulint	encoded,
+	ulint*	n_col,
+	ulint*	n_v_col)
+{
+	ulint	num = encoded & ~DICT_N_COLS_COMPACT;
+	*n_v_col = num >> 16;
+	*n_col = num & 0xFFFF;
+}
+
+/** Free the virtual column template
+@param[in,out]	vc_templ	virtual column template */
+void
+dict_free_vc_templ(
+	dict_vcol_templ_t*	vc_templ)
+{
+	UT_DELETE_ARRAY(vc_templ->default_rec);
+	vc_templ->default_rec = NULL;
+
+	if (vc_templ->vtempl != NULL) {
+		ut_ad(vc_templ->n_v_col > 0);
+		for (ulint i = 0; i < vc_templ->n_col
+		     + vc_templ->n_v_col; i++) {
+			if (vc_templ->vtempl[i] != NULL) {
+				ut_free(vc_templ->vtempl[i]);
+			}
+		}
+		ut_free(vc_templ->vtempl);
+		vc_templ->vtempl = NULL;
+	}
+}
+
+/** Check whether the table has a virtual index.
+@param[in]	table	InnoDB table
+@return true if the table has a virtual index, false otherwise. */
+UNIV_INLINE
+bool
+dict_table_have_virtual_index(
+	dict_table_t*	table)
+{
+	for (ulint col_no = 0; col_no < dict_table_get_n_v_cols(table);
+	     col_no++) {
+		const dict_v_col_t*	col
+			= dict_table_get_nth_v_col(table, col_no);
+
+		if (col->m_col.ord_part) {
+			return(true);
+		}
+	}
+
+	return(false);
+}
diff --git a/storage/innobase/include/dict0load.h b/storage/innobase/include/dict0load.h
new file mode 100644
index 00000000..f7d33d5b
--- /dev/null
+++ b/storage/innobase/include/dict0load.h
@@ -0,0 +1,220 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0load.h
+Loads database object definitions from the dictionary tables
+into the memory cache
+
+Created 4/24/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0load_h
+#define dict0load_h
+
+#include "dict0types.h"
+#include "trx0types.h"
+#include "ut0byte.h"
+#include "mem0mem.h"
+#include "btr0types.h"
+
+#include <deque>
+
+/** A stack of table names related through foreign key constraints */
+typedef std::deque<table_name_t, ut_allocator<table_name_t> > dict_names_t;
+
+/** Check each tablespace found in the data dictionary.
+Then look at each table defined in SYS_TABLES that has a space_id > 0
+to find all the file-per-table tablespaces.
+
+In a crash recovery we already have some tablespace objects created from
+processing the REDO log. We will compare the
+space_id information in the data dictionary to what we find in the
+tablespace file. In addition, more validation will be done if recovery
+was needed and force_recovery is not set.
+
+We also scan for the biggest space id and store it in fil_system. */
+void dict_check_tablespaces_and_store_max_id();
+
+/** Make sure the data_file_name is saved in dict_table_t if needed.
+@param[in,out]	table	Table object */
+void dict_get_and_save_data_dir_path(dict_table_t* table);
+
+/***********************************************************************//**
+Loads a table object based on the table id.
+@return table; NULL if table does not exist */
+dict_table_t*
+dict_load_table_on_id(
+/*==================*/
+	table_id_t	table_id,	/*!< in: table id */
+	dict_err_ignore_t ignore_err);	/*!< in: errors to ignore
+					when loading the table */
+/********************************************************************//**
+This function is called when the database is booted.
+Loads system table index definitions except for the clustered index which
+is added to the dictionary cache at booting before calling this function. */
+void
+dict_load_sys_table(
+/*================*/
+	dict_table_t*	table);	/*!< in: system table */
+/***********************************************************************//**
+Loads foreign key constraints where the table is either the foreign key
+holder or where the table is referenced by a foreign key. Adds these
+constraints to the data dictionary.
+
+The foreign key constraint is loaded only if the referenced table is also
+in the dictionary cache. If the referenced table is not in dictionary
+cache, then it is added to the output parameter (fk_tables).
+
+@return DB_SUCCESS or error code */
+dberr_t
+dict_load_foreigns(
+/*===============*/
+	const char*	table_name,	/*!< in: table name */
+	const char**	col_names,	/*!< in: column names, or NULL
+					to use table->col_names */
+	trx_id_t	trx_id,		/*!< in: DDL transaction id,
+					or 0 to check
+					recursive load of tables
+					chained by FK */
+	bool		check_charsets,	/*!< in: whether to check
+					charset compatibility */
+	dict_err_ignore_t ignore_err,	/*!< in: error to be ignored */
+	dict_names_t&	fk_tables)	/*!< out: stack of table names
+					which must be loaded
+					subsequently to load all the
+					foreign key constraints. */
+	MY_ATTRIBUTE((nonnull(1)));
+
+/********************************************************************//**
+This function opens a system table, and returns the first record.
+@return first record of the system table */
+const rec_t*
+dict_startscan_system(
+/*==================*/
+	btr_pcur_t*	pcur,	/*!< out: persistent cursor to
+				the record */
+	mtr_t*		mtr,	/*!< in: the mini-transaction */
+	dict_table_t*	table);	/*!< in: system table */
+/********************************************************************//**
+This function gets the next system table record as we scan the table.
+@return the record if found, NULL if end of scan. */
+const rec_t*
+dict_getnext_system(
+/*================*/
+	btr_pcur_t*	pcur,	/*!< in/out: persistent cursor
+				to the record */
+	mtr_t*		mtr);	/*!< in: the mini-transaction */
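A hedged editorial usage sketch of the two scan helpers above. It is deliberately simplified: the real callers in storage/innobase/handler/i_s.cc additionally latch the dictionary and commit/restart the mini-transaction between records, and dict_sys.sys_tables is assumed here as the handle of the SYS_TABLES system table.

    mtr_t      mtr;
    btr_pcur_t pcur;

    mtr.start();
    /* iterate over every SYS_TABLES record */
    for (const rec_t* rec = dict_startscan_system(&pcur, &mtr,
                                                  dict_sys.sys_tables);
         rec != NULL;
         rec = dict_getnext_system(&pcur, &mtr)) {
        /* process one record, e.g. with dict_load_table_low() */
    }
    mtr.commit();

+
+/** Load a table definition from a SYS_TABLES record to dict_table_t.
+Do not load any columns or indexes.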
+@param[in,out]	mtr		mini-transaction
+@param[in]	uncommitted	whether to use READ UNCOMMITTED isolation level
+@param[in]	rec		SYS_TABLES record
+@param[out,own]	table		table, or nullptr
+@return error message
+@retval nullptr on success */
+const char *dict_load_table_low(mtr_t *mtr, bool uncommitted,
+                                const rec_t *rec, dict_table_t **table)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/********************************************************************//**
+This function parses a SYS_INDEXES record and populates a dict_index_t
+structure with the information from the record. For detailed information
+about SYS_INDEXES fields, please refer to the dict_boot() function.
+@return error message, or NULL on success */
+const char*
+dict_process_sys_indexes_rec(
+/*=========================*/
+	mem_heap_t*	heap,		/*!< in/out: heap memory */
+	const rec_t*	rec,		/*!< in: current SYS_INDEXES rec */
+	dict_index_t*	index,		/*!< out: dict_index_t to be
+					filled */
+	table_id_t*	table_id);	/*!< out: table id */
+/********************************************************************//**
+This function parses a SYS_COLUMNS record and populates a dict_column_t
+structure with the information from the record.
+@return error message, or NULL on success */
+const char*
+dict_process_sys_columns_rec(
+/*=========================*/
+	mem_heap_t*	heap,		/*!< in/out: heap memory */
+	const rec_t*	rec,		/*!< in: current SYS_COLUMNS rec */
+	dict_col_t*	column,		/*!< out: dict_col_t to be filled */
+	table_id_t*	table_id,	/*!< out: table id */
+	const char**	col_name,	/*!< out: column name */
+	ulint*		nth_v_col);	/*!< out: if virtual col, this
+					records its sequence number */
+
+/** This function parses a SYS_VIRTUAL record and extracts virtual column
+information
+@param[in]	rec		current SYS_VIRTUAL rec
+@param[in,out]	table_id	table id
+@param[in,out]	pos		virtual column position
+@param[in,out]	base_pos	base column position
+@return error message, or NULL on success */
+const char*
+dict_process_sys_virtual_rec(
+	const rec_t*	rec,
+	table_id_t*	table_id,
+	ulint*		pos,
+	ulint*		base_pos);
+/********************************************************************//**
+This function parses a SYS_FIELDS record and populates a dict_field_t
+structure with the information from the record.
+@return error message, or NULL on success */
+const char*
+dict_process_sys_fields_rec(
+/*========================*/
+	mem_heap_t*	heap,		/*!< in/out: heap memory */
+	const rec_t*	rec,		/*!< in: current SYS_FIELDS rec */
+	dict_field_t*	sys_field,	/*!< out: dict_field_t to be
+					filled */
+	ulint*		pos,		/*!< out: Field position */
+	index_id_t*	index_id,	/*!< out: current index id */
+	index_id_t	last_id);	/*!< in: previous index id */
+/********************************************************************//**
+This function parses a SYS_FOREIGN record and populates a dict_foreign_t
+structure with the information from the record. For detailed information
+about SYS_FOREIGN fields, please refer to the dict_load_foreign() function.
+@return error message, or NULL on success */
+const char*
+dict_process_sys_foreign_rec(
+/*=========================*/
+	mem_heap_t*	heap,		/*!< in/out: heap memory */
+	const rec_t*	rec,		/*!< in: current SYS_FOREIGN rec */
+	dict_foreign_t*	foreign);	/*!< out: dict_foreign_t to be
+					filled */
+/********************************************************************//**
+This function parses a SYS_FOREIGN_COLS record, extracts the necessary
+information from the record and returns it to the caller.
+@return error message, or NULL on success */
+const char*
+dict_process_sys_foreign_col_rec(
+/*=============================*/
+	mem_heap_t*	heap,		/*!< in/out: heap memory */
+	const rec_t*	rec,		/*!< in: current SYS_FOREIGN_COLS rec */
+	const char**	name,		/*!< out: foreign key constraint name */
+	const char**	for_col_name,	/*!< out: referencing column name */
+	const char**	ref_col_name,	/*!< out: referenced column name
+					in referenced table */
+	ulint*		pos);		/*!< out: column position */
+
+#endif
diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h
new file mode 100644
index 00000000..fde2a714
--- /dev/null
+++ b/storage/innobase/include/dict0mem.h
@@ -0,0 +1,2649 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2013, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0mem.h
+Data dictionary memory object creation
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0mem_h
+#define dict0mem_h
+
+#include "dict0types.h"
+#include "data0type.h"
+#include "mem0mem.h"
+#include "row0types.h"
+#include "btr0types.h"
+#include "lock0types.h"
+#include "que0types.h"
+#include "sux_lock.h"
+#include "ut0mem.h"
+#include "ut0rnd.h"
+#include "ut0byte.h"
+#include "hash0hash.h"
+#include "trx0types.h"
+#include "fts0fts.h"
+#include "buf0buf.h"
+#include "mtr0mtr.h"
+#include "gis0type.h"
+#include "fil0fil.h"
+#include "fil0crypt.h"
+#include "mysql_com.h"
+#include <set>
+#include <algorithm>
+#include <iterator>
+#include <ostream>
+#include <mutex>
+#include <forward_list>
+
+/* Forward declaration. */
+struct ib_rbt_t;
+
+/** Type flags of an index: OR'ing of the flags is allowed to define a
+combination of types */
+/* @{ */
+#define DICT_CLUSTERED	1	/*!< clustered index; for other than
+				auto-generated clustered indexes,
+				also DICT_UNIQUE will be set */
+#define DICT_UNIQUE	2	/*!< unique index */
+#define DICT_IBUF	8	/*!< insert buffer tree */
+#define DICT_CORRUPT	16	/*!< bit to store the corrupted flag
+				in SYS_INDEXES.TYPE */
+#define DICT_FTS	32	/* FTS index; can't be combined with the
+				other flags */
+#define DICT_SPATIAL	64	/* SPATIAL index; can't be combined with the
+				other flags */
+#define DICT_VIRTUAL	128	/* Index on Virtual column */
+
+#define DICT_IT_BITS	8	/*!< number of bits used for
+				SYS_INDEXES.TYPE */
+/* @} */
+
+#if 0 /* not implemented, retained for history */
+/** Types for a table object */
+#define DICT_TABLE_ORDINARY	1	/*!< ordinary table */
+#define DICT_TABLE_CLUSTER_MEMBER	2
+#define DICT_TABLE_CLUSTER	3	/* this means that the table is
+					really a cluster definition */
+#endif
+
+/* Table and tablespace flags are generally not used for the Antelope file
+format except for the low order bit, which is used differently depending on
+where the flags are stored.
+
+==================== Low order flags bit =========================
+                    | REDUNDANT | COMPACT | COMPRESSED and DYNAMIC
+SYS_TABLES.TYPE     |         1 |       1 |                      1
+dict_table_t::flags |         0 |       1 |                      1
+FSP_SPACE_FLAGS     |         0 |       0 |                      1
+fil_space_t::flags  |         0 |       0 |                      1
+
+Before the 5.1 plugin, SYS_TABLES.TYPE was always DICT_TABLE_ORDINARY (1)
+and the tablespace flags field was always 0. In the 5.1 plugin, these fields
+were repurposed to identify compressed and dynamic row formats.
+
+The following types and constants describe the flags found in dict_table_t
+and SYS_TABLES.TYPE. Similar flags found in fil_space_t and FSP_SPACE_FLAGS
+are described in fsp0fsp.h. */
+
+/* @{ */
+/** dict_table_t::flags bit 0 is equal to 0 if the row format = Redundant */
+#define DICT_TF_REDUNDANT	0	/*!< Redundant row format. */
+/** dict_table_t::flags bit 0 is equal to 1 if the row format = Compact */
+#define DICT_TF_COMPACT		1U	/*!< Compact row format. */
+
+/** This bitmask is used in SYS_TABLES.N_COLS to set and test whether
+the Compact page format is used, i.e ROW_FORMAT != REDUNDANT */
+constexpr uint32_t DICT_N_COLS_COMPACT= 1U << 31;
+
+/** Width of the COMPACT flag */
+#define DICT_TF_WIDTH_COMPACT	1
+
+/** Width of the ZIP_SSIZE flag */
+#define DICT_TF_WIDTH_ZIP_SSIZE	4
+
+/** Width of the ATOMIC_BLOBS flag. The ROW_FORMAT=REDUNDANT and
+ROW_FORMAT=COMPACT broke up BLOB and TEXT fields, storing the first 768 bytes
+in the clustered index. ROW_FORMAT=DYNAMIC and ROW_FORMAT=COMPRESSED
+store the whole blob or text field off-page atomically.
+Secondary indexes are created from this external data using row_ext_t
+to cache the BLOB prefixes. */
+#define DICT_TF_WIDTH_ATOMIC_BLOBS	1
+
+/** If a table is created with the MYSQL option DATA DIRECTORY and
+innodb-file-per-table, an older engine will not be able to find that table.
+This flag prevents older engines from attempting to open the table and
+allows InnoDB to update_create_info() accordingly. */
+#define DICT_TF_WIDTH_DATA_DIR 1
+
+/**
+Width of the page compression flag
+*/
+#define DICT_TF_WIDTH_PAGE_COMPRESSION 1
+#define DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL 4
+
+/**
+The NO_ROLLBACK flag (3=yes; the values 1,2 were used for
+ATOMIC_WRITES=ON and ATOMIC_WRITES=OFF between MariaDB 10.1.0 and 10.2.3)
+*/
+#define DICT_TF_WIDTH_NO_ROLLBACK 2
+
+/** Width of all the currently known table flags */
+#define DICT_TF_BITS	(DICT_TF_WIDTH_COMPACT \
+			+ DICT_TF_WIDTH_ZIP_SSIZE \
+			+ DICT_TF_WIDTH_ATOMIC_BLOBS \
+			+ DICT_TF_WIDTH_DATA_DIR \
+			+ DICT_TF_WIDTH_PAGE_COMPRESSION \
+			+ DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL \
+			+ DICT_TF_WIDTH_NO_ROLLBACK)
+
+/** Zero relative shift position of the COMPACT field */
+#define DICT_TF_POS_COMPACT		0
+/** Zero relative shift position of the ZIP_SSIZE field */
+#define DICT_TF_POS_ZIP_SSIZE		(DICT_TF_POS_COMPACT \
+					+ DICT_TF_WIDTH_COMPACT)
+/** Zero relative shift position of the ATOMIC_BLOBS field */
+#define DICT_TF_POS_ATOMIC_BLOBS	(DICT_TF_POS_ZIP_SSIZE \
+					+ DICT_TF_WIDTH_ZIP_SSIZE)
+/** Zero relative shift position of the DATA_DIR field */
+#define DICT_TF_POS_DATA_DIR		(DICT_TF_POS_ATOMIC_BLOBS \
+					+ DICT_TF_WIDTH_ATOMIC_BLOBS)
+/** Zero relative shift position of the PAGE_COMPRESSION field */
+#define DICT_TF_POS_PAGE_COMPRESSION	(DICT_TF_POS_DATA_DIR \
+					+ DICT_TF_WIDTH_DATA_DIR)
+/** Zero relative shift position of the PAGE_COMPRESSION_LEVEL field */
+#define DICT_TF_POS_PAGE_COMPRESSION_LEVEL	(DICT_TF_POS_PAGE_COMPRESSION \
+					+ DICT_TF_WIDTH_PAGE_COMPRESSION)
+/** Zero relative shift position of the NO_ROLLBACK field */
+#define DICT_TF_POS_NO_ROLLBACK		(DICT_TF_POS_PAGE_COMPRESSION_LEVEL \
+					+ DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL)
+#define DICT_TF_POS_UNUSED		(DICT_TF_POS_NO_ROLLBACK \
+					+ DICT_TF_WIDTH_NO_ROLLBACK)
+
+/** Bit mask of the COMPACT field */
+#define DICT_TF_MASK_COMPACT \
+		((~(~0U << DICT_TF_WIDTH_COMPACT)) \
+		<< DICT_TF_POS_COMPACT)
+/** Bit mask of the ZIP_SSIZE field */
+#define DICT_TF_MASK_ZIP_SSIZE \
+		((~(~0U << DICT_TF_WIDTH_ZIP_SSIZE)) \
+		<< DICT_TF_POS_ZIP_SSIZE)
+/** Bit mask of the ATOMIC_BLOBS field */
+#define DICT_TF_MASK_ATOMIC_BLOBS \
+		((~(~0U << DICT_TF_WIDTH_ATOMIC_BLOBS)) \
+		<< DICT_TF_POS_ATOMIC_BLOBS)
+/** Bit mask of the DATA_DIR field */
+#define DICT_TF_MASK_DATA_DIR \
+		((~(~0U << DICT_TF_WIDTH_DATA_DIR)) \
+		<< DICT_TF_POS_DATA_DIR)
+/** Bit mask of the PAGE_COMPRESSION field */
+#define DICT_TF_MASK_PAGE_COMPRESSION \
+		((~(~0U << DICT_TF_WIDTH_PAGE_COMPRESSION)) \
+		<< DICT_TF_POS_PAGE_COMPRESSION)
+/** Bit mask of the PAGE_COMPRESSION_LEVEL field */
+#define DICT_TF_MASK_PAGE_COMPRESSION_LEVEL \
+		((~(~0U << DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL)) \
+		<< DICT_TF_POS_PAGE_COMPRESSION_LEVEL)
+/** Bit mask of the NO_ROLLBACK field */
+#define DICT_TF_MASK_NO_ROLLBACK \
+		((~(~0U << DICT_TF_WIDTH_NO_ROLLBACK)) \
+		<< DICT_TF_POS_NO_ROLLBACK)
+
+/** Return the value of the COMPACT field */
+#define DICT_TF_GET_COMPACT(flags) \
+		((flags & DICT_TF_MASK_COMPACT) \
+		>> DICT_TF_POS_COMPACT)
+/** Return the value of the ZIP_SSIZE field */
+#define DICT_TF_GET_ZIP_SSIZE(flags) \
+		((flags & DICT_TF_MASK_ZIP_SSIZE) \
+		>> DICT_TF_POS_ZIP_SSIZE)
+/** Return the value of the ATOMIC_BLOBS field */
+#define DICT_TF_HAS_ATOMIC_BLOBS(flags) \
+		((flags & DICT_TF_MASK_ATOMIC_BLOBS) \
+		>> DICT_TF_POS_ATOMIC_BLOBS)
+/** Return the value of the DATA_DIR field */
+#define DICT_TF_HAS_DATA_DIR(flags) \
+		((flags & DICT_TF_MASK_DATA_DIR) \
+		>> DICT_TF_POS_DATA_DIR)
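As a hedged editorial illustration of this layout: for ROW_FORMAT=COMPRESSED with KEY_BLOCK_SIZE=8, the compressed page size is 8192 = 512 << 4, so zip_ssize is 4 and atomic BLOBs are in use. A sketch that composes such a flags word and reads the fields back with the accessor macros:

    /* compose flags: COMPACT bit, zip_ssize = 4, atomic BLOBs on */
    uint32_t flags = DICT_TF_COMPACT
        | (4U << DICT_TF_POS_ZIP_SSIZE)
        | (1U << DICT_TF_POS_ATOMIC_BLOBS);

    ut_ad(DICT_TF_GET_COMPACT(flags) == 1);
    ut_ad(DICT_TF_GET_ZIP_SSIZE(flags) == 4);
    ut_ad(DICT_TF_HAS_ATOMIC_BLOBS(flags) == 1);

+/** Return the value of the PAGE_COMPRESSION field */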
+#define DICT_TF_GET_PAGE_COMPRESSION(flags) \
+		((flags & DICT_TF_MASK_PAGE_COMPRESSION) \
+		>> DICT_TF_POS_PAGE_COMPRESSION)
+/** Return the value of the PAGE_COMPRESSION_LEVEL field */
+#define DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags) \
+		((flags & DICT_TF_MASK_PAGE_COMPRESSION_LEVEL) \
+		>> DICT_TF_POS_PAGE_COMPRESSION_LEVEL)
+
+/* @} */
+
+/** @brief Table Flags set number 2.
+
+These flags will be stored in SYS_TABLES.MIX_LEN. All unused flags
+will be written as 0. The column may contain garbage for tables
+created with old versions of InnoDB that only implemented
+ROW_FORMAT=REDUNDANT. InnoDB engines do not check these flags
+for unknown bits in order to preserve backward compatibility. */
+/* @{ */
+/** Total number of bits in table->flags2. */
+#define DICT_TF2_BITS			7
+#define DICT_TF2_UNUSED_BIT_MASK	(~0U << DICT_TF2_BITS)
+#define DICT_TF2_BIT_MASK		~DICT_TF2_UNUSED_BIT_MASK
+
+/** TEMPORARY; TRUE for tables from CREATE TEMPORARY TABLE. */
+#define DICT_TF2_TEMPORARY		1U
+
+/** The table has an internal defined DOC ID column */
+#define DICT_TF2_FTS_HAS_DOC_ID		2U
+
+/** The table has an FTS index */
+#define DICT_TF2_FTS			4U
+
+/** Need to add Doc ID column for FTS index build.
+This is a transient bit for index build */
+#define DICT_TF2_FTS_ADD_DOC_ID		8U
+
+/** This bit is used during table creation to indicate that it will
+use its own tablespace instead of the system tablespace. */
+#define DICT_TF2_USE_FILE_PER_TABLE	16U
+
+/** Set when we discard/detach the tablespace */
+#define DICT_TF2_DISCARDED		32U
+
+/** This bit is set if all aux table names (both common tables and
+index tables) of a FTS table are in HEX format. */
+#define DICT_TF2_FTS_AUX_HEX_NAME	64U
+
+/* @} */
+
+#define DICT_TF2_FLAG_SET(table, flag) \
+	(table->flags2 |= (flag))
+
+#define DICT_TF2_FLAG_IS_SET(table, flag) \
+	(table->flags2 & (flag))
+
+#define DICT_TF2_FLAG_UNSET(table, flag) \
+	(table->flags2 &= ~(flag) & ((1U << DICT_TF2_BITS) - 1))
+
+/** Tables can be chained together with foreign key constraints. When
+the parent table is first loaded, we would load all of its descendants.
+This could eventually result in recursive calls and an out-of-stack error.
+DICT_FK_MAX_RECURSIVE_LOAD defines the maximum number of recursive loads;
+when it is exceeded, the child table will not be loaded. It will be loaded
+when the foreign constraint check needs to be run. */
+#define DICT_FK_MAX_RECURSIVE_LOAD	20
+
+/** Similarly, when tables are chained together with foreign key constraints
+with ON DELETE/UPDATE CASCADE clauses, a delete from the parent table could
+result in recursive cascading calls. This defines the maximum number of
+such cascading deletes/updates allowed. When it is exceeded, the delete from
+the parent table will fail, and the user has to drop the excessive foreign
+constraints before proceeding. */
+#define FK_MAX_CASCADE_DEL		15
+
+/****************************************************************//**
+Free a table memory object. */
+void
+dict_mem_table_free(
+/*================*/
+	dict_table_t*	table);	/*!< in: table */
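A small hedged sketch of the flag-manipulation macros above; table is assumed to be a dict_table_t pointer obtained elsewhere:

    /* mark a table as file-per-table during creation, then clear the bit */
    DICT_TF2_FLAG_SET(table, DICT_TF2_USE_FILE_PER_TABLE);
    ut_ad(DICT_TF2_FLAG_IS_SET(table, DICT_TF2_USE_FILE_PER_TABLE));
    DICT_TF2_FLAG_UNSET(table, DICT_TF2_USE_FILE_PER_TABLE);
    ut_ad(!DICT_TF2_FLAG_IS_SET(table, DICT_TF2_USE_FILE_PER_TABLE));

+/**********************************************************************//**
+Adds a column definition to a table. */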
+void
+dict_mem_table_add_col(
+/*===================*/
+	dict_table_t*	table,	/*!< in: table */
+	mem_heap_t*	heap,	/*!< in: temporary memory heap, or NULL */
+	const char*	name,	/*!< in: column name, or NULL */
+	ulint		mtype,	/*!< in: main datatype */
+	ulint		prtype,	/*!< in: precise type */
+	ulint		len)	/*!< in: precision */
+	MY_ATTRIBUTE((nonnull(1)));
+/** Adds a virtual column definition to a table.
+@param[in,out]	table		table
+@param[in]	heap		temporary memory heap, or NULL. It is
+				used to store the name while we have not
+				finished adding all columns. When all columns
+				are added, the whole name is copied into
+				memory from table->heap
+@param[in]	name		column name
+@param[in]	mtype		main datatype
+@param[in]	prtype		precise type
+@param[in]	len		length
+@param[in]	pos		position in a table
+@param[in]	num_base	number of base columns
+@return the virtual column definition */
+dict_v_col_t*
+dict_mem_table_add_v_col(
+	dict_table_t*	table,
+	mem_heap_t*	heap,
+	const char*	name,
+	ulint		mtype,
+	ulint		prtype,
+	ulint		len,
+	ulint		pos,
+	ulint		num_base);
+
+/** Adds a stored column definition to a table.
+@param[in]	table		table
+@param[in]	num_base	number of base columns. */
+void
+dict_mem_table_add_s_col(
+	dict_table_t*	table,
+	ulint		num_base);
+
+/**********************************************************************//**
+Renames a column of a table in the data dictionary cache. */
+void
+dict_mem_table_col_rename(
+/*======================*/
+	dict_table_t*	table,	/*!< in/out: table */
+	ulint		nth_col,/*!< in: column index */
+	const char*	from,	/*!< in: old column name */
+	const char*	to,	/*!< in: new column name */
+	bool		is_virtual);
+				/*!< in: if this is a virtual column */
+/**********************************************************************//**
+This function populates a dict_col_t memory structure with
+supplied information. */
+void
+dict_mem_fill_column_struct(
+/*========================*/
+	dict_col_t*	column,		/*!< out: column struct to be
+					filled */
+	ulint		col_pos,	/*!< in: column position */
+	ulint		mtype,		/*!< in: main data type */
+	ulint		prtype,		/*!< in: precise type */
+	ulint		col_len);	/*!< in: column length */
+/**********************************************************************//**
+This function populates a dict_index_t index memory structure with
+supplied information. */
+UNIV_INLINE
+void
+dict_mem_fill_index_struct(
+/*=======================*/
+	dict_index_t*	index,		/*!< out: index to be filled */
+	mem_heap_t*	heap,		/*!< in: memory heap */
+	const char*	index_name,	/*!< in: index name */
+	ulint		type,		/*!< in: DICT_UNIQUE,
+					DICT_CLUSTERED, ... ORed */
+	ulint		n_fields);	/*!< in: number of fields */
+/**********************************************************************//**
+Creates an index memory object.
+@return own: index object */
+dict_index_t*
+dict_mem_index_create(
+/*==================*/
+	dict_table_t*	table,		/*!< in: table */
+	const char*	index_name,	/*!< in: index name */
+	ulint		type,		/*!< in: DICT_UNIQUE,
+					DICT_CLUSTERED, ... ORed */
+	ulint		n_fields);	/*!< in: number of fields */
+
+/**********************************************************************//**
+Frees an index memory object. */
+void
+dict_mem_index_free(
+/*================*/
+	dict_index_t*	index);	/*!< in: index */
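A hedged editorial usage sketch of the column helpers above. The table object is assumed to have been created earlier (e.g. with dict_mem_table_create()); passing table->heap for the name allocations follows the pattern of upstream callers, but treat the details as illustrative only:

    dict_mem_table_add_col(table, table->heap, "id",
                           DATA_INT, DATA_NOT_NULL | DATA_UNSIGNED, 4);
    dict_mem_table_add_col(table, table->heap, "name",
                           DATA_VARCHAR, DATA_NOT_NULL, 100);

+/**********************************************************************//**
+Creates and initializes a foreign constraint memory object.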
+@return own: foreign constraint struct */
+dict_foreign_t*
+dict_mem_foreign_create(void);
+/*=========================*/
+
+/**********************************************************************//**
+Sets the foreign_table_name_lookup pointer based on the value of
+lower_case_table_names. If that is 0 or 1, foreign_table_name_lookup
+will point to foreign_table_name. If 2, then another string is
+allocated from the heap and set to lower case. */
+void
+dict_mem_foreign_table_name_lookup_set(
+/*===================================*/
+	dict_foreign_t*	foreign,	/*!< in/out: foreign struct */
+	ibool		do_alloc);	/*!< in: is an alloc needed */
+
+/**********************************************************************//**
+Sets the referenced_table_name_lookup pointer based on the value of
+lower_case_table_names. If that is 0 or 1, referenced_table_name_lookup
+will point to referenced_table_name. If 2, then another string is
+allocated from the heap and set to lower case. */
+void
+dict_mem_referenced_table_name_lookup_set(
+/*======================================*/
+	dict_foreign_t*	foreign,	/*!< in/out: foreign struct */
+	ibool		do_alloc);	/*!< in: is an alloc needed */
+
+/** Fills the dependent virtual columns in a set.
+Reasons for being dependent are:
+1) FK can be present on a base column of virtual columns
+2) FK can be present on a column which is a part of a virtual index
+@param[in,out]	foreign	foreign key information. */
+void
+dict_mem_foreign_fill_vcol_set(
+	dict_foreign_t*	foreign);
+
+/** Fill the virtual column set in each fk constraint present in the table.
+@param[in,out]	table	innodb table object. */
+void
+dict_mem_table_fill_foreign_vcol_set(
+	dict_table_t*	table);
+
+/** Free the vcol_set from all foreign key constraints on the table.
+@param[in,out]	table	innodb table object. */
+void
+dict_mem_table_free_foreign_vcol_set(
+	dict_table_t*	table);
+
+/** Create a temporary tablename like "#sql-ibNNN".
+@param[in]	heap	A memory heap
+@param[in]	dbtab	Table name in the form database/table name
+@param[in]	id	Table id
+@return A unique temporary tablename suitable for InnoDB use */
+char*
+dict_mem_create_temporary_tablename(
+	mem_heap_t*	heap,
+	const char*	dbtab,
+	table_id_t	id);
+
+/** SQL identifier name wrapper for pretty-printing */
+class id_name_t
+{
+public:
+	/** Default constructor */
+	id_name_t()
+		: m_name()
+	{}
+	/** Constructor
+	@param[in]	name	identifier to assign */
+	explicit id_name_t(
+		const char*	name)
+		: m_name(name)
+	{}
+
+	/** Assignment operator
+	@param[in]	name	identifier to assign */
+	id_name_t& operator=(
+		const char*	name)
+	{
+		m_name = name;
+		return(*this);
+	}
+
+	/** Implicit type conversion
+	@return the name */
+	operator const char*() const
+	{
+		return(m_name);
+	}
+
+	/** Explicit type conversion
+	@return the name */
+	const char* operator()() const
+	{
+		return(m_name);
+	}
+
+private:
+	/** The name in internal representation */
+	const char*	m_name;
+};
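A small editorial illustration of the wrapper's two conversion paths:

    id_name_t name("PRIMARY");
    const char* s = name;     /* implicit conversion */
    const char* t = name();   /* explicit conversion */
    ut_ad(s == t);            /* both return the wrapped pointer */

+
+/** Data structure for a column in a table */
+struct dict_col_t{
+	/*----------------------*/
+	/** The following are copied from dtype_t,
+	so that all bit-fields can be packed tightly. */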
+	/* @{ */
+	unsigned	prtype:32;	/*!< precise type; MySQL data
+					type, charset code, flags to
+					indicate nullability,
+					signedness, whether this is a
+					binary string, whether this is
+					a true VARCHAR where MySQL
+					uses 2 bytes to store the length */
+	unsigned	mtype:8;	/*!< main data type */
+
+	/* the remaining fields do not affect alphabetical ordering: */
+
+	unsigned	len:16;		/*!< length; for MySQL data this
+					is field->pack_length(),
+					except that for a >= 5.0.3
+					type true VARCHAR this is the
+					maximum byte length of the
+					string data (in addition to
+					the string, MySQL uses 1 or 2
+					bytes to store the string length) */
+
+	unsigned	mbminlen:3;	/*!< minimum length of a
+					character, in bytes */
+	unsigned	mbmaxlen:3;	/*!< maximum length of a
+					character, in bytes */
+	/*----------------------*/
+	/* End of definitions copied from dtype_t */
+	/* @} */
+
+	unsigned	ind:10;		/*!< table column position
+					(starting from 0) */
+	unsigned	ord_part:1;	/*!< nonzero if this column
+					appears in the ordering fields
+					of an index */
+	unsigned	max_prefix:12;	/*!< maximum index prefix length on
+					this column. Our current max limit is
+					3072 (REC_VERSION_56_MAX_INDEX_COL_LEN)
+					bytes. */
+private:
+	/** Special value of ind for a dropped column */
+	static const unsigned DROPPED = 1023;
+public:
+
+	/** Detach a virtual column from an index.
+	@param index	being-freed index */
+	inline void detach(const dict_index_t &index);
+
+	/** Data for instantly added columns */
+	struct def_t
+	{
+		/** original default value of instantly added column */
+		const void*	data;
+		/** len of data, or UNIV_SQL_DEFAULT if unavailable */
+		ulint		len;
+	} def_val;
+
+	/** Retrieve the column name.
+	@param table	the table of this column */
+	const char *name(const dict_table_t &table) const;
+
+	/** @return whether this is a virtual column */
+	bool is_virtual() const { return prtype & DATA_VIRTUAL; }
+	/** @return whether NULL is an allowed value for this column */
+	bool is_nullable() const { return !(prtype & DATA_NOT_NULL); }
+
+	/** @return whether table of this system field is TRX_ID-based */
+	bool vers_native() const
+	{
+		ut_ad(vers_sys_start() || vers_sys_end());
+		ut_ad(mtype == DATA_INT || mtype == DATA_FIXBINARY);
+		return mtype == DATA_INT;
+	}
+	/** @return whether this user column (not row_start, row_end)
+	has System Versioning property */
+	bool is_versioned() const { return !(~prtype & DATA_VERSIONED); }
+	/** @return whether this is the system version start */
+	bool vers_sys_start() const
+	{
+		return (prtype & DATA_VERSIONED) == DATA_VERS_START;
+	}
+	/** @return whether this is the system version end */
+	bool vers_sys_end() const
+	{
+		return (prtype & DATA_VERSIONED) == DATA_VERS_END;
+	}
+
+	/** @return whether this is an instantly-added column */
+	bool is_added() const
+	{
+		DBUG_ASSERT(def_val.len != UNIV_SQL_DEFAULT || !def_val.data);
+		return def_val.len != UNIV_SQL_DEFAULT;
+	}
+	/** Flag the column instantly dropped */
+	void set_dropped() { ind = DROPPED; }
+	/** Flag the column instantly dropped.
+	@param not_null	whether the column was NOT NULL
+	@param len2	whether the length exceeds 255 bytes
+	@param fixed	the fixed length in bytes, or 0 */
+	void set_dropped(bool not_null, bool len2, unsigned fixed)
+	{
+		DBUG_ASSERT(!len2 || !fixed);
+		prtype= not_null ? DATA_NOT_NULL | DATA_BINARY_TYPE : DATA_BINARY_TYPE;
+		if (fixed)
+		{
+			mtype= DATA_FIXBINARY;
+			len= static_cast<uint16_t>(fixed);
+		}
+		else
+		{
+			mtype= DATA_BINARY;
+			len= len2 ? 65535 : 255;
+		}
+		mbminlen= mbmaxlen= 0;
+		ind= DROPPED;
+		ord_part= 0;
+		max_prefix= 0;
+	}
+	/** @return whether the column was instantly dropped */
+	bool is_dropped() const { return ind == DROPPED; }
+	/** @return whether the column was instantly dropped
+	@param index	the clustered index */
+	inline bool is_dropped(const dict_index_t &index) const;
+
+	/** Get the default value of an instantly-added column.
+	@param[out]	len	value length (in bytes), or UNIV_SQL_NULL
+	@return default value
+	@retval NULL	if the default value is SQL NULL (len=UNIV_SQL_NULL) */
+	const byte *instant_value(ulint *len) const
+	{
+		DBUG_ASSERT(is_added());
+		*len= def_val.len;
+		return static_cast<const byte*>(def_val.data);
+	}
+
+	/** Remove the 'instant ADD' status of the column */
+	void clear_instant()
+	{
+		def_val.len= UNIV_SQL_DEFAULT;
+		def_val.data= NULL;
+	}
+
+	/** @return whether two columns have compatible data type encoding */
+	bool same_type(const dict_col_t &other) const
+	{
+		if (mtype != other.mtype)
+		{
+			/* For latin1_swedish_ci, DATA_CHAR and DATA_VARCHAR
+			will be used instead of DATA_MYSQL and DATA_VARMYSQL.
+			As long as mtype,prtype are being written to InnoDB
+			data dictionary tables, we cannot simplify this. */
+			switch (mtype) {
+			default:
+				return false;
+			case DATA_VARCHAR:
+				if (other.mtype != DATA_VARMYSQL)
+					return false;
+				goto check_encoding;
+			case DATA_VARMYSQL:
+				if (other.mtype != DATA_VARCHAR)
+					return false;
+				goto check_encoding;
+			case DATA_CHAR:
+				if (other.mtype != DATA_MYSQL)
+					return false;
+				goto check_encoding;
+			case DATA_MYSQL:
+				if (other.mtype != DATA_CHAR)
+					return false;
+				goto check_encoding;
+			}
+		}
+		else if (dtype_is_string_type(mtype))
+		{
+		check_encoding:
+			const uint16_t cset= dtype_get_charset_coll(prtype);
+			const uint16_t ocset= dtype_get_charset_coll(other.prtype);
+			return cset == ocset || dict_col_t::same_encoding(cset, ocset);
+		}
+
+		return true;
+	}
+
+	/** @return whether two collation codes have the same character encoding */
+	static bool same_encoding(uint16_t a, uint16_t b);
+
+	/** Determine if the columns have the same format
+	except for is_nullable() and is_versioned().
+	@param other	column to compare to
+	@return whether the columns have the same format */
+	bool same_format(const dict_col_t &other) const
+	{
+		return same_type(other) && len >= other.len &&
+			mbminlen == other.mbminlen && mbmaxlen >= other.mbmaxlen &&
+			!((prtype ^ other.prtype) & ~(DATA_NOT_NULL | DATA_VERSIONED |
+						      CHAR_COLL_MASK << 16 |
+						      DATA_LONG_TRUE_VARCHAR));
+	}
+
+	/** @return whether the column values are comparable by memcmp() */
+	bool is_binary() const { return prtype & DATA_BINARY_TYPE; }
+};
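An editorial sketch of the def_val convention that is_added(), instant_value() and clear_instant() above rely on: UNIV_SQL_DEFAULT in the len field marks a column that was not instantly added.

    dict_col_t col{};
    col.clear_instant();                 /* len = UNIV_SQL_DEFAULT, no data */
    ut_ad(!col.is_added());

    static const byte def_value[4] = {0, 0, 0, 42};
    col.def_val.data = def_value;        /* instantly added column ... */
    col.def_val.len = sizeof def_value;  /* ... with a 4-byte default */
    ut_ad(col.is_added());

    ulint len;
    const byte* v = col.instant_value(&len); /* v == def_value, len == 4 */

+
+/** Index information kept in a list inside the virtual column structure.
+The index id and the virtual column's position in the index will be logged.
+There can be multiple entries for a given index, each with a different
+position. */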
+struct dict_v_idx_t {
+	/** active index on the column */
+	dict_index_t*	index;
+
+	/** position in this index */
+	ulint		nth_field;
+
+	dict_v_idx_t(dict_index_t* index, ulint nth_field)
+		: index(index), nth_field(nth_field) {}
+};
+
+/** Data structure for a virtual column in a table */
+struct dict_v_col_t{
+	/** column structure */
+	dict_col_t		m_col;
+
+	/** array of base column pointers */
+	dict_col_t**		base_col;
+
+	/** number of base columns */
+	unsigned		num_base:10;
+
+	/** column pos in table */
+	unsigned		v_pos:10;
+
+	/** Virtual index list, and column position in the index */
+	std::forward_list<dict_v_idx_t, ut_allocator<dict_v_idx_t> >
+		v_indexes;
+
+	/** Detach the column from an index.
+	@param index	index to be detached from */
+	void detach(const dict_index_t &index)
+	{
+		if (v_indexes.empty()) return;
+		auto i= v_indexes.before_begin();
+		do {
+			auto prev = i++;
+			if (i == v_indexes.end())
+			{
+				return;
+			}
+			if (i->index == &index)
+			{
+				v_indexes.erase_after(prev);
+				return;
+			}
+		}
+		while (i != v_indexes.end());
+	}
+};
+
+/** Data structure for a newly added virtual column in an index.
+It is used only during rollback_inplace_alter_table() of the addition
+of an index that depends on newly added virtual columns. It uses the
+index heap and should be freed when the index is being removed from
+the cache. */
+struct dict_add_v_col_info
+{
+  ulint n_v_col;
+  dict_v_col_t *v_col;
+
+  /** Add the newly added virtual column while rolling back
+  the index which contains new virtual columns
+  @param col	virtual column to be duplicated
+  @param offset	offset where to duplicate virtual column */
+  dict_v_col_t* add_drop_v_col(mem_heap_t *heap, dict_v_col_t *col,
+                               ulint offset)
+  {
+    ut_ad(n_v_col);
+    ut_ad(offset < n_v_col);
+    if (!v_col)
+      v_col= static_cast<dict_v_col_t*>
+        (mem_heap_alloc(heap, n_v_col * sizeof *v_col));
+    new (&v_col[offset]) dict_v_col_t();
+    v_col[offset].m_col= col->m_col;
+    v_col[offset].v_pos= col->v_pos;
+    return &v_col[offset];
+  }
+};
+
+/** Data structure for newly added virtual columns in a table */
+struct dict_add_v_col_t{
+	/** number of new virtual columns */
+	ulint			n_v_col;
+
+	/** column structures */
+	const dict_v_col_t*	v_col;
+
+	/** new col names */
+	const char**		v_col_name;
+};
+
+/** Data structure for a stored column in a table. */
+struct dict_s_col_t {
+	/** Stored column ptr */
+	dict_col_t*	m_col;
+	/** array of base col ptr */
+	dict_col_t**	base_col;
+	/** number of base columns */
+	ulint		num_base;
+	/** column pos in table */
+	ulint		s_pos;
+};
+
+/** List of stored columns for create_table_info_t */
+typedef std::forward_list<dict_s_col_t, ut_allocator<dict_s_col_t> >
+dict_s_col_list;
+
+/** @brief DICT_ANTELOPE_MAX_INDEX_COL_LEN is measured in bytes and
+is the maximum indexed column length (or indexed prefix length) in
+ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT. Also, in any format,
+any fixed-length field that is longer than this will be encoded as
+a variable-length field.
+
+It is set to 3*256, so that one can create a column prefix index on
+256 characters of a TEXT or VARCHAR column also in the UTF-8
+charset. In that charset, a character may take at most 3 bytes. This
+constant MUST NOT BE CHANGED, or the compatibility of InnoDB data
+files would be at risk! */
+#define DICT_ANTELOPE_MAX_INDEX_COL_LEN	REC_ANTELOPE_MAX_INDEX_COL_LEN
+
+/** Find out the maximum indexed column length by its table format.
+For ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT, the maximum
+field length is REC_ANTELOPE_MAX_INDEX_COL_LEN - 1 (767).
+For ROW_FORMAT=COMPRESSED and ROW_FORMAT=DYNAMIC, the length could
+be REC_VERSION_56_MAX_INDEX_COL_LEN (3072) bytes */
+#define DICT_MAX_FIELD_LEN_BY_FORMAT(table) \
+	(dict_table_has_atomic_blobs(table) \
+	 ? REC_VERSION_56_MAX_INDEX_COL_LEN \
+	 : REC_ANTELOPE_MAX_INDEX_COL_LEN - 1)
+
+#define DICT_MAX_FIELD_LEN_BY_FORMAT_FLAG(flags) \
+	(DICT_TF_HAS_ATOMIC_BLOBS(flags) \
+	 ? REC_VERSION_56_MAX_INDEX_COL_LEN \
+	 : REC_ANTELOPE_MAX_INDEX_COL_LEN - 1)
+
+/** Defines the maximum fixed length column size */
+#define DICT_MAX_FIXED_COL_LEN		DICT_ANTELOPE_MAX_INDEX_COL_LEN
+
+#ifdef WITH_WSREP
+#define WSREP_MAX_SUPPORTED_KEY_LENGTH 3500
+#endif /* WITH_WSREP */
+
+/** Data structure for a field in an index */
+struct dict_field_t{
+	dict_col_t*	col;		/*!< pointer to the table column */
+	id_name_t	name;		/*!< name of the column */
+	unsigned	prefix_len:12;	/*!< 0 or the length of the column
+					prefix in bytes in a MySQL index of
+					type, e.g., INDEX (textcol(25));
+					must be smaller than
+					DICT_MAX_FIELD_LEN_BY_FORMAT;
+					NOTE that in the UTF-8 charset, MySQL
+					sets this to (mbmaxlen * the prefix len)
+					in UTF-8 chars */
+	unsigned	fixed_len:10;	/*!< 0 or the fixed length of the
+					column if smaller than
+					DICT_ANTELOPE_MAX_INDEX_COL_LEN */
+	/** 1=DESC, 0=ASC */
+	unsigned	descending:1;
+
+	/** Zero-initialize all fields */
+	dict_field_t() { memset((void*) this, 0, sizeof *this); }
+
+	/** Check whether two index fields are equivalent.
+	@param[in]	other	the other index field
+	@return whether the index fields are equivalent */
+	bool same(const dict_field_t& other) const
+	{
+		return(prefix_len == other.prefix_len
+		       && fixed_len == other.fixed_len);
+	}
+};
+
+/**********************************************************************//**
+PADDING HEURISTIC BASED ON LINEAR INCREASE OF PADDING TO AVOID
+COMPRESSION FAILURES
+(Note: this is relevant only for compressed indexes)
+GOAL: Avoid compression failures by maintaining information about the
+compressibility of data. If data is not very compressible then leave
+some extra space 'padding' in the uncompressed page making it more
+likely that compression of less than fully packed uncompressed page will
+succeed.
+
+This padding heuristic works by increasing the pad linearly until the
+desired failure rate is reached. A "round" is a fixed number of
+compression operations.
+After each round, the compression failure rate for that round is
+computed. If the failure rate is too high, then padding is incremented
+by a fixed value, otherwise it's left intact.
+If the compression failure rate is lower than the desired rate for a fixed
+number of consecutive rounds, then the padding is decreased by a fixed
+value. This is done to prevent overshooting the padding value,
+and to accommodate the possible change in data compressibility. */
+
+/** Number of zip ops in one round. */
+#define ZIP_PAD_ROUND_LEN		(128)
+
+/** Number of successful rounds after which the padding is decreased */
+#define ZIP_PAD_SUCCESSFUL_ROUND_LIMIT	(5)
+
+/** Amount by which padding is increased. */
+#define ZIP_PAD_INCR			(128)
+
+/** Percentage of compression failures that are allowed in a single
+round */
+extern ulong zip_failure_threshold_pct;
+
+/** Maximum percentage of a page that can be allowed as a pad to avoid
+compression failures */
+extern ulong zip_pad_max;
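A minimal editorial simulation of the heuristic just described, using the documented constants. It assumes a 16KiB page, a 5 percent failure threshold (the zip_failure_threshold_pct default) and a 50 percent pad cap (zip_pad_max); the per-round failure counts are made up for illustration.

    #include <cstdio>

    int main()
    {
      const unsigned ZIP_PAD_ROUND_LEN = 128, ZIP_PAD_INCR = 128;
      const unsigned ZIP_PAD_SUCCESSFUL_ROUND_LIMIT = 5;
      const unsigned threshold_pct = 5, pad_max_pct = 50, page_size = 16384;

      unsigned pad = 0, ok_rounds = 0;
      /* pretend the first three rounds fail too often, the rest succeed */
      const unsigned failures_per_round[] = {20, 15, 10, 2, 1, 0, 0, 0, 0};

      for (unsigned f : failures_per_round) {
        if (f * 100 > threshold_pct * ZIP_PAD_ROUND_LEN) {
          ok_rounds = 0;                 /* too many failures: pad more */
          if ((pad + ZIP_PAD_INCR) * 100 <= pad_max_pct * page_size)
            pad += ZIP_PAD_INCR;
        } else if (++ok_rounds >= ZIP_PAD_SUCCESSFUL_ROUND_LIMIT) {
          ok_rounds = 0;                 /* long success streak: pad less */
          if (pad >= ZIP_PAD_INCR)
            pad -= ZIP_PAD_INCR;
        }
        printf("pad=%u bytes\n", pad);
      }
    }

+
+/** Data structure to hold information about how much space in
+an uncompressed page should be left as padding to avoid compression
+failures. This estimate is based on a self-adapting heuristic. */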
+struct zip_pad_info_t {
+	/** Dummy assignment operator for dict_index_t::clone() */
+	zip_pad_info_t &operator=(const zip_pad_info_t&) { return *this; }
+	std::mutex	mutex;	/*!< mutex protecting the info */
+	Atomic_relaxed<ulint>
+			pad;	/*!< number of bytes used as pad */
+	ulint		success;/*!< successful compression ops during
+				current round */
+	ulint		failure;/*!< failed compression ops during
+				current round */
+	ulint		n_rounds;/*!< number of currently successful
+				rounds */
+};
+
+/** Number of samples of data size kept when page compression fails for
+a certain index.*/
+#define STAT_DEFRAG_DATA_SIZE_N_SAMPLE	10
+
+/** "GEN_CLUST_INDEX" is the name reserved for InnoDB default
+system clustered index when there is no primary key. */
+const char innobase_index_reserve_name[] = "GEN_CLUST_INDEX";
+
+/** Data structure for an index. Most fields will be
+initialized to 0, NULL or FALSE in dict_mem_index_create(). */
+struct dict_index_t {
+  /** Columns whose character-set collation is being changed */
+  struct col_info
+  {
+    /** number of columns whose charset-collation is being changed */
+    unsigned n_cols;
+    /** columns with changed charset-collation */
+    dict_col_t *cols;
+
+    /** Add a column with changed collation. */
+    dict_col_t *add(mem_heap_t *heap, const dict_col_t &col, unsigned offset)
+    {
+      ut_ad(offset < n_cols);
+      if (!cols)
+        cols= static_cast<dict_col_t*>
+          (mem_heap_alloc(heap, n_cols * sizeof col));
+      new (&cols[offset]) dict_col_t(col);
+      return &cols[offset];
+    }
+  };
+
+  /** Maximum number of fields */
+  static constexpr unsigned MAX_N_FIELDS= (1U << 10) - 1;
+
+	index_id_t	id;	/*!< id of the index */
+	mem_heap_t*	heap;	/*!< memory heap */
+	id_name_t	name;	/*!< index name */
+	dict_table_t*	table;	/*!< back pointer to table */
+	/** root page number, or FIL_NULL if the index has been detached
+	from storage (DISCARD TABLESPACE or similar),
+	or 1 if the index is in table->freed_indexes */
+	unsigned	page:32;
+	unsigned	merge_threshold:6;
+				/*!< In the pessimistic delete, if the page
+				data size drops below this limit in percent,
+				merging it to a neighbor is tried */
+# define DICT_INDEX_MERGE_THRESHOLD_DEFAULT 50
+	unsigned	type:DICT_IT_BITS;
+				/*!< index type (DICT_CLUSTERED, DICT_UNIQUE,
+				DICT_IBUF, DICT_CORRUPT) */
+#define MAX_KEY_LENGTH_BITS 12
+	unsigned	trx_id_offset:MAX_KEY_LENGTH_BITS;
+				/*!< position of the trx id column
+				in a clustered index record, if the fields
+				before it are known to be of a fixed size,
+				0 otherwise */
+#if (1<<MAX_KEY_LENGTH_BITS) < HA_MAX_KEY_LENGTH
+# error (1<<MAX_KEY_LENGTH_BITS) < HA_MAX_KEY_LENGTH
+#endif
+	unsigned	n_user_defined_cols:10;
+				/*!< number of columns the user defined to
+				be in the index: in the internal
+				representation we add more columns */
+	unsigned	nulls_equal:1;
+				/*!< if true, SQL NULL == SQL NULL */
+	unsigned	n_uniq:10;/*!< number of fields from the beginning
+				which are enough to determine an index
+				entry uniquely */
+	unsigned	n_def:10;/*!< number of fields defined so far */
+	unsigned	n_fields:10;/*!< number of fields in the index */
+	unsigned	n_nullable:10;/*!< number of nullable fields */
+	unsigned	n_core_fields:10;/*!< number of fields in the index
+				(before the first time of instant add columns) */
+	/** number of bytes of null bits in ROW_FORMAT!=REDUNDANT node pointer
+	records; usually equal to UT_BITS_IN_BYTES(n_nullable), but
+	can be less in clustered indexes with instant ADD COLUMN */
+	unsigned	n_core_null_bytes:8;
+	/** magic value signalling that n_core_null_bytes was not
+	initialized yet */
+	static const unsigned NO_CORE_NULL_BYTES = 0xff;
+	/** The clustered index ID of the hard-coded SYS_INDEXES table. */
+	static const unsigned DICT_INDEXES_ID = 3;
+	unsigned	cached:1;/*!< TRUE if the index object is in the
+				dictionary cache */
+	unsigned	to_be_dropped:1;
+				/*!< TRUE if the index is to be dropped;
+				protected by dict_sys.latch */
+	unsigned	online_status:2;
+				/*!< enum online_index_status.
+				Transitions from ONLINE_INDEX_COMPLETE (to
+				ONLINE_INDEX_CREATION) are protected
+				by dict_sys.latch. Other changes are
+				protected by index->lock. */
+	unsigned	uncommitted:1;
+				/*!< a flag that is set for secondary indexes
+				that have not been committed to the
+				data dictionary yet. Protected by
+				MDL */
+
+#ifdef UNIV_DEBUG
+	/** whether this is a dummy index object */
+	bool		is_dummy;
+	/** whether btr_cur_instant_init() is in progress */
+	bool		in_instant_init;
+	uint32_t	magic_n;/*!< magic number */
+/** Value of dict_index_t::magic_n */
+# define DICT_INDEX_MAGIC_N	76789786
+#endif
+	dict_field_t*	fields;	/*!< array of field descriptions */
+	st_mysql_ftparser*
+			parser;	/*!< fulltext parser plugin */
+
+	/** Indicates whether virtual columns were newly added during
+	ALTER, and stores the columns in case of an ALTER failure.
+	It uses the heap of dict_index_t and should be freed
+	while removing the index from the table. */
+	dict_add_v_col_info* new_vcol_info;
+
+	/** During ALTER TABLE, columns that a being-added index depends on
+	and whose encoding or collation is being changed to something
+	that is compatible with the clustered index.
+	Allocated from dict_index_t::heap.
+
+	@see rollback_inplace_alter_table()
+	@see ha_innobase_inplace_ctx::col_collations */
+	col_info* change_col_info;
+
+	UT_LIST_NODE_T(dict_index_t)
+			indexes;/*!< list of indexes of the table */
+#ifdef BTR_CUR_ADAPT
+	btr_search_t*	search_info;
+			/*!< info used in optimistic searches */
+#endif /* BTR_CUR_ADAPT */
+	row_log_t*	online_log;
+			/*!< the log of modifications
+			during online index creation;
+			valid when online_status is
+			ONLINE_INDEX_CREATION */
+	/*----------------------*/
+	/** Statistics for query optimization */
+	/* @{ */
+	ib_uint64_t*	stat_n_diff_key_vals;
+			/*!< approximate number of different
+			key values for this index, for each
+			n-column prefix where 1 <= n <=
+			dict_get_n_unique(index) (the array is
+			indexed from 0 to n_uniq-1); we
+			periodically calculate new
+			estimates */
+	ib_uint64_t*	stat_n_sample_sizes;
+			/*!< number of pages that were sampled
+			to calculate each of stat_n_diff_key_vals[],
+			e.g. stat_n_sample_sizes[3] pages were sampled
+			to get the number stat_n_diff_key_vals[3]. */
+	ib_uint64_t*	stat_n_non_null_key_vals;
+			/* approximate number of non-null key values
+			for this index, for each column where
+			1 <= n <= dict_get_n_unique(index) (the array
+			is indexed from 0 to n_uniq-1); This
+			is used when innodb_stats_method is
+			"nulls_ignored". */
+	ulint		stat_index_size;
+			/*!< approximate index size in
+			database pages */
+	ulint		stat_n_leaf_pages;
+			/*!< approximate number of leaf pages in the
+			index tree */
+	bool		stats_error_printed;
+			/*!< has a persistent statistics error been
+			printed for this index? */
+	/* @} */
+	/** Statistics for defragmentation; these numbers are estimates and
+	could be very inaccurate at certain times, e.g. right after restart,
+	during defragmentation, etc. */
+	/* @{ */
+	ulint		stat_defrag_modified_counter;
+	ulint		stat_defrag_n_pages_freed;
+			/* number of pages freed by defragmentation. */
+	ulint		stat_defrag_n_page_split;
+			/* number of page splits since last full index
+			defragmentation. */
+	ulint		stat_defrag_data_size_sample[STAT_DEFRAG_DATA_SIZE_N_SAMPLE];
+			/* data size when compression failure happened
+			the most recent 10 times. */
+	ulint		stat_defrag_sample_next_slot;
+			/* in which slot the next sample should be
+			saved. */
+	/* @} */
+private:
+	/** R-tree split sequence number */
+	Atomic_relaxed<node_seq_t> rtr_ssn;
+public:
+	void set_ssn(node_seq_t ssn) { rtr_ssn= ssn; }
+	node_seq_t assign_ssn() { return rtr_ssn.fetch_add(1) + 1; }
+	node_seq_t ssn() const { return rtr_ssn; }
+
+	rtr_info_track_t*
+			rtr_track;/*!< tracking all R-Tree search cursors */
+	trx_id_t	trx_id; /*!< id of the transaction that created this
+				index, or 0 if the index existed
+				when InnoDB was started up */
+	zip_pad_info_t	zip_pad;/*!< Information about state of
+				compression failures and successes */
+	/** lock protecting the non-leaf index pages */
+	mutable index_lock lock;
+
+	/** Determine if the index has been committed to the
+	data dictionary.
+	@return whether the index definition has been committed */
+	bool is_committed() const
+	{
+		ut_ad(!uncommitted || !(type & DICT_CLUSTERED));
+		return(UNIV_LIKELY(!uncommitted));
+	}
+
+	/** Flag an index committed or uncommitted.
+	@param[in]	committed	whether the index is committed */
+	void set_committed(bool committed)
+	{
+		ut_ad(!to_be_dropped);
+		ut_ad(committed || !(type & DICT_CLUSTERED));
+		ut_ad(!committed || !change_col_info);
+		uncommitted = !committed;
+	}
+
+	/** Notify that the index pages are going to be modified.
+	@param[in,out]	mtr	mini-transaction */
+	inline void set_modified(mtr_t& mtr) const;
+
+	/** @return whether this index is readable
+	@retval true	normally
+	@retval false	if this is a single-table tablespace
+			and the .ibd file is missing, or a
+			page cannot be read or decrypted */
+	inline bool is_readable() const;
+
+	/** @return whether instant ALTER TABLE is in effect */
+	inline bool is_instant() const;
+
+	/** @return whether the index is the primary key index
+	(not the clustered index of the change buffer) */
+	bool is_primary() const
+	{
+		return DICT_CLUSTERED == (type & (DICT_CLUSTERED | DICT_IBUF));
+	}
+
+	/** @return whether this is a generated clustered index */
+	bool is_gen_clust() const { return type == DICT_CLUSTERED; }
+
+	/** @return whether this is a clustered index */
+	bool is_clust() const { return type & DICT_CLUSTERED; }
+
+	/** @return whether this is a unique index */
+	bool is_unique() const { return type & DICT_UNIQUE; }
+
+	/** @return whether this is a spatial index */
+	bool is_spatial() const { return UNIV_UNLIKELY(type & DICT_SPATIAL); }
+
+	/** @return whether this is the change buffer */
+	bool is_ibuf() const { return UNIV_UNLIKELY(type & DICT_IBUF); }
+
+	/** @return whether this index requires locking */
+	bool has_locking() const { return !is_ibuf(); }
+
+	/** @return whether this is a normal B-tree index
+	(not the change buffer, not SPATIAL or FULLTEXT) */
+	bool is_btree() const {
+		return UNIV_LIKELY(!(type & (DICT_IBUF | DICT_SPATIAL
+					     | DICT_FTS | DICT_CORRUPT)));
+	}
+
+	/** @return whether the index includes virtual columns */
+	bool has_virtual() const { return type & DICT_VIRTUAL; }
+
+	/** @return the position of DB_TRX_ID */
+	uint16_t db_trx_id() const {
+		DBUG_ASSERT(is_primary());
+		DBUG_ASSERT(n_uniq);
+		DBUG_ASSERT(n_uniq <= MAX_REF_PARTS);
+		return n_uniq;
+	}
+	/** @return the position of DB_ROLL_PTR */
+	uint16_t db_roll_ptr() const
+	{
+		return static_cast<uint16_t>(db_trx_id() + 1);
+	}
+
+	/** @return the offset of the metadata BLOB field,
+	or the first user field after the PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR */
+	uint16_t first_user_field() const
+	{
+		return static_cast<uint16_t>(db_trx_id() + 2);
+	}
+
+	/** @return whether the index is corrupted */
+	inline bool is_corrupted() const;
+
+	/** Detach the virtual columns from the index that is to be removed. */
+	void detach_columns()
+	{
+		if (!has_virtual() || !cached)
+			return;
+		for (unsigned i= 0; i < n_fields; i++)
+		{
+			dict_col_t* col= fields[i].col;
+			if (!col || !col->is_virtual())
+				continue;
+			col->detach(*this);
+		}
+	}
+
+	/** Determine how many fields of a given prefix can be set NULL.
+	@param[in]	n_prefix	number of fields in the prefix
+	@return number of fields 0..n_prefix-1 that can be set NULL */
+	unsigned get_n_nullable(ulint n_prefix) const
+	{
+		DBUG_ASSERT(n_prefix > 0);
+		DBUG_ASSERT(n_prefix <= n_fields);
+		unsigned n = n_nullable;
+		for (; n_prefix < n_fields; n_prefix++) {
+			const dict_col_t* col = fields[n_prefix].col;
+			DBUG_ASSERT(!col->is_virtual());
+			n -= col->is_nullable();
+		}
+		DBUG_ASSERT(n < n_def);
+		return n;
+	}
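An editorial illustration of the record layout implied by the three position accessors above: for a clustered index on PRIMARY KEY(a, b), n_uniq is 2, so the fields are laid out as a, b, DB_TRX_ID, DB_ROLL_PTR, and then the remaining user columns (or the metadata BLOB).

    /* assuming a clustered index on PRIMARY KEY(a, b) */
    ut_ad(index->n_uniq == 2);
    ut_ad(index->db_trx_id() == 2);        /* DB_TRX_ID follows the key */
    ut_ad(index->db_roll_ptr() == 3);      /* then DB_ROLL_PTR */
    ut_ad(index->first_user_field() == 4); /* then the first non-key field */

+
+	/** Get the default value of an instantly-added clustered index field.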
+	@param[in]	n	instantly added field position
+	@param[out]	len	value length (in bytes), or UNIV_SQL_NULL
+	@return default value
+	@retval NULL	if the default value is SQL NULL (len=UNIV_SQL_NULL) */
+	const byte* instant_field_value(ulint n, ulint* len) const
+	{
+		DBUG_ASSERT(is_instant() || id == DICT_INDEXES_ID);
+		DBUG_ASSERT(n + (id == DICT_INDEXES_ID) >= n_core_fields);
+		DBUG_ASSERT(n < n_fields);
+		return fields[n].col->instant_value(len);
+	}
+
+	/** Adjust index metadata for instant ADD/DROP/reorder COLUMN.
+	@param[in]	instant	clustered index definition after
+				instant ALTER TABLE */
+	inline void instant_add_field(const dict_index_t& instant);
+	/** Remove instant ADD COLUMN metadata. */
+	inline void clear_instant_add();
+	/** Remove instant ALTER TABLE metadata. */
+	inline void clear_instant_alter();
+
+	/** Construct the metadata record for instant ALTER TABLE.
+	@param[in]	row	dummy or default values for existing columns
+	@param[in,out]	heap	memory heap for allocations
+	@return metadata record */
+	inline dtuple_t*
+	instant_metadata(const dtuple_t& row, mem_heap_t* heap) const;
+
+	/** Check if record in clustered index is historical row.
+	@param[in]	rec	clustered row
+	@param[in]	offsets	offsets
+	@return true if row is historical */
+	bool
+	vers_history_row(const rec_t* rec, const rec_offs* offsets);
+
+	/** Check if record in secondary index is historical row.
+	@param[in]	rec	record in a secondary index
+	@param[out]	history_row	true if row is historical
+	@return true on error */
+	bool
+	vers_history_row(const rec_t* rec, bool &history_row);
+
+	/** Assign the number of new columns to be added as part
+	of the index
+	@param	n_vcol	number of virtual columns to be added */
+	void assign_new_v_col(ulint n_vcol)
+	{
+		new_vcol_info= static_cast<dict_add_v_col_info*>
+			(mem_heap_zalloc(heap, sizeof *new_vcol_info));
+		new_vcol_info->n_v_col= n_vcol;
+	}
+
+	/* @return whether the index has a newly added virtual column */
+	bool has_new_v_col() const { return new_vcol_info; }
+
+	/* @return number of newly added virtual columns */
+	ulint get_new_n_vcol() const
+	{ return new_vcol_info ? new_vcol_info->n_v_col : 0; }
+
+	/** Assign the number of collation change fields as a part of the index
+	@param	n_cols	number of columns whose collation is changing */
+	void init_change_cols(unsigned n_cols)
+	{
+		ut_ad(n_fields > n_cols || type & DICT_FTS);
+		change_col_info= static_cast<col_info*>
+			(mem_heap_zalloc(heap, sizeof(col_info)));
+		change_col_info->n_cols= n_cols;
+	}
+
+	/** Reconstruct the clustered index fields.
+	@return whether metadata is incorrect */
+	inline bool reconstruct_fields();
+
+	/** Check if the index contains a column or a prefix of that column.
+	@param[in]	n		column number
+	@param[in]	is_virtual	whether it is a virtual col
+	@return whether the index contains the column or its prefix */
+	bool contains_col_or_prefix(ulint n, bool is_virtual) const
+	MY_ATTRIBUTE((warn_unused_result));
+
+#ifdef BTR_CUR_HASH_ADAPT
+	/** @return a clone of this */
+	dict_index_t* clone() const;
+	/** Clone this index for lazy dropping of the adaptive hash index.
+	@return this or a clone */
+	dict_index_t* clone_if_needed();
+	/** @return number of leaf pages pointed to by the adaptive hash index */
+	inline ulint n_ahi_pages() const;
+	/** @return whether mark_freed() had been invoked */
+	bool freed() const { return UNIV_UNLIKELY(page == 1); }
+	/** Note that the index is waiting for btr_search_lazy_free() */
+	void set_freed() { ut_ad(!freed()); page= 1; }
+#endif /* BTR_CUR_HASH_ADAPT */
+
+	/** @return whether it is forbidden to invoke clear_instant_add() */
+	bool must_avoid_clear_instant_add() const
+	{
+		if (is_instant())
+			for (auto i= this; (i= UT_LIST_GET_NEXT(indexes, i)) != nullptr; )
+				if (i->to_be_dropped /* || i->online_log*/)
+					return true;
+		return false;
+	}
+
+	/** This ad-hoc class is used by record_size_info only. */
+	class record_size_info_t {
+	public:
+		record_size_info_t()
+			: max_leaf_size(0), shortest_size(0), too_big(false),
+			  first_overrun_field_index(SIZE_T_MAX), overrun_size(0)
+		{
+		}
+
+		/** Mark row potentially too big for page and set up first
+		overflow field index. */
+		void set_too_big(size_t field_index)
+		{
+			ut_ad(field_index != SIZE_T_MAX);
+
+			too_big = true;
+			if (first_overrun_field_index > field_index) {
+				first_overrun_field_index = field_index;
+				overrun_size = shortest_size;
+			}
+		}
+
+		/** @return overrun field index or SIZE_T_MAX if nothing
+		overflowed */
+		size_t get_first_overrun_field_index() const
+		{
+			ut_ad(row_is_too_big());
+			ut_ad(first_overrun_field_index != SIZE_T_MAX);
+			return first_overrun_field_index;
+		}
+
+		size_t get_overrun_size() const
+		{
+			ut_ad(row_is_too_big());
+			return overrun_size;
+		}
+
+		bool row_is_too_big() const { return too_big; }
+
+		size_t max_leaf_size;	/** biggest row size this index can
+					produce */
+		size_t shortest_size;	/** shortest size, because it counts
+					everything as stored in overflow
+					pages */
+
+	private:
+		bool too_big;	/** true when the maximum row size this
+				index can produce is bigger than the
+				maximum row size the given page can hold. */
+		size_t first_overrun_field_index;	/** index of the field
+					whose addition made the row overflow
+					the maximum allowed size. Useful for
+					reporting back to the user. */
+		size_t overrun_size;	/** row size at the point of
+					overrun */
+	};
+
+	/** Returns the maximum possible record size for this index (the size
+	of the longest possible row), the shortest size (counting everything
+	as stored in overflow pages), and the index of the first field that
+	made the index record too big to fit on a page. */
+	inline record_size_info_t record_size_info() const;
+
+	/** Clear the index tree and reinitialize the root page, in the
+	rollback of TRX_UNDO_EMPTY. The BTR_SEG_LEAF is freed and reinitialized.
+	@param thr	query thread
+	@return error code */
+	dberr_t clear(que_thr_t *thr);
+
+	/** Check whether the online log is a dummy value, used to indicate
+	that the table is undergoing active DDL.
+	@retval true if the online log is the dummy value */
+	bool online_log_is_dummy() const
+	{
+		return online_log == reinterpret_cast<const row_log_t*>(this);
+	}
+
+	/** Assign the clustered index online log to the dummy value */
+	void online_log_make_dummy()
+	{
+		online_log= reinterpret_cast<row_log_t*>(this);
+	}
+};
+
+/** Detach a virtual column from an index.
+@param index	being-freed index */
+inline void dict_col_t::detach(const dict_index_t &index)
+{
+	if (is_virtual())
+		reinterpret_cast<dict_v_col_t*>(this)->detach(index);
+}
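A hedged editorial sketch of building an index definition with the helpers declared in this file (dict_mem_index_create() above and dict_mem_index_add_field() below); the table pointer and the column names are assumed to exist, and error handling is omitted:

    dict_index_t* index = dict_mem_index_create(table, "idx_name",
                                                DICT_UNIQUE, 2);
    dict_mem_index_add_field(index, "col1", 0);        /* whole column, ASC */
    dict_mem_index_add_field(index, "col2", 25, true); /* 25-byte prefix, DESC */

+/** Add a field definition to an index.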
+@param index		index
+@param name		pointer to column name
+@param prefix_len	column prefix length, or 0
+@param descending	whether to use descending order */
+inline void dict_mem_index_add_field(dict_index_t *index, const char *name,
+                                     ulint prefix_len, bool descending= false)
+{
+  ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+  dict_field_t &field= index->fields[index->n_def++];
+  field.name= name;
+  field.prefix_len= prefix_len & ((1U << 12) - 1);
+  field.descending= descending;
+}
+
+/** The status of online index creation */
+enum online_index_status {
+	/** the index is complete and ready for access */
+	ONLINE_INDEX_COMPLETE = 0,
+	/** the index is being created, online
+	(allowing concurrent modifications) */
+	ONLINE_INDEX_CREATION,
+	/** secondary index creation was aborted and the index
+	should be dropped as soon as index->table->n_ref_count reaches 0,
+	or online table rebuild was aborted and the clustered index
+	of the original table should soon be restored to
+	ONLINE_INDEX_COMPLETE */
+	ONLINE_INDEX_ABORTED,
+	/** the online index creation was aborted, the index was
+	dropped from the data dictionary and the tablespace, and it
+	should be dropped from the data dictionary cache as soon as
+	index->table->n_ref_count reaches 0. */
+	ONLINE_INDEX_ABORTED_DROPPED
+};
+
+/** Set to store the virtual columns which are affected by Foreign
+key constraint. */
+typedef std::set<dict_v_col_t*, std::less<dict_v_col_t*>,
+		 ut_allocator<dict_v_col_t*> >	dict_vcol_set;
+
+/** Data structure for a foreign key constraint; an example:
+FOREIGN KEY (A, B) REFERENCES TABLE2 (C, D). Most fields will be
+initialized to 0, NULL or FALSE in dict_mem_foreign_create(). */
+struct dict_foreign_t{
+	mem_heap_t*	heap;		/*!< this object is allocated from
+					this memory heap */
+	char*		id;		/*!< id of the constraint as a
+					null-terminated string */
+	unsigned	n_fields:10;	/*!< number of indexes' first fields
+					for which the foreign key
+					constraint is defined: we allow the
+					indexes to contain more fields than
+					mentioned in the constraint, as long
+					as the first fields are as mentioned */
+	unsigned	type:6;		/*!< 0 or DICT_FOREIGN_ON_DELETE_CASCADE
+					or DICT_FOREIGN_ON_DELETE_SET_NULL */
+	char*		foreign_table_name;/*!< foreign table name */
+	char*		foreign_table_name_lookup;
+				/*!< foreign table name used for dict lookup */
+	dict_table_t*	foreign_table;	/*!< table where the foreign key is */
+	const char**	foreign_col_names;/*!< names of the columns in the
+					foreign key */
+	char*		referenced_table_name;/*!< referenced table name */
+	char*		referenced_table_name_lookup;
+				/*!< referenced table name for dict lookup*/
+	dict_table_t*	referenced_table;/*!< table where the referenced key
+					is */
+	const char**	referenced_col_names;/*!< names of the referenced
+					columns in the referenced table */
+	dict_index_t*	foreign_index;	/*!< foreign index; we require that
+					both tables contain explicitly defined
+					indexes for the constraint: InnoDB
+					does not generate new indexes
+					implicitly */
+	dict_index_t*	referenced_index;/*!< referenced index */
+
+	dict_vcol_set*	v_cols;		/*!< set of virtual columns affected
+					by foreign key constraint.
+					*/
+
+	/** Check whether the fulltext index gets affected by
+	foreign key constraint */
+	bool affects_fulltext() const;
+};
+
+std::ostream&
+operator<< (std::ostream& out, const dict_foreign_t& foreign);
+
+struct dict_foreign_print {
+
+	dict_foreign_print(std::ostream& out)
+		: m_out(out)
+	{}
+
+	void operator()(const dict_foreign_t* foreign) {
+		m_out << *foreign;
+	}
+private:
+	std::ostream&	m_out;
+};
+
+/** Compare two dict_foreign_t objects using their ids. Used in the ordering
+of dict_table_t::foreign_set and dict_table_t::referenced_set. It returns
+true if the first argument is considered to go before the second in the
+strict weak ordering it defines, and false otherwise. */
+struct dict_foreign_compare {
+
+	bool operator()(
+		const dict_foreign_t*	lhs,
+		const dict_foreign_t*	rhs) const
+	{
+		return strcmp(lhs->id, rhs->id) < 0;
+	}
+};
+
+/** A function object to find a foreign key with the given index as the
+referenced index. Return the foreign key with matching criteria or NULL */
+struct dict_foreign_with_index {
+
+	dict_foreign_with_index(const dict_index_t* index)
+		: m_index(index)
+	{}
+
+	bool operator()(const dict_foreign_t* foreign) const
+	{
+		return(foreign->referenced_index == m_index);
+	}
+
+	const dict_index_t*	m_index;
+};
+
+/* A function object to check if the foreign constraint is between different
+tables. Returns true if foreign key constraint is between different tables,
+false otherwise. */
+struct dict_foreign_different_tables {
+
+	bool operator()(const dict_foreign_t* foreign) const
+	{
+		return(foreign->foreign_table != foreign->referenced_table);
+	}
+};
+
+/** A function object to check if the foreign key constraint has the same
+name as given. If the full name of the foreign key constraint doesn't match,
+then, check if removing the database name from the foreign key constraint
+matches. Return true if it matches, false otherwise. */
+struct dict_foreign_matches_id {
+
+	dict_foreign_matches_id(const char* id)
+		: m_id(id)
+	{}
+
+	bool operator()(const dict_foreign_t* foreign) const
+	{
+		if (0 == innobase_strcasecmp(foreign->id, m_id)) {
+			return(true);
+		}
+		if (const char* pos = strchr(foreign->id, '/')) {
+			if (0 == innobase_strcasecmp(m_id, pos + 1)) {
+				return(true);
+			}
+		}
+		return(false);
+	}
+
+	const char* m_id;
+};
+
+typedef std::set<
+	dict_foreign_t*,
+	dict_foreign_compare,
+	ut_allocator<dict_foreign_t*> >	dict_foreign_set;
+
+std::ostream&
+operator<< (std::ostream& out, const dict_foreign_set& fk_set);
+
+/** Function object to check if a foreign key object is there
+in the given foreign key set or not. It returns true if the
+foreign key is not found, false otherwise */
+struct dict_foreign_not_exists {
+	dict_foreign_not_exists(const dict_foreign_set& obj_)
+		: m_foreigns(obj_)
+	{}
+
+	/* Return true if the given foreign key is not found */
+	bool operator()(dict_foreign_t* const & foreign) const {
+		return(m_foreigns.find(foreign) == m_foreigns.end());
+	}
+private:
+	const dict_foreign_set&	m_foreigns;
+};
+
+/** Validate the search order in the foreign key set.
+@param[in]	fk_set	the foreign key set to be validated
+@return true if search order is fine in the set, false otherwise. */
+bool
+dict_foreign_set_validate(
+	const dict_foreign_set&	fk_set);
+
+/** Validate the search order in the foreign key sets of the table
+(foreign_set and referenced_set).
+@param[in]	table	table whose foreign key sets are to be validated
+@return true if foreign key sets are fine, false otherwise.
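+An illustrative (hypothetical) debug-build check, assuming a loaded
+dict_table_t& table:
+@code
+ut_ad(dict_foreign_set_validate(table.foreign_set));  // a single set
+ut_ad(dict_foreign_set_validate(table));              // both sets
+@endcode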
+*/
+bool
+dict_foreign_set_validate(
+	const dict_table_t&	table);
+
+/*********************************************************************//**
+Frees a foreign key struct. */
+inline
+void
+dict_foreign_free(
+/*==============*/
+	dict_foreign_t*	foreign)	/*!< in, own: foreign key struct */
+{
+	if (foreign->v_cols != NULL) {
+		UT_DELETE(foreign->v_cols);
+	}
+
+	mem_heap_free(foreign->heap);
+}
+
+/** The destructor will free all the foreign key constraints in the set
+by calling dict_foreign_free() on each of the foreign key constraints.
+This is used to free the allocated memory when a local set goes out
+of scope. */
+struct dict_foreign_set_free {
+
+	dict_foreign_set_free(const dict_foreign_set&	foreign_set)
+		: m_foreign_set(foreign_set)
+	{}
+
+	~dict_foreign_set_free()
+	{
+		std::for_each(m_foreign_set.begin(),
+			      m_foreign_set.end(),
+			      dict_foreign_free);
+	}
+
+	const dict_foreign_set&	m_foreign_set;
+};
+
+/** The flags for ON_UPDATE and ON_DELETE can be ORed; the default is that
+a foreign key constraint is enforced, therefore RESTRICT just means no flag */
+/* @{ */
+#define DICT_FOREIGN_ON_DELETE_CASCADE	1U	/*!< ON DELETE CASCADE */
+#define DICT_FOREIGN_ON_DELETE_SET_NULL	2U	/*!< ON DELETE SET NULL */
+#define DICT_FOREIGN_ON_UPDATE_CASCADE	4U	/*!< ON UPDATE CASCADE */
+#define DICT_FOREIGN_ON_UPDATE_SET_NULL	8U	/*!< ON UPDATE SET NULL */
+#define DICT_FOREIGN_ON_DELETE_NO_ACTION 16U	/*!< ON DELETE NO ACTION */
+#define DICT_FOREIGN_ON_UPDATE_NO_ACTION 32U	/*!< ON UPDATE NO ACTION */
+/* @} */
+
+/** Display an identifier.
+@param[in,out]	s	output stream
+@param[in]	id_name	SQL identifier (other than table name)
+@return the output stream */
+std::ostream&
+operator<<(
+	std::ostream&		s,
+	const id_name_t&	id_name);
+
+/** Display a table name.
+@param[in,out]	s		output stream
+@param[in]	table_name	table name
+@return the output stream */
+std::ostream&
+operator<<(
+	std::ostream&		s,
+	const table_name_t&	table_name);
+
+/** List of locks that different transactions have acquired on a table. This
+list has a list node that is embedded in a nested union/structure. We have to
+generate a specific template for it.
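+An illustrative (hypothetical) traversal of such a list, mirroring
+dict_table_t::has_lock_other_than() further below; n_table_locks is a
+hypothetical counter:
+@code
+for (lock_t *lock= UT_LIST_GET_FIRST(table->locks); lock;
+     lock= UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock))
+	n_table_locks++;  // hypothetical counter
+@endcode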
+*/
+
+typedef ut_list_base<lock_t, ut_list_node<lock_t> lock_table_t::*>
+	table_lock_list_t;
+
+/** mysql template structure defined in row0mysql.cc */
+struct mysql_row_templ_t;
+
+/** Structure defines template related to virtual columns and
+their base columns */
+struct dict_vcol_templ_t {
+	/** number of regular columns */
+	ulint			n_col;
+
+	/** number of virtual columns */
+	ulint			n_v_col;
+
+	/** array of templates for virtual col and their base columns */
+	mysql_row_templ_t**	vtempl;
+
+	/** table's database name */
+	std::string		db_name;
+
+	/** table name */
+	std::string		tb_name;
+
+	/** MySQL record length */
+	ulint			rec_len;
+
+	/** default column value if any */
+	byte*			default_rec;
+
+	/** cached MySQL TABLE object */
+	TABLE*			mysql_table;
+
+	/** when mysql_table was cached */
+	uint64_t		mysql_table_query_id;
+
+	dict_vcol_templ_t() : vtempl(0), mysql_table_query_id(~0ULL) {}
+};
+
+/** Metadata on clustered index fields starting from first_user_field() */
+class field_map_element_t
+{
+	/** Number of bits for representing a column number */
+	static constexpr uint16_t IND_BITS = 10;
+
+	/** Set if the column of the field has been instantly dropped */
+	static constexpr uint16_t DROPPED = 1U << (IND_BITS + 5);
+
+	/** Set if the column was dropped and originally declared NOT NULL */
+	static constexpr uint16_t NOT_NULL = 1U << (IND_BITS + 4);
+
+	/** Column index (if !(data & DROPPED)): table->cols[data & IND],
+	or field length (if (data & DROPPED)):
+	(data & IND) = 0 if variable-length with max_len < 256 bytes;
+	(data & IND) = 1 if variable-length with max_len > 255 bytes;
+	(data & IND) = 1 + L otherwise, with L=fixed length of the column */
+	static constexpr uint16_t IND = (1U << IND_BITS) - 1;
+
+	/** Field metadata */
+	uint16_t data;
+
+	void clear_not_null() { data &= uint16_t(~NOT_NULL); }
+public:
+	bool is_dropped() const { return data & DROPPED; }
+	void set_dropped() { data |= DROPPED; }
+	bool is_not_null() const { return data & NOT_NULL; }
+	void set_not_null() { ut_ad(is_dropped()); data |= NOT_NULL; }
+	uint16_t ind() const { return data & IND; }
+	void set_ind(uint16_t i)
+	{
+		DBUG_ASSERT(i <= IND);
+		DBUG_ASSERT(!ind());
+		data |= i;
+	}
+	field_map_element_t& operator= (uint16_t value)
+	{
+		data = value;
+		return *this;
+	}
+	operator uint16_t() { return data; }
+};
+
+static_assert(sizeof(field_map_element_t) == 2,
+	      "Size mismatch for a persistent data item!");
+
+/** Instantly dropped or reordered columns */
+struct dict_instant_t
+{
+	/** Number of dropped columns */
+	unsigned n_dropped;
+	/** Dropped columns */
+	dict_col_t* dropped;
+	/** Map of clustered index non-PK fields[i - first_user_field()]
+	to table columns */
+	field_map_element_t* field_map;
+};
+
+/** These are used when MySQL FRM and InnoDB data dictionary are
+in inconsistent state. */
+typedef enum {
+	DICT_FRM_CONSISTENT = 0,	/*!< Consistent state */
+	DICT_FRM_NO_PK = 1,		/*!< MySQL has no primary key
+					but InnoDB dictionary has
+					non-generated one. */
+	DICT_NO_PK_FRM_HAS = 2,		/*!< MySQL has primary key but
+					InnoDB dictionary has not. */
+	DICT_FRM_INCONSISTENT_KEYS = 3	/*!< Key count mismatch */
+} dict_frm_t;
+
+/** Data structure for a database table. Most fields will be
+zero-initialized in dict_table_t::create(). */
+struct dict_table_t {
+
+	/** Get reference count.
+	@return current value of n_ref_count */
+	inline uint32_t get_ref_count() const { return n_ref_count; }
+
+	/** Acquire the table handle. */
+	inline void acquire();
+
+	/** Release the table handle.
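+	An illustrative (hypothetical) pin/unpin sequence:
+	@code
+	table->acquire();        // pin the table in the dictionary cache
+	if (table->release()) {  // true when the last handle went away
+		// the table may now be evicted or dropped
+	}
+	@endcode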
+ @return whether the last handle was released */ + inline bool release(); + + /** @return whether the table supports transactions */ + bool no_rollback() const + { + return !(~unsigned(flags) & DICT_TF_MASK_NO_ROLLBACK); + } + /** @return whether this is a temporary table */ + bool is_temporary() const + { + return flags2 & DICT_TF2_TEMPORARY; + } + + /** @return whether the table is not in ROW_FORMAT=REDUNDANT */ + bool not_redundant() const { return flags & DICT_TF_COMPACT; } + + /** @return whether this table is readable + @retval true normally + @retval false if this is a single-table tablespace + and the .ibd file is missing, or a + page cannot be read or decrypted */ + bool is_readable() const + { + ut_ad(file_unreadable || space); + return(UNIV_LIKELY(!file_unreadable)); + } + + /** @return whether the table is accessible */ + bool is_accessible() const + { + return UNIV_LIKELY(is_readable() && !corrupted && space) + && !space->is_stopping(); + } + + /** Check if a table name contains the string "/#sql" + which denotes temporary or intermediate tables in MariaDB. */ + static bool is_temporary_name(const char* name) + { + return strstr(name, "/#sql"); + } + + /** @return whether instant ALTER TABLE is in effect */ + bool is_instant() const + { + return(UT_LIST_GET_FIRST(indexes)->is_instant()); + } + + /** @return whether the table supports instant ALTER TABLE */ + bool supports_instant() const + { + return(!(flags & DICT_TF_MASK_ZIP_SSIZE)); + } + + /** @return the number of instantly dropped columns */ + unsigned n_dropped() const { return instant ? instant->n_dropped : 0; } + + /** Look up an old column. + @param[in] cols the old columns of the table + @param[in] col_map map from old table columns to altered ones + @param[in] n_cols number of old columns + @param[in] i the number of the new column + @return old column + @retval NULL if column i was added to the table */ + static const dict_col_t* find(const dict_col_t* cols, + const ulint* col_map, ulint n_cols, + ulint i) + { + for (ulint o = n_cols; o--; ) { + if (col_map[o] == i) { + return &cols[o]; + } + } + return NULL; + } + + /** Serialise metadata of dropped or reordered columns. + @param[in,out] heap memory heap for allocation + @param[out] field data field with the metadata */ + inline void serialise_columns(mem_heap_t* heap, dfield_t* field) const; + + /** Reconstruct dropped or reordered columns. + @param[in] metadata data from serialise_columns() + @param[in] len length of the metadata, in bytes + @return whether parsing the metadata failed */ + bool deserialise_columns(const byte* metadata, ulint len); + + /** Set is_instant() before instant_column(). + @param[in] old previous table definition + @param[in] col_map map from old.cols[] + and old.v_cols[] to this + @param[out] first_alter_pos 0, or + 1 + first changed column position */ + inline void prepare_instant(const dict_table_t& old, + const ulint* col_map, + unsigned& first_alter_pos); + + /** Adjust table metadata for instant ADD/DROP/reorder COLUMN. + @param[in] table table on which prepare_instant() was invoked + @param[in] col_map mapping from cols[] and v_cols[] to table + @return whether the metadata record must be updated */ + inline bool instant_column(const dict_table_t& table, + const ulint* col_map); + + /** Roll back instant_column(). 
+	@param[in]	old_n_cols		original n_cols
+	@param[in]	old_cols		original cols
+	@param[in]	old_col_names		original col_names
+	@param[in]	old_instant		original instant structure
+	@param[in]	old_fields		original fields
+	@param[in]	old_n_fields		original number of fields
+	@param[in]	old_n_core_fields	original number of core fields
+	@param[in]	old_n_v_cols		original n_v_cols
+	@param[in]	old_v_cols		original v_cols
+	@param[in]	old_v_col_names		original v_col_names
+	@param[in]	col_map			column map */
+	inline void rollback_instant(
+		unsigned	old_n_cols,
+		dict_col_t*	old_cols,
+		const char*	old_col_names,
+		dict_instant_t*	old_instant,
+		dict_field_t*	old_fields,
+		unsigned	old_n_fields,
+		unsigned	old_n_core_fields,
+		unsigned	old_n_v_cols,
+		dict_v_col_t*	old_v_cols,
+		const char*	old_v_col_names,
+		const ulint*	col_map);
+
+	/** Add the table definition to the data dictionary cache */
+	void add_to_cache();
+
+	/** @return whether the table is versioned.
+	It is assumed that both vers_start and vers_end set to 0
+	iff table is not versioned. In any other case,
+	these fields correspond to actual positions in cols[]. */
+	bool versioned() const { return vers_start || vers_end; }
+	bool versioned_by_id() const
+	{
+		return versioned() && cols[vers_start].mtype == DATA_INT;
+	}
+
+	/** For overflow fields returns potential max length stored inline */
+	inline size_t get_overflow_field_local_len() const;
+
+	/** Parse the table file name into table name and database name.
+	@tparam		dict_frozen	whether the caller holds dict_sys.latch
+	@param[in,out]	db_name		database name buffer
+	@param[in,out]	tbl_name	table name buffer
+	@param[out]	db_name_len	database name length
+	@param[out]	tbl_name_len	table name length
+	@return whether the table name is visible to SQL */
+	template<bool dict_frozen= false>
+	bool parse_name(char (&db_name)[NAME_LEN + 1],
+			char (&tbl_name)[NAME_LEN + 1],
+			size_t *db_name_len, size_t *tbl_name_len) const;
+
+	/** Clear the table when rolling back TRX_UNDO_EMPTY
+	@return error code */
+	dberr_t clear(que_thr_t *thr);
+
+#ifdef UNIV_DEBUG
+	/** @return whether the current thread holds the lock_mutex */
+	bool lock_mutex_is_owner() const
+	{ return lock_mutex_owner == pthread_self(); }
+	/** @return whether the current thread holds the stats_mutex (lock_mutex) */
+	bool stats_mutex_is_owner() const
+	{ return lock_mutex_owner == pthread_self(); }
+#endif /* UNIV_DEBUG */
+	void lock_mutex_init() { lock_mutex.init(); }
+	void lock_mutex_destroy() { lock_mutex.destroy(); }
+	/** Acquire lock_mutex */
+	void lock_mutex_lock()
+	{
+		ut_ad(!lock_mutex_is_owner());
+		lock_mutex.wr_lock();
+		ut_ad(!lock_mutex_owner.exchange(pthread_self()));
+	}
+	/** Try to acquire lock_mutex */
+	bool lock_mutex_trylock()
+	{
+		ut_ad(!lock_mutex_is_owner());
+		bool acquired= lock_mutex.wr_lock_try();
+		ut_ad(!acquired
+		      || !lock_mutex_owner.exchange(pthread_self()));
+		return acquired;
+	}
+	/** Release lock_mutex */
+	void lock_mutex_unlock()
+	{
+		ut_ad(lock_mutex_owner.exchange(0) == pthread_self());
+		lock_mutex.wr_unlock();
+	}
+#ifndef SUX_LOCK_GENERIC
+	/** @return whether the lock mutex is held by some thread */
+	bool lock_mutex_is_locked() const noexcept
+	{ return lock_mutex.is_locked(); }
+#endif
+
+	/* stats mutex lock currently defaults to lock_mutex but in the future,
+	there could be a use-case to have separate mutex for stats.
+	extra indirection (through inline so no performance hit) should
+	help simplify code and increase long-term maintainability */
+	void stats_mutex_init() { lock_mutex_init(); }
+	void stats_mutex_destroy() { lock_mutex_destroy(); }
+	void stats_mutex_lock() { lock_mutex_lock(); }
+	void stats_mutex_unlock() { lock_mutex_unlock(); }
+
+	/** Rename the data file.
+	@param new_name	name of the table
+	@param replace	whether to replace the file with the new name
+			(as part of rolling back TRUNCATE) */
+	dberr_t rename_tablespace(span<const char> new_name,
+				  bool replace) const;
+
+private:
+	/** Initialize instant->field_map.
+	@param[in]	table	table definition to copy from */
+	inline void init_instant(const dict_table_t& table);
+public:
+	/** Id of the table. */
+	table_id_t	id;
+	/** dict_sys.id_hash chain node */
+	dict_table_t*	id_hash;
+	/** Table name in name_hash */
+	table_name_t	name;
+	/** dict_sys.name_hash chain node */
+	dict_table_t*	name_hash;
+
+	/** Memory heap */
+	mem_heap_t*	heap;
+
+	/** NULL or the directory path specified by DATA DIRECTORY. */
+	char*		data_dir_path;
+
+	/** The tablespace of the table */
+	fil_space_t*	space;
+	/** Tablespace ID */
+	uint32_t	space_id;
+
+	/** Stores information about:
+	1 row format (redundant or compact),
+	2 compressed page size (zip shift size),
+	3 whether using atomic blobs,
+	4 whether the table has been created with the option DATA DIRECTORY.
+	Use DICT_TF_GET_COMPACT(), DICT_TF_GET_ZIP_SSIZE(),
+	DICT_TF_HAS_ATOMIC_BLOBS() and DICT_TF_HAS_DATA_DIR() to parse this
+	flag. */
+	unsigned	flags:DICT_TF_BITS;
+
+	/** Stores information about:
+	1 whether the table has been created using CREATE TEMPORARY TABLE,
+	2 whether the table has an internally defined DOC ID column,
+	3 whether the table has a FTS index,
+	4 whether the DOC ID column needs to be added to the FTS index,
+	5 whether the table is being created in its own tablespace,
+	6 whether the table has been DISCARDed,
+	7 whether the aux FTS table names are in hex.
+	Use DICT_TF2_FLAG_IS_SET() to parse this flag. */
+	unsigned	flags2:DICT_TF2_BITS;
+
+	/** TRUE if the table is an intermediate table during a copy alter
+	operation, or a partition/subpartition which is required for copying
+	data, so that the undo log can be skipped for row insertion into the
+	table. This variable will be set and unset during extra(), or during
+	the process of altering partitions */
+	unsigned	skip_alter_undo:1;
+
+	/*!< whether this is in a single-table tablespace and the .ibd
+	file is missing or page decryption failed and page is corrupted */
+	unsigned	file_unreadable:1;
+
+	/** TRUE if the table object has been added to the dictionary cache. */
+	unsigned	cached:1;
+
+	/** Number of non-virtual columns defined so far. */
+	unsigned	n_def:10;
+
+	/** Number of non-virtual columns. */
+	unsigned	n_cols:10;
+
+	/** Number of total columns (including virtual and non-virtual). */
+	unsigned	n_t_cols:10;
+
+	/** Number of total columns defined so far. */
+	unsigned	n_t_def:10;
+
+	/** Number of virtual columns defined so far. */
+	unsigned	n_v_def:10;
+
+	/** Number of virtual columns. */
+	unsigned	n_v_cols:10;
+
+	/** 1 + the position of autoinc counter field in clustered
+	index, or 0 if there is no persistent AUTO_INCREMENT column in
+	the table. */
+	unsigned	persistent_autoinc:10;
+
+	/** TRUE if it's not an InnoDB system table or a table that has no FK
+	relationships. */
+	unsigned	can_be_evicted:1;
+
+	/** TRUE if table is corrupted.
+	*/
+	unsigned	corrupted:1;
+
+	/** TRUE if some indexes should be dropped after ONLINE_INDEX_ABORTED
+	or ONLINE_INDEX_ABORTED_DROPPED. */
+	unsigned	drop_aborted:1;
+
+	/** Array of column descriptions. */
+	dict_col_t*	cols;
+
+	/** Array of virtual column descriptions. */
+	dict_v_col_t*	v_cols;
+
+	/** List of stored column descriptions. It is used only for foreign
+	key checks during create table and copy alter operations.
+	During copy alter, the s_cols list is filled during the create table
+	operation and needs to be preserved until the rename table operation.
+	That is the reason s_cols is a part of dict_table_t */
+	dict_s_col_list*	s_cols;
+
+	/** Instantly dropped or reordered columns, or NULL if none */
+	dict_instant_t*	instant;
+
+	/** Column names packed in a character string
+	"name1\0name2\0...nameN\0". Until the string contains n_cols, it will
+	be allocated from a temporary heap. The final string will be allocated
+	from table->heap. */
+	const char*	col_names;
+
+	/** Virtual column names */
+	const char*	v_col_names;
+	unsigned	vers_start:10;
+				/*!< System Versioning: row start col index */
+	unsigned	vers_end:10;
+				/*!< System Versioning: row end col index */
+	bool		is_system_db;
+				/*!< True if the table belongs to a system
+				database (mysql, information_schema or
+				performance_schema) */
+	dict_frm_t	dict_frm_mismatch;
+				/*!< not DICT_FRM_CONSISTENT (0) if the data
+				dictionary information and the
+				MySQL FRM information mismatch. */
+	/** The FTS_DOC_ID_INDEX, or NULL if no fulltext indexes exist */
+	dict_index_t*	fts_doc_id_index;
+
+	/** List of indexes of the table. */
+	UT_LIST_BASE_NODE_T(dict_index_t)	indexes;
+#ifdef BTR_CUR_HASH_ADAPT
+	/** List of detached indexes that are waiting to be freed along with
+	the last adaptive hash index entry.
+	Protected by autoinc_mutex (sic!) */
+	UT_LIST_BASE_NODE_T(dict_index_t)	freed_indexes;
+#endif /* BTR_CUR_HASH_ADAPT */
+
+	/** List of foreign key constraints in the table. These refer to
+	columns in other tables. */
+	UT_LIST_BASE_NODE_T(dict_foreign_t)	foreign_list;
+
+	/** List of foreign key constraints which refer to this table. */
+	UT_LIST_BASE_NODE_T(dict_foreign_t)	referenced_list;
+
+	/** Node of the LRU list of tables. */
+	UT_LIST_NODE_T(dict_table_t)	table_LRU;
+
+	/** Maximum recursive level we support when loading tables chained
+	together with FK constraints. If this level is exceeded, we will stop
+	loading child tables into memory along with their parent table. */
+	byte		fk_max_recusive_level;
+
+	/** DDL transaction that last touched the table definition, or 0 if
+	no history is available. This includes possible changes in
+	ha_innobase::prepare_inplace_alter_table() and
+	ha_innobase::commit_inplace_alter_table(). */
+	trx_id_t	def_trx_id;
+
+	/** Last transaction that inserted into an empty table.
+	Updated while holding exclusive table lock and an exclusive
+	latch on the clustered index root page (which must also be
+	an empty leaf page), and an ahi_latch (if btr_search_enabled). */
+	Atomic_relaxed<trx_id_t> bulk_trx_id;
+
+	/** Original table name, for MDL acquisition in purge. Normally,
+	this points to the same as name. When is_temporary_name(name.m_name)
+	holds, this should be a copy of the original table name, allocated
+	from heap. */
+	table_name_t	mdl_name;
+
+	/*!< set of foreign key constraints in the table; these refer to
+	columns in other tables */
+	dict_foreign_set	foreign_set;
+
+	/*!< set of foreign key constraints which refer to this table */
+	dict_foreign_set	referenced_set;
+
+	/** Statistics for query optimization.
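+	For illustration, a hypothetical reader of these members would
+	take the wrapper mutex declared above:
+	@code
+	table->stats_mutex_lock();
+	ib_uint64_t n_rows = table->stat_n_rows;
+	table->stats_mutex_unlock();
+	@endcode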
+	Mostly protected by
+	dict_sys.latch and stats_mutex_lock(). @{ */
+
+	/** TRUE if statistics have been calculated the first time after
+	database startup or table creation. */
+	unsigned	stat_initialized:1;
+
+	/** Timestamp of last recalc of the stats. */
+	time_t		stats_last_recalc;
+
+	/** The two bits below are set in the 'stat_persistent' member. They
+	have the following meaning:
+	1. _ON=0, _OFF=0, no explicit persistent stats setting for this table,
+	the value of the global srv_stats_persistent is used to determine
+	whether the table has persistent stats enabled or not
+	2. _ON=0, _OFF=1, persistent stats are explicitly disabled for this
+	table, regardless of the value of the global srv_stats_persistent
+	3. _ON=1, _OFF=0, persistent stats are explicitly enabled for this
+	table, regardless of the value of the global srv_stats_persistent
+	4. _ON=1, _OFF=1, not allowed, we assert if this ever happens. */
+	#define DICT_STATS_PERSISTENT_ON	(1 << 1)
+	#define DICT_STATS_PERSISTENT_OFF	(1 << 2)
+
+	/** Indicates whether the table uses persistent stats or not. See
+	DICT_STATS_PERSISTENT_ON and DICT_STATS_PERSISTENT_OFF. */
+	ib_uint32_t	stat_persistent;
+
+	/** The two bits below are set in the 'stats_auto_recalc' member. They
+	have the following meaning:
+	1. _ON=0, _OFF=0, no explicit auto recalc setting for this table, the
+	value of the global srv_stats_auto_recalc is used to
+	determine whether the table has auto recalc enabled or not
+	2. _ON=0, _OFF=1, auto recalc is explicitly disabled for this table,
+	regardless of the value of the global srv_stats_auto_recalc
+	3. _ON=1, _OFF=0, auto recalc is explicitly enabled for this table,
+	regardless of the value of the global srv_stats_auto_recalc
+	4. _ON=1, _OFF=1, not allowed, we assert if this ever happens. */
+	#define DICT_STATS_AUTO_RECALC_ON	(1 << 1)
+	#define DICT_STATS_AUTO_RECALC_OFF	(1 << 2)
+
+	/** Indicates whether the table uses automatic recalc for persistent
+	stats or not. See DICT_STATS_AUTO_RECALC_ON and
+	DICT_STATS_AUTO_RECALC_OFF. */
+	ib_uint32_t	stats_auto_recalc;
+
+	/** The number of pages to sample for this table during persistent
+	stats estimation. If this is 0, then the value of the global
+	srv_stats_persistent_sample_pages will be used instead. */
+	ulint		stats_sample_pages;
+
+	/** Approximate number of rows in the table. We periodically calculate
+	new estimates. */
+	ib_uint64_t	stat_n_rows;
+
+	/** Approximate clustered index size in database pages. */
+	ulint		stat_clustered_index_size;
+
+	/** Approximate size of other indexes in database pages. */
+	ulint		stat_sum_of_other_index_sizes;
+
+	/** How many rows are modified since last stats recalc. When a row is
+	inserted, updated, or deleted, we add 1 to this number; we calculate
+	new estimates for the table and the indexes if the table has changed
+	too much, see dict_stats_update_if_needed(). The counter is reset
+	to zero at statistics calculation. This counter is not protected by
+	any latch, because this is only used for heuristics. */
+	ib_uint64_t	stat_modified_counter;
+
+	bool		stats_error_printed;
+				/*!< Has the persistent stats error already
+				been printed for this table? */
+	/* @} */
+
+	/** AUTOINC related members. @{ */
+
+	/* The actual collection of tables locked during AUTOINC read/write is
+	kept in trx_t. In order to quickly determine whether a transaction has
+	locked the AUTOINC lock we keep a pointer to the transaction here in
+	the 'autoinc_trx' member.
+	This is to avoid acquiring the
+	lock_sys.latch and scanning the vector in trx_t.
+	When an AUTOINC lock has to wait, the corresponding lock instance is
+	created on the trx lock heap rather than using the pre-allocated
+	instance in autoinc_lock below. */
+
+	/** A buffer for an AUTOINC lock for this table. We allocate the
+	memory here so that individual transactions can get it and release it
+	without a need to allocate space from the lock heap of the trx:
+	otherwise the lock heap would grow rapidly if we do a large insert
+	from a select. */
+	lock_t*		autoinc_lock;
+
+	/** Mutex protecting autoinc and freed_indexes. */
+	srw_spin_mutex	autoinc_mutex;
+private:
+	/** Mutex protecting locks on this table. */
+	srw_spin_mutex	lock_mutex;
+#ifdef UNIV_DEBUG
+	/** The owner of lock_mutex (0 if none) */
+	Atomic_relaxed<pthread_t> lock_mutex_owner{0};
+#endif
+public:
+	/** Autoinc counter value to give to the next inserted row. */
+	uint64_t	autoinc;
+
+	/** The transaction that currently holds the AUTOINC lock on this
+	table. Protected by lock_mutex.
+	The thread that is executing autoinc_trx may read this field without
+	holding a latch, in row_lock_table_autoinc_for_mysql().
+	Only the autoinc_trx thread may clear this field; it cannot be
+	modified on the behalf of a transaction that is being handled by a
+	different thread. */
+	Atomic_relaxed<const trx_t*> autoinc_trx;
+
+	/** Number of granted or pending autoinc_lock on this table. This
+	value is set after acquiring lock_sys.latch but
+	in innodb_autoinc_lock_mode=1 (the default),
+	ha_innobase::innobase_lock_autoinc() will perform a dirty read
+	to determine whether other transactions have acquired the
+	autoinc_lock. */
+	uint32_t	n_waiting_or_granted_auto_inc_locks;
+
+	/* @} */
+
+	/** Number of granted or pending LOCK_S or LOCK_X on the table.
+	Protected by lock_sys.assert_locked(*this). */
+	uint32_t	n_lock_x_or_s;
+
+	/** FTS specific state variables. */
+	fts_t*		fts;
+
+	/** Quiescing states, protected by the dict_index_t::lock. ie. we can
+	only change the state if we acquire all the latches (dict_index_t::lock)
+	in X mode of this table's indexes. */
+	ib_quiesce_t	quiesce;
+
+	/** Count of the number of record locks on this table. We use this to
+	determine whether we can evict the table from the dictionary cache.
+	Modified when lock_sys.is_writer(), or
+	lock_sys.assert_locked(page_id) and trx->mutex_is_owner() hold.
+	@see trx_lock_t::trx_locks */
+	Atomic_counter<ulint> n_rec_locks;
+private:
+	/** Count of how many handles are opened to this table. Dropping of the
+	table is NOT allowed until this count gets to zero. MySQL does NOT
+	itself check the number of open handles at DROP. */
+	Atomic_counter<uint32_t> n_ref_count;
+public:
+	/** List of locks on the table. Protected by
+	lock_sys.assert_locked(lock). */
+	table_lock_list_t locks;
+
+	/** Timestamp of the last modification of this table. */
+	Atomic_relaxed<time_t> update_time;
+	/** Transactions whose view low limit is greater than this number are
+	not allowed to access the MariaDB query cache.
+	@see innobase_query_caching_table_check_low()
+	@see trx_t::commit_tables() */
+	Atomic_relaxed<trx_id_t> query_cache_inv_trx_id;
+
+#ifdef UNIV_DEBUG
+	/** Value of 'magic_n'. */
+	#define DICT_TABLE_MAGIC_N	76333786
+
+	/** Magic number.
+	*/
+	ulint		magic_n;
+#endif /* UNIV_DEBUG */
+	/** mysql_row_templ_t for base columns used to compute the virtual
+	columns */
+	dict_vcol_templ_t*	vc_templ;
+
+	/** @return whether the table has any transaction lock other than
+	those of the given transaction */
+	bool has_lock_other_than(const trx_t *trx) const
+	{
+		for (lock_t *lock= UT_LIST_GET_FIRST(locks); lock;
+		     lock= UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock))
+			if (lock->trx != trx)
+				return true;
+		return false;
+	}
+
+	/** @return whether a DDL operation is in progress on this table */
+	bool is_active_ddl() const
+	{
+		return UT_LIST_GET_FIRST(indexes)->online_log;
+	}
+
+	/** @return whether the name is
+	mysql.innodb_index_stats or mysql.innodb_table_stats */
+	bool is_stats_table() const;
+
+	/** @return number of unique columns in FTS_DOC_ID index */
+	unsigned fts_n_uniq() const { return versioned() ? 2 : 1; }
+
+	/** Create metadata.
+	@param name     table name
+	@param space    tablespace
+	@param n_cols   total number of columns (both virtual and non-virtual)
+	@param n_v_cols number of virtual columns
+	@param flags    table flags
+	@param flags2   table flags2
+	@return newly allocated table object */
+	static dict_table_t *create(const span<const char> &name,
+				    fil_space_t *space,
+				    ulint n_cols, ulint n_v_cols,
+				    ulint flags, ulint flags2);
+
+	/** Check whether the table has any spatial indexes */
+	bool has_spatial_index() const
+	{
+		for (auto i= UT_LIST_GET_FIRST(indexes);
+		     (i= UT_LIST_GET_NEXT(indexes, i)) != nullptr; )
+			if (i->is_spatial())
+				return true;
+		return false;
+	}
+};
+
+inline void dict_index_t::set_modified(mtr_t& mtr) const
+{
+	mtr.set_named_space(table->space);
+}
+
+inline bool table_name_t::is_temporary() const
+{
+	return dict_table_t::is_temporary_name(m_name);
+}
+
+inline bool dict_index_t::is_readable() const
+{ return table->is_readable(); }
+
+inline bool dict_index_t::is_instant() const
+{
+	ut_ad(n_core_fields > 0);
+	ut_ad(n_core_fields <= n_fields || table->n_dropped());
+	ut_ad(n_core_fields == n_fields
+	      || (type & ~(DICT_UNIQUE | DICT_CORRUPT)) == DICT_CLUSTERED);
+	ut_ad(n_core_fields == n_fields || table->supports_instant());
+	ut_ad(n_core_fields == n_fields || !table->is_temporary());
+	ut_ad(!table->instant || !table->is_temporary());
+
+	return n_core_fields != n_fields
+		|| (is_primary() && table->instant);
+}
+
+inline bool dict_index_t::is_corrupted() const
+{
+	return UNIV_UNLIKELY(online_status >= ONLINE_INDEX_ABORTED
+			     || (type & DICT_CORRUPT)
+			     || (table && table->corrupted));
+}
+
+inline void dict_index_t::clear_instant_add()
+{
+	DBUG_ASSERT(is_primary());
+	DBUG_ASSERT(is_instant());
+	DBUG_ASSERT(!table->instant);
+	for (unsigned i= n_core_fields; i < n_fields; i++)
+		fields[i].col->clear_instant();
+	n_core_fields= n_fields;
+	n_core_null_bytes= static_cast<uint8_t>
+		(UT_BITS_IN_BYTES(static_cast<unsigned>(n_nullable)));
+}
+
+inline void dict_index_t::clear_instant_alter()
+{
+	DBUG_ASSERT(is_primary());
+	DBUG_ASSERT(n_fields == n_def);
+
+	if (!table->instant) {
+		if (is_instant()) {
+			clear_instant_add();
+		}
+		return;
+	}
+
+#ifndef DBUG_OFF
+	for (unsigned i = first_user_field(); i--; ) {
+		DBUG_ASSERT(!fields[i].col->is_dropped());
+		DBUG_ASSERT(!fields[i].col->is_nullable());
+	}
+#endif
+	const dict_col_t* ai_col = table->persistent_autoinc
+		? fields[table->persistent_autoinc - 1].col
+		: NULL;
+	dict_field_t* const begin = &fields[first_user_field()];
+	dict_field_t* end = &fields[n_fields];
+
+	for (dict_field_t* d = begin; d < end; ) {
+		/* Move fields for dropped columns to the end.
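+		Each dropped field is swapped past the shrinking 'end'
+		pointer, so that [begin, end) is left holding only the
+		surviving fields; their order is restored by the
+		std::sort() below.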
+		*/
+		if (!d->col->is_dropped()) {
+			d++;
+		} else {
+			if (d->col->is_nullable()) {
+				n_nullable--;
+			}
+
+			std::swap(*d, *--end);
+		}
+	}
+
+	DBUG_ASSERT(&fields[n_fields - table->n_dropped()] == end);
+	n_core_fields = n_fields = n_def
+		= static_cast<unsigned>(end - fields) & MAX_N_FIELDS;
+	n_core_null_bytes = static_cast<uint8_t>(UT_BITS_IN_BYTES(n_nullable));
+	std::sort(begin, end, [](const dict_field_t& a, const dict_field_t& b)
+		  { return a.col->ind < b.col->ind; });
+	table->instant = NULL;
+	if (ai_col) {
+		auto a = std::find_if(fields, end,
+				      [ai_col](const dict_field_t& f)
+				      { return f.col == ai_col; });
+		table->persistent_autoinc = (a == end)
+			? 0
+			: (1 + static_cast<unsigned>(a - fields))
+			& MAX_N_FIELDS;
+	}
+}
+
+/** @return whether the column was instantly dropped
+@param[in] index	the clustered index */
+inline bool dict_col_t::is_dropped(const dict_index_t& index) const
+{
+	DBUG_ASSERT(index.is_primary());
+	DBUG_ASSERT(!is_dropped() == !index.table->instant);
+	DBUG_ASSERT(!is_dropped() || (this >= index.table->instant->dropped
+				      && this < index.table->instant->dropped
+				      + index.table->instant->n_dropped));
+	return is_dropped();
+}
+
+/*******************************************************************//**
+Initialise the table lock list. */
+void
+lock_table_lock_list_init(
+/*======================*/
+	table_lock_list_t*	locks);	/*!< List to initialise */
+
+/** A function object to add the foreign key constraint to the referenced set
+of the referenced table, if it exists in the dictionary cache. */
+struct dict_foreign_add_to_referenced_table {
+	void operator()(dict_foreign_t* foreign) const
+	{
+		if (dict_table_t* table = foreign->referenced_table) {
+			std::pair<dict_foreign_set::iterator, bool> ret
+				= table->referenced_set.insert(foreign);
+			ut_a(ret.second);
+		}
+	}
+};
+
+/** Check whether the col is used in spatial index or regular index.
+@param[in]	col	column to check
+@return spatial status */
+inline
+spatial_status_t
+dict_col_get_spatial_status(
+	const dict_col_t*	col)
+{
+	spatial_status_t	spatial_status = SPATIAL_NONE;
+
+	/* Column is not a part of any index. */
+	if (!col->ord_part) {
+		return(spatial_status);
+	}
+
+	if (DATA_GEOMETRY_MTYPE(col->mtype)) {
+		if (col->max_prefix == 0) {
+			spatial_status = SPATIAL_ONLY;
+		} else {
+			/* Any regular index on a geometry column
+			should have a prefix. */
+			spatial_status = SPATIAL_MIXED;
+		}
+	}
+
+	return(spatial_status);
+}
+
+/** Clear defragmentation summary. */
+inline void dict_stats_empty_defrag_summary(dict_index_t* index)
+{
+	index->stat_defrag_n_pages_freed = 0;
+}
+
+/** Clear defragmentation related index stats. */
+inline void dict_stats_empty_defrag_stats(dict_index_t* index)
+{
+	index->stat_defrag_modified_counter = 0;
+	index->stat_defrag_n_page_split = 0;
+}
+
+#include "dict0mem.inl"
+
+#endif /* dict0mem_h */
diff --git a/storage/innobase/include/dict0mem.inl b/storage/innobase/include/dict0mem.inl
new file mode 100644
index 00000000..d60ee5d9
--- /dev/null
+++ b/storage/innobase/include/dict0mem.inl
@@ -0,0 +1,68 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/dict0mem.ic
+Data dictionary memory object creation
+
+Created 1/8/1996 Heikki Tuuri
+***********************************************************************/
+
+#include "data0type.h"
+#include "dict0mem.h"
+#include "fil0fil.h"
+
+/**********************************************************************//**
+This function populates a dict_index_t index memory structure with
+supplied information. */
+UNIV_INLINE
+void
+dict_mem_fill_index_struct(
+/*=======================*/
+	dict_index_t*	index,		/*!< out: index to be filled */
+	mem_heap_t*	heap,		/*!< in: memory heap */
+	const char*	index_name,	/*!< in: index name */
+	ulint		type,		/*!< in: DICT_UNIQUE,
+					DICT_CLUSTERED, ... ORed */
+	ulint		n_fields)	/*!< in: number of fields */
+{
+
+	if (heap) {
+		index->heap = heap;
+		index->name = mem_heap_strdup(heap, index_name);
+		index->fields = (dict_field_t*) mem_heap_alloc(
+			heap, 1 + n_fields * sizeof(dict_field_t));
+	} else {
+		index->name = index_name;
+		index->heap = NULL;
+		index->fields = NULL;
+	}
+
+	index->type = type & ((1U << DICT_IT_BITS) - 1);
+	index->page = FIL_NULL;
+	index->merge_threshold = DICT_INDEX_MERGE_THRESHOLD_DEFAULT;
+	index->n_fields = static_cast<unsigned>(n_fields)
+		& index->MAX_N_FIELDS;
+	index->n_core_fields = static_cast<unsigned>(n_fields)
+		& index->MAX_N_FIELDS;
+	/* The '1 +' above prevents allocation
+	of an empty mem block */
+	index->nulls_equal = false;
+	ut_d(index->magic_n = DICT_INDEX_MAGIC_N);
+}
diff --git a/storage/innobase/include/dict0pagecompress.h b/storage/innobase/include/dict0pagecompress.h
new file mode 100644
index 00000000..f1272dc4
--- /dev/null
+++ b/storage/innobase/include/dict0pagecompress.h
@@ -0,0 +1,61 @@
+/*****************************************************************************
+
+Copyright (C) 2013, 2017, MariaDB Corporation. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/dict0pagecompress.h
+Helper functions for extracting/storing page compression information
+to dictionary.
+ +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +#ifndef dict0pagecompress_h +#define dict0pagecompress_h + +/********************************************************************//** +Extract the page compression level from table flags. +@return page compression level, or 0 if not compressed */ +UNIV_INLINE +ulint +dict_tf_get_page_compression_level( +/*===============================*/ + ulint flags) /*!< in: flags */ + __attribute__((const)); +/********************************************************************//** +Extract the page compression flag from table flags +@return page compression flag, or false if not compressed */ +UNIV_INLINE +ibool +dict_tf_get_page_compression( +/*==========================*/ + ulint flags) /*!< in: flags */ + __attribute__((const)); + +/********************************************************************//** +Check whether the table uses the page compressed page format. +@return page compression level, or 0 if not compressed */ +UNIV_INLINE +ulint +dict_table_page_compression_level( +/*==============================*/ + const dict_table_t* table) /*!< in: table */ + __attribute__((const)); + +#include "dict0pagecompress.inl" + +#endif diff --git a/storage/innobase/include/dict0pagecompress.inl b/storage/innobase/include/dict0pagecompress.inl new file mode 100644 index 00000000..c959f9ca --- /dev/null +++ b/storage/innobase/include/dict0pagecompress.inl @@ -0,0 +1,81 @@ +/***************************************************************************** + +Copyright (C) 2013, 2017, MariaDB Corporation. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/dict0pagecompress.ic +Inline implementation for helper functions for extracting/storing +page compression and atomic writes information to dictionary. + +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +/********************************************************************//** +Extract the page compression level from dict_table_t::flags. +These flags are in memory, so assert that they are valid. +@return page compression level, or 0 if not compressed */ +UNIV_INLINE +ulint +dict_tf_get_page_compression_level( +/*===============================*/ + ulint flags) /*!< in: flags */ +{ + ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags); + + ut_ad(page_compression_level <= 9); + + return(page_compression_level); +} + +/********************************************************************//** +Check whether the table uses the page compression page format. 
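+An illustrative (hypothetical) caller, assuming a dict_table_t* table
+whose flags indicate page compression:
+@code
+if (dict_tf_get_page_compression(table->flags)) {
+	ulint level = dict_table_page_compression_level(table);
+	ut_ad(level <= 9);  // zlib-style compression levels
+}
+@endcode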
+@return page compression level, or 0 if not compressed */
+UNIV_INLINE
+ulint
+dict_table_page_compression_level(
+/*==============================*/
+	const dict_table_t*	table)	/*!< in: table */
+{
+	ut_ad(table);
+	ut_ad(dict_tf_get_page_compression(table->flags));
+
+	return(dict_tf_get_page_compression_level(table->flags));
+}
+
+/********************************************************************//**
+Extract the page compression flag from table flags.
+@return true if page compressed, false if not */
+UNIV_INLINE
+ibool
+dict_tf_get_page_compression(
+/*=========================*/
+	ulint	flags)	/*!< in: flags */
+{
+	return(DICT_TF_GET_PAGE_COMPRESSION(flags));
+}
+
+/********************************************************************//**
+Check whether the table uses the page compressed page format.
+@return true if page compressed, false if not */
+UNIV_INLINE
+ibool
+dict_table_is_page_compressed(
+/*==========================*/
+	const dict_table_t*	table)	/*!< in: table */
+{
+	return (dict_tf_get_page_compression(table->flags));
+}
diff --git a/storage/innobase/include/dict0stats.h b/storage/innobase/include/dict0stats.h
new file mode 100644
index 00000000..0dc1b984
--- /dev/null
+++ b/storage/innobase/include/dict0stats.h
@@ -0,0 +1,238 @@
+/*****************************************************************************
+
+Copyright (c) 2009, 2018, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0stats.h
+Code used for calculating and manipulating table statistics.
+
+Created Jan 06, 2010 Vasil Dimov
+*******************************************************/
+
+#ifndef dict0stats_h
+#define dict0stats_h
+
+#include "dict0types.h"
+#include "trx0types.h"
+
+enum dict_stats_upd_option_t {
+	DICT_STATS_RECALC_PERSISTENT,/* (re) calculate the
+				statistics using a precise and slow
+				algo and save them to the persistent
+				storage, if the persistent storage is
+				not present then emit a warning and
+				fall back to transient stats */
+	DICT_STATS_RECALC_TRANSIENT,/* (re) calculate the statistics
+				using an imprecise quick algo
+				without saving the results
+				persistently */
+	DICT_STATS_EMPTY_TABLE,	/* Write all zeros (or 1 where it makes sense)
+				into a table and its indexes' statistics
+				members. The resulting stats correspond to an
+				empty table. If the table is using persistent
+				statistics, then they are saved on disk. */
+	DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY /* fetch the stats
+				from the persistent storage if the in-memory
+				structures have not been initialized yet,
+				otherwise do nothing */
+};
+
+/*********************************************************************//**
+Set the persistent statistics flag for a given table.
This is set only +in the in-memory table object and is not saved on disk. It will be read +from the .frm file upon first open from MySQL after a server restart. */ +UNIV_INLINE +void +dict_stats_set_persistent( +/*======================*/ + dict_table_t* table, /*!< in/out: table */ + ibool ps_on, /*!< in: persistent stats explicitly enabled */ + ibool ps_off) /*!< in: persistent stats explicitly disabled */ + MY_ATTRIBUTE((nonnull)); + +/** @return whether persistent statistics is enabled for a given table */ +UNIV_INLINE +bool +dict_stats_is_persistent_enabled(const dict_table_t* table) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/*********************************************************************//** +Set the auto recalc flag for a given table (only honored for a persistent +stats enabled table). The flag is set only in the in-memory table object +and is not saved in InnoDB files. It will be read from the .frm file upon +first open from MySQL after a server restart. */ +UNIV_INLINE +void +dict_stats_auto_recalc_set( +/*=======================*/ + dict_table_t* table, /*!< in/out: table */ + ibool auto_recalc_on, /*!< in: explicitly enabled */ + ibool auto_recalc_off); /*!< in: explicitly disabled */ + +/** @return whether auto recalc is enabled for a given table*/ +UNIV_INLINE +bool +dict_stats_auto_recalc_is_enabled(const dict_table_t* table) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/*********************************************************************//** +Initialize table's stats for the first time when opening a table. */ +UNIV_INLINE +void +dict_stats_init( +/*============*/ + dict_table_t* table); /*!< in/out: table */ + +/*********************************************************************//** +Deinitialize table's stats after the last close of the table. This is +used to detect "FLUSH TABLE" and refresh the stats upon next open. */ +UNIV_INLINE +void +dict_stats_deinit( +/*==============*/ + dict_table_t* table) /*!< in/out: table */ + MY_ATTRIBUTE((nonnull)); + +#ifdef WITH_WSREP +/** Update the table modification counter and if necessary, +schedule new estimates for table and index statistics to be calculated. +@param[in,out] table persistent or temporary table +@param[in] trx transaction */ +void dict_stats_update_if_needed(dict_table_t *table, const trx_t &trx) + MY_ATTRIBUTE((nonnull)); +#else +/** Update the table modification counter and if necessary, +schedule new estimates for table and index statistics to be calculated. +@param[in,out] table persistent or temporary table */ +void dict_stats_update_if_needed_func(dict_table_t *table) + MY_ATTRIBUTE((nonnull)); +# define dict_stats_update_if_needed(t,trx) dict_stats_update_if_needed_func(t) +#endif + +/*********************************************************************//** +Calculates new estimates for table and index statistics. The statistics +are used in query optimization. 
+@return DB_* error code or DB_SUCCESS */
+dberr_t
+dict_stats_update(
+/*==============*/
+	dict_table_t*	table,	/*!< in/out: table */
+	dict_stats_upd_option_t	stats_upd_option);
+				/*!< in: whether to (re) calc
+				the stats or to fetch them from
+				the persistent storage */
+
+/** Execute DELETE FROM mysql.innodb_table_stats
+@param database_name  database name
+@param table_name     table name
+@param trx            transaction
+@return DB_SUCCESS or error code */
+dberr_t dict_stats_delete_from_table_stats(const char *database_name,
+                                           const char *table_name,
+                                           trx_t *trx)
+  MY_ATTRIBUTE((nonnull));
+/** Execute DELETE FROM mysql.innodb_index_stats
+@param database_name  database name
+@param table_name     table name
+@param trx            transaction
+@return DB_SUCCESS or error code */
+dberr_t dict_stats_delete_from_index_stats(const char *database_name,
+                                           const char *table_name,
+                                           trx_t *trx)
+  MY_ATTRIBUTE((nonnull));
+/** Execute DELETE FROM mysql.innodb_index_stats
+@param database_name  database name
+@param table_name     table name
+@param index_name     name of the index
+@param trx            transaction (nullptr=start and commit a new one)
+@return DB_SUCCESS or error code */
+dberr_t dict_stats_delete_from_index_stats(const char *database_name,
                                           const char *table_name,
+                                           const char *index_name, trx_t *trx);
+
+/*********************************************************************//**
+Fetches or calculates new estimates for index statistics. */
+void
+dict_stats_update_for_index(
+/*========================*/
+	dict_index_t*	index)	/*!< in/out: index */
+	MY_ATTRIBUTE((nonnull));
+
+/** Rename a table in InnoDB persistent stats storage.
+@param old_name  old table name
+@param new_name  new table name
+@param trx       transaction
+@return DB_SUCCESS or error code */
+dberr_t dict_stats_rename_table(const char *old_name, const char *new_name,
+                                trx_t *trx);
+/** Rename an index in InnoDB persistent statistics.
+@param db        database name
+@param table     table name
+@param old_name  old index name
+@param new_name  new index name
+@param trx       transaction
+@return DB_SUCCESS or error code */
+dberr_t dict_stats_rename_index(const char *db, const char *table,
+                                const char *old_name, const char *new_name,
+                                trx_t *trx);
+
+/** Delete all persistent statistics for a database.
+@param db   database name
+@param trx  transaction
+@return DB_SUCCESS or error code */
+dberr_t dict_stats_delete(const char *db, trx_t *trx);
+
+/** Save an individual index's statistic into the persistent statistics
+storage.
+@param[in]	index		index to be updated
+@param[in]	last_update	timestamp of the stat
+@param[in]	stat_name	name of the stat
+@param[in]	stat_value	value of the stat
+@param[in]	sample_size	n pages sampled or NULL
+@param[in]	stat_description	description of the stat
+@param[in,out]	trx		transaction
+@return DB_SUCCESS or error code */
+dberr_t
+dict_stats_save_index_stat(
+	dict_index_t*	index,
+	time_t		last_update,
+	const char*	stat_name,
+	ib_uint64_t	stat_value,
+	ib_uint64_t*	sample_size,
+	const char*	stat_description,
+	trx_t*		trx)
+	MY_ATTRIBUTE((nonnull(1, 3, 6, 7)));
+
+/** Report an error if updating table statistics failed because
+.ibd file is missing, table decryption failed or table is corrupted.
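+An illustrative (hypothetical) caller, bailing out before a statistics
+recalculation:
+@code
+if (!table->is_readable() || table->corrupted) {
+	return dict_stats_report_error(table);
+}
+@endcode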
+@param[in,out] table Table +@param[in] defragment true if statistics is for defragment +@retval DB_DECRYPTION_FAILED if decryption of the table failed +@retval DB_TABLESPACE_DELETED if .ibd file is missing +@retval DB_CORRUPTION if table is marked as corrupted */ +dberr_t +dict_stats_report_error(dict_table_t* table, bool defragment = false) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +#include "dict0stats.inl" + +#ifdef UNIV_ENABLE_UNIT_TEST_DICT_STATS +void test_dict_stats_all(); +#endif /* UNIV_ENABLE_UNIT_TEST_DICT_STATS */ + +#endif /* dict0stats_h */ diff --git a/storage/innobase/include/dict0stats.inl b/storage/innobase/include/dict0stats.inl new file mode 100644 index 00000000..dd516275 --- /dev/null +++ b/storage/innobase/include/dict0stats.inl @@ -0,0 +1,219 @@ +/***************************************************************************** + +Copyright (c) 2012, 2015, Oracle and/or its affiliates. All rights reserved. +Copyright (c) 2017, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0stats.ic +Code used for calculating and manipulating table statistics. + +Created Jan 23, 2012 Vasil Dimov +*******************************************************/ + +#include "dict0dict.h" +#include "srv0srv.h" + +/*********************************************************************//** +Set the persistent statistics flag for a given table. This is set only +in the in-memory table object and is not saved on disk. It will be read +from the .frm file upon first open from MySQL after a server restart. */ +UNIV_INLINE +void +dict_stats_set_persistent( +/*======================*/ + dict_table_t* table, /*!< in/out: table */ + ibool ps_on, /*!< in: persistent stats explicitly enabled */ + ibool ps_off) /*!< in: persistent stats explicitly disabled */ +{ + /* Not allowed to have both flags set, but a CREATE or ALTER + statement that contains "STATS_PERSISTENT=0 STATS_PERSISTENT=1" would + end up having both set. In this case we clear the OFF flag. */ + if (ps_on && ps_off) { + ps_off = FALSE; + } + + ib_uint32_t stat_persistent = 0; + + if (ps_on) { + stat_persistent |= DICT_STATS_PERSISTENT_ON; + } + + if (ps_off) { + stat_persistent |= DICT_STATS_PERSISTENT_OFF; + } + + /* we rely on this assignment to be atomic */ + table->stat_persistent = stat_persistent; +} + +/** @return whether persistent statistics is enabled for a given table */ +UNIV_INLINE +bool +dict_stats_is_persistent_enabled(const dict_table_t* table) +{ + /* Because of the nature of this check (non-locking) it is possible + that a table becomes: + * PS-disabled immediately after this function has returned TRUE or + * PS-enabled immediately after this function has returned FALSE. 
+ This means that it is possible that we do: + + dict_stats_update(DICT_STATS_RECALC_PERSISTENT) on a table that has + just been PS-disabled or + + dict_stats_update(DICT_STATS_RECALC_TRANSIENT) on a table that has + just been PS-enabled. + This is acceptable. Avoiding this would mean that we would have to + hold dict_sys.latch or stats_mutex_lock() like for accessing the + other ::stat_ members which would be too big performance penalty, + especially when this function is called from + dict_stats_update_if_needed(). */ + + /* we rely on this read to be atomic */ + ib_uint32_t stat_persistent = table->stat_persistent; + + if (stat_persistent & DICT_STATS_PERSISTENT_ON) { + ut_ad(!(stat_persistent & DICT_STATS_PERSISTENT_OFF)); + return(true); + } else if (stat_persistent & DICT_STATS_PERSISTENT_OFF) { + return(false); + } else { + return(srv_stats_persistent); + } +} + +/*********************************************************************//** +Set the auto recalc flag for a given table (only honored for a persistent +stats enabled table). The flag is set only in the in-memory table object +and is not saved in InnoDB files. It will be read from the .frm file upon +first open from MySQL after a server restart. */ +UNIV_INLINE +void +dict_stats_auto_recalc_set( +/*=======================*/ + dict_table_t* table, /*!< in/out: table */ + ibool auto_recalc_on, /*!< in: explicitly enabled */ + ibool auto_recalc_off) /*!< in: explicitly disabled */ +{ + ut_ad(!auto_recalc_on || !auto_recalc_off); + + ib_uint32_t stats_auto_recalc = 0; + + if (auto_recalc_on) { + stats_auto_recalc |= DICT_STATS_AUTO_RECALC_ON; + } + + if (auto_recalc_off) { + stats_auto_recalc |= DICT_STATS_AUTO_RECALC_OFF; + } + + /* we rely on this assignment to be atomic */ + table->stats_auto_recalc = stats_auto_recalc; +} + +/** @return whether auto recalc is enabled for a given table*/ +UNIV_INLINE +bool +dict_stats_auto_recalc_is_enabled(const dict_table_t* table) +{ + /* we rely on this read to be atomic */ + ib_uint32_t stats_auto_recalc = table->stats_auto_recalc; + + if (stats_auto_recalc & DICT_STATS_AUTO_RECALC_ON) { + ut_ad(!(stats_auto_recalc & DICT_STATS_AUTO_RECALC_OFF)); + return(true); + } else if (stats_auto_recalc & DICT_STATS_AUTO_RECALC_OFF) { + return(false); + } else { + return(srv_stats_auto_recalc); + } +} + +/*********************************************************************//** +Initialize table's stats for the first time when opening a table. */ +UNIV_INLINE +void +dict_stats_init( +/*============*/ + dict_table_t* table) /*!< in/out: table */ +{ + ut_ad(!table->stats_mutex_is_owner()); + + if (table->stat_initialized) { + return; + } + + dict_stats_upd_option_t opt; + + if (dict_stats_is_persistent_enabled(table)) { + opt = DICT_STATS_FETCH_ONLY_IF_NOT_IN_MEMORY; + } else { + opt = DICT_STATS_RECALC_TRANSIENT; + } + + dict_stats_update(table, opt); +} + +/*********************************************************************//** +Deinitialize table's stats after the last close of the table. This is +used to detect "FLUSH TABLE" and refresh the stats upon next open. 
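Both lookups above follow one pattern: a per-table override held in two bits, with a fall-back to the server-wide default when neither bit is set. A minimal standalone sketch of that resolution (simplified types; srv_default stands in for srv_stats_persistent or srv_stats_auto_recalc):

#include <cstdint>

enum : uint32_t { FLAG_ON = 1, FLAG_OFF = 2 };

/* Resolve a per-table two-bit override against the global default:
   the ON bit wins, the OFF bit disables, and with neither bit set the
   server-wide setting applies. ON and OFF are mutually exclusive, as
   the ut_ad() assertions in the functions above enforce. */
bool is_enabled(uint32_t flags, bool srv_default)
{
    if (flags & FLAG_ON) {
        return true;
    }
    if (flags & FLAG_OFF) {
        return false;
    }
    return srv_default;
}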
*/ +UNIV_INLINE +void +dict_stats_deinit( +/*==============*/ + dict_table_t* table) /*!< in/out: table */ +{ + ut_ad(table->stats_mutex_is_owner()); + ut_ad(table->get_ref_count() == 0); + +#ifdef HAVE_valgrind + if (!table->stat_initialized) { + return; + } + + MEM_UNDEFINED(&table->stat_n_rows, sizeof table->stat_n_rows); + MEM_UNDEFINED(&table->stat_clustered_index_size, + sizeof table->stat_clustered_index_size); + MEM_UNDEFINED(&table->stat_sum_of_other_index_sizes, + sizeof table->stat_sum_of_other_index_sizes); + MEM_UNDEFINED(&table->stat_modified_counter, + sizeof table->stat_modified_counter); + + dict_index_t* index; + + for (index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + MEM_UNDEFINED( + index->stat_n_diff_key_vals, + index->n_uniq + * sizeof index->stat_n_diff_key_vals[0]); + MEM_UNDEFINED( + index->stat_n_sample_sizes, + index->n_uniq + * sizeof index->stat_n_sample_sizes[0]); + MEM_UNDEFINED( + index->stat_n_non_null_key_vals, + index->n_uniq + * sizeof index->stat_n_non_null_key_vals[0]); + MEM_UNDEFINED( + &index->stat_index_size, + sizeof(index->stat_index_size)); + MEM_UNDEFINED( + &index->stat_n_leaf_pages, + sizeof(index->stat_n_leaf_pages)); + } +#endif /* HAVE_valgrind */ + table->stat_initialized = FALSE; +} diff --git a/storage/innobase/include/dict0stats_bg.h b/storage/innobase/include/dict0stats_bg.h new file mode 100644 index 00000000..d9a2f628 --- /dev/null +++ b/storage/innobase/include/dict0stats_bg.h @@ -0,0 +1,59 @@ +/***************************************************************************** + +Copyright (c) 2012, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0stats_bg.h +Code used for background table and index stats gathering. + +Created Apr 26, 2012 Vasil Dimov +*******************************************************/ + +#ifndef dict0stats_bg_h +#define dict0stats_bg_h + +#include "dict0types.h" + +#ifdef HAVE_PSI_INTERFACE +extern mysql_pfs_key_t recalc_pool_mutex_key; +#endif /* HAVE_PSI_INTERFACE */ + +/** Delete a table from the auto recalc pool, and ensure that +no statistics are being updated on it. */ +void dict_stats_recalc_pool_del(table_id_t id, bool have_mdl_exclusive); + +/*****************************************************************//** +Initialize global variables needed for the operation of dict_stats_thread(). +Must be called before dict_stats task is started. */ +void dict_stats_init(); + +/*****************************************************************//** +Free resources allocated by dict_stats_thread_init(), must be called +after dict_stats task has exited. 
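The HAVE_valgrind block in dict_stats_deinit() above marks the cached statistics as undefined rather than zeroing them, so Memcheck flags any read of stale values before they are recomputed. A tiny sketch of the same idea, assuming MEM_UNDEFINED expands to the Valgrind client request on HAVE_valgrind builds:

#include <valgrind/memcheck.h>

struct table_stats { unsigned long long n_rows; };

void poison_stats(table_stats* s)
{
    /* After this call, Memcheck reports a use of uninitialised
       memory if n_rows is read before being recomputed. */
    VALGRIND_MAKE_MEM_UNDEFINED(&s->n_rows, sizeof s->n_rows);
}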
*/ +void dict_stats_deinit(); + +/** Start the dict stats timer. */ +void dict_stats_start(); + +/** Shut down the dict_stats timer. */ +void dict_stats_shutdown(); + +/** Reschedule dict stats timer to run now. */ +void dict_stats_schedule_now(); + +#endif /* dict0stats_bg_h */ diff --git a/storage/innobase/include/dict0types.h b/storage/innobase/include/dict0types.h new file mode 100644 index 00000000..ec50e8cd --- /dev/null +++ b/storage/innobase/include/dict0types.h @@ -0,0 +1,176 @@ +/***************************************************************************** + +Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dict0types.h +Data dictionary global types + +Created 1/8/1996 Heikki Tuuri +*******************************************************/ + +#ifndef dict0types_h +#define dict0types_h + +#include "univ.i" +#include "span.h" +#include + +using st_::span; + +struct dict_col_t; +struct dict_field_t; +struct dict_index_t; +struct dict_table_t; +struct dict_foreign_t; +struct dict_v_col_t; + +struct ind_node_t; +struct tab_node_t; +struct dict_add_v_col_t; + +/* Space id and page no where the dictionary header resides */ +#define DICT_HDR_SPACE 0 /* the SYSTEM tablespace */ +#define DICT_HDR_PAGE_NO FSP_DICT_HDR_PAGE_NO + +/* The ibuf table and indexes's ID are assigned as the number +DICT_IBUF_ID_MIN plus the space id */ +#define DICT_IBUF_ID_MIN 0xFFFFFFFF00000000ULL + +typedef ib_id_t table_id_t; +typedef ib_id_t index_id_t; + +/** Maximum transaction identifier */ +#define TRX_ID_MAX IB_ID_MAX + +/** The bit pattern corresponding to TRX_ID_MAX */ +extern const byte trx_id_max_bytes[8]; +extern const byte timestamp_max_bytes[7]; + +/** Error to ignore when we load table dictionary into memory. However, +the table and index will be marked as "corrupted", and caller will +be responsible to deal with corrupted table or index. +Note: please define the IGNORE_ERR_* as bits, so their value can +be or-ed together */ +enum dict_err_ignore_t { + DICT_ERR_IGNORE_NONE = 0, /*!< no error to ignore */ + DICT_ERR_IGNORE_FK_NOKEY = 1, /*!< ignore error if any foreign + key is missing */ + DICT_ERR_IGNORE_INDEX = 2, /*!< ignore corrupted indexes */ + DICT_ERR_IGNORE_RECOVER_LOCK = 4 | DICT_ERR_IGNORE_FK_NOKEY, + /*!< Used when recovering table locks + for resurrected transactions. + Silently load a missing + tablespace, and do not load + incomplete index definitions. 
*/
+ /** ignore all errors above */
+ DICT_ERR_IGNORE_ALL = 7,
+ /** prepare some DDL operation;
+ do not attempt to load tablespace */
+ DICT_ERR_IGNORE_TABLESPACE = 15,
+ /** prepare to drop the table; do not attempt to load tablespace
+ or the metadata */
+ DICT_ERR_IGNORE_DROP = 31
+};
+
+/** Quiescing states for flushing tables to disk. */
+enum ib_quiesce_t {
+ QUIESCE_NONE,
+ QUIESCE_START, /*!< Initialise, prepare to start */
+ QUIESCE_COMPLETE /*!< All done */
+};
+
+/** Prefix for InnoDB internal tables, adopted from sql/table.h */
+#define TEMP_FILE_PREFIX_INNODB "#sql-ib"
+
+/** Table name wrapper for pretty-printing */
+struct table_name_t
+{
+ /** The name in internal representation */
+ char* m_name;
+
+ /** Default constructor */
+ table_name_t() = default;
+ /** Constructor */
+ table_name_t(char* name) : m_name(name) {}
+
+ /** @return the end of the schema name */
+ const char* dbend() const
+ {
+ const char* sep = strchr(m_name, '/');
+ ut_ad(sep);
+ return sep;
+ }
+
+ /** @return the length of the schema name, in bytes */
+ size_t dblen() const { return size_t(dbend() - m_name); }
+
+ /** Determine the filename-safe encoded table name.
+ @return the filename-safe encoded table name */
+ const char* basename() const { return dbend() + 1; }
+
+ /** The start of the table basename suffix for partitioned tables */
+ static const char part_suffix[4];
+
+ /** Determine the partition or subpartition name suffix.
+ @return the partition name
+ @retval NULL if the table is not partitioned */
+ const char* part() const { return strstr(basename(), part_suffix); }
+
+ /** @return whether this is a temporary or intermediate table name */
+ inline bool is_temporary() const;
+};
+
+#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
+/** Dump the change buffer at startup */
+extern my_bool ibuf_dump;
+/** Flag to control insert buffer debugging. */
+extern uint ibuf_debug;
+#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+
+/** Shift for spatial status */
+#define SPATIAL_STATUS_SHIFT 12
+
+/** Mask to encode/decode spatial status. */
+#define SPATIAL_STATUS_MASK (3U << SPATIAL_STATUS_SHIFT)
+
+#if SPATIAL_STATUS_MASK < REC_VERSION_56_MAX_INDEX_COL_LEN
+# error SPATIAL_STATUS_MASK < REC_VERSION_56_MAX_INDEX_COL_LEN
+#endif
+
+/** whether a col is used in spatial index or regular index
+Note: the spatial status is part of persistent undo log,
+so we should not modify the values in MySQL 5.7 */
+enum spatial_status_t {
+ /* Unknown status (undo format in 5.7.9) */
+ SPATIAL_UNKNOWN = 0,
+
+ /** Not used in gis index. */
+ SPATIAL_NONE = 1,
+
+ /** Used in both spatial index and regular index. */
+ SPATIAL_MIXED = 2,
+
+ /** Only used in spatial index. */
+ SPATIAL_ONLY = 3
+};
+
+#define TABLE_STATS_NAME "mysql/innodb_table_stats"
+#define INDEX_STATS_NAME "mysql/innodb_index_stats"
+
+#endif
diff --git a/storage/innobase/include/dyn0buf.h b/storage/innobase/include/dyn0buf.h
new file mode 100644
index 00000000..06af4dcc
--- /dev/null
+++ b/storage/innobase/include/dyn0buf.h
@@ -0,0 +1,442 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
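As the note above dict_err_ignore_t requires, the modes are or-able bits and every higher mode is a superset of the lower ones, so a single mask test decides whether an error class may be ignored. A hedged sketch with the enum re-declared standalone:

/* Mirror of the enum above: each larger ignore mode contains the bits
   of the smaller ones (4|1 = 5, ALL = 7, ... DROP = 31), so one
   bitwise AND answers "may this error be ignored in this mode?". */
enum ignore_mode : unsigned {
    IGNORE_NONE       = 0,
    IGNORE_FK_NOKEY   = 1,
    IGNORE_INDEX      = 2,
    IGNORE_RECOVER    = 4 | IGNORE_FK_NOKEY,
    IGNORE_ALL        = 7,
    IGNORE_TABLESPACE = 15,
    IGNORE_DROP       = 31
};

bool may_ignore(ignore_mode mode, ignore_mode err_bits)
{
    return (mode & err_bits) == err_bits;
}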
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dyn0buf.h +The dynamically allocated buffer implementation + +Created 2013-03-16 Sunny Bains +*******************************************************/ + +#ifndef dyn0buf_h +#define dyn0buf_h + +#include "mem0mem.h" +#include "dyn0types.h" +#include "ilist.h" + + +/** Class that manages dynamic buffers. It uses a UT_LIST of +mtr_buf_t::block_t instances. We don't use STL containers in +order to avoid the overhead of heap calls. Using a custom memory +allocator doesn't solve the problem either because we have to get +the memory from somewhere. We can't use the block_t::m_data as the +backend for the custom allocator because we would like the data in +the blocks to be contiguous. */ +class mtr_buf_t { +public: + /** SIZE - sizeof(m_node) + sizeof(m_used) */ + enum { MAX_DATA_SIZE = DYN_ARRAY_DATA_SIZE + - sizeof(ilist_node<>) + sizeof(uint32_t) }; + + class block_t : public ilist_node<> { + public: + + block_t() + { + compile_time_assert(MAX_DATA_SIZE <= (2 << 15)); + init(); + } + + /** + Gets the number of used bytes in a block. + @return number of bytes used */ + ulint used() const + MY_ATTRIBUTE((warn_unused_result)) + { + return(static_cast(m_used & ~DYN_BLOCK_FULL_FLAG)); + } + + /** + Gets pointer to the start of data. + @return pointer to data */ + byte* start() + MY_ATTRIBUTE((warn_unused_result)) + { + return(m_data); + } + + /** + @return start of data - non const version */ + byte* begin() + MY_ATTRIBUTE((warn_unused_result)) + { + return(m_data); + } + + /** + @return end of used data - non const version */ + byte* end() + MY_ATTRIBUTE((warn_unused_result)) + { + return(begin() + m_used); + } + + /** + @return start of data - const version */ + const byte* begin() const + MY_ATTRIBUTE((warn_unused_result)) + { + return(m_data); + } + + /** + @return end of used data - const version */ + const byte* end() const + MY_ATTRIBUTE((warn_unused_result)) + { + return(begin() + m_used); + } + + private: + /** + @return pointer to start of reserved space */ + template + Type push(uint32_t size) + { + Type ptr = reinterpret_cast(end()); + + m_used += size; + ut_ad(m_used <= uint32_t(MAX_DATA_SIZE)); + + return(ptr); + } + + /** + Grow the stack. 
*/ + void close(const byte* ptr) + { + /* Check that it is within bounds */ + ut_ad(ptr >= begin()); + ut_ad(ptr <= begin() + m_buf_end); + + /* We have done the boundary check above */ + m_used = uint32_t(ptr - begin()); + + ut_ad(m_used <= MAX_DATA_SIZE); + ut_d(m_buf_end = 0); + } + + /** + Initialise the block */ + void init() + { + m_used = 0; + ut_d(m_buf_end = 0); + ut_d(m_magic_n = DYN_BLOCK_MAGIC_N); + } + private: +#ifdef UNIV_DEBUG + /** If opened then this is the buffer end offset, else 0 */ + ulint m_buf_end; + + /** Magic number (DYN_BLOCK_MAGIC_N) */ + ulint m_magic_n; +#endif /* UNIV_DEBUG */ + + /** Storage */ + byte m_data[MAX_DATA_SIZE]; + + /** number of data bytes used in this block; + DYN_BLOCK_FULL_FLAG is set when the block becomes full */ + uint32_t m_used; + + friend class mtr_buf_t; + }; + + typedef sized_ilist list_t; + + /** Default constructor */ + mtr_buf_t() + : + m_heap(), + m_size() + { + push_back(&m_first_block); + } + + /** Destructor */ + ~mtr_buf_t() + { + erase(); + } + + /** Reset the buffer vector */ + void erase() + { + if (m_heap != NULL) { + mem_heap_free(m_heap); + m_heap = NULL; + + /* Initialise the list and add the first block. */ + m_list.clear(); + m_list.push_back(m_first_block); + } else { + m_first_block.init(); + ut_ad(m_list.size() == 1); + } + + m_size = 0; + } + + /** + Makes room on top and returns a pointer to a buffer in it. After + copying the elements, the caller must close the buffer using close(). + @param size in bytes of the buffer; MUST be <= MAX_DATA_SIZE! + @return pointer to the buffer */ + byte* open(ulint size) + MY_ATTRIBUTE((warn_unused_result)) + { + ut_ad(size > 0); + ut_ad(size <= MAX_DATA_SIZE); + + block_t* block; + + block = has_space(size) ? back() : add_block(); + + ut_ad(block->m_used <= MAX_DATA_SIZE); + ut_d(block->m_buf_end = block->m_used + size); + + return(block->end()); + } + + /** + Closes the buffer returned by open. + @param ptr end of used space */ + void close(const byte* ptr) + { + ut_ad(!m_list.empty()); + block_t* block = back(); + + m_size -= block->used(); + + block->close(ptr); + + m_size += block->used(); + } + + /** + Makes room on top and returns a pointer to the added element. + The caller must copy the element to the pointer returned. + @param size in bytes of the element + @return pointer to the element */ + template + Type push(uint32_t size) + { + ut_ad(size > 0); + ut_ad(size <= MAX_DATA_SIZE); + + block_t* block; + + block = has_space(size) ? back() : add_block(); + + m_size += size; + + /* See ISO C++03 14.2/4 for why "template" is required. */ + + return(block->template push(size)); + } + + /** + Pushes n bytes. + @param str string to write + @param len string length */ + void push(const byte* ptr, uint32_t len) + { + while (len > 0) { + uint32_t n_copied = std::min(len, + uint32_t(MAX_DATA_SIZE)); + ::memmove(push(n_copied), ptr, n_copied); + + ptr += n_copied; + len -= n_copied; + } + } + + /** + Returns a pointer to an element in the buffer. const version. + @param pos position of element in bytes from start + @return pointer to element */ + template + const Type at(ulint pos) const + { + block_t* block = const_cast( + const_cast(this)->find(pos)); + + return(reinterpret_cast(block->begin() + pos)); + } + + /** + Returns a pointer to an element in the buffer. non const version. 
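The byte-wise push() above is the heart of the buffer: a write larger than the space left in the current block is split into chunks of at most MAX_DATA_SIZE bytes, each landing in the current block or in a freshly added one. A standalone sketch of that chunking (toy types; CAP stands in for MAX_DATA_SIZE):

#include <algorithm>
#include <cstdint>
#include <cstring>
#include <list>

struct toy_buf {
    static constexpr uint32_t CAP = 512;        /* MAX_DATA_SIZE stand-in */
    struct block { uint8_t data[CAP]; uint32_t used = 0; };
    std::list<block> blocks = std::list<block>(1);

    void push(const uint8_t* p, uint32_t len)
    {
        while (len > 0) {
            block& b = blocks.back();
            uint32_t room = CAP - b.used;
            if (room == 0) {                    /* block full: add one */
                blocks.emplace_back();
                continue;
            }
            uint32_t n = std::min(len, room);
            std::memcpy(b.data + b.used, p, n);
            b.used += n;
            p += n;
            len -= n;
        }
    }
};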
+ @param pos position of element in bytes from start
+ @return pointer to element */
+ template <typename Type>
+ Type at(ulint pos)
+ {
+ block_t* block = const_cast<block_t*>(find(pos));
+
+ return(reinterpret_cast<Type>(block->begin() + pos));
+ }
+
+ /**
+ Returns the size of the total stored data.
+ @return data size in bytes */
+ ulint size() const
+ MY_ATTRIBUTE((warn_unused_result))
+ {
+#ifdef UNIV_DEBUG
+ ulint total_size = 0;
+
+ for (list_t::iterator it = m_list.begin(), end = m_list.end();
+ it != end; ++it) {
+ total_size += it->used();
+ }
+
+ ut_ad(total_size == m_size);
+#endif /* UNIV_DEBUG */
+ return(m_size);
+ }
+
+ /**
+ Iterate over each block and call the functor.
+ @return false if iteration was terminated. */
+ template <typename Functor>
+ bool for_each_block(const Functor& functor) const
+ {
+ for (list_t::iterator it = m_list.begin(), end = m_list.end();
+ it != end; ++it) {
+
+ if (!functor(&*it)) {
+ return false;
+ }
+ }
+
+ return(true);
+ }
+
+ /**
+ @return the first block */
+ block_t* front()
+ MY_ATTRIBUTE((warn_unused_result))
+ {
+ return &m_list.front();
+ }
+
+ /**
+ @return true if m_first_block block was not filled fully */
+ bool is_small() const
+ MY_ATTRIBUTE((warn_unused_result))
+ {
+ return(m_heap == NULL);
+ }
+
+ /** @return whether the buffer is empty */
+ bool empty() const { return !back()->m_used; }
+
+private:
+ // Disable copying
+ mtr_buf_t(const mtr_buf_t&);
+ mtr_buf_t& operator=(const mtr_buf_t&);
+
+ /**
+ Add the block to the end of the list*/
+ void push_back(block_t* block)
+ {
+ block->init();
+ m_list.push_back(*block);
+ }
+
+ /** @return the last block in the list */
+ block_t* back() const
+ {
+ return &const_cast<block_t&>(m_list.back());
+ }
+
+ /*
+ @return true if request can be fulfilled */
+ bool has_space(ulint size) const
+ {
+ return(back()->m_used + size <= MAX_DATA_SIZE);
+ }
+
+ /*
+ @return true if request can be fulfilled */
+ bool has_space(ulint size)
+ {
+ return(back()->m_used + size <= MAX_DATA_SIZE);
+ }
+
+ /** Find the block that contains the pos.
+ @param pos absolute offset, it is updated to make it relative
+ to the block
+ @return the block containing the pos. */
+ block_t* find(ulint& pos)
+ {
+ ut_ad(!m_list.empty());
+
+ for (list_t::iterator it = m_list.begin(), end = m_list.end();
+ it != end; ++it) {
+
+ if (pos < it->used()) {
+ ut_ad(it->used() >= pos);
+
+ return &*it;
+ }
+
+ pos -= it->used();
+ }
+
+ return NULL;
+ }
+
+ /**
+ Allocate and add a new block to m_list */
+ block_t* add_block()
+ {
+ block_t* block;
+
+ if (m_heap == NULL) {
+ m_heap = mem_heap_create(sizeof(*block));
+ }
+
+ block = reinterpret_cast<block_t*>(
+ mem_heap_alloc(m_heap, sizeof(*block)));
+
+ push_back(block);
+
+ return(block);
+ }
+
+private:
+ /** Heap to use for memory allocation */
+ mem_heap_t* m_heap;
+
+ /** Allocated blocks */
+ list_t m_list;
+
+ /** Total size used by all blocks */
+ ulint m_size;
+
+ /** The default block, should always be the first element. This
+ is for backwards compatibility and to avoid an extra heap allocation
+ for small REDO log records */
+ block_t m_first_block;
+};
+
+#endif /* dyn0buf_h */
diff --git a/storage/innobase/include/dyn0types.h b/storage/innobase/include/dyn0types.h
new file mode 100644
index 00000000..83d0b0d6
--- /dev/null
+++ b/storage/innobase/include/dyn0types.h
@@ -0,0 +1,39 @@
+/*****************************************************************************
+
+Copyright (c) 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2020, MariaDB Corporation.
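The find() member above resolves an absolute offset to a block by walking the list and subtracting each block's used size, rebasing the offset as it goes; at() relies on this. A minimal standalone sketch:

#include <cstddef>
#include <list>

struct blk { std::size_t used; /* payload omitted */ };

/* Walk the block list until the absolute offset falls inside a block;
   pos is rebased to a block-relative offset on the way. */
blk* find_block(std::list<blk>& blocks, std::size_t& pos)
{
    for (blk& b : blocks) {
        if (pos < b.used) {
            return &b;
        }
        pos -= b.used;
    }
    return nullptr;        /* offset beyond the stored data */
}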
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/dyn0types.h +The dynamically allocated buffer types and constants + +Created 2013-03-16 Sunny Bains +*******************************************************/ + +#ifndef dyn0types_h +#define dyn0types_h + +/** Value of dyn_block_t::magic_n */ +#define DYN_BLOCK_MAGIC_N 375767 + +/** This is the initial 'payload' size of a dynamic array */ +#define DYN_ARRAY_DATA_SIZE 512 + +/** Flag for dyn_block_t::used that indicates a full block */ +#define DYN_BLOCK_FULL_FLAG 0x1000000UL + +#endif /* dyn0types_h */ diff --git a/storage/innobase/include/eval0eval.h b/storage/innobase/include/eval0eval.h new file mode 100644 index 00000000..a3ea0462 --- /dev/null +++ b/storage/innobase/include/eval0eval.h @@ -0,0 +1,109 @@ +/***************************************************************************** + +Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/eval0eval.h +SQL evaluator: evaluates simple data structures, like expressions, in +a query graph + +Created 12/29/1997 Heikki Tuuri +*******************************************************/ + +#ifndef eval0eval_h +#define eval0eval_h + +#include "que0types.h" +#include "pars0sym.h" +#include "pars0pars.h" + +/*****************************************************************//** +Free the buffer from global dynamic memory for a value of a que_node, +if it has been allocated in the above function. The freeing for pushed +column values is done in sel_col_prefetch_buf_free. */ +void +eval_node_free_val_buf( +/*===================*/ + que_node_t* node); /*!< in: query graph node */ +/*****************************************************************//** +Evaluates a symbol table symbol. */ +UNIV_INLINE +void +eval_sym( +/*=====*/ + sym_node_t* sym_node); /*!< in: symbol table node */ +/*****************************************************************//** +Evaluates an expression. 
*/ +UNIV_INLINE +void +eval_exp( +/*=====*/ + que_node_t* exp_node); /*!< in: expression */ +/*****************************************************************//** +Sets an integer value as the value of an expression node. */ +UNIV_INLINE +void +eval_node_set_int_val( +/*==================*/ + que_node_t* node, /*!< in: expression node */ + lint val); /*!< in: value to set */ +/*****************************************************************//** +Gets an integer value from an expression node. +@return integer value */ +UNIV_INLINE +lint +eval_node_get_int_val( +/*==================*/ + que_node_t* node); /*!< in: expression node */ +/*****************************************************************//** +Copies a binary string value as the value of a query graph node. Allocates a +new buffer if necessary. */ +UNIV_INLINE +void +eval_node_copy_and_alloc_val( +/*=========================*/ + que_node_t* node, /*!< in: query graph node */ + const byte* str, /*!< in: binary string */ + ulint len); /*!< in: string length or UNIV_SQL_NULL */ +/*****************************************************************//** +Copies a query node value to another node. */ +UNIV_INLINE +void +eval_node_copy_val( +/*===============*/ + que_node_t* node1, /*!< in: node to copy to */ + que_node_t* node2); /*!< in: node to copy from */ +/*****************************************************************//** +Gets a iboolean value from a query node. +@return iboolean value */ +UNIV_INLINE +ibool +eval_node_get_ibool_val( +/*====================*/ + que_node_t* node); /*!< in: query graph node */ +/*****************************************************************//** +Evaluates a comparison node. +@return the result of the comparison */ +ibool +eval_cmp( +/*=====*/ + func_node_t* cmp_node); /*!< in: comparison node */ + + +#include "eval0eval.inl" + +#endif diff --git a/storage/innobase/include/eval0eval.inl b/storage/innobase/include/eval0eval.inl new file mode 100644 index 00000000..0ea4057f --- /dev/null +++ b/storage/innobase/include/eval0eval.inl @@ -0,0 +1,254 @@ +/***************************************************************************** + +Copyright (c) 1997, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/eval0eval.ic +SQL evaluator: evaluates simple data structures, like expressions, in +a query graph + +Created 12/29/1997 Heikki Tuuri +*******************************************************/ + +#include "que0que.h" +#include "rem0cmp.h" +#include "pars0grm.h" + +/*****************************************************************//** +Evaluates a function node. 
*/ +void +eval_func( +/*======*/ + func_node_t* func_node); /*!< in: function node */ +/*****************************************************************//** +Allocate a buffer from global dynamic memory for a value of a que_node. +NOTE that this memory must be explicitly freed when the query graph is +freed. If the node already has allocated buffer, that buffer is freed +here. NOTE that this is the only function where dynamic memory should be +allocated for a query node val field. +@return pointer to allocated buffer */ +byte* +eval_node_alloc_val_buf( +/*====================*/ + que_node_t* node, /*!< in: query graph node; sets the val field + data field to point to the new buffer, and + len field equal to size */ + ulint size); /*!< in: buffer size */ + + +/*****************************************************************//** +Allocates a new buffer if needed. +@return pointer to buffer */ +UNIV_INLINE +byte* +eval_node_ensure_val_buf( +/*=====================*/ + que_node_t* node, /*!< in: query graph node; sets the val field + data field to point to the new buffer, and + len field equal to size */ + ulint size) /*!< in: buffer size */ +{ + dfield_t* dfield; + byte* data; + + dfield = que_node_get_val(node); + dfield_set_len(dfield, size); + + data = static_cast(dfield_get_data(dfield)); + + if (!data || que_node_get_val_buf_size(node) < size) { + + data = eval_node_alloc_val_buf(node, size); + } + + return(data); +} + +/*****************************************************************//** +Evaluates a symbol table symbol. */ +UNIV_INLINE +void +eval_sym( +/*=====*/ + sym_node_t* sym_node) /*!< in: symbol table node */ +{ + + ut_ad(que_node_get_type(sym_node) == QUE_NODE_SYMBOL); + + if (sym_node->indirection) { + /* The symbol table node is an alias for a variable or a + column */ + + dfield_copy_data(que_node_get_val(sym_node), + que_node_get_val(sym_node->indirection)); + } +} + +/*****************************************************************//** +Evaluates an expression. */ +UNIV_INLINE +void +eval_exp( +/*=====*/ + que_node_t* exp_node) /*!< in: expression */ +{ + if (que_node_get_type(exp_node) == QUE_NODE_SYMBOL) { + + eval_sym((sym_node_t*) exp_node); + + return; + } + + eval_func(static_cast(exp_node)); +} + +/*****************************************************************//** +Sets an integer value as the value of an expression node. */ +UNIV_INLINE +void +eval_node_set_int_val( +/*==================*/ + que_node_t* node, /*!< in: expression node */ + lint val) /*!< in: value to set */ +{ + dfield_t* dfield; + byte* data; + + dfield = que_node_get_val(node); + + data = static_cast(dfield_get_data(dfield)); + + if (data == NULL) { + data = eval_node_alloc_val_buf(node, 4); + } + + ut_ad(dfield_get_len(dfield) == 4); + + mach_write_to_4(data, (ulint) val); +} + +/*****************************************************************//** +Gets an integer non-SQL null value from an expression node. +@return integer value */ +UNIV_INLINE +lint +eval_node_get_int_val( +/*==================*/ + que_node_t* node) /*!< in: expression node */ +{ + const byte* ptr; + dfield_t* dfield; + + dfield = que_node_get_val(node); + ptr = static_cast(dfield_get_data(dfield)); + + ut_ad(dfield_get_len(dfield) == 4); + + return((int) mach_read_from_4(ptr)); +} + +/*****************************************************************//** +Gets a iboolean value from a query node. 
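eval_node_set_int_val() and eval_node_get_int_val() above round-trip the integer through a fixed 4-byte field using mach_write_to_4() and mach_read_from_4(), which store the most significant byte first. A standalone equivalent of the pair:

#include <cstdint>

/* Big-endian 4-byte store/load, as mach_write_to_4()/mach_read_from_4()
   do for the dfield backing an integer expression node. */
void write4(uint8_t* b, uint32_t v)
{
    b[0] = uint8_t(v >> 24);
    b[1] = uint8_t(v >> 16);
    b[2] = uint8_t(v >> 8);
    b[3] = uint8_t(v);
}

uint32_t read4(const uint8_t* b)
{
    return uint32_t(b[0]) << 24 | uint32_t(b[1]) << 16
         | uint32_t(b[2]) << 8  | uint32_t(b[3]);
}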
+@return iboolean value */ +UNIV_INLINE +ibool +eval_node_get_ibool_val( +/*====================*/ + que_node_t* node) /*!< in: query graph node */ +{ + dfield_t* dfield; + byte* data; + + dfield = que_node_get_val(node); + + data = static_cast(dfield_get_data(dfield)); + + ut_ad(data != NULL); + + return(mach_read_from_1(data)); +} + +/*****************************************************************//** +Sets a iboolean value as the value of a function node. */ +UNIV_INLINE +void +eval_node_set_ibool_val( +/*====================*/ + func_node_t* func_node, /*!< in: function node */ + ibool val) /*!< in: value to set */ +{ + dfield_t* dfield; + byte* data; + + dfield = que_node_get_val(func_node); + + data = static_cast(dfield_get_data(dfield)); + + if (data == NULL) { + /* Allocate 1 byte to hold the value */ + + data = eval_node_alloc_val_buf(func_node, 1); + } + + ut_ad(dfield_get_len(dfield) == 1); + + mach_write_to_1(data, val); +} + +/*****************************************************************//** +Copies a binary string value as the value of a query graph node. Allocates a +new buffer if necessary. */ +UNIV_INLINE +void +eval_node_copy_and_alloc_val( +/*=========================*/ + que_node_t* node, /*!< in: query graph node */ + const byte* str, /*!< in: binary string */ + ulint len) /*!< in: string length or UNIV_SQL_NULL */ +{ + byte* data; + + if (len == UNIV_SQL_NULL) { + dfield_set_len(que_node_get_val(node), len); + + return; + } + + data = eval_node_ensure_val_buf(node, len); + + memcpy(data, str, len); +} + +/*****************************************************************//** +Copies a query node value to another node. */ +UNIV_INLINE +void +eval_node_copy_val( +/*===============*/ + que_node_t* node1, /*!< in: node to copy to */ + que_node_t* node2) /*!< in: node to copy from */ +{ + dfield_t* dfield2; + + dfield2 = que_node_get_val(node2); + + eval_node_copy_and_alloc_val( + node1, + static_cast(dfield_get_data(dfield2)), + dfield_get_len(dfield2)); +} diff --git a/storage/innobase/include/eval0proc.h b/storage/innobase/include/eval0proc.h new file mode 100644 index 00000000..a93140bf --- /dev/null +++ b/storage/innobase/include/eval0proc.h @@ -0,0 +1,94 @@ +/***************************************************************************** + +Copyright (c) 1998, 2016, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
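eval_node_copy_and_alloc_val() above combines two conventions: a sentinel length (UNIV_SQL_NULL) records SQL NULL without touching the data buffer, and the buffer is grown only when too small. A sketch of the same ensure-then-copy flow, with SIZE_MAX standing in for UNIV_SQL_NULL:

#include <cstdint>
#include <cstring>
#include <vector>

struct value { std::vector<uint8_t> buf; std::size_t len = 0; };

void copy_val(value& v, const uint8_t* str, std::size_t len)
{
    v.len = len;
    if (len == SIZE_MAX) {
        return;                          /* SQL NULL: no data to copy */
    }
    if (v.buf.size() < len) {
        v.buf.resize(len);               /* grow only when needed */
    }
    std::memcpy(v.buf.data(), str, len);
}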
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/eval0proc.h +Executes SQL stored procedures and their control structures + +Created 1/20/1998 Heikki Tuuri +*******************************************************/ + +#ifndef eval0proc_h +#define eval0proc_h + +#include "que0types.h" +#include "pars0sym.h" +#include "pars0pars.h" + +/**********************************************************************//** +Performs an execution step of a procedure node. +@return query thread to run next or NULL */ +UNIV_INLINE +que_thr_t* +proc_step( +/*======*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Performs an execution step of an if-statement node. +@return query thread to run next or NULL */ +que_thr_t* +if_step( +/*====*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Performs an execution step of a while-statement node. +@return query thread to run next or NULL */ +que_thr_t* +while_step( +/*=======*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Performs an execution step of a for-loop node. +@return query thread to run next or NULL */ +que_thr_t* +for_step( +/*=====*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Performs an execution step of an assignment statement node. +@return query thread to run next or NULL */ +que_thr_t* +assign_step( +/*========*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Performs an execution step of a procedure call node. +@return query thread to run next or NULL */ +UNIV_INLINE +que_thr_t* +proc_eval_step( +/*===========*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Performs an execution step of an exit statement node. +@return query thread to run next or NULL */ +que_thr_t* +exit_step( +/*======*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Performs an execution step of a return-statement node. +@return query thread to run next or NULL */ +que_thr_t* +return_step( +/*========*/ + que_thr_t* thr); /*!< in: query thread */ + +#include "eval0proc.inl" + +#endif diff --git a/storage/innobase/include/eval0proc.inl b/storage/innobase/include/eval0proc.inl new file mode 100644 index 00000000..b0c5f75b --- /dev/null +++ b/storage/innobase/include/eval0proc.inl @@ -0,0 +1,88 @@ +/***************************************************************************** + +Copyright (c) 1998, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/eval0proc.ic +Executes SQL stored procedures and their control structures + +Created 1/20/1998 Heikki Tuuri +*******************************************************/ + +#include "pars0pars.h" +#include "que0que.h" +#include "eval0eval.h" + +/**********************************************************************//** +Performs an execution step of a procedure node. +@return query thread to run next or NULL */ +UNIV_INLINE +que_thr_t* +proc_step( +/*======*/ + que_thr_t* thr) /*!< in: query thread */ +{ + proc_node_t* node; + + ut_ad(thr); + + node = static_cast(thr->run_node); + ut_ad(que_node_get_type(node) == QUE_NODE_PROC); + + if (thr->prev_node == que_node_get_parent(node)) { + /* Start execution from the first statement in the statement + list */ + + thr->run_node = node->stat_list; + } else { + /* Move to the next statement */ + ut_ad(que_node_get_next(thr->prev_node) == NULL); + + thr->run_node = NULL; + } + + if (thr->run_node == NULL) { + thr->run_node = que_node_get_parent(node); + } + + return(thr); +} + +/**********************************************************************//** +Performs an execution step of a procedure call node. +@return query thread to run next or NULL */ +UNIV_INLINE +que_thr_t* +proc_eval_step( +/*===========*/ + que_thr_t* thr) /*!< in: query thread */ +{ + func_node_t* node; + + ut_ad(thr); + + node = static_cast(thr->run_node); + ut_ad(que_node_get_type(node) == QUE_NODE_FUNC); + + /* Evaluate the procedure */ + + eval_exp(node); + + thr->run_node = que_node_get_parent(node); + + return(thr); +} diff --git a/storage/innobase/include/fil0crypt.h b/storage/innobase/include/fil0crypt.h new file mode 100644 index 00000000..f43965cd --- /dev/null +++ b/storage/innobase/include/fil0crypt.h @@ -0,0 +1,396 @@ +/***************************************************************************** +Copyright (C) 2013, 2015, Google Inc. All Rights Reserved. +Copyright (c) 2015, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
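proc_step() above is a cursor over a statement list: entry from the parent starts at the first statement, re-entry advances, and an exhausted list hands control back to the parent node. A simplified standalone rendering of that control flow:

#include <cstddef>
#include <vector>

struct stmt { /* opaque statement */ };

/* One interpreter step over a statement list: entry from the parent
   positions the cursor at the first statement; otherwise it advances.
   A null return means "run the parent node next". */
const stmt* step(const std::vector<stmt>& list, std::size_t& pos,
                 bool from_parent)
{
    if (from_parent) {
        pos = 0;
    } else {
        ++pos;
    }
    return pos < list.size() ? &list[pos] : nullptr;
}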
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fil0crypt.h
+The low-level file system encryption support functions
+
+Created 04/01/2015 Jan Lindström
+*******************************************************/
+
+#ifndef fil0crypt_h
+#define fil0crypt_h
+
+#include "my_crypt.h"
+#include "fil0fil.h"
+
+/**
+* Magic pattern in start of crypt data on page 0
+*/
+#define MAGIC_SZ 6
+
+static const unsigned char CRYPT_MAGIC[MAGIC_SZ] = {
+ 's', 0xE, 0xC, 'R', 'E', 't' };
+
+/* This key will be used if nothing else is given */
+#define FIL_DEFAULT_ENCRYPTION_KEY ENCRYPTION_KEY_SYSTEM_DATA
+
+/** Wake up the encryption threads */
+void fil_crypt_threads_signal(bool broadcast= false);
+
+/**
+ * CRYPT_SCHEME_UNENCRYPTED
+ *
+ * Used as intermediate state when converting a space from unencrypted
+ * to encrypted
+ */
+/**
+ * CRYPT_SCHEME_1
+ *
+ * xxx is AES_CTR or AES_CBC (or another block cypher with the same key and iv lengths)
+ * L = AES_ECB(KEY, IV)
+ * CRYPT(PAGE) = xxx(KEY=L, IV=C, PAGE)
+ */
+
+#define CRYPT_SCHEME_1 1
+#define CRYPT_SCHEME_1_IV_LEN 16
+#define CRYPT_SCHEME_UNENCRYPTED 0
+
+/* Cached L or key for given key_version */
+struct key_struct
+{
+ uint key_version; /*!< Version of the key */
+ uint key_length; /*!< Key length */
+ unsigned char key[MY_AES_MAX_KEY_LENGTH]; /*!< Cached key
+ (that is L in CRYPT_SCHEME_1) */
+};
+
+/** is encryption enabled */
+extern ulong srv_encrypt_tables;
+
+/** Mutex helper for crypt_data->scheme
+@param[in, out] scheme encryption scheme
+@param[in] exit should we exit or enter mutex ? */
+void
+crypt_data_scheme_locker(
+ st_encryption_scheme* scheme,
+ int exit);
+
+struct fil_space_rotate_state_t
+{
+ time_t start_time; /*!< time when rotation started */
+ ulint active_threads; /*!< active threads in space */
+ uint32_t next_offset; /*!< next "free" offset */
+ uint32_t max_offset; /*!< max offset needing to be rotated */
+ uint min_key_version_found; /*!< min key version found but not
+ rotated */
+ lsn_t end_lsn; /*!< max lsn created when rotating this
+ space */
+ bool starting; /*!< initial write of IV */
+ bool flushing; /*!< space is being flushed at end of rotate */
+};
+
+#ifndef UNIV_INNOCHECKSUM
+
+struct fil_space_crypt_t : st_encryption_scheme
+{
+ public:
+ /** Constructor. Does not initialize the members!
+ The object is expected to be placed in a buffer that
+ has been zero-initialized.
*/ + fil_space_crypt_t( + uint new_type, + uint new_min_key_version, + uint new_key_id, + fil_encryption_t new_encryption) + : st_encryption_scheme(), + min_key_version(new_min_key_version), + encryption(new_encryption), + key_found(0), + rotate_state() + { + key_id = new_key_id; + my_random_bytes(iv, sizeof(iv)); + mysql_mutex_init(0, &mutex, nullptr); + locker = crypt_data_scheme_locker; + type = new_type; + + if (new_encryption == FIL_ENCRYPTION_OFF || + (!srv_encrypt_tables && + new_encryption == FIL_ENCRYPTION_DEFAULT)) { + type = CRYPT_SCHEME_UNENCRYPTED; + } else { + type = CRYPT_SCHEME_1; + min_key_version = key_get_latest_version(); + } + + key_found = min_key_version; + } + + /** Destructor */ + ~fil_space_crypt_t() + { + mysql_mutex_destroy(&mutex); + } + + /** Get latest key version from encryption plugin + @retval key_version or + @retval ENCRYPTION_KEY_VERSION_INVALID if used key_id + is not found from encryption plugin. */ + uint key_get_latest_version(void); + + /** Returns true if key was found from encryption plugin + and false if not. */ + bool is_key_found() const { + return key_found != ENCRYPTION_KEY_VERSION_INVALID; + } + + /** Returns true if tablespace should be encrypted */ + bool should_encrypt() const { + return ((encryption == FIL_ENCRYPTION_ON) || + (srv_encrypt_tables && + encryption == FIL_ENCRYPTION_DEFAULT)); + } + + /** Return true if tablespace is encrypted. */ + bool is_encrypted() const { + return (encryption != FIL_ENCRYPTION_OFF); + } + + /** Return true if default tablespace encryption is used, */ + bool is_default_encryption() const { + return (encryption == FIL_ENCRYPTION_DEFAULT); + } + + /** Return true if tablespace is not encrypted. */ + bool not_encrypted() const { + return (encryption == FIL_ENCRYPTION_OFF); + } + + /** Write encryption metadata to the first page. + @param[in,out] block first page of the tablespace + @param[in,out] mtr mini-transaction */ + void write_page0(buf_block_t* block, mtr_t* mtr); + + uint min_key_version; // min key version for this space + fil_encryption_t encryption; // Encryption setup + + mysql_mutex_t mutex; // mutex protecting following variables + + /** Return code from encryption_key_get_latest_version. + If ENCRYPTION_KEY_VERSION_INVALID encryption plugin + could not find the key and there is no need to call + get_latest_key_version again as keys are read only + at startup. 
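should_encrypt() above resolves the same kind of three-way setting as the statistics flags earlier: explicit ON always encrypts, explicit OFF never does, and DEFAULT follows the global innodb_encrypt_tables setting. As a standalone predicate:

enum fil_enc_mode { ENC_DEFAULT, ENC_ON, ENC_OFF };

/* Mirror of fil_space_crypt_t::should_encrypt() above: ON forces
   encryption, OFF forbids it, DEFAULT defers to the server setting. */
bool should_encrypt(fil_enc_mode mode, bool srv_encrypt_tables)
{
    return mode == ENC_ON
        || (mode == ENC_DEFAULT && srv_encrypt_tables);
}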
*/ + uint key_found; + + fil_space_rotate_state_t rotate_state; +}; + +/** Status info about encryption */ +struct fil_space_crypt_status_t { + ulint space; /*!< tablespace id */ + ulint scheme; /*!< encryption scheme */ + uint min_key_version; /*!< min key version */ + uint current_key_version;/*!< current key version */ + uint keyserver_requests;/*!< no of key requests to key server */ + uint key_id; /*!< current key_id */ + bool rotating; /*!< is key rotation ongoing */ + bool flushing; /*!< is flush at end of rotation ongoing */ + ulint rotate_next_page_number; /*!< next page if key rotating */ + ulint rotate_max_page_number; /*!< max page if key rotating */ +}; + +/** Statistics about encryption key rotation */ +struct fil_crypt_stat_t +{ + ulint pages_read_from_cache= 0; + ulint pages_read_from_disk= 0; + ulint pages_modified= 0; + ulint pages_flushed= 0; + ulint estimated_iops= 0; +}; + +/** Init space crypt */ +void fil_space_crypt_init(); + +/** Cleanup space crypt */ +void fil_space_crypt_cleanup(); + +/** +Create a fil_space_crypt_t object +@param[in] encrypt_mode FIL_ENCRYPTION_DEFAULT or + FIL_ENCRYPTION_ON or + FIL_ENCRYPTION_OFF + +@param[in] key_id Encryption key id +@return crypt object */ +fil_space_crypt_t* +fil_space_create_crypt_data( + fil_encryption_t encrypt_mode, + uint key_id) + MY_ATTRIBUTE((warn_unused_result)); + +/** Initialize encryption parameters from a tablespace header page. +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] page first page of the tablespace +@return crypt data from page 0 +@retval NULL if not present or not valid */ +fil_space_crypt_t* fil_space_read_crypt_data(ulint zip_size, const byte* page) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** +Free a crypt data object +@param[in,out] crypt_data crypt data to be freed */ +void fil_space_destroy_crypt_data(fil_space_crypt_t **crypt_data); + +/** Amend encryption information from redo log. +@param[in] space tablespace +@param[in] data encryption metadata */ +void fil_crypt_parse(fil_space_t* space, const byte* data); + +/** Encrypt a buffer. +@param[in,out] crypt_data Crypt data +@param[in] space space_id +@param[in] offset Page offset +@param[in] src_frame Page to encrypt +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in,out] dst_frame Output buffer +@param[in] use_full_checksum full crc32 algo is used +@return encrypted buffer or NULL */ +byte* +fil_encrypt_buf( + fil_space_crypt_t* crypt_data, + ulint space, + ulint offset, + const byte* src_frame, + ulint zip_size, + byte* dst_frame, + bool use_full_checksum) + MY_ATTRIBUTE((warn_unused_result)); + +/** +Encrypt a page. + +@param[in] space Tablespace +@param[in] offset Page offset +@param[in] src_frame Page to encrypt +@param[in,out] dst_frame Output buffer +@return encrypted buffer or NULL */ +byte* fil_space_encrypt( + const fil_space_t* space, + ulint offset, + byte* src_frame, + byte* dst_frame) + MY_ATTRIBUTE((warn_unused_result)); + +/** Decrypt a page. 
+@param[in] space_id space id
+@param[in] fsp_flags Tablespace flags
+@param[in] crypt_data crypt_data
+@param[in] tmp_frame Temporary buffer
+@param[in] physical_size page size
+@param[in,out] src_frame Page to decrypt
+@retval DB_SUCCESS on success
+@retval DB_DECRYPTION_FAILED on error */
+dberr_t
+fil_space_decrypt(
+ uint32_t space_id,
+ uint32_t fsp_flags,
+ fil_space_crypt_t* crypt_data,
+ byte* tmp_frame,
+ ulint physical_size,
+ byte* src_frame);
+
+/******************************************************************
+Decrypt a page
+@param[in] space Tablespace
+@param[in] tmp_frame Temporary buffer used for decrypting
+@param[in,out] src_frame Page to decrypt
+@return decrypted page, or original not encrypted page if decryption is
+not needed.
+@retval nullptr on failure */
+byte*
+fil_space_decrypt(
+ const fil_space_t* space,
+ byte* tmp_frame,
+ byte* src_frame)
+ MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************************
+Adjust thread count for key rotation
+@param[in] new_cnt Number of threads to be used */
+void fil_crypt_set_thread_cnt(const uint new_cnt);
+
+/*********************************************************************
+Adjust max key age
+@param[in] val New max key age */
+void fil_crypt_set_rotate_key_age(uint val);
+
+/*********************************************************************
+Adjust rotation iops
+@param[in] val New max rotation iops */
+void fil_crypt_set_rotation_iops(uint val);
+
+/*********************************************************************
+Adjust encrypt tables
+@param[in] val New setting for innodb-encrypt-tables */
+void fil_crypt_set_encrypt_tables(ulong val);
+
+/*********************************************************************
+Init threads for key rotation */
+void fil_crypt_threads_init();
+
+/*********************************************************************
+Clean up key rotation threads resources */
+void fil_crypt_threads_cleanup();
+
+/*********************************************************************
+Wait for crypt threads to stop accessing space
+@param[in] space Tablespace */
+void fil_space_crypt_close_tablespace(const fil_space_t *space);
+
+/*********************************************************************
+Get crypt status for a space (used by information_schema)
+@param[in] space Tablespace
+@param[out] status Crypt status
+return 0 if crypt data present */
+void
+fil_space_crypt_get_status(
+ const fil_space_t* space,
+ struct fil_space_crypt_status_t* status);
+
+/*********************************************************************
+Return crypt statistics
+@param[out] stat Crypt statistics */
+void fil_crypt_total_stat(fil_crypt_stat_t *stat);
+
+#include "fil0crypt.inl"
+#endif /* !UNIV_INNOCHECKSUM */
+
+/**
+Verify that the post-encryption checksum matches the calculated checksum.
+This function should be called only if the tablespace contains crypt_data
+metadata (a strong indication that the tablespace is encrypted).
+The function also verifies that the traditional checksum does not match the
+calculated checksum, because if it does, the page could be valid unencrypted,
+encrypted, or corrupted.
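fil_space_read_crypt_data(), declared above, only treats page 0 as carrying encryption metadata when the CRYPT_MAGIC pattern is found at the expected position. A hedged sketch of that gate; the offset parameter is an assumption here (the real code derives it from the tablespace flags):

#include <cstring>

static const unsigned char MAGIC[6] = { 's', 0xE, 0xC, 'R', 'E', 't' };

/* True if the 6-byte crypt magic is present at crypt_offset on the
   first page; only then is the rest of the metadata worth parsing. */
bool has_crypt_magic(const unsigned char* page0, std::size_t crypt_offset)
{
    return std::memcmp(page0 + crypt_offset, MAGIC, sizeof MAGIC) == 0;
}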
+ +@param[in,out] page page frame (checksum is temporarily modified) +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@return true if page is encrypted AND OK, false otherwise */ +bool fil_space_verify_crypt_checksum(const byte* page, ulint zip_size) + MY_ATTRIBUTE((warn_unused_result)); + +/** Add the tablespace to the rotation list if +innodb_encrypt_rotate_key_age is 0 or encryption plugin does +not do key version rotation +@return whether the tablespace should be added to rotation list */ +bool fil_crypt_must_default_encrypt(); + +#endif /* fil0crypt_h */ diff --git a/storage/innobase/include/fil0crypt.inl b/storage/innobase/include/fil0crypt.inl new file mode 100644 index 00000000..cc59b394 --- /dev/null +++ b/storage/innobase/include/fil0crypt.inl @@ -0,0 +1,81 @@ +/***************************************************************************** + +Copyright (c) 2015, 2017, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/fil0crypt.ic +The low-level file system encryption support functions + +Created 04/01/2015 Jan Lindström +*******************************************************/ + +/*******************************************************************//** +Find out whether the page is page encrypted +@return true if page is page encrypted, false if not */ +UNIV_INLINE +bool +fil_page_is_encrypted( +/*==================*/ + const byte *buf) /*!< in: page */ +{ + return(mach_read_from_4(buf+FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION) != 0); +} + +/*******************************************************************//** +Get current encryption mode from crypt_data. +@return string representation */ +UNIV_INLINE +const char * +fil_crypt_get_mode( +/*===============*/ + const fil_space_crypt_t* crypt_data) +{ + switch (crypt_data->encryption) { + case FIL_ENCRYPTION_DEFAULT: + return("Default tablespace encryption mode"); + case FIL_ENCRYPTION_ON: + return("Tablespace encrypted"); + case FIL_ENCRYPTION_OFF: + return("Tablespace not encrypted"); + } + + ut_error; + return ("NULL"); +} + +/*******************************************************************//** +Get current encryption type from crypt_data. 
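fil_page_is_encrypted() above tests the 4-byte key-version field in the page header; zero means an unencrypted page. A standalone sketch, assuming the field sits at byte offset 26 (FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION):

#include <cstdint>

/* A page is treated as encrypted when the big-endian key-version
   field in its header is nonzero; the offset 26 is an assumption. */
bool page_is_encrypted(const uint8_t* page)
{
    const uint8_t* p = page + 26;
    uint32_t key_version = uint32_t(p[0]) << 24 | uint32_t(p[1]) << 16
                         | uint32_t(p[2]) << 8  | uint32_t(p[3]);
    return key_version != 0;
}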
+@return string representation */
+UNIV_INLINE
+const char *
+fil_crypt_get_type(
+ const fil_space_crypt_t* crypt_data)
+{
+ ut_ad(crypt_data != NULL);
+ switch (crypt_data->type) {
+ case CRYPT_SCHEME_UNENCRYPTED:
+ return("scheme unencrypted");
+ break;
+ case CRYPT_SCHEME_1:
+ return("scheme encrypted");
+ break;
+ default:
+ ut_error;
+ }
+
+ return ("NULL");
+}
diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h
new file mode 100644
index 00000000..6f58e3c1
--- /dev/null
+++ b/storage/innobase/include/fil0fil.h
@@ -0,0 +1,1823 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fil0fil.h
+The low-level file system
+
+Created 10/25/1995 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "fsp0types.h"
+#include "mach0data.h"
+#include "assume_aligned.h"
+
+#ifndef UNIV_INNOCHECKSUM
+
+#include "srw_lock.h"
+#include "buf0dblwr.h"
+#include "hash0hash.h"
+#include "log0recv.h"
+#include "dict0types.h"
+#include "ilist.h"
+#include
+#include
+
+struct unflushed_spaces_tag_t;
+struct default_encrypt_tag_t;
+struct space_list_tag_t;
+struct named_spaces_tag_t;
+
+using space_list_t= ilist<fil_space_t, space_list_tag_t>;
+
+// Forward declaration
+extern my_bool srv_use_doublewrite_buf;
+
+/** Possible values of innodb_flush_method */
+enum srv_flush_t
+{
+ /** fsync, the default */
+ SRV_FSYNC= 0,
+ /** open log files in O_DSYNC mode */
+ SRV_O_DSYNC,
+ /** do not call os_file_flush() when writing data files, but do flush
+ after writing to log files */
+ SRV_LITTLESYNC,
+ /** do not flush after writing */
+ SRV_NOSYNC,
+ /** invoke os_file_set_nocache() on data files. This implies using
+ unbuffered I/O but still fdatasync(), because some filesystems might
+ not flush meta-data on write completion */
+ SRV_O_DIRECT,
+ /** Like O_DIRECT, but skip fdatasync(), assuming that the data is
+ durable on write completion */
+ SRV_O_DIRECT_NO_FSYNC
+#ifdef _WIN32
+ /** Traditional Windows approach to open all files without caching,
+ and do FlushFileBuffers() */
+ ,SRV_ALL_O_DIRECT_FSYNC
+#endif
+};
+
+/** innodb_flush_method */
+extern ulong srv_file_flush_method;
+
+/** Undo tablespaces start with this space_id. */
+extern uint32_t srv_undo_space_id_start;
+/** The number of UNDO tablespaces that are open and ready to use. */
+extern uint32_t srv_undo_tablespaces_open;
+
+/** Check whether given space id is undo tablespace id
+@param[in] space_id space id to check
+@return true if it is undo tablespace else false.
+/** Check whether a given space_id is an undo tablespace ID
+@param[in]	space_id	space_id to check
+@return true if it is an undo tablespace, else false */
+inline bool srv_is_undo_tablespace(uint32_t space_id)
+{
+  return srv_undo_space_id_start > 0 &&
+    space_id >= srv_undo_space_id_start &&
+    space_id < srv_undo_space_id_start + srv_undo_tablespaces_open;
+}
+
+class page_id_t;
+
+/** Structure containing encryption specification */
+struct fil_space_crypt_t;
+
+/** File types */
+enum fil_type_t {
+	/** temporary tablespace (temporary undo log or tables) */
+	FIL_TYPE_TEMPORARY,
+	/** a tablespace that is being imported (no logging until finished) */
+	FIL_TYPE_IMPORT,
+	/** persistent tablespace (for system, undo log or tables) */
+	FIL_TYPE_TABLESPACE,
+};
+
+struct fil_node_t;
+
+/** Structure to store the first and last value of a range */
+struct range_t
+{
+  uint32_t first;
+  uint32_t last;
+};
+
+/** Sort the ranges by their first value */
+struct range_compare
+{
+  bool operator() (const range_t lhs, const range_t rhs) const
+  {
+    return lhs.first < rhs.first;
+  }
+};
+
+using range_set_t= std::set<range_t, range_compare>;
+/** Set of ranges of integers */
+class range_set
+{
+private:
+  range_set_t ranges;
+
+  range_set_t::iterator find(uint32_t value) const
+  {
+    auto r_offset= ranges.lower_bound({value, value});
+    const auto r_end= ranges.end();
+    if (r_offset == r_end)
+    {
+      if (empty())
+        return r_end;
+      r_offset= std::prev(r_end);
+    }
+    if (r_offset->first <= value && r_offset->last >= value)
+      return r_offset;
+    return r_end;
+  }
+public:
+  /** Merge the current range with the previous range.
+  @param[in]	range		range to be merged
+  @param[in]	prev_range	range to be merged with the next */
+  void merge_range(range_set_t::iterator range,
+                   range_set_t::iterator prev_range)
+  {
+    if (range->first != prev_range->last + 1)
+      return;
+
+    /* Merge the current range with the previous range */
+    range_t new_range {prev_range->first, range->last};
+    ranges.erase(prev_range);
+    ranges.erase(range);
+    ranges.emplace(new_range);
+  }
+
+  /** Split a range into two ranges around a removed value
+  @param[in]	range	range to be split
+  @param[in]	value	value to be removed from the range */
+  void split_range(range_set_t::iterator range, uint32_t value)
+  {
+    range_t split1{range->first, value - 1};
+    range_t split2{value + 1, range->last};
+
+    /* Remove the existing element */
+    ranges.erase(range);
+
+    /* Insert the two new elements */
+    ranges.emplace(split1);
+    ranges.emplace(split2);
+  }
+
+  /** Remove a value from within a range
+  @param[in,out]	range	range to be changed
+  @param[in]	value	value to be removed */
+  void remove_within_range(range_set_t::iterator range, uint32_t value)
+  {
+    range_t new_range{range->first, range->last};
+    if (value == range->first)
+    {
+      if (range->first == range->last)
+      {
+        ranges.erase(range);
+        return;
+      }
+      else
+        new_range.first++;
+    }
+    else if (value == range->last)
+      new_range.last--;
+    else if (range->first < value && range->last > value)
+      return split_range(range, value);
+
+    ranges.erase(range);
+    ranges.emplace(new_range);
+  }
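find() leans on a subtlety of std::set::lower_bound with this comparator:
lower_bound({v,v}) returns the first stored range whose first value is >= v,
so a range that covers v but starts below it is only reachable via
std::prev(). A standalone illustration (hypothetical values):

  #include <cassert>
  #include <cstdint>
  #include <iterator>
  #include <set>

  struct rng { uint32_t first, last; };
  struct by_first
  { bool operator()(rng a, rng b) const { return a.first < b.first; } };

  int main()
  {
    std::set<rng, by_first> s{{10, 20}, {40, 50}};
    auto it= s.lower_bound({15, 15}); // first range with first >= 15
    assert(it->first == 40);          // not the range covering 15...
    auto p= std::prev(it);            // ...that one is its predecessor
    assert(p->first <= 15 && 15 <= p->last);
    return 0;
  }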
+  /** Remove a value from the ranges.
+  @param[in]	value	value to be removed */
+  void remove_value(uint32_t value)
+  {
+    if (empty())
+      return;
+    range_t new_range {value, value};
+    range_set_t::iterator range= ranges.lower_bound(new_range);
+    if (range == ranges.end())
+      return remove_within_range(std::prev(range), value);
+
+    if (range->first > value && range != ranges.begin())
+      /* Look into the previous range for the value to delete */
+      return remove_within_range(std::prev(range), value);
+    return remove_within_range(range, value);
+  }
+  /** Add a value to an existing range
+  @param[in]	range	range to be modified
+  @param[in]	value	value to be added */
+  range_set_t::iterator add_within_range(range_set_t::iterator range,
+                                         uint32_t value)
+  {
+    if (range->first <= value && range->last >= value)
+      return range;
+
+    range_t new_range{range->first, range->last};
+    if (range->last + 1 == value)
+      new_range.last++;
+    else if (range->first - 1 == value)
+      new_range.first--;
+    else
+      return ranges.end();
+    ranges.erase(range);
+    return ranges.emplace(new_range).first;
+  }
+  /** Add a range to the set of ranges
+  @param[in]	new_range	range to be added */
+  void add_range(range_t new_range)
+  {
+    auto r_offset= ranges.lower_bound(new_range);
+    auto r_begin= ranges.begin();
+    auto r_end= ranges.end();
+    if (!ranges.size())
+    {
+new_range:
+      ranges.emplace(new_range);
+      return;
+    }
+
+    if (r_offset == r_end)
+    {
+      /* last range */
+      if (add_within_range(std::prev(r_offset), new_range.first) == r_end)
+        goto new_range;
+    }
+    else if (r_offset == r_begin)
+    {
+      /* first range */
+      if (add_within_range(r_offset, new_range.first) == r_end)
+        goto new_range;
+    }
+    else if (r_offset->first - 1 == new_range.first)
+    {
+      /* extend the start of the existing range */
+      auto r_value= add_within_range(r_offset, new_range.first);
+      if (r_value != ranges.begin())
+        merge_range(r_value, std::prev(r_value));
+    }
+    else
+    {
+      /* try to extend the last value of the previous range */
+      if (add_within_range(std::prev(r_offset), new_range.first) == r_end)
+        goto new_range;
+    }
+  }
+
+  /** Add a value to the ranges
+  @param[in]	value	value to be added */
+  void add_value(uint32_t value)
+  {
+    range_t new_range{value, value};
+    add_range(new_range);
+  }
+
+  bool remove_if_exists(uint32_t value)
+  {
+    auto r_offset= find(value);
+    if (r_offset != ranges.end())
+    {
+      remove_within_range(r_offset, value);
+      return true;
+    }
+    return false;
+  }
+
+  bool contains(uint32_t value) const
+  {
+    return find(value) != ranges.end();
+  }
+
+  ulint size() { return ranges.size(); }
+  void clear() { ranges.clear(); }
+  bool empty() const { return ranges.empty(); }
+  typename range_set_t::iterator begin() { return ranges.begin(); }
+  typename range_set_t::iterator end() { return ranges.end(); }
+};
+#endif
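A minimal usage sketch of the class above (hypothetical page numbers):
adjacent values coalesce into one range, and removing an interior value
splits its range in two.

  range_set freed;
  freed.add_value(10);
  freed.add_value(11);           // merges into the range {10, 11}
  freed.add_range({20, 29});
  freed.remove_value(25);        // splits into {20, 24} and {26, 29}
  assert(freed.contains(29));
  assert(!freed.contains(25));
  assert(freed.size() == 3);     // {10,11}, {20,24}, {26,29}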
+
+/** Tablespace or log data space */
+#ifndef UNIV_INNOCHECKSUM
+struct fil_io_t
+{
+  /** error code */
+  dberr_t err;
+  /** file; node->space->release() must follow IORequestRead call */
+  fil_node_t *node;
+};
+
+/** Tablespace encryption mode */
+enum fil_encryption_t
+{
+  /** Encrypted if innodb_encrypt_tables=ON (srv_encrypt_tables) */
+  FIL_ENCRYPTION_DEFAULT,
+  /** Encrypted */
+  FIL_ENCRYPTION_ON,
+  /** Not encrypted */
+  FIL_ENCRYPTION_OFF
+};
+
+struct fil_space_t final : ilist_node<unflushed_spaces_tag_t>,
+                           ilist_node<default_encrypt_tag_t>,
+                           ilist_node<space_list_tag_t>,
+                           ilist_node<named_spaces_tag_t>
+#else
+struct fil_space_t final
+#endif
+{
+#ifndef UNIV_INNOCHECKSUM
+  friend fil_node_t;
+  ~fil_space_t()
+  {
+    ut_ad(!latch_owner);
+    ut_ad(!latch_count);
+    latch.destroy();
+  }
+
+  /** fil_system.spaces chain node */
+  fil_space_t *hash;
+  /** LSN of the most recent fil_names_write_if_was_clean().
+  Reset to 0 by fil_names_clear(). Protected by exclusive log_sys.latch.
+  If and only if max_lsn is nonzero, this is in fil_system.named_spaces. */
+  lsn_t max_lsn;
+  /** tablespace identifier */
+  uint32_t id;
+  /** whether undo tablespace truncation is in progress */
+  bool is_being_truncated;
+  fil_type_t purpose;	/*!< purpose */
+  UT_LIST_BASE_NODE_T(fil_node_t) chain;
+			/*!< base node for the file chain */
+  uint32_t size;	/*!< tablespace file size in pages;
+			0 if not known yet */
+  uint32_t size_in_header;
+			/*!< FSP_SIZE in the tablespace header;
+			0 if not known yet */
+  uint32_t free_len;
+			/*!< length of the FSP_FREE list */
+  uint32_t free_limit;
+			/*!< contents of FSP_FREE_LIMIT */
+  uint32_t recv_size;
+			/*!< recovered tablespace size in pages;
+			0 if no size change was read from the redo log,
+			or if the size change was implemented */
+  uint32_t n_reserved_extents;
+			/*!< number of reserved free extents for
+			ongoing operations like B-tree page split */
+private:
+#ifdef UNIV_DEBUG
+  fil_space_t *next_in_space_list();
+  fil_space_t *prev_in_space_list();
+
+  fil_space_t *next_in_unflushed_spaces();
+  fil_space_t *prev_in_unflushed_spaces();
+#endif
+
+  /** the committed size of the tablespace in pages */
+  Atomic_relaxed<uint32_t> committed_size;
+  /** Number of pending operations on the file.
+  The tablespace cannot be freed while (n_pending & PENDING) != 0. */
+  std::atomic<uint32_t> n_pending;
+  /** Flag in n_pending that indicates that the tablespace is about to be
+  deleted, and no further operations should be performed */
+  static constexpr uint32_t STOPPING_READS= 1U << 31;
+  /** Flag in n_pending that indicates that the tablespace is being
+  deleted, and no further operations should be performed */
+  static constexpr uint32_t STOPPING_WRITES= 1U << 30;
+  /** Flags in n_pending that indicate that the tablespace is being
+  deleted, and no further operations should be performed */
+  static constexpr uint32_t STOPPING= STOPPING_READS | STOPPING_WRITES;
+  /** Flag in n_pending that indicates that the tablespace is a candidate
+  for being closed, and fil_node_t::is_open() can only be trusted after
+  acquiring fil_system.mutex and resetting the flag */
+  static constexpr uint32_t CLOSING= 1U << 29;
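All of this state shares one 32-bit atomic word: the four high bits carry the
flags declared here, and the remaining low bits count references. A
standalone sketch (constants copied for illustration only):

  #include <atomic>
  #include <cassert>
  #include <cstdint>

  int main()
  {
    constexpr uint32_t STOPPING_READS= 1U << 31, STOPPING_WRITES= 1U << 30,
                       CLOSING= 1U << 29, NEEDS_FSYNC= 1U << 28;
    constexpr uint32_t PENDING=
      ~(STOPPING_READS | STOPPING_WRITES | CLOSING | NEEDS_FSYNC);

    std::atomic<uint32_t> n_pending{0};
    n_pending.fetch_add(1);          // acquire a reference
    n_pending.fetch_or(CLOSING);     // flag a close candidate
    const uint32_t n= n_pending.load();
    assert((n & PENDING) == 1);      // the flag does not disturb the count
    assert(n & CLOSING);
    return 0;
  }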
+  /** Flag in n_pending that indicates that the tablespace needs fsync().
+  This must be the least significant flag bit; @see release_flush() */
+  static constexpr uint32_t NEEDS_FSYNC= 1U << 28;
+  /** The reference count */
+  static constexpr uint32_t PENDING= ~(STOPPING | CLOSING | NEEDS_FSYNC);
+  /** latch protecting all page allocation bitmap pages */
+  srw_lock latch;
+  pthread_t latch_owner;
+  ut_d(Atomic_relaxed<uint32_t> latch_count;)
+public:
+  /** MariaDB encryption data */
+  fil_space_crypt_t *crypt_data;
+
+  /** Whether needs_flush(), or this is in fil_system.unflushed_spaces */
+  bool is_in_unflushed_spaces;
+
+  /** Whether this is in fil_system.default_encrypt_tables (needs key rotation) */
+  bool is_in_default_encrypt;
+
+private:
+  /** Whether any corruption of this tablespace has been reported */
+  mutable std::atomic_flag is_corrupted;
+
+public:
+  /** mutex to protect freed_ranges and last_freed_lsn */
+  std::mutex freed_range_mutex;
+private:
+  /** Ranges of freed page numbers; protected by freed_range_mutex */
+  range_set freed_ranges;
+
+  /** LSN of freeing the last page; protected by freed_range_mutex */
+  lsn_t last_freed_lsn;
+
+public:
+  /** @return whether doublewrite buffering is needed */
+  inline bool use_doublewrite() const;
+
+  /** @return whether a page has been freed */
+  inline bool is_freed(uint32_t page);
+
+  /** Apply freed_ranges to the file.
+  @param writable whether the file is writable
+  @return number of pages written or hole-punched */
+  uint32_t flush_freed(bool writable);
+
+  /** Append a file to the chain of files of a space.
+  @param[in]	name		file name of a file that is not open
+  @param[in]	handle		file handle, or OS_FILE_CLOSED
+  @param[in]	size		file size in entire database pages
+  @param[in]	is_raw		whether this is a raw device
+  @param[in]	atomic_write	true if atomic write could be enabled
+  @param[in]	max_pages	maximum number of pages in the file,
+				or UINT32_MAX for unlimited
+  @return file object */
+  fil_node_t* add(const char* name, pfs_os_file_t handle,
+		  uint32_t size, bool is_raw, bool atomic_write,
+		  uint32_t max_pages = UINT32_MAX);
+#ifdef UNIV_DEBUG
+  /** Assert that the mini-transaction is compatible with
+  updating an allocation bitmap page.
+  @param[in]	mtr	mini-transaction */
+  void modify_check(const mtr_t& mtr) const;
+#endif /* UNIV_DEBUG */
+
+  /** Try to reserve free extents.
+  @param[in]	n_free_now	current number of free extents
+  @param[in]	n_to_reserve	number of extents to reserve
+  @return whether the reservation succeeded */
+  bool reserve_free_extents(uint32_t n_free_now, uint32_t n_to_reserve)
+  {
+    if (n_reserved_extents + n_to_reserve > n_free_now) {
+      return false;
+    }
+
+    n_reserved_extents += n_to_reserve;
+    return true;
+  }
+
+  /** Release the reserved free extents.
+  @param[in]	n_reserved	number of reserved extents */
+  void release_free_extents(uint32_t n_reserved)
+  {
+    if (!n_reserved) return;
+    ut_a(n_reserved_extents >= n_reserved);
+    n_reserved_extents -= n_reserved;
+  }
+
+  /** Rename a file.
+  @param[in]	path	tablespace file name after renaming
+  @param[in]	log	whether to write redo log
+  @param[in]	replace	whether to ignore the existence of path
+  @return error code
+  @retval DB_SUCCESS on success */
+  dberr_t rename(const char *path, bool log, bool replace= false)
+    MY_ATTRIBUTE((nonnull));
+
+  /** Note that the tablespace has been imported.
+  Initially, purpose=FIL_TYPE_IMPORT so that no redo log is
+  written while the space ID is being updated in each page.
*/ + inline void set_imported(); + + /** Report the tablespace as corrupted */ + ATTRIBUTE_COLD void set_corrupted() const; + + /** @return whether the storage device is rotational (HDD, not SSD) */ + inline bool is_rotational() const; + + /** Open each file. Never invoked on .ibd files. + @param create_new_db whether to skip the call to fil_node_t::read_page0() + @return whether all files were opened */ + bool open(bool create_new_db); + /** Close each file. Only invoked on fil_system.temp_space. */ + void close(); + + /** Note that operations on the tablespace must stop. */ + inline void set_stopping(); + + /** Note that operations on the tablespace can resume after truncation */ + inline void clear_stopping(); + + /** Drop the tablespace and wait for any pending operations to cease + @param id tablespace identifier + @param detached_handle pointer to file to be closed later, or nullptr + @return tablespace to invoke fil_space_free() on + @retval nullptr if no tablespace was found, or it was deleted by + another concurrent thread */ + static fil_space_t *drop(uint32_t id, pfs_os_file_t *detached_handle); + +private: + MY_ATTRIBUTE((warn_unused_result)) + /** Try to acquire a tablespace reference (increment referenced()). + @param avoid when these flags are set, nothing will be acquired + @return the old reference count */ + uint32_t acquire_low(uint32_t avoid= STOPPING) + { + uint32_t n= 0; + while (!n_pending.compare_exchange_strong(n, n + 1, + std::memory_order_acquire, + std::memory_order_relaxed) && + !(n & avoid)); + return n; + } +public: + MY_ATTRIBUTE((warn_unused_result)) + /** Acquire a tablespace reference. + @return whether a tablespace reference was successfully acquired */ + inline bool acquire_if_not_stopped(); + + MY_ATTRIBUTE((warn_unused_result)) + /** Acquire a tablespace reference for I/O. + @param avoid when these flags are set, nothing will be acquired + @return whether the file is usable */ + bool acquire(uint32_t avoid= STOPPING | CLOSING) + { + const auto flags= acquire_low(avoid) & (avoid); + return UNIV_LIKELY(!flags) || (flags == CLOSING && acquire_and_prepare()); + } + + /** Acquire a tablespace reference for writing. + @param avoid when these flags are set, nothing will be acquired + @return whether the file is writable */ + bool acquire_for_write() { return acquire(STOPPING_WRITES | CLOSING); } + + /** Acquire another tablespace reference for I/O. */ + inline void reacquire(); + + /** Release a tablespace reference. 
+  @return whether this was the last reference */
+  bool release()
+  {
+    uint32_t n= n_pending.fetch_sub(1, std::memory_order_release);
+    ut_ad(n & PENDING);
+    return (n & PENDING) == 1;
+  }
+
+  /** Clear the NEEDS_FSYNC flag */
+  void clear_flush()
+  {
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+    static_assert(NEEDS_FSYNC == 1U << 28, "compatibility");
+    __asm__ __volatile__("lock btrl $28, %0" : "+m" (n_pending));
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+    static_assert(NEEDS_FSYNC == 1U << 28, "compatibility");
+    _interlockedbittestandreset(reinterpret_cast<volatile long*>
+                                (&n_pending), 28);
+#else
+    n_pending.fetch_and(~NEEDS_FSYNC, std::memory_order_release);
+#endif
+  }
+
+private:
+  /** Clear the CLOSING flag */
+  void clear_closing()
+  {
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+    static_assert(CLOSING == 1U << 29, "compatibility");
+    __asm__ __volatile__("lock btrl $29, %0" : "+m" (n_pending));
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+    static_assert(CLOSING == 1U << 29, "compatibility");
+    _interlockedbittestandreset(reinterpret_cast<volatile long*>
+                                (&n_pending), 29);
+#else
+    n_pending.fetch_and(~CLOSING, std::memory_order_relaxed);
+#endif
+  }
+
+  /** @return pending operations (and flags) */
+  uint32_t pending() const { return n_pending.load(std::memory_order_acquire); }
+public:
+  /** @return whether close() of the file handle has been requested */
+  bool is_closing() const { return pending() & CLOSING; }
+  /** @return whether the tablespace is about to be dropped */
+  bool is_stopping() const { return pending() & STOPPING; }
+  /** @return whether the tablespace is going to be dropped */
+  bool is_stopping_writes() const { return pending() & STOPPING_WRITES; }
+  /** @return whether there are no pending operations, and close()
+  of the file handle has been requested */
+  bool is_ready_to_close() const
+  { return (pending() & (PENDING | CLOSING)) == CLOSING; }
+  /** @return whether fsync() or similar is needed */
+  bool needs_flush() const { return pending() & NEEDS_FSYNC; }
+  /** @return whether fsync() or similar is needed, and the tablespace is
+  not being dropped */
+  bool needs_flush_not_stopping() const
+  { return (pending() & (NEEDS_FSYNC | STOPPING_WRITES)) == NEEDS_FSYNC; }
+
+  uint32_t referenced() const { return pending() & PENDING; }
+private:
+  MY_ATTRIBUTE((warn_unused_result))
+  /** Prepare to close the file handle.
+  @return number of pending operations, possibly with NEEDS_FSYNC flag */
+  uint32_t set_closing()
+  {
+    return n_pending.fetch_or(CLOSING, std::memory_order_acquire);
+  }
+
+public:
+  /** Try to close a file to adhere to the innodb_open_files limit.
+  @param print_info   whether to diagnose why a file cannot be closed
+  @return whether a file was closed */
+  static bool try_to_close(bool print_info);
+
+  /** Close all tablespace files at shutdown */
+  static void close_all();
+
+  /** Update last_freed_lsn */
+  void update_last_freed_lsn(lsn_t lsn) { last_freed_lsn= lsn; }
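clear_flush() and clear_closing() clear one flag bit without disturbing the
reference count; on x86 this compiles down to a single "lock btr"
instruction, and elsewhere to the portable fetch_and() of the fallback
branch. A standalone equivalent of the fallback (hypothetical values):

  #include <atomic>
  #include <cassert>
  #include <cstdint>

  int main()
  {
    constexpr uint32_t NEEDS_FSYNC= 1U << 28;
    std::atomic<uint32_t> word{NEEDS_FSYNC | 5};   // flag set, 5 references
    word.fetch_and(~NEEDS_FSYNC, std::memory_order_release);
    assert(word.load() == 5);                      // flag gone, count intact
    return 0;
  }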
+  /** Note that the file will need fsync().
+  @return whether this needs to be added to fil_system.unflushed_spaces */
+  bool set_needs_flush()
+  {
+    uint32_t n= 1;
+    while (!n_pending.compare_exchange_strong(n, n | NEEDS_FSYNC,
+                                              std::memory_order_acquire,
+                                              std::memory_order_relaxed))
+    {
+      ut_ad(n & PENDING);
+      if (n & (NEEDS_FSYNC | STOPPING_WRITES))
+        return false;
+    }
+
+    return true;
+  }
+
+  /** Clear all freed ranges for an undo tablespace when InnoDB
+  encounters a TRIM redo log record */
+  void clear_freed_ranges() { freed_ranges.clear(); }
+#endif /* !UNIV_INNOCHECKSUM */
+  /** FSP_SPACE_FLAGS and FSP_FLAGS_MEM_ flags;
+  check fsp0types.h for more information about the flags. */
+  uint32_t flags;
+
+  /** Determine if full_crc32 is used for a data file
+  @param[in]	flags	tablespace flags (FSP_SPACE_FLAGS)
+  @return whether the full_crc32 algorithm is active */
+  static bool full_crc32(uint32_t flags)
+  { return flags & FSP_FLAGS_FCRC32_MASK_MARKER; }
+  /** @return whether innodb_checksum_algorithm=full_crc32 is active */
+  bool full_crc32() const { return full_crc32(flags); }
+  /** Determine if full_crc32 is used along with PAGE_COMPRESSED */
+  static bool is_full_crc32_compressed(uint32_t flags)
+  {
+    if (!full_crc32(flags))
+      return false;
+    auto algo= FSP_FLAGS_FCRC32_GET_COMPRESSED_ALGO(flags);
+    DBUG_ASSERT(algo <= PAGE_ALGORITHM_LAST);
+    return algo != 0;
+  }
+  /** Determine the logical page size.
+  @param flags  tablespace flags (FSP_SPACE_FLAGS)
+  @return the logical page size
+  @retval 0 if the flags are invalid */
+  static unsigned logical_size(uint32_t flags)
+  {
+    switch (full_crc32(flags)
+            ? FSP_FLAGS_FCRC32_GET_PAGE_SSIZE(flags)
+            : FSP_FLAGS_GET_PAGE_SSIZE(flags)) {
+    case 3: return 4096;
+    case 4: return 8192;
+    case 5: return full_crc32(flags) ? 16384 : 0;
+    case 0: return full_crc32(flags) ? 0 : 16384;
+    case 6: return 32768;
+    case 7: return 65536;
+    default: return 0;
+    }
+  }
+  /** Determine the ROW_FORMAT=COMPRESSED page size.
+  @param flags  tablespace flags (FSP_SPACE_FLAGS)
+  @return the ROW_FORMAT=COMPRESSED page size
+  @retval 0 if ROW_FORMAT=COMPRESSED is not used */
+  static unsigned zip_size(uint32_t flags)
+  {
+    if (full_crc32(flags))
+      return 0;
+    const uint32_t zip_ssize= FSP_FLAGS_GET_ZIP_SSIZE(flags);
+    return zip_ssize ? (UNIV_ZIP_SIZE_MIN >> 1) << zip_ssize : 0;
+  }
+  /** Determine the physical page size.
+  @param flags  tablespace flags (FSP_SPACE_FLAGS)
+  @return the physical page size */
+  static unsigned physical_size(uint32_t flags)
+  {
+    if (full_crc32(flags))
+      return logical_size(flags);
+
+    const uint32_t zip_ssize= FSP_FLAGS_GET_ZIP_SSIZE(flags);
+    return zip_ssize
+      ? (UNIV_ZIP_SIZE_MIN >> 1) << zip_ssize
+      : unsigned(srv_page_size);
+  }
+
+  /** @return the ROW_FORMAT=COMPRESSED page size
+  @retval 0 if ROW_FORMAT=COMPRESSED is not used */
+  unsigned zip_size() const { return zip_size(flags); }
+  /** @return the physical page size */
+  unsigned physical_size() const { return physical_size(flags); }
+
+  /** Check whether PAGE_COMPRESSED is enabled.
+  @param[in]	flags	tablespace flags */
+  static bool is_compressed(uint32_t flags)
+  {
+    return is_full_crc32_compressed(flags) ||
+      FSP_FLAGS_HAS_PAGE_COMPRESSION(flags);
+  }
+  /** @return whether compression is enabled for the tablespace */
+  bool is_compressed() const { return is_compressed(flags); }
+
+  /** Get the compression algorithm for full crc32 format.
+ @param flags contents of FSP_SPACE_FLAGS + @return PAGE_COMPRESSED algorithm of full_crc32 tablespace + @retval 0 if not PAGE_COMPRESSED or not full_crc32 */ + static unsigned get_compression_algo(uint32_t flags) + { + return full_crc32(flags) + ? FSP_FLAGS_FCRC32_GET_COMPRESSED_ALGO(flags) + : 0; + } + /** @return the page_compressed algorithm + @retval 0 if not page_compressed */ + unsigned get_compression_algo() const { return get_compression_algo(flags); } + /** Determine if the page_compressed page contains an extra byte + for exact compressed stream length + @param flags contents of FSP_SPACE_FLAGS + @return whether the extra byte is needed */ + static bool full_crc32_page_compressed_len(uint32_t flags) + { + DBUG_ASSERT(full_crc32(flags)); + switch (get_compression_algo(flags)) { + case PAGE_LZ4_ALGORITHM: + case PAGE_LZO_ALGORITHM: + case PAGE_SNAPPY_ALGORITHM: + return true; + } + return false; + } + + /** Whether the full checksum matches with non full checksum flags. + @param flags contents of FSP_SPACE_FLAGS + @param expected expected flags + @return true if it is equivalent */ + static bool is_flags_full_crc32_equal(uint32_t flags, uint32_t expected) + { + ut_ad(full_crc32(flags)); + uint32_t fcrc32_psize= FSP_FLAGS_FCRC32_GET_PAGE_SSIZE(flags); + + if (full_crc32(expected)) + /* The data file may have been created with a + different innodb_compression_algorithm. But + we only support one innodb_page_size for all files. */ + return fcrc32_psize == FSP_FLAGS_FCRC32_GET_PAGE_SSIZE(expected); + + uint32_t non_fcrc32_psize = FSP_FLAGS_GET_PAGE_SSIZE(expected); + if (!non_fcrc32_psize) + return fcrc32_psize == 5; + return fcrc32_psize == non_fcrc32_psize; + } + + /** Whether old tablespace flags match full_crc32 flags. + @param flags contents of FSP_SPACE_FLAGS + @param expected expected flags + @return true if it is equivalent */ + static bool is_flags_non_full_crc32_equal(uint32_t flags, uint32_t expected) + { + ut_ad(!full_crc32(flags)); + if (!full_crc32(expected)) + return false; + + uint32_t non_fcrc32_psize= FSP_FLAGS_GET_PAGE_SSIZE(flags); + uint32_t fcrc32_psize = FSP_FLAGS_FCRC32_GET_PAGE_SSIZE(expected); + + if (!non_fcrc32_psize) + return fcrc32_psize == 5; + return fcrc32_psize == non_fcrc32_psize; + } + + /** Whether both fsp flags are equivalent */ + static bool is_flags_equal(uint32_t flags, uint32_t expected) + { + if (!((flags ^ expected) & ~(1U << FSP_FLAGS_POS_RESERVED))) + return true; + return full_crc32(flags) + ? is_flags_full_crc32_equal(flags, expected) + : is_flags_non_full_crc32_equal(flags, expected); + } + + /** Validate the tablespace flags for full crc32 format. + @param flags contents of FSP_SPACE_FLAGS + @return whether the flags are correct in full crc32 format */ + static bool is_fcrc32_valid_flags(uint32_t flags) + { + ut_ad(flags & FSP_FLAGS_FCRC32_MASK_MARKER); + const ulint page_ssize= physical_size(flags); + if (page_ssize < 3 || page_ssize & 8) + return false; + flags >>= FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO; + return flags <= PAGE_ALGORITHM_LAST; + } + /** Validate the tablespace flags. 
+  @param flags	contents of FSP_SPACE_FLAGS
+  @param is_ibd	whether this is an .ibd file (not the system tablespace)
+  @return whether the flags are correct */
+  static bool is_valid_flags(uint32_t flags, bool is_ibd)
+  {
+    DBUG_EXECUTE_IF("fsp_flags_is_valid_failure", return false;);
+    if (full_crc32(flags))
+      return is_fcrc32_valid_flags(flags);
+
+    if (flags == 0)
+      return true;
+    if (~FSP_FLAGS_MASK & flags)
+      return false;
+
+    if (FSP_FLAGS_MASK_ATOMIC_BLOBS ==
+        (flags & (FSP_FLAGS_MASK_POST_ANTELOPE | FSP_FLAGS_MASK_ATOMIC_BLOBS)))
+      /* If the "atomic blobs" flag (indicating
+      ROW_FORMAT=DYNAMIC or ROW_FORMAT=COMPRESSED) is set, then the
+      ROW_FORMAT!=REDUNDANT flag must also be set. */
+      return false;
+
+    /* Bits 10..14 should be 0b0000d where d is the DATA_DIR flag
+    of MySQL 5.6 and MariaDB 10.0, which we ignore.
+    In the buggy FSP_SPACE_FLAGS written by MariaDB 10.1.0 to 10.1.20,
+    bits 10..14 would be nonzero 0bsssaa where sss is
+    nonzero PAGE_SSIZE (3, 4, 6, or 7)
+    and aa is ATOMIC_WRITES (not 0b11). */
+    if (FSP_FLAGS_GET_RESERVED(flags) & ~1U)
+      return false;
+
+    const uint32_t ssize= FSP_FLAGS_GET_PAGE_SSIZE(flags);
+    if (ssize == 1 || ssize == 2 || ssize == 5 || ssize & 8)
+      /* the page_size is not between 4k and 64k;
+      16k should be encoded as 0, not 5 */
+      return false;
+
+    const uint32_t zssize= FSP_FLAGS_GET_ZIP_SSIZE(flags);
+    if (zssize == 0)
+      /* not ROW_FORMAT=COMPRESSED */;
+    else if (zssize > (ssize ? ssize : 5))
+      /* invalid KEY_BLOCK_SIZE */
+      return false;
+    else if (~flags &
+             (FSP_FLAGS_MASK_POST_ANTELOPE | FSP_FLAGS_MASK_ATOMIC_BLOBS))
+      /* both of these flags must be set for ROW_FORMAT=COMPRESSED */
+      return false;
+
+    /* The flags do look valid. But, avoid misinterpreting
+    buggy MariaDB 10.1 format flags for
+    PAGE_COMPRESSED=1 PAGE_COMPRESSION_LEVEL={0,2,3}
+    as valid-looking PAGE_SSIZE if this is known to be
+    an .ibd file and we are using the default innodb_page_size=16k. */
+    return(ssize == 0 || !is_ibd || srv_page_size != UNIV_PAGE_SIZE_ORIG);
+  }
+
+#ifndef UNIV_INNOCHECKSUM
+  MY_ATTRIBUTE((warn_unused_result))
+  /** Create a tablespace in fil_system.
+  @param id         tablespace identifier
+  @param flags      tablespace flags
+  @param purpose    tablespace purpose
+  @param crypt_data encryption information
+  @param mode       encryption mode
+  @param opened     true if space files are opened
+  @return pointer to created tablespace, to be filled in with add()
+  @retval nullptr on failure (such as when the same tablespace exists) */
+  static fil_space_t *create(uint32_t id, uint32_t flags,
+                             fil_type_t purpose, fil_space_crypt_t *crypt_data,
+                             fil_encryption_t mode= FIL_ENCRYPTION_DEFAULT,
+                             bool opened= false);
+
+  MY_ATTRIBUTE((warn_unused_result))
+  /** Acquire a tablespace reference.
+  @param id tablespace identifier
+  @return tablespace
+  @retval nullptr if the tablespace is missing or inaccessible */
+  static fil_space_t *get(uint32_t id);
+  /** Acquire a tablespace reference for writing.
+  @param id tablespace identifier
+  @return tablespace
+  @retval nullptr if the tablespace is missing or inaccessible */
+  static fil_space_t *get_for_write(uint32_t id);
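A hedged usage sketch of the two-step construction that the comment on
create() describes (space_id, flags, path and size_in_pages are hypothetical
caller-supplied values): create() registers the tablespace object, and add()
attaches its first data file.

  if (fil_space_t *space= fil_space_t::create(space_id, flags,
                                              FIL_TYPE_TABLESPACE,
                                              nullptr /* no encryption data */))
    space->add(path, OS_FILE_CLOSED, size_in_pages,
               false /* not a raw device */, true /* allow atomic writes */);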
+  /** Add or remove a free page in the freed ranges list.
+  @param[in]	offset	page number to be added or removed
+  @param[in]	add	true if the page is to be marked freed */
+  void free_page(uint32_t offset, bool add=true)
+  {
+    std::lock_guard<std::mutex> freed_lock(freed_range_mutex);
+    if (add)
+      return freed_ranges.add_value(offset);
+
+    if (freed_ranges.empty())
+      return;
+
+    return freed_ranges.remove_value(offset);
+  }
+
+  /** Set the ranges of freed pages */
+  void add_free_ranges(range_set ranges)
+  {
+    std::lock_guard<std::mutex> freed_lock(freed_range_mutex);
+    freed_ranges= std::move(ranges);
+  }
+
+  /** Add a range of freed pages */
+  void add_free_range(const range_t range)
+  {
+    freed_ranges.add_range(range);
+  }
+
+  /** Set the tablespace size in pages */
+  void set_sizes(uint32_t s)
+  {
+    ut_ad(id ? !size : (size >= s));
+    size= s; committed_size= s;
+  }
+
+  /** Update committed_size in mtr_t::commit() */
+  void set_committed_size() { committed_size= size; }
+
+  /** @return the last persisted page number */
+  uint32_t last_page_number() const { return committed_size - 1; }
+
+  /** @return the size in pages (0 if unreadable) */
+  inline uint32_t get_size();
+
+  /** Read or write data.
+  @param type   I/O context
+  @param offset offset in bytes
+  @param len    number of bytes
+  @param buf    the data to be read or written
+  @param bpage  buffer block (for type.is_async() completion callback)
+  @return status and file descriptor */
+  fil_io_t io(const IORequest &type, os_offset_t offset, size_t len,
+              void *buf, buf_page_t *bpage= nullptr);
+  /** Flush pending writes from the file system cache to the file. */
+  template<bool have_reference> inline void flush();
+  /** Flush pending writes from the file system cache to the file. */
+  void flush_low();
+
+  /** Read the first page of a data file.
+  @return whether the page was found valid */
+  bool read_page0();
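A simplified sketch of a synchronous page read through io() (space_id,
page_no and buf are hypothetical; error handling and the exact
reference-release contract on failure paths, which is defined by the
implementation in fil0fil.cc, are glossed over). Per the comment on
fil_io_t, a release must follow a successful IORequestRead call:

  if (fil_space_t *space= fil_space_t::get(space_id))
  {
    const unsigned ps= space->physical_size();
    fil_io_t fio= space->io(IORequestRead, os_offset_t{page_no} * ps, ps, buf);
    if (fio.err == DB_SUCCESS)
      fio.node->space->release();  // node->space == space here
  }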
+  /** Determine the next tablespace for encryption key rotation.
+  @param space    current tablespace (nullptr to start from the beginning)
+  @param recheck  whether the removal condition needs to be rechecked after
+                  encryption parameters were changed
+  @param encrypt  expected state of innodb_encrypt_tables
+  @return the next tablespace
+  @retval nullptr upon reaching the end of the iteration */
+  static space_list_t::iterator next(space_list_t::iterator space,
+                                     bool recheck, bool encrypt);
+
+#ifdef UNIV_DEBUG
+  bool is_latched() const { return latch_count != 0; }
+#endif
+  bool is_owner() const { return latch_owner == pthread_self(); }
+  /** Acquire the allocation latch in exclusive mode */
+  void x_lock()
+  {
+    latch.wr_lock(SRW_LOCK_CALL);
+    ut_ad(!latch_owner);
+    latch_owner= pthread_self();
+    ut_ad(!latch_count.fetch_add(1));
+  }
+  /** Release the allocation latch from exclusive mode */
+  void x_unlock()
+  {
+    ut_ad(latch_count.fetch_sub(1) == 1);
+    ut_ad(latch_owner == pthread_self());
+    latch_owner= 0;
+    latch.wr_unlock();
+  }
+  /** Acquire the allocation latch in shared mode */
+  void s_lock()
+  {
+    ut_ad(!is_owner());
+    latch.rd_lock(SRW_LOCK_CALL);
+    ut_ad(!latch_owner);
+    ut_d(latch_count.fetch_add(1));
+  }
+  /** Release the allocation latch from shared mode */
+  void s_unlock()
+  {
+    ut_ad(latch_count.fetch_sub(1));
+    ut_ad(!latch_owner);
+    latch.rd_unlock();
+  }
+
+  typedef span<const char> name_type;
+
+  /** @return the tablespace name (databasename/tablename) */
+  name_type name() const;
+
+private:
+  /** @return whether the file is usable for io() */
+  ATTRIBUTE_COLD bool prepare_acquired();
+  /** @return whether the file is usable for io() */
+  ATTRIBUTE_COLD bool acquire_and_prepare();
+#endif /*!UNIV_INNOCHECKSUM */
+};
+
+#ifndef UNIV_INNOCHECKSUM
+/** File node of a tablespace or the log data space */
+struct fil_node_t final
+{
+  /** tablespace containing this file */
+  fil_space_t *space;
+  /** file name; protected by fil_system.mutex and exclusive log_sys.latch */
+  char *name;
+  /** file handle */
+  pfs_os_file_t handle;
+  /** whether the file is on non-rotational media (SSD) */
+  unsigned on_ssd:1;
+  /** how to write page_compressed tables
+  (0=do not punch holes but write minimal amount of data, 1=punch holes,
+  2=always write the same amount; thinly provisioned storage will compress) */
+  unsigned punch_hole:2;
+  /** whether this file could use atomic write */
+  unsigned atomic_write:1;
+  /** whether the file actually is a raw device or disk partition */
+  unsigned is_raw_disk:1;
+  /** whether the tablespace discovery is being deferred during crash
+  recovery due to incompletely written page 0 */
+  unsigned deferred:1;
+
+  /** size of the file in database pages (0 if not known yet);
+  the possible last incomplete megabyte may be ignored if space->id == 0 */
+  uint32_t size;
+  /** initial size of the file in database pages;
+  FIL_IBD_FILE_INITIAL_SIZE by default */
+  uint32_t init_size;
+  /** maximum size of the file in database pages (0 if unlimited) */
+  uint32_t max_size;
+  /** whether the file is currently being extended */
+  Atomic_relaxed<bool> being_extended;
+  /** link to other files in this tablespace */
+  UT_LIST_NODE_T(fil_node_t) chain;
+
+  /** Filesystem block size */
+  ulint block_size;
+
+  /** @return whether this file is open */
+  bool is_open() const { return handle != OS_FILE_CLOSED; }
+
+  /** Read the first page of a data file.
+  @return whether the page was found valid */
+  bool read_page0();
+
+  /** Determine some file metadata when creating or reading the file.
+ @param file the file that is being created, or OS_FILE_CLOSED */ + void find_metadata(os_file_t file= OS_FILE_CLOSED +#ifndef _WIN32 + , bool create= false, struct stat *statbuf= nullptr +#endif + ); + + /** Close the file handle. */ + void close(); + /** Same as close() but returns file handle instead of closing it. */ + pfs_os_file_t detach() MY_ATTRIBUTE((warn_unused_result)); + /** Prepare to free a file from fil_system. + @param detach_handle whether to detach instead of closing a handle + @return detached handle or OS_FILE_CLOSED */ + inline pfs_os_file_t close_to_free(bool detach_handle= false); + + /** Update the data structures on write completion */ + inline void complete_write(); + +private: + /** Does stuff common for close() and detach() */ + void prepare_to_close_or_detach(); +}; + +inline bool fil_space_t::use_doublewrite() const +{ + return !UT_LIST_GET_FIRST(chain)->atomic_write && srv_use_doublewrite_buf && + buf_dblwr.is_created(); +} + +inline void fil_space_t::set_imported() +{ + ut_ad(purpose == FIL_TYPE_IMPORT); + purpose= FIL_TYPE_TABLESPACE; + UT_LIST_GET_FIRST(chain)->find_metadata(); +} + +inline bool fil_space_t::is_rotational() const +{ + for (const fil_node_t *node= UT_LIST_GET_FIRST(chain); node; + node= UT_LIST_GET_NEXT(chain, node)) + if (!node->on_ssd) + return true; + return false; +} + +/** Common InnoDB file extensions */ +enum ib_extention { + NO_EXT = 0, + IBD = 1, + ISL = 2, + CFG = 3 +}; +extern const char* dot_ext[]; +#define DOT_IBD dot_ext[IBD] +#define DOT_ISL dot_ext[ISL] +#define DOT_CFG dot_ext[CFG] + +/** When mariadbd is run, the default directory "." is the mysqld datadir, +but in the MariaDB Embedded Server Library and mysqlbackup it is not the default +directory, and we must set the base file path explicitly */ +extern const char* fil_path_to_mysql_datadir; +#else +# include "univ.i" +#endif /* !UNIV_INNOCHECKSUM */ + +/** Initial size of a single-table tablespace in pages */ +#define FIL_IBD_FILE_INITIAL_SIZE 4U + +/** 'null' (undefined) page offset in the context of file spaces */ +#define FIL_NULL ULINT32_UNDEFINED + + +#define FIL_ADDR_PAGE 0U /* first in address is the page offset */ +#define FIL_ADDR_BYTE 4U /* then comes 2-byte byte offset within page*/ +#define FIL_ADDR_SIZE 6U /* address size is 6 bytes */ + +/** File space address */ +struct fil_addr_t { + /** page number within a tablespace */ + uint32_t page; + /** byte offset within the page */ + uint16_t boffset; +}; + +/** The byte offsets on a file page for various variables @{ */ +#define FIL_PAGE_SPACE_OR_CHKSUM 0 /*!< in < MySQL-4.0.14 space id the + page belongs to (== 0) but in later + versions the 'new' checksum of the + page */ +#define FIL_PAGE_OFFSET 4U /*!< page offset inside space */ +#define FIL_PAGE_PREV 8U /*!< if there is a 'natural' + predecessor of the page, its + offset. Otherwise FIL_NULL. + This field is not set on BLOB + pages, which are stored as a + singly-linked list. See also + FIL_PAGE_NEXT. */ +#define FIL_PAGE_NEXT 12U /*!< if there is a 'natural' successor + of the page, its offset. + Otherwise FIL_NULL. + B-tree index pages + (FIL_PAGE_TYPE contains FIL_PAGE_INDEX) + on the same PAGE_LEVEL are maintained + as a doubly linked list via + FIL_PAGE_PREV and FIL_PAGE_NEXT + in the collation order of the + smallest user record on each page. */ +#define FIL_PAGE_LSN 16U /*!< lsn of the end of the newest + modification log record to the page */ +#define FIL_PAGE_TYPE 24U /*!< file page type: FIL_PAGE_INDEX,..., + 2 bytes. 
+ + The contents of this field can only + be trusted in the following case: + if the page is an uncompressed + B-tree index page, then it is + guaranteed that the value is + FIL_PAGE_INDEX. + The opposite does not hold. + + In tablespaces created by + MySQL/InnoDB 5.1.7 or later, the + contents of this field is valid + for all uncompressed pages. */ + +/** For the first page in a system tablespace data file(ibdata*, not *.ibd): +the file has been flushed to disk at least up to this lsn +For other pages of tablespaces not in innodb_checksum_algorithm=full_crc32 +format: 32-bit key version used to encrypt the page + 32-bit checksum +or 64 bits of zero if no encryption */ +#define FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION 26U + +/** This overloads FIL_PAGE_FILE_FLUSH_LSN for RTREE Split Sequence Number */ +#define FIL_RTREE_SPLIT_SEQ_NUM FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + +/** Start of the page_compressed content */ +#define FIL_PAGE_COMP_ALGO FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION + +/** starting from 4.1.x this contains the space id of the page */ +#define FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID 34U + +#define FIL_PAGE_SPACE_ID FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID + +#define FIL_PAGE_DATA 38U /*!< start of the data on the page */ + +/** 32-bit key version used to encrypt the page in full_crc32 format. +For non-encrypted page, it contains 0. */ +#define FIL_PAGE_FCRC32_KEY_VERSION 0 + +/** page_compressed without innodb_checksum_algorithm=full_crc32 @{ */ +/** Number of bytes used to store actual payload data size on +page_compressed pages when not using full_crc32. */ +#define FIL_PAGE_COMP_SIZE 0 + +/** Number of bytes for FIL_PAGE_COMP_SIZE */ +#define FIL_PAGE_COMP_METADATA_LEN 2 + +/** Number of bytes used to store actual compression method +for encrypted tables when not using full_crc32. 
*/ +#define FIL_PAGE_ENCRYPT_COMP_ALGO 2 + +/** Extra header size for encrypted page_compressed pages when +not using full_crc32 */ +#define FIL_PAGE_ENCRYPT_COMP_METADATA_LEN 4 +/* @} */ + +/** File page trailer @{ */ +#define FIL_PAGE_END_LSN_OLD_CHKSUM 8 /*!< the low 4 bytes of this are used + to store the page checksum, the + last 4 bytes should be identical + to the last 4 bytes of FIL_PAGE_LSN */ +#define FIL_PAGE_DATA_END 8 /*!< size of the page trailer */ + +/** Store the last 4 bytes of FIL_PAGE_LSN */ +#define FIL_PAGE_FCRC32_END_LSN 8 + +/** Store crc32 checksum at the end of the page */ +#define FIL_PAGE_FCRC32_CHECKSUM 4 +/* @} */ + +/** File page types (values of FIL_PAGE_TYPE) @{ */ +/** page_compressed, encrypted=YES (not used for full_crc32) */ +constexpr uint16_t FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED= 37401; +/** page_compressed (not used for full_crc32) */ +constexpr uint16_t FIL_PAGE_PAGE_COMPRESSED= 34354; +/** B-tree index page */ +constexpr uint16_t FIL_PAGE_INDEX= 17855; +/** R-tree index page (SPATIAL INDEX) */ +constexpr uint16_t FIL_PAGE_RTREE= 17854; +/** Undo log page */ +constexpr uint16_t FIL_PAGE_UNDO_LOG= 2; +/** Index node (of file-in-file metadata) */ +constexpr uint16_t FIL_PAGE_INODE= 3; +/** Insert buffer free list */ +constexpr uint16_t FIL_PAGE_IBUF_FREE_LIST= 4; +/** Freshly allocated page */ +constexpr uint16_t FIL_PAGE_TYPE_ALLOCATED= 0; +/** Change buffer bitmap (pages n*innodb_page_size+1) */ +constexpr uint16_t FIL_PAGE_IBUF_BITMAP= 5; +/** System page */ +constexpr uint16_t FIL_PAGE_TYPE_SYS= 6; +/** Transaction system data */ +constexpr uint16_t FIL_PAGE_TYPE_TRX_SYS= 7; +/** Tablespace header (page 0) */ +constexpr uint16_t FIL_PAGE_TYPE_FSP_HDR= 8; +/** Extent descriptor page (pages n*innodb_page_size, except 0) */ +constexpr uint16_t FIL_PAGE_TYPE_XDES= 9; +/** Uncompressed BLOB page */ +constexpr uint16_t FIL_PAGE_TYPE_BLOB= 10; +/** First ROW_FORMAT=COMPRESSED BLOB page */ +constexpr uint16_t FIL_PAGE_TYPE_ZBLOB= 11; +/** Subsequent ROW_FORMAT=COMPRESSED BLOB page */ +constexpr uint16_t FIL_PAGE_TYPE_ZBLOB2= 12; +/** In old tablespaces, garbage in FIL_PAGE_TYPE is replaced with this +value when flushing pages. */ +constexpr uint16_t FIL_PAGE_TYPE_UNKNOWN= 13; + +/* File page types introduced in MySQL 5.7, not supported in MariaDB */ +//constexpr uint16_t FIL_PAGE_COMPRESSED = 14; +//constexpr uint16_t FIL_PAGE_ENCRYPTED = 15; +//constexpr uint16_t FIL_PAGE_COMPRESSED_AND_ENCRYPTED = 16; +//constexpr FIL_PAGE_ENCRYPTED_RTREE = 17; +/** Clustered index root page after instant ADD COLUMN */ +constexpr uint16_t FIL_PAGE_TYPE_INSTANT= 18; + +/** Used by i_s.cc to index into the text description. +Note: FIL_PAGE_TYPE_INSTANT maps to the same as FIL_PAGE_INDEX. */ +constexpr uint16_t FIL_PAGE_TYPE_LAST= FIL_PAGE_TYPE_UNKNOWN; + +/** Set in FIL_PAGE_TYPE for full_crc32 pages in page_compressed format. +If the flag is set, then the following holds for the remaining bits +of FIL_PAGE_TYPE: +Bits 0..7 will contain the compressed page size in bytes. +Bits 8..14 are reserved and must be 0. 
+*/
+constexpr uint16_t FIL_PAGE_COMPRESS_FCRC32_MARKER= 15;
+/* @} */
+
+/** @return whether the page type is B-tree or R-tree index */
+inline bool fil_page_type_is_index(uint16_t page_type)
+{
+	switch (page_type) {
+	case FIL_PAGE_TYPE_INSTANT:
+	case FIL_PAGE_INDEX:
+	case FIL_PAGE_RTREE:
+		return(true);
+	}
+	return(false);
+}
+
+/** Check whether the page is an index page (either a regular B-tree index
+or an R-tree index) */
+#define fil_page_index_page_check(page)			\
+	fil_page_type_is_index(fil_page_get_type(page))
+
+/** Get the file page type.
+@param[in]	page	file page
+@return page type */
+inline uint16_t fil_page_get_type(const byte *page)
+{
+  return mach_read_from_2(my_assume_aligned<2>(page + FIL_PAGE_TYPE));
+}
+
+#ifndef UNIV_INNOCHECKSUM
+
+/** Number of pending tablespace flushes */
+extern Atomic_counter<ulint> fil_n_pending_tablespace_flushes;
+
+/** Look up a tablespace.
+The caller should hold an InnoDB table lock or a MDL that prevents
+the tablespace from being dropped during the operation,
+or the caller should be in single-threaded crash recovery mode
+(no user connections that could drop tablespaces).
+Normally, fil_space_t::get() should be used instead.
+@param[in]	id	tablespace ID
+@return tablespace, or NULL if not found */
+fil_space_t *fil_space_get(uint32_t id)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** The tablespace memory cache */
+struct fil_system_t
+{
+  /**
+    Constructor.
+
+    Some members may require late initialisation, thus we just mark the
+    object as uninitialised. Real initialisation happens in create().
+  */
+  fil_system_t() : m_initialised(false) {}
+
+  bool is_initialised() const { return m_initialised; }
+
+  /**
+    Create the file system interface at database start.
+
+    @param[in] hash_size	hash table size
+  */
+  void create(ulint hash_size);
+
+  /** Close the file system interface at shutdown */
+  void close();
+
+private:
+  bool m_initialised;
+
+  /** Points to the last opened space in space_list. Protected with
+  fil_system.mutex. */
+  fil_space_t *space_list_last_opened= nullptr;
+
+#ifdef __linux__
+  /** available block devices that reside on non-rotational storage */
+  std::vector<dev_t> ssd;
+public:
+  /** @return whether a file system device is on non-rotational storage */
+  bool is_ssd(dev_t dev) const
+  {
+    /* Linux seems to allow up to 15 partitions per block device.
+    If the detected ssd carries "partition number 0" (it is the whole device),
+    compare the candidate file system number without the partition number. */
+    for (const auto s : ssd)
+      if (dev == s || (dev & ~15U) == s)
+        return true;
+    return false;
+  }
+#endif
+public:
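The `dev & ~15U` comparison in is_ssd() exploits the Linux convention that
the low four bits of a block device number select the partition; clearing
them maps a partition back to its whole disk. A standalone illustration
(hypothetical device numbers):

  #include <cassert>
  #include <sys/types.h>
  #include <sys/sysmacros.h>

  int main()
  {
    const dev_t disk= makedev(8, 0);  // whole disk, e.g. /dev/sda
    const dev_t part= makedev(8, 3);  // a partition, e.g. /dev/sda3
    assert((part & ~15U) == disk);
    return 0;
  }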
+  /** Detach a tablespace from the cache and close the files.
+  @param space         tablespace
+  @param detach_handle whether to detach the handle, instead of closing
+  @return detached handle
+  @retval OS_FILE_CLOSED if no handle was detached */
+  pfs_os_file_t detach(fil_space_t *space, bool detach_handle= false);
+
+  /** the mutex protecting most data fields, and some fields of fil_space_t */
+  mysql_mutex_t mutex;
+  fil_space_t*	sys_space;	/*!< The innodb_system tablespace */
+  fil_space_t*	temp_space;	/*!< The innodb_temporary tablespace */
+  /** Map of fil_space_t::id to fil_space_t* */
+  hash_table_t spaces;
+  /** tablespaces for which fil_space_t::needs_flush() holds */
+  sized_ilist<fil_space_t, unflushed_spaces_tag_t> unflushed_spaces;
+  /** number of currently open files; protected by mutex */
+  ulint n_open;
+  /** last time we noted n_open exceeding the limit; protected by mutex */
+  time_t n_open_exceeded_time;
+  /** maximum persistent tablespace id that has ever been assigned */
+  uint32_t max_assigned_id;
+  /** nonzero if fil_node_open_file_low() should avoid moving the tablespace
+  to the end of space_list, for the FIFO policy of try_to_close() */
+  ulint freeze_space_list;
+  /** List of all file spaces; opened spaces should be at the top of the list
+  to optimize try_to_close() execution. Protected with fil_system.mutex. */
+  ilist<fil_space_t, space_list_tag_t> space_list;
+  /** list of all tablespaces for which a FILE_MODIFY record has been written
+  since the latest redo log checkpoint.
+  Protected only by exclusive log_sys.latch. */
+  ilist<fil_space_t, named_spaces_tag_t> named_spaces;
+
+  /** list of all ENCRYPTED=DEFAULT tablespaces that need
+  to be converted to the current value of innodb_encrypt_tables */
+  ilist<fil_space_t, default_encrypt_tag_t> default_encrypt_tables;
+
+  /** whether fil_space_t::create() has issued a warning about
+  potential space_id reuse */
+  bool space_id_reuse_warned;
+
+  /** Add the file to the end of the opened spaces list in
+  fil_system.space_list, so that fil_space_t::try_to_close() will close
+  it only as a last resort.
+  @param space	space to add */
+  void add_opened_last_to_space_list(fil_space_t *space);
+
+  /** Move the file to the end of the opened spaces list in
+  fil_system.space_list, so that fil_space_t::try_to_close() will close
+  it only as a last resort.
+  @param space	space to move */
+  inline void move_opened_last_to_space_list(fil_space_t *space)
+  {
+    /* In the case when several files of the same space are added in a
+    row, there is no need to remove and add a space to the same position
+    in space_list. This can happen for system or temporary tablespaces. */
+    if (freeze_space_list || space_list_last_opened == space)
+      return;
+
+    space_list.erase(space_list_t::iterator(space));
+    add_opened_last_to_space_list(space);
+  }
+
+  /** Move a closed file last in fil_system.space_list, so that
+  fil_space_t::try_to_close() iterates opened files first in FIFO order,
+  i.e. first opened, first closed.
+  @param space	space to move */
+  void move_closed_last_to_space_list(fil_space_t *space)
+  {
+    if (UNIV_UNLIKELY(freeze_space_list))
+      return;
+
+    space_list_t::iterator s= space_list_t::iterator(space);
+
+    if (space_list_last_opened == space)
+    {
+      ut_ad(s != space_list.begin());
+      space_list_t::iterator prev= s;
+      space_list_last_opened= &*--prev;
+    }
+
+    space_list.erase(s);
+    space_list.push_back(*space);
+  }
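A toy sketch of the ordering policy described above, with std::list standing
in for the intrusive ilist: spaces whose files are opened move to the back,
so a scan from the front visits first-opened files first (FIFO) when a file
has to be closed.

  #include <cassert>
  #include <list>

  int main()
  {
    std::list<int> spaces{1, 2, 3};          // 1 was opened first
    spaces.splice(spaces.end(), spaces,
                  spaces.begin());           // space 1 was re-opened
    assert(spaces.front() == 2);             // 2 is now the first to close
    assert(spaces.back() == 1);
    return 0;
  }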
+  /** Return the next tablespace from the default_encrypt_tables list.
+  @param space   previous tablespace (nullptr to start from the start)
+  @param recheck whether the removal condition needs to be rechecked after
+                 the encryption parameters were changed
+  @param encrypt expected state of innodb_encrypt_tables
+  @return the next tablespace to process (n_pending_ops incremented)
+  @retval fil_system.temp_space if there is no work to do
+  @retval nullptr upon reaching the end of the iteration */
+  inline fil_space_t* default_encrypt_next(fil_space_t *space, bool recheck,
+                                           bool encrypt);
+
+  /** Extend all open data files to the recovered size */
+  ATTRIBUTE_COLD void extend_to_recv_size();
+
+  /** Determine if a tablespace associated with a file name exists.
+  @param path	tablespace file name to look for
+  @return a matching tablespace */
+  inline fil_space_t *find(const char *path) const;
+};
+
+/** The tablespace memory cache. */
+extern fil_system_t fil_system;
+
+inline void fil_space_t::reacquire()
+{
+  ut_d(uint32_t n=) n_pending.fetch_add(1, std::memory_order_relaxed);
+#ifdef SAFE_MUTEX
+  if (mysql_mutex_is_owner(&fil_system.mutex)) return;
+  ut_ad(n & PENDING);
+  ut_ad(UT_LIST_GET_FIRST(chain)->is_open());
+#endif /* SAFE_MUTEX */
+}
+
+/** Note that operations on the tablespace must stop. */
+inline void fil_space_t::set_stopping()
+{
+  mysql_mutex_assert_owner(&fil_system.mutex);
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+  static_assert(STOPPING_WRITES == 1U << 30, "compatibility");
+  __asm__ __volatile__("lock btsl $30, %0" : "+m" (n_pending));
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+  static_assert(STOPPING_WRITES == 1U << 30, "compatibility");
+  _interlockedbittestandset(reinterpret_cast<volatile long*>(&n_pending), 30);
+#else
+  n_pending.fetch_or(STOPPING_WRITES, std::memory_order_relaxed);
+#endif
+}
+
+inline void fil_space_t::clear_stopping()
+{
+  mysql_mutex_assert_owner(&fil_system.mutex);
+  static_assert(STOPPING_WRITES == 1U << 30, "compatibility");
+  ut_d(auto n=) n_pending.fetch_sub(STOPPING_WRITES, std::memory_order_relaxed);
+  ut_ad((n & STOPPING) == STOPPING_WRITES);
+}
+
+/** Flush pending writes from the file system cache to the file. */
+template<bool have_reference> inline void fil_space_t::flush()
+{
+  mysql_mutex_assert_not_owner(&fil_system.mutex);
+  ut_ad(!have_reference || (pending() & PENDING));
+  ut_ad(purpose == FIL_TYPE_TABLESPACE || purpose == FIL_TYPE_IMPORT);
+  if (srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC)
+  {
+    ut_ad(!is_in_unflushed_spaces);
+    ut_ad(!needs_flush());
+  }
+  else if (have_reference)
+    flush_low();
+  else
+  {
+    if (!(acquire_low(STOPPING | CLOSING) & (STOPPING | CLOSING)))
+    {
+      flush_low();
+      release();
+    }
+  }
+}
+
+/** @return the size in pages (0 if unreadable) */
+inline uint32_t fil_space_t::get_size()
+{
+  if (!size)
+  {
+    mysql_mutex_lock(&fil_system.mutex);
+    read_page0();
+    mysql_mutex_unlock(&fil_system.mutex);
+  }
+  return size;
+}
+
+#include "fil0crypt.h"
+
+/*******************************************************************//**
+Assigns a new space id for a new single-table tablespace. This works simply by
+incrementing the global counter. If 4 billion id's is not enough, we may need
+to recycle id's.
+@return true if assigned, false if not */
+bool fil_assign_new_space_id(uint32_t *space_id);
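A hedged sketch of the two instantiations of fil_space_t::flush() defined
above (space and other_space are hypothetical, already-looked-up
tablespaces): with have_reference=true the caller asserts that it already
holds a reference, while flush<false>() acquires and releases one
internally, silently skipping tablespaces that are stopping or closing.

  // Caller already holds a reference from fil_space_t::get():
  space->flush<true>();
  space->release();

  // No reference held; flush<false>() pins the tablespace itself:
  other_space->flush<false>();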
+/** Free a tablespace object from the tablespace memory cache.
+Closes the files in the chain but does not delete them.
+There must not be any pending i/o's or flushes on the files.
+@param id         tablespace identifier
+@param x_latched  whether the caller holds exclusive fil_space_t::latch
+@return true if success */
+bool fil_space_free(uint32_t id, bool x_latched);
+
+/** Set the recovered size of a tablespace in pages.
+@param id	tablespace ID
+@param size	recovered size in pages
+@param flags	tablespace flags */
+void fil_space_set_recv_size_and_flags(uint32_t id, uint32_t size,
+                                       uint32_t flags);
+
+/*******************************************************************//**
+Sets the max tablespace id counter if the given number is bigger than the
+previous value. */
+void fil_set_max_space_id_if_bigger(uint32_t max_id);
+
+MY_ATTRIBUTE((warn_unused_result))
+/** Delete a tablespace and associated .ibd file.
+@param id    tablespace identifier
+@return detached file handle (to be closed by the caller)
+@return OS_FILE_CLOSED if no file existed */
+pfs_os_file_t fil_delete_tablespace(uint32_t id);
+
+/** Close a single-table tablespace on failed IMPORT TABLESPACE.
+The tablespace must be cached in the memory cache.
+Free all pages used by the tablespace. */
+void fil_close_tablespace(uint32_t id);
+
+/*******************************************************************//**
+Allocates and builds a file name from a path, a table or tablespace name
+and a suffix. The string must be freed by the caller with ut_free().
+@param[in] path NULL, or the directory path, or the full path and filename
+@param[in] name {} if path is full, or the table/tablespace name
+@param[in] ext the file extension to use
+@param[in] trim_name true if the last name on the path should be trimmed
+@return own: file name */
+char* fil_make_filepath(const char *path, const fil_space_t::name_type &name,
+                        ib_extention ext, bool trim_name);
+
+char *fil_make_filepath(const char* path, const table_name_t name,
+                        ib_extention suffix, bool strip_name);
+
+/** Create a tablespace file.
+@param[in]	space_id	Tablespace ID
+@param[in]	name		Tablespace name in dbname/tablename format.
+@param[in]	path		Path and filename of the datafile to create.
+@param[in]	flags		Tablespace flags
+@param[in]	size		Initial size of the tablespace file in pages,
+must be >= FIL_IBD_FILE_INITIAL_SIZE
+@param[in]	mode		MariaDB encryption mode
+@param[in]	key_id		MariaDB encryption key_id
+@param[out]	err		DB_SUCCESS or error code
+@return	the created tablespace
+@retval	NULL	on error */
+fil_space_t*
+fil_ibd_create(
+	uint32_t	space_id,
+	const table_name_t name,
+	const char*	path,
+	uint32_t	flags,
+	uint32_t	size,
+	fil_encryption_t mode,
+	uint32_t	key_id,
+	dberr_t*	err)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Try to adjust FSP_SPACE_FLAGS if they differ from the expectations.
+(Typically when upgrading from MariaDB 10.1.0..10.1.20.)
+@param[in,out]	space	tablespace
+@param[in]	flags	desired tablespace flags */
+void fsp_flags_try_adjust(fil_space_t *space, uint32_t flags);
+
+/********************************************************************//**
+Tries to open a single-table tablespace and optionally checks that the
+space id in it is correct. If this does not succeed, it prints an error
+message to the .err log. This function is used to open a tablespace when we
+start up mysqld, and also in IMPORT TABLESPACE.
+NOTE that we assume this operation is used either at database startup
+or under the protection of dict_sys.latch, so that two users cannot
+race here. This operation does not leave the file associated with the
+tablespace open, but closes it after we have looked at the space id in it.
+ +If the validate boolean is set, we read the first page of the file and +check that the space id in the file is what we expect. We assume that +this function runs much faster if no check is made, since accessing the +file inode probably is much faster (the OS caches them) than accessing +the first page of the file. This boolean may be initially false, but if +a remote tablespace is found it will be changed to true. + +@param[in] validate 0=maybe missing, 1=do not validate, 2=validate +@param[in] purpose FIL_TYPE_TABLESPACE or FIL_TYPE_TEMPORARY +@param[in] id tablespace ID +@param[in] flags expected FSP_SPACE_FLAGS +@param[in] name table name +If file-per-table, it is the table name in the databasename/tablename format +@param[in] path_in expected filepath, usually read from dictionary +@param[out] err DB_SUCCESS or error code +@return tablespace +@retval NULL if the tablespace could not be opened */ +fil_space_t* +fil_ibd_open( + unsigned validate, + fil_type_t purpose, + uint32_t id, + uint32_t flags, + fil_space_t::name_type name, + const char* path_in, + dberr_t* err = NULL) + MY_ATTRIBUTE((warn_unused_result)); + +enum fil_load_status { + /** The tablespace file(s) were found and valid. */ + FIL_LOAD_OK, + /** The name no longer matches space_id */ + FIL_LOAD_ID_CHANGED, + /** The file(s) were not found */ + FIL_LOAD_NOT_FOUND, + /** The file(s) were not valid */ + FIL_LOAD_INVALID, + /** The tablespace file was deferred to open */ + FIL_LOAD_DEFER +}; + +/** Open a single-file tablespace and add it to the InnoDB data structures. +@param[in] space_id tablespace ID +@param[in] filename path/to/databasename/tablename.ibd +@param[out] space the tablespace, or NULL on error +@return status of the operation */ +enum fil_load_status +fil_ibd_load(uint32_t space_id, const char *filename, fil_space_t *&space) + MY_ATTRIBUTE((warn_unused_result)); + +/** Determine if a matching tablespace exists in the InnoDB tablespace +memory cache. Note that if we have not done a crash recovery at the database +startup, there may be many tablespaces which are not yet in the memory cache. +@param[in] id Tablespace ID +@param[in] table_flags table flags +@return the tablespace +@retval NULL if no matching tablespace exists in the memory cache */ +fil_space_t *fil_space_for_table_exists_in_mem(uint32_t id, + uint32_t table_flags); + +/** Try to extend a tablespace if it is smaller than the specified size. +@param[in,out] space tablespace +@param[in] size desired size in pages +@return whether the tablespace is at least as big as requested */ +bool fil_space_extend(fil_space_t *space, uint32_t size); + +/** Flush to disk the writes in file spaces of the given type +possibly cached by the OS. */ +void fil_flush_file_spaces(); +/******************************************************************//** +Checks the consistency of the tablespace cache. +@return true if ok */ +bool fil_validate(); +/*********************************************************************//** +Sets the file page type. */ +void +fil_page_set_type( +/*==============*/ + byte* page, /*!< in/out: file page */ + ulint type); /*!< in: type */ + +/********************************************************************//** +Delete the tablespace file and any related files like .cfg. +This should not be called for temporary tables. */ +void +fil_delete_file( +/*============*/ + const char* path); /*!< in: filepath of the ibd tablespace */ + +/** Look up a tablespace. 
+@param id tablespace identifier
+@return tablespace
+@retval nullptr if not found */
+fil_space_t *fil_space_get_by_id(uint32_t id);
+
+/** Note that a non-predefined persistent tablespace has been modified
+by redo log.
+@param[in,out]	space	tablespace */
+void
+fil_names_dirty(
+	fil_space_t*	space);
+
+
+bool fil_comp_algo_loaded(ulint comp_algo);
+
+/** On a log checkpoint, reset fil_names_dirty_and_write() flags
+and write out FILE_MODIFY if needed, and write FILE_CHECKPOINT.
+@param lsn	checkpoint LSN
+@return current LSN */
+lsn_t fil_names_clear(lsn_t lsn);
+
+#ifdef UNIV_ENABLE_UNIT_TEST_MAKE_FILEPATH
+void test_make_filepath();
+#endif /* UNIV_ENABLE_UNIT_TEST_MAKE_FILEPATH */
+
+/** Determine the block size of the data file.
+@param[in]	space	tablespace
+@param[in]	offset	page number
+@return block size */
+ulint fil_space_get_block_size(const fil_space_t* space, unsigned offset);
+
+/** Check whether an encryption key was found.
+@param crypt_data	encryption data
+@param f_name		file name
+@return whether the encryption key was found */
+bool fil_crypt_check(fil_space_crypt_t *crypt_data, const char *f_name);
+
+#endif /* UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/include/fil0pagecompress.h b/storage/innobase/include/fil0pagecompress.h
new file mode 100644
index 00000000..2927da3c
--- /dev/null
+++ b/storage/innobase/include/fil0pagecompress.h
@@ -0,0 +1,57 @@
+/*****************************************************************************
+
+Copyright (C) 2013, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#ifndef fil0pagecompress_h
+#define fil0pagecompress_h
+
+#include "fsp0fsp.h"
+
+/******************************************************************//**
+@file include/fil0pagecompress.h
+Helper functions for extracting/storing page compression and
+atomic writes information to table space.
+
+Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com
+***********************************************************************/
+
+/** Compress a page_compressed page before writing to a data file.
+@param[in]	buf		page to be compressed
+@param[out]	out_buf		compressed page
+@param[in]	flags		tablespace flags
+@param[in]	block_size	file system block size
+@param[in]	encrypted	whether the page will be subsequently encrypted
+@return actual length of compressed page
+@retval 0 if the page was not compressed */
+ulint fil_page_compress(
+	const byte*	buf,
+	byte*		out_buf,
+	uint32_t	flags,
+	ulint		block_size,
+	bool		encrypted)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Decompress a page that may be subject to page_compressed compression.
+@param[in,out] tmp_buf temporary buffer (of innodb_page_size) +@param[in,out] buf compressed page buffer +@param[in] flags tablespace flags +@return size of the compressed data +@retval 0 if decompression failed +@retval srv_page_size if the page was not compressed */ +ulint fil_page_decompress(byte *tmp_buf, byte *buf, uint32_t flags) + MY_ATTRIBUTE((nonnull, warn_unused_result)); +#endif diff --git a/storage/innobase/include/fsp0file.h b/storage/innobase/include/fsp0file.h new file mode 100644 index 00000000..67e79f1a --- /dev/null +++ b/storage/innobase/include/fsp0file.h @@ -0,0 +1,509 @@ +/***************************************************************************** + +Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/fsp0file.h +Tablespace data file implementation. + +Created 2013-7-26 by Kevin Lewis +*******************************************************/ + +#ifndef fsp0file_h +#define fsp0file_h + +#include "mem0mem.h" +#include "os0file.h" +#include "fil0fil.h" + +/** Types of raw partitions in innodb_data_file_path */ +enum device_t { + SRV_NOT_RAW = 0, /*!< Not a raw partition */ + SRV_NEW_RAW, /*!< A 'newraw' partition, only to be + initialized */ + SRV_OLD_RAW /*!< An initialized raw partition */ +}; + +/** Data file control information. 
*/ +class Datafile { + + friend class Tablespace; + friend class SysTablespace; + +public: + + Datafile() + : + m_filepath(), + m_filename(), + m_handle(), + m_open_flags(OS_FILE_OPEN), + m_size(), + m_order(), + m_type(SRV_NOT_RAW), + m_space_id(UINT32_MAX), + m_flags(), + m_exists(), + m_is_valid(), + m_first_page(), + m_last_os_error(), + m_file_info() + { + /* No op */ + } + + Datafile(uint32_t flags, uint32_t size, ulint order) + : + m_filepath(), + m_filename(), + m_handle(), + m_open_flags(OS_FILE_OPEN), + m_size(size), + m_order(order), + m_type(SRV_NOT_RAW), + m_space_id(UINT32_MAX), + m_flags(flags), + m_exists(), + m_is_valid(), + m_first_page(), + m_last_os_error(), + m_file_info() + { + } + + Datafile(const Datafile& file) + : + m_handle(file.m_handle), + m_open_flags(file.m_open_flags), + m_size(file.m_size), + m_order(file.m_order), + m_type(file.m_type), + m_space_id(file.m_space_id), + m_flags(file.m_flags), + m_exists(file.m_exists), + m_is_valid(file.m_is_valid), + m_first_page(), + m_last_os_error(), + m_file_info() + { + if (file.m_filepath != NULL) { + m_filepath = mem_strdup(file.m_filepath); + ut_a(m_filepath != NULL); + set_filename(); + } else { + m_filepath = NULL; + m_filename = NULL; + } + } + + virtual ~Datafile() + { + shutdown(); + } + + Datafile& operator=(const Datafile& file) + { + ut_a(this != &file); + + m_size = file.m_size; + m_order = file.m_order; + m_type = file.m_type; + + ut_a(m_handle == OS_FILE_CLOSED); + m_handle = file.m_handle; + + m_exists = file.m_exists; + m_is_valid = file.m_is_valid; + m_open_flags = file.m_open_flags; + m_space_id = file.m_space_id; + m_flags = file.m_flags; + m_last_os_error = 0; + + if (m_filepath != NULL) { + ut_free(m_filepath); + m_filepath = NULL; + m_filename = NULL; + } + + if (file.m_filepath != NULL) { + m_filepath = mem_strdup(file.m_filepath); + ut_a(m_filepath != NULL); + set_filename(); + } + + /* Do not make a copy of the first page, + it should be reread if needed */ + m_first_page = NULL; + + return(*this); + } + + /** Initialize the tablespace flags */ + void init(uint32_t flags) { m_flags= flags; } + + /** Release the resources. */ + virtual void shutdown(); + + /** Open a data file in read-only mode to check if it exists + so that it can be validated. + @param[in] strict whether to issue error messages + @return DB_SUCCESS or error code */ + dberr_t open_read_only(bool strict); + + /** Open a data file in read-write mode during start-up so that + doublewrite pages can be restored and then it can be validated. + @return DB_SUCCESS or error code */ + inline dberr_t open_read_write() + MY_ATTRIBUTE((warn_unused_result)); + + /** Initialize OS specific file info. */ + void init_file_info(); + + /** Close a data file. + @return DB_SUCCESS or error code */ + dberr_t close(); + + /** Make a full filepath from a directory path and a filename. + Prepend the dirpath to filename using the extension given. + If dirpath is NULL, prepend the default datadir to filepath. + Store the result in m_filepath. + @param dirpath directory path + @param name tablespace (table) name + @param ext filename extension */ + void make_filepath(const char* dirpath, fil_space_t::name_type name, + ib_extention ext); + + /** Set the filepath by duplicating the filepath sent in */ + void set_filepath(const char* filepath); + + /** Validates the datafile and checks that it conforms with + the expected space ID and flags. The file should exist and be + successfully opened in order for this function to validate it. 
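A minimal model of the validation contract spelled out above, assuming the documented order (open the file read-only first, then validate against the expected ID and flags); the names below are stand-ins for illustration, not the real Datafile API:

```cpp
#include <cassert>
#include <cstdint>

// Simplified stand-in: models only "must be open before validating".
struct datafile_model
{
  bool open_ = false;
  bool valid_ = false;              // mirrors Datafile::m_is_valid
  uint32_t space_id_ = 7, flags_ = 0;

  void open_read_only() { open_ = true; }

  bool validate_to_dd(uint32_t id, uint32_t flags)
  {
    assert(open_);                  // file must be opened first
    valid_ = space_id_ == id && flags_ == flags;
    return valid_;
  }

  void close() { open_ = false; }
};

int main()
{
  datafile_model f;
  f.open_read_only();               // open first...
  assert(f.validate_to_dd(7, 0));   // ...then check id/flags from page 0
  f.close();
}
```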
+	@param[in]	space_id	The expected tablespace ID.
+	@param[in]	flags		The expected tablespace flags.
+	@retval DB_SUCCESS if tablespace is valid, DB_ERROR if not.
+	m_is_valid is also set true on success, else false. */
+	dberr_t validate_to_dd(uint32_t space_id, uint32_t flags)
+		MY_ATTRIBUTE((warn_unused_result));
+
+	/** Validates this datafile for the purpose of recovery.
+	The file should exist and be successfully opened. We initially
+	open it in read-only mode because we just want to read the SpaceID.
+	However, if the first page is corrupt and needs to be restored
+	from the doublewrite buffer, we will reopen it in write mode and
+	try to restore that page.
+	@retval DB_SUCCESS if tablespace is valid, DB_ERROR if not.
+	m_is_valid is also set true on success, else false. */
+	dberr_t validate_for_recovery()
+		MY_ATTRIBUTE((warn_unused_result));
+
+	/** Checks the consistency of the first page of a datafile when the
+	tablespace is opened. This occurs before the fil_space_t is created
+	so the Space ID found here must not already be in use by an open
+	tablespace.
+	m_is_valid is set true on success, else false.
+	@retval DB_SUCCESS if the datafile is valid
+	@retval DB_CORRUPTION if the datafile is not readable
+	@retval DB_TABLESPACE_EXISTS if there is a duplicate space_id */
+	dberr_t validate_first_page()
+		MY_ATTRIBUTE((warn_unused_result));
+
+	/** Get Datafile::m_filepath.
+	@return m_filepath */
+	const char*	filepath()	const
+	{
+		return(m_filepath);
+	}
+
+	/** Get Datafile::m_handle.
+	@return m_handle */
+	pfs_os_file_t	handle()	const
+	{
+		return(m_handle);
+	}
+
+	/** @return detached file handle */
+	pfs_os_file_t detach()
+	{
+		pfs_os_file_t detached = m_handle;
+		m_handle = OS_FILE_CLOSED;
+		return detached;
+	}
+
+	/** Get Datafile::m_order.
+	@return m_order */
+	ulint	order()	const
+	{
+		return(m_order);
+	}
+
+	/** Get Datafile::m_space_id.
+	@return m_space_id */
+	uint32_t space_id() const { return m_space_id; }
+
+	/** Get Datafile::m_flags.
+	@return m_flags */
+	uint32_t flags() const { return m_flags; }
+
+	/**
+	@return true if m_handle is open, false if not */
+	bool	is_open()	const { return m_handle != OS_FILE_CLOSED; }
+
+	/** Get Datafile::m_is_valid.
+	@return m_is_valid */
+	bool	is_valid()	const
+	{
+		return(m_is_valid);
+	}
+
+	/** Get the last OS error reported
+	@return m_last_os_error */
+	ulint	last_os_error()		const
+	{
+		return(m_last_os_error);
+	}
+
+	/** Check whether the file is empty.
+	@return true if file is empty */
+	bool	is_empty_file()		const
+	{
+#ifdef _WIN32
+		os_offset_t	offset =
+			(os_offset_t) m_file_info.nFileSizeLow
+			| ((os_offset_t) m_file_info.nFileSizeHigh << 32);
+
+		return (offset == 0);
+#else
+		return (m_file_info.st_size == 0);
+#endif
+	}
+
+	/** Check if the file exists.
+	@return true if file exists. */
+	bool exists() const { return m_exists; }
+
+	/** Test if the filepath provided looks the same as this filepath
+	by string comparison. If they are two different paths to the same
+	file, same_as() will be used to show that after the files are opened.
+	@param[in]	other	filepath to compare with
+	@retval true if it is the same filename by char comparison
+	@retval false if it looks different */
+	bool same_filepath_as(const char* other) const;
+
+	/** Test if another opened datafile is the same file as this object.
+	@param[in]	other	Datafile to compare with
+	@return true if it is the same file, else false */
+	bool same_as(const Datafile&	other) const;
+
+	/** Get access to the first data page.
+ It is valid after open_read_only() succeeded. + @return the first data page */ + const byte* get_first_page() const { return(m_first_page); } + + void set_space_id(uint32_t space_id) { m_space_id= space_id; } + + void set_flags(uint32_t flags) { m_flags = flags; } +private: + /** Free the filepath buffer. */ + void free_filepath(); + + /** Set the filename pointer to the start of the file name + in the filepath. */ + void set_filename() + { + if (!m_filepath) { + return; + } + + if (char *last_slash = strrchr(m_filepath, '/')) { +#if _WIN32 + if (char *last = strrchr(m_filepath, '\\')) { + if (last > last_slash) { + last_slash = last; + } + } +#endif + m_filename = last_slash + 1; + } else { + m_filename = m_filepath; + } + } + + /** Create/open a data file. + @param[in] read_only_mode if true, then readonly mode checks + are enforced. + @return DB_SUCCESS or error code */ + dberr_t open_or_create(bool read_only_mode) + MY_ATTRIBUTE((warn_unused_result)); + + /** Reads a few significant fields from the first page of the + datafile, which must already be open. + @param[in] read_only_mode if true, then readonly mode checks + are enforced. + @return DB_SUCCESS or DB_IO_ERROR if page cannot be read */ + dberr_t read_first_page(bool read_only_mode) + MY_ATTRIBUTE((warn_unused_result)); + + /** Free the first page from memory when it is no longer needed. */ + void free_first_page(); + + /** Set the Datafile::m_open_flags. + @param open_flags The Open flags to set. */ + void set_open_flags(os_file_create_t open_flags) + { + m_open_flags = open_flags; + }; + + /** Determine if this datafile is on a Raw Device + @return true if it is a RAW device. */ + bool is_raw_device() + { + return(m_type != SRV_NOT_RAW); + } + + /* DATA MEMBERS */ + +protected: + /** Physical file path with base name and extension */ + char* m_filepath; + +private: + /** Determine the space id of the given file descriptor by reading + a few pages from the beginning of the .ibd file. + @return DB_SUCCESS if space id was successfully identified, + else DB_ERROR. */ + dberr_t find_space_id(); + + /** Points into m_filepath to the file name with extension */ + char* m_filename; + + /** Open file handle */ + pfs_os_file_t m_handle; + + /** Flags to use for opening the data file */ + os_file_create_t m_open_flags; + + /** size in megabytes or pages; converted from megabytes to + pages in SysTablespace::normalize_size() */ + uint32_t m_size; + + /** ordinal position of this datafile in the tablespace */ + ulint m_order; + + /** The type of the data file */ + device_t m_type; + + /** Tablespace ID. Contained in the datafile header. + If this is a system tablespace, FSP_SPACE_ID is only valid + in the first datafile. */ + uint32_t m_space_id; + + /** Tablespace flags. Contained in the datafile header. + If this is a system tablespace, FSP_SPACE_FLAGS are only valid + in the first datafile. */ + uint32_t m_flags; + + /** true if file already existed on startup */ + bool m_exists; + + /* true if the tablespace is valid */ + bool m_is_valid; + + /** Aligned buffer to hold first page */ + byte* m_first_page; + +protected: + /** Last OS error received so it can be reported if needed. */ + ulint m_last_os_error; + +public: + /** true if table is deferred during recovery */ + bool m_defer=false; + /** Use the following to determine the uniqueness of this datafile. */ +#ifdef _WIN32 + /* Use fields dwVolumeSerialNumber, nFileIndexLow, nFileIndexHigh. */ + BY_HANDLE_FILE_INFORMATION m_file_info; +#else + /* Use field st_ino. 
*/ + struct stat m_file_info; +#endif /* WIN32 */ +}; + + +/** Data file control information. */ +class RemoteDatafile : public Datafile +{ +private: + /** Link filename (full path) */ + char* m_link_filepath; + +public: + + RemoteDatafile() + : + m_link_filepath() + { + /* No op - base constructor is called. */ + } + + RemoteDatafile(const char*, ulint, ulint) + : + m_link_filepath() + { + /* No op - base constructor is called. */ + } + + ~RemoteDatafile() override + { + shutdown(); + } + + /** Release the resources. */ + void shutdown() override; + + /** Get the link filepath. + @return m_link_filepath */ + const char* link_filepath() const + { + return(m_link_filepath); + } + + /** Attempt to read the contents of an .isl file into m_filepath. + @param name table name + @return filepath() + @retval nullptr if the .isl file does not exist or cannot be read */ + const char* open_link_file(const fil_space_t::name_type name); + + /** Delete an InnoDB Symbolic Link (ISL) file. */ + void delete_link_file(void); + + /****************************************************************** + Global Static Functions; Cannot refer to data members. + ******************************************************************/ + + /** Create InnoDB Symbolic Link (ISL) file. + @param name tablespace name + @param filepath full file name + @return DB_SUCCESS or error code */ + static dberr_t create_link_file(fil_space_t::name_type name, + const char *filepath); + + /** Delete an InnoDB Symbolic Link (ISL) file by name. + @param name tablespace name */ + static void delete_link_file(fil_space_t::name_type name); +}; +#endif /* fsp0file_h */ diff --git a/storage/innobase/include/fsp0fsp.h b/storage/innobase/include/fsp0fsp.h new file mode 100644 index 00000000..26261554 --- /dev/null +++ b/storage/innobase/include/fsp0fsp.h @@ -0,0 +1,762 @@ +/***************************************************************************** + +Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/fsp0fsp.h +File space management + +Created 12/18/1995 Heikki Tuuri +*******************************************************/ + +#ifndef fsp0fsp_h +#define fsp0fsp_h + +#include "assume_aligned.h" +#include "fsp0types.h" +#include "fut0lst.h" +#include "ut0byte.h" + +#ifndef UNIV_INNOCHECKSUM +#include "mtr0mtr.h" +#include "page0types.h" +#include "rem0types.h" +#else +# include "mach0data.h" +#endif /* !UNIV_INNOCHECKSUM */ + +/** @return the PAGE_SSIZE flags for the current innodb_page_size */ +#define FSP_FLAGS_PAGE_SSIZE() \ + ((srv_page_size == UNIV_PAGE_SIZE_ORIG) ? 
\ + 0U : (srv_page_size_shift - UNIV_ZIP_SIZE_SHIFT_MIN + 1) \ + << FSP_FLAGS_POS_PAGE_SSIZE) + +/** @return the PAGE_SSIZE flags for the current innodb_page_size in +full checksum format */ +#define FSP_FLAGS_FCRC32_PAGE_SSIZE() \ + ((srv_page_size_shift - UNIV_ZIP_SIZE_SHIFT_MIN + 1) \ + << FSP_FLAGS_FCRC32_POS_PAGE_SSIZE) + +/* @defgroup Compatibility macros for MariaDB 10.1.0 through 10.1.20; +see the table in fsp0types.h @{ */ +/** Zero relative shift position of the PAGE_COMPRESSION field */ +#define FSP_FLAGS_POS_PAGE_COMPRESSION_MARIADB101 \ + (FSP_FLAGS_POS_ATOMIC_BLOBS \ + + FSP_FLAGS_WIDTH_ATOMIC_BLOBS) +/** Zero relative shift position of the PAGE_COMPRESSION_LEVEL field */ +#define FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL_MARIADB101 \ + (FSP_FLAGS_POS_PAGE_COMPRESSION_MARIADB101 + 1) +/** Zero relative shift position of the ATOMIC_WRITES field */ +#define FSP_FLAGS_POS_ATOMIC_WRITES_MARIADB101 \ + (FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL_MARIADB101 + 4) +/** Zero relative shift position of the PAGE_SSIZE field */ +#define FSP_FLAGS_POS_PAGE_SSIZE_MARIADB101 \ + (FSP_FLAGS_POS_ATOMIC_WRITES_MARIADB101 + 2) + +/** Bit mask of the PAGE_COMPRESSION field */ +#define FSP_FLAGS_MASK_PAGE_COMPRESSION_MARIADB101 \ + (1U << FSP_FLAGS_POS_PAGE_COMPRESSION_MARIADB101) +/** Bit mask of the PAGE_COMPRESSION_LEVEL field */ +#define FSP_FLAGS_MASK_PAGE_COMPRESSION_LEVEL_MARIADB101 \ + (15U << FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL_MARIADB101) +/** Bit mask of the ATOMIC_WRITES field */ +#define FSP_FLAGS_MASK_ATOMIC_WRITES_MARIADB101 \ + (3U << FSP_FLAGS_POS_ATOMIC_WRITES_MARIADB101) +/** Bit mask of the PAGE_SSIZE field */ +#define FSP_FLAGS_MASK_PAGE_SSIZE_MARIADB101 \ + (15U << FSP_FLAGS_POS_PAGE_SSIZE_MARIADB101) + +/** Return the value of the PAGE_COMPRESSION field */ +#define FSP_FLAGS_GET_PAGE_COMPRESSION_MARIADB101(flags) \ + ((flags & FSP_FLAGS_MASK_PAGE_COMPRESSION_MARIADB101) \ + >> FSP_FLAGS_POS_PAGE_COMPRESSION_MARIADB101) +/** Return the value of the PAGE_COMPRESSION_LEVEL field */ +#define FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL_MARIADB101(flags) \ + ((flags & FSP_FLAGS_MASK_PAGE_COMPRESSION_LEVEL_MARIADB101) \ + >> FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL_MARIADB101) +/** Return the value of the PAGE_SSIZE field */ +#define FSP_FLAGS_GET_PAGE_SSIZE_MARIADB101(flags) \ + ((flags & FSP_FLAGS_MASK_PAGE_SSIZE_MARIADB101) \ + >> FSP_FLAGS_POS_PAGE_SSIZE_MARIADB101) + +/* @} */ + +/* @defgroup Tablespace Header Constants (moved from fsp0fsp.c) @{ */ + +/** Offset of the space header within a file page */ +#define FSP_HEADER_OFFSET FIL_PAGE_DATA + +/* The data structures in files are defined just as byte strings in C */ +typedef byte xdes_t; + +/* SPACE HEADER + ============ + +File space header data structure: this data structure is contained in the +first page of a space. The space for this header is reserved in every extent +descriptor page, but used only in the first. 
*/ + +/*-------------------------------------*/ +#define FSP_SPACE_ID 0 /* space id */ +#define FSP_NOT_USED 4 /* this field contained a value up to + which we know that the modifications + in the database have been flushed to + the file space; not used now */ +#define FSP_SIZE 8 /* Current size of the space in + pages */ +#define FSP_FREE_LIMIT 12 /* Minimum page number for which the + free list has not been initialized: + the pages >= this limit are, by + definition, free; note that in a + single-table tablespace where size + < 64 pages, this number is 64, i.e., + we have initialized the space + about the first extent, but have not + physically allocated those pages to the + file */ +#define FSP_SPACE_FLAGS 16 /* fsp_space_t.flags, similar to + dict_table_t::flags */ +#define FSP_FRAG_N_USED 20 /* number of used pages in the + FSP_FREE_FRAG list */ +#define FSP_FREE 24 /* list of free extents */ +#define FSP_FREE_FRAG (24 + FLST_BASE_NODE_SIZE) + /* list of partially free extents not + belonging to any segment */ +#define FSP_FULL_FRAG (24 + 2 * FLST_BASE_NODE_SIZE) + /* list of full extents not belonging + to any segment */ +#define FSP_SEG_ID (24 + 3 * FLST_BASE_NODE_SIZE) + /* 8 bytes which give the first unused + segment id */ +#define FSP_SEG_INODES_FULL (32 + 3 * FLST_BASE_NODE_SIZE) + /* list of pages containing segment + headers, where all the segment inode + slots are reserved */ +#define FSP_SEG_INODES_FREE (32 + 4 * FLST_BASE_NODE_SIZE) + /* list of pages containing segment + headers, where not all the segment + header slots are reserved */ +/*-------------------------------------*/ +/* File space header size */ +#define FSP_HEADER_SIZE (32 + 5 * FLST_BASE_NODE_SIZE) + +#define FSP_FREE_ADD 4 /* this many free extents are added + to the free list from above + FSP_FREE_LIMIT at a time */ +/* @} */ + +/* @defgroup File Segment Inode Constants (moved from fsp0fsp.c) @{ */ + +/* FILE SEGMENT INODE + ================== + +Segment inode which is created for each segment in a tablespace. NOTE: in +purge we assume that a segment having only one currently used page can be +freed in a few steps, so that the freeing cannot fill the file buffer with +bufferfixed file pages. 
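The offset arithmetic above can be sanity-checked at compile time. The sketch below assumes FLST_BASE_NODE_SIZE == 16 (the fut0lst.h value: a 4-byte list length plus two 6-byte file addresses); the remaining constants simply restate the #defines:

```cpp
#include <cstddef>

// Assumption of this sketch; see fut0lst.h for the real definition.
constexpr std::size_t FLST_BASE_NODE_SIZE = 16;

constexpr std::size_t FSP_FREE            = 24;
constexpr std::size_t FSP_FREE_FRAG       = 24 + FLST_BASE_NODE_SIZE;      // 40
constexpr std::size_t FSP_FULL_FRAG       = 24 + 2 * FLST_BASE_NODE_SIZE;  // 56
constexpr std::size_t FSP_SEG_ID          = 24 + 3 * FLST_BASE_NODE_SIZE;  // 72
constexpr std::size_t FSP_SEG_INODES_FULL = 32 + 3 * FLST_BASE_NODE_SIZE;  // 80
constexpr std::size_t FSP_SEG_INODES_FREE = 32 + 4 * FLST_BASE_NODE_SIZE;  // 96
constexpr std::size_t FSP_HEADER_SIZE     = 32 + 5 * FLST_BASE_NODE_SIZE;  // 112

// FSP_SEG_ID is an 8-byte counter, so the next field starts 8 bytes later.
static_assert(FSP_SEG_INODES_FULL == FSP_SEG_ID + 8, "layout");
// Each list base node consumes FLST_BASE_NODE_SIZE bytes.
static_assert(FSP_FREE_FRAG == FSP_FREE + FLST_BASE_NODE_SIZE, "layout");
static_assert(FSP_HEADER_SIZE == FSP_SEG_INODES_FREE + FLST_BASE_NODE_SIZE,
              "layout");

int main() {}
```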
*/ + +typedef byte fseg_inode_t; + +#define FSEG_INODE_PAGE_NODE FSEG_PAGE_DATA + /* the list node for linking + segment inode pages */ + +#define FSEG_ARR_OFFSET (FSEG_PAGE_DATA + FLST_NODE_SIZE) +/*-------------------------------------*/ +#define FSEG_ID 0 /* 8 bytes of segment id: if this is 0, + it means that the header is unused */ +#define FSEG_NOT_FULL_N_USED 8 + /* number of used segment pages in + the FSEG_NOT_FULL list */ +#define FSEG_FREE 12 + /* list of free extents of this + segment */ +#define FSEG_NOT_FULL (12 + FLST_BASE_NODE_SIZE) + /* list of partially free extents */ +#define FSEG_FULL (12 + 2 * FLST_BASE_NODE_SIZE) + /* list of full extents */ +#define FSEG_MAGIC_N (12 + 3 * FLST_BASE_NODE_SIZE) + /* magic number used in debugging */ +#define FSEG_FRAG_ARR (16 + 3 * FLST_BASE_NODE_SIZE) + /* array of individual pages + belonging to this segment in fsp + fragment extent lists */ +#define FSEG_FRAG_ARR_N_SLOTS (FSP_EXTENT_SIZE / 2) + /* number of slots in the array for + the fragment pages */ +#define FSEG_FRAG_SLOT_SIZE 4 /* a fragment page slot contains its + page number within space, FIL_NULL + means that the slot is not in use */ +/*-------------------------------------*/ +#define FSEG_INODE_SIZE \ + (16 + 3 * FLST_BASE_NODE_SIZE \ + + FSEG_FRAG_ARR_N_SLOTS * FSEG_FRAG_SLOT_SIZE) + +static constexpr byte FSEG_MAGIC_N_BYTES[4]={0x05,0xd6,0x69,0xd2}; + +#define FSEG_FILLFACTOR 8 /* If the number of unused but reserved + pages in a segment is less than + reserved pages / FSEG_FILLFACTOR, + and there are + at least FSEG_FRAG_LIMIT used pages, + then we allow a new empty extent to + be added to the segment in + fseg_alloc_free_page_general(). + Otherwise, we + use unused pages of the segment. */ + +#define FSEG_FRAG_LIMIT FSEG_FRAG_ARR_N_SLOTS + /* If the segment has >= this many + used pages, it may be expanded by + allocating extents to the segment; + until that only individual fragment + pages are allocated from the space */ + +#define FSEG_FREE_LIST_LIMIT 40 /* If the reserved size of a segment + is at least this many extents, we + allow extents to be put to the free + list of the extent: at most + FSEG_FREE_LIST_MAX_LEN many */ +#define FSEG_FREE_LIST_MAX_LEN 4 +/* @} */ + +/* @defgroup Extent Descriptor Constants (moved from fsp0fsp.c) @{ */ + +/* EXTENT DESCRIPTOR + ================= + +File extent descriptor data structure: contains bits to tell which pages in +the extent are free and which contain old tuple version to clean. */ + +/*-------------------------------------*/ +#define XDES_ID 0 /* The identifier of the segment + to which this extent belongs */ +#define XDES_FLST_NODE 8 /* The list node data structure + for the descriptors */ +#define XDES_STATE (FLST_NODE_SIZE + 8) + /* contains state information + of the extent */ +#define XDES_BITMAP (FLST_NODE_SIZE + 12) + /* Descriptor bitmap of the pages + in the extent */ +/*-------------------------------------*/ + +#define XDES_BITS_PER_PAGE 2 /* How many bits are there per page */ +#define XDES_FREE_BIT 0 /* Index of the bit which tells if + the page is free */ +#define XDES_CLEAN_BIT 1 /* NOTE: currently not used! 
+ Index of the bit which tells if + there are old versions of tuples + on the page */ +/* States of a descriptor */ +#define XDES_FREE 1 /* extent is in free list of space */ +#define XDES_FREE_FRAG 2 /* extent is in free fragment list of + space */ +#define XDES_FULL_FRAG 3 /* extent is in full fragment list of + space */ +#define XDES_FSEG 4 /* extent belongs to a segment */ + +/** File extent data structure size in bytes. */ +#define XDES_SIZE \ + (XDES_BITMAP \ + + UT_BITS_IN_BYTES(FSP_EXTENT_SIZE * XDES_BITS_PER_PAGE)) + +/** File extent data structure size in bytes for MAX page size. */ +#define XDES_SIZE_MAX \ + (XDES_BITMAP \ + + UT_BITS_IN_BYTES(FSP_EXTENT_SIZE_MAX * XDES_BITS_PER_PAGE)) + +/** File extent data structure size in bytes for MIN page size. */ +#define XDES_SIZE_MIN \ + (XDES_BITMAP \ + + UT_BITS_IN_BYTES(FSP_EXTENT_SIZE_MIN * XDES_BITS_PER_PAGE)) + +/** Offset of the descriptor array on a descriptor page */ +#define XDES_ARR_OFFSET (FSP_HEADER_OFFSET + FSP_HEADER_SIZE) + +/** +Determine if a page is marked free. +@param[in] descr extent descriptor +@param[in] offset page offset within extent +@return whether the page is free */ +inline bool xdes_is_free(const xdes_t *descr, ulint offset) +{ + ut_ad(offset < FSP_EXTENT_SIZE); + ulint index= XDES_FREE_BIT + XDES_BITS_PER_PAGE * offset; + return ut_bit_get_nth(descr[XDES_BITMAP + (index >> 3)], index & 7); +} + +#ifndef UNIV_INNOCHECKSUM +/* @} */ + +/** Read a tablespace header field. +@param[in] page first page of a tablespace +@param[in] field the header field +@return the contents of the header field */ +inline uint32_t fsp_header_get_field(const page_t* page, ulint field) +{ + return mach_read_from_4(FSP_HEADER_OFFSET + field + + my_assume_aligned(page)); +} + +/** Read the flags from the tablespace header page. +@param[in] page first page of a tablespace +@return the contents of FSP_SPACE_FLAGS */ +inline uint32_t fsp_header_get_flags(const page_t *page) +{ + return fsp_header_get_field(page, FSP_SPACE_FLAGS); +} + +/** Get the byte offset of encryption information in page 0. +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@return byte offset relative to FSP_HEADER_OFFSET */ +inline MY_ATTRIBUTE((pure, warn_unused_result)) +ulint fsp_header_get_encryption_offset(ulint zip_size) +{ + return zip_size + ? XDES_ARR_OFFSET + XDES_SIZE * zip_size / FSP_EXTENT_SIZE + : XDES_ARR_OFFSET + (XDES_SIZE << srv_page_size_shift) + / FSP_EXTENT_SIZE; +} + +/** Check the encryption key from the first page of a tablespace. +@param[in] fsp_flags tablespace flags +@param[in] page first page of a tablespace +@return true if success */ +bool +fsp_header_check_encryption_key( + ulint fsp_flags, + page_t* page); + +/** Initialize a tablespace header. +@param[in,out] space tablespace +@param[in] size current size in blocks +@param[in,out] mtr mini-transaction +@return error code */ +dberr_t fsp_header_init(fil_space_t *space, uint32_t size, mtr_t *mtr) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Create a new segment. 
+@param space tablespace +@param byte_offset byte offset of the created segment header +@param mtr mini-transaction +@param err error code +@param has_done_reservation whether fsp_reserve_free_extents() was invoked +@param block block where segment header is placed, + or NULL to allocate an additional page for that +@return the block where the segment header is placed, x-latched +@retval nullptr if could not create segment */ +buf_block_t* +fseg_create(fil_space_t *space, ulint byte_offset, mtr_t *mtr, dberr_t *err, + bool has_done_reservation= false, buf_block_t *block= nullptr) + MY_ATTRIBUTE((nonnull(1,3,4), warn_unused_result)); + +/** Calculate the number of pages reserved by a segment, +and how many pages are currently used. +@param[in] block buffer block containing the file segment header +@param[in] header file segment header +@param[out] used number of pages that are used (not more than reserved) +@param[in,out] mtr mini-transaction +@return number of reserved pages */ +ulint fseg_n_reserved_pages(const buf_block_t &block, + const fseg_header_t *header, ulint *used, + mtr_t *mtr) + MY_ATTRIBUTE((nonnull)); +/**********************************************************************//** +Allocates a single free page from a segment. This function implements +the intelligent allocation strategy which tries to minimize file space +fragmentation. +@retval NULL if no page could be allocated */ +buf_block_t* +fseg_alloc_free_page_general( +/*=========================*/ + fseg_header_t* seg_header,/*!< in/out: segment header */ + uint32_t hint, /*!< in: hint of which page would be + desirable */ + byte direction,/*!< in: if the new page is needed because + of an index page split, and records are + inserted there in order, into which + direction they go alphabetically: FSP_DOWN, + FSP_UP, FSP_NO_DIR */ + bool has_done_reservation, /*!< in: true if the caller has + already done the reservation for the page + with fsp_reserve_free_extents, then there + is no need to do the check for this individual + page */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + mtr_t* init_mtr,/*!< in/out: mtr or another mini-transaction + in which the page should be initialized. */ + dberr_t* err) /*!< out: error code */ + MY_ATTRIBUTE((warn_unused_result, nonnull)); + +/** Reserves free pages from a tablespace. All mini-transactions which may +use several pages from the tablespace should call this function beforehand +and reserve enough free extents so that they certainly will be able +to do their operation, like a B-tree page split, fully. Reservations +must be released with function fil_space_t::release_free_extents()! + +The alloc_type below has the following meaning: FSP_NORMAL means an +operation which will probably result in more space usage, like an +insert in a B-tree; FSP_UNDO means allocation to undo logs: if we are +deleting rows, then this allocation will in the long run result in +less space usage (after a purge); FSP_CLEANING means allocation done +in a physical record delete (like in a purge) or other cleaning operation +which will result in less space usage in the long run. We prefer the latter +two types of allocation: when space is scarce, FSP_NORMAL allocations +will not succeed, but the latter two allocations will succeed, if possible. +The purpose is to avoid dead end where the database is full but the +user cannot free any space because these freeing operations temporarily +reserve some space. + +Single-table tablespaces whose size is < FSP_EXTENT_SIZE pages are a special +case. 
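The reserve-then-release discipline that this comment prescribes can be sketched as follows; every type and function here is a simplified stand-in for the fil_space_t / fsp_reserve_free_extents() machinery, not InnoDB code:

```cpp
#include <cassert>
#include <cstdint>

// Stand-in modelling only the reserve/release contract.
struct space_stub {
  uint32_t free_extents = 8;
  uint32_t reserved = 0;

  bool reserve_free_extents(uint32_t n_ext, uint32_t *n_reserved) {
    if (free_extents < n_ext) return false;  // DB_OUT_OF_FILE_SPACE
    *n_reserved = n_ext;                     // may be 0 for tiny tablespaces
    reserved += n_ext;
    return true;
  }
  void release_free_extents(uint32_t n) {
    assert(n <= reserved);
    reserved -= n;
  }
};

// A B-tree page split reserves extents up front so it cannot run out of
// space halfway, and always releases the reservation when done.
bool split_page_sketch(space_stub &space) {
  uint32_t n_reserved;
  if (!space.reserve_free_extents(2, &n_reserved))
    return false;
  bool ok = true;                  // ... perform the multi-page split ...
  space.release_free_extents(n_reserved);
  return ok;
}

int main() {
  space_stub s;
  assert(split_page_sketch(s) && s.reserved == 0);
}
```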
In this function we would liberally reserve several extents for
+every page split or merge in a B-tree. But we do not want to waste disk space
+if the table only occupies < FSP_EXTENT_SIZE pages. That is why we apply
+different rules in that special case, just ensuring that there are n_pages
+free pages available.
+
+@param[out]	n_reserved	number of extents actually reserved; if we
+				return DB_SUCCESS and the tablespace size is <
+				FSP_EXTENT_SIZE pages, then this can be 0,
+				otherwise it is n_ext
+@param[in,out]	space		tablespace
+@param[in]	n_ext		number of extents to reserve
+@param[in]	alloc_type	page reservation type (FSP_BLOB, etc)
+@param[in,out]	mtr		the mini-transaction
+@param[in]	n_pages		for small tablespaces (tablespace size is
+				less than FSP_EXTENT_SIZE), number of free
+				pages to reserve.
+@return error code
+@retval DB_SUCCESS if we were able to make the reservation */
+dberr_t
+fsp_reserve_free_extents(
+	uint32_t*	n_reserved,
+	fil_space_t*	space,
+	uint32_t	n_ext,
+	fsp_reserve_t	alloc_type,
+	mtr_t*		mtr,
+	uint32_t	n_pages = 2);
+
+/** Free a page in a file segment.
+@param[in,out]	seg_header	file segment header
+@param[in,out]	space		tablespace
+@param[in]	offset		page number
+@param[in,out]	mtr		mini-transaction
+@param[in]	have_latch	whether space->x_lock() was already called
+@return error code */
+dberr_t
+fseg_free_page(
+	fseg_header_t*	seg_header,
+	fil_space_t*	space,
+	uint32_t	offset,
+	mtr_t*		mtr,
+	bool		have_latch = false)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Determine whether a page is allocated.
+@param space	tablespace
+@param page	page number
+@return error code
+@retval DB_SUCCESS		if the page is marked as free
+@retval DB_SUCCESS_LOCKED_REC	if the page is marked as allocated */
+dberr_t fseg_page_is_allocated(fil_space_t *space, unsigned page)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Frees part of a segment. This function can be used to free
+a segment by repeatedly calling this function in different
+mini-transactions. Doing the freeing in a single mini-transaction
+might result in too big a mini-transaction.
+@param header	segment header; NOTE: if the header resides on the first
+		page of the frag list of the segment, this pointer
+		becomes obsolete after the last freeing step
+@param mtr	mini-transaction
+@param ahi	drop the adaptive hash index
+@return whether the freeing was completed */
+bool
+fseg_free_step(
+	fseg_header_t*	header,
+	mtr_t*		mtr
+#ifdef BTR_CUR_HASH_ADAPT
+	,bool		ahi=false
+#endif /* BTR_CUR_HASH_ADAPT */
+	)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Frees part of a segment. Differs from fseg_free_step because
+this function leaves the header page unfreed.
+@param header	segment header which must reside on the first
+		fragment page of the segment
+@param mtr	mini-transaction
+@param ahi	drop the adaptive hash index
+@return whether the freeing was completed, except for the header page */
+bool
+fseg_free_step_not_header(
+	fseg_header_t*	header,
+	mtr_t*		mtr
+#ifdef BTR_CUR_HASH_ADAPT
+	,bool		ahi=false
+#endif /* BTR_CUR_HASH_ADAPT */
+	)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Reset the page type.
+Data files created before MySQL 5.1.48 may contain garbage in FIL_PAGE_TYPE.
+In MySQL 3.23.53, only undo log pages and index pages were tagged.
+Any other pages were written with uninitialized bytes in FIL_PAGE_TYPE.
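The piecewise freeing protocol of fseg_free_step() above implies a driver loop that commits one small mini-transaction per step. A stub-level sketch of that loop (mtr_stub and segment_stub are invented stand-ins, not the real mtr_t API):

```cpp
#include <cassert>

// Trivial "mini-transaction" and a segment needing N freeing steps;
// they model only the repeat-until-done contract.
struct mtr_stub { void start() {} void commit() {} };

struct segment_stub {
  int steps_left = 5;
  // Mirrors fseg_free_step(): frees one piece, returns true when complete.
  bool free_step(mtr_stub &) {
    if (steps_left > 0) --steps_left;
    return steps_left == 0;
  }
};

int main() {
  segment_stub seg;
  bool done = false;
  while (!done) {
    mtr_stub mtr;
    mtr.start();
    done = seg.free_step(mtr);  // free a little...
    mtr.commit();               // ...and commit, keeping each mtr small
  }
  assert(seg.steps_left == 0);
}
```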
+@param[in]	block	block with invalid FIL_PAGE_TYPE
+@param[in]	type	expected page type
+@param[in,out]	mtr	mini-transaction */
+ATTRIBUTE_COLD
+void fil_block_reset_type(const buf_block_t& block, ulint type, mtr_t* mtr);
+
+/** Check (and if needed, reset) the page type.
+Data files created before MySQL 5.1.48 may contain
+garbage in the FIL_PAGE_TYPE field.
+In MySQL 3.23.53, only undo log pages and index pages were tagged.
+Any other pages were written with uninitialized bytes in FIL_PAGE_TYPE.
+@param[in]	block	block with possibly invalid FIL_PAGE_TYPE
+@param[in]	type	expected page type
+@param[in,out]	mtr	mini-transaction */
+inline void
+fil_block_check_type(
+	const buf_block_t&	block,
+	ulint			type,
+	mtr_t*			mtr)
+{
+	if (UNIV_UNLIKELY(type != fil_page_get_type(block.page.frame)))
+		fil_block_reset_type(block, type, mtr);
+}
+
+/** Checks if a page address is an extent descriptor page address.
+@param[in]	page_id		page id
+@param[in]	physical_size	page size
+@return whether a descriptor page */
+inline bool fsp_descr_page(const page_id_t page_id, ulint physical_size)
+{
+	return (page_id.page_no() & (physical_size - 1)) == FSP_XDES_OFFSET;
+}
+
+/** Initialize a file page whose prior contents should be ignored.
+@param[in,out]	block	buffer pool block */
+void fsp_apply_init_file_page(buf_block_t *block);
+
+/** Initialize a file page.
+@param[in]	space	tablespace
+@param[in,out]	block	file page
+@param[in,out]	mtr	mini-transaction */
+inline void fsp_init_file_page(
+#ifdef UNIV_DEBUG
+	const fil_space_t* space,
+#endif
+	buf_block_t* block, mtr_t* mtr)
+{
+	ut_d(space->modify_check(*mtr));
+	ut_ad(space->id == block->page.id().space());
+	fsp_apply_init_file_page(block);
+	mtr->init(block);
+}
+
+#ifndef UNIV_DEBUG
+# define fsp_init_file_page(space, block, mtr) fsp_init_file_page(block, mtr)
+#endif
+
+#ifdef UNIV_BTR_PRINT
+/*******************************************************************//**
+Writes info of a segment. */
+void
+fseg_print(
+/*=======*/
+	fseg_header_t*	header,	/*!< in: segment header */
+	mtr_t*		mtr);	/*!< in/out: mini-transaction */
+#endif /* UNIV_BTR_PRINT */
+
+/** Convert FSP_SPACE_FLAGS from the buggy MariaDB 10.1.0..10.1.20 format.
+@param[in]	flags	the contents of FSP_SPACE_FLAGS
+@return	the flags corrected from the buggy MariaDB 10.1 format
+@retval	UINT32_MAX	if the flags are not in the buggy 10.1 format */
+MY_ATTRIBUTE((warn_unused_result, const))
+inline uint32_t fsp_flags_convert_from_101(uint32_t flags)
+{
+	DBUG_EXECUTE_IF("fsp_flags_is_valid_failure", return UINT32_MAX;);
+	if (flags == 0 || fil_space_t::full_crc32(flags)) {
+		return(flags);
+	}
+
+	if (flags >> 18) {
+		/* The most significant FSP_SPACE_FLAGS bit that was ever set
+		by MariaDB 10.1.0 to 10.1.20 was bit 17 (misplaced DATA_DIR flag).
+		The flags must be less than 1<<18 in order to be valid. */
+		return UINT32_MAX;
+	}
+
+	if ((flags & (FSP_FLAGS_MASK_POST_ANTELOPE | FSP_FLAGS_MASK_ATOMIC_BLOBS))
+	    == FSP_FLAGS_MASK_ATOMIC_BLOBS) {
+		/* If the "atomic blobs" flag (indicating
+		ROW_FORMAT=DYNAMIC or ROW_FORMAT=COMPRESSED)
+		is set, then the "post Antelope" (ROW_FORMAT!=REDUNDANT) flag
+		must also be set. */
+		return UINT32_MAX;
+	}
+
+	/* Bits 6..10 denote compression in MariaDB 10.1.0 to 10.1.20.
+	They must be either 0b00000 or 0b00011 through 0b10011.
+	In correct versions, these bits would be
+	0bd0sss where d is the DATA_DIR flag (garbage bit) and
+	sss is the PAGE_SSIZE (3, 4, 6, or 7).
+ + NOTE: MariaDB 10.1.0 to 10.1.20 can misinterpret + uncompressed data files with innodb_page_size=4k or 64k as + compressed innodb_page_size=16k files. Below is an exhaustive + state space analysis. + + -0by1zzz: impossible (the bit 4 must be clean; see above) + -0b101xx: DATA_DIR, innodb_page_size>4k: invalid (COMPRESSION_LEVEL>9) + +0bx0011: innodb_page_size=4k: + !!! Misinterpreted as COMPRESSION_LEVEL=9 or 1, COMPRESSION=1. + -0bx0010: impossible, because sss must be 0b011 or 0b1xx + -0bx0001: impossible, because sss must be 0b011 or 0b1xx + -0b10000: DATA_DIR, innodb_page_size=16: + invalid (COMPRESSION_LEVEL=8 but COMPRESSION=0) + +0b00111: no DATA_DIR, innodb_page_size=64k: + !!! Misinterpreted as COMPRESSION_LEVEL=3, COMPRESSION=1. + -0b00101: impossible, because sss must be 0 for 16k, not 0b101 + -0b001x0: no DATA_DIR, innodb_page_size=32k or 8k: + invalid (COMPRESSION_LEVEL=3 but COMPRESSION=0) + +0b00000: innodb_page_size=16k (looks like COMPRESSION=0) + ??? Could actually be compressed; see PAGE_SSIZE below */ + const uint32_t level = FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL_MARIADB101( + flags); + if (FSP_FLAGS_GET_PAGE_COMPRESSION_MARIADB101(flags) != (level != 0) + || level > 9) { + /* The compression flags are not in the buggy MariaDB + 10.1 format. */ + return UINT32_MAX; + } + if (!(~flags & FSP_FLAGS_MASK_ATOMIC_WRITES_MARIADB101)) { + /* The ATOMIC_WRITES flags cannot be 0b11. + (The bits 11..12 should actually never be 0b11, + because in MySQL they would be SHARED|TEMPORARY.) */ + return UINT32_MAX; + } + + /* Bits 13..16 are the wrong position for PAGE_SSIZE, and they + should contain one of the values 3,4,6,7, that is, be of the form + 0b0011 or 0b01xx (except 0b0101). + In correct versions, these bits should be 0bc0se + where c is the MariaDB COMPRESSED flag + and e is the MySQL 5.7 ENCRYPTION flag + and s is the MySQL 8.0 SDI flag. MariaDB can only support s=0, e=0. + + Compressed innodb_page_size=16k tables with correct FSP_SPACE_FLAGS + will be properly rejected by older MariaDB 10.1.x because they + would read as PAGE_SSIZE>=8 which is not valid. */ + + const uint32_t ssize = FSP_FLAGS_GET_PAGE_SSIZE_MARIADB101(flags); + if (ssize == 1 || ssize == 2 || ssize == 5 || ssize & 8) { + /* the page_size is not between 4k and 64k; + 16k should be encoded as 0, not 5 */ + return UINT32_MAX; + } + const uint32_t zssize = FSP_FLAGS_GET_ZIP_SSIZE(flags); + if (zssize == 0) { + /* not ROW_FORMAT=COMPRESSED */ + } else if (zssize > (ssize ? ssize : 5)) { + /* invalid KEY_BLOCK_SIZE */ + return UINT32_MAX; + } else if (~flags & (FSP_FLAGS_MASK_POST_ANTELOPE + | FSP_FLAGS_MASK_ATOMIC_BLOBS)) { + /* both these flags should be set for + ROW_FORMAT=COMPRESSED */ + return UINT32_MAX; + } + + flags = ((flags & 0x3f) | ssize << FSP_FLAGS_POS_PAGE_SSIZE + | FSP_FLAGS_GET_PAGE_COMPRESSION_MARIADB101(flags) + << FSP_FLAGS_POS_PAGE_COMPRESSION); + ut_ad(fil_space_t::is_valid_flags(flags, false)); + return(flags); +} + +/** Compare tablespace flags. +@param[in] expected expected flags from dict_tf_to_fsp_flags() +@param[in] actual flags read from FSP_SPACE_FLAGS +@return whether the flags match */ +MY_ATTRIBUTE((warn_unused_result)) +inline bool fsp_flags_match(uint32_t expected, uint32_t actual) +{ + expected&= ~FSP_FLAGS_MEM_MASK; + ut_ad(fil_space_t::is_valid_flags(expected, false)); + return actual == expected || fsp_flags_convert_from_101(actual) == expected; +} + +/** Determine if FSP_SPACE_FLAGS are from an incompatible MySQL format. 
+@param	flags	the contents of FSP_SPACE_FLAGS
+@return	MySQL flags shifted.
+@retval	0 if not a MySQL-incompatible format. */
+MY_ATTRIBUTE((warn_unused_result, const))
+inline uint32_t fsp_flags_is_incompatible_mysql(uint32_t flags)
+{
+	/*
+	MySQL-8.0 SDI flag (bit 14),
+	or MySQL 5.7 Encryption flag (bit 13)
+	*/
+	return flags >> 13 & 3;
+}
+
+/** Determine the descriptor index within a descriptor page.
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	offset		page offset
+@return descriptor index */
+inline ulint xdes_calc_descriptor_index(ulint zip_size, ulint offset)
+{
+	return ut_2pow_remainder(offset,
+				 zip_size ? zip_size : srv_page_size)
+		/ FSP_EXTENT_SIZE;
+}
+
+/** Determine the descriptor page number for a page.
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@param[in]	offset		page offset
+@return descriptor page offset */
+inline uint32_t xdes_calc_descriptor_page(ulint zip_size, uint32_t offset)
+{
+	compile_time_assert(UNIV_PAGE_SIZE_MAX > XDES_ARR_OFFSET
+			    + (UNIV_PAGE_SIZE_MAX / FSP_EXTENT_SIZE_MAX)
+			    * XDES_SIZE_MAX);
+	compile_time_assert(UNIV_PAGE_SIZE_MIN > XDES_ARR_OFFSET
+			    + (UNIV_PAGE_SIZE_MIN / FSP_EXTENT_SIZE_MIN)
+			    * XDES_SIZE_MIN);
+
+	ut_ad(srv_page_size > XDES_ARR_OFFSET
+	      + (srv_page_size / FSP_EXTENT_SIZE)
+	      * XDES_SIZE);
+	ut_ad(UNIV_ZIP_SIZE_MIN > XDES_ARR_OFFSET
+	      + (UNIV_ZIP_SIZE_MIN / FSP_EXTENT_SIZE)
+	      * XDES_SIZE);
+	ut_ad(!zip_size
+	      || zip_size > XDES_ARR_OFFSET
+	      + (zip_size / FSP_EXTENT_SIZE) * XDES_SIZE);
+	return ut_2pow_round(offset,
+			     uint32_t(zip_size ? zip_size : srv_page_size));
+}
+
+#endif /* UNIV_INNOCHECKSUM */
+
+#endif
diff --git a/storage/innobase/include/fsp0space.h b/storage/innobase/include/fsp0space.h
new file mode 100644
index 00000000..a2bb46d3
--- /dev/null
+++ b/storage/innobase/include/fsp0space.h
@@ -0,0 +1,209 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fsp0space.h
+Shared tablespace interface
+
+Created 2013-7-26 by Kevin Lewis
+*******************************************************/
+
+#ifndef fsp0space_h
+#define fsp0space_h
+
+#include "fsp0file.h"
+#include "fsp0fsp.h"
+#include "fsp0types.h"
+
+#include <vector>
+
+/** Data structure that contains the information about shared tablespaces.
+Currently this can be the system tablespace or a temporary table tablespace */
+class Tablespace {
+
+public:
+	typedef std::vector<Datafile, ut_allocator<Datafile> >	files_t;
+
+	/** Data file information - each Datafile can be accessed globally */
+	files_t		m_files;
+	/** Data file iterator */
+	typedef files_t::iterator iterator;
+	/** Data file iterator */
+	typedef files_t::const_iterator const_iterator;
+
+	Tablespace() {}
+
+	virtual ~Tablespace()
+	{
+		shutdown();
+		ut_ad(m_files.empty());
+		ut_ad(m_space_id == UINT32_MAX);
+	}
+
+	// Disable copying
+	Tablespace(const Tablespace&);
+	Tablespace& operator=(const Tablespace&);
+
+	/** Data file iterator */
+	const_iterator begin() const { return m_files.begin(); }
+	/** Data file iterator */
+	const_iterator end() const { return m_files.end(); }
+	/** Data file iterator */
+	iterator begin() { return m_files.begin(); }
+	/** Data file iterator */
+	iterator end() { return m_files.end(); }
+
+	/** Set tablespace path and filename members.
+	@param[in]	path	where tablespace file(s) resides
+	@param[in]	len	length of the file path */
+	void set_path(const char* path, size_t len)
+	{
+		ut_ad(m_path == NULL);
+		m_path = mem_strdupl(path, len);
+		ut_ad(m_path != NULL);
+	}
+
+	/** Set tablespace path and filename members.
+	@param[in]	path	where tablespace file(s) resides */
+	void set_path(const char* path)
+	{
+		set_path(path, strlen(path));
+	}
+
+	/** Get tablespace path
+	@return tablespace path */
+	const char* path() const
+	{
+		return(m_path);
+	}
+
+	/** Set the space id of the tablespace
+	@param[in]	space_id	tablespace ID to set */
+	void set_space_id(uint32_t space_id)
+	{
+		ut_ad(m_space_id == UINT32_MAX);
+		m_space_id = space_id;
+	}
+
+	/** Get the space id of the tablespace
+	@return m_space_id space id of the tablespace */
+	uint32_t space_id() const { return m_space_id; }
+
+	/** Set the tablespace flags
+	@param[in]	fsp_flags	tablespace flags */
+	void set_flags(uint32_t fsp_flags)
+	{
+		ut_ad(fil_space_t::is_valid_flags(fsp_flags, false));
+		m_flags = fsp_flags;
+	}
+
+	/** Get the tablespace flags
+	@return m_flags tablespace flags */
+	uint32_t flags() const { return m_flags; }
+
+	/** Get the tablespace encryption mode
+	@return m_mode tablespace encryption mode */
+	fil_encryption_t encryption_mode() const { return m_mode; }
+
+	/** Get the tablespace encryption key_id
+	@return m_key_id tablespace encryption key_id */
+	uint32_t key_id() const { return m_key_id; }
+
+	/** Set Ignore Read Only Status for tablespace.
+	@param[in]	read_only_status	read only status indicator */
+	void set_ignore_read_only(bool read_only_status)
+	{
+		m_ignore_read_only = read_only_status;
+	}
+
+	/** Free the memory allocated by the Tablespace object */
+	void shutdown();
+
+	/** @return the sum of the file sizes of each Datafile */
+	uint32_t get_sum_of_sizes() const
+	{
+		uint32_t sum = 0;
+
+		for (const_iterator it = begin(); it != end(); ++it) {
+			sum += it->m_size;
+		}
+
+		return(sum);
+	}
+
+	/** Open or Create the data files if they do not exist.
+	@param[in]	is_temp	whether this is a temporary tablespace
+	@return DB_SUCCESS or error code */
+	dberr_t open_or_create(bool is_temp)
+		MY_ATTRIBUTE((warn_unused_result));
+
+	/** Delete all the data files. */
+	void delete_files();
+
+	/** Check if two tablespaces have common data file names.
+	@param[in]	other_space	Tablespace to check against this.
+ @return true if they have the same data filenames and paths */ + bool intersection(const Tablespace* other_space); + + /** Use the ADD DATAFILE path to create a Datafile object and add + it to the front of m_files. Parse the datafile path into a path + and a basename with extension 'ibd'. This datafile_path provided + may be an absolute or relative path, but it must end with the + extension .ibd and have a basename of at least 1 byte. + + Set tablespace m_path member and add a Datafile with the filename. + @param[in] datafile_path full path of the tablespace file. */ + dberr_t add_datafile( + const char* datafile_path); + + /* Return a pointer to the first Datafile for this Tablespace + @return pointer to the first Datafile for this Tablespace*/ + Datafile* first_datafile() + { + ut_a(!m_files.empty()); + return(&m_files.front()); + } +private: + /** + @param[in] filename Name to lookup in the data files. + @return true if the filename exists in the data files */ + bool find(const char* filename) const; + + /** Note that the data file was found. + @param[in] file data file object */ + void file_found(Datafile& file); + + /** Tablespace ID */ + uint32_t m_space_id = UINT32_MAX; + /** Tablespace flags */ + uint32_t m_flags = UINT32_MAX; + + /** Path where tablespace files will reside, excluding a filename */ + char* m_path; + + /** Encryption mode and key_id */ + fil_encryption_t m_mode; + uint32_t m_key_id; + +protected: + /** Ignore server read only configuration for this tablespace. */ + bool m_ignore_read_only = false; +}; + +#endif /* fsp0space_h */ diff --git a/storage/innobase/include/fsp0sysspace.h b/storage/innobase/include/fsp0sysspace.h new file mode 100644 index 00000000..514f3fdb --- /dev/null +++ b/storage/innobase/include/fsp0sysspace.h @@ -0,0 +1,278 @@ +/***************************************************************************** + +Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2016, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/fsp0sysspace.h +Multi file, shared, system tablespace implementation. + +Created 2013-7-26 by Kevin Lewis +*******************************************************/ + +#ifndef fsp0sysspace_h +#define fsp0sysspace_h + +#include "fsp0space.h" + +/** If the last data file is auto-extended, we add this many pages to it +at a time. We have to make this public because it is a config variable. */ +extern uint sys_tablespace_auto_extend_increment; + +/** Data structure that contains the information about shared tablespaces. 
+Currently this can be the system tablespace or a temporary table tablespace */
+class SysTablespace : public Tablespace
+{
+public:
+
+	SysTablespace()
+		:
+		m_auto_extend_last_file(),
+		m_last_file_size_max(),
+		m_created_new_raw(),
+		m_is_tablespace_full(false),
+		m_sanity_checks_done(false)
+	{
+		/* No op */
+	}
+
+	~SysTablespace() override
+	{
+		shutdown();
+	}
+
+	/** Set tablespace full status
+	@param[in]	is_full	true if full */
+	void set_tablespace_full_status(bool is_full)
+	{
+		m_is_tablespace_full = is_full;
+	}
+
+	/** Get tablespace full status
+	@return true if table is full */
+	bool get_tablespace_full_status()
+	{
+		return(m_is_tablespace_full);
+	}
+
+	/** Set sanity check status
+	@param[in]	status	true if sanity checks are done */
+	void set_sanity_check_status(bool status)
+	{
+		m_sanity_checks_done = status;
+	}
+
+	/** Get sanity check status
+	@return true if sanity checks are done */
+	bool get_sanity_check_status()
+	{
+		return(m_sanity_checks_done);
+	}
+
+	/** Parse the input params and populate member variables.
+	@param	filepath	path to data files
+	@param	supports_raw	true if it supports raw devices
+	@return true on successful parse */
+	bool parse_params(const char* filepath, bool supports_raw);
+
+	/** Check the data file specification.
+	@param[out]	create_new_db		true if a new database
+						is to be created
+	@param[in]	min_expected_tablespace_size	expected tablespace
+						size in bytes
+	@return DB_SUCCESS if all OK else error code */
+	dberr_t check_file_spec(
+		bool*	create_new_db,
+		ulint	min_expected_tablespace_size);
+
+	/** Free the memory allocated by parse() */
+	void shutdown();
+
+	/** Normalize the file size, convert to extents. */
+	void normalize_size();
+
+	/**
+	@return true if a new raw device was created. */
+	bool created_new_raw() const
+	{
+		return(m_created_new_raw);
+	}
+
+	/**
+	@return auto_extend value setting */
+	ulint can_auto_extend_last_file() const
+	{
+		return(m_auto_extend_last_file);
+	}
+
+	/** Set the last file size.
+	@param[in]	size	the size to set */
+	void set_last_file_size(uint32_t size)
+	{
+		ut_ad(!m_files.empty());
+		m_files.back().m_size = size;
+	}
+
+	/** Get the size of the last data file in the tablespace
+	@return the size of the last data file in the array */
+	uint32_t last_file_size() const
+	{
+		ut_ad(!m_files.empty());
+		return(m_files.back().m_size);
+	}
+
+	/**
+	@return the autoextend increment in pages. */
+	uint32_t get_autoextend_increment() const
+	{
+		return sys_tablespace_auto_extend_increment
+			<< (20 - srv_page_size_shift);
+	}
+
+	/**
+	@return next increment size */
+	uint32_t get_increment() const;
+
+	/** Open or create the data files
+	@param[in]	is_temp		whether this is a temporary tablespace
+	@param[in]	create_new_db	whether we are creating a new database
+	@param[out]	sum_new_sizes	sum of sizes of the new files added
+	@return DB_SUCCESS or error code */
+	dberr_t open_or_create(
+		bool	is_temp,
+		bool	create_new_db,
+		ulint*	sum_new_sizes)
+		MY_ATTRIBUTE((warn_unused_result));
+
+private:
+	/** Check the tablespace header for this tablespace.
+	@return DB_SUCCESS or error code */
+	inline dberr_t read_lsn_and_check_flags();
+
+	/**
+	@return true if the last file size is valid. */
+	bool is_valid_size() const
+	{
+		return(m_last_file_size_max >= last_file_size());
+	}
+
+	/**
+	@return true if configured to use raw devices */
+	bool has_raw_device();
+
+	/** Note that the data file was not found.
/** Note that the data file was not found. + @param[in] file data file object + @param[out] create_new_db true if a new instance is to be created + @return DB_SUCCESS or error code */ + dberr_t file_not_found(Datafile& file, bool* create_new_db); + + /** Note that the data file was found. + @param[in,out] file data file object + @return true if a new instance is to be created */ + bool file_found(Datafile& file); + + /** Create a data file. + @param[in,out] file data file object + @return DB_SUCCESS or error code */ + dberr_t create(Datafile& file); + + /** Create a data file. + @param[in,out] file data file object + @return DB_SUCCESS or error code */ + dberr_t create_file(Datafile& file); + + /** Open a data file. + @param[in,out] file data file object + @return DB_SUCCESS or error code */ + dberr_t open_file(Datafile& file); + + /** Set the size of the file. + @param[in,out] file data file object + @return DB_SUCCESS or error code */ + dberr_t set_size(Datafile& file); + + /** Convert a numeric string that optionally ends in G or M, to a + number containing megabytes. + @param[in] ptr string with a quantity in bytes + @param[out] megs the number in megabytes + @return next character in string */ + static char* parse_units(char* ptr, ulint* megs); + +private: + enum file_status_t { + FILE_STATUS_VOID = 0, /** status not set */ + FILE_STATUS_RW_PERMISSION_ERROR,/** permission error */ + FILE_STATUS_READ_WRITE_ERROR, /** not readable/writable */ + FILE_STATUS_NOT_REGULAR_FILE_ERROR /** not a regular file */ + }; + + /** Verify the size of the physical file + @param[in] file data file object + @return DB_SUCCESS if OK else error code. */ + dberr_t check_size(Datafile& file); + + /** Check if a file can be opened in the correct mode. + @param[in,out] file data file object + @param[out] reason exact reason if file_status check failed. + @return DB_SUCCESS or error code. */ + dberr_t check_file_status( + const Datafile& file, + file_status_t& reason); + + /* DATA MEMBERS */ + + /** if true, then we auto-extend the last data file */ + bool m_auto_extend_last_file; + + /** maximum size of the last data file (0=unlimited) */ + ulint m_last_file_size_max; + + /** If the following is true we do not allow + inserts etc. This protects the user from forgetting + the 'newraw' keyword to my.cnf */ + bool m_created_new_raw; + + /** Tablespace full status */ + bool m_is_tablespace_full; + + /** if false, then sanity checks are still pending */ + bool m_sanity_checks_done; +}; + +/* GLOBAL OBJECTS */ + +/** The control info of the system tablespace. */ +extern SysTablespace srv_sys_space; + +/** The control info of a temporary table shared tablespace. */ +extern SysTablespace srv_tmp_space; + +/** Check if the space_id is for a system-tablespace (shared + temp). +@param[in] id Space ID to check +@return true if id is a system tablespace, false if not. */ +inline bool is_system_tablespace(uint32_t id) +{ + return id == TRX_SYS_SPACE || id == SRV_TMP_SPACE_ID; +} + +/** Check if predefined shared tablespace. +@return true if predefined shared tablespace */ +inline bool is_predefined_tablespace(uint32_t id) +{ + return is_system_tablespace(id) || srv_is_undo_tablespace(id); +} +#endif /* fsp0sysspace_h */ diff --git a/storage/innobase/include/fsp0types.h b/storage/innobase/include/fsp0types.h new file mode 100644 index 00000000..9a23e840 --- /dev/null +++ b/storage/innobase/include/fsp0types.h @@ -0,0 +1,404 @@ +/***************************************************************************** + +Copyright (c) 1995, 2016, Oracle and/or its affiliates.
All Rights Reserved. +Copyright (c) 2014, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/****************************************************** +@file include/fsp0types.h +File space management types + +Created May 26, 2009 Vasil Dimov +*******************************************************/ + +#pragma once +#include "ut0byte.h" + +/** All persistent tablespaces have a smaller fil_space_t::id than this. */ +constexpr uint32_t SRV_SPACE_ID_UPPER_BOUND= 0xFFFFFFF0U; +/** The fil_space_t::id of the innodb_temporary tablespace. */ +constexpr uint32_t SRV_TMP_SPACE_ID= 0xFFFFFFFEU; + +/* Possible values of innodb_compression_algorithm */ +#define PAGE_UNCOMPRESSED 0 +#define PAGE_ZLIB_ALGORITHM 1 +#define PAGE_LZ4_ALGORITHM 2 +#define PAGE_LZO_ALGORITHM 3 +#define PAGE_LZMA_ALGORITHM 4 +#define PAGE_BZIP2_ALGORITHM 5 +#define PAGE_SNAPPY_ALGORITHM 6 +#define PAGE_ALGORITHM_LAST PAGE_SNAPPY_ALGORITHM + +extern const char *page_compression_algorithms[]; + +/** @name Flags for inserting records in order +If records are inserted in order, there are the following +flags to tell this (their type is made byte for the compiler +to warn if direction and hint parameters are switched in +fseg_alloc_free_page_general) */ +/* @{ */ +#define FSP_UP ((byte)111) /*!< alphabetically upwards */ +#define FSP_DOWN ((byte)112) /*!< alphabetically downwards */ +#define FSP_NO_DIR ((byte)113) /*!< no order */ +/* @} */ + +/** File space extent size in pages +page size | file space extent size +----------+----------------------- + 4 KiB | 256 pages = 1 MiB + 8 KiB | 128 pages = 1 MiB + 16 KiB | 64 pages = 1 MiB + 32 KiB | 64 pages = 2 MiB + 64 KiB | 64 pages = 4 MiB +*/ +#define FSP_EXTENT_SIZE (srv_page_size_shift < 14 ? \ + (1048576U >> srv_page_size_shift) : 64U) + +/** File space extent size (four megabytes) in pages for MAX page size */ +#define FSP_EXTENT_SIZE_MAX (4194304 / UNIV_PAGE_SIZE_MAX) + +/** File space extent size (one megabyte) in pages for MIN page size */ +#define FSP_EXTENT_SIZE_MIN (1048576 / UNIV_PAGE_SIZE_MIN)
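The conditional in FSP_EXTENT_SIZE restates the table above: below a 16 KiB page size an extent is a fixed 1 MiB worth of pages, while at 16 KiB and larger it is a fixed 64 pages. A standalone restatement of the rule (a hedged sketch; extent_size_in_pages() is illustrative, with page_size_shift being log2 of the page size):

#include <cstdint>

// Mirrors the FSP_EXTENT_SIZE macro above.
static uint32_t extent_size_in_pages(uint32_t page_size_shift)
{
    return page_size_shift < 14                  // page size below 16 KiB?
        ? uint32_t{1048576} >> page_size_shift   // 1 MiB worth of pages
        : 64;                                    // fixed 64 pages
}
// 4 KiB pages (shift 12) -> 256 pages; 16 KiB (14) -> 64; 64 KiB (16) -> 64.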
/** On a page of any file segment, data may be put starting from this +offset */ +#define FSEG_PAGE_DATA FIL_PAGE_DATA + +/** @name File segment header +The file segment header points to the inode describing the file segment. */ +/* @{ */ +/** Data type for file segment header */ +typedef byte fseg_header_t; + +#define FSEG_HDR_SPACE 0 /*!< space id of the inode */ +#define FSEG_HDR_PAGE_NO 4 /*!< page number of the inode */ +#define FSEG_HDR_OFFSET 8 /*!< byte offset of the inode */ + +#define FSEG_HEADER_SIZE 10 /*!< Length of the file segment + header, in bytes */ +/* @} */ + +#ifndef UNIV_INNOCHECKSUM +#ifdef UNIV_DEBUG + +struct mtr_t; + +/** A wrapper class to print the file segment header information. */ +class fseg_header +{ +public: + /** Constructor of fseg_header. + @param[in] header the underlying file segment header object + @param[in] mtr the mini-transaction. No redo logs are + generated, only latches are checked within + mini-transaction */ + fseg_header( + const fseg_header_t* header, + mtr_t* mtr) + : + m_header(header), + m_mtr(mtr) + {} + + /** Print the file segment header to the given output stream. + @param[in,out] out the output stream into which the object + is printed. + @return the output stream into which the object was printed. */ + std::ostream& + to_stream(std::ostream& out) const; +private: + /** The underlying file segment header */ + const fseg_header_t* m_header; + + /** The mini-transaction, which is used mainly to check whether + appropriate latches have been taken by the calling thread. */ + mtr_t* m_mtr; +}; + +/** Overloading the global output operator to print a file segment header +@param[in,out] out the output stream into which object will be printed +@param[in] header the file segment header to be printed +@return the output stream */ +inline +std::ostream& +operator<<( + std::ostream& out, + const fseg_header& header) +{ + return(header.to_stream(out)); +} +#endif /* UNIV_DEBUG */ + +/** Flags for fsp_reserve_free_extents */ +enum fsp_reserve_t { + FSP_NORMAL, /* reservation during normal B-tree operations */ + FSP_UNDO, /* reservation done for undo logging */ + FSP_CLEANING, /* reservation done during purge operations */ + FSP_BLOB /* reservation being done for BLOB insertion */ +}; + +/* Number of pages described in a single descriptor page: currently each page +description takes less than 1 byte; a descriptor page is repeated every +this many file pages */ +/* #define XDES_DESCRIBED_PER_PAGE srv_page_size */ +/* This has been replaced with either srv_page_size or page_zip->size. */ + +/** @name The space low address page map +The pages at FSP_XDES_OFFSET and FSP_IBUF_BITMAP_OFFSET are repeated +every XDES_DESCRIBED_PER_PAGE pages in every tablespace. */ +/* @{ */ +/*--------------------------------------*/ +#define FSP_XDES_OFFSET 0U /* !< extent descriptor */ +#define FSP_IBUF_BITMAP_OFFSET 1U /* !< insert buffer bitmap */ + /* The ibuf bitmap pages are the ones whose + page number is the number above plus a + multiple of XDES_DESCRIBED_PER_PAGE */ + +#define FSP_FIRST_INODE_PAGE_NO 2U /*!< in every tablespace */ + /* The following pages exist + in the system tablespace (space 0). */ +#define FSP_IBUF_HEADER_PAGE_NO 3U /*!< insert buffer + header page, in + tablespace 0 */ +#define FSP_IBUF_TREE_ROOT_PAGE_NO 4U /*!< insert buffer + B-tree root page in + tablespace 0 */ + /* The ibuf tree root page number in + tablespace 0; its fseg inode is on the page + number FSP_FIRST_INODE_PAGE_NO */ +#define FSP_TRX_SYS_PAGE_NO 5U /*!< transaction + system header, in + tablespace 0 */ +#define FSP_FIRST_RSEG_PAGE_NO 6U /*!< first rollback segment + page, in tablespace 0 */ +#define FSP_DICT_HDR_PAGE_NO 7U /*!< data dictionary header + page, in tablespace 0 */ +/*--------------------------------------*/ +/* @} */ +
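Because the pages at FSP_XDES_OFFSET and FSP_IBUF_BITMAP_OFFSET repeat every XDES_DESCRIBED_PER_PAGE pages, the role of a low-address page follows from its page number modulo that interval. A hedged sketch of that periodicity (xdes_page_count stands in for XDES_DESCRIBED_PER_PAGE, which the comment above equates with the page size):

#include <cstdint>

// Every xdes_page_count pages, the pattern of special pages repeats.
static bool is_xdes_page(uint32_t page_no, uint32_t xdes_page_count)
{
    return page_no % xdes_page_count == 0;  // FSP_XDES_OFFSET
}

static bool is_ibuf_bitmap_page(uint32_t page_no, uint32_t xdes_page_count)
{
    return page_no % xdes_page_count == 1;  // FSP_IBUF_BITMAP_OFFSET
}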
/** Check if tablespace is system temporary. +@param[in] space_id tablespace ID to check +@return true if tablespace is system temporary. */ +inline +bool +fsp_is_system_temporary(ulint space_id) +{ + return(space_id == SRV_TMP_SPACE_ID); +} +#endif /* !UNIV_INNOCHECKSUM */ + +/* @defgroup fsp_flags InnoDB Tablespace Flag Constants @{ */ + +/** Width of the POST_ANTELOPE flag */ +#define FSP_FLAGS_WIDTH_POST_ANTELOPE 1 +/** Number of flag bits used to indicate the tablespace zip page size */ +#define FSP_FLAGS_WIDTH_ZIP_SSIZE 4 +/** Width of the ATOMIC_BLOBS flag. The ability to break up a long +column into an in-record prefix and an externally stored part is available +to ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT. */ +#define FSP_FLAGS_WIDTH_ATOMIC_BLOBS 1 +/** Number of flag bits used to indicate the tablespace page size */ +#define FSP_FLAGS_WIDTH_PAGE_SSIZE 4 +/** Number of reserved bits */ +#define FSP_FLAGS_WIDTH_RESERVED 6 +/** Number of flag bits used to indicate the page compression */ +#define FSP_FLAGS_WIDTH_PAGE_COMPRESSION 1 + +/** Width of all the currently known persistent tablespace flags */ +#define FSP_FLAGS_WIDTH (FSP_FLAGS_WIDTH_POST_ANTELOPE \ + + FSP_FLAGS_WIDTH_ZIP_SSIZE \ + + FSP_FLAGS_WIDTH_ATOMIC_BLOBS \ + + FSP_FLAGS_WIDTH_PAGE_SSIZE \ + + FSP_FLAGS_WIDTH_RESERVED \ + + FSP_FLAGS_WIDTH_PAGE_COMPRESSION) + +/** A mask of all the known/used bits in FSP_SPACE_FLAGS */ +#define FSP_FLAGS_MASK (~(~0U << FSP_FLAGS_WIDTH)) + +/** Number of flag bits used to indicate the tablespace page size */ +#define FSP_FLAGS_FCRC32_WIDTH_PAGE_SSIZE 4 + +/** Marker to indicate whether tablespace is in full checksum format. */ +#define FSP_FLAGS_FCRC32_WIDTH_MARKER 1 + +/** Stores the compressed algo for full checksum format. */ +#define FSP_FLAGS_FCRC32_WIDTH_COMPRESSED_ALGO 3 + +/* FSP_SPACE_FLAGS position and name in MySQL 5.6/MariaDB 10.0 or older, +in MariaDB 10.1.20 or older MariaDB 10.1, and in MariaDB 10.1.21 +or newer.
+MySQL 5.6 MariaDB 10.1.x MariaDB 10.1.21 +==================================================================== +Below flags in same offset +==================================================================== +0: POST_ANTELOPE 0:POST_ANTELOPE 0: POST_ANTELOPE +1..4: ZIP_SSIZE(0..5) 1..4:ZIP_SSIZE(0..5) 1..4: ZIP_SSIZE(0..5) +(NOTE: bit 4 is always 0) +5: ATOMIC_BLOBS 5:ATOMIC_BLOBS 5: ATOMIC_BLOBS +===================================================================== +Below note the order difference: +===================================================================== +6..9: PAGE_SSIZE(3..7) 6: COMPRESSION 6..9: PAGE_SSIZE(3..7) +10: DATA_DIR 7..10: COMP_LEVEL(0..9) 10: RESERVED (5.6 DATA_DIR) +===================================================================== +The flags below were in incorrect position in MariaDB 10.1, +or have been introduced in MySQL 5.7 or 8.0: +===================================================================== +11: UNUSED 11..12:ATOMIC_WRITES 11: RESERVED (5.7 SHARED) + 12: RESERVED (5.7 TEMPORARY) + 13..15:PAGE_SSIZE(3..7) 13: RESERVED (5.7 ENCRYPTION) + 14: RESERVED (8.0 SDI) + 15: RESERVED + 16: PAGE_SSIZE_msb(0) 16: COMPRESSION + 17: DATA_DIR 17: UNUSED + 18: UNUSED +===================================================================== +The flags below only exist in fil_space_t::flags, not in FSP_SPACE_FLAGS: +===================================================================== + 27: DATA_DIR + 28..31: COMPRESSION_LEVEL +*/ + +/** A mask of the memory-only flags in fil_space_t::flags */ +#define FSP_FLAGS_MEM_MASK (~0U << FSP_FLAGS_MEM_DATA_DIR) + +/** Zero relative shift position of the DATA_DIR flag */ +#define FSP_FLAGS_MEM_DATA_DIR 27 +/** Zero relative shift position of the COMPRESSION_LEVEL field */ +#define FSP_FLAGS_MEM_COMPRESSION_LEVEL 28 + +/** Zero relative shift position of the POST_ANTELOPE field */ +#define FSP_FLAGS_POS_POST_ANTELOPE 0 +/** Zero relative shift position of the ZIP_SSIZE field */ +#define FSP_FLAGS_POS_ZIP_SSIZE (FSP_FLAGS_POS_POST_ANTELOPE \ + + FSP_FLAGS_WIDTH_POST_ANTELOPE) +/** Zero relative shift position of the ATOMIC_BLOBS field */ +#define FSP_FLAGS_POS_ATOMIC_BLOBS (FSP_FLAGS_POS_ZIP_SSIZE \ + + FSP_FLAGS_WIDTH_ZIP_SSIZE) +/** Zero relative shift position of the start of the PAGE_SSIZE bits */ +#define FSP_FLAGS_POS_PAGE_SSIZE (FSP_FLAGS_POS_ATOMIC_BLOBS \ + + FSP_FLAGS_WIDTH_ATOMIC_BLOBS) +/** Zero relative shift position of the start of the RESERVED bits +these are only used in MySQL 5.7 and used for compatibility. */ +#define FSP_FLAGS_POS_RESERVED (FSP_FLAGS_POS_PAGE_SSIZE \ + + FSP_FLAGS_WIDTH_PAGE_SSIZE) +/** Zero relative shift position of the PAGE_COMPRESSION field */ +#define FSP_FLAGS_POS_PAGE_COMPRESSION (FSP_FLAGS_POS_RESERVED \ + + FSP_FLAGS_WIDTH_RESERVED) + +/** Zero relative shift position of the PAGE_SIZE field +in full crc32 format */ +#define FSP_FLAGS_FCRC32_POS_PAGE_SSIZE 0 + +/** Zero relative shift position of the MARKER field in full crc32 format. */ +#define FSP_FLAGS_FCRC32_POS_MARKER (FSP_FLAGS_FCRC32_POS_PAGE_SSIZE \ + + FSP_FLAGS_FCRC32_WIDTH_PAGE_SSIZE) + +/** Zero relative shift position of the compressed algorithm stored +in full crc32 format. 
*/ +#define FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO (FSP_FLAGS_FCRC32_POS_MARKER \ + + FSP_FLAGS_FCRC32_WIDTH_MARKER) + +/** Bit mask of the POST_ANTELOPE field */ +#define FSP_FLAGS_MASK_POST_ANTELOPE \ + ((~(~0U << FSP_FLAGS_WIDTH_POST_ANTELOPE)) \ + << FSP_FLAGS_POS_POST_ANTELOPE) +/** Bit mask of the ZIP_SSIZE field */ +#define FSP_FLAGS_MASK_ZIP_SSIZE \ + ((~(~0U << FSP_FLAGS_WIDTH_ZIP_SSIZE)) \ + << FSP_FLAGS_POS_ZIP_SSIZE) +/** Bit mask of the ATOMIC_BLOBS field */ +#define FSP_FLAGS_MASK_ATOMIC_BLOBS \ + ((~(~0U << FSP_FLAGS_WIDTH_ATOMIC_BLOBS)) \ + << FSP_FLAGS_POS_ATOMIC_BLOBS) +/** Bit mask of the PAGE_SSIZE field */ +#define FSP_FLAGS_MASK_PAGE_SSIZE \ + ((~(~0U << FSP_FLAGS_WIDTH_PAGE_SSIZE)) \ + << FSP_FLAGS_POS_PAGE_SSIZE) +/** Bit mask of the RESERVED1 field */ +#define FSP_FLAGS_MASK_RESERVED \ + ((~(~0U << FSP_FLAGS_WIDTH_RESERVED)) \ + << FSP_FLAGS_POS_RESERVED) +/** Bit mask of the PAGE_COMPRESSION field */ +#define FSP_FLAGS_MASK_PAGE_COMPRESSION \ + ((~(~0U << FSP_FLAGS_WIDTH_PAGE_COMPRESSION)) \ + << FSP_FLAGS_POS_PAGE_COMPRESSION) + +/** Bit mask of the in-memory COMPRESSION_LEVEL field */ +#define FSP_FLAGS_MASK_MEM_COMPRESSION_LEVEL \ + (15U << FSP_FLAGS_MEM_COMPRESSION_LEVEL) + +/** Bit mask of the PAGE_SIZE field in full crc32 format */ +#define FSP_FLAGS_FCRC32_MASK_PAGE_SSIZE \ + ((~(~0U << FSP_FLAGS_FCRC32_WIDTH_PAGE_SSIZE)) \ + << FSP_FLAGS_FCRC32_POS_PAGE_SSIZE) + +/** Bit mask of the MARKER field in full crc32 format */ +#define FSP_FLAGS_FCRC32_MASK_MARKER \ + ((~(~0U << FSP_FLAGS_FCRC32_WIDTH_MARKER)) \ + << FSP_FLAGS_FCRC32_POS_MARKER) + +/** Bit mask of the COMPRESSED ALGO field in full crc32 format */ +#define FSP_FLAGS_FCRC32_MASK_COMPRESSED_ALGO \ + ((~(~0U << FSP_FLAGS_FCRC32_WIDTH_COMPRESSED_ALGO)) \ + << FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO) + +/** Return the value of the POST_ANTELOPE field */ +#define FSP_FLAGS_GET_POST_ANTELOPE(flags) \ + ((flags & FSP_FLAGS_MASK_POST_ANTELOPE) \ + >> FSP_FLAGS_POS_POST_ANTELOPE) +/** Return the value of the ZIP_SSIZE field */ +#define FSP_FLAGS_GET_ZIP_SSIZE(flags) \ + ((flags & FSP_FLAGS_MASK_ZIP_SSIZE) \ + >> FSP_FLAGS_POS_ZIP_SSIZE) +/** Return the value of the ATOMIC_BLOBS field */ +#define FSP_FLAGS_HAS_ATOMIC_BLOBS(flags) \ + ((flags & FSP_FLAGS_MASK_ATOMIC_BLOBS) \ + >> FSP_FLAGS_POS_ATOMIC_BLOBS) +/** Return the value of the PAGE_SSIZE field */ +#define FSP_FLAGS_GET_PAGE_SSIZE(flags) \ + ((flags & FSP_FLAGS_MASK_PAGE_SSIZE) \ + >> FSP_FLAGS_POS_PAGE_SSIZE) +/** @return the RESERVED flags */ +#define FSP_FLAGS_GET_RESERVED(flags) \ + ((flags & FSP_FLAGS_MASK_RESERVED) \ + >> FSP_FLAGS_POS_RESERVED) +/** @return the PAGE_COMPRESSION flag */ +#define FSP_FLAGS_HAS_PAGE_COMPRESSION(flags) \ + ((flags & FSP_FLAGS_MASK_PAGE_COMPRESSION) \ + >> FSP_FLAGS_POS_PAGE_COMPRESSION) +/** @return the PAGE_SSIZE flags in full crc32 format */ +#define FSP_FLAGS_FCRC32_GET_PAGE_SSIZE(flags) \ + ((flags & FSP_FLAGS_FCRC32_MASK_PAGE_SSIZE) \ + >> FSP_FLAGS_FCRC32_POS_PAGE_SSIZE) +/** @return the COMPRESSED_ALGO flags in full crc32 format */ +#define FSP_FLAGS_FCRC32_GET_COMPRESSED_ALGO(flags) \ + ((flags & FSP_FLAGS_FCRC32_MASK_COMPRESSED_ALGO) \ + >> FSP_FLAGS_FCRC32_POS_COMPRESSED_ALGO) + +/** @return the value of the DATA_DIR field */ +#define FSP_FLAGS_HAS_DATA_DIR(flags) \ + (flags & 1U << FSP_FLAGS_MEM_DATA_DIR) +/** @return the COMPRESSION_LEVEL field */ +#define FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags) \ + ((flags & FSP_FLAGS_MASK_MEM_COMPRESSION_LEVEL) \ + >> FSP_FLAGS_MEM_COMPRESSION_LEVEL) + +/* @} */ + 
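Each getter above is a plain mask-and-shift over the same flags word, so a tablespace flags value can be decoded field by field. A minimal sketch, assuming the FSP_FLAGS_* macros above are in scope; the flags argument would come from FSP_SPACE_FLAGS:

#include <cstdio>

// Hedged sketch: print the fields packed into a tablespace flags word.
static void decode_space_flags(unsigned flags)
{
    std::printf("post_antelope=%u zip_ssize=%u atomic_blobs=%u\n",
                FSP_FLAGS_GET_POST_ANTELOPE(flags),
                FSP_FLAGS_GET_ZIP_SSIZE(flags),
                FSP_FLAGS_HAS_ATOMIC_BLOBS(flags));
    std::printf("page_ssize=%u page_compression=%u\n",
                FSP_FLAGS_GET_PAGE_SSIZE(flags),
                FSP_FLAGS_HAS_PAGE_COMPRESSION(flags));
}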
+struct fil_node_t; +struct fil_space_t; +class buf_page_t; diff --git a/storage/innobase/include/fts0ast.h b/storage/innobase/include/fts0ast.h new file mode 100644 index 00000000..15bf30bc --- /dev/null +++ b/storage/innobase/include/fts0ast.h @@ -0,0 +1,340 @@ +/***************************************************************************** + +Copyright (c) 2007, 2018, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2016, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fts0ast.h +The FTS query parser (AST) abstract syntax tree routines + +Created 2007/03/16/03 Sunny Bains +*******************************************************/ + +#ifndef INNOBASE_FST0AST_H +#define INNOBASE_FST0AST_H + +#include "mem0mem.h" + +/* The type of AST Node */ +enum fts_ast_type_t { + FTS_AST_OPER, /*!< Operator */ + FTS_AST_NUMB, /*!< Number */ + FTS_AST_TERM, /*!< Term (or word) */ + FTS_AST_TEXT, /*!< Text string */ + FTS_AST_PARSER_PHRASE_LIST, /*!< Phrase for plugin parser + The difference from text type + is that we tokenize text into + term list */ + FTS_AST_LIST, /*!< Expression list */ + FTS_AST_SUBEXP_LIST /*!< Sub-Expression list */ +}; + +/* The FTS query operators that we support */ +enum fts_ast_oper_t { + FTS_NONE, /*!< No operator */ + + FTS_IGNORE, /*!< Ignore rows that contain + this word */ + + FTS_EXIST, /*!< Include rows that contain + this word */ + + FTS_NEGATE, /*!< Include rows that contain + this word but rank them + lower */ + + FTS_INCR_RATING, /*!< Increase the rank for this + word */ + + FTS_DECR_RATING, /*!< Decrease the rank for this + word */ + + FTS_DISTANCE, /*!< Proximity distance */ + FTS_IGNORE_SKIP, /*!< Transient node operator + signifies that this is a + FTS_IGNORE node, and ignored in + the first pass of + fts_ast_visit() */ + FTS_EXIST_SKIP /*!< Transient node operator + signifies that this is a + FTS_EXIST node, and ignored in + the first pass of + fts_ast_visit() */ +}; + +/* Data types used by the FTS parser */ +struct fts_lexer_t; +struct fts_ast_node_t; +struct fts_ast_state_t; +struct fts_ast_string_t; + +typedef dberr_t (*fts_ast_callback)(fts_ast_oper_t, fts_ast_node_t*, void*); + +/******************************************************************** +Parse the string using the lexer setup within state.*/ +int +fts_parse( +/*======*/ + /* out: 0 on OK, 1 on error */ + fts_ast_state_t* state); /*!< in: ast state instance.*/ + +/******************************************************************** +Create an AST operator node */ +extern +fts_ast_node_t* +fts_ast_create_node_oper( +/*=====================*/ + void* arg, /*!< in: ast state */ + fts_ast_oper_t oper); /*!< in: ast operator */ +/******************************************************************** +Create an
AST term node, makes a copy of ptr */ +extern +fts_ast_node_t* +fts_ast_create_node_term( +/*=====================*/ + void* arg, /*!< in: ast state */ + const fts_ast_string_t* ptr); /*!< in: term string */ +/******************************************************************** +Create an AST text node */ +extern +fts_ast_node_t* +fts_ast_create_node_text( +/*=====================*/ + void* arg, /*!< in: ast state */ + const fts_ast_string_t* ptr); /*!< in: text string */ +/******************************************************************** +Create an AST expr list node */ +extern +fts_ast_node_t* +fts_ast_create_node_list( +/*=====================*/ + void* arg, /*!< in: ast state */ + fts_ast_node_t* expr); /*!< in: ast expr */ +/******************************************************************** +Create a sub-expression list node. This function takes ownership of +expr and is responsible for deleting it. */ +extern +fts_ast_node_t* +fts_ast_create_node_subexp_list( +/*============================*/ + /* out: new node */ + void* arg, /*!< in: ast state instance */ + fts_ast_node_t* expr); /*!< in: ast expr instance */ +/******************************************************************** +Set the wildcard attribute of a term.*/ +extern +void +fts_ast_term_set_wildcard( +/*======================*/ + fts_ast_node_t* node); /*!< in: term to change */ +/******************************************************************** +Set the proximity attribute of a text node. */ +void +fts_ast_text_set_distance( +/*======================*/ + fts_ast_node_t* node, /*!< in/out: text node */ + ulint distance); /*!< in: the text proximity + distance */ +/********************************************************************//** +Free a fts_ast_node_t instance. +@return next node to free */ +fts_ast_node_t* +fts_ast_free_node( +/*==============*/ + fts_ast_node_t* node); /*!< in: node to free */ +/******************************************************************** +Add a sub-expression to an AST*/ +extern +fts_ast_node_t* +fts_ast_add_node( +/*=============*/ + fts_ast_node_t* list, /*!< in: list node instance */ + fts_ast_node_t* node); /*!< in: (sub) expr to add */ +/******************************************************************** +Print the AST node recursively.*/ +extern +void +fts_ast_node_print( +/*===============*/ + fts_ast_node_t* node); /*!< in: ast node to print */ +/******************************************************************** +Free node and expr allocations.*/ +extern +void +fts_ast_state_free( +/*===============*/ + fts_ast_state_t*state); /*!< in: state instance + to free */ +/** Check only union operation involved in the node +@param[in] node ast node to check +@return true if the node contains only union else false. */ +bool +fts_ast_node_check_union( + fts_ast_node_t* node); + +/******************************************************************//** +Traverse the AST - in-order traversal. 
+@return DB_SUCCESS if all went well */ +dberr_t +fts_ast_visit( +/*==========*/ + fts_ast_oper_t oper, /*!< in: FTS operator */ + fts_ast_node_t* node, /*!< in: instance to traverse*/ + fts_ast_callback visitor, /*!< in: callback */ + void* arg, /*!< in: callback arg */ + bool* has_ignore) /*!< out: whether we encountered + and ignored an operator; + currently we only + ignore the FTS_IGNORE operator */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/******************************************************************** +Create a lex instance.*/ +fts_lexer_t* +fts_lexer_create( +/*=============*/ + ibool boolean_mode, /*!< in: query type */ + const byte* query, /*!< in: query string */ + ulint query_len) /*!< in: query string len */ + MY_ATTRIBUTE((nonnull, malloc, warn_unused_result)); +/******************************************************************** +Free an fts_lexer_t instance.*/ +void +fts_lexer_free( +/*===========*/ + fts_lexer_t* fts_lexer) /*!< in: lexer instance to + free */ + MY_ATTRIBUTE((nonnull)); + +/** +Create an ast string object, with NUL-terminator, so the string +has one more byte than len +@param[in] str pointer to string +@param[in] len length of the string +@return ast string with NUL-terminator */ +fts_ast_string_t* +fts_ast_string_create( + const byte* str, + ulint len); + +/** +Free an ast string instance +@param[in,out] ast_str string to free */ +void +fts_ast_string_free( + fts_ast_string_t* ast_str); + +/** +Translate ast string of type FTS_AST_NUMB to unsigned long by strtoul +@param[in] ast_str string to translate +@param[in] base the base +@return translated number */ +ulint +fts_ast_string_to_ul( + const fts_ast_string_t* ast_str, + int base);
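fts_ast_visit() above drives a callback of the fts_ast_callback type (declared earlier in this header) over the expression list, passing the current operator alongside each node. A minimal sketch of such a visitor; the counting logic is illustrative only, not real query processing:

// Hedged sketch of an fts_ast_callback for use with fts_ast_visit().
static dberr_t fts_ast_count_nodes(
    fts_ast_oper_t  oper,   /*!< in: current FTS operator */
    fts_ast_node_t* node,   /*!< in: node being visited */
    void*           arg)    /*!< in: user argument: a counter */
{
    ulint*  n_nodes = static_cast<ulint*>(arg);

    (void) oper;
    (void) node;
    ++*n_nodes;             /* count every node we are handed */

    return(DB_SUCCESS);     /* keep traversing */
}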
/* String of length len. +We always store the string of length len with a terminating '\0', +regardless of whether there is any 0x00 in the string itself */ +struct fts_ast_string_t { + /*!< Pointer to string. */ + byte* str; + + /*!< Length of the string. */ + ulint len; +}; + +/* Query term type */ +struct fts_ast_term_t { + fts_ast_string_t* ptr; /*!< Pointer to term string.*/ + ibool wildcard; /*!< TRUE if wild card set.*/ +}; + +/* Query text type */ +struct fts_ast_text_t { + fts_ast_string_t* ptr; /*!< Pointer to text string.*/ + ulint distance; /*!< > 0 if proximity distance + set */ +}; + +/* The list of nodes in an expr list */ +struct fts_ast_list_t { + fts_ast_node_t* head; /*!< Children list head */ + fts_ast_node_t* tail; /*!< Children list tail */ +}; + +/* FTS AST node to store the term, text, operator and sub-expressions.*/ +struct fts_ast_node_t { + fts_ast_type_t type; /*!< The type of node */ + fts_ast_text_t text; /*!< Text node */ + fts_ast_term_t term; /*!< Term node */ + fts_ast_oper_t oper; /*!< Operator value */ + fts_ast_list_t list; /*!< Expression list */ + fts_ast_node_t* next; /*!< Link for expr list */ + fts_ast_node_t* next_alloc; /*!< For tracking allocations */ + bool visited; /*!< whether this node is + already processed */ + /** current transaction */ + const trx_t* trx; + /* Used by plugin parser */ + fts_ast_node_t* up_node; /*!< Direct up node */ + bool go_up; /*!< Flag if go one level up */ +}; + +/* To track state during parsing */ +struct fts_ast_state_t { + mem_heap_t* heap; /*!< Heap to use for alloc */ + fts_ast_node_t* root; /*!< If all goes OK, then this + will point to the root.*/ + + fts_ast_list_t list; /*!< List of nodes allocated */ + + fts_lexer_t* lexer; /*!< Lexer callback + arg */ + CHARSET_INFO* charset; /*!< charset used for + tokenization */ + /* Used by plugin parser */ + fts_ast_node_t* cur_node; /*!< Current node into which + we add new node */ + int depth; /*!< Depth of parsing state */ +}; + +/******************************************************************//** +Create an AST term node, makes a copy of ptr for plugin parser +@return node */ +extern +fts_ast_node_t* +fts_ast_create_node_term_for_parser( +/*================================*/ + void* arg, /*!< in: ast state */ + const char* ptr, /*!< in: term string */ + const ulint len); /*!< in: term string length */ + +/******************************************************************//** +Create an AST phrase list node for plugin parser +@return node */ +extern +fts_ast_node_t* +fts_ast_create_node_phrase_list( +/*============================*/ + void* arg); /*!< in: ast state */ + +#ifdef UNIV_DEBUG +const char* +fts_ast_node_type_get(fts_ast_type_t type); +#endif /* UNIV_DEBUG */ + +#endif /* INNOBASE_FST0AST_H */ diff --git a/storage/innobase/include/fts0blex.h b/storage/innobase/include/fts0blex.h new file mode 100644 index 00000000..b16e7f2c --- /dev/null +++ b/storage/innobase/include/fts0blex.h @@ -0,0 +1,702 @@ +#ifndef fts0bHEADER_H +#define fts0bHEADER_H 1 +#define fts0bIN_HEADER 1 + +#line 6 "../include/fts0blex.h" + +#line 8 "../include/fts0blex.h" + +#define YY_INT_ALIGNED short int + +/* A lexical scanner generated by flex */ + +#define FLEX_SCANNER +#define YY_FLEX_MAJOR_VERSION 2 +#define YY_FLEX_MINOR_VERSION 6 +#define YY_FLEX_SUBMINOR_VERSION 4 +#if YY_FLEX_SUBMINOR_VERSION > 0 +#define FLEX_BETA +#endif + +#ifdef yy_create_buffer +#define fts0b_create_buffer_ALREADY_DEFINED +#else +#define yy_create_buffer fts0b_create_buffer +#endif + +#ifdef yy_delete_buffer +#define fts0b_delete_buffer_ALREADY_DEFINED +#else +#define yy_delete_buffer fts0b_delete_buffer +#endif + +#ifdef yy_scan_buffer +#define fts0b_scan_buffer_ALREADY_DEFINED +#else +#define yy_scan_buffer fts0b_scan_buffer +#endif + +#ifdef
yy_scan_string +#define fts0b_scan_string_ALREADY_DEFINED +#else +#define yy_scan_string fts0b_scan_string +#endif + +#ifdef yy_scan_bytes +#define fts0b_scan_bytes_ALREADY_DEFINED +#else +#define yy_scan_bytes fts0b_scan_bytes +#endif + +#ifdef yy_init_buffer +#define fts0b_init_buffer_ALREADY_DEFINED +#else +#define yy_init_buffer fts0b_init_buffer +#endif + +#ifdef yy_flush_buffer +#define fts0b_flush_buffer_ALREADY_DEFINED +#else +#define yy_flush_buffer fts0b_flush_buffer +#endif + +#ifdef yy_load_buffer_state +#define fts0b_load_buffer_state_ALREADY_DEFINED +#else +#define yy_load_buffer_state fts0b_load_buffer_state +#endif + +#ifdef yy_switch_to_buffer +#define fts0b_switch_to_buffer_ALREADY_DEFINED +#else +#define yy_switch_to_buffer fts0b_switch_to_buffer +#endif + +#ifdef yypush_buffer_state +#define fts0bpush_buffer_state_ALREADY_DEFINED +#else +#define yypush_buffer_state fts0bpush_buffer_state +#endif + +#ifdef yypop_buffer_state +#define fts0bpop_buffer_state_ALREADY_DEFINED +#else +#define yypop_buffer_state fts0bpop_buffer_state +#endif + +#ifdef yyensure_buffer_stack +#define fts0bensure_buffer_stack_ALREADY_DEFINED +#else +#define yyensure_buffer_stack fts0bensure_buffer_stack +#endif + +#ifdef yylex +#define fts0blex_ALREADY_DEFINED +#else +#define yylex fts0blex +#endif + +#ifdef yyrestart +#define fts0brestart_ALREADY_DEFINED +#else +#define yyrestart fts0brestart +#endif + +#ifdef yylex_init +#define fts0blex_init_ALREADY_DEFINED +#else +#define yylex_init fts0blex_init +#endif + +#ifdef yylex_init_extra +#define fts0blex_init_extra_ALREADY_DEFINED +#else +#define yylex_init_extra fts0blex_init_extra +#endif + +#ifdef yylex_destroy +#define fts0blex_destroy_ALREADY_DEFINED +#else +#define yylex_destroy fts0blex_destroy +#endif + +#ifdef yyget_debug +#define fts0bget_debug_ALREADY_DEFINED +#else +#define yyget_debug fts0bget_debug +#endif + +#ifdef yyset_debug +#define fts0bset_debug_ALREADY_DEFINED +#else +#define yyset_debug fts0bset_debug +#endif + +#ifdef yyget_extra +#define fts0bget_extra_ALREADY_DEFINED +#else +#define yyget_extra fts0bget_extra +#endif + +#ifdef yyset_extra +#define fts0bset_extra_ALREADY_DEFINED +#else +#define yyset_extra fts0bset_extra +#endif + +#ifdef yyget_in +#define fts0bget_in_ALREADY_DEFINED +#else +#define yyget_in fts0bget_in +#endif + +#ifdef yyset_in +#define fts0bset_in_ALREADY_DEFINED +#else +#define yyset_in fts0bset_in +#endif + +#ifdef yyget_out +#define fts0bget_out_ALREADY_DEFINED +#else +#define yyget_out fts0bget_out +#endif + +#ifdef yyset_out +#define fts0bset_out_ALREADY_DEFINED +#else +#define yyset_out fts0bset_out +#endif + +#ifdef yyget_leng +#define fts0bget_leng_ALREADY_DEFINED +#else +#define yyget_leng fts0bget_leng +#endif + +#ifdef yyget_text +#define fts0bget_text_ALREADY_DEFINED +#else +#define yyget_text fts0bget_text +#endif + +#ifdef yyget_lineno +#define fts0bget_lineno_ALREADY_DEFINED +#else +#define yyget_lineno fts0bget_lineno +#endif + +#ifdef yyset_lineno +#define fts0bset_lineno_ALREADY_DEFINED +#else +#define yyset_lineno fts0bset_lineno +#endif + +#ifdef yyget_column +#define fts0bget_column_ALREADY_DEFINED +#else +#define yyget_column fts0bget_column +#endif + +#ifdef yyset_column +#define fts0bset_column_ALREADY_DEFINED +#else +#define yyset_column fts0bset_column +#endif + +#ifdef yywrap +#define fts0bwrap_ALREADY_DEFINED +#else +#define yywrap fts0bwrap +#endif + +#ifdef yyalloc +#define fts0balloc_ALREADY_DEFINED +#else +#define yyalloc fts0balloc +#endif + +#ifdef yyrealloc +#define 
fts0brealloc_ALREADY_DEFINED +#else +#define yyrealloc fts0brealloc +#endif + +#ifdef yyfree +#define fts0bfree_ALREADY_DEFINED +#else +#define yyfree fts0bfree +#endif + +/* First, we deal with platform-specific or compiler-specific issues. */ + +/* begin standard C headers. */ +#include <stdio.h> +#include <string.h> +#include <errno.h> +#include <stdlib.h> + +/* end standard C headers. */ + +/* flex integer type definitions */ + +#ifndef FLEXINT_H +#define FLEXINT_H + +/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */ + +#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + +/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h, + * if you want the limit (max/min) macros for int types. + */ +#ifndef __STDC_LIMIT_MACROS +#define __STDC_LIMIT_MACROS 1 +#endif + +#include <inttypes.h> +typedef int8_t flex_int8_t; +typedef uint8_t flex_uint8_t; +typedef int16_t flex_int16_t; +typedef uint16_t flex_uint16_t; +typedef int32_t flex_int32_t; +typedef uint32_t flex_uint32_t; +#else +typedef signed char flex_int8_t; +typedef short int flex_int16_t; +typedef int flex_int32_t; +typedef unsigned char flex_uint8_t; +typedef unsigned short int flex_uint16_t; +typedef unsigned int flex_uint32_t; + +/* Limits of integral types. */ +#ifndef INT8_MIN +#define INT8_MIN (-128) +#endif +#ifndef INT16_MIN +#define INT16_MIN (-32767-1) +#endif +#ifndef INT32_MIN +#define INT32_MIN (-2147483647-1) +#endif +#ifndef INT8_MAX +#define INT8_MAX (127) +#endif +#ifndef INT16_MAX +#define INT16_MAX (32767) +#endif +#ifndef INT32_MAX +#define INT32_MAX (2147483647) +#endif +#ifndef UINT8_MAX +#define UINT8_MAX (255U) +#endif +#ifndef UINT16_MAX +#define UINT16_MAX (65535U) +#endif +#ifndef UINT32_MAX +#define UINT32_MAX (4294967295U) +#endif + +#ifndef SIZE_MAX +#define SIZE_MAX (~(size_t)0) +#endif + +#endif /* ! C99 */ + +#endif /* ! FLEXINT_H */ + +/* begin standard C++ headers. */ + +/* TODO: this is always defined, so inline it */ +#define yyconst const + +#if defined(__GNUC__) && __GNUC__ >= 3 +#define yynoreturn __attribute__((__noreturn__)) +#else +#define yynoreturn +#endif + +/* An opaque pointer. */ +#ifndef YY_TYPEDEF_YY_SCANNER_T +#define YY_TYPEDEF_YY_SCANNER_T +typedef void* yyscan_t; +#endif + +/* For convenience, these vars (plus the bison vars far below) + are macros in the reentrant scanner. */ +#define yyin yyg->yyin_r +#define yyout yyg->yyout_r +#define yyextra yyg->yyextra_r +#define yyleng yyg->yyleng_r +#define yytext yyg->yytext_r +#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno) +#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column) +#define yy_flex_debug yyg->yy_flex_debug_r + +/* Size of default input buffer. */ +#ifndef YY_BUF_SIZE +#ifdef __ia64__ +/* On IA-64, the buffer size is 16k, not 8k. + * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case. + * Ditto for the __ia64__ case accordingly. + */ +#define YY_BUF_SIZE 32768 +#else +#define YY_BUF_SIZE 16384 +#endif /* __ia64__ */ +#endif + +#ifndef YY_TYPEDEF_YY_BUFFER_STATE +#define YY_TYPEDEF_YY_BUFFER_STATE +typedef struct yy_buffer_state *YY_BUFFER_STATE; +#endif + +#ifndef YY_TYPEDEF_YY_SIZE_T +#define YY_TYPEDEF_YY_SIZE_T +typedef size_t yy_size_t; +#endif + +#ifndef YY_STRUCT_YY_BUFFER_STATE +#define YY_STRUCT_YY_BUFFER_STATE +struct yy_buffer_state + { + FILE *yy_input_file; + + char *yy_ch_buf; /* input buffer */ + char *yy_buf_pos; /* current position in input buffer */ + + /* Size of input buffer in bytes, not including room for EOB + * characters.
+ */ + int yy_buf_size; + + /* Number of characters read into yy_ch_buf, not including EOB + * characters. + */ + int yy_n_chars; + + /* Whether we "own" the buffer - i.e., we know we created it, + * and can realloc() it to grow it, and should free() it to + * delete it. + */ + int yy_is_our_buffer; + + /* Whether this is an "interactive" input source; if so, and + * if we're using stdio for input, then we want to use getc() + * instead of fread(), to make sure we stop fetching input after + * each newline. + */ + int yy_is_interactive; + + /* Whether we're considered to be at the beginning of a line. + * If so, '^' rules will be active on the next match, otherwise + * not. + */ + int yy_at_bol; + + int yy_bs_lineno; /**< The line count. */ + int yy_bs_column; /**< The column count. */ + + /* Whether to try to fill the input buffer when we reach the + * end of it. + */ + int yy_fill_buffer; + + int yy_buffer_status; + + }; +#endif /* !YY_STRUCT_YY_BUFFER_STATE */ + +void yyrestart ( FILE *input_file , yyscan_t yyscanner ); +void yy_switch_to_buffer ( YY_BUFFER_STATE new_buffer , yyscan_t yyscanner ); +YY_BUFFER_STATE yy_create_buffer ( FILE *file, int size , yyscan_t yyscanner ); +void yy_delete_buffer ( YY_BUFFER_STATE b , yyscan_t yyscanner ); +void yy_flush_buffer ( YY_BUFFER_STATE b , yyscan_t yyscanner ); +void yypush_buffer_state ( YY_BUFFER_STATE new_buffer , yyscan_t yyscanner ); +void yypop_buffer_state ( yyscan_t yyscanner ); + +YY_BUFFER_STATE yy_scan_buffer ( char *base, yy_size_t size , yyscan_t yyscanner ); +YY_BUFFER_STATE yy_scan_string ( const char *yy_str , yyscan_t yyscanner ); +YY_BUFFER_STATE yy_scan_bytes ( const char *bytes, int len , yyscan_t yyscanner ); + +void *yyalloc ( yy_size_t , yyscan_t yyscanner ); +void *yyrealloc ( void *, yy_size_t , yyscan_t yyscanner ); +void yyfree ( void * , yyscan_t yyscanner ); + +/* Begin user sect3 */ + +#define fts0bwrap(yyscanner) (/*CONSTCOND*/1) +#define YY_SKIP_YYWRAP + +#define yytext_ptr yytext_r + +#ifdef YY_HEADER_EXPORT_START_CONDITIONS +#define INITIAL 0 + +#endif + +#ifndef YY_NO_UNISTD_H +/* Special case for "unistd.h", since it is non-ANSI. We include it way + * down here because we want the user's section 1 to have been scanned first. + * The user has a chance to override it with an option. + */ +#include <unistd.h> +#endif + +#ifndef YY_EXTRA_TYPE +#define YY_EXTRA_TYPE void * +#endif + +int yylex_init (yyscan_t* scanner); + +int yylex_init_extra ( YY_EXTRA_TYPE user_defined, yyscan_t* scanner); + +/* Accessor methods to globals. + These are made visible to non-reentrant scanners for convenience. */ + +int yylex_destroy ( yyscan_t yyscanner ); + +int yyget_debug ( yyscan_t yyscanner ); + +void yyset_debug ( int debug_flag , yyscan_t yyscanner ); + +YY_EXTRA_TYPE yyget_extra ( yyscan_t yyscanner ); + +void yyset_extra ( YY_EXTRA_TYPE user_defined , yyscan_t yyscanner ); + +FILE *yyget_in ( yyscan_t yyscanner ); + +void yyset_in ( FILE * _in_str , yyscan_t yyscanner ); + +FILE *yyget_out ( yyscan_t yyscanner ); + +void yyset_out ( FILE * _out_str , yyscan_t yyscanner ); + + int yyget_leng ( yyscan_t yyscanner ); + +char *yyget_text ( yyscan_t yyscanner ); + +int yyget_lineno ( yyscan_t yyscanner ); + +void yyset_lineno ( int _line_number , yyscan_t yyscanner ); + +int yyget_column ( yyscan_t yyscanner ); + +void yyset_column ( int _column_no , yyscan_t yyscanner ); + +/* Macros after this point can all be overridden by user definitions in + * section 1.
+ */ + +#ifndef YY_SKIP_YYWRAP +#ifdef __cplusplus +extern "C" int yywrap ( yyscan_t yyscanner ); +#else +extern int yywrap ( yyscan_t yyscanner ); +#endif +#endif + +#ifndef yytext_ptr +static void yy_flex_strncpy ( char *, const char *, int , yyscan_t yyscanner); +#endif + +#ifdef YY_NEED_STRLEN +static int yy_flex_strlen ( const char * , yyscan_t yyscanner); +#endif + +#ifndef YY_NO_INPUT + +#endif + +/* Amount of stuff to slurp up with each read. */ +#ifndef YY_READ_BUF_SIZE +#ifdef __ia64__ +/* On IA-64, the buffer size is 16k, not 8k */ +#define YY_READ_BUF_SIZE 16384 +#else +#define YY_READ_BUF_SIZE 8192 +#endif /* __ia64__ */ +#endif + +/* Number of entries by which start-condition stack grows. */ +#ifndef YY_START_STACK_INCR +#define YY_START_STACK_INCR 25 +#endif + +/* Default declaration of generated scanner - a define so the user can + * easily add parameters. + */ +#ifndef YY_DECL +#define YY_DECL_IS_OURS 1 + +extern int yylex (yyscan_t yyscanner); + +#define YY_DECL int yylex (yyscan_t yyscanner) +#endif /* !YY_DECL */ + +/* yy_get_previous_state - get the state just before the EOB char was reached */ + +#undef YY_NEW_FILE +#undef YY_FLUSH_BUFFER +#undef yy_set_bol +#undef yy_new_buffer +#undef yy_set_interactive +#undef YY_DO_BEFORE_ACTION + +#ifdef YY_DECL_IS_OURS +#undef YY_DECL_IS_OURS +#undef YY_DECL +#endif + +#ifndef fts0b_create_buffer_ALREADY_DEFINED +#undef yy_create_buffer +#endif +#ifndef fts0b_delete_buffer_ALREADY_DEFINED +#undef yy_delete_buffer +#endif +#ifndef fts0b_scan_buffer_ALREADY_DEFINED +#undef yy_scan_buffer +#endif +#ifndef fts0b_scan_string_ALREADY_DEFINED +#undef yy_scan_string +#endif +#ifndef fts0b_scan_bytes_ALREADY_DEFINED +#undef yy_scan_bytes +#endif +#ifndef fts0b_init_buffer_ALREADY_DEFINED +#undef yy_init_buffer +#endif +#ifndef fts0b_flush_buffer_ALREADY_DEFINED +#undef yy_flush_buffer +#endif +#ifndef fts0b_load_buffer_state_ALREADY_DEFINED +#undef yy_load_buffer_state +#endif +#ifndef fts0b_switch_to_buffer_ALREADY_DEFINED +#undef yy_switch_to_buffer +#endif +#ifndef fts0bpush_buffer_state_ALREADY_DEFINED +#undef yypush_buffer_state +#endif +#ifndef fts0bpop_buffer_state_ALREADY_DEFINED +#undef yypop_buffer_state +#endif +#ifndef fts0bensure_buffer_stack_ALREADY_DEFINED +#undef yyensure_buffer_stack +#endif +#ifndef fts0blex_ALREADY_DEFINED +#undef yylex +#endif +#ifndef fts0brestart_ALREADY_DEFINED +#undef yyrestart +#endif +#ifndef fts0blex_init_ALREADY_DEFINED +#undef yylex_init +#endif +#ifndef fts0blex_init_extra_ALREADY_DEFINED +#undef yylex_init_extra +#endif +#ifndef fts0blex_destroy_ALREADY_DEFINED +#undef yylex_destroy +#endif +#ifndef fts0bget_debug_ALREADY_DEFINED +#undef yyget_debug +#endif +#ifndef fts0bset_debug_ALREADY_DEFINED +#undef yyset_debug +#endif +#ifndef fts0bget_extra_ALREADY_DEFINED +#undef yyget_extra +#endif +#ifndef fts0bset_extra_ALREADY_DEFINED +#undef yyset_extra +#endif +#ifndef fts0bget_in_ALREADY_DEFINED +#undef yyget_in +#endif +#ifndef fts0bset_in_ALREADY_DEFINED +#undef yyset_in +#endif +#ifndef fts0bget_out_ALREADY_DEFINED +#undef yyget_out +#endif +#ifndef fts0bset_out_ALREADY_DEFINED +#undef yyset_out +#endif +#ifndef fts0bget_leng_ALREADY_DEFINED +#undef yyget_leng +#endif +#ifndef fts0bget_text_ALREADY_DEFINED +#undef yyget_text +#endif +#ifndef fts0bget_lineno_ALREADY_DEFINED +#undef yyget_lineno +#endif +#ifndef fts0bset_lineno_ALREADY_DEFINED +#undef yyset_lineno +#endif +#ifndef fts0bget_column_ALREADY_DEFINED +#undef yyget_column +#endif +#ifndef fts0bset_column_ALREADY_DEFINED +#undef 
yyset_column +#endif +#ifndef fts0bwrap_ALREADY_DEFINED +#undef yywrap +#endif +#ifndef fts0bget_lval_ALREADY_DEFINED +#undef yyget_lval +#endif +#ifndef fts0bset_lval_ALREADY_DEFINED +#undef yyset_lval +#endif +#ifndef fts0bget_lloc_ALREADY_DEFINED +#undef yyget_lloc +#endif +#ifndef fts0bset_lloc_ALREADY_DEFINED +#undef yyset_lloc +#endif +#ifndef fts0balloc_ALREADY_DEFINED +#undef yyalloc +#endif +#ifndef fts0brealloc_ALREADY_DEFINED +#undef yyrealloc +#endif +#ifndef fts0bfree_ALREADY_DEFINED +#undef yyfree +#endif +#ifndef fts0btext_ALREADY_DEFINED +#undef yytext +#endif +#ifndef fts0bleng_ALREADY_DEFINED +#undef yyleng +#endif +#ifndef fts0bin_ALREADY_DEFINED +#undef yyin +#endif +#ifndef fts0bout_ALREADY_DEFINED +#undef yyout +#endif +#ifndef fts0b_flex_debug_ALREADY_DEFINED +#undef yy_flex_debug +#endif +#ifndef fts0blineno_ALREADY_DEFINED +#undef yylineno +#endif +#ifndef fts0btables_fload_ALREADY_DEFINED +#undef yytables_fload +#endif +#ifndef fts0btables_destroy_ALREADY_DEFINED +#undef yytables_destroy +#endif +#ifndef fts0bTABLES_NAME_ALREADY_DEFINED +#undef yyTABLES_NAME +#endif + +#line 74 "fts0blex.l" + + +#line 701 "../include/fts0blex.h" +#undef fts0bIN_HEADER +#endif /* fts0bHEADER_H */ diff --git a/storage/innobase/include/fts0fts.h b/storage/innobase/include/fts0fts.h new file mode 100644 index 00000000..c0151b44 --- /dev/null +++ b/storage/innobase/include/fts0fts.h @@ -0,0 +1,947 @@ +/***************************************************************************** + +Copyright (c) 2011, 2018, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2016, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fts0fts.h +Full text search header file + +Created 2011/09/02 Sunny Bains +***********************************************************************/ + +#pragma once + +#include "data0type.h" +#include "data0types.h" +#include "mem0mem.h" +#include "rem0types.h" +#include "row0types.h" +#include "trx0types.h" +#include "ut0vec.h" +#include "ut0rbt.h" +#include "ut0wqueue.h" +#include "que0types.h" +#include "ft_global.h" +#include "mysql/plugin_ftparser.h" + +/** "NULL" value of a document id. */ +#define FTS_NULL_DOC_ID 0 + +/** FTS hidden column that is used to map to and from the row */ +#define FTS_DOC_ID_COL_NAME "FTS_DOC_ID" + +/** The name of the index created by FTS */ +#define FTS_DOC_ID_INDEX_NAME "FTS_DOC_ID_INDEX" + +#define FTS_DOC_ID_INDEX_NAME_LEN 16 + +/** Doc ID is a 8 byte value */ +#define FTS_DOC_ID_LEN 8 + +/** The number of fields to sort when we build FT index with +FIC. 
Three fields are sorted: (word, doc_id, position) */ +#define FTS_NUM_FIELDS_SORT 3 + +/** Maximum number of rows in a table, below which we will +optimize using a 4 byte Doc ID for FIC merge sort to reduce sort size */ +#define MAX_DOC_ID_OPT_VAL 1073741824 + +/** Document id type. */ +typedef ib_id_t doc_id_t; + +/** doc_id_t printf format */ +#define FTS_DOC_ID_FORMAT IB_ID_FMT + +/** Convert document id to the InnoDB (BIG ENDIAN) storage format. */ +#define fts_write_doc_id(d, s) mach_write_to_8(d, s) + +/** Read a document id to internal format. */ +#define fts_read_doc_id(s) mach_read_from_8(s) + +/** Bind the doc id to a variable */ +#define fts_bind_doc_id(i, n, v) pars_info_bind_int8_literal(i, n, v) + +/** Defines for FTS query mode; they have the same values as +those defined in mysql file ft_global.h */ +#define FTS_NL 0 +#define FTS_BOOL 1 +#define FTS_SORTED 2 +#define FTS_EXPAND 4 +#define FTS_NO_RANKING 8 +#define FTS_PROXIMITY 16 +#define FTS_PHRASE 32 +#define FTS_OPT_RANKING 64 + +#define FTS_INDEX_TABLE_IND_NAME "FTS_INDEX_TABLE_IND" + +/** The number of FTS index partitions for a fulltext index */ +#define FTS_NUM_AUX_INDEX 6 + +/** Threshold where our optimize thread automatically kicks in */ +#define FTS_OPTIMIZE_THRESHOLD 10000000 + +/** Maximum possible Fulltext word length in bytes (assuming mbmaxlen=4) */ +#define FTS_MAX_WORD_LEN (HA_FT_MAXCHARLEN * 4) + +/** Maximum possible Fulltext word length (in characters) */ +#define FTS_MAX_WORD_LEN_IN_CHAR HA_FT_MAXCHARLEN + +/** Number of columns in FTS AUX Tables */ +#define FTS_DELETED_TABLE_NUM_COLS 1 +#define FTS_CONFIG_TABLE_NUM_COLS 2 +#define FTS_AUX_INDEX_TABLE_NUM_COLS 5 + +/** DELETED_TABLE(doc_id BIGINT UNSIGNED) */ +#define FTS_DELETED_TABLE_COL_LEN 8 +/** CONFIG_TABLE(key CHAR(50), value CHAR(200)) */ +#define FTS_CONFIG_TABLE_KEY_COL_LEN 50 +#define FTS_CONFIG_TABLE_VALUE_COL_LEN 200 + +#define FTS_INDEX_FIRST_DOC_ID_LEN 8 +#define FTS_INDEX_LAST_DOC_ID_LEN 8 +#define FTS_INDEX_DOC_COUNT_LEN 4 +/* BLOB COLUMN, 0 means VARIABLE SIZE */ +#define FTS_INDEX_ILIST_LEN 0 + + +/** Variable specifying the FTS parallel sort degree */ +extern ulong fts_sort_pll_degree; + +/** Variable specifying the number of words to optimize for each optimize table +call */ +extern ulong fts_num_word_optimize; + +/** Variable specifying whether we do additional FTS diagnostic printout +in the log */ +extern char fts_enable_diag_print; + +/** FTS rank type, which will be between 0 .. 1 inclusive */ +typedef float fts_rank_t; + +/** Type of a row during a transaction. FTS_NOTHING means the row can be +forgotten from the FTS system's POV, FTS_INVALID is an internal value used +to mark invalid states. + +NOTE: Do not change the order or value of these, fts_trx_row_get_new_state +depends on them being exactly as they are. */ +enum fts_row_state { + FTS_INSERT = 0, + FTS_MODIFY, + FTS_DELETE, + FTS_NOTHING, + FTS_INVALID +};
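fts_write_doc_id() and fts_read_doc_id() above delegate to mach_write_to_8()/mach_read_from_8(), which store the 8-byte id most significant byte first. A standalone equivalent of that big-endian conversion, for illustration only (not the mach_* implementation):

#include <cstdint>

static void write_doc_id_be(unsigned char* dst, uint64_t doc_id)
{
    for (int i = 0; i < 8; i++) {   // most significant byte first
        dst[i] = static_cast<unsigned char>(doc_id >> (56 - 8 * i));
    }
}

static uint64_t read_doc_id_be(const unsigned char* src)
{
    uint64_t doc_id = 0;
    for (int i = 0; i < 8; i++) {
        doc_id = (doc_id << 8) | src[i];
    }
    return doc_id;
}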
/** The FTS table types. */ +enum fts_table_type_t { + FTS_INDEX_TABLE, /*!< FTS auxiliary table that is + specific to a particular FTS index + on a table */ + + FTS_COMMON_TABLE /*!< FTS auxiliary table that is common + for all FTS indexes on a table */ +}; + +struct fts_doc_t; +struct fts_cache_t; +struct fts_token_t; +struct fts_doc_ids_t; +struct fts_index_cache_t; + + +/** Initialize the "fts_table" for internal query into FTS auxiliary +tables */ +#define FTS_INIT_FTS_TABLE(fts_table, m_suffix, m_type, m_table)\ +do { \ + (fts_table)->suffix = m_suffix; \ + (fts_table)->type = m_type; \ + (fts_table)->table_id = m_table->id; \ + (fts_table)->table = m_table; \ +} while (0); + +#define FTS_INIT_INDEX_TABLE(fts_table, m_suffix, m_type, m_index)\ +do { \ + (fts_table)->suffix = m_suffix; \ + (fts_table)->type = m_type; \ + (fts_table)->table_id = m_index->table->id; \ + (fts_table)->table = m_index->table; \ + (fts_table)->index_id = m_index->id; \ +} while (0); + +/** Information about changes in a single transaction affecting +the FTS system. */ +struct fts_trx_t { + trx_t* trx; /*!< InnoDB transaction */ + + ib_vector_t* savepoints; /*!< Active savepoints, must have at + least one element, the implied + savepoint */ + ib_vector_t* last_stmt; /*!< last_stmt */ + + mem_heap_t* heap; /*!< heap */ +}; + +/** Information required for transaction savepoint handling. */ +struct fts_savepoint_t { + char* name; /*!< First entry is always NULL, the + default instance. Otherwise the name + of the savepoint */ + + ib_rbt_t* tables; /*!< Modified FTS tables */ +}; + +/** Information about changed rows in a transaction for a single table. */ +struct fts_trx_table_t { + dict_table_t* table; /*!< table */ + + fts_trx_t* fts_trx; /*!< link to parent */ + + ib_rbt_t* rows; /*!< rows changed; indexed by doc-id, + cells are fts_trx_row_t* */ + + fts_doc_ids_t* added_doc_ids; /*!< list of added doc ids (NULL until + the first addition) */ + + /*!< for adding doc ids */ + que_t* docs_added_graph; +}; + +/** Information about one changed row in a transaction. */ +struct fts_trx_row_t { + doc_id_t doc_id; /*!< Id of the ins/upd/del document */ + + fts_row_state state; /*!< state of the row */ + + ib_vector_t* fts_indexes; /*!< The indexes that are affected */ +}; + +/** List of document ids that were added during a transaction. This +list is passed on to a background 'Add' thread and OPTIMIZE, so it +needs its own memory heap. */ +struct fts_doc_ids_t { + ib_vector_t* doc_ids; /*!< document ids (each element is + of type doc_id_t). */ + + ib_alloc_t* self_heap; /*!< Allocator used to create an + instance of this type and the + doc_ids vector */ +};
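FTS_INIT_FTS_TABLE above just stamps an fts_table_t with the identity of its parent table. A hedged usage sketch; the "CONFIG" suffix is illustrative of the common auxiliary tables described later in this header, and the surrounding function exists only to give the statement a valid scope:

// Hedged usage sketch of FTS_INIT_FTS_TABLE.
static void init_config_fts_table(const dict_table_t* table)
{
    fts_table_t fts_table;

    /* After this, fts_table carries the suffix, type, table id and
    table pointer needed to name the CONFIG auxiliary table. */
    FTS_INIT_FTS_TABLE(&fts_table, "CONFIG", FTS_COMMON_TABLE, table);
    (void) fts_table;
}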
// FIXME: Get rid of this if possible. +/** Since MySQL's character set support for Unicode is woefully inadequate +(it supports basic operations like isalpha etc. only for 8-bit characters), +we have to implement our own. We use UTF-16 without surrogate processing +as our in-memory format. This typedef is a single such character. */ +typedef unsigned short ib_uc_t; + +/** A UTF-16 or UTF-8 string. */ +struct fts_string_t { + byte* f_str; /*!< string, not necessarily terminated in + any way */ + ulint f_len; /*!< Length of the string in bytes */ + ulint f_n_char; /*!< Number of characters */ +}; + +/** Query ranked doc ids. */ +struct fts_ranking_t { + doc_id_t doc_id; /*!< Document id */ + + fts_rank_t rank; /*!< Rank is between 0 .. 1 */ + + byte* words; /*!< this contains the words + that were queried + and found in this document */ + ulint words_len; /*!< words len */ +}; + +/** Query result. */ +struct fts_result_t { + ib_rbt_node_t* current; /*!< Current element */ + + ib_rbt_t* rankings_by_id; /*!< RB tree of type fts_ranking_t + indexed by doc id */ + ib_rbt_t* rankings_by_rank;/*!< RB tree of type fts_ranking_t + indexed by rank */ +}; + +/** This is used to generate the FTS auxiliary table name, we need the +table id and the index id to generate the column specific FTS auxiliary +table name. */ +struct fts_table_t { + fts_table_type_t + type; /*!< The auxiliary table type */ + + table_id_t table_id; /*!< The table id */ + + index_id_t index_id; /*!< The index id */ + + const char* suffix; /*!< The suffix of the fts auxiliary + table name, can be NULL, not used + everywhere (yet) */ + const dict_table_t* + table; /*!< Parent table */ + CHARSET_INFO* charset; /*!< charset info if it is for FTS + index auxiliary table */ +}; + +/** The state of the FTS subsystem. */ +class fts_t { +public: + /** fts_t constructor. + @param[in] table table with FTS indexes + @param[in,out] heap memory heap where 'this' is stored */ + fts_t( + const dict_table_t* table, + mem_heap_t* heap); + + /** fts_t destructor. */ + ~fts_t(); + + /** Whether the ADDED table record sync-ed after crash recovery */ + unsigned added_synced:1; + /** Whether the table holds dict_sys.latch */ + unsigned dict_locked:1; + + /** Work queue for scheduling jobs for the FTS 'Add' thread, or NULL + if the thread has not yet been created. Each work item is a + fts_trx_doc_ids_t*. */ + ib_wqueue_t* add_wq; + + /** FTS memory buffer for this table, or NULL if the table has no FTS + index. */ + fts_cache_t* cache; + + /** FTS doc id hidden column number in the CLUSTERED index. */ + ulint doc_col; + + /** Vector of FTS indexes, this is mainly for caching purposes. */ + ib_vector_t* indexes; + + /** Whether the table exists in fts_optimize_wq; + protected by fts_optimize_wq mutex */ + bool in_queue; + + /** Whether the sync message exists in fts_optimize_wq; + protected by fts_optimize_wq mutex */ + bool sync_message; + + /** Heap for fts_t allocation. */ + mem_heap_t* fts_heap; +}; + +struct fts_stopword_t; + +/** status bits for fts_stopword_t status field. */ +#define STOPWORD_NOT_INIT 0x1 +#define STOPWORD_OFF 0x2 +#define STOPWORD_FROM_DEFAULT 0x4 +#define STOPWORD_USER_TABLE 0x8 + +extern const char* fts_default_stopword[]; + +/** Variable specifying the maximum FTS cache size for each table */ +extern Atomic_relaxed<size_t> fts_max_cache_size; + +/** Variable specifying the total memory allocated for FTS cache */ +extern Atomic_relaxed<size_t> fts_max_total_cache_size; + +/** Variable specifying the FTS result cache limit for each query */ +extern size_t fts_result_cache_limit; + +/** Variable specifying the maximum FTS max token size */ +extern ulong fts_max_token_size; + +/** Variable specifying the minimum FTS max token size */ +extern ulong fts_min_token_size; + +/** Whether the total memory used for FTS cache is exhausted, and we will +need a sync to free some memory */ +extern bool fts_need_sync; + +/******************************************************************//** +Create a FTS cache. */ +fts_cache_t* +fts_cache_create( +/*=============*/ + dict_table_t* table); /*!< table owns the FTS cache */ + +/******************************************************************//** +Create a FTS index cache.
+/******************************************************************//**
+Create an FTS index cache.
+@return Index Cache */
+fts_index_cache_t*
+fts_cache_index_cache_create(
+/*=========================*/
+	dict_table_t*	table,		/*!< in: table with FTS index */
+	dict_index_t*	index);		/*!< in: FTS index */
+
+/******************************************************************//**
+Get the next available document id. This function creates a new
+transaction to generate the document id.
+@return DB_SUCCESS if OK */
+dberr_t
+fts_get_next_doc_id(
+/*================*/
+	const dict_table_t*	table,	/*!< in: table */
+	doc_id_t*		doc_id);/*!< out: new document id */
+
+/******************************************************************//**
+Create a new fts_doc_ids_t.
+@return new fts_doc_ids_t. */
+fts_doc_ids_t*
+fts_doc_ids_create(void);
+/*=====================*/
+
+/** Free fts_doc_ids_t */
+inline void fts_doc_ids_free(fts_doc_ids_t* doc_ids)
+{
+	mem_heap_free(static_cast<mem_heap_t*>(doc_ids->self_heap->arg));
+}
+
+/******************************************************************//**
+Notify the FTS system about an operation on an FTS-indexed table. */
+void
+fts_trx_add_op(
+/*===========*/
+	trx_t*		trx,		/*!< in: InnoDB transaction */
+	dict_table_t*	table,		/*!< in: table */
+	doc_id_t	doc_id,		/*!< in: doc id */
+	fts_row_state	state,		/*!< in: state of the row */
+	ib_vector_t*	fts_indexes);	/*!< in: FTS indexes affected
+					(NULL=all) */
+
+/******************************************************************//**
+Free an FTS trx. */
+void
+fts_trx_free(
+/*=========*/
+	fts_trx_t*	fts_trx);	/*!< in, own: FTS trx */
+
+/** Creates the common auxiliary tables needed for supporting an FTS index
+on the given table.
+The following tables are created.
+CREATE TABLE $FTS_PREFIX_DELETED
+	(doc_id BIGINT UNSIGNED, UNIQUE CLUSTERED INDEX on doc_id)
+CREATE TABLE $FTS_PREFIX_DELETED_CACHE
+	(doc_id BIGINT UNSIGNED, UNIQUE CLUSTERED INDEX on doc_id)
+CREATE TABLE $FTS_PREFIX_BEING_DELETED
+	(doc_id BIGINT UNSIGNED, UNIQUE CLUSTERED INDEX on doc_id)
+CREATE TABLE $FTS_PREFIX_BEING_DELETED_CACHE
+	(doc_id BIGINT UNSIGNED, UNIQUE CLUSTERED INDEX on doc_id)
+CREATE TABLE $FTS_PREFIX_CONFIG
+	(key CHAR(50), value CHAR(200), UNIQUE CLUSTERED INDEX on key)
+@param[in,out]	trx			transaction
+@param[in]	table			table with FTS index
+@param[in]	skip_doc_id_index	Skip index on doc id
+@return DB_SUCCESS on success */
+dberr_t
+fts_create_common_tables(
+	trx_t*		trx,
+	dict_table_t*	table,
+	bool		skip_doc_id_index)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/** Creates the column specific ancillary tables needed for supporting an
+FTS index on the given table.
+
+All FTS AUX Index tables have the following schema.
+CREATE TABLE $FTS_PREFIX_INDEX_[1-6](
+	word		VARCHAR(FTS_MAX_WORD_LEN),
+	first_doc_id	INT NOT NULL,
+	last_doc_id	UNSIGNED NOT NULL,
+	doc_count	UNSIGNED INT NOT NULL,
+	ilist		VARBINARY NOT NULL,
+	UNIQUE CLUSTERED INDEX ON (word, first_doc_id))
+@param[in,out]	trx	dictionary transaction
+@param[in]	index	fulltext index
+@param[in]	id	table id
+@return DB_SUCCESS or error code */
+dberr_t
+fts_create_index_tables(trx_t* trx, const dict_index_t* index, table_id_t id)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/******************************************************************//**
+Add the FTS document id hidden column. */
+void
+fts_add_doc_id_column(
+/*==================*/
+	dict_table_t*	table,	/*!< in/out: Table with FTS index */
+	mem_heap_t*	heap);	/*!< in: temporary memory heap, or NULL */
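/* Editorial sketch: the intended lifecycle of fts_doc_ids_t, created by
fts_doc_ids_create() and released by fts_doc_ids_free() above.  The doc
id value is made up for illustration. */
static void example_doc_ids_lifecycle()
{
	fts_doc_ids_t*	ids = fts_doc_ids_create();

	doc_id_t	doc_id = 42;	/* hypothetical document id */

	/* ids->doc_ids is a vector of doc_id_t backed by ids->self_heap. */
	ib_vector_push(ids->doc_ids, &doc_id);

	/* Freeing releases the heap that owns both the vector and the
	fts_doc_ids_t instance itself. */
	fts_doc_ids_free(ids);
}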
+/** Lock the internal FTS_ tables for an index, before fts_drop_index_tables().
+@param trx	transaction
+@param index	fulltext index */
+dberr_t fts_lock_index_tables(trx_t *trx, const dict_index_t &index);
+
+/** Lock the internal common FTS_ tables, before fts_drop_common_tables().
+@param trx	transaction
+@param table	table containing FULLTEXT INDEX
+@return DB_SUCCESS or error code */
+dberr_t fts_lock_common_tables(trx_t *trx, const dict_table_t &table);
+
+/** Lock the internal FTS_ tables for table, before fts_drop_tables().
+@param trx	transaction
+@param table	table containing FULLTEXT INDEX
+@return DB_SUCCESS or error code */
+dberr_t fts_lock_tables(trx_t *trx, const dict_table_t &table);
+
+/** Drop the internal FTS_ tables for table.
+@param trx	transaction
+@param table	table containing FULLTEXT INDEX
+@return DB_SUCCESS or error code */
+dberr_t fts_drop_tables(trx_t *trx, const dict_table_t &table);
+
+/******************************************************************//**
+The given transaction is about to be committed; do whatever is necessary
+from the FTS system's POV.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_commit(
+/*=======*/
+	trx_t*	trx)	/*!< in: transaction */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** FTS Query entry point.
+@param[in,out]	trx		transaction
+@param[in]	index		fts index to search
+@param[in]	flags		FTS search mode
+@param[in]	query_str	FTS query
+@param[in]	query_len	FTS query string len in bytes
+@param[in,out]	result		result doc ids
+@return DB_SUCCESS if successful otherwise error code */
+dberr_t
+fts_query(
+	trx_t*		trx,
+	dict_index_t*	index,
+	uint		flags,
+	const byte*	query_str,
+	ulint		query_len,
+	fts_result_t**	result)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/******************************************************************//**
+Retrieve the FTS Relevance Ranking result for doc with doc_id
+@return the relevance ranking value. */
+float
+fts_retrieve_ranking(
+/*=================*/
+	fts_result_t*	result,	/*!< in: FTS result structure */
+	doc_id_t	doc_id);	/*!< in: doc_id of the document
+					of interest */
+
+/******************************************************************//**
+Sort the result returned by fts_query() on fts_ranking_t::rank. */
+void
+fts_query_sort_result_on_rank(
+/*==========================*/
+	fts_result_t*	result);	/*!< out: result instance
+					to sort.*/
+
+/******************************************************************//**
+Free a query result returned by fts_query(). */
+void
+fts_query_free_result(
+/*==================*/
+	fts_result_t*	result);	/*!< in: result instance
+					to free.*/
+
+/******************************************************************//**
+Extract the doc id from the FTS hidden column. */
+doc_id_t
+fts_get_doc_id_from_row(
+/*====================*/
+	dict_table_t*	table,	/*!< in: table */
+	dtuple_t*	row);	/*!< in: row whose FTS doc id we
+				want to extract.*/
+
+/** Extract the doc id from the record that belongs to index.
+@param[in]	rec	record containing FTS_DOC_ID
+@param[in]	index	index of rec
+@param[in]	offsets	rec_get_offsets(rec,index)
+@return doc id that was extracted from rec */
+doc_id_t
+fts_get_doc_id_from_rec(
+	const rec_t*		rec,
+	const dict_index_t*	index,
+	const rec_offs*		offsets);
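/* Editorial sketch: a typical call sequence for the query API declared
above.  The flags value and the query text are illustrative, and error
handling is reduced to the bare minimum. */
dberr_t example_query(trx_t* trx, dict_index_t* index)
{
	fts_result_t*	result = NULL;
	const byte*	q = reinterpret_cast<const byte*>("database");

	/* 0 = plain natural-language mode here; real callers OR in the
	query flag bits defined elsewhere in this header. */
	dberr_t	err = fts_query(trx, index, 0, q, 8, &result);

	if (err == DB_SUCCESS && result != NULL) {
		/* Rankings can be re-sorted by rank before reading. */
		fts_query_sort_result_on_rank(result);

		fts_query_free_result(result);
	}

	return(err);
}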
+/** Add new fts doc id to the update vector.
+@param[in]	table		the table that contains the FTS index.
+@param[in,out]	ufield		the fts doc id field in the update vector.
+				No new memory is allocated for this in this
+				function.
+@param[in,out]	next_doc_id	the fts doc id that has been added to the
+				update vector. If 0, a new fts doc id is
+				automatically generated. The memory provided
+				for this argument will be used by the update
+				vector. Ensure that the lifetime of this
+				memory matches that of the update vector.
+@return the fts doc id used in the update vector */
+doc_id_t
+fts_update_doc_id(
+	dict_table_t*	table,
+	upd_field_t*	ufield,
+	doc_id_t*	next_doc_id);
+
+/******************************************************************//**
+FTS initialize. */
+void
+fts_startup(void);
+/*==============*/
+
+/******************************************************************//**
+Create an instance of fts_t.
+@return instance of fts_t */
+fts_t*
+fts_create(
+/*=======*/
+	dict_table_t*	table);		/*!< out: table with FTS
+					indexes */
+
+/*********************************************************************//**
+Run OPTIMIZE on the given table.
+@return DB_SUCCESS if all OK */
+dberr_t
+fts_optimize_table(
+/*===============*/
+	dict_table_t*	table);	/*!< in: table to optimize */
+
+/**********************************************************************//**
+Startup the optimize thread and create the work queue. */
+void
+fts_optimize_init(void);
+/*====================*/
+
+/****************************************************************//**
+Drops index ancillary tables for an FTS index
+@return DB_SUCCESS or error code */
+dberr_t fts_drop_index_tables(trx_t *trx, const dict_index_t &index)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Add the table to the OPTIMIZER's list.
+@param[in]	table	table to add */
+void
+fts_optimize_add_table(
+	dict_table_t*	table);
+
+/******************************************************************//**
+Remove the table from the OPTIMIZER's list. We do wait for
+acknowledgement from the consumer of the message. */
+void
+fts_optimize_remove_table(
+/*======================*/
+	dict_table_t*	table);	/*!< in: table to remove */
+
+/** Shutdown fts optimize thread. */
+void
+fts_optimize_shutdown();
+
+/** Send sync fts cache for the table.
+@param[in]	table	table to sync */
+void
+fts_optimize_request_sync_table(
+	dict_table_t*	table);
+
+/**********************************************************************//**
+Take an FTS savepoint. */
+void
+fts_savepoint_take(
+/*===============*/
+	fts_trx_t*	fts_trx,	/*!< in: fts transaction */
+	const char*	name);		/*!< in: savepoint name */
+
+/**********************************************************************//**
+Refresh last statement savepoint. */
+void
+fts_savepoint_laststmt_refresh(
+/*===========================*/
+	trx_t*		trx);	/*!< in: transaction */
+
+/**********************************************************************//**
+Release the savepoint data identified by name. */
+void
+fts_savepoint_release(
+/*==================*/
+	trx_t*		trx,	/*!< in: transaction */
+	const char*	name);	/*!< in: savepoint name */
+
+/** Clear cache.
+@param[in,out]	cache	fts cache */
+void
+fts_cache_clear(
+	fts_cache_t*	cache);
+
+/*********************************************************************//**
+Initialize things in cache. */
+void
+fts_cache_init(
+/*===========*/
+	fts_cache_t*	cache);	/*!< in: cache */
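/* Editorial sketch: how the savepoint hooks above line up with SQL
statements.  It assumes trx->fts_trx was set up via fts_trx_create();
the savepoint name is illustrative, and fts_savepoint_rollback() is
declared just below. */
void example_savepoints(trx_t* trx)
{
	/* SAVEPOINT sp1; */
	fts_savepoint_take(trx->fts_trx, "sp1");

	/* ... FTS-indexed DML recorded via fts_trx_add_op() ... */

	/* ROLLBACK TO SAVEPOINT sp1; discards the FTS operations
	recorded since the savepoint was taken. */
	fts_savepoint_rollback(trx, "sp1");
}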
+/*********************************************************************//**
+Rollback to and including savepoint identified by name. */
+void
+fts_savepoint_rollback(
+/*===================*/
+	trx_t*		trx,	/*!< in: transaction */
+	const char*	name);	/*!< in: savepoint name */
+
+/*********************************************************************//**
+Rollback the FTS operations of the last statement. */
+void
+fts_savepoint_rollback_last_stmt(
+/*=============================*/
+	trx_t*		trx);	/*!< in: transaction */
+
+/** Run SYNC on the table, i.e., write out data from the cache to the
+FTS auxiliary INDEX table and clear the cache at the end.
+@param[in,out]	table	fts table
+@param[in]	wait	whether to wait for existing sync to finish
+@return DB_SUCCESS on success, error code on failure. */
+dberr_t fts_sync_table(dict_table_t* table, bool wait = true);
+
+/****************************************************************//**
+Get the character set of an FTS index. */
+CHARSET_INFO*
+fts_index_get_charset(
+/*==================*/
+	dict_index_t*	index);	/*!< in: FTS index */
+
+/*********************************************************************//**
+Get the initial Doc ID by consulting the CONFIG table
+@return initial Doc ID */
+doc_id_t
+fts_init_doc_id(
+/*============*/
+	const dict_table_t*	table);	/*!< in: table */
+
+/******************************************************************//**
+Compare two character strings according to their charset. */
+extern
+int
+innobase_fts_text_cmp(
+/*==================*/
+	const void*	cs,	/*!< in: Character set */
+	const void*	p1,	/*!< in: key */
+	const void*	p2);	/*!< in: node */
+
+/******************************************************************//**
+Makes all characters in a string lower case. */
+extern
+size_t
+innobase_fts_casedn_str(
+/*====================*/
+	CHARSET_INFO*	cs,	/*!< in: Character set */
+	char*		src,	/*!< in: string to put in
+				lower case */
+	size_t		src_len,	/*!< in: input string length */
+	char*		dst,	/*!< in: buffer for result
+				string */
+	size_t		dst_len);	/*!< in: buffer size */
+
+
+/******************************************************************//**
+Compare two character strings according to their charset. */
+extern
+int
+innobase_fts_text_cmp_prefix(
+/*=========================*/
+	const void*	cs,	/*!< in: Character set */
+	const void*	p1,	/*!< in: key */
+	const void*	p2);	/*!< in: node */
+
+/*************************************************************//**
+Get the next token from the given string and store it in *token. */
+extern
+ulint
+innobase_mysql_fts_get_token(
+/*=========================*/
+	CHARSET_INFO*	charset,	/*!< in: Character set */
+	const byte*	start,		/*!< in: start of text */
+	const byte*	end,		/*!< in: one character past
+					end of text */
+	fts_string_t*	token);		/*!< out: token's text */
+
+/*************************************************************//**
+Get the token's character count under the given charset
+@return number of characters in the token */
+ulint
+fts_get_token_size(
+/*===============*/
+	const CHARSET_INFO*	cs,	/*!< in: Character set */
+	const char*		token,	/*!< in: token */
+	ulint			len);	/*!< in: token length */
+
+/*************************************************************//**
+FULLTEXT tokenizer internal in MYSQL_FTPARSER_SIMPLE_MODE
+@return 0 if tokenized successfully */
+int
+fts_tokenize_document_internal(
+/*===========================*/
+	MYSQL_FTPARSER_PARAM*	param,	/*!< in: parser parameter */
+	const char*		doc,	/*!< in: document to tokenize */
+	int			len);	/*!< in: document length */
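/* Editorial sketch: driving innobase_mysql_fts_get_token() above over a
UTF-8 buffer.  The return value is taken to be the number of bytes
consumed, matching its use elsewhere in the FTS code; treat that as an
assumption of this sketch. */
void example_tokenize(CHARSET_INFO* cs, const byte* doc, ulint len)
{
	const byte*	start = doc;
	const byte*	end = doc + len;
	fts_string_t	token;

	while (start < end) {
		ulint	consumed = innobase_mysql_fts_get_token(
			cs, start, end, &token);

		if (consumed == 0) {
			break;	/* nothing more to scan */
		}

		if (token.f_len > 0) {
			/* token.f_str / token.f_len frame one word. */
		}

		start += consumed;
	}
}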
+/*********************************************************************//**
+Fetch COUNT(*) from the specified table.
+@return the number of rows in the table */
+ulint
+fts_get_rows_count(
+/*===============*/
+	fts_table_t*	fts_table);	/*!< in: fts table to read */
+
+/*************************************************************//**
+Get maximum Doc ID in a table if index "FTS_DOC_ID_INDEX" exists
+@return max Doc ID or 0 if index "FTS_DOC_ID_INDEX" does not exist */
+doc_id_t
+fts_get_max_doc_id(
+/*===============*/
+	dict_table_t*	table);	/*!< in: user table */
+
+/** Check whether a stopword table is in the right format.
+@param stopword_table_name	table name
+@param row_end			name of the system-versioning end column,
+				or "value"
+@return the stopword column charset
+@retval NULL if the table does not exist or qualify */
+CHARSET_INFO *fts_valid_stopword_table(const char *stopword_table_name,
+				       const char **row_end= NULL);
+
+/****************************************************************//**
+This function loads the specified stopwords into the FTS cache
+@return true if success */
+bool
+fts_load_stopword(
+/*==============*/
+	const dict_table_t*
+			table,			/*!< in: Table with FTS */
+	trx_t*		trx,			/*!< in: Transaction */
+	const char*	session_stopword_table,	/*!< in: Session stopword table
+						name */
+	bool		stopword_is_on,		/*!< in: Whether stopword
+						option is turned on/off */
+	bool		reload);		/*!< in: Whether it is during
+						reload of FTS table */
+
+/****************************************************************//**
+Read the rows from the FTS index
+@return DB_SUCCESS if OK */
+dberr_t
+fts_table_fetch_doc_ids(
+/*====================*/
+	trx_t*		trx,		/*!< in: transaction */
+	fts_table_t*	fts_table,	/*!< in: aux table */
+	fts_doc_ids_t*	doc_ids);	/*!< in: For collecting
+					doc ids */
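/* Editorial sketch: loading a session-level stopword table through the
API above, roughly what happens when innodb_ft_user_stopword_table is
set.  The table name is an assumption for illustration. */
void example_load_stopwords(dict_table_t* table, trx_t* trx)
{
	if (!fts_load_stopword(table, trx, "test/my_stopwords",
			       /* stopword_is_on */ true,
			       /* reload */ false)) {
		/* fall back to defaults / report the error here */
	}
}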
+/****************************************************************//**
+This function brings the FTS index in sync when the FTS index is first
+used. There are documents that have not yet been synced to the auxiliary
+tables after the last abnormal server shutdown; we will need to bring
+such documents into the FTS cache before any further operations */
+void
+fts_init_index(
+/*===========*/
+	dict_table_t*	table,		/*!< in: Table with FTS */
+	bool		has_cache_lock);	/*!< in: Whether we already
+						have cache lock */
+/*******************************************************************//**
+Add a newly created index to the FTS cache */
+void
+fts_add_index(
+/*==========*/
+	dict_index_t*	index,	/*!< FTS index to be added */
+	dict_table_t*	table);	/*!< table */
+
+/*******************************************************************//**
+Drop auxiliary tables related to an FTS index
+@return DB_SUCCESS or error number */
+dberr_t
+fts_drop_index(
+/*===========*/
+	dict_table_t*	table,	/*!< in: Table where indexes are dropped */
+	dict_index_t*	index,	/*!< in: Index to be dropped */
+	trx_t*		trx);	/*!< in: Transaction for the drop */
+
+/****************************************************************//**
+Rename auxiliary tables for all fts index for a table
+@return DB_SUCCESS or error code */
+dberr_t
+fts_rename_aux_tables(
+/*==================*/
+	dict_table_t*	table,		/*!< in: user Table */
+	const char*	new_name,	/*!< in: new table name */
+	trx_t*		trx);		/*!< in: transaction */
+
+/*******************************************************************//**
+Check that the indexes in fts->indexes are also present in the index
+cache and the table->indexes list
+@return TRUE if all indexes match */
+ibool
+fts_check_cached_index(
+/*===================*/
+	dict_table_t*	table);	/*!< in: Table where indexes are dropped */
+
+/** Fetch the document from tuple, tokenize the text data and
+insert the text data into the fts auxiliary table and
+its cache. Moreover, the tuple fields don't contain any information
+about externally stored fields. This tuple contains data directly
+converted from MySQL.
+@param[in]	ftt	FTS transaction table
+@param[in]	doc_id	doc id
+@param[in]	tuple	tuple from where data can be retrieved
+			and tuple should be arranged in table
+			schema order. */
+void
+fts_add_doc_from_tuple(
+	fts_trx_table_t*ftt,
+	doc_id_t	doc_id,
+	const dtuple_t*	tuple);
+
+/** Create an FTS trx.
+@param[in,out]	trx	InnoDB Transaction
+@return FTS transaction. */
+fts_trx_t*
+fts_trx_create(
+	trx_t*	trx);
+
+/** Clear all fts resources when there is no internal DOC_ID
+and there are no new fts indexes to add.
+@param[in,out]	table	table where fts is to be freed */
+void fts_clear_all(dict_table_t *table);
+
+/** Check whether the given name is an fts auxiliary table
+and fetch the parent table id and index id
+@param[in]	name		table name
+@param[in,out]	table_id	parent table id
+@param[in,out]	index_id	index id
+@return true if it is an auxiliary table */
+bool fts_check_aux_table(const char *name,
+			 table_id_t *table_id,
+			 index_id_t *index_id);
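/* Editorial sketch: decoding an auxiliary table name back into its
parent table id and index id via fts_check_aux_table() above.  The name
below is fabricated for illustration; real names encode both ids in
hexadecimal. */
void example_decode_aux_name()
{
	table_id_t	table_id;
	index_id_t	index_id;

	if (fts_check_aux_table(
		    "test/FTS_0000000000000123_00000000000000a7_INDEX_1",
		    &table_id, &index_id)) {
		/* table_id and index_id now identify the parent objects. */
	}
}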
+/** Update the last document id. This function could create a new
+transaction to update the last document id.
+@param table	table to be updated
+@param doc_id	last document id
+@param trx	update trx or null
+@retval DB_SUCCESS if OK */
+dberr_t
+fts_update_sync_doc_id(const dict_table_t *table,
+		       doc_id_t doc_id,
+		       trx_t *trx)
+	MY_ATTRIBUTE((nonnull(1)));
+
+/** Sync the table during commit phase
+@param[in]	table	table to be synced */
+void fts_sync_during_ddl(dict_table_t* table);
diff --git a/storage/innobase/include/fts0opt.h b/storage/innobase/include/fts0opt.h
new file mode 100644
index 00000000..c527ad8e
--- /dev/null
+++ b/storage/innobase/include/fts0opt.h
@@ -0,0 +1,39 @@
+/*****************************************************************************
+
+Copyright (c) 2001, 2014, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0opt.h
+Full Text Search optimize thread
+
+Created 2011-02-15 Jimmy Yang
+***********************************************************************/
+#ifndef INNODB_FTS0OPT_H
+#define INNODB_FTS0OPT_H
+
+/** The FTS optimize thread's work queue. */
+extern ib_wqueue_t* fts_optimize_wq;
+
+/********************************************************************
+Callback function to fetch the rows in an FTS INDEX record. */
+ibool
+fts_optimize_index_fetch_node(
+/*==========================*/
+					/* out: always returns non-NULL */
+	void*		row,		/* in: sel_node_t* */
+	void*		user_arg);	/* in: pointer to ib_vector_t */
+#endif
diff --git a/storage/innobase/include/fts0pars.h b/storage/innobase/include/fts0pars.h
new file mode 100644
index 00000000..8108e811
--- /dev/null
+++ b/storage/innobase/include/fts0pars.h
@@ -0,0 +1,72 @@
+/* A Bison parser, made by GNU Bison 2.5. */
+
+/* Bison interface for Yacc-like parsers in C
+
+   Copyright (C) 1984, 1989-1990, 2000-2011 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+
+/* As a special exception, you may create a larger work that contains
+   part or all of the Bison parser skeleton and distribute that work
+   under terms of your choice, so long as that work isn't itself a
+   parser generator using the skeleton or a modified version thereof
+   as a parser skeleton.
Alternatively, if you modify or redistribute
+   the parser skeleton itself, you may (at your option) remove this
+   special exception, which will cause the skeleton and the resulting
+   Bison output files to be licensed under the GNU General Public
+   License without this special exception.
+
+   This special exception was added by the Free Software Foundation in
+   version 2.2 of Bison. */
+
+
+/* Tokens. */
+#ifndef YYTOKENTYPE
+# define YYTOKENTYPE
+   /* Put the tokens into the symbol table, so that GDB and other debuggers
+      know about them. */
+   enum yytokentype {
+     FTS_OPER = 258,
+     FTS_TEXT = 259,
+     FTS_TERM = 260,
+     FTS_NUMB = 261
+   };
+#endif
+
+
+
+#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
+typedef union YYSTYPE
+{
+
+/* Line 2068 of yacc.c */
+#line 61 "fts0pars.y"
+
+	int	oper;
+	fts_ast_string_t*	token;
+	fts_ast_node_t*	node;
+
+
+
+/* Line 2068 of yacc.c */
+#line 64 "fts0pars.hh"
+} YYSTYPE;
+# define YYSTYPE_IS_TRIVIAL 1
+# define yystype YYSTYPE /* obsolescent; will be withdrawn */
+# define YYSTYPE_IS_DECLARED 1
+#endif
+
+
+
+
diff --git a/storage/innobase/include/fts0plugin.h b/storage/innobase/include/fts0plugin.h
new file mode 100644
index 00000000..18ec2d6d
--- /dev/null
+++ b/storage/innobase/include/fts0plugin.h
@@ -0,0 +1,50 @@
+/*****************************************************************************
+
+Copyright (c) 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0plugin.h
+Full text search plugin header file
+
+Created 2013/06/04 Shaohua Wang
+***********************************************************************/
+
+#ifndef INNOBASE_FTS0PLUGIN_H
+#define INNOBASE_FTS0PLUGIN_H
+
+#include "univ.i"
+
+extern struct st_mysql_ftparser fts_default_parser;
+
+struct fts_ast_state_t;
+
+#define PARSER_INIT(parser, arg) if (parser->init) { parser->init(arg); }
+#define PARSER_DEINIT(parser, arg) if (parser->deinit) { parser->deinit(arg); }
+
+/******************************************************************//**
+Parse an FTS query with a plugin parser.
+@return 0 if parsed successfully, non-zero otherwise.
*/
+int
+fts_parse_by_parser(
+/*================*/
+	ibool		mode,		/*!< in: query boolean mode */
+	uchar*		query,		/*!< in: query string */
+	ulint		len,		/*!< in: query string length */
+	st_mysql_ftparser*	parse,	/*!< in: fts plugin parser */
+	fts_ast_state_t*	state);	/*!< in: query parser state */
+
+#endif /* INNOBASE_FTS0PLUGIN_H */
diff --git a/storage/innobase/include/fts0priv.h b/storage/innobase/include/fts0priv.h
new file mode 100644
index 00000000..ae0bb036
--- /dev/null
+++ b/storage/innobase/include/fts0priv.h
@@ -0,0 +1,485 @@
+/*****************************************************************************
+
+Copyright (c) 2011, 2018, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0priv.h
+Full text search internal header file
+
+Created 2011/09/02 Sunny Bains
+***********************************************************************/
+
+#ifndef INNOBASE_FTS0PRIV_H
+#define INNOBASE_FTS0PRIV_H
+
+#include "dict0dict.h"
+#include "pars0pars.h"
+#include "que0que.h"
+#include "que0types.h"
+#include "fts0types.h"
+
+/* The various states of the FTS sub system pertaining to a table with
+FTS indexes defined on it. */
+enum fts_table_state_enum {
+					/* !<This must be 0 since we insert
+					a hard coded '0' at create time
+					to the config table */
+
+	FTS_TABLE_STATE_RUNNING = 0,	/*!< Auxiliary tables created OK */
+
+	FTS_TABLE_STATE_OPTIMIZING,	/*!< This is a substate of RUNNING */
+
+	FTS_TABLE_STATE_DELETED		/*!< All aux tables to be dropped when
+					it's safe to do so */
+};
+
+typedef enum fts_table_state_enum fts_table_state_t;
+
+/** The minimum length of an FTS auxiliary table name's id component,
+e.g., for an auxiliary table name
+
+	"FTS_<TABLE_ID>_SUFFIX"
+
+This constant is for the minimum length required to store the <TABLE_ID>
+component.
+*/
+#define FTS_AUX_MIN_TABLE_ID_LENGTH	48
+
+/** Maximum length of an integer stored in the config table value column. */
+#define FTS_MAX_INT_LEN			32
+
+/******************************************************************//**
+Parse an SQL string. %s is replaced with the table's id.
+@return query graph */
+que_t*
+fts_parse_sql(
+/*==========*/
+	fts_table_t*	fts_table,	/*!< in: FTS aux table */
+	pars_info_t*	info,		/*!< in: info struct, or NULL */
+	const char*	sql)		/*!< in: SQL string to evaluate */
+	MY_ATTRIBUTE((nonnull(3), malloc, warn_unused_result));
+/******************************************************************//**
+Evaluate a parsed SQL statement
+@return DB_SUCCESS or error code */
+dberr_t
+fts_eval_sql(
+/*=========*/
+	trx_t*		trx,		/*!< in: transaction */
+	que_t*		graph)		/*!< in: Parsed statement */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
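/* Editorial sketch: preparing and running one internal statement with
the two functions above.  The SQL text, the bound parameter, and the
config key are illustrative; per the comment above, "%s" in the string
is replaced with the auxiliary table's name. */
dberr_t example_run_internal_sql(trx_t* trx, fts_table_t* fts_table)
{
	pars_info_t*	info = pars_info_create();

	pars_info_add_str_literal(info, "name", "use_stopword");

	que_t*	graph = fts_parse_sql(
		fts_table, info,
		"BEGIN DELETE FROM \"%s\" WHERE key = :name;");

	dberr_t	err = fts_eval_sql(trx, graph);

	que_graph_free(graph);

	return(err);
}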
+/** Construct the name of an internal FTS table for the given table.
+@param[in]	fts_table	metadata on fulltext-indexed table
+@param[out]	table_name	a name up to MAX_FULL_NAME_LEN
+@param[in]	dict_locked	whether dict_sys.latch is being held */
+void fts_get_table_name(const fts_table_t* fts_table, char* table_name,
+			bool dict_locked = false)
+	MY_ATTRIBUTE((nonnull));
+/******************************************************************//**
+Construct the column specification part of the SQL string for selecting the
+indexed FTS columns for the given table. Adds the necessary bound
+ids to the given 'info' and returns the SQL string. Examples:
+
+One indexed column named "text":
+
+ "$sel0",
+ info/ids: sel0 -> "text"
+
+Two indexed columns named "subject" and "content":
+
+ "$sel0, $sel1",
+ info/ids: sel0 -> "subject", sel1 -> "content",
+@return heap-allocated column specification string */
+const char*
+fts_get_select_columns_str(
+/*=======================*/
+	dict_index_t*	index,	/*!< in: FTS index */
+	pars_info_t*	info,	/*!< in/out: parser info */
+	mem_heap_t*	heap)	/*!< in: memory heap */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Values for the "option" parameter of fts_doc_fetch_by_doc_id(),
+selecting whether we want the Doc whose ID is equal to, greater than,
+or smaller than the supplied ID */
+#define	FTS_FETCH_DOC_BY_ID_EQUAL	1
+#define	FTS_FETCH_DOC_BY_ID_LARGE	2
+#define	FTS_FETCH_DOC_BY_ID_SMALL	3
+
+/*************************************************************//**
+Fetch document (= a single row's indexed text) with the given
+document id.
+@return DB_SUCCESS if fetch is successful, else error */
+dberr_t
+fts_doc_fetch_by_doc_id(
+/*====================*/
+	fts_get_doc_t*	get_doc,	/*!< in: state */
+	doc_id_t	doc_id,		/*!< in: id of document to fetch */
+	dict_index_t*	index_to_use,	/*!< in: caller supplied FTS index,
+					or NULL */
+	ulint		option,		/*!< in: search option, if it is
+					greater than doc_id or equal */
+	fts_sql_callback
+			callback,	/*!< in: callback to read
+					records */
+	void*		arg)		/*!< in: callback arg */
+	MY_ATTRIBUTE((nonnull(6)));
+
+/*******************************************************************//**
+Callback function for fetch that stores the text of an FTS document,
+converting each column to UTF-16.
+@return always FALSE */
+ibool
+fts_query_expansion_fetch_doc(
+/*==========================*/
+	void*	row,		/*!< in: sel_node_t* */
+	void*	user_arg)	/*!< in: fts_doc_t* */
+	MY_ATTRIBUTE((nonnull));
+/********************************************************************
+Write out a single word's data as new entry/entries in the INDEX table.
+@return DB_SUCCESS if all OK. */
+dberr_t
+fts_write_node(
+/*===========*/
+	trx_t*		trx,		/*!< in: transaction */
+	que_t**		graph,		/*!< in: query graph */
+	fts_table_t*	fts_table,	/*!< in: the FTS aux index */
+	fts_string_t*	word,		/*!< in: word in UTF-8 */
+	fts_node_t*	node)		/*!< in: node columns */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Check if an FTS token is a stopword or less than fts_min_token_size
+or greater than fts_max_token_size.
+@param[in]	token		token string
+@param[in]	stopwords	stopwords rb tree
+@param[in]	cs		token charset
+@retval	true	if it is not stopword and length in range
+@retval	false	if it is stopword or length not in range */
+bool
+fts_check_token(
+	const fts_string_t*	token,
+	const ib_rbt_t*		stopwords,
+	const CHARSET_INFO*	cs);
+
+/******************************************************************//**
+Initialize a document.
*/
+void
+fts_doc_init(
+/*=========*/
+	fts_doc_t*	doc)	/*!< in: doc to initialize */
+	MY_ATTRIBUTE((nonnull));
+
+/******************************************************************//**
+Do a binary search for a doc id in the array
+@return +ve index if found, -ve index where it should be
+	inserted if not found */
+int
+fts_bsearch(
+/*========*/
+	doc_id_t*	array,	/*!< in: array to sort */
+	int		lower,	/*!< in: lower bound of array*/
+	int		upper,	/*!< in: upper bound of array*/
+	doc_id_t	doc_id)	/*!< in: doc id to lookup */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/******************************************************************//**
+Free document. */
+void
+fts_doc_free(
+/*=========*/
+	fts_doc_t*	doc)	/*!< in: document */
+	MY_ATTRIBUTE((nonnull));
+/******************************************************************//**
+Free a fts_word_t instance.*/
+void
+fts_word_free(
+/*==========*/
+	fts_word_t*	word)	/*!< in: instance to free.*/
+	MY_ATTRIBUTE((nonnull));
+/******************************************************************//**
+Read the rows from the FTS index
+@return DB_SUCCESS or error code */
+dberr_t
+fts_index_fetch_nodes(
+/*==================*/
+	trx_t*		trx,		/*!< in: transaction */
+	que_t**		graph,		/*!< in: prepared statement */
+	fts_table_t*	fts_table,	/*!< in: FTS aux table */
+	const fts_string_t*
+			word,		/*!< in: the word to fetch */
+	fts_fetch_t*	fetch)		/*!< in: fetch callback.*/
+	MY_ATTRIBUTE((nonnull));
+/******************************************************************//**
+Compare two fts_trx_table_t instances; we actually compare the
+table ids here.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_trx_table_cmp(
+/*==============*/
+	const void*	v1,	/*!< in: id1 */
+	const void*	v2)	/*!< in: id2 */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/******************************************************************//**
+Compare a table id with a trx_table_t table id.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_trx_table_id_cmp(
+/*=================*/
+	const void*	p1,	/*!< in: id1 */
+	const void*	p2)	/*!< in: id2 */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+#define fts_sql_commit(trx) trx_commit_for_mysql(trx)
+#define fts_sql_rollback(trx) (trx)->rollback()
+/******************************************************************//**
+Get value from config table. The caller must ensure that enough
+space is allocated for value to hold the column contents.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_config_get_value(
+/*=================*/
+	trx_t*		trx,		/* transaction */
+	fts_table_t*	fts_table,	/*!< in: the indexed FTS table */
+	const char*	name,		/*!< in: get config value for
+					this parameter name */
+	fts_string_t*	value)		/*!< out: value read from
+					config table */
+	MY_ATTRIBUTE((nonnull));
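/* Editorial sketch: reading one entry from the CONFIG auxiliary table
via fts_config_get_value() above.  The caller supplies the output
buffer, as the comment requires; the key name and buffer size are
illustrative. */
dberr_t example_read_config(trx_t* trx, fts_table_t* fts_table)
{
	byte		buf[1024];	/* illustrative buffer size */
	fts_string_t	value;

	value.f_str = buf;
	value.f_len = sizeof buf;

	return(fts_config_get_value(trx, fts_table, "synced_doc_id",
				    &value));
}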
+/******************************************************************//**
+Get value specific to an FTS index from the config table. The caller
+must ensure that enough space is allocated for value to hold the
+column contents.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_config_get_index_value(
+/*=======================*/
+	trx_t*		trx,		/*!< transaction */
+	dict_index_t*	index,		/*!< in: index */
+	const char*	param,		/*!< in: get config value for
+					this parameter name */
+	fts_string_t*	value)		/*!< out: value read from
+					config table */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/******************************************************************//**
+Set the value in the config table for name.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_config_set_value(
+/*=================*/
+	trx_t*		trx,		/*!< transaction */
+	fts_table_t*	fts_table,	/*!< in: the indexed FTS table */
+	const char*	name,		/*!< in: set config value for
+					this parameter name */
+	const fts_string_t*
+			value)		/*!< in: value to update */
+	MY_ATTRIBUTE((nonnull));
+/****************************************************************//**
+Set an ulint value in the config table.
+@return DB_SUCCESS if all OK else error code */
+dberr_t
+fts_config_set_ulint(
+/*=================*/
+	trx_t*		trx,		/*!< in: transaction */
+	fts_table_t*	fts_table,	/*!< in: the indexed FTS table */
+	const char*	name,		/*!< in: param name */
+	ulint		int_value)	/*!< in: value */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/******************************************************************//**
+Set the value specific to an FTS index in the config table.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_config_set_index_value(
+/*=======================*/
+	trx_t*		trx,		/*!< transaction */
+	dict_index_t*	index,		/*!< in: index */
+	const char*	param,		/*!< in: set config value for
+					this parameter name */
+	fts_string_t*	value)		/*!< in: value to write to the
+					config table */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+#ifdef FTS_OPTIMIZE_DEBUG
+/******************************************************************//**
+Get an ulint value from the config table.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_config_get_index_ulint(
+/*=======================*/
+	trx_t*		trx,		/*!< in: transaction */
+	dict_index_t*	index,		/*!< in: FTS index */
+	const char*	name,		/*!< in: param name */
+	ulint*		int_value)	/*!< out: value */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+#endif /* FTS_OPTIMIZE_DEBUG */
+
+/******************************************************************//**
+Set an ulint value specific to an FTS index in the config table.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_config_set_index_ulint(
+/*=======================*/
+	trx_t*		trx,		/*!< in: transaction */
+	dict_index_t*	index,		/*!< in: FTS index */
+	const char*	name,		/*!< in: param name */
+	ulint		int_value)	/*!< in: value */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/******************************************************************//**
+Get an ulint value from the config table.
+@return DB_SUCCESS or error code */
+dberr_t
+fts_config_get_ulint(
+/*=================*/
+	trx_t*		trx,		/*!< in: transaction */
+	fts_table_t*	fts_table,	/*!< in: the indexed FTS table */
+	const char*	name,		/*!< in: param name */
+	ulint*		int_value)	/*!< out: value */
+	MY_ATTRIBUTE((nonnull));
+/******************************************************************//**
+Search cache for word.
+@return the word node vector if found else NULL */ +const ib_vector_t* +fts_cache_find_word( +/*================*/ + const fts_index_cache_t* + index_cache, /*!< in: cache to search */ + const fts_string_t* + text) /*!< in: word to search for */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/******************************************************************//** +Append deleted doc ids to vector and sort the vector. */ +void +fts_cache_append_deleted_doc_ids( +/*=============================*/ + fts_cache_t* cache, /*!< in: cache to use */ + ib_vector_t* vector); /*!< in: append to this vector */ +/******************************************************************//** +Search the index specific cache for a particular FTS index. +@return the index specific cache else NULL */ +fts_index_cache_t* +fts_find_index_cache( +/*================*/ + const fts_cache_t* + cache, /*!< in: cache to search */ + const dict_index_t* + index) /*!< in: index to search for */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/******************************************************************//** +Write the table id to the given buffer (including final NUL). Buffer must be +at least FTS_AUX_MIN_TABLE_ID_LENGTH bytes long. +@return number of bytes written */ +UNIV_INLINE +int +fts_write_object_id( +/*================*/ + ib_id_t id, /*!< in: a table/index id */ + char* str); /*!< in: buffer to write the id to */ +/******************************************************************//** +Read the table id from the string generated by fts_write_object_id(). +@return TRUE if parse successful */ +UNIV_INLINE +ibool +fts_read_object_id( +/*===============*/ + ib_id_t* id, /*!< out: a table id */ + const char* str) /*!< in: buffer to read from */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/******************************************************************//** +Get the table id. +@return number of bytes written */ +int +fts_get_table_id( +/*=============*/ + const fts_table_t* + fts_table, /*!< in: FTS Auxiliary table */ + char* table_id) /*!< out: table id, must be at least + FTS_AUX_MIN_TABLE_ID_LENGTH bytes + long */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/******************************************************************//** +Add node positions. */ +void +fts_cache_node_add_positions( +/*=========================*/ + fts_cache_t* cache, /*!< in: cache */ + fts_node_t* node, /*!< in: word node */ + doc_id_t doc_id, /*!< in: doc id */ + ib_vector_t* positions) /*!< in: fts_token_t::positions */ + MY_ATTRIBUTE((nonnull(2,4))); + +/******************************************************************//** +Create the config table name for retrieving index specific value. +@return index config parameter name */ +char* +fts_config_create_index_param_name( +/*===============================*/ + const char* param, /*!< in: base name of param */ + const dict_index_t* index) /*!< in: index for config */ + MY_ATTRIBUTE((nonnull, malloc, warn_unused_result)); + +#include "fts0priv.inl" + +#endif /* INNOBASE_FTS0PRIV_H */ diff --git a/storage/innobase/include/fts0priv.inl b/storage/innobase/include/fts0priv.inl new file mode 100644 index 00000000..da14cfcb --- /dev/null +++ b/storage/innobase/include/fts0priv.inl @@ -0,0 +1,121 @@ +/***************************************************************************** + +Copyright (c) 2011, 2016, Oracle and/or its affiliates. All Rights Reserved. 
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0priv.inl
+Full text search internal header file
+
+Created 2011/11/12 Sunny Bains
+***********************************************************************/
+
+/******************************************************************//**
+Write the table id to the given buffer (including final NUL). Buffer must be
+at least FTS_AUX_MIN_TABLE_ID_LENGTH bytes long.
+@return number of bytes written */
+UNIV_INLINE
+int
+fts_write_object_id(
+/*================*/
+	ib_id_t		id,	/* in: a table/index id */
+	char*		str)	/* in: buffer to write the id to */
+{
+
+#ifdef _WIN32
+
+	DBUG_EXECUTE_IF("innodb_test_wrong_non_windows_fts_aux_table_name",
+			return(sprintf(str, UINT64PFx, id)););
+
+	/* Use this to construct old(5.6.14 and 5.7.3) windows
+	ambiguous aux table names */
+	DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name",
+			return(sprintf(str, "%016llu", (ulonglong) id)););
+
+#else /* _WIN32 */
+
+	/* Use this to construct old(5.6.14 and 5.7.3) windows
+	ambiguous aux table names */
+	DBUG_EXECUTE_IF("innodb_test_wrong_windows_fts_aux_table_name",
+			return(sprintf(str, "%016llu", (ulonglong) id)););
+
+	DBUG_EXECUTE_IF("innodb_test_wrong_fts_aux_table_name",
+			return(sprintf(str, "%016llx", (ulonglong) id)););
+
+#endif /* _WIN32 */
+
+	return(sprintf(str, "%016llx", (ulonglong) id));
+}
+
+/******************************************************************//**
+Read the table id from the string generated by fts_write_object_id().
+@return TRUE if parse successful */
+UNIV_INLINE
+ibool
+fts_read_object_id(
+/*===============*/
+	ib_id_t*	id,	/* out: an id */
+	const char*	str)	/* in: buffer to read from */
+{
+	/* NOTE: this func doesn't care about whether current table
+	is set with HEX_NAME, the user of the id read here will check
+	if the id is HEX or DEC and do the right thing with it. */
+	return(sscanf(str, UINT64PFx, id) == 1);
+}
+
+/******************************************************************//**
+Compare two fts_trx_table_t instances.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_trx_table_cmp(
+/*==============*/
+	const void*	p1,	/*!< in: id1 */
+	const void*	p2)	/*!< in: id2 */
+{
+	const dict_table_t*	table1
+		= (*static_cast<const fts_trx_table_t* const*>(p1))->table;
+
+	const dict_table_t*	table2
+		= (*static_cast<const fts_trx_table_t* const*>(p2))->table;
+
+	return((table1->id > table2->id)
+	       ? 1
+	       : (table1->id == table2->id)
+	       ? 0
+	       : -1);
+}
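/* Editorial sketch: round-tripping an id through the two helpers above.
The id value is arbitrary; per the contract above, the buffer must be at
least FTS_AUX_MIN_TABLE_ID_LENGTH bytes. */
void example_object_id_roundtrip()
{
	char	buf[FTS_AUX_MIN_TABLE_ID_LENGTH];
	ib_id_t	id = 0x1234;
	ib_id_t	parsed;

	int	len = fts_write_object_id(id, buf);	/* hex, NUL-terminated */

	if (len > 0 && fts_read_object_id(&parsed, buf)) {
		ut_ad(parsed == id);	/* parses back to the same id */
	}
}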
+/******************************************************************//**
+Compare a table id with a fts_trx_table_t table id.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_trx_table_id_cmp(
+/*=================*/
+	const void*	p1,	/*!< in: id1 */
+	const void*	p2)	/*!< in: id2 */
+{
+	const uintmax_t*	table_id = static_cast<const uintmax_t*>(p1);
+	const dict_table_t*	table2
+		= (*static_cast<const fts_trx_table_t* const*>(p2))->table;
+
+	return((*table_id > table2->id)
+	       ? 1
+	       : (*table_id == table2->id)
+	       ? 0
+	       : -1);
+}
diff --git a/storage/innobase/include/fts0tlex.h b/storage/innobase/include/fts0tlex.h
new file mode 100644
index 00000000..89655ca1
--- /dev/null
+++ b/storage/innobase/include/fts0tlex.h
@@ -0,0 +1,702 @@
+#ifndef fts0tHEADER_H
+#define fts0tHEADER_H 1
+#define fts0tIN_HEADER 1
+
+#line 6 "../include/fts0tlex.h"
+
+#line 8 "../include/fts0tlex.h"
+
+#define YY_INT_ALIGNED short int
+
+/* A lexical scanner generated by flex */
+
+#define FLEX_SCANNER
+#define YY_FLEX_MAJOR_VERSION 2
+#define YY_FLEX_MINOR_VERSION 6
+#define YY_FLEX_SUBMINOR_VERSION 4
+#if YY_FLEX_SUBMINOR_VERSION > 0
+#define FLEX_BETA
+#endif
+
+#ifdef yy_create_buffer
+#define fts0t_create_buffer_ALREADY_DEFINED
+#else
+#define yy_create_buffer fts0t_create_buffer
+#endif
+
+#ifdef yy_delete_buffer
+#define fts0t_delete_buffer_ALREADY_DEFINED
+#else
+#define yy_delete_buffer fts0t_delete_buffer
+#endif
+
+#ifdef yy_scan_buffer
+#define fts0t_scan_buffer_ALREADY_DEFINED
+#else
+#define yy_scan_buffer fts0t_scan_buffer
+#endif
+
+#ifdef yy_scan_string
+#define fts0t_scan_string_ALREADY_DEFINED
+#else
+#define yy_scan_string fts0t_scan_string
+#endif
+
+#ifdef yy_scan_bytes
+#define fts0t_scan_bytes_ALREADY_DEFINED
+#else
+#define yy_scan_bytes fts0t_scan_bytes
+#endif
+
+#ifdef yy_init_buffer
+#define fts0t_init_buffer_ALREADY_DEFINED
+#else
+#define yy_init_buffer fts0t_init_buffer
+#endif
+
+#ifdef yy_flush_buffer
+#define fts0t_flush_buffer_ALREADY_DEFINED
+#else
+#define yy_flush_buffer fts0t_flush_buffer
+#endif
+
+#ifdef yy_load_buffer_state
+#define fts0t_load_buffer_state_ALREADY_DEFINED
+#else
+#define yy_load_buffer_state fts0t_load_buffer_state
+#endif
+
+#ifdef yy_switch_to_buffer
+#define fts0t_switch_to_buffer_ALREADY_DEFINED
+#else
+#define yy_switch_to_buffer fts0t_switch_to_buffer
+#endif
+
+#ifdef yypush_buffer_state
+#define fts0tpush_buffer_state_ALREADY_DEFINED
+#else
+#define yypush_buffer_state fts0tpush_buffer_state
+#endif
+
+#ifdef yypop_buffer_state
+#define fts0tpop_buffer_state_ALREADY_DEFINED
+#else
+#define yypop_buffer_state fts0tpop_buffer_state
+#endif
+
+#ifdef yyensure_buffer_stack
+#define fts0tensure_buffer_stack_ALREADY_DEFINED
+#else
+#define yyensure_buffer_stack fts0tensure_buffer_stack
+#endif
+
+#ifdef yylex
+#define fts0tlex_ALREADY_DEFINED
+#else
+#define yylex fts0tlex
+#endif
+
+#ifdef yyrestart
+#define fts0trestart_ALREADY_DEFINED
+#else
+#define yyrestart fts0trestart
+#endif
+
+#ifdef yylex_init
+#define fts0tlex_init_ALREADY_DEFINED
+#else
+#define yylex_init fts0tlex_init
+#endif
+
+#ifdef yylex_init_extra
+#define fts0tlex_init_extra_ALREADY_DEFINED
+#else
+#define yylex_init_extra fts0tlex_init_extra
+#endif
+
+#ifdef yylex_destroy
+#define fts0tlex_destroy_ALREADY_DEFINED
+#else
+#define yylex_destroy fts0tlex_destroy
+#endif
+
+#ifdef yyget_debug
+#define fts0tget_debug_ALREADY_DEFINED
+#else
+#define yyget_debug fts0tget_debug
+#endif
+
+#ifdef yyset_debug
+#define fts0tset_debug_ALREADY_DEFINED
+#else
+#define yyset_debug fts0tset_debug
+#endif
+
+#ifdef yyget_extra
+#define fts0tget_extra_ALREADY_DEFINED
+#else
+#define yyget_extra fts0tget_extra
+#endif
+
+#ifdef yyset_extra
+#define fts0tset_extra_ALREADY_DEFINED
+#else
+#define yyset_extra fts0tset_extra
+#endif
+
+#ifdef yyget_in
+#define fts0tget_in_ALREADY_DEFINED
+#else
+#define yyget_in fts0tget_in
+#endif
+
+#ifdef yyset_in
+#define fts0tset_in_ALREADY_DEFINED
+#else
+#define yyset_in fts0tset_in
+#endif
+
+#ifdef yyget_out
+#define fts0tget_out_ALREADY_DEFINED
+#else
+#define yyget_out fts0tget_out
+#endif
+
+#ifdef yyset_out
+#define fts0tset_out_ALREADY_DEFINED
+#else
+#define yyset_out fts0tset_out
+#endif
+
+#ifdef yyget_leng
+#define fts0tget_leng_ALREADY_DEFINED
+#else
+#define yyget_leng fts0tget_leng
+#endif
+
+#ifdef yyget_text
+#define fts0tget_text_ALREADY_DEFINED
+#else
+#define yyget_text fts0tget_text
+#endif
+
+#ifdef yyget_lineno
+#define fts0tget_lineno_ALREADY_DEFINED
+#else
+#define yyget_lineno fts0tget_lineno
+#endif
+
+#ifdef yyset_lineno
+#define fts0tset_lineno_ALREADY_DEFINED
+#else
+#define yyset_lineno fts0tset_lineno
+#endif
+
+#ifdef yyget_column
+#define fts0tget_column_ALREADY_DEFINED
+#else
+#define yyget_column fts0tget_column
+#endif
+
+#ifdef yyset_column
+#define fts0tset_column_ALREADY_DEFINED
+#else
+#define yyset_column fts0tset_column
+#endif
+
+#ifdef yywrap
+#define fts0twrap_ALREADY_DEFINED
+#else
+#define yywrap fts0twrap
+#endif
+
+#ifdef yyalloc
+#define fts0talloc_ALREADY_DEFINED
+#else
+#define yyalloc fts0talloc
+#endif
+
+#ifdef yyrealloc
+#define fts0trealloc_ALREADY_DEFINED
+#else
+#define yyrealloc fts0trealloc
+#endif
+
+#ifdef yyfree
+#define fts0tfree_ALREADY_DEFINED
+#else
+#define yyfree fts0tfree
+#endif
+
+/* First, we deal with platform-specific or compiler-specific issues. */
+
+/* begin standard C headers. */
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdlib.h>
+
+/* end standard C headers. */
+
+/* flex integer type definitions */
+
+#ifndef FLEXINT_H
+#define FLEXINT_H
+
+/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */
+
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+
+/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h,
+ * if you want the limit (max/min) macros for int types.
+ */
+#ifndef __STDC_LIMIT_MACROS
+#define __STDC_LIMIT_MACROS 1
+#endif
+
+#include <inttypes.h>
+typedef int8_t flex_int8_t;
+typedef uint8_t flex_uint8_t;
+typedef int16_t flex_int16_t;
+typedef uint16_t flex_uint16_t;
+typedef int32_t flex_int32_t;
+typedef uint32_t flex_uint32_t;
+#else
+typedef signed char flex_int8_t;
+typedef short int flex_int16_t;
+typedef int flex_int32_t;
+typedef unsigned char flex_uint8_t;
+typedef unsigned short int flex_uint16_t;
+typedef unsigned int flex_uint32_t;
+
+/* Limits of integral types. */
+#ifndef INT8_MIN
+#define INT8_MIN (-128)
+#endif
+#ifndef INT16_MIN
+#define INT16_MIN (-32767-1)
+#endif
+#ifndef INT32_MIN
+#define INT32_MIN (-2147483647-1)
+#endif
+#ifndef INT8_MAX
+#define INT8_MAX (127)
+#endif
+#ifndef INT16_MAX
+#define INT16_MAX (32767)
+#endif
+#ifndef INT32_MAX
+#define INT32_MAX (2147483647)
+#endif
+#ifndef UINT8_MAX
+#define UINT8_MAX (255U)
+#endif
+#ifndef UINT16_MAX
+#define UINT16_MAX (65535U)
+#endif
+#ifndef UINT32_MAX
+#define UINT32_MAX (4294967295U)
+#endif
+
+#ifndef SIZE_MAX
+#define SIZE_MAX (~(size_t)0)
+#endif
+
+#endif /* ! C99 */
+
+#endif /* ! FLEXINT_H */
+
+/* begin standard C++ headers. */
+
+/* TODO: this is always defined, so inline it */
+#define yyconst const
+
+#if defined(__GNUC__) && __GNUC__ >= 3
+#define yynoreturn __attribute__((__noreturn__))
+#else
+#define yynoreturn
+#endif
+
+/* An opaque pointer.
*/ +#ifndef YY_TYPEDEF_YY_SCANNER_T +#define YY_TYPEDEF_YY_SCANNER_T +typedef void* yyscan_t; +#endif + +/* For convenience, these vars (plus the bison vars far below) + are macros in the reentrant scanner. */ +#define yyin yyg->yyin_r +#define yyout yyg->yyout_r +#define yyextra yyg->yyextra_r +#define yyleng yyg->yyleng_r +#define yytext yyg->yytext_r +#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno) +#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column) +#define yy_flex_debug yyg->yy_flex_debug_r + +/* Size of default input buffer. */ +#ifndef YY_BUF_SIZE +#ifdef __ia64__ +/* On IA-64, the buffer size is 16k, not 8k. + * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case. + * Ditto for the __ia64__ case accordingly. + */ +#define YY_BUF_SIZE 32768 +#else +#define YY_BUF_SIZE 16384 +#endif /* __ia64__ */ +#endif + +#ifndef YY_TYPEDEF_YY_BUFFER_STATE +#define YY_TYPEDEF_YY_BUFFER_STATE +typedef struct yy_buffer_state *YY_BUFFER_STATE; +#endif + +#ifndef YY_TYPEDEF_YY_SIZE_T +#define YY_TYPEDEF_YY_SIZE_T +typedef size_t yy_size_t; +#endif + +#ifndef YY_STRUCT_YY_BUFFER_STATE +#define YY_STRUCT_YY_BUFFER_STATE +struct yy_buffer_state + { + FILE *yy_input_file; + + char *yy_ch_buf; /* input buffer */ + char *yy_buf_pos; /* current position in input buffer */ + + /* Size of input buffer in bytes, not including room for EOB + * characters. + */ + int yy_buf_size; + + /* Number of characters read into yy_ch_buf, not including EOB + * characters. + */ + int yy_n_chars; + + /* Whether we "own" the buffer - i.e., we know we created it, + * and can realloc() it to grow it, and should free() it to + * delete it. + */ + int yy_is_our_buffer; + + /* Whether this is an "interactive" input source; if so, and + * if we're using stdio for input, then we want to use getc() + * instead of fread(), to make sure we stop fetching input after + * each newline. + */ + int yy_is_interactive; + + /* Whether we're considered to be at the beginning of a line. + * If so, '^' rules will be active on the next match, otherwise + * not. + */ + int yy_at_bol; + + int yy_bs_lineno; /**< The line count. */ + int yy_bs_column; /**< The column count. */ + + /* Whether to try to fill the input buffer when we reach the + * end of it. + */ + int yy_fill_buffer; + + int yy_buffer_status; + + }; +#endif /* !YY_STRUCT_YY_BUFFER_STATE */ + +void yyrestart ( FILE *input_file , yyscan_t yyscanner ); +void yy_switch_to_buffer ( YY_BUFFER_STATE new_buffer , yyscan_t yyscanner ); +YY_BUFFER_STATE yy_create_buffer ( FILE *file, int size , yyscan_t yyscanner ); +void yy_delete_buffer ( YY_BUFFER_STATE b , yyscan_t yyscanner ); +void yy_flush_buffer ( YY_BUFFER_STATE b , yyscan_t yyscanner ); +void yypush_buffer_state ( YY_BUFFER_STATE new_buffer , yyscan_t yyscanner ); +void yypop_buffer_state ( yyscan_t yyscanner ); + +YY_BUFFER_STATE yy_scan_buffer ( char *base, yy_size_t size , yyscan_t yyscanner ); +YY_BUFFER_STATE yy_scan_string ( const char *yy_str , yyscan_t yyscanner ); +YY_BUFFER_STATE yy_scan_bytes ( const char *bytes, int len , yyscan_t yyscanner ); + +void *yyalloc ( yy_size_t , yyscan_t yyscanner ); +void *yyrealloc ( void *, yy_size_t , yyscan_t yyscanner ); +void yyfree ( void * , yyscan_t yyscanner ); + +/* Begin user sect3 */ + +#define fts0twrap(yyscanner) (/*CONSTCOND*/1) +#define YY_SKIP_YYWRAP + +#define yytext_ptr yytext_r + +#ifdef YY_HEADER_EXPORT_START_CONDITIONS +#define INITIAL 0 + +#endif + +#ifndef YY_NO_UNISTD_H +/* Special case for "unistd.h", since it is non-ANSI. 
We include it way
+ * down here because we want the user's section 1 to have been scanned first.
+ * The user has a chance to override it with an option.
+ */
+#include <unistd.h>
+#endif
+
+#ifndef YY_EXTRA_TYPE
+#define YY_EXTRA_TYPE void *
+#endif
+
+int yylex_init (yyscan_t* scanner);
+
+int yylex_init_extra ( YY_EXTRA_TYPE user_defined, yyscan_t* scanner);
+
+/* Accessor methods to globals.
+   These are made visible to non-reentrant scanners for convenience. */
+
+int yylex_destroy ( yyscan_t yyscanner );
+
+int yyget_debug ( yyscan_t yyscanner );
+
+void yyset_debug ( int debug_flag , yyscan_t yyscanner );
+
+YY_EXTRA_TYPE yyget_extra ( yyscan_t yyscanner );
+
+void yyset_extra ( YY_EXTRA_TYPE user_defined , yyscan_t yyscanner );
+
+FILE *yyget_in ( yyscan_t yyscanner );
+
+void yyset_in ( FILE * _in_str , yyscan_t yyscanner );
+
+FILE *yyget_out ( yyscan_t yyscanner );
+
+void yyset_out ( FILE * _out_str , yyscan_t yyscanner );
+
+			int yyget_leng ( yyscan_t yyscanner );
+
+char *yyget_text ( yyscan_t yyscanner );
+
+int yyget_lineno ( yyscan_t yyscanner );
+
+void yyset_lineno ( int _line_number , yyscan_t yyscanner );
+
+int yyget_column ( yyscan_t yyscanner );
+
+void yyset_column ( int _column_no , yyscan_t yyscanner );
+
+/* Macros after this point can all be overridden by user definitions in
+ * section 1.
+ */
+
+#ifndef YY_SKIP_YYWRAP
+#ifdef __cplusplus
+extern "C" int yywrap ( yyscan_t yyscanner );
+#else
+extern int yywrap ( yyscan_t yyscanner );
+#endif
+#endif
+
+#ifndef yytext_ptr
+static void yy_flex_strncpy ( char *, const char *, int , yyscan_t yyscanner);
+#endif
+
+#ifdef YY_NEED_STRLEN
+static int yy_flex_strlen ( const char * , yyscan_t yyscanner);
+#endif
+
+#ifndef YY_NO_INPUT
+
+#endif
+
+/* Amount of stuff to slurp up with each read. */
+#ifndef YY_READ_BUF_SIZE
+#ifdef __ia64__
+/* On IA-64, the buffer size is 16k, not 8k */
+#define YY_READ_BUF_SIZE 16384
+#else
+#define YY_READ_BUF_SIZE 8192
+#endif /* __ia64__ */
+#endif
+
+/* Number of entries by which start-condition stack grows. */
+#ifndef YY_START_STACK_INCR
+#define YY_START_STACK_INCR 25
+#endif
+
+/* Default declaration of generated scanner - a define so the user can
+ * easily add parameters.
+ */ +#ifndef YY_DECL +#define YY_DECL_IS_OURS 1 + +extern int yylex (yyscan_t yyscanner); + +#define YY_DECL int yylex (yyscan_t yyscanner) +#endif /* !YY_DECL */ + +/* yy_get_previous_state - get the state just before the EOB char was reached */ + +#undef YY_NEW_FILE +#undef YY_FLUSH_BUFFER +#undef yy_set_bol +#undef yy_new_buffer +#undef yy_set_interactive +#undef YY_DO_BEFORE_ACTION + +#ifdef YY_DECL_IS_OURS +#undef YY_DECL_IS_OURS +#undef YY_DECL +#endif + +#ifndef fts0t_create_buffer_ALREADY_DEFINED +#undef yy_create_buffer +#endif +#ifndef fts0t_delete_buffer_ALREADY_DEFINED +#undef yy_delete_buffer +#endif +#ifndef fts0t_scan_buffer_ALREADY_DEFINED +#undef yy_scan_buffer +#endif +#ifndef fts0t_scan_string_ALREADY_DEFINED +#undef yy_scan_string +#endif +#ifndef fts0t_scan_bytes_ALREADY_DEFINED +#undef yy_scan_bytes +#endif +#ifndef fts0t_init_buffer_ALREADY_DEFINED +#undef yy_init_buffer +#endif +#ifndef fts0t_flush_buffer_ALREADY_DEFINED +#undef yy_flush_buffer +#endif +#ifndef fts0t_load_buffer_state_ALREADY_DEFINED +#undef yy_load_buffer_state +#endif +#ifndef fts0t_switch_to_buffer_ALREADY_DEFINED +#undef yy_switch_to_buffer +#endif +#ifndef fts0tpush_buffer_state_ALREADY_DEFINED +#undef yypush_buffer_state +#endif +#ifndef fts0tpop_buffer_state_ALREADY_DEFINED +#undef yypop_buffer_state +#endif +#ifndef fts0tensure_buffer_stack_ALREADY_DEFINED +#undef yyensure_buffer_stack +#endif +#ifndef fts0tlex_ALREADY_DEFINED +#undef yylex +#endif +#ifndef fts0trestart_ALREADY_DEFINED +#undef yyrestart +#endif +#ifndef fts0tlex_init_ALREADY_DEFINED +#undef yylex_init +#endif +#ifndef fts0tlex_init_extra_ALREADY_DEFINED +#undef yylex_init_extra +#endif +#ifndef fts0tlex_destroy_ALREADY_DEFINED +#undef yylex_destroy +#endif +#ifndef fts0tget_debug_ALREADY_DEFINED +#undef yyget_debug +#endif +#ifndef fts0tset_debug_ALREADY_DEFINED +#undef yyset_debug +#endif +#ifndef fts0tget_extra_ALREADY_DEFINED +#undef yyget_extra +#endif +#ifndef fts0tset_extra_ALREADY_DEFINED +#undef yyset_extra +#endif +#ifndef fts0tget_in_ALREADY_DEFINED +#undef yyget_in +#endif +#ifndef fts0tset_in_ALREADY_DEFINED +#undef yyset_in +#endif +#ifndef fts0tget_out_ALREADY_DEFINED +#undef yyget_out +#endif +#ifndef fts0tset_out_ALREADY_DEFINED +#undef yyset_out +#endif +#ifndef fts0tget_leng_ALREADY_DEFINED +#undef yyget_leng +#endif +#ifndef fts0tget_text_ALREADY_DEFINED +#undef yyget_text +#endif +#ifndef fts0tget_lineno_ALREADY_DEFINED +#undef yyget_lineno +#endif +#ifndef fts0tset_lineno_ALREADY_DEFINED +#undef yyset_lineno +#endif +#ifndef fts0tget_column_ALREADY_DEFINED +#undef yyget_column +#endif +#ifndef fts0tset_column_ALREADY_DEFINED +#undef yyset_column +#endif +#ifndef fts0twrap_ALREADY_DEFINED +#undef yywrap +#endif +#ifndef fts0tget_lval_ALREADY_DEFINED +#undef yyget_lval +#endif +#ifndef fts0tset_lval_ALREADY_DEFINED +#undef yyset_lval +#endif +#ifndef fts0tget_lloc_ALREADY_DEFINED +#undef yyget_lloc +#endif +#ifndef fts0tset_lloc_ALREADY_DEFINED +#undef yyset_lloc +#endif +#ifndef fts0talloc_ALREADY_DEFINED +#undef yyalloc +#endif +#ifndef fts0trealloc_ALREADY_DEFINED +#undef yyrealloc +#endif +#ifndef fts0tfree_ALREADY_DEFINED +#undef yyfree +#endif +#ifndef fts0ttext_ALREADY_DEFINED +#undef yytext +#endif +#ifndef fts0tleng_ALREADY_DEFINED +#undef yyleng +#endif +#ifndef fts0tin_ALREADY_DEFINED +#undef yyin +#endif +#ifndef fts0tout_ALREADY_DEFINED +#undef yyout +#endif +#ifndef fts0t_flex_debug_ALREADY_DEFINED +#undef yy_flex_debug +#endif +#ifndef fts0tlineno_ALREADY_DEFINED +#undef yylineno +#endif 
+#ifndef fts0ttables_fload_ALREADY_DEFINED
+#undef yytables_fload
+#endif
+#ifndef fts0ttables_destroy_ALREADY_DEFINED
+#undef yytables_destroy
+#endif
+#ifndef fts0tTABLES_NAME_ALREADY_DEFINED
+#undef yyTABLES_NAME
+#endif
+
+#line 69 "fts0tlex.l"
+
+
+#line 701 "../include/fts0tlex.h"
+#undef fts0tIN_HEADER
+#endif /* fts0tHEADER_H */
diff --git a/storage/innobase/include/fts0tokenize.h b/storage/innobase/include/fts0tokenize.h
new file mode 100644
index 00000000..1cddaf5b
--- /dev/null
+++ b/storage/innobase/include/fts0tokenize.h
@@ -0,0 +1,189 @@
+/*****************************************************************************
+
+Copyright (c) 2014, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0tokenize.h
+Full Text Search plugin tokenizer, based on the MyISAM tokenizer
+
+Created 2014/11/17 Shaohua Wang
+***********************************************************************/
+
+#include "ft_global.h"
+#include "mysql/plugin_ftparser.h"
+#include "m_ctype.h"
+
+/* Macros and structs below are from ftdefs.h in MyISAM */
+/** Check whether a char is a true word character */
+#define true_word_char(c, ch) ((c) & (_MY_U | _MY_L | _MY_NMR) || (ch) == '_')
+
+/** Check whether a char is a misc word character */
+#define misc_word_char(X)	0
+
+/** Boolean search syntax */
+static const char*	fts_boolean_syntax = DEFAULT_FTB_SYNTAX;
+
+#define FTB_YES		(fts_boolean_syntax[0])
+#define FTB_EGAL	(fts_boolean_syntax[1])
+#define FTB_NO		(fts_boolean_syntax[2])
+#define FTB_INC		(fts_boolean_syntax[3])
+#define FTB_DEC		(fts_boolean_syntax[4])
+#define FTB_LBR		(fts_boolean_syntax[5])
+#define FTB_RBR		(fts_boolean_syntax[6])
+#define FTB_NEG		(fts_boolean_syntax[7])
+#define FTB_TRUNC	(fts_boolean_syntax[8])
+#define FTB_LQUOT	(fts_boolean_syntax[10])
+#define FTB_RQUOT	(fts_boolean_syntax[11])
+
+/** FTS query token */
+typedef struct st_ft_word {
+	uchar*	pos;	/*!< word start pointer */
+	uint	len;	/*!< word len */
+	double	weight;	/*!< word weight, unused in innodb */
+} FT_WORD;
+
+/** Tokenizer for ngram, referring to ft_get_word(ft_parser.c) in MyISAM.
+Differences: a. code format changed; b. stopword processing removed.
+@param[in]	cs	charset
+@param[in,out]	start	doc start pointer
+@param[in,out]	end	doc end pointer
+@param[in,out]	word	token
+@param[in,out]	info	token info
+@retval	0	eof
+@retval	1	word found
+@retval	2	left bracket
+@retval	3	right bracket
+@retval	4	stopword found */
+inline
+uchar
+fts_get_word(
+	const CHARSET_INFO*	cs,
+	uchar**			start,
+	uchar*			end,
+	FT_WORD*		word,
+	MYSQL_FTPARSER_BOOLEAN_INFO*
+				info)
+{
+	uchar*	doc = *start;
+	int	ctype;
+	uint	mwc;
+	uint	length;
+	int	mbl;
+
+	info->yesno = (FTB_YES == ' ') ? 1 : (info->quot != 0);
+	info->weight_adjust = info->wasign = 0;
+	info->type = FT_TOKEN_EOF;
+
+	while (doc < end) {
+		for (; doc < end;
+		     doc += (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) {
+			mbl = cs->ctype(&ctype, doc, end);
+
+			if (true_word_char(ctype, *doc)) {
+				break;
+			}
+
+			if (*doc == FTB_RQUOT && info->quot) {
+				*start = doc + 1;
+				info->type = FT_TOKEN_RIGHT_PAREN;
+
+				return(info->type);
+			}
+
+			if (!info->quot) {
+				if (*doc == FTB_LBR
+				    || *doc == FTB_RBR
+				    || *doc == FTB_LQUOT) {
+					/* param->prev=' '; */
+					*start = doc + 1;
+					if (*doc == FTB_LQUOT) {
+						info->quot = (char*)1;
+					}
+
+					info->type = (*doc == FTB_RBR ?
+						      FT_TOKEN_RIGHT_PAREN :
+						      FT_TOKEN_LEFT_PAREN);
+
+					return(info->type);
+				}
+
+				if (info->prev == ' ') {
+					if (*doc == FTB_YES) {
+						info->yesno = +1;
+						continue;
+					} else if (*doc == FTB_EGAL) {
+						info->yesno = 0;
+						continue;
+					} else if (*doc == FTB_NO) {
+						info->yesno = -1;
+						continue;
+					} else if (*doc == FTB_INC) {
+						info->weight_adjust++;
+						continue;
+					} else if (*doc == FTB_DEC) {
+						info->weight_adjust--;
+						continue;
+					} else if (*doc == FTB_NEG) {
+						info->wasign = !info->wasign;
+						continue;
+					}
+				}
+			}
+
+			info->prev = char(*doc);
+			info->yesno = (FTB_YES == ' ') ? 1 : (info->quot != 0);
+			info->weight_adjust = info->wasign = 0;
+		}
+
+		mwc = length = 0;
+		for (word->pos = doc;
+		     doc < end;
+		     length++, doc += (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) {
+			mbl = cs->ctype(&ctype, doc, end);
+
+			if (true_word_char(ctype, *doc)) {
+				mwc = 0;
+			} else if (!misc_word_char(*doc) || mwc) {
+				break;
+			} else {
+				mwc++;
+			}
+		}
+
+		/* Be sure *prev is true_word_char. */
+		info->prev = 'A';
+		word->len = (uint)(doc - word->pos) - mwc;
+
+		if ((info->trunc = (doc < end && *doc == FTB_TRUNC))) {
+			doc++;
+		}
+
+		/* We don't check stopword here. */
+		*start = doc;
+		info->type = FT_TOKEN_WORD;
+
+		return(info->type);
+	}
+
+	if (info->quot) {
+		*start = doc;
+		info->type = FT_TOKEN_RIGHT_PAREN;
+	}
+
+	return(info->type);
+}
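A note on usage: fts_get_word() is a pull tokenizer, and each call consumes one token (or one run of boolean operators) from the document, reporting its kind through info->type. A minimal sketch of a driver loop follows; the charset handle and the zero-initialized parser info are assumptions for illustration, not code from this patch:

	/* Hypothetical driver loop for fts_get_word(); assumes a server
	charset handle `cs` and the plugin_ftparser types used above. */
	static void tokenize_boolean_query(const CHARSET_INFO* cs,
					   uchar* doc, size_t len)
	{
		MYSQL_FTPARSER_BOOLEAN_INFO	info = {};
		info.prev = ' ';	/* treat start-of-doc as whitespace */

		uchar*	start = doc;
		uchar*	end = doc + len;
		FT_WORD	word;

		/* FT_TOKEN_EOF is 0, so the loop stops once the doc is
		consumed; each iteration yields one word or bracket token. */
		while (fts_get_word(cs, &start, end, &word, &info)) {
			/* word.pos/word.len delimit the token text;
			info.yesno reflects a leading '+' or '-'. */
		}
	}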
diff --git a/storage/innobase/include/fts0types.h b/storage/innobase/include/fts0types.h
new file mode 100644
index 00000000..fb278d54
--- /dev/null
+++ b/storage/innobase/include/fts0types.h
@@ -0,0 +1,354 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0types.h
+Full text search types file
+
+Created 2007-03-27 Sunny Bains
+*******************************************************/
+
+#ifndef INNOBASE_FTS0TYPES_H
+#define INNOBASE_FTS0TYPES_H
+
+#include "fts0fts.h"
+#include "pars0pars.h"
+#include "que0types.h"
+#include "ut0byte.h"
+#include "ut0rbt.h"
+
+/** Types used within FTS. */
+struct fts_que_t;
+struct fts_node_t;
+
+/** Callbacks used within FTS. */
+typedef pars_user_func_cb_t fts_sql_callback;
+typedef void (*fts_filter)(void*, fts_node_t*, void*, ulint len);
+
+/** Statistics relevant to a particular document, used during retrieval. */
+struct fts_doc_stats_t {
+	doc_id_t	doc_id;		/*!< Document id */
+	ulint		word_count;	/*!< Total words in the document */
+};
+
+/** Its main purpose is to store the SQL prepared statements that
+are required to retrieve a document from the database. */
+struct fts_get_doc_t {
+	fts_index_cache_t*
+			index_cache;	/*!< The index cache instance */
+
+	/*!< Parsed sql statement */
+	que_t*		get_document_graph;
+	fts_cache_t*	cache;		/*!< The parent cache */
+};
+
+/** Since we can have multiple FTS indexes on a table, we keep a
+per index cache of words etc. */
+struct fts_index_cache_t {
+	dict_index_t*	index;		/*!< The FTS index instance */
+
+	ib_rbt_t*	words;		/*!< Nodes; indexed by fts_string_t*,
+					cells are fts_tokenizer_word_t*.*/
+
+	ib_vector_t*	doc_stats;	/*!< Array of the fts_doc_stats_t
+					contained in the memory buffer.
+					Must be in sorted order (ascending).
+					The ideal choice is an rb tree but
+					the rb tree imposes a space overhead
+					that we can do without */
+
+	que_t**		ins_graph;	/*!< Insert query graphs */
+
+	que_t**		sel_graph;	/*!< Select query graphs */
+	CHARSET_INFO*	charset;	/*!< charset */
+};
+
+/** Stop word control information. */
+struct fts_stopword_t {
+	ulint		status;		/*!< Status of the stopword tree */
+	ib_alloc_t*	heap;		/*!< The memory allocator to use */
+	ib_rbt_t*	cached_stopword;/*!< This stores all active stopwords */
+	CHARSET_INFO*	charset;	/*!< charset for stopword */
+};
+
+/** The SYNC state of the cache. There is one instance of this struct
+associated with each ADD thread. */
+struct fts_sync_t {
+	trx_t*		trx;		/*!< The transaction used for SYNCing
+					the cache to disk */
+	dict_table_t*	table;		/*!< Table with FTS index(es) */
+	ulint		max_cache_size;	/*!< Max size in bytes of the cache */
+	ibool		cache_full;	/*!< flag, when true it indicates that
+					we need to sync the cache to disk */
+	ulint		lower_index;	/*!< the start index of the doc id
+					vector from where to start adding
+					documents to the FTS cache */
+	ulint		upper_index;	/*!< max index of the doc id vector to
+					add to the FTS cache */
+	ibool		interrupted;	/*!< TRUE if SYNC was interrupted */
+	doc_id_t	min_doc_id;	/*!< The smallest doc id added to the
+					cache. It should be equal to
+					doc_ids[lower_index] */
+	doc_id_t	max_doc_id;	/*!< The doc id at which the cache was
+					noted as being full, we use this to
+					set the upper_limit field */
+	time_t		start_time;	/*!< SYNC start time; only used if
+					fts_enable_diag_print */
+	bool		in_progress;	/*!< flag whether sync is in progress.*/
+	bool		unlock_cache;	/*!< flag whether unlock cache when
+					write fts node */
+	/** condition variable for in_progress; used with table->fts->cache->lock */
+	pthread_cond_t	cond;
+};
+
+/** The cache for the FTS system. It is a memory-based inverted index
+that new entries are added to, until it grows over the configured maximum
+size, at which time its contents are written to the INDEX table.
*/ +struct fts_cache_t +{ + /** lock protecting all access to the memory buffer */ + mysql_mutex_t lock; + /** cache initialization */ + mysql_mutex_t init_lock; + + /** protection for deleted_doc_ids */ + mysql_mutex_t deleted_lock; + + /** protection for DOC_ID */ + mysql_mutex_t doc_id_lock; + + ib_vector_t* deleted_doc_ids;/*!< Array of deleted doc ids, each + element is of type fts_update_t */ + + ib_vector_t* indexes; /*!< We store the stats and inverted + index for the individual FTS indexes + in this vector. Each element is + an instance of fts_index_cache_t */ + + ib_vector_t* get_docs; /*!< information required to read + the document from the table. Each + element is of type fts_doc_t */ + + size_t total_size; /*!< total size consumed by the ilist + field of all nodes. SYNC is run + whenever this gets too big */ + /** total_size at the time of the previous SYNC request */ + size_t total_size_at_sync; + + fts_sync_t* sync; /*!< sync structure to sync data to + disk */ + ib_alloc_t* sync_heap; /*!< The heap allocator, for indexes + and deleted_doc_ids, ie. transient + objects, they are recreated after + a SYNC is completed */ + + ib_alloc_t* self_heap; /*!< This heap is the heap out of + which an instance of the cache itself + was created. Objects created using + this heap will last for the lifetime + of the cache */ + + doc_id_t next_doc_id; /*!< Next doc id */ + + doc_id_t synced_doc_id; /*!< Doc ID sync-ed to CONFIG table */ + + doc_id_t first_doc_id; /*!< first doc id since this table + was opened */ + + ulint deleted; /*!< Number of doc ids deleted since + last optimized. This variable is + covered by deleted_lock */ + + ulint added; /*!< Number of doc ids added since last + optimized. This variable is covered by + the deleted lock */ + + fts_stopword_t stopword_info; /*!< Cached stopwords for the FTS */ + mem_heap_t* cache_heap; /*!< Cache Heap */ +}; + +/** Columns of the FTS auxiliary INDEX table */ +struct fts_node_t { + doc_id_t first_doc_id; /*!< First document id in ilist. */ + + doc_id_t last_doc_id; /*!< Last document id in ilist. */ + + byte* ilist; /*!< Binary list of documents & word + positions the token appears in. + TODO: For now, these are simply + ut_malloc'd, but if testing shows + that they waste memory unacceptably, a + special memory allocator will have + to be written */ + + ulint doc_count; /*!< Number of doc ids in ilist */ + + ulint ilist_size; /*!< Used size of ilist in bytes. */ + + ulint ilist_size_alloc; + /*!< Allocated size of ilist in + bytes */ + bool synced; /*!< flag whether the node is synced */ +}; + +/** A tokenizer word. Contains information about one word. */ +struct fts_tokenizer_word_t { + fts_string_t text; /*!< Token text. 
*/
+
+	ib_vector_t*	nodes;		/*!< Word node ilists, each element is
+					of type fts_node_t */
+};
+
+/** Word text plus its array of nodes as on disk in FTS index */
+struct fts_word_t {
+	fts_string_t	text;		/*!< Word value in UTF-8 */
+	ib_vector_t*	nodes;		/*!< Nodes read from disk */
+
+	ib_alloc_t*	heap_alloc;	/*!< For handling all allocations */
+};
+
+/** Callback for reading and filtering nodes that are read from FTS index */
+struct fts_fetch_t {
+	void*		read_arg;	/*!< Arg for the sql_callback */
+
+	fts_sql_callback
+			read_record;	/*!< Callback for reading index
+					record */
+	size_t		total_memory;	/*!< Total memory used */
+};
+
+/** For horizontally splitting an FTS auxiliary index */
+struct fts_index_selector_t {
+	ulint		value;		/*!< Character value at which
+					to split */
+
+	const char*	suffix;		/*!< FTS aux index suffix */
+};
+
+/** This type represents a single document. */
+struct fts_doc_t {
+	fts_string_t	text;		/*!< document text */
+
+	ibool		found;		/*!< TRUE if the document was found
+					successfully in the database */
+
+	ib_rbt_t*	tokens;		/*!< This is filled when the document
+					is tokenized. Tokens; indexed by
+					fts_string_t*, cells are of type
+					fts_token_t* */
+
+	ib_alloc_t*	self_heap;	/*!< An instance of this type is
+					allocated from this heap along
+					with any objects that have the
+					same lifespan, most notably
+					the vector of token positions */
+	CHARSET_INFO*	charset;	/*!< Document's charset info */
+
+	st_mysql_ftparser*	parser;	/*!< fts plugin parser */
+
+	ib_rbt_t*	stopwords;	/*!< Stopwords */
+};
+
+/** A token and its positions within a document. */
+struct fts_token_t {
+	fts_string_t	text;		/*!< token text */
+
+	ib_vector_t*	positions;	/*!< an array of the positions the
+					token is found in; each item is
+					actually an ulint. */
+};
+
+/** It's defined in fts/fts0fts.cc */
+extern const fts_index_selector_t fts_index_selector[];
+
+/******************************************************************//**
+Compare two fts_trx_row_t instances doc_ids. */
+UNIV_INLINE
+int
+fts_trx_row_doc_id_cmp(
+/*===================*/
+						/*!< out:
+						< 0 if n1 < n2,
+						0 if n1 == n2,
+						> 0 if n1 > n2 */
+	const void*	p1,		/*!< in: id1 */
+	const void*	p2);		/*!< in: id2 */
+
+/******************************************************************//**
+Compare two fts_ranking_t instances doc_ids. */
+UNIV_INLINE
+int
+fts_ranking_doc_id_cmp(
+/*===================*/
+						/*!< out:
+						< 0 if n1 < n2,
+						0 if n1 == n2,
+						> 0 if n1 > n2 */
+	const void*	p1,		/*!< in: id1 */
+	const void*	p2);		/*!< in: id2 */
+
+/******************************************************************//**
+Compare two doc_ids. */
+UNIV_INLINE
+int fts_doc_id_cmp(
+/*==================*/
+						/*!< out:
+						< 0 if n1 < n2,
+						0 if n1 == n2,
+						> 0 if n1 > n2 */
+	const void*	p1,		/*!< in: id1 */
+	const void*	p2);		/*!< in: id2 */
+
+/******************************************************************//**
+Duplicate a string. */
+UNIV_INLINE
+void
+fts_string_dup(
+/*===========*/
+	fts_string_t*		dst,	/*!< in: dup to here */
+	const fts_string_t*	src,	/*!< in: src string */
+	mem_heap_t*		heap);	/*!< in: heap to use */
+
+/******************************************************************//**
+Get the selected FTS aux INDEX suffix. */
+UNIV_INLINE
+const char*
+fts_get_suffix(
+/*===========*/
+	ulint		selected);	/*!< in: selected index */
+
+/** Select the FTS auxiliary index for the given character.
+@param[in]	cs	charset
+@param[in]	str	string
+@param[in]	len	string length in bytes
+@return the index to use for the string */
+UNIV_INLINE
+ulint
+fts_select_index(
+	const CHARSET_INFO*	cs,
+	const byte*		str,
+	ulint			len);
+
+#include "fts0types.inl"
+
+#endif /* INNOBASE_FTS0TYPES_H */
diff --git a/storage/innobase/include/fts0types.inl b/storage/innobase/include/fts0types.inl
new file mode 100644
index 00000000..facc1e5c
--- /dev/null
+++ b/storage/innobase/include/fts0types.inl
@@ -0,0 +1,231 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fts0types.inl
+Full text search types.
+
+Created 2007-03-27 Sunny Bains
+*******************************************************/
+
+#ifndef INNOBASE_FTS0TYPES_IC
+#define INNOBASE_FTS0TYPES_IC
+
+/******************************************************************//**
+Duplicate a string. */
+UNIV_INLINE
+void
+fts_string_dup(
+/*===========*/
+	fts_string_t*		dst,	/*!< in: dup to here */
+	const fts_string_t*	src,	/*!< in: src string */
+	mem_heap_t*		heap)	/*!< in: heap to use */
+{
+	dst->f_str = (byte*)mem_heap_alloc(heap, src->f_len + 1);
+	memcpy(dst->f_str, src->f_str, src->f_len);
+
+	dst->f_len = src->f_len;
+	dst->f_str[src->f_len] = 0;
+	dst->f_n_char = src->f_n_char;
+}
+
+/******************************************************************//**
+Compare two fts_trx_row_t doc_ids.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_trx_row_doc_id_cmp(
+/*===================*/
+	const void*	p1,		/*!< in: id1 */
+	const void*	p2)		/*!< in: id2 */
+{
+	const fts_trx_row_t*	tr1 = (const fts_trx_row_t*) p1;
+	const fts_trx_row_t*	tr2 = (const fts_trx_row_t*) p2;
+
+	return((int)(tr1->doc_id - tr2->doc_id));
+}
+
+/******************************************************************//**
+Compare two fts_ranking_t doc_ids.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int
+fts_ranking_doc_id_cmp(
+/*===================*/
+	const void*	p1,		/*!< in: id1 */
+	const void*	p2)		/*!< in: id2 */
+{
+	const fts_ranking_t*	rk1 = (const fts_ranking_t*) p1;
+	const fts_ranking_t*	rk2 = (const fts_ranking_t*) p2;
+
+	return((int)(rk1->doc_id - rk2->doc_id));
+}
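These comparators narrow a 64-bit doc id difference to int, which is only safe while the two ids being compared are close together, as they are within one transaction's working set. For contrast only, a hedged sketch of an overflow-proof three-way comparison (not part of this patch):

	/* Hypothetical overflow-safe variant: compare, do not subtract. */
	static inline int doc_id_cmp_safe(doc_id_t a, doc_id_t b)
	{
		return(a < b ? -1 : (a > b ? 1 : 0));
	}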
+/******************************************************************//**
+Compare two doc_ids.
+@return < 0 if n1 < n2, 0 if n1 == n2, > 0 if n1 > n2 */
+UNIV_INLINE
+int fts_doc_id_cmp(
+/*==================*/
+	const void*	p1,		/*!< in: id1 */
+	const void*	p2)		/*!< in: id2 */
+{
+	const doc_id_t*	up1 = static_cast<const doc_id_t*>(p1);
+	const doc_id_t*	up2 = static_cast<const doc_id_t*>(p2);
+
+	return static_cast<int>(*up1 - *up2);
+}
+
+/******************************************************************//**
+Get the first character's code position for FTS index partition */
+extern
+ulint
+innobase_strnxfrm(
+/*==============*/
+	const CHARSET_INFO*	cs,	/*!< in: Character set */
+	const uchar*		p2,	/*!< in: string */
+	const ulint		len2);	/*!< in: string length */
+
+/** Check if fts index charset is cjk
+@param[in]	cs	charset
+@retval	true	if the charset is cjk
+@retval	false	if not. */
+inline bool fts_is_charset_cjk(const CHARSET_INFO* cs)
+{
+	switch (cs->number) {
+	case 24: /* my_charset_gb2312_chinese_ci */
+	case 28: /* my_charset_gbk_chinese_ci */
+	case 1: /* my_charset_big5_chinese_ci */
+	case 12: /* my_charset_ujis_japanese_ci */
+	case 13: /* my_charset_sjis_japanese_ci */
+	case 95: /* my_charset_cp932_japanese_ci */
+	case 97: /* my_charset_eucjpms_japanese_ci */
+	case 19: /* my_charset_euckr_korean_ci */
+		return true;
+	default:
+		return false;
+	}
}
+
+/** Select the FTS auxiliary index for the given character by range.
+@param[in]	cs	charset
+@param[in]	str	string
+@param[in]	len	string length
+@retval the index to use for the string */
+UNIV_INLINE
+ulint
+fts_select_index_by_range(
+	const CHARSET_INFO*	cs,
+	const byte*		str,
+	ulint			len)
+{
+	ulint	selected = 0;
+	ulint	value = innobase_strnxfrm(cs, str, len);
+
+	while (fts_index_selector[selected].value != 0) {
+
+		if (fts_index_selector[selected].value == value) {
+
+			return(selected);
+
+		} else if (fts_index_selector[selected].value > value) {
+
+			return(selected > 0 ? selected - 1 : 0);
+		}
+
+		++selected;
+	}
+
+	ut_ad(selected > 1);
+
+	return(selected - 1);
+}
+
+/** Select the FTS auxiliary index for the given character by hash.
+@param[in]	cs	charset
+@param[in]	str	string
+@param[in]	len	string length
+@retval the index to use for the string */
+UNIV_INLINE
+ulint
+fts_select_index_by_hash(
+	const CHARSET_INFO*	cs,
+	const byte*		str,
+	ulint			len)
+{
+	ulong	nr1 = 1;
+	ulong	nr2 = 4;
+
+	ut_ad(!(str == NULL && len > 0));
+
+	if (str == NULL || len == 0) {
+		return 0;
+	}
+
+	/* Get the first char */
+	/* JAN: TODO: MySQL 5.7 had
+	char_len = my_mbcharlen_ptr(cs, reinterpret_cast<const char*>(str),
+				    reinterpret_cast<const char*>(str + len));
+	*/
+	size_t	char_len = size_t(cs->charlen(str, str + len));
+
+	ut_ad(char_len <= len);
+
+	/* Get collation hash code */
+	my_ci_hash_sort(cs, str, char_len, &nr1, &nr2);
+
+	return(nr1 % FTS_NUM_AUX_INDEX);
+}
+
+/** Select the FTS auxiliary index for the given character.
+@param[in]	cs	charset
+@param[in]	str	string
+@param[in]	len	string length in bytes
+@retval the index to use for the string */
+UNIV_INLINE
+ulint
+fts_select_index(
+	const CHARSET_INFO*	cs,
+	const byte*		str,
+	ulint			len)
+{
+	ulint	selected;
+
+	if (fts_is_charset_cjk(cs)) {
+		selected = fts_select_index_by_hash(cs, str, len);
+	} else {
+		selected = fts_select_index_by_range(cs, str, len);
+	}
+
+	return(selected);
+}
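Read together, the functions above are the routing step of the FTS inverted index: a token's first character picks one of FTS_NUM_AUX_INDEX auxiliary tables, by collation hash for CJK charsets and by the fts_index_selector[] range table otherwise. A hedged sketch of a hypothetical caller (the name route_token and the charset handle are illustrative only):

	/* Hypothetical illustration of routing a token to an aux table. */
	void route_token(const CHARSET_INFO* cs, const fts_string_t* token)
	{
		ulint	selected = fts_select_index(cs, token->f_str,
						    token->f_len);

		/* fts_get_suffix(selected), defined below, then names the
		auxiliary table suffix (e.g. "INDEX_1") that holds the
		ilists for every token sorted or hashed into this bucket. */
		const char*	suffix = fts_get_suffix(selected);
		(void) suffix;
	}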
+/******************************************************************//**
+Return the selected FTS aux index suffix. */
+UNIV_INLINE
+const char*
+fts_get_suffix(
+/*===========*/
+	ulint		selected)	/*!< in: selected index */
+{
+	return(fts_index_selector[selected].suffix);
+}
+
+#endif /* INNOBASE_FTS0TYPES_IC */
diff --git a/storage/innobase/include/fts0vlc.h b/storage/innobase/include/fts0vlc.h
new file mode 100644
index 00000000..d6e60377
--- /dev/null
+++ b/storage/innobase/include/fts0vlc.h
@@ -0,0 +1,124 @@
+/**
+
+Copyright (c) 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+**/
+/**
+@file include/fts0vlc.h
+Full text variable length integer encoding/decoding.
+
+Created 2021-10-19 Thirunarayanan Balathandayuthapani
+**/
+
+/** Return length of val if it were encoded using our VLC scheme.
+@param val  value to encode
+@return length of value encoded, in bytes */
+inline size_t fts_get_encoded_len(doc_id_t val)
+{
+  if (val < static_cast<doc_id_t>(1) << 7)
+    return 1;
+  if (val < static_cast<doc_id_t>(1) << 14)
+    return 2;
+  if (val < static_cast<doc_id_t>(1) << 21)
+    return 3;
+  if (val < static_cast<doc_id_t>(1) << 28)
+    return 4;
+  if (val < static_cast<doc_id_t>(1) << 35)
+    return 5;
+  if (val < static_cast<doc_id_t>(1) << 42)
+    return 6;
+  if (val < static_cast<doc_id_t>(1) << 49)
+    return 7;
+  if (val < static_cast<doc_id_t>(1) << 56)
+    return 8;
+  if (val < static_cast<doc_id_t>(1) << 63)
+    return 9;
+  return 10;
+}
+
+/** Encode an integer using our VLC scheme and return a pointer
+just past the encoded value.
+@param val  value to encode
+@param buf  buffer, must have enough space
+@return end of the encoded value in buf */
+inline byte *fts_encode_int(doc_id_t val, byte *buf)
+{
+  if (val < static_cast<doc_id_t>(1) << 7)
+    goto add_1;
+  if (val < static_cast<doc_id_t>(1) << 14)
+    goto add_2;
+  if (val < static_cast<doc_id_t>(1) << 21)
+    goto add_3;
+  if (val < static_cast<doc_id_t>(1) << 28)
+    goto add_4;
+  if (val < static_cast<doc_id_t>(1) << 35)
+    goto add_5;
+  if (val < static_cast<doc_id_t>(1) << 42)
+    goto add_6;
+  if (val < static_cast<doc_id_t>(1) << 49)
+    goto add_7;
+  if (val < static_cast<doc_id_t>(1) << 56)
+    goto add_8;
+  if (val < static_cast<doc_id_t>(1) << 63)
+    goto add_9;
+
+  *buf++= static_cast<byte>(val >> 63);
+add_9:
+  *buf++= static_cast<byte>(val >> 56) & 0x7F;
+add_8:
+  *buf++= static_cast<byte>(val >> 49) & 0x7F;
+add_7:
+  *buf++= static_cast<byte>(val >> 42) & 0x7F;
+add_6:
+  *buf++= static_cast<byte>(val >> 35) & 0x7F;
+add_5:
+  *buf++= static_cast<byte>(val >> 28) & 0x7F;
+add_4:
+  *buf++= static_cast<byte>(val >> 21) & 0x7F;
+add_3:
+  *buf++= static_cast<byte>(val >> 14) & 0x7F;
+add_2:
+  *buf++= static_cast<byte>(val >> 7) & 0x7F;
+add_1:
+  *buf++= static_cast<byte>(val) | 0x80;
+  return buf;
+}
+
+/** Decode and return the integer that was encoded using
+our VLC scheme.
+@param ptr  pointer to decode from, this ptr is
+            incremented by the number of bytes decoded
+@return value decoded */
+inline doc_id_t fts_decode_vlc(const byte **ptr)
+{
+  ut_d(const byte *const start= *ptr);
+  ut_ad(*start);
+
+  doc_id_t val= 0;
+  for (;;)
+  {
+    byte b= *(*ptr)++;
+    val|= (b & 0x7F);
+
+    /* High-bit on means "last byte in the encoded integer". */
+    if (b & 0x80)
+      break;
+    ut_ad(val < static_cast<doc_id_t>(1) << (64 - 7));
+    val <<= 7;
+  }
+
+  ut_ad(*ptr - start <= 10);
+
+  return(val);
+}
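To make the scheme concrete: each byte carries seven payload bits, the most significant group is emitted first, and 0x80 marks the final byte rather than a continuation. So 300 (= 2*128 + 44) encodes as the two bytes 0x02 0xAC, and fts_get_encoded_len(300) is 2. A small round-trip sketch under those definitions (illustrative, not part of the patch):

	/* Round-trip check: encode then decode doc id 300. */
	void vlc_roundtrip_example()
	{
	  byte     buf[10];
	  doc_id_t val= 300;

	  byte *end= fts_encode_int(val, buf);      /* writes 0x02, 0xAC */
	  ut_ad(size_t(end - buf) == fts_get_encoded_len(val));

	  const byte *p= buf;
	  ut_ad(fts_decode_vlc(&p) == val);         /* reads 300 back */
	  ut_ad(p == end);
	}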
diff --git a/storage/innobase/include/fut0lst.h b/storage/innobase/include/fut0lst.h
new file mode 100644
index 00000000..746dab80
--- /dev/null
+++ b/storage/innobase/include/fut0lst.h
@@ -0,0 +1,156 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fut0lst.h
+File-based list utilities
+
+Created 11/28/1995 Heikki Tuuri
+***********************************************************************/
+
+#pragma once
+
+/* The physical size of a list base node in bytes */
+#define FLST_BASE_NODE_SIZE (4 + 2 * FIL_ADDR_SIZE)
+/* The physical size of a list node in bytes */
+#define FLST_NODE_SIZE (2 * FIL_ADDR_SIZE)
+
+#ifdef UNIV_INNOCHECKSUM
+# include "fil0fil.h"
+#else
+# include "mtr0log.h"
+
+typedef byte	flst_base_node_t;
+typedef byte	flst_node_t;
+
+/* We define the field offsets of a node for the list */
+#define FLST_PREV	0	/* 6-byte address of the previous list element;
+				the page part of address is FIL_NULL, if no
+				previous element */
+#define FLST_NEXT	FIL_ADDR_SIZE	/* 6-byte address of the next
+				list element; the page part of address
+				is FIL_NULL, if no next element */
+
+/* We define the field offsets of a base node for the list */
+#define FLST_LEN	0	/* 32-bit list length field */
+#define FLST_FIRST	4	/* 6-byte address of the first element
+				of the list; undefined if empty list */
+#define FLST_LAST	(4 + FIL_ADDR_SIZE)	/* 6-byte address of the
+				last element of the list; undefined
+				if empty list */
+
+/** Initialize a zero-initialized list base node.
+@param[in,out]	block	file page
+@param[in]	ofs	byte offset of the list base node
+@param[in,out]	mtr	mini-transaction */
+inline void flst_init(const buf_block_t* block, uint16_t ofs, mtr_t* mtr)
+{
+  ut_d(const page_t *page= block->page.frame);
+  ut_ad(!mach_read_from_2(FLST_LEN + ofs + page));
+  ut_ad(!mach_read_from_2(FLST_FIRST + FIL_ADDR_BYTE + ofs + page));
+  ut_ad(!mach_read_from_2(FLST_LAST + FIL_ADDR_BYTE + ofs + page));
+  compile_time_assert(FIL_NULL == 0xffU * 0x1010101U);
+  mtr->memset(block, FLST_FIRST + FIL_ADDR_PAGE + ofs, 4, 0xff);
+  mtr->memset(block, FLST_LAST + FIL_ADDR_PAGE + ofs, 4, 0xff);
+}
+
+/** Initialize a list base node.
+@param[in]	block	file page
+@param[in,out]	base	base node
+@param[in,out]	mtr	mini-transaction */
+void flst_init(const buf_block_t &block, byte *base, mtr_t *mtr)
+  MY_ATTRIBUTE((nonnull));
+
+/** Append a file list node to a list.
+@param[in,out] base base node block +@param[in] boffset byte offset of the base node +@param[in,out] add block to be added +@param[in] aoffset byte offset of the node to be added +@param[in,out] mtr mini-transaction +@return error code */ +dberr_t flst_add_last(buf_block_t *base, uint16_t boffset, + buf_block_t *add, uint16_t aoffset, mtr_t *mtr) + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/** Prepend a file list node to a list. +@param[in,out] base base node block +@param[in] boffset byte offset of the base node +@param[in,out] add block to be added +@param[in] aoffset byte offset of the node to be added +@param[in,out] mtr mini-transaction +@return error code */ +dberr_t flst_add_first(buf_block_t *base, uint16_t boffset, + buf_block_t *add, uint16_t aoffset, mtr_t *mtr) + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/** Remove a file list node. +@param[in,out] base base node block +@param[in] boffset byte offset of the base node +@param[in,out] cur block to be removed +@param[in] coffset byte offset of the current record to be removed +@param[in,out] mtr mini-transaction +@return error code */ +dberr_t flst_remove(buf_block_t *base, uint16_t boffset, + buf_block_t *cur, uint16_t coffset, mtr_t *mtr) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** @return the length of a list */ +inline uint32_t flst_get_len(const flst_base_node_t *base) +{ + return mach_read_from_4(base + FLST_LEN); +} + +/** @return a file address */ +inline fil_addr_t flst_read_addr(const byte *faddr) +{ + fil_addr_t addr= { mach_read_from_4(faddr + FIL_ADDR_PAGE), + mach_read_from_2(faddr + FIL_ADDR_BYTE) }; + ut_a(addr.page == FIL_NULL || addr.boffset >= FIL_PAGE_DATA); + ut_a(ut_align_offset(faddr, srv_page_size) >= FIL_PAGE_DATA); + return addr; +} + +/** @return list first node address */ +inline fil_addr_t flst_get_first(const flst_base_node_t *base) +{ + return flst_read_addr(base + FLST_FIRST); +} + +/** @return list last node address */ +inline fil_addr_t flst_get_last(const flst_base_node_t *base) +{ + return flst_read_addr(base + FLST_LAST); +} + +/** @return list next node address */ +inline fil_addr_t flst_get_next_addr(const flst_node_t* node) +{ + return flst_read_addr(node + FLST_NEXT); +} + +/** @return list prev node address */ +inline fil_addr_t flst_get_prev_addr(const flst_node_t *node) +{ + return flst_read_addr(node + FLST_PREV); +} + +# ifdef UNIV_DEBUG +/** Validate a file-based list. */ +void flst_validate(const buf_block_t *base, uint16_t boffset, mtr_t *mtr); +# endif + +#endif /* !UNIV_INNOCHECKSUM */ diff --git a/storage/innobase/include/gis0geo.h b/storage/innobase/include/gis0geo.h new file mode 100644 index 00000000..3fd01a3a --- /dev/null +++ b/storage/innobase/include/gis0geo.h @@ -0,0 +1,122 @@ +/***************************************************************************** +Copyright (c) 2014, 2015, Oracle and/or its affiliates. All rights reserved. +Copyright (c) 2019, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software Foundation,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+*****************************************************************************/
+
+/**************************************************//**
+@file gis0geo.h
+The R-tree definitions from MyISAM
+*******************************************************/
+
+#ifndef _gis0geo_h
+#define _gis0geo_h
+
+#include "my_global.h"
+#include "string.h"
+
+#define SPTYPE HA_KEYTYPE_DOUBLE
+#define SPLEN  8
+
+/* Since the mbr could be a point or a linestring, in which case the area
+of the mbr is 0, we define this macro for calculating the area increase
+when we need to enlarge the mbr. */
+#define LINE_MBR_WEIGHTS	0.001
+
+/* Types of "well-known binary representation" (wkb) format. */
+enum wkbType
+{
+	wkbPoint = 1,
+	wkbLineString = 2,
+	wkbPolygon = 3,
+	wkbMultiPoint = 4,
+	wkbMultiLineString = 5,
+	wkbMultiPolygon = 6,
+	wkbGeometryCollection = 7
+};
+
+/* Byte order of "well-known binary representation" (wkb) format. */
+enum wkbByteOrder
+{
+	wkbXDR = 0,	/* Big Endian */
+	wkbNDR = 1	/* Little Endian */
+};
+
+/*************************************************************//**
+Calculate minimal bounding rectangle (mbr) of the spatial object
+stored in "well-known binary representation" (wkb) format.
+@return 0 if ok */
+int
+rtree_mbr_from_wkb(
+/*===============*/
+	const uchar*	wkb,	/*!< in: pointer to wkb. */
+	uint		size,	/*!< in: size of wkb. */
+	uint		n_dims,	/*!< in: dimensions. */
+	double*		mbr);	/*!< in/out: mbr. */
+
+/* Rtree split node structure. */
+struct rtr_split_node_t
+{
+	double	square;		/* square of the mbr.*/
+	int	n_node;		/* which group in.*/
+	uchar*	key;		/* key. */
+	double*	coords;		/* mbr. */
+};
+
+/*************************************************************//**
+Inline function for reserving coords */
+inline
+static
+double*
+reserve_coords(double	**d_buffer,	/*!< in/out: buffer. */
+	       int	n_dim)		/*!< in: dimensions. */
+/*===========*/
+{
+	double *coords = *d_buffer;
+	(*d_buffer) += n_dim * 2;
+	return coords;
+}
+
+/*************************************************************//**
+Split rtree nodes.
+Return which group the first rec is in. */
+int
+split_rtree_node(
+/*=============*/
+	rtr_split_node_t*	node,		/*!< in: split nodes.*/
+	int			n_entries,	/*!< in: entries number.*/
+	int			all_size,	/*!< in: total key's size.*/
+	int			key_size,	/*!< in: key's size.*/
+	int			min_size,	/*!< in: minimal group size.*/
+	int			size1,		/*!< in: size of group.*/
+	int			size2,		/*!< in: initial group sizes */
+	double**		d_buffer,	/*!< in/out: buffer.*/
+	int			n_dim,		/*!< in: dimensions. */
+	uchar*			first_rec);	/*!< in: the first rec. */
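To make the WKB layout concrete before the comparison API: a 2-D point is a 1-byte byte-order tag, a 4-byte geometry type (wkbPoint), then two doubles, 21 bytes in all, and its MBR degenerates to xmin == xmax and ymin == ymax. A standalone sketch of building such a buffer (assumes a little-endian host, as wkbNDR implies; illustrative only):

	#include <cstdint>
	#include <cstring>

	/* Build little-endian WKB for POINT(x y); wkb must hold 21 bytes. */
	static size_t make_wkb_point(unsigned char* wkb, double x, double y)
	{
		size_t   off = 0;
		uint32_t type = 1;              /* wkbPoint */

		wkb[off++] = 1;                 /* wkbNDR, little endian */
		memcpy(wkb + off, &type, 4);    off += 4;
		memcpy(wkb + off, &x, 8);       off += 8;
		memcpy(wkb + off, &y, 8);       off += 8;
		return off;                     /* 21 bytes of wkb */
	}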
+/** Compare two minimum bounding rectangles.
+@param mode	comparison operator
+		MBR_INTERSECT(a,b)	a overlaps b
+		MBR_CONTAIN(a,b)	a contains b
+		MBR_DISJOINT(a,b)	a disjoint b
+		MBR_WITHIN(a,b)		a within b
+		MBR_EQUAL(a,b)		All coordinates of MBRs are equal
+		MBR_DATA(a,b)		Data reference is the same
+@param b	first MBR
+@param a	second MBR
+@retval 0	if the predicate holds
+@retval 1	if the predicate does not hold */
+int rtree_key_cmp(page_cur_mode_t mode, const void *b, const void *a);
+#endif
diff --git a/storage/innobase/include/gis0rtree.h b/storage/innobase/include/gis0rtree.h
new file mode 100644
index 00000000..b07261ce
--- /dev/null
+++ b/storage/innobase/include/gis0rtree.h
@@ -0,0 +1,513 @@
+/*****************************************************************************
+
+Copyright (c) 2014, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/gis0rtree.h
+R-tree header file
+
+Created 2013/03/27 Jimmy Yang and Allen Lai
+***********************************************************************/
+
+#ifndef gis0rtree_h
+#define gis0rtree_h
+
+#include "btr0cur.h"
+#include "rem0types.h"
+
+/* Whether MBR 'a' contains 'b' */
+#define MBR_CONTAIN_CMP(a, b) \
+	((((b)->xmin >= (a)->xmin) && ((b)->xmax <= (a)->xmax) \
+	  && ((b)->ymin >= (a)->ymin) && ((b)->ymax <= (a)->ymax)))
+
+/* Whether MBR 'a' equals to 'b' */
+#define MBR_EQUAL_CMP(a, b) \
+	((((b)->xmin == (a)->xmin) && ((b)->xmax == (a)->xmax)) \
+	 && (((b)->ymin == (a)->ymin) && ((b)->ymax == (a)->ymax)))
+
+/* Whether MBR 'a' intersects 'b' */
+#define MBR_INTERSECT_CMP(a, b) \
+	((((b)->xmin <= (a)->xmax) || ((b)->xmax >= (a)->xmin)) \
+	 && (((b)->ymin <= (a)->ymax) || ((b)->ymax >= (a)->ymin)))
+
+/* Whether MBR 'a' and 'b' disjoint */
+#define MBR_DISJOINT_CMP(a, b) (!MBR_INTERSECT_CMP(a, b))
+
+/* Whether MBR 'a' within 'b' */
+#define MBR_WITHIN_CMP(a, b) \
+	((((b)->xmin <= (a)->xmin) && ((b)->xmax >= (a)->xmax)) \
+	 && (((b)->ymin <= (a)->ymin) && ((b)->ymax >= (a)->ymax)))
+
+/* Define it for rtree search mode checking. */
+#define RTREE_SEARCH_MODE(mode) \
+	(((mode) >= PAGE_CUR_CONTAIN) && ((mode <= PAGE_CUR_RTREE_GET_FATHER)))
+
+/* Geometry data header */
+#define GEO_DATA_HEADER_SIZE	4
+
+/** Search for a spatial index leaf page record.
+@param cur	cursor
+@param tuple	search tuple
+@param latch_mode latching mode
+@param mtr	mini-transaction
+@param mode	search mode */
+dberr_t rtr_search_leaf(btr_cur_t *cur, const dtuple_t *tuple,
+			btr_latch_mode latch_mode, mtr_t *mtr,
+			page_cur_mode_t mode= PAGE_CUR_RTREE_LOCATE)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Search for inserting a spatial index leaf page record.
+@param cur cursor +@param tuple search tuple +@param latch_mode latching mode +@param mtr mini-transaction */ +inline dberr_t rtr_insert_leaf(btr_cur_t *cur, const dtuple_t *tuple, + btr_latch_mode latch_mode, mtr_t *mtr) +{ + return rtr_search_leaf(cur, tuple, latch_mode, mtr, PAGE_CUR_RTREE_INSERT); +} + +/** Search for a spatial index leaf page record. +@param pcur cursor +@param tuple search tuple +@param mode search mode +@param mtr mini-transaction */ +dberr_t rtr_search_leaf(btr_pcur_t *pcur, const dtuple_t *tuple, + page_cur_mode_t mode, mtr_t *mtr) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +dberr_t rtr_search_to_nth_level(ulint level, const dtuple_t *tuple, + page_cur_mode_t mode, + btr_latch_mode latch_mode, + btr_cur_t *cur, mtr_t *mtr) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/**********************************************************************//** +Builds a Rtree node pointer out of a physical record and a page number. +@return own: node pointer */ +dtuple_t* +rtr_index_build_node_ptr( +/*=====================*/ + const dict_index_t* index, /*!< in: index */ + const rtr_mbr_t* mbr, /*!< in: mbr of lower page */ + const rec_t* rec, /*!< in: record for which to build node + pointer */ + ulint page_no,/*!< in: page number to put in node + pointer */ + mem_heap_t* heap); /*!< in: memory heap where pointer + created */ + +/*************************************************************//** +Splits an R-tree index page to halves and inserts the tuple. It is assumed +that mtr holds an x-latch to the index tree. NOTE: the tree x-latch is +released within this function! NOTE that the operation of this +function must always succeed, we cannot reverse it: therefore enough +free disk space (2 pages) must be guaranteed to be available before +this function is called. +@return inserted record */ +rec_t* +rtr_page_split_and_insert( +/*======================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_cur_t* cursor, /*!< in/out: cursor at which to insert; when the + function returns, the cursor is positioned + on the predecessor of the inserted record */ + rec_offs** offsets,/*!< out: offsets on inserted record */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ + const dtuple_t* tuple, /*!< in: tuple to insert */ + ulint n_ext, /*!< in: number of externally stored columns */ + mtr_t* mtr, /*!< in: mtr */ + dberr_t* err); /*!< out: error code */ + +/**************************************************************//** +Sets the child node mbr in a node pointer. */ +UNIV_INLINE +void +rtr_page_cal_mbr( +/*=============*/ + const dict_index_t* index, /*!< in: index */ + const buf_block_t* block, /*!< in: buffer block */ + rtr_mbr_t* mbr, /*!< out: MBR encapsulates the page */ + mem_heap_t* heap); /*!< in: heap for the memory + allocation */ +/*************************************************************//** +Find the next matching record. This function will first exhaust +the copied record listed in the rtr_info->matches vector before +moving to next page +@return true if there is next qualified record found, otherwise(if +exhausted) false */ +bool +rtr_pcur_move_to_next( +/*==================*/ + const dtuple_t* tuple, /*!< in: data tuple; NOTE: n_fields_cmp in + tuple must be set so that it cannot get + compared to the node ptr page number field! 
*/ + page_cur_mode_t mode, /*!< in: cursor search mode */ + btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the + function may release the page latch */ + ulint cur_level, + /*!< in: current level */ + mtr_t* mtr) /*!< in: mtr */ + MY_ATTRIBUTE((warn_unused_result)); + +/****************************************************************//** +Searches the right position in rtree for a page cursor. */ +bool +rtr_cur_search_with_match( +/*======================*/ + const buf_block_t* block, /*!< in: buffer block */ + dict_index_t* index, /*!< in: index descriptor */ + const dtuple_t* tuple, /*!< in: data tuple */ + page_cur_mode_t mode, /*!< in: PAGE_CUR_L, + PAGE_CUR_LE, PAGE_CUR_G, or + PAGE_CUR_GE */ + page_cur_t* cursor, /*!< in/out: page cursor */ + rtr_info_t* rtr_info);/*!< in/out: search stack */ + +/****************************************************************//** +Calculate the area increased for a new record +@return area increased */ +double +rtr_rec_cal_increase( +/*=================*/ + const dtuple_t* dtuple, /*!< in: data tuple to insert, which + cause area increase */ + const rec_t* rec, /*!< in: physical record which differs from + dtuple in some of the common fields, or which + has an equal number or more fields than + dtuple */ + double* area); /*!< out: increased area */ + +/****************************************************************//** +Following the right link to find the proper block for insert. +@return the proper block.*/ +dberr_t +rtr_ins_enlarge_mbr( +/*=================*/ + btr_cur_t* cursor, /*!< in: btr cursor */ + mtr_t* mtr); /*!< in: mtr */ + +/**************************************************************//** +push a nonleaf index node to the search path */ +UNIV_INLINE +void +rtr_non_leaf_stack_push( +/*====================*/ + rtr_node_path_t* path, /*!< in/out: search path */ + uint32_t pageno, /*!< in: pageno to insert */ + node_seq_t seq_no, /*!< in: Node sequence num */ + ulint level, /*!< in: index level */ + uint32_t child_no, /*!< in: child page no */ + btr_pcur_t* cursor, /*!< in: position cursor */ + double mbr_inc); /*!< in: MBR needs to be + enlarged */ + +/**************************************************************//** +push a nonleaf index node to the search path for insertion */ +void +rtr_non_leaf_insert_stack_push( +/*===========================*/ + dict_index_t* index, /*!< in: index descriptor */ + rtr_node_path_t* path, /*!< in/out: search path */ + ulint level, /*!< in: index level */ + const buf_block_t* block, /*!< in: block of the page */ + const rec_t* rec, /*!< in: positioned record */ + double mbr_inc); /*!< in: MBR needs to be + enlarged */ + +#define rtr_get_new_ssn_id(index) (index)->assign_ssn() +#define rtr_get_current_ssn_id(index) (index)->ssn() + +/********************************************************************//** +Create a RTree search info structure */ +rtr_info_t* +rtr_create_rtr_info( +/******************/ + bool need_prdt, /*!< in: Whether predicate lock is + needed */ + bool init_matches, /*!< in: Whether to initiate the + "matches" structure for collecting + matched leaf records */ + btr_cur_t* cursor, /*!< in: tree search cursor */ + dict_index_t* index); /*!< in: index struct */ + +/********************************************************************//** +Update a btr_cur_t with rtr_info */ +void +rtr_info_update_btr( +/******************/ + btr_cur_t* cursor, /*!< in/out: tree cursor */ + rtr_info_t* rtr_info); /*!< in: rtr_info to set to the + cursor */ + 
+/********************************************************************//** +Update a btr_cur_t with rtr_info */ +void +rtr_init_rtr_info( +/****************/ + rtr_info_t* rtr_info, /*!< in: rtr_info to set to the + cursor */ + bool need_prdt, /*!< in: Whether predicate lock is + needed */ + btr_cur_t* cursor, /*!< in: tree search cursor */ + dict_index_t* index, /*!< in: index structure */ + bool reinit); /*!< in: Whether this is a reinit */ + +/**************************************************************//** +Clean up Rtree cursor */ +void +rtr_clean_rtr_info( +/*===============*/ + rtr_info_t* rtr_info, /*!< in: RTree search info */ + bool free_all); /*!< in: need to free rtr_info itself */ + +/****************************************************************//** +Get the bounding box content from an index record*/ +void +rtr_get_mbr_from_rec( +/*=================*/ + const rec_t* rec, /*!< in: data tuple */ + const rec_offs* offsets,/*!< in: offsets array */ + rtr_mbr_t* mbr); /*!< out MBR */ + +/****************************************************************//** +Get the bounding box content from a MBR data record */ +void +rtr_get_mbr_from_tuple( +/*===================*/ + const dtuple_t* dtuple, /*!< in: data tuple */ + rtr_mbr* mbr); /*!< out: mbr to fill */ + +/* Get the rtree page father. +@param[in,out] mtr mtr +@param[in] sea_cur search cursor, contains information + about parent nodes in search +@param[in,out] cursor cursor on node pointer record, + its page x-latched +@return whether the cursor was successfully positioned */ +bool rtr_page_get_father(mtr_t *mtr, btr_cur_t *sea_cur, btr_cur_t *cursor) + MY_ATTRIBUTE((nonnull(1,3), warn_unused_result)); + +/************************************************************//** +Returns the father block to a page. It is assumed that mtr holds +an X or SX latch on the tree. +@return rec_get_offsets() of the node pointer record */ +rec_offs* +rtr_page_get_father_block( +/*======================*/ + rec_offs* offsets,/*!< in: work area for the return value */ + mem_heap_t* heap, /*!< in: memory heap to use */ + mtr_t* mtr, /*!< in: mtr */ + btr_cur_t* sea_cur,/*!< in: search cursor, contains information + about parent nodes in search */ + btr_cur_t* cursor);/*!< out: cursor on node pointer record, + its page x-latched */ +/**************************************************************//** +Store the parent path cursor +@return number of cursor stored */ +ulint +rtr_store_parent_path( +/*==================*/ + const buf_block_t* block, /*!< in: block of the page */ + btr_cur_t* btr_cur,/*!< in/out: persistent cursor */ + btr_latch_mode latch_mode, + /*!< in: latch_mode */ + ulint level, /*!< in: index level */ + mtr_t* mtr); /*!< in: mtr */ + +/**************************************************************//** +Initializes and opens a persistent cursor to an index tree. It should be +closed with btr_pcur_close. */ +bool rtr_search( + const dtuple_t* tuple, /*!< in: tuple on which search done */ + btr_latch_mode latch_mode,/*!< in: BTR_MODIFY_LEAF, ... 
*/
+	btr_pcur_t*	cursor, /*!< in: memory buffer for persistent cursor */
+	mtr_t*		mtr)	/*!< in: mtr */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************//**
+Returns the R-Tree node stored in the parent search path
+@return pointer to R-Tree cursor component */
+UNIV_INLINE
+node_visit_t*
+rtr_get_parent_node(
+/*================*/
+	btr_cur_t*	btr_cur,	/*!< in: persistent cursor */
+	ulint		level,		/*!< in: index level of buffer page */
+	ulint		is_insert);	/*!< in: whether it is insert */
+
+/*********************************************************//**
+Returns the R-Tree cursor stored in the parent search path
+@return pointer to R-Tree cursor component */
+UNIV_INLINE
+btr_pcur_t*
+rtr_get_parent_cursor(
+/*==================*/
+	btr_cur_t*	btr_cur,	/*!< in: persistent cursor */
+	ulint		level,		/*!< in: index level of buffer page */
+	ulint		is_insert);	/*!< in: whether insert operation */
+
+MY_ATTRIBUTE((warn_unused_result))
+/*************************************************************//**
+Copy recs from a page to new_block of rtree.
+
+@return error code */
+dberr_t
+rtr_page_copy_rec_list_end_no_locks(
+/*================================*/
+	buf_block_t*	new_block,	/*!< in: index page to copy to */
+	buf_block_t*	block,		/*!< in: index page of rec */
+	rec_t*		rec,		/*!< in: record on page */
+	dict_index_t*	index,		/*!< in: record descriptor */
+	mem_heap_t*	heap,		/*!< in/out: heap memory */
+	rtr_rec_move_t*	rec_move,	/*!< in: recording records moved */
+	ulint		max_move,	/*!< in: num of rec to move */
+	ulint*		num_moved,	/*!< out: num of rec to move */
+	mtr_t*		mtr);		/*!< in: mtr */
+
+MY_ATTRIBUTE((warn_unused_result))
+/*************************************************************//**
+Copy recs till a specified rec from a page to new_block of rtree.
+
+@return error code */
+dberr_t
+rtr_page_copy_rec_list_start_no_locks(
+/*==================================*/
+	buf_block_t*	new_block,	/*!< in: index page to copy to */
+	buf_block_t*	block,		/*!< in: index page of rec */
+	rec_t*		rec,		/*!< in: record on page */
+	dict_index_t*	index,		/*!< in: record descriptor */
+	mem_heap_t*	heap,		/*!< in/out: heap memory */
+	rtr_rec_move_t*	rec_move,	/*!< in: recording records moved */
+	ulint		max_move,	/*!< in: num of rec to move */
+	ulint*		num_moved,	/*!< out: num of rec to move */
+	mtr_t*		mtr);		/*!< in: mtr */
+
+/****************************************************************//**
+Merge 2 mbrs and update the mbr that cursor is on. */
+void
+rtr_merge_and_update_mbr(
+/*=====================*/
+	btr_cur_t*		cursor,		/*!< in/out: cursor */
+	btr_cur_t*		cursor2,	/*!< in: the other cursor */
+	rec_offs*		offsets,	/*!< in: rec offsets */
+	rec_offs*		offsets2,	/*!< in: rec offsets */
+	page_t*			child_page,	/*!< in: the child page. */
+	mtr_t*			mtr);		/*!< in: mtr */
+
+/*************************************************************//**
+Deletes on the upper level the node pointer to a page.
*/ +void +rtr_node_ptr_delete( +/*================*/ + btr_cur_t* cursor, /*!< in: search cursor, contains information + about parent nodes in search */ + mtr_t* mtr); /*!< in: mtr */ + +/****************************************************************//** +Check two MBRs are identical or need to be merged */ +bool +rtr_merge_mbr_changed( +/*==================*/ + btr_cur_t* cursor, /*!< in: cursor */ + btr_cur_t* cursor2, /*!< in: the other cursor */ + rec_offs* offsets, /*!< in: rec offsets */ + rec_offs* offsets2, /*!< in: rec offsets */ + rtr_mbr_t* new_mbr); /*!< out: MBR to update */ + + +/**************************************************************//** +Update the mbr field of a spatial index row. */ +void +rtr_update_mbr_field( +/*=================*/ + btr_cur_t* cursor, /*!< in: cursor pointed to rec.*/ + rec_offs* offsets, /*!< in: offsets on rec. */ + btr_cur_t* cursor2, /*!< in/out: cursor pointed to rec + that should be deleted. + this cursor is for btr_compress to + delete the merged page's father rec.*/ + page_t* child_page, /*!< in: child page. */ + rtr_mbr_t* new_mbr, /*!< in: the new mbr. */ + rec_t* new_rec, /*!< in: rec to use */ + mtr_t* mtr); /*!< in: mtr */ + +/**************************************************************//** +Check whether a Rtree page is child of a parent page +@return true if there is child/parent relationship */ +bool +rtr_check_same_block( +/*=================*/ + dict_index_t* index, /*!< in: index tree */ + btr_cur_t* cur, /*!< in/out: position at the parent entry + pointing to the child if successful */ + buf_block_t* parentb,/*!< in: parent page to check */ + mem_heap_t* heap); /*!< in: memory heap */ + +/*********************************************************************//** +Sets pointer to the data and length in a field. */ +UNIV_INLINE +void +rtr_write_mbr( +/*==========*/ + byte* data, /*!< out: data */ + const rtr_mbr_t* mbr); /*!< in: data */ + +/*********************************************************************//** +Sets pointer to the data and length in a field. */ +UNIV_INLINE +void +rtr_read_mbr( +/*==========*/ + const byte* data, /*!< in: data */ + rtr_mbr_t* mbr); /*!< out: data */ + +/**************************************************************//** +Check whether a discarding page is in anyone's search path */ +void +rtr_check_discard_page( +/*===================*/ + dict_index_t* index, /*!< in: index */ + btr_cur_t* cursor, /*!< in: cursor on the page to discard: not on + the root page */ + buf_block_t* block); /*!< in: block of page to be discarded */ + +/********************************************************************//** +Reinitialize a RTree search info */ +UNIV_INLINE +void +rtr_info_reinit_in_cursor( +/************************/ + btr_cur_t* cursor, /*!< in/out: tree cursor */ + dict_index_t* index, /*!< in: index struct */ + bool need_prdt); /*!< in: Whether predicate lock is + needed */ + +/** Estimates the number of rows in a given area. 
+@param[in] index index +@param[in] tuple range tuple containing mbr, may also be empty tuple +@param[in] mode search mode +@return estimated number of rows */ +ha_rows +rtr_estimate_n_rows_in_range( + dict_index_t* index, + const dtuple_t* tuple, + page_cur_mode_t mode); + +#include "gis0rtree.inl" +#endif /*!< gis0rtree.h */ diff --git a/storage/innobase/include/gis0rtree.inl b/storage/innobase/include/gis0rtree.inl new file mode 100644 index 00000000..5101eeb6 --- /dev/null +++ b/storage/innobase/include/gis0rtree.inl @@ -0,0 +1,245 @@ +/***************************************************************************** + +Copyright (c) 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/gis0rtree.inl +R-tree inline code + +Created 2013/03/27 Jimmy Yang and Allen Lai +***********************************************************************/ + +/**************************************************************//** +Calculates the MBR that encapsulates all the records on a page. */ +UNIV_INLINE +void +rtr_page_cal_mbr( +/*=============*/ + const dict_index_t* index, /*!< in: index */ + const buf_block_t* block, /*!< in: buffer block */ + rtr_mbr_t* rtr_mbr,/*!< out: MBR encapsulates the page */ + mem_heap_t* heap) /*!< in: heap for the memory + allocation */ +{ + page_t* page; + rec_t* rec; + const byte* field; + ulint len; + rec_offs* offsets = NULL; + double bmin, bmax; + double* amin; + double* amax; + ulint inc = 0; + double* mbr; + + rtr_mbr->xmin = DBL_MAX; + rtr_mbr->ymin = DBL_MAX; + rtr_mbr->xmax = -DBL_MAX; + rtr_mbr->ymax = -DBL_MAX; + + mbr = reinterpret_cast<double*>(rtr_mbr); + + page = buf_block_get_frame(block); + + rec = page_rec_get_next(page_get_infimum_rec(page)); + if (UNIV_UNLIKELY(!rec)) { + return; + } + offsets = rec_get_offsets(rec, index, offsets, page_is_leaf(page) + ? index->n_fields : 0, + ULINT_UNDEFINED, &heap); + + do { + /* The mbr address is in the first field.
*/ + field = rec_get_nth_field(rec, offsets, 0, &len); + + ut_ad(len == DATA_MBR_LEN); + inc = 0; + for (unsigned i = 0; i < SPDIMS; i++) { + bmin = mach_double_read(field + inc); + bmax = mach_double_read(field + inc + sizeof(double)); + + amin = mbr + i * SPDIMS; + amax = mbr + i * SPDIMS + 1; + + if (*amin > bmin) + *amin = bmin; + if (*amax < bmax) + *amax = bmax; + + inc += 2 * sizeof(double); + } + + rec = page_rec_get_next(rec); + + if (rec == NULL) { + break; + } + } while (!page_rec_is_supremum(rec)); +} + +/**************************************************************//** +Push a non-leaf index node to the search path */ +UNIV_INLINE +void +rtr_non_leaf_stack_push( +/*====================*/ + rtr_node_path_t* path, /*!< in/out: search path */ + uint32_t pageno, /*!< in: pageno to insert */ + node_seq_t seq_no, /*!< in: Node sequence num */ + ulint level, /*!< in: index page level */ + uint32_t child_no, /*!< in: child page no */ + btr_pcur_t* cursor, /*!< in: position cursor */ + double mbr_inc) /*!< in: MBR needs to be + enlarged */ +{ + node_visit_t insert_val; + + insert_val.page_no = pageno; + insert_val.seq_no = seq_no; + insert_val.level = level; + insert_val.child_no = child_no; + insert_val.cursor = cursor; + insert_val.mbr_inc = mbr_inc; + + path->push_back(insert_val); + +#ifdef RTR_SEARCH_DIAGNOSTIC + fprintf(stderr, "INNODB_RTR: Push page %d, level %d, seq %d" + " to search stack \n", + static_cast<int>(pageno), static_cast<int>(level), + static_cast<int>(seq_no)); +#endif /* RTR_SEARCH_DIAGNOSTIC */ +} + +/*********************************************************************//** +Writes an MBR to a data buffer. */ +UNIV_INLINE +void +rtr_write_mbr( +/*==========*/ + byte* data, /*!< out: data */ + const rtr_mbr_t* mbr) /*!< in: MBR */ +{ + const double* my_mbr = reinterpret_cast<const double*>(mbr); + + for (unsigned i = 0; i < SPDIMS * 2; i++) { + mach_double_write(data + i * sizeof(double), my_mbr[i]); + } +}
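[Editor's note] rtr_write_mbr() above and rtr_read_mbr() below define the on-page MBR layout: SPDIMS * 2 doubles, one (min, max) pair per dimension, written back to back. A standalone round-trip sketch of that layout (illustrative only: it hard-codes two dimensions, matching SPDIMS in gis0geo.h, and uses host byte order via memcpy, whereas mach_double_write()/mach_double_read() use InnoDB's canonical on-disk byte order):

#include <cstring>

enum { DEMO_SPDIMS = 2 };	/* two dimensions, as for SPDIMS */

/* Serialize xmin, xmax, ymin, ymax as four consecutive doubles. */
static void
write_mbr_demo(unsigned char* data, const double* mbr)
{
	for (int i = 0; i < DEMO_SPDIMS * 2; i++) {
		std::memcpy(data + i * sizeof(double), &mbr[i],
			    sizeof(double));
	}
}

/* Deserialize the four doubles back into an MBR array. */
static void
read_mbr_demo(const unsigned char* data, double* mbr)
{
	for (int i = 0; i < DEMO_SPDIMS * 2; i++) {
		std::memcpy(&mbr[i], data + i * sizeof(double),
			    sizeof(double));
	}
}

Because rtr_mbr_t is laid out as xmin, xmax, ymin, ymax, the real functions simply reinterpret the struct as an array of doubles, as the casts in their bodies show.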
+/*********************************************************************//** +Reads an MBR from a data buffer. */ +UNIV_INLINE +void +rtr_read_mbr( +/*==========*/ + const byte* data, /*!< in: data */ + rtr_mbr_t* mbr) /*!< out: MBR */ +{ + for (unsigned i = 0; i < SPDIMS * 2; i++) { + (reinterpret_cast<double*>(mbr))[i] = mach_double_read( + data + + i * sizeof(double)); + } +} + +/*********************************************************//** +Returns the R-Tree node stored in the parent search path +@return pointer to R-Tree cursor component in the parent path, +NULL if parent path is empty or index is larger than num of items contained */ +UNIV_INLINE +node_visit_t* +rtr_get_parent_node( +/*================*/ + btr_cur_t* btr_cur, /*!< in: persistent cursor */ + ulint level, /*!< in: index level of buffer page */ + ulint is_insert) /*!< in: whether it is insert */ +{ + ulint num; + ulint tree_height = btr_cur->tree_height; + node_visit_t* found_node = NULL; + + if (level >= tree_height) { + return(NULL); + } + + mysql_mutex_lock(&btr_cur->rtr_info->rtr_path_mutex); + + num = btr_cur->rtr_info->parent_path->size(); + + if (!num) { + mysql_mutex_unlock(&btr_cur->rtr_info->rtr_path_mutex); + return(NULL); + } + + if (is_insert) { + ulint idx = tree_height - level - 1; + ut_ad(idx < num); + + found_node = &(*btr_cur->rtr_info->parent_path)[idx]; + } else { + node_visit_t* node; + + while (num > 0) { + node = &(*btr_cur->rtr_info->parent_path)[num - 1]; + + if (node->level == level) { + found_node = node; + break; + } + num--; + } + } + + mysql_mutex_unlock(&btr_cur->rtr_info->rtr_path_mutex); + + return(found_node); +} + +/*********************************************************//** +Returns the R-Tree cursor stored in the parent search path +@return pointer to R-Tree cursor component */ +UNIV_INLINE +btr_pcur_t* +rtr_get_parent_cursor( +/*==================*/ + btr_cur_t* btr_cur, /*!< in: persistent cursor */ + ulint level, /*!< in: index level of buffer page */ + ulint is_insert) /*!< in: whether insert operation */ +{ + node_visit_t* found_node = rtr_get_parent_node( + btr_cur, level, is_insert); + + return((found_node) ? found_node->cursor : NULL); +} + +/********************************************************************//** +Reinitialize an R-tree search info in btr_cur_t */ +UNIV_INLINE +void +rtr_info_reinit_in_cursor( +/************************/ + btr_cur_t* cursor, /*!< in/out: tree cursor */ + dict_index_t* index, /*!< in: index struct */ + bool need_prdt) /*!< in: Whether predicate lock is + needed */ +{ + rtr_clean_rtr_info(cursor->rtr_info, false); + rtr_init_rtr_info(cursor->rtr_info, need_prdt, cursor, index, true); +} diff --git a/storage/innobase/include/gis0type.h b/storage/innobase/include/gis0type.h new file mode 100644 index 00000000..d6a4ef67 --- /dev/null +++ b/storage/innobase/include/gis0type.h @@ -0,0 +1,146 @@ +/***************************************************************************** + +Copyright (c) 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/gis0type.h +R-tree header file + +Created 2013/03/27 Jimmy Yang +***********************************************************************/ + +#ifndef gis0type_h +#define gis0type_h + +#include "buf0buf.h" +#include "data0type.h" +#include "data0types.h" +#include "dict0types.h" +#include "ut0vec.h" +#include "gis0geo.h" + +#include <vector> +#include <forward_list> + +/** Node Sequence Number. Only updated when page splits */ +typedef uint32_t node_seq_t; + +/* R-tree internal non-leaf nodes to be searched, from root to leaf */ +struct node_visit_t { + uint32_t page_no; /*!< the page number */ + node_seq_t seq_no; /*!< the SSN (split sequence number) */ + ulint level; /*!< the page's index level */ + uint32_t child_no; /*!< child page num if for parent + recording */ + btr_pcur_t* cursor; /*!< cursor structure if we positioned + FIXME: there is no need to use whole + btr_pcur_t, just the position related + members */ + double mbr_inc; /*!< whether this node needs to be + enlarged for insertion */ +}; + +typedef std::vector<node_visit_t, ut_allocator<node_visit_t> > rtr_node_path_t; + +typedef struct rtr_rec { + rec_t* r_rec; /*!< matched record */ + bool locked; /*!< whether the record is locked */ +} rtr_rec_t; + +typedef std::vector<rtr_rec_t, ut_allocator<rtr_rec_t> > rtr_rec_vector; + +/* Structure for matched records on the leaf page */ +typedef struct matched_rec { + byte* bufp; /*!< aligned buffer pointer */ + byte rec_buf[UNIV_PAGE_SIZE_MAX * 2]; + /*!< buffer used to copy matching rec */ + buf_block_t block; /*!< the shadow buffer block */ + ulint used; /*!< memory used */ + rtr_rec_vector* matched_recs; /*!< vector holding the matching rec */ + mysql_mutex_t rtr_match_mutex;/*!< mutex protecting the matched_recs + vector */ + bool valid; /*!< whether result in matched_recs + or this search is valid (page not + dropped) */ + bool locked; /*!< whether these recs are locked */ +} matched_rec_t; + +/* In memory representation of a minimum bounding rectangle */ +typedef struct rtr_mbr { + double xmin; /*!< minimum on x */ + double xmax; /*!< maximum on x */ + double ymin; /*!< minimum on y */ + double ymax; /*!< maximum on y */ +} rtr_mbr_t; + +/* Maximum index level for R-Tree, this is consistent with BTR_MAX_LEVELS */ +#define RTR_MAX_LEVELS 100 + +/* Number of pages we latch at leaf level when there is possible Tree +modification (split, shrink), we always latch left, current +and right pages */ +#define RTR_LEAF_LATCH_NUM 3 + +/** Vectors holding the matching internal pages/nodes and leaf records */ +typedef struct rtr_info{ + rtr_node_path_t*path; /*!< vector holding matching pages */ + rtr_node_path_t*parent_path; + /*!< vector holding parent pages during + search */ + matched_rec_t* matches;/*!< struct holding matching leaf records */ + mysql_mutex_t rtr_path_mutex; + /*!< mutex protecting the "path" vector */ + rtr_mbr_t mbr; /*!< the search MBR */ + que_thr_t* thr; /*!< the search thread */ + mem_heap_t* heap; /*!< memory heap */ + btr_cur_t* cursor; /*!< cursor used for search */ + dict_index_t* index; /*!< index it is searching */ + bool need_prdt_lock; + /*!< whether we will need predicate lock + the tree */ + bool need_page_lock; + /*!< whether we will need predicate page lock + the tree */
+ bool allocated;/*!< whether this structure is allocated + or on stack */ + bool mbr_adj;/*!< whether mbr will need to be enlarged + for an insertion operation */ + bool fd_del; /*!< found deleted row */ + const dtuple_t* search_tuple; + /*!< search tuple being used */ + page_cur_mode_t search_mode; + /*!< current search mode */ +} rtr_info_t; + +/* Tracking structure for all ongoing searches on an index */ +struct rtr_info_track_t { + /** Active search info */ + std::forward_list<rtr_info_t*, ut_allocator<rtr_info_t*> > rtr_active; + mysql_mutex_t rtr_active_mutex; + /*!< mutex to protect + rtr_active */ +}; + +/* This is to record the record movement between pages. Used for corresponding +lock movement */ +typedef struct rtr_rec_move { + rec_t* old_rec; /*!< record being moved in old page */ + rec_t* new_rec; /*!< new record location */ + bool moved; /*!< whether locks are moved too */ +} rtr_rec_move_t; +#endif /* gis0type_h */ diff --git a/storage/innobase/include/ha0ha.h b/storage/innobase/include/ha0ha.h new file mode 100644 index 00000000..5aaa559b --- /dev/null +++ b/storage/innobase/include/ha0ha.h @@ -0,0 +1,60 @@ +/***************************************************************************** + +Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/ha0ha.h +The hash table interface for the adaptive hash index + +Created 8/18/1994 Heikki Tuuri +*******************************************************/ + +#ifndef ha0ha_h +#define ha0ha_h + +#include "hash0hash.h" +#include "page0types.h" +#include "buf0types.h" +#include "rem0types.h" + +#ifdef BTR_CUR_HASH_ADAPT +/*************************************************************//** +Looks for an element in a hash table.
+@return pointer to the data of the first hash table node in chain +having the fold number, NULL if not found */ +UNIV_INLINE +const rec_t* +ha_search_and_get_data( +/*===================*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold); /*!< in: folded value of the searched data */ + +/** The hash table external chain node */ +struct ha_node_t { + ulint fold; /*!< fold value for the data */ + ha_node_t* next; /*!< next chain node or NULL if none */ +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + buf_block_t* block; /*!< buffer block containing the data, or NULL */ +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + const rec_t* data; /*!< pointer to the data */ +}; + +#include "ha0ha.inl" +#endif /* BTR_CUR_HASH_ADAPT */ + +#endif diff --git a/storage/innobase/include/ha0ha.inl b/storage/innobase/include/ha0ha.inl new file mode 100644 index 00000000..0b256257 --- /dev/null +++ b/storage/innobase/include/ha0ha.inl @@ -0,0 +1,154 @@ +/***************************************************************************** + +Copyright (c) 1994, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/ha0ha.ic +The hash table interface for the adaptive hash index + +Created 8/18/1994 Heikki Tuuri +*************************************************************************/ + +#ifdef BTR_CUR_HASH_ADAPT +#include "btr0types.h" + +/******************************************************************//** +Gets a hash node data. +@return pointer to the data */ +UNIV_INLINE +const rec_t* +ha_node_get_data( +/*=============*/ + const ha_node_t* node) /*!< in: hash chain node */ +{ + return(node->data); +} + +/******************************************************************//** +Sets hash node data. */ +UNIV_INLINE +void +ha_node_set_data_func( +/*==================*/ + ha_node_t* node, /*!< in: hash chain node */ +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + buf_block_t* block, /*!< in: buffer block containing the data */ +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + const rec_t* data) /*!< in: pointer to the data */ +{ +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG + node->block = block; +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + node->data = data; +} + +#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG +/** Sets hash node data. +@param n in: hash chain node +@param b in: buffer block containing the data +@param d in: pointer to the data */ +# define ha_node_set_data(n,b,d) ha_node_set_data_func(n,b,d) +#else /* UNIV_AHI_DEBUG || UNIV_DEBUG */ +/** Sets hash node data. 
+@param n in: hash chain node +@param b in: buffer block containing the data +@param d in: pointer to the data */ +# define ha_node_set_data(n,b,d) ha_node_set_data_func(n,d) +#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ + +/******************************************************************//** +Gets the next node in a hash chain. +@return next node, NULL if none */ +UNIV_INLINE +ha_node_t* +ha_chain_get_next( +/*==============*/ + const ha_node_t* node) /*!< in: hash chain node */ +{ + return(node->next); +} + +/******************************************************************//** +Gets the first node in a hash chain. +@return first node, NULL if none */ +UNIV_INLINE +ha_node_t* +ha_chain_get_first( +/*===============*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold) /*!< in: fold value determining the chain */ +{ + return static_cast<ha_node_t*>(table->array[table->calc_hash(fold)].node); +} + +/*************************************************************//** +Looks for an element in a hash table. +@return pointer to the data of the first hash table node in chain +having the fold number, NULL if not found */ +UNIV_INLINE +const rec_t* +ha_search_and_get_data( +/*===================*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold) /*!< in: folded value of the searched data */ +{ + ut_ad(btr_search_enabled); + + for (const ha_node_t* node = ha_chain_get_first(table, fold); + node != NULL; + node = ha_chain_get_next(node)) { + + if (node->fold == fold) { + + return(node->data); + } + } + + return(NULL); +} + +/*********************************************************//** +Looks for an element when we know the pointer to the data. +@return pointer to the hash table node, NULL if not found in the table */ +UNIV_INLINE +ha_node_t* +ha_search_with_data( +/*================*/ + hash_table_t* table, /*!< in: hash table */ + ulint fold, /*!< in: folded value of the searched data */ + const rec_t* data) /*!< in: pointer to the data */ +{ + ha_node_t* node; + + ut_ad(btr_search_enabled); + + node = ha_chain_get_first(table, fold); + + while (node) { + if (node->data == data) { + + return(node); + } + + node = ha_chain_get_next(node); + } + + return(NULL); +} + +#endif /* BTR_CUR_HASH_ADAPT */ diff --git a/storage/innobase/include/ha0storage.h b/storage/innobase/include/ha0storage.h new file mode 100644 index 00000000..fdf50a2e --- /dev/null +++ b/storage/innobase/include/ha0storage.h @@ -0,0 +1,137 @@ +/***************************************************************************** + +Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/ha0storage.h +Hash storage. +Provides a data structure that stores chunks of data in +its own storage, avoiding duplicates.
+ +Created September 22, 2007 Vasil Dimov +*******************************************************/ + +#ifndef ha0storage_h +#define ha0storage_h + +#include "univ.i" + +/** This value is used by default by ha_storage_create(). More memory +is allocated later when/if it is needed. */ +#define HA_STORAGE_DEFAULT_HEAP_BYTES 1024 + +/** This value is used by default by ha_storage_create(). It is a +constant per ha_storage's lifetime. */ +#define HA_STORAGE_DEFAULT_HASH_CELLS 4096 + +/** Hash storage */ +struct ha_storage_t; + +/*******************************************************************//** +Creates a hash storage. If any of the parameters is 0, then a default +value is used. +@return own: hash storage */ +UNIV_INLINE +ha_storage_t* +ha_storage_create( +/*==============*/ + ulint initial_heap_bytes, /*!< in: initial heap's size */ + ulint initial_hash_cells); /*!< in: initial number of cells + in the hash table */ + +/*******************************************************************//** +Copies data into the storage and returns a pointer to the copy. If the +same data chunk is already present, then pointer to it is returned. +Data chunks are considered to be equal if len1 == len2 and +memcmp(data1, data2, len1) == 0. If "data" is not present (and thus +data_len bytes need to be allocated) and the size of storage is going to +become more than "memlim" then "data" is not added and NULL is returned. +To disable this behavior "memlim" can be set to 0, which stands for +"no limit". +@return pointer to the copy */ +const void* +ha_storage_put_memlim( +/*==================*/ + ha_storage_t* storage, /*!< in/out: hash storage */ + const void* data, /*!< in: data to store */ + ulint data_len, /*!< in: data length */ + ulint memlim); /*!< in: memory limit to obey */ + +/*******************************************************************//** +Same as ha_storage_put_memlim() but without memory limit. +@param storage in/out: hash storage +@param data in: data to store +@param data_len in: data length +@return pointer to the copy of the string */ +#define ha_storage_put(storage, data, data_len) \ + ha_storage_put_memlim((storage), (data), (data_len), 0) + +/*******************************************************************//** +Copies string into the storage and returns a pointer to the copy. If the +same string is already present, then pointer to it is returned. +Strings are considered to be equal if strcmp(str1, str2) == 0. +@param storage in/out: hash storage +@param str in: string to put +@return pointer to the copy of the string */ +#define ha_storage_put_str(storage, str) \ + ((const char*) ha_storage_put((storage), (str), strlen(str) + 1)) + +/*******************************************************************//** +Copies string into the storage and returns a pointer to the copy obeying +a memory limit. +If the same string is already present, then pointer to it is returned. +Strings are considered to be equal if strcmp(str1, str2) == 0. +@param storage in/out: hash storage +@param str in: string to put +@param memlim in: memory limit to obey +@return pointer to the copy of the string */ +#define ha_storage_put_str_memlim(storage, str, memlim) \ + ((const char*) ha_storage_put_memlim((storage), (str), \ + strlen(str) + 1, (memlim))) + +/*******************************************************************//** +Empties a hash storage, freeing memory occupied by data chunks. +This invalidates any pointers previously returned by ha_storage_put(). 
+The hash storage is not invalidated itself and can be used again. */ +UNIV_INLINE +void +ha_storage_empty( +/*=============*/ + ha_storage_t** storage); /*!< in/out: hash storage */ + +/*******************************************************************//** +Frees a hash storage and everything it contains, it cannot be used after +this call. +This invalidates any pointers previously returned by ha_storage_put(). */ +UNIV_INLINE +void +ha_storage_free( +/*============*/ + ha_storage_t* storage); /*!< in, own: hash storage */ + +/*******************************************************************//** +Gets the size of the memory used by a storage. +@return bytes used */ +UNIV_INLINE +ulint +ha_storage_get_size( +/*================*/ + const ha_storage_t* storage); /*!< in: hash storage */ + +#include "ha0storage.inl" + +#endif /* ha0storage_h */ diff --git a/storage/innobase/include/ha0storage.inl b/storage/innobase/include/ha0storage.inl new file mode 100644 index 00000000..df9679cf --- /dev/null +++ b/storage/innobase/include/ha0storage.inl @@ -0,0 +1,142 @@ +/***************************************************************************** + +Copyright (c) 2007, 2013, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/ha0storage.ic +Hash storage. +Provides a data structure that stores chunks of data in +its own storage, avoiding duplicates. + +Created September 24, 2007 Vasil Dimov +*******************************************************/ + +#include "hash0hash.h" +#include "mem0mem.h" + +/** Hash storage for strings */ +struct ha_storage_t { + mem_heap_t* heap; /*!< memory heap from which memory is + allocated */ + hash_table_t hash; /*!< hash table used to avoid + duplicates */ +}; + +/** Objects of this type are stored in ha_storage_t */ +struct ha_storage_node_t { + ulint data_len;/*!< length of the data */ + const void* data; /*!< pointer to data */ + ha_storage_node_t* next; /*!< next node in hash chain */ +}; + +/*******************************************************************//** +Creates a hash storage. If any of the parameters is 0, then a default +value is used. 
+@return own: hash storage */ +UNIV_INLINE +ha_storage_t* +ha_storage_create( +/*==============*/ + ulint initial_heap_bytes, /*!< in: initial heap's size */ + ulint initial_hash_cells) /*!< in: initial number of cells + in the hash table */ +{ + ha_storage_t* storage; + mem_heap_t* heap; + + if (initial_heap_bytes == 0) { + + initial_heap_bytes = HA_STORAGE_DEFAULT_HEAP_BYTES; + } + + if (initial_hash_cells == 0) { + + initial_hash_cells = HA_STORAGE_DEFAULT_HASH_CELLS; + } + + /* we put "storage" within "storage->heap" */ + + heap = mem_heap_create(sizeof(ha_storage_t) + + initial_heap_bytes); + + storage = (ha_storage_t*) mem_heap_alloc(heap, + sizeof(ha_storage_t)); + + storage->heap = heap; + storage->hash.create(initial_hash_cells); + + return(storage); +} + +/*******************************************************************//** +Empties a hash storage, freeing memory occupied by data chunks. +This invalidates any pointers previously returned by ha_storage_put(). +The hash storage is not invalidated itself and can be used again. */ +UNIV_INLINE +void +ha_storage_empty( +/*=============*/ + ha_storage_t** storage) /*!< in/out: hash storage */ +{ + ha_storage_t temp_storage; + + temp_storage.heap = (*storage)->heap; + temp_storage.hash = (*storage)->hash; + + temp_storage.hash.clear(); + mem_heap_empty(temp_storage.heap); + + *storage = (ha_storage_t*) mem_heap_alloc(temp_storage.heap, + sizeof(ha_storage_t)); + + (*storage)->heap = temp_storage.heap; + (*storage)->hash = temp_storage.hash; +} + +/*******************************************************************//** +Frees a hash storage and everything it contains, it cannot be used after +this call. +This invalidates any pointers previously returned by ha_storage_put(). */ +UNIV_INLINE +void +ha_storage_free( +/*============*/ + ha_storage_t* storage) /*!< in, own: hash storage */ +{ + storage->hash.free(); + mem_heap_free(storage->heap); +} + +/*******************************************************************//** +Gets the size of the memory used by a storage. +@return bytes used */ +UNIV_INLINE +ulint +ha_storage_get_size( +/*================*/ + const ha_storage_t* storage) /*!< in: hash storage */ +{ + ulint ret; + + ret = mem_heap_get_size(storage->heap); + + /* this assumes hash->heap and hash->heaps are NULL */ + ret += sizeof(hash_table_t); + ret += sizeof(hash_cell_t) * storage->hash.n_cells; + + return(ret); +}
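[Editor's note] A hypothetical usage sketch of the ha_storage API implemented above (it assumes an InnoDB build environment for ha_storage_t, ulint and the ut_ad() debug assertion; every call used is one declared in ha0storage.h):

/* Store two identical chunks and observe the deduplication. */
static void
ha_storage_usage_demo()
{
	ha_storage_t*	storage = ha_storage_create(0, 0); /* use defaults */

	/* Identical chunks are stored only once: both puts return
	the same pointer into the storage's private heap. */
	const void*	p1 = ha_storage_put(storage, "abc", 4);
	const void*	p2 = ha_storage_put(storage, "abc", 4);
	ut_ad(p1 == p2);

	/* Convenience wrapper for NUL-terminated strings. */
	const char*	str = ha_storage_put_str(storage, "hello");
	ut_ad(str != NULL);

	ulint		size = ha_storage_get_size(storage);
	ut_ad(size > 0);

	ha_storage_empty(&storage);	/* p1, p2 and str now dangle */
	ha_storage_free(storage);
}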
diff --git a/storage/innobase/include/ha_prototypes.h b/storage/innobase/include/ha_prototypes.h new file mode 100644 index 00000000..d5239ec3 --- /dev/null +++ b/storage/innobase/include/ha_prototypes.h @@ -0,0 +1,476 @@ +/***************************************************************************** + +Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*******************************************************************//** +@file include/ha_prototypes.h +Prototypes for global functions in ha_innodb.cc that are called by +InnoDB C code. + +NOTE: This header is intended to insulate InnoDB from SQL names and functions. +Do not include any headers other than univ.i into this unless they are very +simple headers. +************************************************************************/ + +#ifndef HA_INNODB_PROTOTYPES_H +#define HA_INNODB_PROTOTYPES_H + +#include "univ.i" + +#ifndef UNIV_INNOCHECKSUM + +/* Forward declarations */ +class THD; +class Field; + +// JAN: TODO missing features: +#undef MYSQL_FT_INIT_EXT +#undef MYSQL_PFS +#undef MYSQL_STORE_FTS_DOC_ID + +/*******************************************************************//** +Formats the raw data in "data" (in InnoDB on-disk format) that is of +type DATA_(CHAR|VARCHAR|MYSQL|VARMYSQL) using "charset_coll" and writes +the result to "buf". The result is converted to "system_charset_info". +Not more than "buf_size" bytes are written to "buf". +The result is always NUL-terminated (provided buf_size > 0) and the +number of bytes that were written to "buf" is returned (including the +terminating NUL). +@return number of bytes that were written */ +ulint +innobase_raw_format( +/*================*/ + const char* data, /*!< in: raw data */ + ulint data_len, /*!< in: raw data length + in bytes */ + ulint charset_coll, /*!< in: charset collation */ + char* buf, /*!< out: output buffer */ + ulint buf_size); /*!< in: output buffer size + in bytes */ + +/*****************************************************************//** +Invalidates the MySQL query cache for the table. */ +void +innobase_invalidate_query_cache( +/*============================*/ + trx_t* trx, /*!< in: transaction which + modifies the table */ + const char* full_name); /*!< in: concatenation of + database name, path separator, + table name, null char NUL; + NOTE that in Windows this is + always in LOWER CASE! */ + +/** Quote a standard SQL identifier like tablespace, index or column name. +@param[in] file output stream +@param[in] trx InnoDB transaction, or NULL +@param[in] id identifier to quote */ +void +innobase_quote_identifier( + FILE* file, + trx_t* trx, + const char* id); + +/** Quote a standard SQL identifier like tablespace, index or column name. +Return the string as a std::string object. +@param[in] trx InnoDB transaction, or NULL +@param[in] id identifier to quote +@return a std::string with id properly quoted. */ +std::string +innobase_quote_identifier( + trx_t* trx, + const char* id); + +/*****************************************************************//** +Convert a table name to the MySQL system_charset_info (UTF-8).
+@return pointer to the end of buf */ +char* +innobase_convert_name( +/*==================*/ + char* buf, /*!< out: buffer for converted identifier */ + ulint buflen, /*!< in: length of buf, in bytes */ + const char* id, /*!< in: table name to convert */ + ulint idlen, /*!< in: length of id, in bytes */ + THD* thd); /*!< in: MySQL connection thread, or NULL */ + +/******************************************************************//** +Returns true if the transaction this thread is processing has edited +non-transactional tables. Used by the deadlock detector when deciding +which transaction to rollback in case of a deadlock - we try to avoid +rolling back transactions that have edited non-transactional tables. +@return true if non-transactional tables have been edited */ +ibool +thd_has_edited_nontrans_tables( +/*===========================*/ + THD* thd); /*!< in: thread handle */ + +/*************************************************************//** +Prints info of a THD object (== user session thread) to the given file. */ +void +innobase_mysql_print_thd( +/*=====================*/ + FILE* f, /*!< in: output stream */ + THD* thd, /*!< in: pointer to a MySQL THD object */ + uint max_query_len); /*!< in: max query length to print, or 0 to + use the default max length */ + +/** Converts a MySQL type to an InnoDB type. Note that this function returns +the 'mtype' of InnoDB. InnoDB differentiates between MySQL's old <= 4.1 +VARCHAR and the new true VARCHAR in >= 5.0.3 by the 'prtype'. +@param[out] unsigned_flag DATA_UNSIGNED if an 'unsigned type'; +at least ENUM and SET, and unsigned integer types are 'unsigned types' +@param[in] f MySQL Field +@return DATA_BINARY, DATA_VARCHAR, ... */ +uint8_t +get_innobase_type_from_mysql_type(unsigned *unsigned_flag, const Field *field); + +/******************************************************************//** +Compares NUL-terminated UTF-8 strings case insensitively. +@return 0 if a=b, <0 if a<b, >0 if a>b */ +int +innobase_strcasecmp( +/*================*/ + const char* a, /*!< in: first string to compare */ + const char* b); /*!< in: second string to compare */ + +/** Strip dir name from a full path name and return only the file name +@param[in] path_name full path name +@return file name or "null" if no file name */ +const char* +innobase_basename( + const char* path_name); + +/******************************************************************//** +Converts an identifier to a table name. */ +void +innobase_convert_from_table_id( +/*===========================*/ + CHARSET_INFO* cs, /*!< in: the 'from' character set */ + char* to, /*!< out: converted identifier */ + const char* from, /*!< in: identifier to convert */ + ulint len); /*!< in: length of 'to', in bytes; should + be at least 5 * strlen(to) + 1 */ +/******************************************************************//** +Converts an identifier to UTF-8. */ +void +innobase_convert_from_id( +/*=====================*/ + CHARSET_INFO* cs, /*!< in: the 'from' character set */ + char* to, /*!< out: converted identifier */ + const char* from, /*!< in: identifier to convert */ + ulint len); /*!< in: length of 'to', in bytes; + should be at least 3 * strlen(to) + 1 */ +/******************************************************************//** +Makes all characters in a NUL-terminated UTF-8 string lower case.
*/ +void +innobase_casedn_str( +/*================*/ + char* a); /*!< in/out: string to put in lower case */ + +#ifdef WITH_WSREP +ulint wsrep_innobase_mysql_sort(int mysql_type, uint charset_number, + unsigned char* str, ulint str_length, + ulint buf_length); +#endif /* WITH_WSREP */ + +extern "C" struct charset_info_st *thd_charset(THD *thd); + +/** Get high resolution timestamp for the current query start time. +The timestamp is not anchored to any specific point in time, +but can be used for comparison. +@param thd user thread +@retval timestamp in microseconds precision +*/ +extern "C" unsigned long long thd_start_utime(const MYSQL_THD thd); + + +/** Determines the current SQL statement. +Thread unsafe, can only be called from the thread owning the THD. +@param[in] thd MySQL thread handle +@param[out] length Length of the SQL statement +@return SQL statement string */ +const char* +innobase_get_stmt_unsafe( + THD* thd, + size_t* length); + +/******************************************************************//** +This function is used to find the storage length in bytes of the first n +characters for prefix indexes using a multibyte character set. The function +finds charset information and returns length of prefix_len characters in the +index field in bytes. +@return number of bytes occupied by the first n characters */ +ulint +innobase_get_at_most_n_mbchars( +/*===========================*/ + ulint charset_id, /*!< in: character set id */ + ulint prefix_len, /*!< in: prefix length in bytes of the index + (this has to be divided by mbmaxlen to get the + number of CHARACTERS n in the prefix) */ + ulint data_len, /*!< in: length of the string in bytes */ + const char* str); /*!< in: character string */ + +/** Get status of innodb_tmpdir. +@param[in] thd thread handle, or NULL to query + the global innodb_tmpdir. +@retval NULL if innodb_tmpdir="" */ +const char *thd_innodb_tmpdir(THD *thd); + +/******************************************************************//** +Returns the lock wait timeout for the current connection. +@return the lock wait timeout, in seconds */ +uint& +thd_lock_wait_timeout( +/*==================*/ + THD* thd); /*!< in: thread handle, or NULL to query + the global innodb_lock_wait_timeout */ + +/******************************************************************//** +compare two character string case insensitively according to their charset. */ +int +innobase_fts_text_case_cmp( +/*=======================*/ + const void* cs, /*!< in: Character set */ + const void* p1, /*!< in: key */ + const void* p2); /*!< in: node */ + +/******************************************************************//** +Returns true if transaction should be flagged as read-only. +@return true if the thd is marked as read-only */ +bool +thd_trx_is_read_only( +/*=================*/ + THD* thd); /*!< in/out: thread handle */ + +/******************************************************************//** +Check if the transaction is an auto-commit transaction. TRUE also +implies that it is a SELECT (read-only) transaction. +@return true if the transaction is an auto commit read-only transaction. */ +ibool +thd_trx_is_auto_commit( +/*===================*/ + THD* thd); /*!< in: thread handle, or NULL */ + +/*****************************************************************//** +A wrapper function of innobase_convert_name(), convert a table name +to the MySQL system_charset_info (UTF-8) and quote it if needed. 
+@return pointer to the end of buf */ +void +innobase_format_name( +/*==================*/ + char* buf, /*!< out: buffer for converted identifier */ + ulint buflen, /*!< in: length of buf, in bytes */ + const char* name); /*!< in: table name to format */ + +/** Corresponds to Sql_condition:enum_warning_level. */ +enum ib_log_level_t { + IB_LOG_LEVEL_INFO, + IB_LOG_LEVEL_WARN, + IB_LOG_LEVEL_ERROR, + IB_LOG_LEVEL_FATAL +}; + +/******************************************************************//** +Use this when the args are first converted to a formatted string and then +passed to the format string from errmsg-utf8.txt. The error message format +must be: "Some string ... %s". + +Push a warning message to the client, it is a wrapper around: + +void push_warning_printf( + THD *thd, Sql_condition::enum_warning_level level, + uint code, const char *format, ...); +*/ +void +ib_errf( +/*====*/ + THD* thd, /*!< in/out: session */ + ib_log_level_t level, /*!< in: warning level */ + ib_uint32_t code, /*!< MySQL error code */ + const char* format, /*!< printf format */ + ...) /*!< Args */ + MY_ATTRIBUTE((format(printf, 4, 5))); + +/******************************************************************//** +Use this when the args are passed to the format string from +errmsg-utf8.txt directly as is. + +Push a warning message to the client, it is a wrapper around: + +void push_warning_printf( + THD *thd, Sql_condition::enum_warning_level level, + uint code, const char *format, ...); +*/ +void +ib_senderrf( +/*========*/ + THD* thd, /*!< in/out: session */ + ib_log_level_t level, /*!< in: warning level */ + ib_uint32_t code, /*!< MySQL error code */ + ...); /*!< Args */ + +extern const char* TROUBLESHOOTING_MSG; +extern const char* TROUBLESHOOT_DATADICT_MSG; +extern const char* BUG_REPORT_MSG; +extern const char* FORCE_RECOVERY_MSG; +extern const char* OPERATING_SYSTEM_ERROR_MSG; +extern const char* FOREIGN_KEY_CONSTRAINTS_MSG; +extern const char* SET_TRANSACTION_MSG; +extern const char* INNODB_PARAMETERS_MSG; + +/******************************************************************//** +Returns the NUL terminated value of glob_hostname. +@return pointer to glob_hostname. */ +const char* +server_get_hostname(); +/*=================*/ + +/*********************************************************************//** +Compute the next autoinc value. + +For MySQL replication the autoincrement values can be partitioned among +the nodes. The offset is the start or origin of the autoincrement value +for a particular node. For n nodes the increment will be n and the offset +will be in the interval [1, n]. The formula tries to allocate the next +value for a particular node. + +Note: This function is also called with increment set to the number of +values we want to reserve for multi-value inserts e.g., + + INSERT INTO T VALUES(), (), (); + +innobase_next_autoinc() will be called with increment set to 3 where +autoinc_lock_mode != TRADITIONAL because we want to reserve 3 values for +the multi-value INSERT above. 
+@return the next value (see the worked sketch following these declarations) */ +ulonglong +innobase_next_autoinc( +/*==================*/ + ulonglong current, /*!< in: Current value */ + ulonglong need, /*!< in: count of values needed */ + ulonglong step, /*!< in: AUTOINC increment step */ + ulonglong offset, /*!< in: AUTOINC offset */ + ulonglong max_value) /*!< in: max value for type */ + MY_ATTRIBUTE((pure, warn_unused_result)); + +/********************************************************************** +Converts an identifier from my_charset_filename to UTF-8 charset. */ +uint +innobase_convert_to_system_charset( +/*===============================*/ + char* to, /* out: converted identifier */ + const char* from, /* in: identifier to convert */ + ulint len, /* in: length of 'to', in bytes */ + uint* errors); /* out: error return */ +/********************************************************************** +Check if the length of the identifier exceeds the maximum allowed. +The input to this function is an identifier in charset my_charset_filename. +@return true when the length of the identifier is too long. */ +my_bool +innobase_check_identifier_length( +/*=============================*/ + const char* id); /* in: identifier to check. it must belong + to charset my_charset_filename */ + +/********************************************************************** +Converts an identifier from my_charset_filename to UTF-8 charset. */ +uint +innobase_convert_to_system_charset( +/*===============================*/ + char* to, /* out: converted identifier */ + const char* from, /* in: identifier to convert */ + ulint len, /* in: length of 'to', in bytes */ + uint* errors); /* out: error return */ + +/********************************************************************** +Converts an identifier from UTF-8 to my_charset_filename charset. */ +uint +innobase_convert_to_filename_charset( +/*=================================*/ + char* to, /* out: converted identifier */ + const char* from, /* in: identifier to convert */ + ulint len); /* in: length of 'to', in bytes */ + +/********************************************************************//** +Helper function to push warnings from InnoDB internals to SQL-layer. */ +void +ib_push_warning( + trx_t* trx, /*!< in: trx */ + dberr_t error, /*!< in: error code to push as warning */ + const char *format,/*!< in: warning message */ + ...); + +/********************************************************************//** +Helper function to push warnings from InnoDB internals to SQL-layer. */ +void +ib_push_warning( + void* ithd, /*!< in: thd */ + dberr_t error, /*!< in: error code to push as warning */ + const char *format,/*!< in: warning message */ + ...); + +/********************************************************************//** +Helper function to push warnings from InnoDB internals to SQL-layer. */ +void +ib_foreign_warn( + trx_t* trx, /*!< in: trx */ + dberr_t error, /*!< in: error code to push as warning */ + const char *table_name, + const char *format,/*!< in: warning message */ + ...);
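[Editor's note] The interval arithmetic described in the innobase_next_autoinc() comment above can be made concrete. A worked sketch follows (illustrative only; this hypothetical helper is one plausible reading of the partitioning scheme, not the server's implementation, which additionally handles offset > step, offset = 0 and unsigned overflow against max_value):

#include <cstdint>

/* With n nodes, step = n and offset is in [1, n], so each node draws
values from its own residue class offset + k * step. */
static uint64_t
next_autoinc_demo(uint64_t current, uint64_t need, uint64_t step,
		  uint64_t offset)
{
	/* Index of the smallest class member strictly above current. */
	uint64_t k = (current < offset) ? 0 : (current - offset) / step + 1;

	/* Reserving "need" values advances the counter to the last
	value handed out. */
	return offset + (k + need - 1) * step;
}

For example, with step = 3 and offset = 2 (node 2 of 3), current = 5 and need = 3 reserve the values 8, 11 and 14, and the sketch returns 14.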
+/*****************************************************************//** +Normalizes a table name string. A normalized name consists of the +database name catenated to '/' and table name. An example: +test/mytable. On Windows normalization puts both the database name and the +table name always to lower case if "set_lower_case" is set to TRUE. */ +void +normalize_table_name_c_low( +/*=======================*/ + char* norm_name, /*!< out: normalized name as a + null-terminated string */ + const char* name, /*!< in: table name string */ + bool set_lower_case); /*!< in: true if we want to set + name to lower case */ + +/** Create a MYSQL_THD for a background thread and mark it as such. +@param name thread info for SHOW PROCESSLIST +@return new MYSQL_THD */ +MYSQL_THD innobase_create_background_thd(const char* name); + +/** Destroy a THD object associated with a background task. +@param[in] thd MYSQL_THD to destroy */ +void destroy_background_thd(MYSQL_THD thd); + +/** Close opened tables, free memory, delete items for a MYSQL_THD. +@param[in] thd MYSQL_THD to reset */ +void +innobase_reset_background_thd(MYSQL_THD); + +#ifdef WITH_WSREP +/** Append table-level exclusive key. +@param thd MySQL thread handle +@param table table +@retval false on success +@retval true on failure */ +struct dict_table_t; +bool wsrep_append_table_key(MYSQL_THD thd, const dict_table_t &table); +#endif /* WITH_WSREP */ + +#endif /* !UNIV_INNOCHECKSUM */ +#endif /* HA_INNODB_PROTOTYPES_H */ diff --git a/storage/innobase/include/handler0alter.h b/storage/innobase/include/handler0alter.h new file mode 100644 index 00000000..add983a0 --- /dev/null +++ b/storage/innobase/include/handler0alter.h @@ -0,0 +1,108 @@ +/***************************************************************************** + +Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/handler0alter.h +Smart ALTER TABLE +*******************************************************/ + +#include "rem0types.h" + +/*************************************************************//** +Copies an InnoDB record to table->record[0]. */ +void +innobase_rec_to_mysql( +/*==================*/ + struct TABLE* table, /*!< in/out: MySQL table */ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index, /*!< in: index */ + const rec_offs* offsets)/*!< in: rec_get_offsets( + rec, index, ...) */ + MY_ATTRIBUTE((nonnull)); + +/*************************************************************//** +Copies an InnoDB index entry to table->record[0]. */ +void +innobase_fields_to_mysql( +/*=====================*/ + struct TABLE* table, /*!< in/out: MySQL table */ + const dict_index_t* index, /*!< in: InnoDB index */ + const dfield_t* fields) /*!< in: InnoDB index fields */ + MY_ATTRIBUTE((nonnull)); + +/*************************************************************//** +Copies an InnoDB row to table->record[0].
*/ +void +innobase_row_to_mysql( +/*==================*/ + struct TABLE* table, /*!< in/out: MySQL table */ + const dict_table_t* itab, /*!< in: InnoDB table */ + const dtuple_t* row) /*!< in: InnoDB row */ + MY_ATTRIBUTE((nonnull)); + +/** Generate the next autoinc based on a snapshot of the session +auto_increment_increment and auto_increment_offset variables. */ +struct ib_sequence_t { + + /** + @param thd the session + @param start_value the lower bound + @param max_value the upper bound (inclusive) */ + ib_sequence_t(THD* thd, ulonglong start_value, ulonglong max_value); + + /** Postfix increment + @return the value to insert */ + ulonglong operator++(int) UNIV_NOTHROW; + + /** Check if the autoinc "sequence" is exhausted. + @return true if the sequence is exhausted */ + bool eof() const UNIV_NOTHROW + { + return(m_eof); + } + + /** + @return the next value in the sequence */ + ulonglong last() const UNIV_NOTHROW + { + ut_ad(m_next_value > 0); + + return(m_next_value); + } + + /** @return maximum column value + @retval 0 if not adding AUTO_INCREMENT column */ + ulonglong max_value() const { return m_max_value; } + +private: + /** Maximum value if adding an AUTO_INCREMENT column, else 0 */ + ulonglong m_max_value; + + /** Value of auto_increment_increment */ + ulong m_increment; + + /** Value of auto_increment_offset */ + ulong m_offset; + + /** Next value in the sequence */ + ulonglong m_next_value; + + /** true if no more values left in the sequence */ + bool m_eof; +}; diff --git a/storage/innobase/include/hash0hash.h b/storage/innobase/include/hash0hash.h new file mode 100644 index 00000000..867ad9e0 --- /dev/null +++ b/storage/innobase/include/hash0hash.h @@ -0,0 +1,190 @@ +/***************************************************************************** + +Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/hash0hash.h +The simple hash table utility + +Created 5/20/1997 Heikki Tuuri +*******************************************************/ + +#pragma once +#include "ut0rnd.h" +#include "ut0new.h" + +struct hash_table_t; +struct hash_cell_t +{ + /** singly-linked, nullptr terminated list of hash buckets */ + void *node; + + /** Append an element. + @tparam T type of the element + @param insert the being-inserted element + @param next the next-element pointer in T */ + template + void append(T &insert, T *T::*next) + { + void **after; + for (after= &node; *after; + after= reinterpret_cast(&(static_cast(*after)->*next))); + insert.*next= nullptr; + *after= &insert; + } +}; + +/*******************************************************************//** +Inserts a struct to a hash table. 
*/ + +#define HASH_INSERT(TYPE, NAME, TABLE, FOLD, DATA)\ +do {\ + hash_cell_t* cell3333;\ + TYPE* struct3333;\ +\ + (DATA)->NAME = NULL;\ +\ + cell3333 = &(TABLE)->array[(TABLE)->calc_hash(FOLD)]; \ +\ + if (cell3333->node == NULL) {\ + cell3333->node = DATA;\ + } else {\ + struct3333 = (TYPE*) cell3333->node;\ +\ + while (struct3333->NAME != NULL) {\ +\ + struct3333 = (TYPE*) struct3333->NAME;\ + }\ +\ + struct3333->NAME = DATA;\ + }\ +} while (0) + +#ifdef UNIV_HASH_DEBUG +# define HASH_ASSERT_VALID(DATA) ut_a((void*) (DATA) != (void*) -1) +# define HASH_INVALIDATE(DATA, NAME) *(void**) (&DATA->NAME) = (void*) -1 +#else +# define HASH_ASSERT_VALID(DATA) do {} while (0) +# define HASH_INVALIDATE(DATA, NAME) do {} while (0) +#endif + +/*******************************************************************//** +Deletes a struct from a hash table. */ + +#define HASH_DELETE(TYPE, NAME, TABLE, FOLD, DATA)\ +do {\ + hash_cell_t* cell3333;\ + TYPE* struct3333;\ +\ + cell3333 = &(TABLE)->array[(TABLE)->calc_hash(FOLD)]; \ +\ + if (cell3333->node == DATA) {\ + HASH_ASSERT_VALID(DATA->NAME);\ + cell3333->node = DATA->NAME;\ + } else {\ + struct3333 = (TYPE*) cell3333->node;\ +\ + while (struct3333->NAME != DATA) {\ +\ + struct3333 = (TYPE*) struct3333->NAME;\ + ut_a(struct3333);\ + }\ +\ + struct3333->NAME = DATA->NAME;\ + }\ + HASH_INVALIDATE(DATA, NAME);\ +} while (0) + +/*******************************************************************//** +Gets the first struct in a hash chain, NULL if none. */ + +#define HASH_GET_FIRST(TABLE, HASH_VAL) (TABLE)->array[HASH_VAL].node + +/*******************************************************************//** +Gets the next struct in a hash chain, NULL if none. */ + +#define HASH_GET_NEXT(NAME, DATA) ((DATA)->NAME) + +/********************************************************************//** +Looks for a struct in a hash table. */ +#define HASH_SEARCH(NAME, TABLE, FOLD, TYPE, DATA, ASSERTION, TEST)\ +{\ + (DATA) = (TYPE) HASH_GET_FIRST(TABLE, (TABLE)->calc_hash(FOLD)); \ + HASH_ASSERT_VALID(DATA);\ +\ + while ((DATA) != NULL) {\ + ASSERTION;\ + if (TEST) {\ + break;\ + } else {\ + HASH_ASSERT_VALID(HASH_GET_NEXT(NAME, DATA));\ + (DATA) = (TYPE) HASH_GET_NEXT(NAME, DATA);\ + }\ + }\ +} + +/********************************************************************//** +Looks for an item in all hash buckets. */ +#define HASH_SEARCH_ALL(NAME, TABLE, TYPE, DATA, ASSERTION, TEST) \ +do { \ + ulint i3333; \ + \ + for (i3333 = (TABLE)->n_cells; i3333--; ) { \ + (DATA) = (TYPE) HASH_GET_FIRST(TABLE, i3333); \ + \ + while ((DATA) != NULL) { \ + HASH_ASSERT_VALID(DATA); \ + ASSERTION; \ + \ + if (TEST) { \ + break; \ + } \ + \ + (DATA) = (TYPE) HASH_GET_NEXT(NAME, DATA); \ + } \ + \ + if ((DATA) != NULL) { \ + break; \ + } \ + } \ +} while (0) + +/** Hash table with singly-linked overflow lists */ +struct hash_table_t +{ + /** number of elements in array (a prime number) */ + ulint n_cells; + /** the hash array */ + hash_cell_t *array; + + /** Create the hash table. + @param n the lower bound of n_cells */ + void create(ulint n) + { + n_cells= ut_find_prime(n); + array= static_cast(ut_zalloc_nokey(n_cells * sizeof *array)); + } + + /** Clear the hash table. */ + void clear() { memset(array, 0, n_cells * sizeof *array); } + + /** Free the hash table. 
+
+/** Hash table with singly-linked overflow lists */
+struct hash_table_t
+{
+  /** number of elements in array (a prime number) */
+  ulint n_cells;
+  /** the hash array */
+  hash_cell_t *array;
+
+  /** Create the hash table.
+  @param n  the lower bound of n_cells */
+  void create(ulint n)
+  {
+    n_cells= ut_find_prime(n);
+    array= static_cast<hash_cell_t*>
+      (ut_zalloc_nokey(n_cells * sizeof *array));
+  }
+
+  /** Clear the hash table. */
+  void clear() { memset(array, 0, n_cells * sizeof *array); }
+
+  /** Free the hash table. */
+  void free() { ut_free(array); array= nullptr; }
+
+  ulint calc_hash(ulint fold) const { return ut_hash_ulint(fold, n_cells); }
+};
diff --git a/storage/innobase/include/ibuf0ibuf.h b/storage/innobase/include/ibuf0ibuf.h
new file mode 100644
index 00000000..c246b2ef
--- /dev/null
+++ b/storage/innobase/include/ibuf0ibuf.h
@@ -0,0 +1,436 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2016, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ibuf0ibuf.h
+Insert buffer
+
+Created 7/19/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef ibuf0ibuf_h
+#define ibuf0ibuf_h
+
+#include "mtr0mtr.h"
+#include "dict0mem.h"
+#include "fsp0fsp.h"
+
+/** Default value for maximum on-disk size of change buffer in terms
+of percentage of the buffer pool. */
+#define CHANGE_BUFFER_DEFAULT_SIZE	(25)
+
+/* Possible operations buffered in the insert/whatever buffer. See
+ibuf_insert(). DO NOT CHANGE THE VALUES OF THESE, THEY ARE STORED ON DISK. */
+typedef enum {
+	IBUF_OP_INSERT = 0,
+	IBUF_OP_DELETE_MARK = 1,
+	IBUF_OP_DELETE = 2,
+
+	/* Number of different operation types. */
+	IBUF_OP_COUNT = 3
+} ibuf_op_t;
+
+/** Combinations of operations that can be buffered.
+@see innodb_change_buffering_names */
+enum ibuf_use_t {
+	IBUF_USE_NONE = 0,
+	IBUF_USE_INSERT,		/* insert */
+	IBUF_USE_DELETE_MARK,		/* delete */
+	IBUF_USE_INSERT_DELETE_MARK,	/* insert+delete */
+	IBUF_USE_DELETE,		/* delete+purge */
+	IBUF_USE_ALL			/* insert+delete+purge */
+};
+
+/** Operations that can currently be buffered. */
+extern ulong	innodb_change_buffering;
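Reading the enumerators together: each innodb_change_buffering setting admits
only a subset of the three on-disk operation types. The helper below does not
exist in the source; it is a hedged restatement of the inline comments on
ibuf_use_t:

// Illustrative only: which ibuf_op_t values a given ibuf_use_t admits,
// per the comments on the enumerators above.
inline bool ibuf_use_admits(ibuf_use_t use, ibuf_op_t op)
{
	switch (use) {
	case IBUF_USE_NONE:			return false;
	case IBUF_USE_INSERT:			return op == IBUF_OP_INSERT;
	case IBUF_USE_DELETE_MARK:		return op == IBUF_OP_DELETE_MARK;
	case IBUF_USE_INSERT_DELETE_MARK:	return op == IBUF_OP_INSERT
						    || op == IBUF_OP_DELETE_MARK;
	case IBUF_USE_DELETE:			return op == IBUF_OP_DELETE_MARK
						    || op == IBUF_OP_DELETE;
	case IBUF_USE_ALL:			return true;
	}
	return false;
}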
+
+/** Insert buffer struct */
+struct ibuf_t{
+	Atomic_relaxed<ulint>	size;	/*!< current size of the ibuf index
+					tree, in pages */
+	Atomic_relaxed<ulint>	max_size;
+					/*!< recommended maximum size of the
+					ibuf index tree, in pages */
+	ulint		seg_size;	/*!< allocated pages of the file
+					segment containing ibuf header and
+					tree */
+	bool		empty;		/*!< Protected by the page
+					latch of the root page of the
+					insert buffer tree
+					(FSP_IBUF_TREE_ROOT_PAGE_NO). true
+					if and only if the insert
+					buffer tree is empty. */
+	ulint		free_list_len;	/*!< length of the free list */
+	ulint		height;		/*!< tree height */
+	dict_index_t*	index;		/*!< insert buffer index */
+
+	/** number of pages merged */
+	Atomic_counter<ulint>	n_merges;
+	Atomic_counter<ulint>	n_merged_ops[IBUF_OP_COUNT];
+					/*!< number of operations of each type
+					merged to index pages */
+	Atomic_counter<ulint>	n_discarded_ops[IBUF_OP_COUNT];
+					/*!< number of operations of each type
+					discarded without merging due to the
+					tablespace being deleted or the
+					index being dropped */
+};
+
+/** The insert buffer control structure */
+extern ibuf_t	ibuf;
+
+/* The purpose of the insert buffer is to reduce random disk access.
+When we wish to insert a record into a non-unique secondary index and
+the B-tree leaf page where the record belongs to is not in the buffer
+pool, we insert the record into the insert buffer B-tree, indexed by
+(space_id, page_no). When the page is eventually read into the buffer
+pool, we look up the insert buffer B-tree for any modifications to the
+page, and apply these upon the completion of the read operation. This
+is called the insert buffer merge. */
+
+/* The insert buffer merge must always succeed. To guarantee this,
+the insert buffer subsystem keeps track of the free space in pages for
+which it can buffer operations. Two bits per page in the insert
+buffer bitmap indicate the available space in coarse increments. The
+free bits in the insert buffer bitmap must never exceed the free space
+on a page. It is safe to decrement or reset the bits in the bitmap in
+a mini-transaction that is committed before the mini-transaction that
+affects the free space. It is unsafe to increment the bits in a
+separately committed mini-transaction, because in crash recovery, the
+free bits could momentarily be set too high. */
+
+/******************************************************************//**
+Creates the insert buffer data structure at a database startup.
+@return DB_SUCCESS or failure */
+dberr_t
+ibuf_init_at_db_start(void);
+/*=======================*/
+/*********************************************************************//**
+Updates the max_size value for ibuf. */
+void
+ibuf_max_size_update(
+/*=================*/
+	ulint	new_val);	/*!< in: new value in terms of
+				percentage of the buffer pool size */
+/*********************************************************************//**
+Reads the biggest tablespace id from the high end of the insert buffer
+tree and updates the counter in fil_system. */
+void
+ibuf_update_max_tablespace_id(void);
+/*===============================*/
+/***************************************************************//**
+Starts an insert buffer mini-transaction. */
+UNIV_INLINE
+void
+ibuf_mtr_start(
+/*===========*/
+	mtr_t*	mtr)	/*!< out: mini-transaction */
+	MY_ATTRIBUTE((nonnull));
+/***************************************************************//**
+Commits an insert buffer mini-transaction. */
+UNIV_INLINE
+void
+ibuf_mtr_commit(
+/*============*/
+	mtr_t*	mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull));
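The two comment blocks above are the whole design in miniature: file changes
under a (space_id, page_no) key while the page is absent from the buffer
pool, then replay them when the page is finally read. A standalone toy
analogue, using ordinary containers in place of the persistent ibuf B-tree
(all names are illustrative, not InnoDB's):

#include <cstdint>
#include <map>
#include <utility>
#include <vector>

enum class toy_op { INSERT, DELETE_MARK, DELETE };
struct toy_change { toy_op op; int rec_key; };
using page_key = std::pair<uint32_t, uint32_t>;  // (space_id, page_no)

// Changes buffered for pages that are not in the buffer pool,
// keyed like the insert buffer B-tree.
static std::map<page_key, std::vector<toy_change>> buffered;

void toy_buffer(page_key page, toy_change change)
{
  buffered[page].push_back(change);  // cheap: no page read required
}

// The "merge": once the page is read, replay the buffered changes in
// buffering order, then forget them.
template<typename Page>
void toy_merge_on_read(page_key key, Page &page)
{
  auto it = buffered.find(key);
  if (it == buffered.end()) return;
  for (const toy_change &c : it->second) page.apply(c);
  buffered.erase(it);
}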
+/************************************************************************//**
+Resets the free bits of the page in the ibuf bitmap. This is done in a
+separate mini-transaction, hence this operation does not restrict
+further work to only ibuf bitmap operations, which would result if the
+latch to the bitmap page were kept. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is safe
+to decrement or reset the bits in the bitmap in a mini-transaction
+that is committed before the mini-transaction that affects the free
+space. */
+void
+ibuf_reset_free_bits(
+/*=================*/
+	buf_block_t*	block);	/*!< in: index page; free bits are set to 0
+				if the index is a non-clustered
+				non-unique, and page level is 0 */
+/************************************************************************//**
+Updates the free bits of an uncompressed page in the ibuf bitmap if
+there is not enough free on the page any more. This is done in a
+separate mini-transaction, hence this operation does not restrict
+further work to only ibuf bitmap operations, which would result if the
+latch to the bitmap page were kept. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is
+unsafe to increment the bits in a separately committed
+mini-transaction, because in crash recovery, the free bits could
+momentarily be set too high. It is only safe to use this function for
+decrementing the free bits. Should more free space become available,
+we must not update the free bits here, because that would break crash
+recovery. */
+UNIV_INLINE
+void
+ibuf_update_free_bits_if_full(
+/*==========================*/
+	buf_block_t*	block,	/*!< in: index page to which we have added new
+				records; the free bits are updated if the
+				index is non-clustered and non-unique and
+				the page level is 0, and the page becomes
+				fuller */
+	ulint		max_ins_size,/*!< in: value of maximum insert size with
+				reorganize before the latest operation
+				performed to the page */
+	ulint		increase);/*!< in: upper limit for the additional space
+				used in the latest operation, if known, or
+				ULINT_UNDEFINED */
+/**********************************************************************//**
+Updates the free bits for an uncompressed page to reflect the present
+state. Does this in the mtr given, which means that the latching
+order rules virtually prevent any further operations for this OS
+thread until mtr is committed. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is safe
+to set the free bits in the same mini-transaction that updated the
+page. */
+void
+ibuf_update_free_bits_low(
+/*======================*/
+	const buf_block_t*	block,		/*!< in: index page */
+	ulint			max_ins_size,	/*!< in: value of
+						maximum insert size
+						with reorganize before
+						the latest operation
+						performed to the page */
+	mtr_t*			mtr);		/*!< in/out: mtr */
+/**********************************************************************//**
+Updates the free bits for a compressed page to reflect the present
+state. Does this in the mtr given, which means that the latching
+order rules virtually prevent any further operations for this OS
+thread until mtr is committed. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is safe
+to set the free bits in the same mini-transaction that updated the
+page. */
+void
+ibuf_update_free_bits_zip(
+/*======================*/
+	buf_block_t*	block,	/*!< in/out: index page */
+	mtr_t*		mtr);	/*!< in/out: mtr */
+/**********************************************************************//**
+Updates the free bits for the two pages to reflect the present state.
+Does this in the mtr given, which means that the latching order rules
+virtually prevent any further operations until mtr is committed.
+NOTE: The free bits in the insert buffer bitmap must never exceed the
+free space on a page.
It is safe to set the free bits in the same +mini-transaction that updated the pages. */ +void +ibuf_update_free_bits_for_two_pages_low( +/*====================================*/ + buf_block_t* block1, /*!< in: index page */ + buf_block_t* block2, /*!< in: index page */ + mtr_t* mtr); /*!< in: mtr */ +/**********************************************************************//** +A basic partial test if an insert to the insert buffer could be possible and +recommended. */ +UNIV_INLINE +ibool +ibuf_should_try( +/*============*/ + dict_index_t* index, /*!< in: index where to insert */ + ulint ignore_sec_unique); /*!< in: if != 0, we should + ignore UNIQUE constraint on + a secondary index when we + decide */ +/******************************************************************//** +Returns TRUE if the current OS thread is performing an insert buffer +routine. + +For instance, a read-ahead of non-ibuf pages is forbidden by threads +that are executing an insert buffer routine. +@return TRUE if inside an insert buffer routine */ +UNIV_INLINE +ibool +ibuf_inside( +/*========*/ + const mtr_t* mtr) /*!< in: mini-transaction */ + MY_ATTRIBUTE((warn_unused_result)); + +/** Checks if a page address is an ibuf bitmap page (level 3 page) address. +@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@return TRUE if a bitmap page */ +inline bool ibuf_bitmap_page(const page_id_t page_id, ulint zip_size) +{ + ut_ad(ut_is_2pow(zip_size)); + ulint size = zip_size ? zip_size : srv_page_size; + return (page_id.page_no() & (size - 1)) == FSP_IBUF_BITMAP_OFFSET; +} + +/** Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages. +Must not be called when recv_no_ibuf_operations==true. +@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] x_latch FALSE if relaxed check (avoid latching the +bitmap page) +@param[in,out] mtr mtr which will contain an x-latch to the +bitmap page if the page is not one of the fixed address ibuf pages, or NULL, +in which case a new transaction is created. +@return true if level 2 or level 3 page */ +bool +ibuf_page_low( + const page_id_t page_id, + ulint zip_size, +#ifdef UNIV_DEBUG + bool x_latch, +#endif /* UNIV_DEBUG */ + mtr_t* mtr) + MY_ATTRIBUTE((warn_unused_result)); + +#ifdef UNIV_DEBUG +/** Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages. +Must not be called when recv_no_ibuf_operations==true. +@param[in] page_id tablespace/page identifier +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in,out] mtr mini-transaction or NULL +@return TRUE if level 2 or level 3 page */ +# define ibuf_page(page_id, zip_size, mtr) \ + ibuf_page_low(page_id, zip_size, true, mtr) + +#else /* UNIV_DEBUG */ + +/** Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages. +Must not be called when recv_no_ibuf_operations==true. +@param[in] page_id tablespace/page identifier +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in,out] mtr mini-transaction or NULL +@return TRUE if level 2 or level 3 page */ +# define ibuf_page(page_id, zip_size, mtr) \ + ibuf_page_low(page_id, zip_size, mtr) + +#endif /* UNIV_DEBUG */ +/***********************************************************************//** +Frees excess pages from the ibuf free list. This function is called when an OS +thread calls fsp services to allocate a new file segment, or a new page to a +file segment, and the thread did not own the fsp latch before this call. 
*/ +void +ibuf_free_excess_pages(void); +/*========================*/ + +/** Buffer an operation in the change buffer, instead of applying it +directly to the file page, if this is possible. Does not do it if the index +is clustered or unique. +@param[in] op operation type +@param[in] entry index entry to insert +@param[in,out] index index where to insert +@param[in] page_id page id where to insert +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in,out] thr query thread +@return true if success */ +bool +ibuf_insert( + ibuf_op_t op, + const dtuple_t* entry, + dict_index_t* index, + const page_id_t page_id, + ulint zip_size, + que_thr_t* thr); + +/** Check whether buffered changes exist for a page. +@param[in] id page identifier +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@return whether buffered changes exist */ +bool ibuf_page_exists(const page_id_t id, ulint zip_size); + +/** When an index page is read from a disk to the buffer pool, this function +applies any buffered operations to the page and deletes the entries from the +insert buffer. If the page is not read, but created in the buffer pool, this +function deletes its buffered entries from the insert buffer; there can +exist entries for such a page if the page belonged to an index which +subsequently was dropped. +@param block X-latched page to try to apply changes to, or NULL to discard +@param page_id page identifier +@param zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@return error code */ +dberr_t ibuf_merge_or_delete_for_page(buf_block_t *block, + const page_id_t page_id, + ulint zip_size); + +/** Delete all change buffer entries for a tablespace, +in DISCARD TABLESPACE, IMPORT TABLESPACE, or read-ahead. +@param[in] space missing or to-be-discarded tablespace */ +void ibuf_delete_for_discarded_space(uint32_t space); + +/** Contract the change buffer by reading pages to the buffer pool. +@return a lower limit for the combined size in bytes of entries which +will be merged from ibuf trees to the pages read +@retval 0 if ibuf.empty */ +ulint ibuf_contract(); + +/** Contracts insert buffer trees by reading pages referring to space_id +to the buffer pool. +@returns number of pages merged.*/ +ulint +ibuf_merge_space( +/*=============*/ + ulint space); /*!< in: space id */ + +/******************************************************************//** +Looks if the insert buffer is empty. +@return true if empty */ +bool +ibuf_is_empty(void); +/*===============*/ +/******************************************************************//** +Prints info of ibuf. */ +void +ibuf_print( +/*=======*/ + FILE* file); /*!< in: file where to print */ +/******************************************************************** +Read the first two bytes from a record's fourth field (counter field in new +records; something else in older records). +@return "counter" field, or ULINT_UNDEFINED if for some reason it can't be read */ +ulint +ibuf_rec_get_counter( +/*=================*/ + const rec_t* rec); /*!< in: ibuf record */ +/******************************************************************//** +Closes insert buffer and frees the data structures. */ +void +ibuf_close(void); +/*============*/ + +/** Check the insert buffer bitmaps on IMPORT TABLESPACE. 
+@param[in]	trx	transaction
+@param[in,out]	space	tablespace being imported
+@return DB_SUCCESS or error code */
+dberr_t ibuf_check_bitmap_on_import(const trx_t* trx, fil_space_t* space)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Update free bits and buffered bits for bulk loaded page.
+@param block	secondary index leaf page
+@param mtr	mini-transaction
+@param reset	whether the page is full */
+void ibuf_set_bitmap_for_bulk_load(buf_block_t *block, mtr_t *mtr, bool reset);
+
+#define IBUF_HEADER_PAGE_NO	FSP_IBUF_HEADER_PAGE_NO
+#define IBUF_TREE_ROOT_PAGE_NO	FSP_IBUF_TREE_ROOT_PAGE_NO
+
+/* The ibuf header page currently contains only the file segment header
+for the file segment from which the pages for the ibuf tree are allocated */
+#define IBUF_HEADER		PAGE_DATA
+#define IBUF_TREE_SEG_HEADER	0	/* fseg header for ibuf tree */
+
+/* The insert buffer tree itself is always located in space 0. */
+#define IBUF_SPACE_ID	static_cast<uint32_t>(0)
+
+#include "ibuf0ibuf.inl"
+
+#endif
diff --git a/storage/innobase/include/ibuf0ibuf.inl b/storage/innobase/include/ibuf0ibuf.inl
new file mode 100644
index 00000000..003bf22a
--- /dev/null
+++ b/storage/innobase/include/ibuf0ibuf.inl
@@ -0,0 +1,282 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ibuf0ibuf.inl
+Insert buffer
+
+Created 7/19/1997 Heikki Tuuri
+*******************************************************/
+
+#include "page0page.h"
+#include "page0zip.h"
+#include "fsp0types.h"
+#include "buf0lru.h"
+
+/** An index page must contain at least srv_page_size /
+IBUF_PAGE_SIZE_PER_FREE_SPACE bytes of free space for ibuf to try to
+buffer inserts to this page. If there is this much of free space, the
+corresponding bits are set in the ibuf bitmap. */
+#define IBUF_PAGE_SIZE_PER_FREE_SPACE	32
+
+/***************************************************************//**
+Starts an insert buffer mini-transaction. */
+UNIV_INLINE
+void
+ibuf_mtr_start(
+/*===========*/
+	mtr_t*	mtr)	/*!< out: mini-transaction */
+{
+	mtr_start(mtr);
+	mtr->enter_ibuf();
+
+	if (high_level_read_only || srv_read_only_mode) {
+		mtr_set_log_mode(mtr, MTR_LOG_NO_REDO);
+	}
+
+}
+/***************************************************************//**
+Commits an insert buffer mini-transaction. */
+UNIV_INLINE
+void
+ibuf_mtr_commit(
+/*============*/
+	mtr_t*	mtr)	/*!< in/out: mini-transaction */
+{
+	ut_ad(mtr->is_inside_ibuf());
+	ut_d(mtr->exit_ibuf());
+
+	mtr_commit(mtr);
+}
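IBUF_PAGE_SIZE_PER_FREE_SPACE fixes the granularity of the bitmap encoding:
one increment corresponds to page_size/32 bytes, i.e. 512 bytes on a 16KiB
page. A standalone check of the resulting thresholds; the bit computation
mirrors ibuf_index_page_calc_free_bits() further below, including its quirk
of demoting a raw quotient of 3:

#include <cassert>

// Mirrors ibuf_index_page_calc_free_bits(): quantize the maximum insert
// size into the 2-bit bitmap value 0..3. A raw quotient of exactly 3 is
// demoted to 2, so value 3 is reserved for pages with even more room.
static unsigned free_bits(unsigned long page_size, unsigned long max_ins_size)
{
  unsigned long n = max_ins_size / (page_size / 32);
  if (n == 3) n = 2;
  if (n > 3) n = 3;
  return unsigned(n);
}

int main()
{
  // 16KiB page: one increment is 512 bytes.
  assert(free_bits(16384, 400) == 0);
  assert(free_bits(16384, 600) == 1);
  assert(free_bits(16384, 1100) == 2);
  assert(free_bits(16384, 1600) == 2);  // quotient 3 demoted to 2
  assert(free_bits(16384, 2100) == 3);
}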
+/************************************************************************//**
+Sets the free bit of the page in the ibuf bitmap. This is done in a separate
+mini-transaction, hence this operation does not restrict further work to only
+ibuf bitmap operations, which would result if the latch to the bitmap page
+were kept. */
+void
+ibuf_set_free_bits_func(
+/*====================*/
+	buf_block_t*	block,	/*!< in: index page of a non-clustered index;
+				free bit is reset if page level is 0 */
+#ifdef UNIV_IBUF_DEBUG
+	ulint		max_val,/*!< in: ULINT_UNDEFINED or a maximum
+				value which the bits must have before
+				setting; this is for debugging */
+#endif /* UNIV_IBUF_DEBUG */
+	ulint		val);	/*!< in: value to set: < 4 */
+#ifdef UNIV_IBUF_DEBUG
+# define ibuf_set_free_bits(b,v,max) ibuf_set_free_bits_func(b,max,v)
+#else /* UNIV_IBUF_DEBUG */
+# define ibuf_set_free_bits(b,v,max) ibuf_set_free_bits_func(b,v)
+#endif /* UNIV_IBUF_DEBUG */
+
+/**********************************************************************//**
+A basic partial test if an insert to the insert buffer could be possible and
+recommended. */
+UNIV_INLINE
+ibool
+ibuf_should_try(
+/*============*/
+	dict_index_t*	index,			/*!< in: index where to insert */
+	ulint		ignore_sec_unique)	/*!< in: if != 0, we should
+						ignore UNIQUE constraint on
+						a secondary index when we
+						decide */
+{
+  if (index->type & (DICT_CLUSTERED | DICT_IBUF | DICT_SPATIAL) ||
+      !innodb_change_buffering || !ibuf.max_size)
+    return false;
+  if (!ignore_sec_unique && index->is_unique())
+    return false;
+  if (index->table->quiesce != QUIESCE_NONE)
+    return false;
+  for (unsigned i= 0; i < index->n_fields; i++)
+    if (index->fields[i].descending)
+      return false;
+  return true;
+}
+
+/******************************************************************//**
+Returns TRUE if the current OS thread is performing an insert buffer
+routine.
+
+For instance, a read-ahead of non-ibuf pages is forbidden by threads
+that are executing an insert buffer routine.
+@return TRUE if inside an insert buffer routine */
+UNIV_INLINE
+ibool
+ibuf_inside(
+/*========*/
+	const mtr_t*	mtr)	/*!< in: mini-transaction */
+{
+	return(mtr->is_inside_ibuf());
+}
+
+/** Translates the free space on a page to a value in the ibuf bitmap.
+@param[in]	page_size	page size in bytes
+@param[in]	max_ins_size	maximum insert size after reorganize for
+the page
+@return value for ibuf bitmap bits */
+UNIV_INLINE
+ulint
+ibuf_index_page_calc_free_bits(
+	ulint	page_size,
+	ulint	max_ins_size)
+{
+	ulint	n;
+	ut_ad(ut_is_2pow(page_size));
+	ut_ad(page_size > IBUF_PAGE_SIZE_PER_FREE_SPACE);
+
+	n = max_ins_size / (page_size / IBUF_PAGE_SIZE_PER_FREE_SPACE);
+
+	if (n == 3) {
+		n = 2;
+	}
+
+	if (n > 3) {
+		n = 3;
+	}
+
+	return(n);
+}
+
+/*********************************************************************//**
+Translates the free space on a compressed page to a value in the ibuf bitmap.
+@return value for ibuf bitmap bits */
+UNIV_INLINE
+ulint
+ibuf_index_page_calc_free_zip(
+/*==========================*/
+	const buf_block_t*	block)	/*!< in: buffer block */
+{
+	ulint			max_ins_size;
+	const page_zip_des_t*	page_zip;
+	lint			zip_max_ins;
+
+	ut_ad(block->page.zip.data);
+
+	/* Consider the maximum insert size on the uncompressed page
+	without reorganizing the page. We must not assume anything
+	about the compression ratio. If zip_max_ins > max_ins_size and
+	there is 1/4 garbage on the page, recompression after the
+	reorganize could fail, in theory. So, let us guarantee that
+	merging a buffered insert to a compressed page will always
+	succeed without reorganizing or recompressing the page, just
+	by using the page modification log.
*/ + max_ins_size = page_get_max_insert_size( + buf_block_get_frame(block), 1); + + page_zip = buf_block_get_page_zip(block); + zip_max_ins = page_zip_max_ins_size(page_zip, + FALSE/* not clustered */); + + if (zip_max_ins < 0) { + return(0); + } else if (max_ins_size > (ulint) zip_max_ins) { + max_ins_size = (ulint) zip_max_ins; + } + + return(ibuf_index_page_calc_free_bits(block->physical_size(), + max_ins_size)); +} + +/*********************************************************************//** +Translates the free space on a page to a value in the ibuf bitmap. +@return value for ibuf bitmap bits */ +UNIV_INLINE +ulint +ibuf_index_page_calc_free( +/*======================*/ + const buf_block_t* block) /*!< in: buffer block */ +{ + if (!block->page.zip.data) { + ulint max_ins_size; + + max_ins_size = page_get_max_insert_size_after_reorganize( + buf_block_get_frame(block), 1); + + return(ibuf_index_page_calc_free_bits( + block->physical_size(), max_ins_size)); + } else { + return(ibuf_index_page_calc_free_zip(block)); + } +} + +/************************************************************************//** +Updates the free bits of an uncompressed page in the ibuf bitmap if +there is not enough free on the page any more. This is done in a +separate mini-transaction, hence this operation does not restrict +further work to only ibuf bitmap operations, which would result if the +latch to the bitmap page were kept. NOTE: The free bits in the insert +buffer bitmap must never exceed the free space on a page. It is +unsafe to increment the bits in a separately committed +mini-transaction, because in crash recovery, the free bits could +momentarily be set too high. It is only safe to use this function for +decrementing the free bits. Should more free space become available, +we must not update the free bits here, because that would break crash +recovery. 
*/ +UNIV_INLINE +void +ibuf_update_free_bits_if_full( +/*==========================*/ + buf_block_t* block, /*!< in: index page to which we have added new + records; the free bits are updated if the + index is non-clustered and non-unique and + the page level is 0, and the page becomes + fuller */ + ulint max_ins_size,/*!< in: value of maximum insert size with + reorganize before the latest operation + performed to the page */ + ulint increase)/*!< in: upper limit for the additional space + used in the latest operation, if known, or + ULINT_UNDEFINED */ +{ + ulint before; + ulint after; + + ut_ad(buf_block_get_page_zip(block) == NULL); + + before = ibuf_index_page_calc_free_bits( + srv_page_size, max_ins_size); + + if (max_ins_size >= increase) { + compile_time_assert(ULINT32_UNDEFINED > UNIV_PAGE_SIZE_MAX); + after = ibuf_index_page_calc_free_bits( + srv_page_size, max_ins_size - increase); +#ifdef UNIV_IBUF_DEBUG + ut_a(after <= ibuf_index_page_calc_free(block)); +#endif + } else { + after = ibuf_index_page_calc_free(block); + } + + if (after == 0) { + /* We move the page to the front of the buffer pool LRU list: + the purpose of this is to prevent those pages to which we + cannot make inserts using the insert buffer from slipping + out of the buffer pool */ + + buf_page_make_young(&block->page); + } + + if (before > after) { + ibuf_set_free_bits(block, after, before); + } +} diff --git a/storage/innobase/include/lock0iter.h b/storage/innobase/include/lock0iter.h new file mode 100644 index 00000000..a7e61395 --- /dev/null +++ b/storage/innobase/include/lock0iter.h @@ -0,0 +1,66 @@ +/***************************************************************************** + +Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/lock0iter.h +Lock queue iterator type and function prototypes. + +Created July 16, 2007 Vasil Dimov +*******************************************************/ + +#ifndef lock0iter_h +#define lock0iter_h + +#include "lock0types.h" + +struct lock_queue_iterator_t { + const lock_t* current_lock; + /* In case this is a record lock queue (not table lock queue) + then bit_no is the record number within the heap in which the + record is stored. */ + ulint bit_no; +}; + +/*******************************************************************//** +Initialize lock queue iterator so that it starts to iterate from +"lock". bit_no specifies the record number within the heap where the +record is stored. It can be undefined (ULINT_UNDEFINED) in two cases: +1. If the lock is a table lock, thus we have a table lock queue; +2. If the lock is a record lock and it is a wait lock. In this case + bit_no is calculated in this function by using + lock_rec_find_set_bit(). 
There is exactly one bit set in the bitmap + of a wait lock. */ +void +lock_queue_iterator_reset( +/*======================*/ + lock_queue_iterator_t* iter, /*!< out: iterator */ + const lock_t* lock, /*!< in: lock to start from */ + ulint bit_no);/*!< in: record number in the + heap */ + +/*******************************************************************//** +Gets the previous lock in the lock queue, returns NULL if there are no +more locks (i.e. the current lock is the first one). The iterator is +receded (if not-NULL is returned). +@return previous lock or NULL */ +const lock_t* +lock_queue_iterator_get_prev( +/*=========================*/ + lock_queue_iterator_t* iter); /*!< in/out: iterator */ + +#endif /* lock0iter_h */ diff --git a/storage/innobase/include/lock0lock.h b/storage/innobase/include/lock0lock.h new file mode 100644 index 00000000..59ee7f55 --- /dev/null +++ b/storage/innobase/include/lock0lock.h @@ -0,0 +1,1271 @@ +/***************************************************************************** + +Copyright (c) 1996, 2022, Oracle and/or its affiliates. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/lock0lock.h +The transaction lock system + +Created 5/7/1996 Heikki Tuuri +*******************************************************/ + +#ifndef lock0lock_h +#define lock0lock_h + +#include "buf0types.h" +#include "trx0trx.h" +#include "mtr0types.h" +#include "rem0types.h" +#include "hash0hash.h" +#include "srv0srv.h" +#include "ut0vec.h" +#include "gis0rtree.h" +#include "lock0prdt.h" +#include "transactional_lock_guard.h" + +// Forward declaration +class ReadView; + +/** The value of innodb_deadlock_detect */ +extern my_bool innodb_deadlock_detect; +/** The value of innodb_deadlock_report */ +extern ulong innodb_deadlock_report; + +namespace Deadlock +{ + /** The allowed values of innodb_deadlock_report */ + enum report { REPORT_OFF, REPORT_BASIC, REPORT_FULL }; +} + +/*********************************************************************//** +Gets the heap_no of the smallest user record on a page. +@return heap_no of smallest user record, or PAGE_HEAP_NO_SUPREMUM */ +UNIV_INLINE +ulint +lock_get_min_heap_no( +/*=================*/ + const buf_block_t* block); /*!< in: buffer block */ + +/** Discard locks for an index when purging DELETE FROM SYS_INDEXES +after an aborted CREATE INDEX operation. +@param index a stale index on which ADD INDEX operation was aborted */ +ATTRIBUTE_COLD void lock_discard_for_index(const dict_index_t &index); + +/*************************************************************//** +Updates the lock table when we have reorganized a page. 
NOTE: we copy +also the locks set on the infimum of the page; the infimum may carry +locks if an update of a record is occurring on the page, and its locks +were temporarily stored on the infimum. */ +void +lock_move_reorganize_page( +/*======================*/ + const buf_block_t* block, /*!< in: old index page, now + reorganized */ + const buf_block_t* oblock);/*!< in: copy of the old, not + reorganized page */ +/*************************************************************//** +Moves the explicit locks on user records to another page if a record +list end is moved to another page. */ +void +lock_move_rec_list_end( +/*===================*/ + const buf_block_t* new_block, /*!< in: index page to move to */ + const buf_block_t* block, /*!< in: index page */ + const rec_t* rec); /*!< in: record on page: this + is the first record moved */ +/*************************************************************//** +Moves the explicit locks on user records to another page if a record +list start is moved to another page. */ +void +lock_move_rec_list_start( +/*=====================*/ + const buf_block_t* new_block, /*!< in: index page to move to */ + const buf_block_t* block, /*!< in: index page */ + const rec_t* rec, /*!< in: record on page: + this is the first + record NOT copied */ + const rec_t* old_end); /*!< in: old + previous-to-last + record on new_page + before the records + were copied */ +/*************************************************************//** +Updates the lock table when a page is split to the right. */ +void +lock_update_split_right( +/*====================*/ + const buf_block_t* right_block, /*!< in: right page */ + const buf_block_t* left_block); /*!< in: left page */ +/*************************************************************//** +Updates the lock table when a page is merged to the right. */ +void +lock_update_merge_right( +/*====================*/ + const buf_block_t* right_block, /*!< in: right page to + which merged */ + const rec_t* orig_succ, /*!< in: original + successor of infimum + on the right page + before merge */ + const buf_block_t* left_block); /*!< in: merged index + page which will be + discarded */ +/** Update locks when the root page is copied to another in +btr_root_raise_and_insert(). Note that we leave lock structs on the +root page, even though they do not make sense on other than leaf +pages: the reason is that in a pessimistic update the infimum record +of the root page will act as a dummy carrier of the locks of the record +to be updated. */ +void lock_update_root_raise(const buf_block_t &block, const page_id_t root); +/** Update the lock table when a page is copied to another. +@param new_block the target page +@param old old page (not index root page) */ +void lock_update_copy_and_discard(const buf_block_t &new_block, page_id_t old); + +/** Update gap locks between the last record of the left_block and the +first record of the right_block when a record is about to be inserted +at the start of the right_block, even though it should "naturally" be +inserted as the last record of the left_block according to the +current node pointer in the parent page. + +That is, we assume that the lowest common ancestor of the left_block +and right_block routes the key of the new record to the left_block, +but a heuristic which tries to avoid overflowing left_block has chosen +to insert the record into right_block instead. 
Said ancestor performs
+this routing by comparing the key of the record to a "split point" -
+all records greater than or equal to the split point (node pointer)
+are in right_block, and smaller ones in left_block.
+The split point may be smaller than the smallest key in right_block.
+
+The gap between the last record on the left_block and the first record
+on the right_block is represented as a gap lock attached to the supremum
+pseudo-record of left_block, and a gap lock attached to the new first
+record of right_block.
+
+Thus, inserting the new record, and subsequently adjusting the node
+pointers in parent pages to values smaller than or equal to the new
+records' key, will mean that gap will be sliced at a different place
+("moved to the left"): fragment of the 1st gap will now become treated
+as 2nd. Therefore, we must copy any GRANTED locks from 1st gap to the
+2nd gap. Any WAITING locks must be of INSERT_INTENTION type (as no
+other GAP locks ever wait for anything) and can stay at 1st gap, as
+their only purpose is to notify the requester they can retry
+insertion, and there's no correctness requirement to avoid waking them
+up too soon.
+@param left_block	left page
+@param right_block	right page */
+void lock_update_node_pointer(const buf_block_t *left_block,
+                              const buf_block_t *right_block);
+/*************************************************************//**
+Updates the lock table when a page is split to the left. */
+void
+lock_update_split_left(
+/*===================*/
+	const buf_block_t*	right_block,	/*!< in: right page */
+	const buf_block_t*	left_block);	/*!< in: left page */
+/** Update the lock table when a page is merged to the left.
+@param left	left page
+@param orig_pred	original predecessor of supremum on the left page
+			before merge
+@param right	merged, to-be-discarded right page */
+void lock_update_merge_left(const buf_block_t& left, const rec_t *orig_pred,
+                            const page_id_t right);
+
+/** Update the locks when a page is split and merged to two pages,
+in defragmentation. */
+void lock_update_split_and_merge(
+	const buf_block_t* left_block,	/*!< in: left page to which merged */
+	const rec_t* orig_pred,		/*!< in: original predecessor of
+					supremum on the left page before merge*/
+	const buf_block_t* right_block);/*!< in: right page from which merged */
+/*************************************************************//**
+Resets the original locks on heir and replaces them with gap type locks
+inherited from rec. */
+void
+lock_rec_reset_and_inherit_gap_locks(
+/*=================================*/
+	const buf_block_t&	heir_block,	/*!< in: block containing the
+						record which inherits */
+	const page_id_t		donor,		/*!< in: page containing the
+						record from which inherited;
+						does NOT reset the locks on
+						this record */
+	ulint			heir_heap_no,	/*!< in: heap_no of the
+						inheriting record */
+	ulint			heap_no);	/*!< in: heap_no of the
+						donating record */
+/*************************************************************//**
+Updates the lock table when a page is discarded. */
+void
+lock_update_discard(
+/*================*/
+	const buf_block_t*	heir_block,	/*!< in: index page
+						which will inherit the locks */
+	ulint			heir_heap_no,	/*!< in: heap_no of the record
+						which will inherit the locks */
+	const buf_block_t*	block);		/*!< in: index page
+						which will be discarded */
+/*************************************************************//**
+Updates the lock table when a new user record is inserted.
*/ +void +lock_update_insert( +/*===============*/ + const buf_block_t* block, /*!< in: buffer block containing rec */ + const rec_t* rec); /*!< in: the inserted record */ +/*************************************************************//** +Updates the lock table when a record is removed. */ +void +lock_update_delete( +/*===============*/ + const buf_block_t* block, /*!< in: buffer block containing rec */ + const rec_t* rec); /*!< in: the record to be removed */ +/*********************************************************************//** +Stores on the page infimum record the explicit locks of another record. +This function is used to store the lock state of a record when it is +updated and the size of the record changes in the update. The record +is in such an update moved, perhaps to another page. The infimum record +acts as a dummy carrier record, taking care of lock releases while the +actual record is being moved. */ +void +lock_rec_store_on_page_infimum( +/*===========================*/ + const buf_block_t* block, /*!< in: buffer block containing rec */ + const rec_t* rec); /*!< in: record whose lock state + is stored on the infimum + record of the same page; lock + bits are reset on the + record */ +/** Restore the explicit lock requests on a single record, where the +state was stored on the infimum of a page. +@param block buffer block containing rec +@param rec record whose lock state is restored +@param donator page (rec is not necessarily on this page) +whose infimum stored the lock state; lock bits are reset on the infimum */ +void lock_rec_restore_from_page_infimum(const buf_block_t &block, + const rec_t *rec, page_id_t donator); + +/** +Create a table lock, without checking for deadlocks or lock compatibility. +@param table table on which the lock is created +@param type_mode lock type and mode +@param trx transaction +@param c_lock conflicting lock +@return the created lock object */ +lock_t *lock_table_create(dict_table_t *table, unsigned type_mode, trx_t *trx, + lock_t *c_lock= nullptr); + +/*********************************************************************//** +Checks if locks of other transactions prevent an immediate insert of +a record. If they do, first tests if the query thread should anyway +be suspended for some reason; if not, then puts the transaction and +the query thread to the lock wait state and inserts a waiting request +for a gap x-lock to the lock queue. +@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */ +dberr_t +lock_rec_insert_check_and_lock( +/*===========================*/ + const rec_t* rec, /*!< in: record after which to insert */ + buf_block_t* block, /*!< in/out: buffer block of rec */ + dict_index_t* index, /*!< in: index */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + bool* inherit)/*!< out: set to true if the new + inserted record maybe should inherit + LOCK_GAP type locks from the successor + record */ + MY_ATTRIBUTE((warn_unused_result)); + +/*********************************************************************//** +Checks if locks of other transactions prevent an immediate modify (update, +delete mark, or delete unmark) of a clustered index record. If they do, +first tests if the query thread should anyway be suspended for some +reason; if not, then puts the transaction and the query thread to the +lock wait state and inserts a waiting request for a record x-lock to the +lock queue. 
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */ +dberr_t +lock_clust_rec_modify_check_and_lock( +/*=================================*/ + const buf_block_t* block, /*!< in: buffer block of rec */ + const rec_t* rec, /*!< in: record which should be + modified */ + dict_index_t* index, /*!< in: clustered index */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + que_thr_t* thr) /*!< in: query thread */ + MY_ATTRIBUTE((warn_unused_result)); +/*********************************************************************//** +Checks if locks of other transactions prevent an immediate modify +(delete mark or delete unmark) of a secondary index record. +@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */ +dberr_t +lock_sec_rec_modify_check_and_lock( +/*===============================*/ + ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG + bit is set, does nothing */ + buf_block_t* block, /*!< in/out: buffer block of rec */ + const rec_t* rec, /*!< in: record which should be + modified; NOTE: as this is a secondary + index, we always have to modify the + clustered index record first: see the + comment below */ + dict_index_t* index, /*!< in: secondary index */ + que_thr_t* thr, /*!< in: query thread + (can be NULL if BTR_NO_LOCKING_FLAG) */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + MY_ATTRIBUTE((warn_unused_result)); +/*********************************************************************//** +Like lock_clust_rec_read_check_and_lock(), but reads a +secondary index record. +@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, or DB_DEADLOCK */ +dberr_t +lock_sec_rec_read_check_and_lock( +/*=============================*/ + ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG + bit is set, does nothing */ + const buf_block_t* block, /*!< in: buffer block of rec */ + const rec_t* rec, /*!< in: user record or page + supremum record which should + be read or passed over by a + read cursor */ + dict_index_t* index, /*!< in: secondary index */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + lock_mode mode, /*!< in: mode of the lock which + the read cursor should set on + records: LOCK_S or LOCK_X; the + latter is possible in + SELECT FOR UPDATE */ + unsigned gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP */ + que_thr_t* thr); /*!< in: query thread */ +/*********************************************************************//** +Checks if locks of other transactions prevent an immediate read, or passing +over by a read cursor, of a clustered index record. If they do, first tests +if the query thread should anyway be suspended for some reason; if not, then +puts the transaction and the query thread to the lock wait state and inserts a +waiting request for a record lock to the lock queue. Sets the requested mode +lock on the record. 
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, or DB_DEADLOCK */ +dberr_t +lock_clust_rec_read_check_and_lock( +/*===============================*/ + ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG + bit is set, does nothing */ + const buf_block_t* block, /*!< in: buffer block of rec */ + const rec_t* rec, /*!< in: user record or page + supremum record which should + be read or passed over by a + read cursor */ + dict_index_t* index, /*!< in: clustered index */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + lock_mode mode, /*!< in: mode of the lock which + the read cursor should set on + records: LOCK_S or LOCK_X; the + latter is possible in + SELECT FOR UPDATE */ + unsigned gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP */ + que_thr_t* thr); /*!< in: query thread */ +/*********************************************************************//** +Checks if locks of other transactions prevent an immediate read, or passing +over by a read cursor, of a clustered index record. If they do, first tests +if the query thread should anyway be suspended for some reason; if not, then +puts the transaction and the query thread to the lock wait state and inserts a +waiting request for a record lock to the lock queue. Sets the requested mode +lock on the record. This is an alternative version of +lock_clust_rec_read_check_and_lock() that does not require the parameter +"offsets". +@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */ +dberr_t +lock_clust_rec_read_check_and_lock_alt( +/*===================================*/ + ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG + bit is set, does nothing */ + const buf_block_t* block, /*!< in: buffer block of rec */ + const rec_t* rec, /*!< in: user record or page + supremum record which should + be read or passed over by a + read cursor */ + dict_index_t* index, /*!< in: clustered index */ + lock_mode mode, /*!< in: mode of the lock which + the read cursor should set on + records: LOCK_S or LOCK_X; the + latter is possible in + SELECT FOR UPDATE */ + unsigned gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or + LOCK_REC_NOT_GAP */ + que_thr_t* thr) /*!< in: query thread */ + MY_ATTRIBUTE((warn_unused_result)); + +/** Acquire a table lock. +@param table table to be locked +@param fktable pointer to table, in case of a FOREIGN key check +@param mode lock mode +@param thr SQL execution thread +@retval DB_SUCCESS if the lock was acquired +@retval DB_DEADLOCK if a deadlock occurred, or fktable && *fktable != table +@retval DB_LOCK_WAIT if lock_wait() must be invoked */ +dberr_t lock_table(dict_table_t *table, dict_table_t *const*fktable, + lock_mode mode, que_thr_t *thr) + MY_ATTRIBUTE((warn_unused_result)); + +/** Create a table lock object for a resurrected transaction. +@param table table to be X-locked +@param trx transaction +@param mode LOCK_X or LOCK_IX */ +void lock_table_resurrect(dict_table_t *table, trx_t *trx, lock_mode mode); + +/** Sets a lock on a table based on the given mode. +@param table table to lock +@param trx transaction +@param mode LOCK_X or LOCK_S +@param no_wait whether to skip handling DB_LOCK_WAIT +@return error code */ +dberr_t lock_table_for_trx(dict_table_t *table, trx_t *trx, lock_mode mode, + bool no_wait= false) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Exclusively lock the data dictionary tables. 
+@param trx dictionary transaction +@return error code +@retval DB_SUCCESS on success */ +dberr_t lock_sys_tables(trx_t *trx); + +/*************************************************************//** +Removes a granted record lock of a transaction from the queue and grants +locks to other transactions waiting in the queue if they now are entitled +to a lock. */ +void +lock_rec_unlock( +/*============*/ + trx_t* trx, /*!< in/out: transaction that has + set a record lock */ + const page_id_t id, /*!< in: page containing rec */ + const rec_t* rec, /*!< in: record */ + lock_mode lock_mode);/*!< in: LOCK_S or LOCK_X */ + +/** Release the explicit locks of a committing transaction, +and release possible other transactions waiting because of these locks. */ +void lock_release(trx_t* trx); + +/** Release the explicit locks of a committing transaction while +dict_sys.latch is exclusively locked, +and release possible other transactions waiting because of these locks. */ +void lock_release_on_drop(trx_t *trx); + +/** Release non-exclusive locks on XA PREPARE, +and release possible other transactions waiting because of these locks. */ +void lock_release_on_prepare(trx_t *trx); + +/** Release locks on a table whose creation is being rolled back */ +ATTRIBUTE_COLD void lock_release_on_rollback(trx_t *trx, dict_table_t *table); + +/**********************************************************************//** +Looks for a set bit in a record lock bitmap. Returns ULINT_UNDEFINED, +if none found. +@return bit index == heap number of the record, or ULINT_UNDEFINED if +none found */ +ulint +lock_rec_find_set_bit( +/*==================*/ + const lock_t* lock); /*!< in: record lock with at least one + bit set */ + +/*********************************************************************//** +Checks if a lock request lock1 has to wait for request lock2. +@return whether lock1 has to wait for lock2 to be removed */ +bool +lock_has_to_wait( +/*=============*/ + const lock_t* lock1, /*!< in: waiting lock */ + const lock_t* lock2); /*!< in: another lock; NOTE that it is + assumed that this has a lock bit set + on the same record as in lock1 if the + locks are record locks */ +/*********************************************************************//** +Reports that a transaction id is insensible, i.e., in the future. */ +ATTRIBUTE_COLD +void +lock_report_trx_id_insanity( +/*========================*/ + trx_id_t trx_id, /*!< in: trx id */ + const rec_t* rec, /*!< in: user record */ + dict_index_t* index, /*!< in: index */ + const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index) */ + trx_id_t max_trx_id); /*!< in: trx_sys.get_max_trx_id() */ +/*********************************************************************//** +Prints info of locks for all transactions. +@return FALSE if not able to acquire lock_sys.latch (and display info) */ +ibool +lock_print_info_summary( +/*====================*/ + FILE* file, /*!< in: file where to print */ + ibool nowait) /*!< in: whether to wait for lock_sys.latch */ + MY_ATTRIBUTE((warn_unused_result)); + +/** Prints transaction lock wait and MVCC state. +@param[in,out] file file where to print +@param[in] trx transaction +@param[in] now current my_hrtime_coarse() */ +void lock_trx_print_wait_and_mvcc_state(FILE *file, const trx_t *trx, + my_hrtime_t now); + +/*********************************************************************//** +Prints info of locks for each transaction. This function will release +lock_sys.latch, which the caller must be holding in exclusive mode. 
*/
+void
+lock_print_info_all_transactions(
+/*=============================*/
+	FILE*	file);	/*!< in: file where to print */
+
+/*********************************************************************//**
+Return the number of table locks for a transaction.
+The caller must be holding lock_sys.latch. */
+ulint
+lock_number_of_tables_locked(
+/*=========================*/
+	const trx_lock_t*	trx_lock)	/*!< in: transaction locks */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Check if there are any locks on a table.
+@return true if table has either table or record locks. */
+bool lock_table_has_locks(dict_table_t *table);
+
+/** Wait for a lock to be released.
+@retval DB_DEADLOCK if this transaction was chosen as the deadlock victim
+@retval DB_INTERRUPTED if the execution was interrupted by the user
+@retval DB_LOCK_WAIT_TIMEOUT if the lock wait timed out
+@retval DB_SUCCESS if the lock was granted */
+dberr_t lock_wait(que_thr_t *thr);
+/*********************************************************************//**
+Unlocks AUTO_INC type locks that were possibly reserved by a trx. This
+function should be called at the end of an SQL statement, by the
+connection thread that owns the transaction (trx->mysql_thd). */
+void
+lock_unlock_table_autoinc(
+/*======================*/
+	trx_t*	trx);	/*!< in/out: transaction */
+
+/** Handle a pending lock wait (DB_LOCK_WAIT) in a semi-consistent read
+while holding a clustered index leaf page latch.
+@param trx	transaction that is or was waiting for a lock
+@retval DB_SUCCESS if the lock was granted
+@retval DB_DEADLOCK if the transaction must be aborted due to a deadlock
+@retval DB_LOCK_WAIT if a lock wait would be necessary; the pending
+	lock request was released */
+dberr_t lock_trx_handle_wait(trx_t *trx);
+
+/*********************************************************************//**
+Checks that a transaction id is sensible, i.e., not in the future.
+@return true if ok */
+bool
+lock_check_trx_id_sanity(
+/*=====================*/
+	trx_id_t	trx_id,		/*!< in: trx id */
+	const rec_t*	rec,		/*!< in: user record */
+	dict_index_t*	index,		/*!< in: index */
+	const rec_offs*	offsets);	/*!< in: rec_get_offsets(rec, index) */
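The acquisition/wait split is visible in all of these contracts: an
acquisition function such as lock_table() enqueues a waiting request and
returns DB_LOCK_WAIT, and the caller then blocks in lock_wait(). A hedged
caller-side sketch assembled only from the contracts documented above; the
wrapper function itself is illustrative and does not exist in the source:

// Illustrative caller-side pattern; `table` and `thr` come from the caller.
dberr_t lock_table_with_wait(dict_table_t *table, que_thr_t *thr)
{
	dberr_t err = lock_table(table, nullptr, LOCK_IS, thr);

	if (err == DB_LOCK_WAIT) {
		/* A waiting request was enqueued; block until it is
		granted, times out, is interrupted, or this transaction
		is chosen as a deadlock victim. */
		err = lock_wait(thr);
	}

	return err;	/* DB_SUCCESS, DB_DEADLOCK, DB_LOCK_WAIT_TIMEOUT,
			or DB_INTERRUPTED */
}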
+#ifdef UNIV_DEBUG
+/*******************************************************************//**
+Check if the transaction holds any locks on the sys tables
+or its records.
+@return the strongest lock found on any sys table or 0 for none */
+const lock_t*
+lock_trx_has_sys_table_locks(
+/*=========================*/
+	const trx_t*	trx)	/*!< in: transaction to check */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Check if the transaction holds an explicit exclusive lock on a record.
+@param[in]	trx	transaction
+@param[in]	table	table
+@param[in]	id	leaf page identifier
+@param[in]	heap_no	heap number identifying the record
+@return whether an explicit X-lock is held */
+bool lock_trx_has_expl_x_lock(const trx_t &trx, const dict_table_t &table,
+                              page_id_t id, ulint heap_no);
+#endif /* UNIV_DEBUG */
+
+/** Lock operation struct */
+struct lock_op_t{
+	dict_table_t*	table;	/*!< table to be locked */
+	lock_mode	mode;	/*!< lock mode */
+};
+
+/** The lock system struct */
+class lock_sys_t
+{
+  friend struct LockGuard;
+  friend struct LockMultiGuard;
+  friend struct TMLockGuard;
+  friend struct TMLockMutexGuard;
+  friend struct TMLockTrxGuard;
+
+  /** Hash table latch */
+  struct hash_latch
+#ifdef SUX_LOCK_GENERIC
+  : private rw_lock
+  {
+    /** Wait for an exclusive lock */
+    void wait();
+    /** Try to acquire a lock */
+    bool try_acquire() { return write_trylock(); }
+    /** Acquire a lock */
+    void acquire() { if (!try_acquire()) wait(); }
+    /** Release a lock */
+    void release();
+    /** @return whether any lock is being held or waited for by any thread */
+    bool is_locked_or_waiting() const
+    { return rw_lock::is_locked_or_waiting(); }
+    /** @return whether this latch is possibly held by any thread */
+    bool is_locked() const { return rw_lock::is_locked(); }
+#else
+  {
+  private:
+    srw_spin_lock_low lock;
+  public:
+    /** Try to acquire a lock */
+    bool try_acquire() { return lock.wr_lock_try(); }
+    /** Acquire a lock */
+    void acquire() { lock.wr_lock(); }
+    /** Release a lock */
+    void release() { lock.wr_unlock(); }
+    /** @return whether any lock may be held by any thread */
+    bool is_locked_or_waiting() const noexcept
+    { return lock.is_locked_or_waiting(); }
+    /** @return whether this latch is possibly held by any thread */
+    bool is_locked() const noexcept { return lock.is_locked(); }
+#endif
+  };
+
+public:
+  struct hash_table
+  {
+    /** Number of consecutive array[] elements occupied by a hash_latch */
+    static constexpr size_t LATCH= sizeof(void*) >= sizeof(hash_latch) ? 1 : 2;
+    static_assert(sizeof(hash_latch) <= LATCH * sizeof(void*), "allocation");
+
+    /** Number of array[] elements per hash_latch.
+    Must be LATCH less than a power of 2. */
+    static constexpr size_t ELEMENTS_PER_LATCH= (64 / sizeof(void*)) - LATCH;
+    static constexpr size_t EMPTY_SLOTS_PER_LATCH=
+      ((CPU_LEVEL1_DCACHE_LINESIZE / 64) - 1) * (64 / sizeof(void*));
+
+    /** number of payload elements in array[]. Protected by lock_sys.latch. */
+    ulint n_cells;
+    /** the hash table, with pad(n_cells) elements, aligned to L1 cache size;
+    in any hash chain, lock_t::is_waiting() entries must not precede
+    granted locks */
+    hash_cell_t *array;
+
+    /** Create the hash table.
+    @param n  the lower bound of n_cells */
+    void create(ulint n);
+
+    /** Resize the hash table.
+    @param n  the lower bound of n_cells */
+    void resize(ulint n);
+
+    /** Free the hash table. */
+    void free() { aligned_free(array); array= nullptr; }
+
+    /** @return the index of an array element */
+    inline ulint calc_hash(ulint fold) const;
+
+    /** @return raw array index converted to padded index */
+    static ulint pad(ulint h)
+    {
+      ulint latches= LATCH * (h / ELEMENTS_PER_LATCH);
+      ulint empty_slots= (h / ELEMENTS_PER_LATCH) * EMPTY_SLOTS_PER_LATCH;
+      return LATCH + latches + empty_slots + h;
+    }
+
+    /** Get a latch. */
+    static hash_latch *latch(hash_cell_t *cell)
+    {
+      void *l= ut_align_down(cell, sizeof *cell *
+                             (ELEMENTS_PER_LATCH + LATCH));
+      return static_cast<hash_latch*>(l);
+    }
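pad() interleaves one latch slot ahead of every ELEMENTS_PER_LATCH payload
cells, so a latch shares a cache line with the cells it protects. A
standalone check of the arithmetic under stated assumptions (8-byte
pointers, 64-byte cache line, hence LATCH == 1 and no empty slots; the
real constants are build-dependent):

#include <cassert>
#include <cstddef>

// Assumed build parameters: sizeof(void*) == 8, 64-byte cache line.
static constexpr size_t LATCH= 1;
static constexpr size_t ELEMENTS_PER_LATCH= (64 / 8) - LATCH;  // 7
static constexpr size_t EMPTY_SLOTS_PER_LATCH= 0;

static size_t pad(size_t h)  // mirrors lock_sys_t::hash_table::pad()
{
  size_t latches= LATCH * (h / ELEMENTS_PER_LATCH);
  size_t empty_slots= (h / ELEMENTS_PER_LATCH) * EMPTY_SLOTS_PER_LATCH;
  return LATCH + latches + empty_slots + h;
}

int main()
{
  assert(pad(0) == 1);  // slot 0 holds the first latch
  assert(pad(6) == 7);  // last payload cell of the first cache line
  assert(pad(7) == 9);  // skips slot 8, which holds the second latch
}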
+ /** Get a hash table cell. */
+ inline hash_cell_t *cell_get(ulint fold) const;
+
+#ifdef UNIV_DEBUG
+ void assert_locked(const page_id_t id) const;
+#else
+ void assert_locked(const page_id_t) const {}
+#endif
+
+ private:
+ /** @return the hash value before any ELEMENTS_PER_LATCH padding */
+ static ulint hash(ulint fold, ulint n) { return ut_hash_ulint(fold, n); }
+
+ /** @return the index of an array element */
+ static ulint calc_hash(ulint fold, ulint n_cells)
+ {
+ return pad(hash(fold, n_cells));
+ }
+ };
+
+private:
+ bool m_initialised;
+
+ /** mutex protecting the locks */
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) srw_spin_lock latch;
+#ifdef UNIV_DEBUG
+ /** The owner of exclusive latch (0 if none); protected by latch */
+ std::atomic<pthread_t> writer{0};
+ /** Number of shared latches */
+ std::atomic<ulint> readers{0};
+#endif
+#ifdef SUX_LOCK_GENERIC
+protected:
+ /** mutex for hash_latch::wait() */
+ pthread_mutex_t hash_mutex;
+ /** condition variable for hash_latch::wait() */
+ pthread_cond_t hash_cond;
+#endif
+public:
+ /** record locks */
+ hash_table rec_hash;
+ /** predicate locks for SPATIAL INDEX */
+ hash_table prdt_hash;
+ /** page locks for SPATIAL INDEX */
+ hash_table prdt_page_hash;
+
+ /** mutex covering lock waits; @see trx_lock_t::wait_lock */
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t wait_mutex;
+private:
+ /** The increment of wait_count for a wait. Anything smaller is a
+ pending wait count. */
+ static constexpr uint64_t WAIT_COUNT_STEP= 1U << 19;
+ /** pending waits (in the bits below WAIT_COUNT_STEP) and cumulative
+ number of lock waits (in multiples of WAIT_COUNT_STEP); protected by
+ wait_mutex */
+ uint64_t wait_count;
+ /** Cumulative wait time; protected by wait_mutex */
+ uint64_t wait_time;
+ /** Longest wait time; protected by wait_mutex */
+ uint64_t wait_time_max;
+public:
+ /** number of deadlocks detected; protected by wait_mutex */
+ ulint deadlocks;
+ /** number of lock wait timeouts; protected by wait_mutex */
+ ulint timeouts;
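+
+ /* Example of the wait_count encoding (illustrative; not part of the
+ original source): after two waits have started and one has finished,
+ wait_count == 2 * WAIT_COUNT_STEP + 1, so get_wait_cumulative()
+ returns 2 (waits ever started) and get_wait_pending() returns 1
+ (still waiting), both decoded from the single packed counter. */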
+ /**
+ Constructor.
+
+ Some members may require late initialisation, thus we just mark object as
+ uninitialised. Real initialisation happens in create().
+ */
+ lock_sys_t(): m_initialised(false) {}
+
+
+ bool is_initialised() const { return m_initialised; }
+
+#ifdef UNIV_PFS_RWLOCK
+ /** Acquire exclusive lock_sys.latch */
+ ATTRIBUTE_NOINLINE
+ void wr_lock(const char *file, unsigned line);
+ /** Release exclusive lock_sys.latch */
+ ATTRIBUTE_NOINLINE void wr_unlock();
+ /** Acquire shared lock_sys.latch */
+ ATTRIBUTE_NOINLINE void rd_lock(const char *file, unsigned line);
+ /** Release shared lock_sys.latch */
+ ATTRIBUTE_NOINLINE void rd_unlock();
+#else
+ /** Acquire exclusive lock_sys.latch */
+ void wr_lock()
+ {
+ mysql_mutex_assert_not_owner(&wait_mutex);
+ ut_ad(!is_writer());
+ latch.wr_lock();
+ ut_ad(!writer.exchange(pthread_self(),
+ std::memory_order_relaxed));
+ }
+ /** Release exclusive lock_sys.latch */
+ void wr_unlock()
+ {
+ ut_ad(writer.exchange(0, std::memory_order_relaxed) ==
+ pthread_self());
+ latch.wr_unlock();
+ }
+ /** Acquire shared lock_sys.latch */
+ void rd_lock()
+ {
+ mysql_mutex_assert_not_owner(&wait_mutex);
+ ut_ad(!is_writer());
+ latch.rd_lock();
+ ut_ad(!writer.load(std::memory_order_relaxed));
+ ut_d(readers.fetch_add(1, std::memory_order_relaxed));
+ }
+ /** Release shared lock_sys.latch */
+ void rd_unlock()
+ {
+ ut_ad(!is_writer());
+ ut_ad(readers.fetch_sub(1, std::memory_order_relaxed));
+ latch.rd_unlock();
+ }
+#endif
+ /** Try to acquire exclusive lock_sys.latch
+ @return whether the latch was acquired */
+ bool wr_lock_try()
+ {
+ ut_ad(!is_writer());
+ if (!latch.wr_lock_try()) return false;
+ ut_ad(!writer.exchange(pthread_self(),
+ std::memory_order_relaxed));
+ return true;
+ }
+ /** Try to acquire shared lock_sys.latch
+ @return whether the latch was acquired */
+ bool rd_lock_try()
+ {
+ ut_ad(!is_writer());
+ if (!latch.rd_lock_try()) return false;
+ ut_ad(!writer.load(std::memory_order_relaxed));
+ ut_d(readers.fetch_add(1, std::memory_order_relaxed));
+ return true;
+ }
+
+ /** Assert that wr_lock() has been invoked by this thread */
+ void assert_locked() const { ut_ad(is_writer()); }
+ /** Assert that wr_lock() has not been invoked by this thread */
+ void assert_unlocked() const { ut_ad(!is_writer()); }
+#ifdef UNIV_DEBUG
+ /** @return whether the current thread is the lock_sys.latch writer */
+ bool is_writer() const
+ {
+# ifdef SUX_LOCK_GENERIC
+ return writer.load(std::memory_order_relaxed) == pthread_self();
+# else
+ return writer.load(std::memory_order_relaxed) == pthread_self() ||
+ (xtest() && !latch.is_locked_or_waiting());
+# endif
+ }
+ /** Assert that a lock shard is exclusively latched (by some thread) */
+ void assert_locked(const lock_t &lock) const;
+ /** Assert that a table lock shard is exclusively latched by this thread */
+ void assert_locked(const dict_table_t &table) const;
+ /** Assert that a hash table cell is exclusively latched (by some thread) */
+ void assert_locked(const hash_cell_t &cell) const;
+#else
+ void assert_locked(const lock_t &) const {}
+ void assert_locked(const dict_table_t &) const {}
+ void assert_locked(const hash_cell_t &) const {}
+#endif
+
+ /**
+ Creates the lock system at database start.
+
+ @param[in] n_cells number of slots in lock hash table
+ */
+ void create(ulint n_cells);
+
+
+ /**
+ Resize the lock hash table.
+
+ @param[in] n_cells number of slots in lock hash table
+ */
+ void resize(ulint n_cells);
+
+
+ /** Closes the lock system at database shutdown. */
+ void close();
+
+
+ /** Check for deadlocks while holding only lock_sys.wait_mutex. */
+ void deadlock_check();
+
+ /** Cancel a waiting lock request.
+ @tparam check_victim whether to check for DB_DEADLOCK
+ @param trx active transaction
+ @param lock waiting lock request
+ @retval DB_SUCCESS if no lock existed
+ @retval DB_DEADLOCK if trx->lock.was_chosen_as_deadlock_victim was set
+ @retval DB_LOCK_WAIT if the lock was canceled */
+ template<bool check_victim>
+ static dberr_t cancel(trx_t *trx, lock_t *lock);
+
+ /** Note that a record lock wait started */
+ inline void wait_start();
+
+ /** Note that a record lock wait resumed */
+ inline void wait_resume(THD *thd, my_hrtime_t start, my_hrtime_t now);
+
+ /** @return pending number of lock waits */
+ ulint get_wait_pending() const
+ {
+ return static_cast<ulint>(wait_count & (WAIT_COUNT_STEP - 1));
+ }
+ /** @return cumulative number of lock waits */
+ ulint get_wait_cumulative() const
+ { return static_cast<ulint>(wait_count / WAIT_COUNT_STEP); }
+ /** Cumulative wait time; protected by wait_mutex */
+ uint64_t get_wait_time_cumulative() const { return wait_time; }
+ /** Longest wait time; protected by wait_mutex */
+ uint64_t get_wait_time_max() const { return wait_time_max; }
+
+ /** Get the lock hash table for a mode */
+ hash_table &hash_get(ulint mode)
+ {
+ if (UNIV_LIKELY(!(mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE))))
+ return rec_hash;
+ return (mode & LOCK_PREDICATE) ? prdt_hash : prdt_page_hash;
+ }
+
+ /** Get the lock hash table for a predicate mode */
+ hash_table &prdt_hash_get(bool page)
+ { return page ? prdt_page_hash : prdt_hash; }
+
+ /** Get the first lock on a page.
+ @param cell hash table cell
+ @param id page identifier
+ @return first lock
+ @retval nullptr if none exists */
+ static inline lock_t *get_first(const hash_cell_t &cell, page_id_t id);
+
+ /** Get the first explicit lock request on a record.
+ @param cell first lock hash table cell
+ @param id page identifier
+ @param heap_no record identifier in page
+ @return first lock
+ @retval nullptr if none exists */
+ static inline lock_t *get_first(const hash_cell_t &cell, page_id_t id,
+ ulint heap_no);
+
+ /** Remove locks on a discarded SPATIAL INDEX page.
+ @param id page to be discarded
+ @param all whether to also remove locks from lock_sys.prdt_hash */
+ void prdt_page_free_from_discard(const page_id_t id, bool all= false);
+
+ /** Cancel possible lock waiting for a transaction */
+ static void cancel_lock_wait_for_trx(trx_t *trx);
+#ifdef WITH_WSREP
+ /** Cancel lock waiting for a wsrep BF abort. */
+ static void cancel_lock_wait_for_wsrep_bf_abort(trx_t *trx);
+#endif /* WITH_WSREP */
+};
+
+/** The lock system */
+extern lock_sys_t lock_sys;
+
+/** @return the index of an array element */
+inline ulint lock_sys_t::hash_table::calc_hash(ulint fold) const
+{
+ ut_ad(lock_sys.is_writer() || lock_sys.readers);
+ return calc_hash(fold, n_cells);
+}
+
+/** Get a hash table cell. */
+inline hash_cell_t *lock_sys_t::hash_table::cell_get(ulint fold) const
+{
+ ut_ad(lock_sys.is_writer() || lock_sys.readers);
+ return &array[calc_hash(fold)];
+}
+
+/** Get the first lock on a page.
+@param cell hash table cell
+@param id page identifier
+@return first lock
+@retval nullptr if none exists */
+inline lock_t *lock_sys_t::get_first(const hash_cell_t &cell, page_id_t id)
+{
+ lock_sys.assert_locked(cell);
+ for (auto lock= static_cast<lock_t*>(cell.node); lock; lock= lock->hash)
+ {
+ ut_ad(!lock->is_table());
+ if (lock->un_member.rec_lock.page_id == id)
+ return lock;
+ }
+ return nullptr;
+}
+
+/** lock_sys.latch exclusive guard */
+struct LockMutexGuard
+{
+ LockMutexGuard(SRW_LOCK_ARGS(const char *file, unsigned line))
+ { lock_sys.wr_lock(SRW_LOCK_ARGS(file, line)); }
+ ~LockMutexGuard() { lock_sys.wr_unlock(); }
+};
+
+/** lock_sys latch guard for 1 page_id_t */
+struct LockGuard
+{
+ LockGuard(lock_sys_t::hash_table &hash, const page_id_t id);
+ ~LockGuard()
+ {
+ lock_sys_t::hash_table::latch(cell_)->release();
+ /* Must be last, to avoid a race with lock_sys_t::hash_table::resize() */
+ lock_sys.rd_unlock();
+ }
+ /** @return the hash array cell */
+ hash_cell_t &cell() const { return *cell_; }
+private:
+ /** The hash array cell */
+ hash_cell_t *cell_;
+};
+
+/** lock_sys latch guard for 2 page_id_t */
+struct LockMultiGuard
+{
+ LockMultiGuard(lock_sys_t::hash_table &hash,
+ const page_id_t id1, const page_id_t id2);
+ ~LockMultiGuard();
+
+ /** @return the first hash array cell */
+ hash_cell_t &cell1() const { return *cell1_; }
+ /** @return the second hash array cell */
+ hash_cell_t &cell2() const { return *cell2_; }
+private:
+ /** The first hash array cell */
+ hash_cell_t *cell1_;
+ /** The second hash array cell */
+ hash_cell_t *cell2_;
+};
+
+/** lock_sys.latch exclusive guard using transactional memory */
+struct TMLockMutexGuard
+{
+ TRANSACTIONAL_INLINE
+ TMLockMutexGuard(SRW_LOCK_ARGS(const char *file, unsigned line))
+ {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (xbegin())
+ {
+ if (was_elided())
+ return;
+ xabort();
+ }
+#endif
+ lock_sys.wr_lock(SRW_LOCK_ARGS(file, line));
+ }
+ TRANSACTIONAL_INLINE
+ ~TMLockMutexGuard()
+ {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (was_elided()) xend(); else
+#endif
+ lock_sys.wr_unlock();
+ }
+
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ bool was_elided() const noexcept
+ { return !lock_sys.latch.is_locked_or_waiting(); }
+#else
+ bool was_elided() const noexcept { return false; }
+#endif
+};
+
+/** lock_sys latch guard for 1 page_id_t, using transactional memory */
+struct TMLockGuard
+{
+ TRANSACTIONAL_TARGET
+ TMLockGuard(lock_sys_t::hash_table &hash, const page_id_t id);
+ TRANSACTIONAL_INLINE ~TMLockGuard()
+ {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (elided)
+ {
+ xend();
+ return;
+ }
+#endif
+ lock_sys_t::hash_table::latch(cell_)->release();
+ /* Must be last, to avoid a race with lock_sys_t::hash_table::resize() */
+ lock_sys.rd_unlock();
+ }
+ /** @return the hash array cell */
+ hash_cell_t &cell() const { return *cell_; }
+private:
+ /** The hash array cell */
+ hash_cell_t *cell_;
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ /** whether the latches were elided */
+ bool elided;
+#endif
+};
+
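+/* Usage sketch (illustrative; not part of the original header): the guards
+above encapsulate the two-level latching protocol. For example,
+
+ {
+ LockGuard g{lock_sys.rec_hash, page_id};
+ // g.cell() is the latched hash cell; the lock queue of page_id
+ // may be inspected or modified here
+ }
+
+acquires lock_sys.latch in shared mode plus the one hash_latch covering
+the cell of page_id, and the destructor releases both in reverse order. */
+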
+/** guard for shared lock_sys.latch and trx_t::mutex using
+transactional memory */
+struct TMLockTrxGuard
+{
+ trx_t &trx;
+
+ TRANSACTIONAL_INLINE
+#ifndef UNIV_PFS_RWLOCK
+ TMLockTrxGuard(trx_t &trx) : trx(trx)
+# define TMLockTrxArgs(trx) trx
+#else
+ TMLockTrxGuard(const char *file, unsigned line, trx_t &trx) : trx(trx)
+# define TMLockTrxArgs(trx) SRW_LOCK_CALL, trx
+#endif
+ {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (xbegin())
+ {
+ if (!lock_sys.latch.is_write_locked() && was_elided())
+ return;
+ xabort();
+ }
+#endif
+ lock_sys.rd_lock(SRW_LOCK_ARGS(file, line));
+ trx.mutex_lock();
+ }
+ TRANSACTIONAL_INLINE
+ ~TMLockTrxGuard()
+ {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (was_elided())
+ {
+ xend();
+ return;
+ }
+#endif
+ lock_sys.rd_unlock();
+ trx.mutex_unlock();
+ }
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ bool was_elided() const noexcept { return !trx.mutex_is_locked(); }
+#else
+ bool was_elided() const noexcept { return false; }
+#endif
+};
+
+/** guard for trx_t::mutex using transactional memory */
+struct TMTrxGuard
+{
+ trx_t &trx;
+
+ TRANSACTIONAL_INLINE TMTrxGuard(trx_t &trx) : trx(trx)
+ {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (xbegin())
+ {
+ if (was_elided())
+ return;
+ xabort();
+ }
+#endif
+ trx.mutex_lock();
+ }
+ TRANSACTIONAL_INLINE ~TMTrxGuard()
+ {
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (was_elided())
+ {
+ xend();
+ return;
+ }
+#endif
+ trx.mutex_unlock();
+ }
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ bool was_elided() const noexcept { return !trx.mutex_is_locked(); }
+#else
+ bool was_elided() const noexcept { return false; }
+#endif
+};
+
+/*********************************************************************//**
+Creates a new record lock and inserts it into the lock queue. Does NOT check
+for deadlocks or lock compatibility!
+@return created lock */
+UNIV_INLINE
+lock_t*
+lock_rec_create(
+/*============*/
+ lock_t* c_lock, /*!< conflicting lock */
+ unsigned type_mode,/*!< in: lock mode and wait flag */
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ ulint heap_no,/*!< in: heap number of the record */
+ dict_index_t* index, /*!< in: index of record */
+ trx_t* trx, /*!< in,out: transaction */
+ bool caller_owns_trx_mutex);
+ /*!< in: true if caller owns
+ trx mutex */
+
+/** Remove a record lock request, waiting or granted, on a discarded page
+@param lock_hash hash table
+@param in_lock lock object */
+void lock_rec_discard(lock_sys_t::hash_table &lock_hash, lock_t *in_lock);
+
+/** Create a new record lock and insert it into the lock queue,
+without checking for deadlocks or conflicts.
+@param[in] c_lock conflicting lock, or NULL
+@param[in] type_mode lock mode and wait flag
+@param[in] page_id index page number
+@param[in] page R-tree index page, or NULL
+@param[in] heap_no record heap number in the index page
+@param[in] index the index tree
+@param[in,out] trx transaction
+@param[in] holds_trx_mutex whether the caller holds trx->mutex
+@return created lock */
+lock_t*
+lock_rec_create_low(
+ lock_t* c_lock,
+ unsigned type_mode,
+ const page_id_t page_id,
+ const page_t* page,
+ ulint heap_no,
+ dict_index_t* index,
+ trx_t* trx,
+ bool holds_trx_mutex);
+
+/** Enqueue a waiting request for a lock which cannot be granted immediately.
+Check for deadlocks.
+@param[in] c_lock conflicting lock
+@param[in] type_mode the requested lock mode (LOCK_S or LOCK_X)
+ possibly ORed with LOCK_GAP or
+ LOCK_REC_NOT_GAP, ORed with
+ LOCK_INSERT_INTENTION if this
+ waiting lock request is set
+ when performing an insert of
+ an index record
+@param[in] id page identifier
+@param[in] page leaf page in the index
+@param[in] heap_no record heap number in the block
+@param[in] index index tree
+@param[in,out] thr query thread
+@param[in] prdt minimum bounding box (spatial index)
+@retval DB_LOCK_WAIT if the waiting lock was enqueued
+@retval DB_DEADLOCK if this transaction was chosen as the victim */
+dberr_t
+lock_rec_enqueue_waiting(
+ lock_t* c_lock,
+ unsigned type_mode,
+ const page_id_t id,
+ const page_t* page,
+ ulint heap_no,
+ dict_index_t* index,
+ que_thr_t* thr,
+ lock_prdt_t* prdt);
+/*************************************************************//**
+Moves the explicit locks on user records to another page if a record
+list start is moved to another page. */
+void
+lock_rtr_move_rec_list(
+/*===================*/
+ const buf_block_t* new_block, /*!< in: index page to
+ move to */
+ const buf_block_t* block, /*!< in: index page */
+ rtr_rec_move_t* rec_move, /*!< in: recording records
+ moved */
+ ulint num_move); /*!< in: num of rec to move */
+
+#include "lock0lock.inl"
+
+#endif
diff --git a/storage/innobase/include/lock0lock.inl b/storage/innobase/include/lock0lock.inl
new file mode 100644
index 00000000..1b9255ff
--- /dev/null
+++ b/storage/innobase/include/lock0lock.inl
@@ -0,0 +1,78 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0lock.ic
+The transaction lock system
+
+Created 5/7/1996 Heikki Tuuri
+*******************************************************/
+
+#include "dict0dict.h"
+#include "buf0buf.h"
+#include "page0page.h"
+
+/*********************************************************************//**
+Gets the heap_no of the smallest user record on a page.
+@return heap_no of smallest user record, or PAGE_HEAP_NO_SUPREMUM */
+UNIV_INLINE
+ulint
+lock_get_min_heap_no(
+/*=================*/
+ const buf_block_t* block) /*!< in: buffer block */
+{
+ const page_t* page = block->page.frame;
+
+ if (page_is_comp(page)) {
+ return(rec_get_heap_no_new(
+ page
+ + rec_get_next_offs(page + PAGE_NEW_INFIMUM,
+ TRUE)));
+ } else {
+ return(rec_get_heap_no_old(
+ page
+ + rec_get_next_offs(page + PAGE_OLD_INFIMUM,
+ FALSE)));
+ }
+}
+
+/*********************************************************************//**
+Creates a new record lock and inserts it into the lock queue. Does NOT check
+for deadlocks or lock compatibility!
+@return created lock */
+UNIV_INLINE
+lock_t*
+lock_rec_create(
+/*============*/
+ lock_t* c_lock, /*!< conflicting lock */
+ unsigned type_mode,/*!< in: lock mode and wait flag */
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ ulint heap_no,/*!< in: heap number of the record */
+ dict_index_t* index, /*!< in: index of record */
+ trx_t* trx, /*!< in,out: transaction */
+ bool caller_owns_trx_mutex)
+ /*!< in: true if caller owns
+ trx mutex */
+{
+ return lock_rec_create_low(
+ c_lock,
+ type_mode, block->page.id(), block->page.frame, heap_no,
+ index, trx, caller_owns_trx_mutex);
+}
diff --git a/storage/innobase/include/lock0prdt.h b/storage/innobase/include/lock0prdt.h
new file mode 100644
index 00000000..db8e3392
--- /dev/null
+++ b/storage/innobase/include/lock0prdt.h
@@ -0,0 +1,192 @@
+/*****************************************************************************
+
+Copyright (c) 2014, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0prdt.h
+The predicate lock system
+
+Created 9/7/2013 Jimmy Yang
+*******************************************************/
+#ifndef lock0prdt_h
+#define lock0prdt_h
+
+#include "lock0lock.h"
+
+/* Predicate lock data */
+typedef struct lock_prdt {
+ void* data; /* Predicate data */
+ uint16 op; /* Predicate operator */
+} lock_prdt_t;
+
+/*********************************************************************//**
+Acquire a predicate lock on a block
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_prdt_lock(
+/*===========*/
+ buf_block_t* block, /*!< in/out: buffer block of rec */
+ lock_prdt_t* prdt, /*!< in: Predicate for the lock */
+ dict_index_t* index, /*!< in: secondary index */
+ enum lock_mode mode, /*!< in: mode of the lock which
+ the read cursor should set on
+ records: LOCK_S or LOCK_X; the
+ latter is possible in
+ SELECT FOR UPDATE */
+ unsigned type_mode,
+ /*!< in: LOCK_PREDICATE or LOCK_PRDT_PAGE */
+ que_thr_t* thr); /*!< in: query thread
+ (can be NULL if BTR_NO_LOCKING_FLAG) */
+
+/*********************************************************************//**
+Acquire a "Page" lock on a block
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_place_prdt_page_lock(
+ const page_id_t page_id, /*!< in: page identifier */
+ dict_index_t* index, /*!< in: secondary index */
+ que_thr_t* thr); /*!< in: query thread */
+
+/*********************************************************************//**
+Initialize a predicate lock from an MBR */
+void
+lock_init_prdt_from_mbr(
+/*====================*/
+ lock_prdt_t* prdt, /*!< in/out: predicate to be initialized */
+ rtr_mbr_t* mbr, /*!< in: Minimum Bounding Rectangle */
+ ulint mode, /*!< in: Search mode */
+ mem_heap_t* heap); /*!< in: heap for allocating memory */
+
+/*********************************************************************//**
+Get predicate lock's minimum bounding box
+@return the minimum bounding box */
+lock_prdt_t*
+lock_get_prdt_from_lock(
+/*====================*/
+ const lock_t* lock); /*!< in: the lock */
+
+/*********************************************************************//**
+Checks if a predicate lock request for a new lock has to wait for
+request lock2.
+@return true if new lock has to wait for lock2 to be removed */
+bool
+lock_prdt_has_to_wait(
+/*==================*/
+ const trx_t* trx, /*!< in: trx of new lock */
+ unsigned type_mode,/*!< in: precise mode of the new lock
+ to set: LOCK_S or LOCK_X, possibly
+ ORed to LOCK_PREDICATE or LOCK_PRDT_PAGE,
+ LOCK_INSERT_INTENTION */
+ lock_prdt_t* prdt, /*!< in: lock predicate to check */
+ const lock_t* lock2); /*!< in: another record lock; NOTE that
+ it is assumed that this has a lock bit
+ set on the same record as in the new
+ lock we are setting */
+
+/**************************************************************//**
+Update predicate lock when page splits */
+void
+lock_prdt_update_split(
+/*===================*/
+ buf_block_t* new_block, /*!< in/out: the new half page */
+ lock_prdt_t* prdt, /*!< in: MBR on the old page */
+ lock_prdt_t* new_prdt, /*!< in: MBR on the new page */
+ const page_id_t page_id); /*!< in: page number */
+
+/**************************************************************//**
+Adjust locks from an ancestor page of an R-tree at the appropriate level. */
+void
+lock_prdt_update_parent(
+/*====================*/
+ buf_block_t* left_block, /*!< in/out: page to be split */
+ buf_block_t* right_block, /*!< in/out: the new half page */
+ lock_prdt_t* left_prdt, /*!< in: MBR on the old page */
+ lock_prdt_t* right_prdt, /*!< in: MBR on the new page */
+ const page_id_t page_id); /*!< in: parent page */
+
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate insert of
+a predicate record.
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_prdt_insert_check_and_lock(
+/*============================*/
+ const rec_t* rec, /*!< in: record after which to insert */
+ buf_block_t* block, /*!< in/out: buffer block of rec */
+ dict_index_t* index, /*!< in: index */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ lock_prdt_t* prdt); /*!< in: Minimum Bound Rectangle */
+
+/*********************************************************************//**
+Append a predicate to the lock */
+void
+lock_prdt_set_prdt(
+/*===============*/
+ lock_t* lock, /*!< in: lock */
+ const lock_prdt_t* prdt); /*!< in: Predicate */
+
+#if 0
+
+/*********************************************************************//**
+Checks if a predicate lock request for a new lock has to wait for
+request lock2.
+@return true if new lock has to wait for lock2 to be removed */
+UNIV_INLINE
+bool
+lock_prdt_has_to_wait(
+/*==================*/
+ const trx_t* trx, /*!< in: trx of new lock */
+ unsigned type_mode,/*!< in: precise mode of the new lock
+ to set: LOCK_S or LOCK_X, possibly
+ ORed to LOCK_PREDICATE or LOCK_PRDT_PAGE,
+ LOCK_INSERT_INTENTION */
+ lock_prdt_t* prdt, /*!< in: lock predicate to check */
+ const lock_t* lock2); /*!< in: another record lock; NOTE that
+ it is assumed that this has a lock bit
+ set on the same record as in the new
+ lock we are setting */
+
+/*********************************************************************//**
+Get predicate lock's minimum bounding box
+@return the minimum bounding box */
+UNIV_INLINE
+rtr_mbr_t*
+prdt_get_mbr_from_prdt(
+/*===================*/
+ const lock_prdt_t* prdt); /*!< in: the lock predicate */
+
+
+#endif
+/*************************************************************//**
+Moves the locks of a record to another record and resets the lock bits of
+the donating record. */
+void
+lock_prdt_rec_move(
+/*===============*/
+ const buf_block_t* receiver, /*!< in: buffer block containing
+ the receiving record */
+ const page_id_t donator); /*!< in: page from which the
+ locks are moved */
+
+/** Check whether there are R-tree page locks on a page
+@param[in] trx trx to test the lock
+@param[in] page_id page identifier
+@return true if there is none */
+bool lock_test_prdt_page_lock(const trx_t *trx, const page_id_t page_id);
+
+#endif
diff --git a/storage/innobase/include/lock0priv.h b/storage/innobase/include/lock0priv.h
new file mode 100644
index 00000000..b0a5f7aa
--- /dev/null
+++ b/storage/innobase/include/lock0priv.h
@@ -0,0 +1,582 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0priv.h
+Lock module internal structures and methods.
+
+Created July 12, 2007 Vasil Dimov
+*******************************************************/
+
+#ifndef lock0priv_h
+#define lock0priv_h
+
+#ifndef LOCK_MODULE_IMPLEMENTATION
+/* If you need to access members of the structures defined in this
+file, please write appropriate functions that retrieve them and put
+those functions in lock/ */
+#error Do not include lock0priv.h outside of the lock/ module
+#endif
+
+#include "hash0hash.h"
+#include "rem0types.h"
+#include "trx0trx.h"
+
+#ifndef UINT32_MAX
+#define UINT32_MAX (4294967295U)
+#endif
+
+/** Print the table lock into the given output stream
+@param[in,out] out the output stream
+@return the given output stream. */
+inline
+std::ostream& lock_table_t::print(std::ostream& out) const
+{
+ out << "[lock_table_t: name=" << table->name << "]";
+ return(out);
+}
+
+/** The global output operator is overloaded to conveniently
+print the lock_table_t object into the given output stream.
+@param[in,out] out the output stream
+@param[in] lock the table lock
+@return the given output stream */
+inline
+std::ostream&
+operator<<(std::ostream& out, const lock_table_t& lock)
+{
+ return(lock.print(out));
+}
+
+inline
+std::ostream&
+ib_lock_t::print(std::ostream& out) const
+{
+ static_assert(LOCK_MODE_MASK == 7, "compatibility");
+ static_assert(LOCK_IS == 0, "compatibility");
+ static_assert(LOCK_IX == 1, "compatibility");
+ static_assert(LOCK_S == 2, "compatibility");
+ static_assert(LOCK_X == 3, "compatibility");
+ static_assert(LOCK_AUTO_INC == 4, "compatibility");
+ static_assert(LOCK_NONE == 5, "compatibility");
+ static_assert(LOCK_NONE_UNSET == 7, "compatibility");
+ const char *const modes[8]=
+ { "IS", "IX", "S", "X", "AUTO_INC", "NONE", "?", "NONE_UNSET" };
+
+ out << "[lock_t: type_mode=" << type_mode << "(" << type_string()
+ << " | LOCK_" << modes[mode()];
+
+ if (is_record_not_gap())
+ out << " | LOCK_REC_NOT_GAP";
+ if (is_waiting())
+ out << " | LOCK_WAIT";
+
+ if (is_gap())
+ out << " | LOCK_GAP";
+
+ if (is_insert_intention())
+ out << " | LOCK_INSERT_INTENTION";
+
+ out << ")";
+
+ if (is_table())
+ out << un_member.tab_lock;
+ else
+ out << un_member.rec_lock;
+
+ out << "]";
+ return out;
+}
+
+inline
+std::ostream&
+operator<<(std::ostream& out, const ib_lock_t& lock)
+{
+ return(lock.print(out));
+}
+
+#ifdef UNIV_DEBUG
+extern ibool lock_print_waits;
+#endif /* UNIV_DEBUG */
+
+/* An explicit record lock affects both the record and the gap before it.
+An implicit x-lock does not affect the gap, it only locks the index
+record from read or update.
+
+If a transaction has modified or inserted an index record, then
+it owns an implicit x-lock on the record. On a secondary index record,
+a transaction has an implicit x-lock also if it has modified the
+clustered index record, the max trx id of the page where the secondary
+index record resides is >= trx id of the transaction (or database recovery
+is running), and there are no explicit non-gap lock requests on the
+secondary index record.
+
+This complicated definition for a secondary index comes from the
+implementation: we want to be able to determine if a secondary index
+record has an implicit x-lock, just by looking at the present clustered
+index record, not at the historical versions of the record. The
+complicated definition can be explained to the user so that there is
+nondeterminism in the access path when a query is answered: we may,
+or may not, access the clustered index record and thus may, or may not,
+bump into an x-lock set there.
+
+Different transactions can have conflicting locks set on the gap at the
+same time. The locks on the gap are purely inhibitive: an insert cannot
+be made, or a select cursor may have to wait if a different transaction
+has a conflicting lock on the gap. An x-lock on the gap does not give
+the right to insert into the gap.
+
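+For example (an illustrative scenario, not part of the original comment):
+let records A and C have heap numbers 2 and 3, and let transaction T1 hold
+an ordinary next-key x-lock on C. The lock bit for heap number 3 then
+covers both C and the gap between A and C: a transaction T2 can neither
+modify C nor insert a record B into that gap. T2 may nevertheless be
+granted its own gap-type lock on the same gap, because gap locks are
+purely inhibitive and do not conflict with each other.
+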
+An explicit lock can be placed on a user record or the supremum record of
+a page. The locks on the supremum record are always thought to be of the gap
+type, though the gap bit is not set. When we perform an update of a record
+where the size of the record changes, we may temporarily store its explicit
+locks on the infimum record of the page, though the infimum otherwise never
+carries locks.
+
+A waiting record lock can also be of the gap type. A waiting lock request
+can be granted when there is no conflicting mode lock request by another
+transaction ahead of it in the explicit lock queue.
+
+In version 4.0.5 we added yet another explicit lock type: LOCK_REC_NOT_GAP.
+It only locks the record it is placed on, not the gap before the record.
+This lock type is necessary to emulate an Oracle-like READ COMMITTED isolation
+level.
+
+-------------------------------------------------------------------------
+RULE 1: If there is an implicit x-lock on a record, and there are non-gap
+-------
+lock requests waiting in the queue, then the transaction holding the implicit
+x-lock also has an explicit non-gap record x-lock. Therefore, as locks are
+released, we can grant locks to waiting lock requests purely by looking at
+the explicit lock requests in the queue.
+
+RULE 3: Different transactions cannot have conflicting granted non-gap locks
+-------
+on a record at the same time. However, they can have conflicting granted gap
+locks.
+
+RULE 4: If there is a waiting lock request in a queue, no lock request,
+-------
+gap or not, can be inserted ahead of it in the queue. In record deletes
+and page splits new gap type locks can be created by the database manager
+for a transaction, and without rule 4, the waits-for graph of transactions
+might become cyclic without the database noticing it, as the deadlock check
+is only performed when a transaction itself requests a lock!
+-------------------------------------------------------------------------
+
+An insert is allowed to a gap if there are no explicit lock requests by
+other transactions on the next record. It does not matter if these lock
+requests are granted or waiting, gap bit set or not, with the exception
+that a gap type request set by another transaction to wait for
+its turn to do an insert is ignored. On the other hand, an
+implicit x-lock by another transaction does not prevent an insert, which
+allows for more concurrency when using an Oracle-style sequence number
+generator for the primary key with many transactions doing inserts
+concurrently.
+
+A modify of a record is allowed if the transaction has an x-lock on the
+record, or if other transactions do not have any non-gap lock requests on the
+record.
+
+A read of a single user record with a cursor is allowed if the transaction
+has a non-gap explicit, or an implicit lock on the record, or if the other
+transactions have no x-lock requests on the record. At a page supremum a
+read is always allowed.
+
+In summary, an implicit lock is seen as a granted x-lock only on the
+record, not on the gap. An explicit lock with no gap bit set is a lock
+both on the record and the gap. If the gap bit is set, the lock is only
+on the gap. Different transactions cannot own conflicting locks on the
+record at the same time, but they may own conflicting locks on the gap.
+Granted locks on a record give an access right to the record, but gap type
+locks just inhibit operations.
+
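+These rules condense into the following informal sketch (an approximation
+added for illustration; the authoritative check is lock_rec_has_to_wait()
+in lock0lock.cc, which handles further special cases):
+
+ has_to_wait(request, held) :=
+ the two modes conflict per the compatibility matrix below
+ AND NOT (request is gap-only AND held is LOCK_REC_NOT_GAP)
+ AND NOT (request is LOCK_REC_NOT_GAP AND held is gap-only)
+ AND NOT (request is LOCK_INSERT_INTENTION AND held is a waiting
+ gap-type request enqueued for another insert)
+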
+NOTE: Finding out if some transaction has an implicit x-lock on a secondary
+index record can be cumbersome. We may have to look at previous versions of
+the corresponding clustered index record to find out if a delete marked
+secondary index record was delete marked by an active transaction, not by
+a committed one.
+
+FACT A: If a transaction has inserted a row, it can delete it any time
+without need to wait for locks.
+
+PROOF: The transaction has an implicit x-lock on every index record inserted
+for the row, and can thus modify each record without the need to wait. Q.E.D.
+
+FACT B: If a transaction has read some result set with a cursor, it can read
+it again, and retrieves the same result set, if it has not modified the
+result set in the meantime. Hence, there is no phantom problem. If the
+biggest record, in the alphabetical order, touched by the cursor is removed,
+a lock wait may occur, otherwise not.
+
+PROOF: When a read cursor proceeds, it sets an s-lock on each user record
+it passes, and a gap type s-lock on each page supremum. The cursor must
+wait until it has these locks granted. Then no other transaction can
+have a granted x-lock on any of the user records, and therefore cannot
+modify the user records. Neither can any other transaction insert into
+the gaps which were passed over by the cursor. Page splits and merges,
+and removal of obsolete versions of records do not affect this, because
+when a user record or a page supremum is removed, the next record inherits
+its locks as gap type locks, and therefore blocks inserts to the same gap.
+Also, if a page supremum is inserted, it inherits its locks from the successor
+record. When the cursor is positioned again at the start of the result set,
+the records it will touch on its course are either records it touched
+during the last pass or new inserted page supremums. It can immediately
+access all these records, and when it arrives at the biggest record, it
+notices that the result set is complete. If the biggest record was removed,
+lock wait can occur because the next record only inherits a gap type lock,
+and a wait may be needed. Q.E.D. */
+
+/* If an index record should be changed or a new one inserted, we must check
+the lock on the record or the next. When a read cursor starts reading,
+we will set a record level s-lock on each record it passes, except on the
+initial record on which the cursor is positioned before we start to fetch
+records. Our index tree search has the convention that the B-tree
+cursor is positioned BEFORE the first possibly matching record in
+the search. Optimizations are possible here: if the record is searched
+on an equality condition to a unique key, we could actually set a special
+lock on the record, a lock which would not prevent any insert before
+this record. In the next key locking an x-lock set on a record also
+prevents inserts just before that record.
+ There are special infimum and supremum records on each page.
+A supremum record can be locked by a read cursor. This record cannot be
+updated but the lock prevents insert of a user record to the end of
+the page.
+ Next key locks will prevent the phantom problem where new rows
+could appear to SELECT result sets after the select operation has been
+performed. Prevention of phantoms ensures the serializability of
+transactions.
+ What should we check if an insert of a new record is wanted?
+Only the lock on the next record on the same page, because also the
+supremum record can carry a lock. An s-lock prevents insertion, but
+what about an x-lock?
+If it was set by a searched update, then there is implicitly an s-lock,
+too, and the insert should be prevented.
+What if our transaction owns an x-lock on the next record, but there is
+a waiting s-lock request on the next record? If this s-lock was placed
+by a read cursor moving in the ascending order in the index, we cannot
+do the insert immediately, because when we finally commit our transaction,
+the read cursor should see also the new inserted record. So we should
+move the read cursor backward from the next record for it to pass over
+the new inserted record. This move backward may be too cumbersome to
+implement. If we in this situation just enqueue a second x-lock request
+for our transaction on the next record, then the deadlock mechanism
+notices a deadlock between our transaction and the s-lock request
+transaction. This seems to be an ok solution.
+ We could have the convention that granted explicit record locks,
+lock the corresponding records from changing, and also lock the gaps
+before them from inserting. A waiting explicit lock request locks the gap
+before from inserting. Implicit record x-locks, which we derive from the
+transaction id in the clustered index record, only lock the record itself
+from modification, not the gap before it from inserting.
+ How should we store update locks? If the search is done by a unique
+key, we could just modify the record trx id. Otherwise, we could put a record
+x-lock on the record. If the update changes ordering fields of the
+clustered index record, the inserted new record needs no record lock in
+lock table, the trx id is enough. The same holds for a secondary index
+record. Searched delete is similar to update.
+
+PROBLEM:
+What about waiting lock requests? If a transaction is waiting to make an
+update to a record which another modified, how does the other transaction
+know to send the end-lock-wait signal to the waiting transaction? If we have
+the convention that a transaction may wait for just one lock at a time, how
+do we preserve it if lock wait ends?
+
+PROBLEM:
+Checking the trx id label of a secondary index record. In the case of a
+modification, not an insert, is this necessary? A secondary index record
+is modified only by setting or resetting its deleted flag. A secondary index
+record contains fields to uniquely determine the corresponding clustered
+index record. A secondary index record is therefore only modified if we
+also modify the clustered index record, and the trx id checking is done
+on the clustered index record, before we come to modify the secondary index
+record. So, in the case of delete marking or unmarking a secondary index
+record, we do not have to care about trx ids, only the locks in the lock
+table must be checked. In the case of a select from a secondary index, the
+trx id is relevant, and in this case we may have to search the clustered
+index record.
+
+PROBLEM: How to update record locks when page is split or merged, or
+--------------------------------------------------------------------
+a record is deleted or updated?
+If the size of fields in a record changes, we perform the update by
+a delete followed by an insert. How can we retain the locks set or
+waiting on the record? Because a record lock is indexed in the bitmap
+by the heap number of the record, when we remove the record from the
+record list, it is possible still to keep the lock bits. If the page
+is reorganized, we could make a table of old and new heap numbers,
+and permute the bitmaps in the locks accordingly. We can add to the
+table a row telling where the updated record ended. If the update does
+not require a reorganization of the page, we can simply move the lock
+bits for the updated record to the position determined by its new heap
+number (we may have to allocate a new lock, if we run out of the bitmap
+in the old one).
+ A more complicated case is the one where the reinsertion of the
+updated record is done pessimistically, because the structure of the
+tree may change.
+
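+As an illustration of that permutation (a sketch added for clarity, not
+part of the original comment): if new_heap_no[] maps each old heap number
+to its new one, then
+
+ for (old= 0; old < n_old; old++)
+ if (bit old is set in the old lock bitmap)
+ set bit new_heap_no[old] in the new bitmap
+
+carries every granted and waiting request across the reorganization.
+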
+PROBLEM: If a supremum record is removed in a page merge, or a record
+---------------------------------------------------------------------
+removed in a purge, what to do to the waiting lock requests? In a split to
+the right, we just move the lock requests to the new supremum. If a record
+is removed, we could move the waiting lock request to its inheritor, the
+next record in the index. But, the next record may already have lock
+requests on its own queue. A new deadlock check should be made then. Maybe
+it is easier just to release the waiting transactions. They can then enqueue
+new lock requests on appropriate records.
+
+PROBLEM: When a record is inserted, what locks should it inherit from the
+-------------------------------------------------------------------------
+upper neighbor? An insert of a new supremum record in a page split is
+always possible, but an insert of a new user record requires that the upper
+neighbor does not have any lock requests by other transactions, granted or
+waiting, in its lock queue. Solution: We can copy the locks as gap type
+locks, so that also the waiting locks are transformed to granted gap type
+locks on the inserted record. */
+
+/* LOCK COMPATIBILITY MATRIX
+ * IS IX S X AI
+ * IS + + + - +
+ * IX + + - - +
+ * S + - + - -
+ * X - - - - -
+ * AI + + - - -
+ *
+ * Note that for rows, InnoDB only acquires S or X locks.
+ * For tables, InnoDB normally acquires IS or IX locks.
+ * S or X table locks are only acquired for LOCK TABLES.
+ * Auto-increment (AI) locks are needed because of
+ * statement-level MySQL binlog.
+ * See also lock_mode_compatible().
+ */
+static const byte lock_compatibility_matrix[5][5] = {
+ /** IS IX S X AI */
+ /* IS */ { TRUE, TRUE, TRUE, FALSE, TRUE},
+ /* IX */ { TRUE, TRUE, FALSE, FALSE, TRUE},
+ /* S */ { TRUE, FALSE, TRUE, FALSE, FALSE},
+ /* X */ { FALSE, FALSE, FALSE, FALSE, FALSE},
+ /* AI */ { TRUE, TRUE, FALSE, FALSE, FALSE}
+};
+
+/* STRONGER-OR-EQUAL RELATION (mode1=row, mode2=column)
+ * IS IX S X AI
+ * IS + - - - -
+ * IX + + - - -
+ * S + - + - -
+ * X + + + + +
+ * AI - - - - +
+ * See lock_mode_stronger_or_eq().
+ */
+static const byte lock_strength_matrix[5][5] = {
+ /** IS IX S X AI */
+ /* IS */ { TRUE, FALSE, FALSE, FALSE, FALSE},
+ /* IX */ { TRUE, TRUE, FALSE, FALSE, FALSE},
+ /* S */ { TRUE, FALSE, TRUE, FALSE, FALSE},
+ /* X */ { TRUE, TRUE, TRUE, TRUE, TRUE},
+ /* AI */ { FALSE, FALSE, FALSE, FALSE, TRUE}
+};
+
+#define PRDT_HEAPNO PAGE_HEAP_NO_INFIMUM
+/** Record locking request status */
+enum lock_rec_req_status {
+ /** Failed to acquire a lock */
+ LOCK_REC_FAIL,
+ /** Succeeded in acquiring a lock (implicit or already acquired) */
+ LOCK_REC_SUCCESS,
+ /** Explicitly created a new lock */
+ LOCK_REC_SUCCESS_CREATED
+};
+
+#ifdef UNIV_DEBUG
+/** The count of the types of locks. */
+static const ulint lock_types = UT_ARR_SIZE(lock_compatibility_matrix);
+#endif /* UNIV_DEBUG */
+
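+/* Reading the matrices (an illustrative note, not part of the original
+source): lock_compatibility_matrix[LOCK_IX][LOCK_S] is FALSE, so a new IX
+request must wait behind a granted S lock on the same table, while
+lock_strength_matrix[LOCK_X][LOCK_AUTO_INC] is TRUE, so a transaction
+already holding an X table lock need not enqueue a separate AUTO_INC
+lock; see lock_table_has() below. */
+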
+/*********************************************************************//**
+Gets the previous record lock set on a record.
+@return previous lock on the same record, NULL if none exists */
+const lock_t*
+lock_rec_get_prev(
+/*==============*/
+ const lock_t* in_lock,/*!< in: record lock */
+ ulint heap_no);/*!< in: heap number of the record */
+
+/*********************************************************************//**
+Checks if some transaction has an implicit x-lock on a record in a clustered
+index.
+@return transaction id of the transaction which has the x-lock, or 0 */
+UNIV_INLINE
+trx_id_t
+lock_clust_rec_some_has_impl(
+/*=========================*/
+ const rec_t* rec, /*!< in: user record */
+ const dict_index_t* index, /*!< in: clustered index */
+ const rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) */
+ MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************************//**
+Gets the first or next record lock on a page.
+@return next lock, NULL if none exists */
+UNIV_INLINE
+const lock_t*
+lock_rec_get_next_on_page_const(
+/*============================*/
+ const lock_t* lock); /*!< in: a record lock */
+
+/*********************************************************************//**
+Gets the nth bit of a record lock.
+@return TRUE if the bit is set; FALSE if it is not set or if i is out of
+range (for example, i == ULINT_UNDEFINED) */
+UNIV_INLINE
+ibool
+lock_rec_get_nth_bit(
+/*=================*/
+ const lock_t* lock, /*!< in: record lock */
+ ulint i); /*!< in: index of the bit */
+
+/*********************************************************************//**
+Gets the number of bits in a record lock bitmap.
+@return number of bits */
+UNIV_INLINE
+ulint
+lock_rec_get_n_bits(
+/*================*/
+ const lock_t* lock); /*!< in: record lock */
+
+/**********************************************************************//**
+Sets the nth bit of a record lock to TRUE. */
+inline
+void
+lock_rec_set_nth_bit(
+/*=================*/
+ lock_t* lock, /*!< in: record lock */
+ ulint i); /*!< in: index of the bit */
+
+/** Reset the nth bit of a record lock.
+@param[in,out] lock record lock
+@param[in] i index of the bit that will be reset
+@return previous value of the bit */
+inline byte lock_rec_reset_nth_bit(lock_t* lock, ulint i)
+{
+ ut_ad(!lock->is_table());
+#ifdef SUX_LOCK_GENERIC
+ ut_ad(lock_sys.is_writer() || lock->trx->mutex_is_owner());
+#else
+ ut_ad(lock_sys.is_writer() || lock->trx->mutex_is_owner()
+ || (xtest() && !lock->trx->mutex_is_locked()));
+#endif
+ ut_ad(i < lock->un_member.rec_lock.n_bits);
+
+ byte* b = reinterpret_cast<byte*>(&lock[1]) + (i >> 3);
+ byte mask = byte(1U << (i & 7));
+ byte bit = *b & mask;
+ *b &= byte(~mask);
+
+ if (bit != 0) {
+ ut_d(auto n=)
+ lock->trx->lock.n_rec_locks--;
+ ut_ad(n);
+ }
+
+ return(bit);
+}
+
+/*********************************************************************//**
+Gets the first or next record lock on a page.
+@return next lock, NULL if none exists */
+UNIV_INLINE
+lock_t*
+lock_rec_get_next_on_page(
+/*======================*/
+ lock_t* lock); /*!< in: a record lock */
+
+/*********************************************************************//**
+Gets the next explicit lock request on a record.
+@return next lock, NULL if none exists or if heap_no == ULINT_UNDEFINED */
+UNIV_INLINE
+lock_t*
+lock_rec_get_next(
+/*==============*/
+ ulint heap_no,/*!< in: heap number of the record */
+ lock_t* lock); /*!< in: lock */
+
+/*********************************************************************//**
+Gets the next explicit lock request on a record.
+@return next lock, NULL if none exists or if heap_no == ULINT_UNDEFINED */
+UNIV_INLINE
+const lock_t*
+lock_rec_get_next_const(
+/*====================*/
+ ulint heap_no,/*!< in: heap number of the record */
+ const lock_t* lock); /*!< in: lock */
+
+/** Get the first explicit lock request on a record.
+@param cell first lock hash table cell
+@param id page identifier
+@param heap_no record identifier in page
+@return first lock
+@retval nullptr if none exists */
+inline lock_t *lock_sys_t::get_first(const hash_cell_t &cell, page_id_t id,
+ ulint heap_no)
+{
+ lock_sys.assert_locked(cell);
+
+ for (lock_t *lock= static_cast<lock_t*>(cell.node); lock; lock= lock->hash)
+ {
+ ut_ad(!lock->is_table());
+ if (lock->un_member.rec_lock.page_id == id &&
+ lock_rec_get_nth_bit(lock, heap_no))
+ return lock;
+ }
+ return nullptr;
+}
+
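+/* Usage sketch (hypothetical, not part of the original header): together
+with lock_rec_get_next(), get_first() lets a caller that has latched the
+cell scan every explicit lock request on one record, e.g.
+
+ for (lock_t *lock= lock_sys_t::get_first(cell, id, heap_no);
+ lock; lock= lock_rec_get_next(heap_no, lock))
+ n++;
+
+counts the length of the queue for that heap number. */
+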
+/*********************************************************************//**
+Calculates if lock mode 1 is compatible with lock mode 2.
+@return nonzero if mode1 compatible with mode2 */
+UNIV_INLINE
+ulint
+lock_mode_compatible(
+/*=================*/
+ enum lock_mode mode1, /*!< in: lock mode */
+ enum lock_mode mode2); /*!< in: lock mode */
+
+/*********************************************************************//**
+Calculates if lock mode 1 is stronger or equal to lock mode 2.
+@return nonzero if mode1 stronger or equal to mode2 */
+UNIV_INLINE
+ulint
+lock_mode_stronger_or_eq(
+/*=====================*/
+ enum lock_mode mode1, /*!< in: lock mode */
+ enum lock_mode mode2); /*!< in: lock mode */
+
+/*********************************************************************//**
+Checks if a transaction has the specified table lock, or stronger. This
+function should only be called by the thread that owns the transaction.
+@return lock or NULL */
+UNIV_INLINE
+const lock_t*
+lock_table_has(
+/*===========*/
+ const trx_t* trx, /*!< in: transaction */
+ const dict_table_t* table, /*!< in: table */
+ enum lock_mode mode); /*!< in: lock mode */
+
+#include "lock0priv.inl"
+
+#endif /* lock0priv_h */
diff --git a/storage/innobase/include/lock0priv.inl b/storage/innobase/include/lock0priv.inl
new file mode 100644
index 00000000..3b4ebcc8
--- /dev/null
+++ b/storage/innobase/include/lock0priv.inl
@@ -0,0 +1,255 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0priv.ic
+Lock module internal inline methods.
+
+Created July 16, 2007 Vasil Dimov
+*******************************************************/
+
+/* This file contains only methods which are used in
+lock/lock0* files, other than lock/lock0lock.cc.
+I.e. lock/lock0lock.cc contains more internal inline
+methods but they are used only in that file. */
+
+#ifndef LOCK_MODULE_IMPLEMENTATION
+#error Do not include lock0priv.ic outside of the lock/ module
+#endif
+
+#include "row0row.h"
+
+/*********************************************************************//**
+Checks if some transaction has an implicit x-lock on a record in a clustered
+index.
+@return transaction id of the transaction which has the x-lock, or 0 */
+UNIV_INLINE
+trx_id_t
+lock_clust_rec_some_has_impl(
+/*=========================*/
+ const rec_t* rec, /*!< in: user record */
+ const dict_index_t* index, /*!< in: clustered index */
+ const rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(page_rec_is_user_rec(rec));
+
+ return(row_get_rec_trx_id(rec, index, offsets));
+}
+
+/*********************************************************************//**
+Gets the number of bits in a record lock bitmap.
+@return number of bits */
+UNIV_INLINE
+ulint
+lock_rec_get_n_bits(
+/*================*/
+ const lock_t* lock) /*!< in: record lock */
+{
+ return(lock->un_member.rec_lock.n_bits);
+}
+
+/**********************************************************************//**
+Sets the nth bit of a record lock to TRUE. */
+inline
+void
+lock_rec_set_nth_bit(
+/*=================*/
+ lock_t* lock, /*!< in: record lock */
+ ulint i) /*!< in: index of the bit */
+{
+ ulint byte_index;
+ ulint bit_index;
+
+ ut_ad(!lock->is_table());
+ ut_ad(i < lock->un_member.rec_lock.n_bits);
+
+ byte_index = i / 8;
+ bit_index = i % 8;
+
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wconversion" /* GCC 4 and 5 need this here */
+#endif
+ ((byte*) &lock[1])[byte_index] |= static_cast<byte>(1 << bit_index);
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic pop
+#endif
+#ifdef SUX_LOCK_GENERIC
+ ut_ad(lock_sys.is_writer() || lock->trx->mutex_is_owner());
+#else
+ ut_ad(lock_sys.is_writer() || lock->trx->mutex_is_owner()
+ || (xtest() && !lock->trx->mutex_is_locked()));
+#endif
+ lock->trx->lock.n_rec_locks++;
+}
+
+/*********************************************************************//**
+Gets the first or next record lock on a page.
+@return next lock, NULL if none exists */
+UNIV_INLINE
+lock_t*
+lock_rec_get_next_on_page(
+/*======================*/
+ lock_t* lock) /*!< in: a record lock */
+{
+ return const_cast<lock_t*>(lock_rec_get_next_on_page_const(lock));
+}
+
+/*********************************************************************//**
+Gets the next explicit lock request on a record.
+@return next lock, NULL if none exists or if heap_no == ULINT_UNDEFINED */
+UNIV_INLINE
+lock_t*
+lock_rec_get_next(
+/*==============*/
+ ulint heap_no,/*!< in: heap number of the record */
+ lock_t* lock) /*!< in: lock */
+{
+ do {
+ lock = lock_rec_get_next_on_page(lock);
+ } while (lock && !lock_rec_get_nth_bit(lock, heap_no));
+
+ return(lock);
+}
+
+/*********************************************************************//**
+Gets the next explicit lock request on a record.
+@return next lock, NULL if none exists or if heap_no == ULINT_UNDEFINED */
+UNIV_INLINE
+const lock_t*
+lock_rec_get_next_const(
+/*====================*/
+ ulint heap_no,/*!< in: heap number of the record */
+ const lock_t* lock) /*!< in: lock */
+{
+ return lock_rec_get_next(heap_no, const_cast<lock_t*>(lock));
}
+
+/*********************************************************************//**
+Gets the nth bit of a record lock.
+@return TRUE if the bit is set; FALSE if it is not set or if i is out of
+range (for example, i == ULINT_UNDEFINED) */
+UNIV_INLINE
+ibool
+lock_rec_get_nth_bit(
+/*=================*/
+ const lock_t* lock, /*!< in: record lock */
+ ulint i) /*!< in: index of the bit */
+{
+ const byte* b;
+
+ ut_ad(!lock->is_table());
+
+ if (i >= lock->un_member.rec_lock.n_bits) {
+
+ return(FALSE);
+ }
+
+ b = ((const byte*) &lock[1]) + (i / 8);
+
+ return(1 & *b >> (i % 8));
+}
+
+/*********************************************************************//**
+Gets the first or next record lock on a page.
+@return next lock, NULL if none exists */
+UNIV_INLINE
+const lock_t*
+lock_rec_get_next_on_page_const(
+/*============================*/
+ const lock_t* lock) /*!< in: a record lock */
+{
+ ut_ad(!lock->is_table());
+
+ const page_id_t page_id{lock->un_member.rec_lock.page_id};
+
+ while (!!(lock= static_cast<const lock_t*>(HASH_GET_NEXT(hash, lock))))
+ if (lock->un_member.rec_lock.page_id == page_id)
+ break;
+ return lock;
+}
+
+/*********************************************************************//**
+Calculates if lock mode 1 is compatible with lock mode 2.
+@return nonzero if mode1 compatible with mode2 */
+UNIV_INLINE
+ulint
+lock_mode_compatible(
+/*=================*/
+ enum lock_mode mode1, /*!< in: lock mode */
+ enum lock_mode mode2) /*!< in: lock mode */
+{
+ ut_ad((ulint) mode1 < lock_types);
+ ut_ad((ulint) mode2 < lock_types);
+
+ return(lock_compatibility_matrix[mode1][mode2]);
+}
+
+/*********************************************************************//**
+Calculates if lock mode 1 is stronger or equal to lock mode 2.
+@return nonzero if mode1 stronger or equal to mode2 */
+UNIV_INLINE
+ulint
+lock_mode_stronger_or_eq(
+/*=====================*/
+ enum lock_mode mode1, /*!< in: lock mode */
+ enum lock_mode mode2) /*!< in: lock mode */
+{
+ ut_ad((ulint) mode1 < lock_types);
+ ut_ad((ulint) mode2 < lock_types);
+
+ return(lock_strength_matrix[mode1][mode2]);
+}
+
+/*********************************************************************//**
+Checks if a transaction has the specified table lock, or stronger. This
+function should only be called by the thread that owns the transaction.
+@return lock or NULL */ +UNIV_INLINE +const lock_t* +lock_table_has( +/*===========*/ + const trx_t* trx, /*!< in: transaction */ + const dict_table_t* table, /*!< in: table */ + lock_mode in_mode)/*!< in: lock mode */ +{ + /* Look for stronger locks the same trx already has on the table */ + + for (lock_list::const_iterator it = trx->lock.table_locks.begin(), + end = trx->lock.table_locks.end(); it != end; ++it) { + + const lock_t* lock = *it; + + if (lock == NULL) { + continue; + } + + ut_ad(trx == lock->trx); + ut_ad(lock->is_table()); + ut_ad(lock->un_member.tab_lock.table); + + if (table == lock->un_member.tab_lock.table + && lock_mode_stronger_or_eq(lock->mode(), in_mode)) { + ut_ad(!lock->is_waiting()); + return(lock); + } + } + + return(NULL); +} diff --git a/storage/innobase/include/lock0types.h b/storage/innobase/include/lock0types.h new file mode 100644 index 00000000..0d00b4b3 --- /dev/null +++ b/storage/innobase/include/lock0types.h @@ -0,0 +1,251 @@ +/***************************************************************************** + +Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/lock0types.h +The transaction lock system global types + +Created 5/7/1996 Heikki Tuuri +*******************************************************/ + +#include "dict0types.h" +#include "buf0types.h" +#include "ut0lst.h" + +#ifndef lock0types_h +#define lock0types_h + +#define lock_t ib_lock_t + +struct lock_t; +struct lock_table_t; + +/* Basic lock modes */ +enum lock_mode { + LOCK_IS = 0, /* intention shared */ + LOCK_IX, /* intention exclusive */ + LOCK_S, /* shared */ + LOCK_X, /* exclusive */ + LOCK_AUTO_INC, /* locks the auto-inc counter of a table + in an exclusive mode */ + LOCK_NONE, /* this is used elsewhere to note consistent read */ + LOCK_NUM = LOCK_NONE, /* number of lock modes */ + LOCK_NONE_UNSET = 7 +}; + +/** A table lock */ +struct lock_table_t { + dict_table_t* table; /*!< database table in dictionary + cache */ + UT_LIST_NODE_T(ib_lock_t) + locks; /*!< list of locks on the same + table */ + /** Print the table lock into the given output stream + @param[in,out] out the output stream + @return the given output stream. */ + std::ostream& print(std::ostream& out) const; +}; + +/** Record lock for a page */ +struct lock_rec_t { + /** page identifier */ + page_id_t page_id; + ib_uint32_t n_bits; /*!< number of bits in the lock + bitmap; NOTE: the lock bitmap is + placed immediately after the + lock struct */ + + /** Print the record lock into the given output stream + @param[in,out] out the output stream + @return the given output stream. 
*/ + std::ostream& print(std::ostream& out) const; +}; + +/** Print the record lock into the given output stream +@param[in,out] out the output stream +@return the given output stream. */ +inline std::ostream &lock_rec_t::print(std::ostream &out) const +{ + out << "[lock_rec_t: space=" << page_id.space() + << ", page_no=" << page_id.page_no() + << ", n_bits=" << n_bits << "]"; + return out; +} + +inline +std::ostream& +operator<<(std::ostream& out, const lock_rec_t& lock) +{ + return(lock.print(out)); +} + +#define LOCK_MODE_MASK 0x7 /*!< mask used to extract mode from the + type_mode field in a lock */ +/** Lock types */ +/* @{ */ +/** table lock (record lock if the flag is not set) */ +#define LOCK_TABLE 8U + +#define LOCK_WAIT 256U /*!< Waiting lock flag; when set, it + means that the lock has not yet been + granted, it is just waiting for its + turn in the wait queue */ +/* Precise modes */ +#define LOCK_ORDINARY 0 /*!< this flag denotes an ordinary + next-key lock in contrast to LOCK_GAP + or LOCK_REC_NOT_GAP */ +#define LOCK_GAP 512U /*!< when this bit is set, it means that the + lock holds only on the gap before the record; + for instance, an x-lock on the gap does not + give permission to modify the record on which + the bit is set; locks of this type are created + when records are removed from the index chain + of records */ +#define LOCK_REC_NOT_GAP 1024U /*!< this bit means that the lock is only on + the index record and does NOT block inserts + to the gap before the index record; this is + used in the case when we retrieve a record + with a unique key, and is also used in + locking plain SELECTs (not part of UPDATE + or DELETE) when the user has set the READ + COMMITTED isolation level */ +#define LOCK_INSERT_INTENTION 2048U/*!< this bit is set when we place a waiting + gap type record lock request in order to let + an insert of an index record to wait until + there are no conflicting locks by other + transactions on the gap; note that this flag + remains set when the waiting lock is granted, + or if the lock is inherited to a neighboring + record */ +#define LOCK_PREDICATE 8192U /*!< Predicate lock */ +#define LOCK_PRDT_PAGE 16384U /*!< Page lock */ + + +#if (LOCK_WAIT|LOCK_GAP|LOCK_REC_NOT_GAP|LOCK_INSERT_INTENTION|LOCK_PREDICATE|LOCK_PRDT_PAGE)&LOCK_MODE_MASK +# error +#endif +#if (LOCK_WAIT|LOCK_GAP|LOCK_REC_NOT_GAP|LOCK_INSERT_INTENTION|LOCK_PREDICATE|LOCK_PRDT_PAGE)&LOCK_TYPE_MASK +# error +#endif +/* @} */ + +/** +Checks if the `mode` is LOCK_S or LOCK_X (possibly ORed with LOCK_WAIT or +LOCK_REC) which means the lock is a +Next Key Lock, a.k.a. LOCK_ORDINARY, as opposed to Predicate Lock, +GAP lock, Insert Intention or Record Lock. +@param mode A mode and flags, of a lock. 
+@return true if the only bits set in `mode` are LOCK_S or LOCK_X and optionally
+LOCK_WAIT or LOCK_REC */
+static inline bool lock_mode_is_next_key_lock(ulint mode)
+{
+  static_assert(LOCK_ORDINARY == 0, "LOCK_ORDINARY must be 0 (no flags)");
+  ut_ad((mode & LOCK_TABLE) == 0);
+  mode&= ~LOCK_WAIT;
+  ut_ad((mode & LOCK_WAIT) == 0);
+  ut_ad(((mode & ~(LOCK_MODE_MASK)) == LOCK_ORDINARY) ==
+        (mode == LOCK_S || mode == LOCK_X));
+  return (mode & ~(LOCK_MODE_MASK)) == LOCK_ORDINARY;
+}
+
+/** Lock struct; protected by lock_sys.latch */
+struct ib_lock_t
+{
+  /** the owner of the lock */
+  trx_t *trx;
+  /** other locks of the transaction; protected by
+  lock_sys.is_writer() and trx->mutex_is_owner(); @see trx_lock_t::trx_locks */
+  UT_LIST_NODE_T(ib_lock_t) trx_locks;
+
+  dict_index_t*	index;	/*!< index for a record lock */
+
+  ib_lock_t*	hash;	/*!< hash chain node for a record
+			lock. The link node in a singly linked
+			list, used during hashing. */
+
+  /** time(NULL) of the lock request creation.
+  Used for computing wait_time and diagnostics only.
+  Note: bogus durations may be reported
+  when the system time is adjusted! */
+  time_t requested_time;
+  /** Cumulated wait time in seconds.
+  Note: may be bogus when the system time is adjusted! */
+  ulint wait_time;
+
+  union {
+    lock_table_t tab_lock;	/*!< table lock */
+    lock_rec_t rec_lock;	/*!< record lock */
+  } un_member;			/*!< lock details */
+
+  ib_uint32_t type_mode;	/*!< lock type, mode, LOCK_GAP or
+				LOCK_REC_NOT_GAP,
+				LOCK_INSERT_INTENTION,
+				wait flag, ORed */
+
+  bool is_waiting() const
+  {
+    return(type_mode & LOCK_WAIT);
+  }
+
+  bool is_gap() const
+  {
+    return(type_mode & LOCK_GAP);
+  }
+
+  bool is_record_not_gap() const
+  {
+    return(type_mode & LOCK_REC_NOT_GAP);
+  }
+
+  /** @return true if the lock is a Next Key Lock */
+  bool is_next_key_lock() const
+  {
+    return !(type_mode & LOCK_TABLE) &&
+      lock_mode_is_next_key_lock(type_mode);
+  }
+
+  bool is_insert_intention() const
+  {
+    return(type_mode & LOCK_INSERT_INTENTION);
+  }
+
+  bool is_table() const { return type_mode & LOCK_TABLE; }
+
+  enum lock_mode mode() const
+  {
+    return(static_cast<lock_mode>(type_mode & LOCK_MODE_MASK));
+  }
+
+  bool is_rec_granted_exclusive_not_gap() const
+  {
+    return (type_mode & (LOCK_MODE_MASK | LOCK_GAP)) == LOCK_X;
+  }
+
+  /** Print the lock object into the given output stream.
+  @param[in,out]	out	the output stream
+  @return the given output stream. */
+  std::ostream& print(std::ostream& out) const;
+
+  const char* type_string() const
+  { return is_table() ? "LOCK_TABLE" : "LOCK_REC"; }
+};
+
+typedef UT_LIST_BASE_NODE_T(ib_lock_t) trx_lock_list_t;
+
+#endif /* lock0types_h */
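type_mode packs the basic mode (low 3 bits, LOCK_MODE_MASK) together with
the table/record flag and the gap/wait flags. A standalone sketch of how
the predicates above decode a hypothetical record-lock word (constants
copied from this header; the combination is invented):

    #include <cassert>

    int main()
    {
      const unsigned LOCK_MODE_MASK= 0x7, LOCK_X= 3, LOCK_TABLE= 8,
                     LOCK_WAIT= 256, LOCK_GAP= 512, LOCK_REC_NOT_GAP= 1024;

      /* a waiting exclusive record-only lock */
      unsigned type_mode= LOCK_X | LOCK_REC_NOT_GAP | LOCK_WAIT;

      assert((type_mode & LOCK_MODE_MASK) == LOCK_X); /* ib_lock_t::mode() */
      assert(type_mode & LOCK_WAIT);                  /* is_waiting() */
      assert(!(type_mode & LOCK_TABLE));              /* a record lock */
      /* is_rec_granted_exclusive_not_gap() looks only at mode and LOCK_GAP */
      assert((type_mode & (LOCK_MODE_MASK | LOCK_GAP)) == LOCK_X);
      return 0;
    }

diff --git a/storage/innobase/include/log0crypt.h b/storage/innobase/include/log0crypt.h
new file mode 100644
index 00000000..22c0c963
--- /dev/null
+++ b/storage/innobase/include/log0crypt.h
@@ -0,0 +1,115 @@
+/*****************************************************************************
+
+Copyright (C) 2013, 2015, Google Inc. All Rights Reserved.
+Copyright (C) 2014, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.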
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+/**************************************************//**
+@file include/log0crypt.h
+Innodb log encrypt/decrypt
+
+Created 11/25/2013 Minli Zhu
+Modified Jan Lindström jan.lindstrom@mariadb.com
+MDEV-11782: Rewritten for MariaDB 10.2 by Marko Mäkelä, MariaDB Corporation.
+*******************************************************/
+#pragma once
+
+#include "log0log.h"
+
+/** Initialize the redo log encryption key and random parameters
+when creating a new redo log.
+The random parameters will be persisted in the log header.
+@see log_crypt_write_header()
+@see log_crypt_read_header()
+@return whether the operation succeeded */
+bool log_crypt_init();
+
+/** Add the encryption information to the log header buffer.
+@param buf  part of log header buffer */
+void log_crypt_write_header(byte *buf);
+
+/** Read the encryption information from a redo log checkpoint buffer.
+@param buf  part of checkpoint buffer
+@return whether the operation was successful */
+bool log_crypt_read_header(const byte *buf);
+
+/** Read the MariaDB 10.1 checkpoint crypto (version, msg and iv) info.
+@param[in]	buf	checkpoint buffer
+@return whether the operation was successful */
+ATTRIBUTE_COLD bool log_crypt_101_read_checkpoint(const byte* buf);
+
+/** Decrypt a MariaDB 10.1 redo log block.
+@param[in,out]	buf		log block
+@param[in]	start_lsn	server start LSN
+@return whether the decryption was successful */
+ATTRIBUTE_COLD bool log_crypt_101_read_block(byte* buf, lsn_t start_lsn);
+
+/** Read the checkpoint crypto (version, msg and iv) info.
+@param[in]	buf	checkpoint buffer
+@return whether the operation was successful */
+ATTRIBUTE_COLD bool log_crypt_read_checkpoint_buf(const byte* buf);
+
+/** Decrypt log blocks.
+@param[in,out]	buf	log blocks to decrypt
+@param[in]	lsn	log sequence number of the start of the buffer
+@param[in]	size	size of the buffer, in bytes
+@return whether the operation succeeded */
+ATTRIBUTE_COLD bool log_decrypt(byte* buf, lsn_t lsn, ulint size);
+
+/** Decrypt part of a log record.
+@param iv    initialization vector
+@param buf   buffer for the decrypted data
+@param data  the encrypted data
+@param len   length of the data, in bytes
+@return buf */
+byte *log_decrypt_buf(const byte *iv, byte *buf, const byte *data, uint len);
+
+/** Decrypt a log snippet.
+@param iv   initialization vector
+@param buf  buffer to be replaced with decrypted contents
+@param end  pointer past the end of buf */
+void log_decrypt_buf(const byte *iv, byte *buf, const byte *const end);
+
+/** Encrypt or decrypt a temporary file block.
+@param[in]	src	block to encrypt or decrypt
+@param[in]	size	size of the block
+@param[out]	dst	destination block
+@param[in]	offs	offset to block
+@param[in]	encrypt	true=encrypt; false=decrypt
+@return whether the operation succeeded */
+bool log_tmp_block_encrypt(
+	const byte*	src,
+	ulint		size,
+	byte*		dst,
+	uint64_t	offs,
+	bool		encrypt = true)
+	MY_ATTRIBUTE((warn_unused_result, nonnull));
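log_tmp_block_encrypt() is its own inverse apart from the `encrypt` flag;
the inline wrapper declared just below simply flips it. A hypothetical
round trip over one temporary-file block (sketch only: block size and
offset are invented, and the encryption subsystem is assumed to be
initialised; error handling elided):

    byte src[4096], crypt[4096], clear[4096];
    const uint64_t offs= 4096 * 7;   /* hypothetical block offset */

    if (log_tmp_block_encrypt(src, sizeof src, crypt, offs)
        && log_tmp_block_decrypt(crypt, sizeof crypt, clear, offs))
      ut_ad(!memcmp(src, clear, sizeof src));   /* round trip is lossless */

+/** Decrypt a temporary file block.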
+@param[in]	src	block to decrypt
+@param[in]	size	size of the block
+@param[out]	dst	destination block
+@param[in]	offs	offset to block
+@return whether the operation succeeded */
+inline
+bool
+log_tmp_block_decrypt(
+	const byte*	src,
+	ulint		size,
+	byte*		dst,
+	uint64_t	offs)
+{
+	return(log_tmp_block_encrypt(src, size, dst, offs, false));
+}
diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h
new file mode 100644
index 00000000..f873eabf
--- /dev/null
+++ b/storage/innobase/include/log0log.h
@@ -0,0 +1,529 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All rights reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/log0log.h
+Database log
+
+Created 12/9/1995 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "log0types.h"
+#include "os0file.h"
+#include "span.h"
+#include "my_atomic_wrapper.h"
+#include "srw_lock.h"
+#include <string>
+
+using st_::span;
+
+static const char LOG_FILE_NAME_PREFIX[] = "ib_logfile";
+static const char LOG_FILE_NAME[] = "ib_logfile0";
+
+/** Composes full path for a redo log file
+@param[in]	filename	name of the redo log file
+@return path with log file name*/
+std::string get_log_file_path(const char *filename= LOG_FILE_NAME);
+
+/** Delete log file.
+@param[in]	suffix	suffix of the file name */
+static inline void delete_log_file(const char* suffix)
+{
+  auto path = get_log_file_path(LOG_FILE_NAME_PREFIX).append(suffix);
+  os_file_delete_if_exists_func(path.c_str(), nullptr);
+}
+
+struct completion_callback;
+
+/** Ensure that the log has been written to the log file up to a given
+log entry (such as that of a transaction commit). Start a new write, or
+wait and check if an already running write is covering the request.
+@param lsn      log sequence number that should be included in the file write
+@param durable  whether the write needs to be durable
+@param callback log write completion callback */
+void log_write_up_to(lsn_t lsn, bool durable,
+                     const completion_callback *callback= nullptr);
+
+/** Write to the log file up to the last log entry.
+@param durable  whether to wait for a durable write to complete */
+void log_buffer_flush_to_disk(bool durable= true);
+
+
+/** Prepare to invoke log_write_and_flush(), before acquiring log_sys.latch. */
+ATTRIBUTE_COLD void log_write_and_flush_prepare();
+
+/** Durably write the log up to log_sys.get_lsn(). */
+ATTRIBUTE_COLD void log_write_and_flush();
+
+/** Make a checkpoint */
+ATTRIBUTE_COLD void log_make_checkpoint();
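A hypothetical durable-commit call sequence using the API above (the
mini-transaction object `mtr` is assumed; real callers live in trx0trx.cc
and mtr0mtr.cc):

    /* sketch only: make everything up to this commit durable */
    const lsn_t commit_lsn= mtr.commit_lsn();  /* assumed mtr_t object */
    log_write_up_to(commit_lsn, true);         /* waits for a durable write */

    /* fire-and-forget variant: write out the whole log buffer without
       waiting for durability */
    log_buffer_flush_to_disk(false);

+/** Make a checkpoint at the latest lsn on shutdown.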
*/
+ATTRIBUTE_COLD void logs_empty_and_mark_files_at_shutdown();
+
+/**
+Checks that there is enough free space in the log to start a new query step.
+Flushes the log buffer or makes a new checkpoint if necessary. NOTE: this
+function may only be called if the calling thread owns no synchronization
+objects! */
+ATTRIBUTE_COLD void log_check_margins();
+
+/******************************************************//**
+Prints info of the log. */
+void
+log_print(
+/*======*/
+	FILE*	file);	/*!< in: file where to print */
+
+/** Offsets of a log file header */
+/* @{ */
+/** Log file header format identifier (32-bit unsigned big-endian integer).
+This used to be called LOG_GROUP_ID and always written as 0,
+because InnoDB never supported more than one copy of the redo log. */
+#define LOG_HEADER_FORMAT	0
+/** LSN of the start of data in this log file (with format version 1;
+in format version 0, it was called LOG_FILE_START_LSN and at offset 4). */
+#define LOG_HEADER_START_LSN	8
+/** A null-terminated string which will contain either the string 'ibbackup'
+and the creation time if the log file was created by mysqlbackup --restore,
+or the MySQL version that created the redo log file. */
+#define LOG_HEADER_CREATOR	16
+/** End of the log file creator field. */
+#define LOG_HEADER_CREATOR_END	48
+/* @} */
+
+struct log_t;
+
+/** File abstraction */
+class log_file_t
+{
+  friend log_t;
+  os_file_t m_file{OS_FILE_CLOSED};
+public:
+  log_file_t()= default;
+  log_file_t(os_file_t file) noexcept : m_file(file) {}
+
+  /** Open a file
+  @return file size in bytes
+  @retval 0 if not readable */
+  os_offset_t open(bool read_only) noexcept;
+  bool is_opened() const noexcept { return m_file != OS_FILE_CLOSED; }
+
+  dberr_t close() noexcept;
+  dberr_t read(os_offset_t offset, span<byte> buf) noexcept;
+  void write(os_offset_t offset, span<const byte> buf) noexcept;
+  bool flush() const noexcept { return os_file_flush(m_file); }
+#ifdef HAVE_PMEM
+  byte *mmap(bool read_only, const struct stat &st) noexcept;
+#endif
+};
+
+/** Redo log buffer */
+struct log_t
+{
+  /** The original (not version-tagged) InnoDB redo log format */
+  static constexpr uint32_t FORMAT_3_23= 0;
+  /** The MySQL 5.7.9/MariaDB 10.2.2 log format */
+  static constexpr uint32_t FORMAT_10_2= 1;
+  /** The MariaDB 10.3.2 log format. */
+  static constexpr uint32_t FORMAT_10_3= 103;
+  /** The MariaDB 10.4.0 log format.
*/
+  static constexpr uint32_t FORMAT_10_4= 104;
+  /** Encrypted MariaDB redo log */
+  static constexpr uint32_t FORMAT_ENCRYPTED= 1U << 31;
+  /** The MariaDB 10.4.0 log format (only with innodb_encrypt_log=ON) */
+  static constexpr uint32_t FORMAT_ENC_10_4= FORMAT_10_4 | FORMAT_ENCRYPTED;
+  /** The MariaDB 10.5.1 physical redo log format */
+  static constexpr uint32_t FORMAT_10_5= 0x50485953;
+  /** The MariaDB 10.5.1 physical format (only with innodb_encrypt_log=ON) */
+  static constexpr uint32_t FORMAT_ENC_10_5= FORMAT_10_5 | FORMAT_ENCRYPTED;
+  /** The MariaDB 10.8.0 variable-block-size redo log format */
+  static constexpr uint32_t FORMAT_10_8= 0x50687973;
+  /** The MariaDB 10.8.0 format with innodb_encrypt_log=ON */
+  static constexpr uint32_t FORMAT_ENC_10_8= FORMAT_10_8 | FORMAT_ENCRYPTED;
+
+  /** Location of the first checkpoint block */
+  static constexpr size_t CHECKPOINT_1= 4096;
+  /** Location of the second checkpoint block */
+  static constexpr size_t CHECKPOINT_2= 8192;
+  /** Start of record payload */
+  static constexpr lsn_t START_OFFSET= 12288;
+
+  /** smallest possible log sequence number in the current format
+  (used to be 2048 before FORMAT_10_8). */
+  static constexpr lsn_t FIRST_LSN= START_OFFSET;
+
+private:
+  /** The log sequence number of the last change of durable InnoDB files */
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE)
+  std::atomic<lsn_t> lsn;
+  /** the first guaranteed-durable log sequence number */
+  std::atomic<lsn_t> flushed_to_disk_lsn;
+  /** log sequence number when log resizing was initiated, or 0 */
+  std::atomic<lsn_t> resize_lsn;
+  /** set when there may be need to flush the log buffer, or
+  preflush buffer pool pages, or initiate a log checkpoint.
+  This must hold if lsn - last_checkpoint_lsn > max_checkpoint_age. */
+  std::atomic<bool> check_flush_or_checkpoint_;
+
+
+#if defined(__aarch64__)
+/* On ARM, we do more spinning */
+typedef srw_spin_lock log_rwlock_t;
+#define LSN_LOCK_ATTR MY_MUTEX_INIT_FAST
+#else
+typedef srw_lock log_rwlock_t;
+#define LSN_LOCK_ATTR nullptr
+#endif
+
+public:
+  /** rw-lock protecting buf */
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE) log_rwlock_t latch;
+private:
+  /** Last written LSN */
+  lsn_t write_lsn;
+public:
+  /** log record buffer, written to by mtr_t::commit() */
+  byte *buf;
+  /** buffer for writing data to ib_logfile0, or nullptr if is_pmem()
+  In write_buf(), buf and flush_buf are swapped */
+  byte *flush_buf;
+  /** number of std::swap(buf, flush_buf) and writes from buf to log;
+  protected by latch.wr_lock() */
+  ulint write_to_log;
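The constants above fix the on-disk layout of ib_logfile0; a sketch of the
start of the file, derived only from the offsets defined in this struct:

    offset 0      log file header (LOG_HEADER_FORMAT, LOG_HEADER_START_LSN,
                  LOG_HEADER_CREATOR, ...)
    offset 4096   first checkpoint block  (CHECKPOINT_1)
    offset 8192   second checkpoint block (CHECKPOINT_2)
    offset 12288  start of log record payload (START_OFFSET)

Note that the smallest valid LSN in this format, FIRST_LSN, coincides with
START_OFFSET, so a freshly created log maps LSNs to file offsets one-to-one
until the first wrap-around.

+  /** Log sequence number when a log file overwrite (broken crash recovery)
+  was noticed. Protected by latch.wr_lock().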
*/
+  lsn_t overwrite_warned;
+
+  /** innodb_log_buffer_size (size of buf,flush_buf if !is_pmem(), in bytes) */
+  size_t buf_size;
+
+private:
+  /** Log file being constructed during resizing; protected by latch */
+  log_file_t resize_log;
+  /** size of resize_log; protected by latch */
+  lsn_t resize_target;
+  /** Buffer for writing to resize_log; @see buf */
+  byte *resize_buf;
+  /** Buffer for writing to resize_log; @see flush_buf */
+  byte *resize_flush_buf;
+
+  /** spin lock protecting lsn, buf_free in append_prepare() */
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE) pthread_mutex_t lsn_lock;
+  void init_lsn_lock() { pthread_mutex_init(&lsn_lock, LSN_LOCK_ATTR); }
+  void lock_lsn() { pthread_mutex_lock(&lsn_lock); }
+  void unlock_lsn() { pthread_mutex_unlock(&lsn_lock); }
+  void destroy_lsn_lock() { pthread_mutex_destroy(&lsn_lock); }
+
+public:
+  /** first free offset within buf use; protected by lsn_lock */
+  Atomic_relaxed<size_t> buf_free;
+  /** number of write requests (to buf); protected by exclusive lsn_lock */
+  ulint write_to_buf;
+  /** number of waits in append_prepare(); protected by lsn_lock */
+  ulint waits;
+  /** recommended maximum size of buf, after which the buffer is flushed */
+  size_t max_buf_free;
+
+  /** log file size in bytes, including the header */
+  lsn_t file_size;
+private:
+  /** the log sequence number at the start of the log file */
+  lsn_t first_lsn;
+#if defined __linux__ || defined _WIN32
+  /** The physical block size of the storage */
+  uint32_t block_size;
+#endif
+public:
+  /** format of the redo log: e.g., FORMAT_10_8 */
+  uint32_t format;
+  /** Log file */
+  log_file_t log;
+#if defined __linux__ || defined _WIN32
+  /** whether file system caching is enabled for the log */
+  my_bool log_buffered;
+# ifdef _WIN32
+  static constexpr bool log_maybe_unbuffered= true;
+# else
+  /** whether file system caching may be disabled */
+  bool log_maybe_unbuffered;
+# endif
+#endif
+
+  /** Fields involved in checkpoints @{ */
+  lsn_t log_capacity;	/*!< capacity of the log; if
+			the checkpoint age exceeds this, it is
+			a serious error because it is possible
+			we will then overwrite log and spoil
+			crash recovery */
+  lsn_t max_modified_age_async;
+			/*!< when this recommended
+			value for lsn -
+			buf_pool.get_oldest_modification()
+			is exceeded, we start an
+			asynchronous preflush of pool pages */
+  lsn_t max_checkpoint_age;
+			/*!< this is the maximum allowed value
+			for lsn - last_checkpoint_lsn when a
+			new query step is started */
+  /** latest completed checkpoint (protected by latch.wr_lock()) */
+  Atomic_relaxed<lsn_t> last_checkpoint_lsn;
+  /** next checkpoint LSN (protected by log_sys.latch) */
+  lsn_t next_checkpoint_lsn;
+  /** next checkpoint number (protected by latch.wr_lock()) */
+  ulint next_checkpoint_no;
+  /** whether a checkpoint is pending */
+  Atomic_relaxed<bool> checkpoint_pending;
+
+  /** buffer for checkpoint header */
+  byte *checkpoint_buf;
+  /* @} */
+
+  bool is_initialised() const noexcept { return max_buf_free != 0; }
+
+#ifdef HAVE_PMEM
+  bool is_pmem() const noexcept { return !flush_buf; }
+#else
+  static constexpr bool is_pmem() { return false; }
+#endif
+
+  bool is_opened() const noexcept { return log.is_opened(); }
+
+  /** @return LSN at which log resizing was started and is still in progress
+  @retval 0 if no log resizing is in progress */
+  lsn_t resize_in_progress() const noexcept
+  { return resize_lsn.load(std::memory_order_relaxed); }
+
+  /** Status of resize_start() */
+  enum resize_start_status {
+    RESIZE_NO_CHANGE, RESIZE_IN_PROGRESS, RESIZE_STARTED,
RESIZE_FAILED + }; + + /** Start resizing the log and release the exclusive latch. + @param size requested new file_size + @return whether the resizing was started successfully */ + resize_start_status resize_start(os_offset_t size) noexcept; + + /** Abort any resize_start(). */ + void resize_abort() noexcept; + + /** Replicate a write to the log. + @param lsn start LSN + @param end end of the mini-transaction + @param len length of the mini-transaction + @param seq offset of the sequence bit from the end */ + inline void resize_write(lsn_t lsn, const byte *end, + size_t len, size_t seq) noexcept; + + /** Write resize_buf to resize_log. + @param length the used length of resize_buf */ + ATTRIBUTE_COLD void resize_write_buf(size_t length) noexcept; + + /** Rename a log file after resizing. + @return whether an error occurred */ + static bool resize_rename() noexcept; + +#ifdef HAVE_PMEM + /** @return pointer for writing to resize_buf + @retval nullptr if no PMEM based resizing is active */ + inline byte *resize_buf_begin(lsn_t lsn) const noexcept; + /** @return end of resize_buf */ + inline const byte *resize_buf_end() const noexcept + { return resize_buf + resize_target; } + + /** Initialise the redo log subsystem. */ + void create_low(); + /** Initialise the redo log subsystem. + @return whether the initialisation succeeded */ + bool create() { create_low(); return true; } + + /** Attach a log file. + @return whether the memory allocation succeeded */ + bool attach(log_file_t file, os_offset_t size); +#else + /** Initialise the redo log subsystem. + @return whether the initialisation succeeded */ + bool create(); + /** Attach a log file. */ + void attach_low(log_file_t file, os_offset_t size); + bool attach(log_file_t file, os_offset_t size) + { attach_low(file, size); return true; } +#endif + +#if defined __linux__ || defined _WIN32 + /** Try to enable or disable file system caching (update log_buffered) */ + void set_buffered(bool buffered); +#endif + + void close_file(); + + /** Calculate the checkpoint safety margins. */ + static void set_capacity(); + + /** Write a log file header. + @param buf log header buffer + @param lsn log sequence number corresponding to log_sys.START_OFFSET + @param encrypted whether the log is encrypted */ + static void header_write(byte *buf, lsn_t lsn, bool encrypted); + + lsn_t get_lsn(std::memory_order order= std::memory_order_relaxed) const + { return lsn.load(order); } + + lsn_t get_flushed_lsn(std::memory_order order= std::memory_order_acquire) + const noexcept + { return flushed_to_disk_lsn.load(order); } + + /** Initialize the LSN on initial log file creation. */ + lsn_t init_lsn() noexcept + { + latch.wr_lock(SRW_LOCK_CALL); + const lsn_t lsn{get_lsn()}; + flushed_to_disk_lsn.store(lsn, std::memory_order_relaxed); + write_lsn= lsn; + latch.wr_unlock(); + return lsn; + } + + void set_recovered_lsn(lsn_t lsn) noexcept + { +#ifndef SUX_LOCK_GENERIC + ut_ad(latch.is_write_locked()); +#endif /* SUX_LOCK_GENERIC */ + write_lsn= lsn; + this->lsn.store(lsn, std::memory_order_relaxed); + flushed_to_disk_lsn.store(lsn, std::memory_order_relaxed); + } + +#ifdef HAVE_PMEM + /** Persist the log. 
+  @param lsn  desired new value of flushed_to_disk_lsn */
+  inline void persist(lsn_t lsn) noexcept;
+#endif
+
+  bool check_flush_or_checkpoint() const
+  {
+    return UNIV_UNLIKELY
+      (check_flush_or_checkpoint_.load(std::memory_order_relaxed));
+  }
+  void set_check_flush_or_checkpoint(bool flag= true)
+  { check_flush_or_checkpoint_.store(flag, std::memory_order_relaxed); }
+
+  /** Make previous write_buf() durable and update flushed_to_disk_lsn. */
+  bool flush(lsn_t lsn) noexcept;
+
+  /** Shut down the redo log subsystem. */
+  void close();
+
+#if defined __linux__ || defined _WIN32
+  /** @return the physical block size of the storage */
+  size_t get_block_size() const noexcept
+  { ut_ad(block_size); return block_size; }
+  /** Set the log block size for file I/O. */
+  void set_block_size(uint32_t size) noexcept { block_size= size; }
+#else
+  /** @return the physical block size of the storage */
+  static size_t get_block_size() { return 512; }
+#endif
+
+private:
+  /** Wait in append_prepare() for buffer to become available
+  @param ex  whether log_sys.latch is exclusively locked */
+  ATTRIBUTE_COLD static void append_prepare_wait(bool ex) noexcept;
+public:
+  /** Reserve space in the log buffer for appending data.
+  @tparam pmem  log_sys.is_pmem()
+  @param size   total length of the data to append(), in bytes
+  @param ex     whether log_sys.latch is exclusively locked
+  @return the start LSN and the buffer position for append() */
+  template<bool pmem>
+  inline std::pair<lsn_t,byte*> append_prepare(size_t size, bool ex) noexcept;
+
+  /** Append a string of bytes to the redo log.
+  @param d     destination
+  @param s     string of bytes
+  @param size  length of str, in bytes */
+  void append(byte *&d, const void *s, size_t size) noexcept
+  {
+#ifndef SUX_LOCK_GENERIC
+    ut_ad(latch.is_locked());
+#endif
+    ut_ad(d + size <= buf + (is_pmem() ? file_size : buf_size));
+    memcpy(d, s, size);
+    d+= size;
+  }
+
+  /** Set the log file format. */
+  void set_latest_format(bool encrypted) noexcept
+  { format= encrypted ? FORMAT_ENC_10_8 : FORMAT_10_8; }
+  /** @return whether the redo log is encrypted */
+  bool is_encrypted() const noexcept { return format & FORMAT_ENCRYPTED; }
+  /** @return whether the redo log is in the latest format */
+  bool is_latest() const noexcept
+  { return (~FORMAT_ENCRYPTED & format) == FORMAT_10_8; }
+
+  /** @return capacity in bytes */
+  lsn_t capacity() const noexcept { return file_size - START_OFFSET; }
+
+  /** Set the LSN of the log file at file creation. */
+  void set_first_lsn(lsn_t lsn) noexcept { write_lsn= first_lsn= lsn; }
+  /** @return the first LSN of the log file */
+  lsn_t get_first_lsn() const noexcept { return first_lsn; }
+
+  /** Determine the sequence bit at a log sequence number */
+  byte get_sequence_bit(lsn_t lsn) const noexcept
+  {
+    ut_ad(lsn >= first_lsn);
+    return !(((lsn - first_lsn) / capacity()) & 1);
+  }
+
+  /** Calculate the offset of a log sequence number.
+  @param lsn  log sequence number
+  @return byte offset within ib_logfile0 */
+  lsn_t calc_lsn_offset(lsn_t lsn) const noexcept
+  {
+    ut_ad(lsn >= first_lsn);
+    return START_OFFSET + (lsn - first_lsn) % capacity();
+  }
+
+  /** Write checkpoint information and invoke latch.wr_unlock().
+  @param end_lsn  start LSN of the FILE_CHECKPOINT mini-transaction */
+  inline void write_checkpoint(lsn_t end_lsn) noexcept;
+
+  /** Write buf to ib_logfile0.
+  @tparam release_latch whether to invoke latch.wr_unlock()
+  @return the current log sequence number */
+  template<bool release_latch> inline lsn_t write_buf() noexcept;
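calc_lsn_offset() and get_sequence_bit() above implement the circular
mapping of LSNs onto the single log file. A standalone worked example
with invented numbers (first_lsn = 12288, a 1 MiB file, so
capacity() = 1048576 - 12288):

    #include <cassert>
    #include <cstdint>

    int main()
    {
      const uint64_t START_OFFSET= 12288, first_lsn= 12288,
                     file_size= 1 << 20, capacity= file_size - START_OFFSET;

      auto offset= [&](uint64_t lsn)
      { return START_OFFSET + (lsn - first_lsn) % capacity; };
      auto seq_bit= [&](uint64_t lsn)
      { return !(((lsn - first_lsn) / capacity) & 1); };

      assert(offset(first_lsn) == START_OFFSET);  /* start of payload */
      /* one full capacity later, the offset wraps back to the start... */
      assert(offset(first_lsn + capacity) == START_OFFSET);
      /* ...and the sequence bit flips on each wrap, which lets recovery
         tell freshly written log apart from older contents */
      assert(seq_bit(first_lsn) == 1 && seq_bit(first_lsn + capacity) == 0);
      return 0;
    }

+  /** Create the log.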
*/
+  void create(lsn_t lsn) noexcept;
+};
+
+/** Redo log system */
+extern log_t log_sys;
+
+/** Wait for a log checkpoint if needed.
+NOTE that this function may only be called while not holding
+any synchronization objects except dict_sys.latch. */
+void log_free_check();
+
+/** Release the latches that protect log resizing. */
+void log_resize_release();
diff --git a/storage/innobase/include/log0recv.h b/storage/innobase/include/log0recv.h
new file mode 100644
index 00000000..6d75e15a
--- /dev/null
+++ b/storage/innobase/include/log0recv.h
@@ -0,0 +1,491 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/log0recv.h
+Recovery
+
+Created 9/20/1997 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "ut0new.h"
+#include "buf0types.h"
+#include "log0log.h"
+#include "mtr0types.h"
+
+#include <deque>
+#include <map>
+
+/** @return whether recovery is currently running. */
+#define recv_recovery_is_on() UNIV_UNLIKELY(recv_sys.recovery_on)
+
+ATTRIBUTE_COLD MY_ATTRIBUTE((nonnull, warn_unused_result))
+/** Apply any buffered redo log to a page.
+@param space  tablespace
+@param bpage  buffer pool page
+@return whether the page was recovered correctly */
+bool recv_recover_page(fil_space_t* space, buf_page_t* bpage);
+
+/** Start recovering from a redo log checkpoint.
+@return error code or DB_SUCCESS */
+dberr_t recv_recovery_from_checkpoint_start();
+
+/** Report an operation to create, delete, or rename a file during backup.
+@param[in]	space_id	tablespace identifier
+@param[in]	type		file operation redo log type
+@param[in]	name		file name (not NUL-terminated)
+@param[in]	len		length of name, in bytes
+@param[in]	new_name	new file name (NULL if not rename)
+@param[in]	new_len		length of new_name, in bytes (0 if NULL) */
+extern void (*log_file_op)(uint32_t space_id, int type,
+			   const byte* name, ulint len,
+			   const byte* new_name, ulint new_len);
+
+/** Report an operation which does undo log tablespace truncation
+during backup
+@param space_id	undo tablespace identifier */
+extern void (*undo_space_trunc)(uint32_t space_id);
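The function pointers above are hooks for backup tools such as mariabackup,
which observe file operations in the parsed redo log. A hypothetical
handler (the name and the bookkeeping are invented; only the signature is
taken from the declaration above):

    /* sketch: a hypothetical backup hook */
    static void backup_file_op(uint32_t space_id, int type,
                               const byte *name, ulint len,
                               const byte *new_name, ulint new_len)
    {
      /* record (space_id, type, name[, new_name]) so that the backup can
         replay CREATE/DELETE/RENAME when data files are copied */
    }

    /* installed once, before the backup starts scanning the log: */
    log_file_op= backup_file_op;

+/** Report an operation which does INIT_PAGE for page0 during backup.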
+@param space_id	tablespace identifier */
+extern void (*first_page_init)(uint32_t space_id);
+
+/** Stored redo log record */
+struct log_rec_t
+{
+  log_rec_t(lsn_t lsn) : next(nullptr), lsn(lsn) { ut_ad(lsn); }
+  log_rec_t()= delete;
+  log_rec_t(const log_rec_t&)= delete;
+  log_rec_t &operator=(const log_rec_t&)= delete;
+
+  /** next record */
+  log_rec_t *next;
+  /** mtr_t::commit_lsn() of the mini-transaction */
+  const lsn_t lsn;
+};
+
+struct recv_dblwr_t
+{
+  /** Add a page frame to the doublewrite recovery buffer. */
+  void add(byte *page) { pages.push_front(page); }
+
+  /** Validate the page.
+  @param page_id  page identifier
+  @param page     page contents
+  @param space    the tablespace of the page (not available for page 0)
+  @param tmp_buf  2*srv_page_size for decrypting and decompressing any
+  page_compressed or encrypted pages
+  @return whether the page is valid */
+  bool validate_page(const page_id_t page_id, const byte *page,
+                     const fil_space_t *space, byte *tmp_buf);
+
+  /** Find a doublewrite copy of a page.
+  @param page_id  page identifier
+  @param space    tablespace (not available for page_id.page_no()==0)
+  @param tmp_buf  2*srv_page_size for decrypting and decompressing any
+  page_compressed or encrypted pages
+  @return page frame
+  @retval NULL if no valid page for page_id was found */
+  byte* find_page(const page_id_t page_id, const fil_space_t *space= NULL,
+                  byte *tmp_buf= NULL);
+
+  /** Restore the first page of the given tablespace from
+  doublewrite buffer.
+  @param space_id  tablespace identifier
+  @param name      tablespace filepath
+  @param file      tablespace file handle
+  @return whether the operation failed */
+  bool restore_first_page(uint32_t space_id, const char *name, os_file_t file);
+
+  typedef std::deque<byte*, ut_allocator<byte*> > list;
+
+  /** Recovered doublewrite buffer page frames */
+  list pages;
+};
+
+/** recv_sys.pages entry; protected by recv_sys.mutex */
+struct page_recv_t
+{
+  /** Recovery status: 0=not in progress, 1=log is being applied,
+  -1=log has been applied and the entry may be erased.
+  Transitions from 1 to -1 are NOT protected by recv_sys.mutex. */
+  Atomic_relaxed<int8_t> being_processed{0};
+  /** Whether reading the page will be skipped */
+  bool skip_read= false;
+  /** Latest written byte offset when applying the log records.
+  @see mtr_t::m_last_offset */
+  uint16_t last_offset= 1;
+  /** log records for a page */
+  class recs_t
+  {
+    /** The first log record */
+    log_rec_t *head= nullptr;
+    /** The last log record */
+    log_rec_t *tail= nullptr;
+    friend struct page_recv_t;
+  public:
+    /** Append a redo log snippet for the page
+    @param recs  log snippet */
+    void append(log_rec_t* recs)
+    {
+      if (tail)
+        tail->next= recs;
+      else
+        head= recs;
+      tail= recs;
+    }
+    /** Remove the last records for the page
+    @param start_lsn  start of the removed log */
+    ATTRIBUTE_COLD void rewind(lsn_t start_lsn);
+
+    /** @return the last log snippet */
+    const log_rec_t* last() const { return tail; }
+    /** @return the last log snippet */
+    log_rec_t* last() { return tail; }
+
+    class iterator
+    {
+      log_rec_t *cur;
+    public:
+      iterator(log_rec_t* rec) : cur(rec) {}
+      log_rec_t* operator*() const { return cur; }
+      iterator &operator++() { cur= cur->next; return *this; }
+      bool operator!=(const iterator& i) const { return cur != i.cur; }
+    };
+    iterator begin() { return head; }
+    iterator end() { return NULL; }
+    bool empty() const { ut_ad(!head == !tail); return !head; }
+    /** Clear and free the records; @see recv_sys_t::add() */
+    void clear();
+  } log;
+
+  /** Trim old log records for a page.
+  @param start_lsn  oldest log sequence number to preserve
+  @return whether all the log for the page was trimmed */
+  inline bool trim(lsn_t start_lsn);
+  /** Ignore any earlier redo log records for this page. */
+  inline void will_not_read();
+};
+
+/** A page initialization operation that was parsed from the redo log */
+struct recv_init
+{
+  /** log sequence number of the page initialization */
+  lsn_t lsn;
+  /** Whether btr_page_create() avoided a read of the page.
+  At the end of the last recovery batch, mark_ibuf_exist()
+  will mark pages for which this flag is set. */
+  bool created;
+};
+
+/** Recovery system data structure */
+struct recv_sys_t
+{
+  using init= recv_init;
+
+  /** mutex protecting this as well as some of page_recv_t */
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t mutex;
+private:
+  /** set when finding a corrupt log block or record, or there is a
+  log parsing buffer overflow */
+  bool found_corrupt_log;
+  /** set when an inconsistency with the file system contents is detected
+  during log scan or apply */
+  bool found_corrupt_fs;
+public:
+  /** @return maximum guaranteed size of a mini-transaction on recovery */
+  static constexpr size_t MTR_SIZE_MAX{1U << 20};
+
+  /** whether we are applying redo log records during crash recovery */
+  bool recovery_on;
+  /** whether recv_recover_page(), invoked from buf_page_t::read_complete(),
+  should apply log records*/
+  bool apply_log_recs;
+  /** number of bytes in log_sys.buf */
+  size_t len;
+  /** start offset of non-parsed log records in log_sys.buf */
+  size_t offset;
+  /** log sequence number of the first non-parsed record */
+  lsn_t lsn;
+  /** log sequence number of the last parsed mini-transaction */
+  lsn_t scanned_lsn;
+  /** log sequence number at the end of the FILE_CHECKPOINT record, or 0 */
+  lsn_t file_checkpoint;
+  /** the time when progress was last reported */
+  time_t progress_time;
+
+  using map = std::map<const page_id_t, page_recv_t,
+                       std::less<const page_id_t>,
+                       ut_allocator<std::pair<const page_id_t, page_recv_t>>>;
+  /** buffered records waiting to be applied to pages */
+  map pages;
+
+private:
+  /** iterator to pages, used by parse() */
+  map::iterator pages_it;
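page_recv_t::recs_t above is a minimal intrusive singly-linked list of
per-page log snippets. A standalone sketch of the same append/iterate
pattern (simplified node type; the LSNs are invented):

    #include <cassert>
    #include <cstdint>
    #include <initializer_list>

    struct rec { rec *next; uint64_t lsn; };

    int main()
    {
      rec a{nullptr, 100}, b{nullptr, 200};
      rec *head= nullptr, *tail= nullptr;

      for (rec *r : {&a, &b}) {      /* like recs_t::append() */
        if (tail) tail->next= r; else head= r;
        tail= r;
      }

      uint64_t last= 0;              /* like recs_t::begin()..end() */
      for (rec *r= head; r; r= r->next) {
        assert(r->lsn > last);       /* appended in parse order, i.e.
                                        ascending LSN */
        last= r->lsn;
      }
      return last != 200;
    }

+  /** Process a record that indicates that a tablespace size is being shrunk.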
+  @param page_id  first page that is not in the file
+  @param lsn      log sequence number of the shrink operation */
+  inline void trim(const page_id_t page_id, lsn_t lsn);
+
+  /** Undo tablespaces for which truncate has been logged
+  (indexed by page_id_t::space() - srv_undo_space_id_start) */
+  struct trunc
+  {
+    /** log sequence number of FILE_CREATE, or 0 if none */
+    lsn_t lsn;
+    /** truncated size of the tablespace, or 0 if not truncated */
+    unsigned pages;
+  } truncated_undo_spaces[127];
+
+public:
+  /** The contents of the doublewrite buffer */
+  recv_dblwr_t dblwr;
+
+  __attribute__((warn_unused_result))
+  inline dberr_t read(os_offset_t offset, span<byte> buf);
+  inline size_t files_size();
+  void close_files();
+
+  /** Advance pages_it if it matches the iterator */
+  void pages_it_invalidate(const map::iterator &p)
+  {
+    mysql_mutex_assert_owner(&mutex);
+    if (pages_it == p)
+      pages_it++;
+  }
+  /** Invalidate pages_it if it points to the given tablespace */
+  void pages_it_invalidate(uint32_t space_id)
+  {
+    mysql_mutex_assert_owner(&mutex);
+    if (pages_it != pages.end() && pages_it->first.space() == space_id)
+      pages_it= pages.end();
+  }
+
+private:
+  /** Attempt to initialize a page based on redo log records.
+  @param p     iterator
+  @param mtr   mini-transaction
+  @param b     pre-allocated buffer pool block
+  @param init  page initialization
+  @return the recovered block
+  @retval nullptr if the page cannot be initialized based on log records
+  @retval -1      if the page cannot be recovered due to corruption */
+  inline buf_block_t *recover_low(const map::iterator &p, mtr_t &mtr,
+                                  buf_block_t *b, init &init);
+  /** Attempt to initialize a page based on redo log records.
+  @param page_id  page identifier
+  @return the recovered block
+  @retval nullptr if the page cannot be initialized based on log records
+  @retval -1      if the page cannot be recovered due to corruption */
+  ATTRIBUTE_COLD buf_block_t *recover_low(const page_id_t page_id);
+
+  /** All found log files (multiple ones are possible if we are upgrading
+  from before MariaDB Server 10.5.1) */
+  std::vector<log_file_t> files;
+
+  /** Base node of the redo block list.
+  List elements are linked via buf_block_t::unzip_LRU. */
+  UT_LIST_BASE_NODE_T(buf_block_t) blocks;
+
+  /** Allocate a block from the buffer pool for recv_sys.pages */
+  ATTRIBUTE_COLD buf_block_t *add_block();
+
+  /** Wait for buffer pool to become available.
+  @param pages  number of buffer pool pages needed */
+  ATTRIBUTE_COLD void wait_for_pool(size_t pages);
+
+  /** Free log for processed pages. */
+  void garbage_collect();
+
+  /** Apply a recovery batch.
+  @param space_id   current tablespace identifier
+  @param space      current tablespace
+  @param free_block spare buffer block
+  @param last_batch whether it is possible to write more redo log
+  @return whether the caller must provide a new free_block */
+  bool apply_batch(uint32_t space_id, fil_space_t *&space,
+                   buf_block_t *&free_block, bool last_batch);
+
+public:
+  /** Apply buffered log to persistent data pages.
+  @param last_batch whether it is possible to write more redo log */
+  void apply(bool last_batch);
+
+#ifdef UNIV_DEBUG
+  /** whether all redo log in the current batch has been applied */
+  bool after_apply= false;
+#endif
+  /** Initialize the redo log recovery subsystem. */
+  void create();
+
+  /** Free most recovery data structures. */
+  void debug_free();
+
+  /** Clean up after create() */
+  void close();
+
+  bool is_initialised() const { return scanned_lsn != 0; }
+
+  /** Find the latest checkpoint.
+  @return error code or DB_SUCCESS */
+  dberr_t find_checkpoint();
+
+  /** Register a redo log snippet for a page.
+  @param it        page iterator
+  @param start_lsn start LSN of the mini-transaction
+  @param lsn       @see mtr_t::commit_lsn()
+  @param l         redo log snippet
+  @param len       length of l, in bytes
+  @return whether we ran out of memory */
+  bool add(map::iterator it, lsn_t start_lsn, lsn_t lsn,
+           const byte *l, size_t len);
+
+  /** Parsing result */
+  enum parse_mtr_result {
+    /** a record was successfully parsed */
+    OK,
+    /** the log ended prematurely (need to read more) */
+    PREMATURE_EOF,
+    /** the end of the log was reached */
+    GOT_EOF,
+    /** parse(l, false) ran out of memory */
+    GOT_OOM
+  };
+
+private:
+  /** Parse and register one log_t::FORMAT_10_8 mini-transaction.
+  @tparam store     whether to store the records
+  @param l          log data source
+  @param if_exists  if store: whether to check if the tablespace exists */
+  template<typename source,bool store>
+  inline parse_mtr_result parse(source &l, bool if_exists) noexcept;
+
+  /** Rewind a mini-transaction when parse() runs out of memory.
+  @param l      log data source
+  @param begin  start of the mini-transaction */
+  template<typename source>
+  ATTRIBUTE_COLD void rewind(source &l, source &begin) noexcept;
+
+  /** Report progress in terms of LSN or pages remaining */
+  ATTRIBUTE_COLD void report_progress() const;
+public:
+  /** Parse and register one log_t::FORMAT_10_8 mini-transaction,
+  handling log_sys.is_pmem() buffer wrap-around.
+  @tparam store     whether to store the records
+  @param if_exists  if store: whether to check if the tablespace exists */
+  template<bool store>
+  static parse_mtr_result parse_mtr(bool if_exists) noexcept;
+
+  /** Parse and register one log_t::FORMAT_10_8 mini-transaction,
+  handling log_sys.is_pmem() buffer wrap-around.
+  @tparam store     whether to store the records
+  @param if_exists  if store: whether to check if the tablespace exists */
+  template<bool store>
+  static parse_mtr_result parse_pmem(bool if_exists) noexcept
+#ifdef HAVE_PMEM
+  ;
+#else
+  { return parse_mtr<store>(if_exists); }
+#endif
+
+  /** Erase log records for a page. */
+  void erase(map::iterator p);
+
+  /** Clear a fully processed set of stored redo log records. */
+  void clear();
+
+  /** Determine whether redo log recovery progress should be reported.
+  @param time  the current time
+  @return whether progress should be reported
+  (the last report was at least 15 seconds ago) */
+  bool report(time_t time);
+
+  /** The alloc() memory alignment, in bytes */
+  static constexpr size_t ALIGNMENT= sizeof(size_t);
+
+  /** Free a redo log snippet.
+  @param data  buffer allocated in add() */
+  inline void free(const void *data);
+
+  /** Remove records for a corrupted page.
+  This function should only be called when innodb_force_recovery is set.
+  @param page_id  corrupted page identifier */
+  ATTRIBUTE_COLD void free_corrupted_page(page_id_t page_id);
+
+  /** Flag data file corruption during recovery. */
+  ATTRIBUTE_COLD void set_corrupt_fs();
+  /** Flag log file corruption during recovery. */
+  ATTRIBUTE_COLD void set_corrupt_log();
+
+  /** @return whether data file corruption was found */
+  bool is_corrupt_fs() const { return UNIV_UNLIKELY(found_corrupt_fs); }
+  /** @return whether log file corruption was found */
+  bool is_corrupt_log() const { return UNIV_UNLIKELY(found_corrupt_log); }
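A heavily simplified, hypothetical driver loop around parse_mtr() above
(sketch only: the real loop in log0recv.cc also refills log_sys.buf,
switches between storing and applying, and uses parse_pmem() for
memory-mapped logs; the refill step is elided here):

    bool eof= false;
    while (!eof)
      switch (recv_sys_t::parse_mtr<true>(false)) {
      case recv_sys_t::OK:            /* one mini-transaction stored */
        continue;
      case recv_sys_t::PREMATURE_EOF: /* refill log_sys.buf, then retry */
        continue;
      case recv_sys_t::GOT_OOM:       /* apply stored log to free memory */
        recv_sys.apply(false);
        continue;
      case recv_sys_t::GOT_EOF:
        eof= true;                    /* end of the durable log */
      }

+  /** Attempt to initialize a page based on redo log records.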
+ @param page_id page identifier + @return the recovered block + @retval nullptr if the page cannot be initialized based on log records + @retval -1 if the page cannot be recovered due to corruption */ + buf_block_t *recover(const page_id_t page_id) + { + return UNIV_UNLIKELY(recovery_on) ? recover_low(page_id) : nullptr; + } + + /** Try to recover a tablespace that was not readable earlier + @param p iterator + @param name tablespace file name + @param free_block spare buffer block + @return recovered tablespace + @retval nullptr if recovery failed */ + fil_space_t *recover_deferred(const map::iterator &p, + const std::string &name, + buf_block_t *&free_block); +}; + +/** The recovery system */ +extern recv_sys_t recv_sys; + +/** If the following is TRUE, the buffer pool file pages must be invalidated +after recovery and no ibuf operations are allowed; this will be set if +recv_sys.pages becomes too full, and log records must be merged +to file pages already before the recovery is finished: in this case no +ibuf operations are allowed, as they could modify the pages read in the +buffer pool before the pages have been recovered to the up-to-date state. + +TRUE means that recovery is running and no operations on the log files +are allowed yet: the variable name is misleading. */ +extern bool recv_no_ibuf_operations; +/** TRUE when recv_init_crash_recovery() has been called. */ +extern bool recv_needed_recovery; +#ifdef UNIV_DEBUG +/** whether writing to the redo log is forbidden; +protected by exclusive log_sys.latch. */ +extern bool recv_no_log_write; +#endif /* UNIV_DEBUG */ + +/** TRUE if buf_page_is_corrupted() should check if the log sequence +number (FIL_PAGE_LSN) is in the future. Initially FALSE, and set by +recv_recovery_from_checkpoint_start(). */ +extern bool recv_lsn_checks_on; diff --git a/storage/innobase/include/log0types.h b/storage/innobase/include/log0types.h new file mode 100644 index 00000000..df87968d --- /dev/null +++ b/storage/innobase/include/log0types.h @@ -0,0 +1,38 @@ +/***************************************************************************** + +Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/log0types.h +Log types + +Created 2013-03-15 Sunny Bains +*******************************************************/ + +#ifndef log0types_h +#define log0types_h + +#include "univ.i" + +/* Type used for all log sequence number storage and arithmetics */ +typedef ib_uint64_t lsn_t; + +#define LSN_MAX IB_UINT64_MAX + +#define LSN_PF UINT64PF + +#endif /* log0types_h */ diff --git a/storage/innobase/include/mach0data.h b/storage/innobase/include/mach0data.h new file mode 100644 index 00000000..79cbd7d1 --- /dev/null +++ b/storage/innobase/include/mach0data.h @@ -0,0 +1,375 @@ +/***************************************************************************** + +Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/mach0data.h +Utilities for converting data from the database file +to the machine format. + +Created 11/28/1995 Heikki Tuuri +***********************************************************************/ + +#ifndef mach0data_h +#define mach0data_h + +#include "univ.i" +#include "mtr0types.h" + +#ifndef UNIV_INNOCHECKSUM + +/* The data and all fields are always stored in a database file +in the same format: ascii, big-endian, ... . +All data in the files MUST be accessed using the functions in this +module. */ + +/*******************************************************//** +The following function is used to store data in one byte. */ +UNIV_INLINE +void +mach_write_to_1( +/*============*/ + byte* b, /*!< in: pointer to byte where to store */ + ulint n); /*!< in: ulint integer to be stored, >= 0, < 256 */ +/** The following function is used to fetch data from one byte. +@param[in] b pointer to a byte to read +@return ulint integer, >= 0, < 256 */ +UNIV_INLINE +uint8_t +mach_read_from_1( + const byte* b) + MY_ATTRIBUTE((warn_unused_result)); +/*******************************************************//** +The following function is used to store data in two consecutive +bytes. We store the most significant byte to the lower address. */ +UNIV_INLINE +void +mach_write_to_2( +/*============*/ + byte* b, /*!< in: pointer to two bytes where to store */ + ulint n); /*!< in: ulint integer to be stored, >= 0, < 64k */ +#endif /* !UNIV_INNOCHECKSUM */ +/** The following function is used to fetch data from 2 consecutive +bytes. The most significant byte is at the lowest address. 
+@param[in] b pointer to 2 bytes where to store +@return 2-byte integer, >= 0, < 64k */ +UNIV_INLINE +uint16_t +mach_read_from_2( + const byte* b) + MY_ATTRIBUTE((warn_unused_result)); + +#ifndef UNIV_INNOCHECKSUM +/********************************************************//** +The following function is used to convert a 16-bit data item +to the canonical format, for fast bytewise equality test +against memory. +@return 16-bit integer in canonical format */ +UNIV_INLINE +uint16 +mach_encode_2( +/*==========*/ + ulint n) /*!< in: integer in machine-dependent format */ + MY_ATTRIBUTE((const)); +/********************************************************//** +The following function is used to convert a 16-bit data item +from the canonical format, for fast bytewise equality test +against memory. +@return integer in machine-dependent format */ +UNIV_INLINE +ulint +mach_decode_2( +/*==========*/ + uint16 n) /*!< in: 16-bit integer in canonical format */ + MY_ATTRIBUTE((const)); +/*******************************************************//** +The following function is used to store data in 3 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_3( +/*============*/ + byte* b, /*!< in: pointer to 3 bytes where to store */ + ulint n); /*!< in: ulint integer to be stored */ +/** The following function is used to fetch data from 3 consecutive +bytes. The most significant byte is at the lowest address. +@param[in] b pointer to 3 bytes to read +@return 32 bit integer */ +UNIV_INLINE +uint32_t +mach_read_from_3( + const byte* b) + MY_ATTRIBUTE((warn_unused_result)); +/*******************************************************//** +The following function is used to store data in four consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_4( +/*============*/ + byte* b, /*!< in: pointer to four bytes where to store */ + ulint n); /*!< in: ulint integer to be stored */ +/** The following function is used to fetch data from 4 consecutive +bytes. The most significant byte is at the lowest address. +@param[in] b pointer to 4 bytes to read +@return 32 bit integer */ +UNIV_INLINE +uint32_t +mach_read_from_4( + const byte* b) + MY_ATTRIBUTE((warn_unused_result)); +/*********************************************************//** +Writes a ulint in a compressed form (1..5 bytes). +@return stored size in bytes */ +UNIV_INLINE +ulint +mach_write_compressed( +/*==================*/ + byte* b, /*!< in: pointer to memory where to store */ + ulint n); /*!< in: ulint integer to be stored */ +/*********************************************************//** +Returns the size of an ulint when written in the compressed form. +@return compressed size in bytes */ +UNIV_INLINE +ulint +mach_get_compressed_size( +/*=====================*/ + ulint n) /*!< in: ulint integer to be stored */ + MY_ATTRIBUTE((const)); +/** Read a 32-bit integer in a compressed form. +@param[in,out] b pointer to memory where to read; +advanced by the number of bytes consumed +@return unsigned value */ +UNIV_INLINE +ib_uint32_t +mach_read_next_compressed( + const byte** b); +/*******************************************************//** +The following function is used to store data in 6 consecutive +bytes. We store the most significant byte to the lowest address. 
*/ +UNIV_INLINE +void +mach_write_to_6( +/*============*/ + byte* b, /*!< in: pointer to 6 bytes where to store */ + ib_uint64_t id); /*!< in: 48-bit integer */ +/********************************************************//** +The following function is used to fetch data from 6 consecutive +bytes. The most significant byte is at the lowest address. +@return 48-bit integer */ +UNIV_INLINE +ib_uint64_t +mach_read_from_6( +/*=============*/ + const byte* b) /*!< in: pointer to 6 bytes */ + MY_ATTRIBUTE((warn_unused_result)); +/*******************************************************//** +The following function is used to store data in 7 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_7( +/*============*/ + byte* b, /*!< in: pointer to 7 bytes where to store */ + ib_uint64_t n); /*!< in: 56-bit integer */ +/********************************************************//** +The following function is used to fetch data from 7 consecutive +bytes. The most significant byte is at the lowest address. +@return 56-bit integer */ +UNIV_INLINE +ib_uint64_t +mach_read_from_7( +/*=============*/ + const byte* b) /*!< in: pointer to 7 bytes */ + MY_ATTRIBUTE((warn_unused_result)); +/*******************************************************//** +The following function is used to store data in 8 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_8( +/*============*/ + void* b, /*!< in: pointer to 8 bytes where to store */ + ib_uint64_t n); /*!< in: 64-bit integer to be stored */ +/********************************************************//** +The following function is used to fetch data from 8 consecutive +bytes. The most significant byte is at the lowest address. +@return 64-bit integer */ +UNIV_INLINE +ib_uint64_t +mach_read_from_8( +/*=============*/ + const byte* b) /*!< in: pointer to 8 bytes */ + MY_ATTRIBUTE((warn_unused_result)); +/*********************************************************//** +Writes a 64-bit integer in a compressed form (5..9 bytes). +@return size in bytes */ +UNIV_INLINE +ulint +mach_u64_write_compressed( +/*======================*/ + byte* b, /*!< in: pointer to memory where to store */ + ib_uint64_t n); /*!< in: 64-bit integer to be stored */ +/** Read a 64-bit integer in a compressed form. +@param[in,out] b pointer to memory where to read; +advanced by the number of bytes consumed +@return unsigned value */ +UNIV_INLINE +ib_uint64_t +mach_u64_read_next_compressed( + const byte** b); +/*********************************************************//** +Writes a 64-bit integer in a compressed form (1..11 bytes). +@return size in bytes */ +UNIV_INLINE +ulint +mach_u64_write_much_compressed( +/*===========================*/ + byte* b, /*!< in: pointer to memory where to store */ + ib_uint64_t n); /*!< in: 64-bit integer to be stored */ +/*********************************************************//** +Reads a 64-bit integer in a compressed form. +@return the value read */ +UNIV_INLINE +ib_uint64_t +mach_u64_read_much_compressed( +/*==========================*/ + const byte* b) /*!< in: pointer to memory from where to read */ + MY_ATTRIBUTE((warn_unused_result)); + +/*********************************************************//** +Reads a double. It is stored in a little-endian format. 
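+On big-endian hosts the byte order is reversed while copying; on
+little-endian hosts the bytes are copied as they are.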
+@return double read */ +UNIV_INLINE +double +mach_double_read( +/*=============*/ + const byte* b) /*!< in: pointer to memory from where to read */ + MY_ATTRIBUTE((warn_unused_result)); +/*********************************************************//** +Writes a double. It is stored in a little-endian format. */ +UNIV_INLINE +void +mach_double_write( +/*==============*/ + byte* b, /*!< in: pointer to memory where to write */ + double d); /*!< in: double */ +/*********************************************************//** +Reads a float. It is stored in a little-endian format. +@return float read */ +UNIV_INLINE +float +mach_float_read( +/*============*/ + const byte* b) /*!< in: pointer to memory from where to read */ + MY_ATTRIBUTE((warn_unused_result)); +/*********************************************************//** +Writes a float. It is stored in a little-endian format. */ +UNIV_INLINE +void +mach_float_write( +/*=============*/ + byte* b, /*!< in: pointer to memory where to write */ + float d); /*!< in: float */ +/*********************************************************//** +Reads a ulint stored in the little-endian format. +@return unsigned long int */ +UNIV_INLINE +ulint +mach_read_from_n_little_endian( +/*===========================*/ + const byte* buf, /*!< in: from where to read */ + ulint buf_size) /*!< in: from how many bytes to read */ + MY_ATTRIBUTE((warn_unused_result)); + + +/** Reads a 64 bit stored in big endian format +@param buf From where to read +@return uint64_t */ +UNIV_INLINE +uint64_t +mach_read_uint64_little_endian(const byte* buf) +{ +#ifdef WORDS_BIGENDIAN + return + uint64_t(buf[0]) | uint64_t(buf[1]) << 8 | + uint64_t(buf[2]) << 16 | uint64_t(buf[3]) << 24 | + uint64_t(buf[4]) << 32 | uint64_t(buf[5]) << 40 | + uint64_t(buf[6]) << 48 | uint64_t(buf[7]) << 56; +#else + uint64_t n; + memcpy(&n, buf, sizeof(uint64_t)); + return n; +#endif +} + +/*********************************************************//** +Writes a ulint in the little-endian format. */ +UNIV_INLINE +void +mach_write_to_n_little_endian( +/*==========================*/ + byte* dest, /*!< in: where to write */ + ulint dest_size, /*!< in: into how many bytes to write */ + ulint n); /*!< in: unsigned long int to write */ +/*********************************************************//** +Reads a ulint stored in the little-endian format. +@return unsigned long int */ +UNIV_INLINE +ulint +mach_read_from_2_little_endian( +/*===========================*/ + const byte* buf) /*!< in: from where to read */ + MY_ATTRIBUTE((warn_unused_result)); +/*********************************************************//** +Writes a ulint in the little-endian format. */ +UNIV_INLINE +void +mach_write_to_2_little_endian( +/*==========================*/ + byte* dest, /*!< in: where to write */ + ulint n); /*!< in: unsigned long int to write */ +/*********************************************************//** +Convert integral type from storage byte order (big endian) to +host byte order. +@return integer value */ +UNIV_INLINE +ib_uint64_t +mach_read_int_type( +/*===============*/ + const byte* src, /*!< in: where to read from */ + ulint len, /*!< in: length of src */ + ibool unsigned_type); /*!< in: signed or unsigned flag */ + +/************************************************************* +Convert a ulonglong integer from host byte order to (big-endian) +storage byte order. 
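+For signed values, the sign bit of the most significant byte is flipped,
+so that the stored bytes compare correctly as unsigned strings.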
*/ +UNIV_INLINE +void +mach_write_ulonglong( +/*=================*/ + byte* dest, /*!< in: where to write */ + ulonglong src, /*!< in: where to read from */ + ulint len, /*!< in: length of dest */ + bool usign); /*!< in: signed or unsigned flag */ + +#endif /* !UNIV_INNOCHECKSUM */ + +#include "mach0data.inl" + +#endif diff --git a/storage/innobase/include/mach0data.inl b/storage/innobase/include/mach0data.inl new file mode 100644 index 00000000..2f970fd2 --- /dev/null +++ b/storage/innobase/include/mach0data.inl @@ -0,0 +1,837 @@ +/***************************************************************************** + +Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/mach0data.ic +Utilities for converting data from the database file +to the machine format. + +Created 11/28/1995 Heikki Tuuri +***********************************************************************/ + +#ifndef UNIV_INNOCHECKSUM + +#include "mtr0types.h" +#include "ut0byte.h" + +/*******************************************************//** +The following function is used to store data in one byte. */ +UNIV_INLINE +void +mach_write_to_1( +/*============*/ + byte* b, /*!< in: pointer to byte where to store */ + ulint n) /*!< in: ulint integer to be stored, >= 0, < 256 */ +{ + ut_ad((n & ~0xFFUL) == 0); + + b[0] = (byte) n; +} + +#endif /* !UNIV_INNOCHECKSUM */ + +/*******************************************************//** +The following function is used to store data in two consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_2( +/*============*/ + byte* b, /*!< in: pointer to two bytes where to store */ + ulint n) /*!< in: ulint integer to be stored */ +{ + ut_ad((n & ~0xFFFFUL) == 0); + + b[0] = (byte)(n >> 8); + b[1] = (byte)(n); +} + +/** The following function is used to fetch data from one byte. +@param[in] b pointer to a byte to read +@return ulint integer, >= 0, < 256 */ +UNIV_INLINE +uint8_t +mach_read_from_1( + const byte* b) +{ + return(uint8_t(*b)); +} + +/** The following function is used to fetch data from 2 consecutive +bytes. The most significant byte is at the lowest address. +@param[in] b pointer to 2 bytes to read +@return 2-byte integer, >= 0, < 64k */ +UNIV_INLINE +uint16_t +mach_read_from_2( + const byte* b) +{ + return(uint16_t(uint16_t(b[0]) << 8 | b[1])); +} + +#ifndef UNIV_INNOCHECKSUM + +/********************************************************//** +The following function is used to convert a 16-bit data item +to the canonical format, for fast bytewise equality test +against memory. 
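+The canonical format is simply the big-endian storage format produced by
+mach_write_to_2().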
+@return 16-bit integer in canonical format */ +UNIV_INLINE +uint16 +mach_encode_2( +/*==========*/ + ulint n) /*!< in: integer in machine-dependent format */ +{ + uint16 ret; + ut_ad(2 == sizeof ret); + mach_write_to_2((byte*) &ret, n); + return(ret); +} +/********************************************************//** +The following function is used to convert a 16-bit data item +from the canonical format, for fast bytewise equality test +against memory. +@return integer in machine-dependent format */ +UNIV_INLINE +ulint +mach_decode_2( +/*==========*/ + uint16 n) /*!< in: 16-bit integer in canonical format */ +{ + ut_ad(2 == sizeof n); + return(mach_read_from_2((const byte*) &n)); +} + +/*******************************************************//** +The following function is used to store data in 3 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_3( +/*============*/ + byte* b, /*!< in: pointer to 3 bytes where to store */ + ulint n) /*!< in: ulint integer to be stored */ +{ + ut_ad((n & ~0xFFFFFFUL) == 0); + + b[0] = (byte)(n >> 16); + b[1] = (byte)(n >> 8); + b[2] = (byte)(n); +} + +/** The following function is used to fetch data from 3 consecutive +bytes. The most significant byte is at the lowest address. +@param[in] b pointer to 3 bytes to read +@return uint32_t integer */ +UNIV_INLINE +uint32_t +mach_read_from_3( + const byte* b) +{ + return( (static_cast(b[0]) << 16) + | (static_cast(b[1]) << 8) + | static_cast(b[2]) + ); +} +#endif /* !UNIV_INNOCHECKSUM */ + +/*******************************************************//** +The following function is used to store data in four consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_4( +/*============*/ + byte* b, /*!< in: pointer to four bytes where to store */ + ulint n) /*!< in: ulint integer to be stored */ +{ + b[0] = (byte)(n >> 24); + b[1] = (byte)(n >> 16); + b[2] = (byte)(n >> 8); + b[3] = (byte) n; +} + +/** The following function is used to fetch data from 4 consecutive +bytes. The most significant byte is at the lowest address. +@param[in] b pointer to 4 bytes to read +@return 32 bit integer */ +UNIV_INLINE +uint32_t +mach_read_from_4( + const byte* b) +{ + return( (static_cast(b[0]) << 24) + | (static_cast(b[1]) << 16) + | (static_cast(b[2]) << 8) + | static_cast(b[3]) + ); +} + +#ifndef UNIV_INNOCHECKSUM + +/*********************************************************//** +Writes a ulint in a compressed form where the first byte codes the +length of the stored ulint. We look at the most significant bits of +the byte. If the most significant bit is zero, it means 1-byte storage, +else if the 2nd bit is 0, it means 2-byte storage, else if 3rd is 0, +it means 3-byte storage, else if 4th is 0, it means 4-byte storage, +else the storage is 5-byte. 
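+For example, n = 1000 (0x3E8) does not fit in 7 bits, so it is stored in
+two bytes as 0x8000 | 0x03E8, that is, as the bytes 0x83 0xE8.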
+@return compressed size in bytes */ +UNIV_INLINE +ulint +mach_write_compressed( +/*==================*/ + byte* b, /*!< in: pointer to memory where to store */ + ulint n) /*!< in: ulint integer (< 2^32) to be stored */ +{ + if (n < 0x80) { + /* 0nnnnnnn (7 bits) */ + mach_write_to_1(b, n); + return(1); + } else if (n < 0x4000) { + /* 10nnnnnn nnnnnnnn (14 bits) */ + mach_write_to_2(b, n | 0x8000); + return(2); + } else if (n < 0x200000) { + /* 110nnnnn nnnnnnnn nnnnnnnn (21 bits) */ + mach_write_to_3(b, n | 0xC00000); + return(3); + } else if (n < 0x10000000) { + /* 1110nnnn nnnnnnnn nnnnnnnn nnnnnnnn (28 bits) */ + mach_write_to_4(b, n | 0xE0000000); + return(4); + } else { + /* 11110000 nnnnnnnn nnnnnnnn nnnnnnnn nnnnnnnn (32 bits) */ + mach_write_to_1(b, 0xF0); + mach_write_to_4(b + 1, n); + return(5); + } +} + +/*********************************************************//** +Returns the size of a ulint when written in the compressed form. +@return compressed size in bytes */ +UNIV_INLINE +ulint +mach_get_compressed_size( +/*=====================*/ + ulint n) /*!< in: ulint integer (< 2^32) to be stored */ +{ + if (n < 0x80) { + /* 0nnnnnnn (7 bits) */ + return(1); + } else if (n < 0x4000) { + /* 10nnnnnn nnnnnnnn (14 bits) */ + return(2); + } else if (n < 0x200000) { + /* 110nnnnn nnnnnnnn nnnnnnnn (21 bits) */ + return(3); + } else if (n < 0x10000000) { + /* 1110nnnn nnnnnnnn nnnnnnnn nnnnnnnn (28 bits) */ + return(4); + } else { + /* 11110000 nnnnnnnn nnnnnnnn nnnnnnnn nnnnnnnn (32 bits) */ + return(5); + } +} + +/*********************************************************//** +Reads a ulint in a compressed form. +@return read integer (< 2^32) */ +UNIV_INLINE +ulint +mach_read_compressed( +/*=================*/ + const byte* b) /*!< in: pointer to memory from where to read */ +{ + ulint val; + + val = mach_read_from_1(b); + + if (val < 0x80) { + /* 0nnnnnnn (7 bits) */ + } else if (val < 0xC0) { + /* 10nnnnnn nnnnnnnn (14 bits) */ + val = mach_read_from_2(b) & 0x3FFF; + ut_ad(val > 0x7F); + } else if (val < 0xE0) { + /* 110nnnnn nnnnnnnn nnnnnnnn (21 bits) */ + val = mach_read_from_3(b) & 0x1FFFFF; + ut_ad(val > 0x3FFF); + } else if (val < 0xF0) { + /* 1110nnnn nnnnnnnn nnnnnnnn nnnnnnnn (28 bits) */ + val = mach_read_from_4(b) & 0xFFFFFFF; + ut_ad(val > 0x1FFFFF); + } else { + /* 11110000 nnnnnnnn nnnnnnnn nnnnnnnn nnnnnnnn (32 bits) */ + ut_ad(val == 0xF0); + val = mach_read_from_4(b + 1); + ut_ad(val > 0xFFFFFFF); + } + + return(val); +} + +/** Read a 32-bit integer in a compressed form. 
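+This is the streaming variant of mach_read_compressed(): in addition to
+decoding the value, it advances the read pointer past the encoded bytes.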
+@param[in,out] b pointer to memory where to read; +advanced by the number of bytes consumed +@return unsigned value */ +UNIV_INLINE +ib_uint32_t +mach_read_next_compressed( + const byte** b) +{ + ulint val = mach_read_from_1(*b); + + if (val < 0x80) { + /* 0nnnnnnn (7 bits) */ + ++*b; + } else if (val < 0xC0) { + /* 10nnnnnn nnnnnnnn (14 bits) */ + val = mach_read_from_2(*b) & 0x3FFF; + ut_ad(val > 0x7F); + *b += 2; + } else if (val < 0xE0) { + /* 110nnnnn nnnnnnnn nnnnnnnn (21 bits) */ + val = mach_read_from_3(*b) & 0x1FFFFF; + ut_ad(val > 0x3FFF); + *b += 3; + } else if (val < 0xF0) { + /* 1110nnnn nnnnnnnn nnnnnnnn nnnnnnnn (28 bits) */ + val = mach_read_from_4(*b) & 0xFFFFFFF; + ut_ad(val > 0x1FFFFF); + *b += 4; + } else { + /* 11110000 nnnnnnnn nnnnnnnn nnnnnnnn nnnnnnnn (32 bits) */ + ut_ad(val == 0xF0); + val = mach_read_from_4(*b + 1); + ut_ad(val > 0xFFFFFFF); + *b += 5; + } + + return(static_cast(val)); +} + +/*******************************************************//** +The following function is used to store data in 8 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_8( +/*============*/ + void* b, /*!< in: pointer to 8 bytes where to store */ + ib_uint64_t n) /*!< in: 64-bit integer to be stored */ +{ + mach_write_to_4(static_cast(b), (ulint) (n >> 32)); + mach_write_to_4(static_cast(b) + 4, (ulint) n); +} + +#endif /* !UNIV_INNOCHECKSUM */ + +/********************************************************//** +The following function is used to fetch data from 8 consecutive +bytes. The most significant byte is at the lowest address. +@return 64-bit integer */ +UNIV_INLINE +ib_uint64_t +mach_read_from_8( +/*=============*/ + const byte* b) /*!< in: pointer to 8 bytes */ +{ + ib_uint64_t u64; + + u64 = mach_read_from_4(b); + u64 <<= 32; + u64 |= mach_read_from_4(b + 4); + + return(u64); +} + +#ifndef UNIV_INNOCHECKSUM + +/*******************************************************//** +The following function is used to store data in 7 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_7( +/*============*/ + byte* b, /*!< in: pointer to 7 bytes where to store */ + ib_uint64_t n) /*!< in: 56-bit integer */ +{ + mach_write_to_3(b, (ulint) (n >> 32)); + mach_write_to_4(b + 3, (ulint) n); +} + +/********************************************************//** +The following function is used to fetch data from 7 consecutive +bytes. The most significant byte is at the lowest address. +@return 56-bit integer */ +UNIV_INLINE +ib_uint64_t +mach_read_from_7( +/*=============*/ + const byte* b) /*!< in: pointer to 7 bytes */ +{ + return(ut_ull_create(mach_read_from_3(b), mach_read_from_4(b + 3))); +} + +/*******************************************************//** +The following function is used to store data in 6 consecutive +bytes. We store the most significant byte to the lowest address. */ +UNIV_INLINE +void +mach_write_to_6( +/*============*/ + byte* b, /*!< in: pointer to 6 bytes where to store */ + ib_uint64_t n) /*!< in: 48-bit integer */ +{ + mach_write_to_2(b, (ulint) (n >> 32)); + mach_write_to_4(b + 2, (ulint) n); +} + +/********************************************************//** +The following function is used to fetch data from 6 consecutive +bytes. The most significant byte is at the lowest address. 
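+The result is assembled with ut_ull_create() from a 2-byte high part and
+a 4-byte low part.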
+@return 48-bit integer */ +UNIV_INLINE +ib_uint64_t +mach_read_from_6( +/*=============*/ + const byte* b) /*!< in: pointer to 6 bytes */ +{ + return(ut_ull_create(mach_read_from_2(b), mach_read_from_4(b + 2))); +} + +/*********************************************************//** +Writes a 64-bit integer in a compressed form (5..9 bytes). +@return size in bytes */ +UNIV_INLINE +ulint +mach_u64_write_compressed( +/*======================*/ + byte* b, /*!< in: pointer to memory where to store */ + ib_uint64_t n) /*!< in: 64-bit integer to be stored */ +{ + ulint size = mach_write_compressed(b, (ulint) (n >> 32)); + mach_write_to_4(b + size, (ulint) n); + + return(size + 4); +} + +/** Read a 64-bit integer in a compressed form. +@param[in,out] b pointer to memory where to read; +advanced by the number of bytes consumed +@return unsigned value */ +UNIV_INLINE +ib_uint64_t +mach_u64_read_next_compressed( + const byte** b) +{ + ib_uint64_t val; + + val = mach_read_next_compressed(b); + val <<= 32; + val |= mach_read_from_4(*b); + *b += 4; + return(val); +} + +/*********************************************************//** +Writes a 64-bit integer in a compressed form (1..11 bytes). +@return size in bytes */ +UNIV_INLINE +ulint +mach_u64_write_much_compressed( +/*===========================*/ + byte* b, /*!< in: pointer to memory where to store */ + ib_uint64_t n) /*!< in: 64-bit integer to be stored */ +{ + ulint size; + + if (!(n >> 32)) { + return(mach_write_compressed(b, (ulint) n)); + } + + *b = (byte)0xFF; + size = 1 + mach_write_compressed(b + 1, (ulint) (n >> 32)); + + size += mach_write_compressed(b + size, (ulint) n & 0xFFFFFFFF); + + return(size); +} + +/*********************************************************//** +Reads a 64-bit integer in a compressed form. +@return the value read */ +UNIV_INLINE +ib_uint64_t +mach_u64_read_much_compressed( +/*==========================*/ + const byte* b) /*!< in: pointer to memory from where to read */ +{ + ib_uint64_t n; + + if (*b != 0xFF) { + return(mach_read_compressed(b)); + } + + b++; + n = mach_read_next_compressed(&b); + n <<= 32; + n |= mach_read_compressed(b); + + return(n); +} + +/** Read a 64-bit integer in a compressed form. +@param[in,out] b pointer to memory where to read; +advanced by the number of bytes consumed +@return unsigned value */ +UNIV_INLINE +ib_uint64_t +mach_read_next_much_compressed( + const byte** b) +{ + ib_uint64_t val = mach_read_from_1(*b); + + if (val < 0x80) { + /* 0nnnnnnn (7 bits) */ + ++*b; + } else if (val < 0xC0) { + /* 10nnnnnn nnnnnnnn (14 bits) */ + val = mach_read_from_2(*b) & 0x3FFF; + ut_ad(val > 0x7F); + *b += 2; + } else if (val < 0xE0) { + /* 110nnnnn nnnnnnnn nnnnnnnn (21 bits) */ + val = mach_read_from_3(*b) & 0x1FFFFF; + ut_ad(val > 0x3FFF); + *b += 3; + } else if (val < 0xF0) { + /* 1110nnnn nnnnnnnn nnnnnnnn nnnnnnnn (28 bits) */ + val = mach_read_from_4(*b) & 0xFFFFFFF; + ut_ad(val > 0x1FFFFF); + *b += 4; + } else if (val == 0xF0) { + /* 11110000 nnnnnnnn nnnnnnnn nnnnnnnn nnnnnnnn (32 bits) */ + val = mach_read_from_4(*b + 1); + ut_ad(val > 0xFFFFFFF); + *b += 5; + } else { + /* 11111111 followed by up to 64 bits */ + ut_ad(val == 0xFF); + ++*b; + val = mach_read_next_compressed(b); + ut_ad(val > 0); + val <<= 32; + val |= mach_read_next_compressed(b); + } + + return(val); +} + +/*********************************************************//** +Reads a double. It is stored in a little-endian format. 
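+The bytes are copied one at a time, so b does not have to be aligned.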
+@return double read */ +UNIV_INLINE +double +mach_double_read( +/*=============*/ + const byte* b) /*!< in: pointer to memory from where to read */ +{ + double d; + ulint i; + byte* ptr; + + ptr = (byte*) &d; + + for (i = 0; i < sizeof(double); i++) { +#ifdef WORDS_BIGENDIAN + ptr[sizeof(double) - i - 1] = b[i]; +#else + ptr[i] = b[i]; +#endif + } + + return(d); +} + +/*********************************************************//** +Writes a double. It is stored in a little-endian format. */ +UNIV_INLINE +void +mach_double_write( +/*==============*/ + byte* b, /*!< in: pointer to memory where to write */ + double d) /*!< in: double */ +{ + ulint i; + byte* ptr; + + ptr = (byte*) &d; + + for (i = 0; i < sizeof(double); i++) { +#ifdef WORDS_BIGENDIAN + b[i] = ptr[sizeof(double) - i - 1]; +#else + b[i] = ptr[i]; +#endif + } +} + +/*********************************************************//** +Reads a float. It is stored in a little-endian format. +@return float read */ +UNIV_INLINE +float +mach_float_read( +/*============*/ + const byte* b) /*!< in: pointer to memory from where to read */ +{ + float d; + ulint i; + byte* ptr; + + ptr = (byte*) &d; + + for (i = 0; i < sizeof(float); i++) { +#ifdef WORDS_BIGENDIAN + ptr[sizeof(float) - i - 1] = b[i]; +#else + ptr[i] = b[i]; +#endif + } + + return(d); +} + +/*********************************************************//** +Writes a float. It is stored in a little-endian format. */ +UNIV_INLINE +void +mach_float_write( +/*=============*/ + byte* b, /*!< in: pointer to memory where to write */ + float d) /*!< in: float */ +{ + ulint i; + byte* ptr; + + ptr = (byte*) &d; + + for (i = 0; i < sizeof(float); i++) { +#ifdef WORDS_BIGENDIAN + b[i] = ptr[sizeof(float) - i - 1]; +#else + b[i] = ptr[i]; +#endif + } +} + +/*********************************************************//** +Reads a ulint stored in the little-endian format. +@return unsigned long int */ +UNIV_INLINE +ulint +mach_read_from_n_little_endian( +/*===========================*/ + const byte* buf, /*!< in: from where to read */ + ulint buf_size) /*!< in: from how many bytes to read */ +{ + ulint n = 0; + const byte* ptr; + + ut_ad(buf_size > 0); + + ptr = buf + buf_size; + + for (;;) { + ptr--; + + n = n << 8; + + n += (ulint)(*ptr); + + if (ptr == buf) { + break; + } + } + + return(n); +} + +/*********************************************************//** +Writes a ulint in the little-endian format. */ +UNIV_INLINE +void +mach_write_to_n_little_endian( +/*==========================*/ + byte* dest, /*!< in: where to write */ + ulint dest_size, /*!< in: into how many bytes to write */ + ulint n) /*!< in: unsigned long int to write */ +{ + byte* end; + + ut_ad(dest_size <= sizeof(ulint)); + ut_ad(dest_size > 0); + + end = dest + dest_size; + + for (;;) { + *dest = (byte)(n & 0xFF); + + n = n >> 8; + + dest++; + + if (dest == end) { + break; + } + } + + ut_ad(n == 0); +} + +/*********************************************************//** +Reads a ulint stored in the little-endian format. +@return unsigned long int */ +UNIV_INLINE +ulint +mach_read_from_2_little_endian( +/*===========================*/ + const byte* buf) /*!< in: from where to read */ +{ + return((ulint)(buf[0]) | ((ulint)(buf[1]) << 8)); +} + +/*********************************************************//** +Writes a ulint in the little-endian format. 
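+For example, n = 0x1234 is stored as dest[0] = 0x34, dest[1] = 0x12.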
*/ +UNIV_INLINE +void +mach_write_to_2_little_endian( +/*==========================*/ + byte* dest, /*!< in: where to write */ + ulint n) /*!< in: unsigned long int to write */ +{ + ut_ad(n < 256 * 256); + + *dest = (byte)(n & 0xFFUL); + + n = n >> 8; + dest++; + + *dest = (byte)(n & 0xFFUL); +} + +/*********************************************************//** +Convert integral type from storage byte order (big endian) to +host byte order. +@return integer value */ +UNIV_INLINE +ib_uint64_t +mach_read_int_type( +/*===============*/ + const byte* src, /*!< in: where to read from */ + ulint len, /*!< in: length of src */ + ibool unsigned_type) /*!< in: signed or unsigned flag */ +{ + /* XXX this can be optimized on big-endian machines */ + + uintmax_t ret; + uint i; + + if (unsigned_type || (src[0] & 0x80)) { + + ret = 0x0000000000000000ULL; + } else { + + ret = 0xFFFFFFFFFFFFFF00ULL; + } + + if (unsigned_type) { + + ret |= src[0]; + } else { + + ret |= src[0] ^ 0x80; + } + + for (i = 1; i < len; i++) { + ret <<= 8; + ret |= src[i]; + } + + return(ret); +} +/*********************************************************//** +Swap byte ordering. */ +UNIV_INLINE +void +mach_swap_byte_order( +/*=================*/ + byte* dest, /*!< out: where to write */ + const byte* from, /*!< in: where to read from */ + ulint len) /*!< in: length of src */ +{ + ut_ad(len > 0); + ut_ad(len <= 8); + + dest += len; + + switch (len & 0x7) { + case 0: *--dest = *from++; /* fall through */ + case 7: *--dest = *from++; /* fall through */ + case 6: *--dest = *from++; /* fall through */ + case 5: *--dest = *from++; /* fall through */ + case 4: *--dest = *from++; /* fall through */ + case 3: *--dest = *from++; /* fall through */ + case 2: *--dest = *from++; /* fall through */ + case 1: *--dest = *from; + } +} + +/************************************************************* +Convert a ulonglong integer from host byte order to (big-endian) +storage byte order. */ +UNIV_INLINE +void +mach_write_ulonglong( +/*=================*/ + byte* dest, /*!< in: where to write */ + ulonglong src, /*!< in: where to read from */ + ulint len, /*!< in: length of dest */ + bool usign) /*!< in: signed or unsigned flag */ +{ + byte* ptr = reinterpret_cast(&src); + + ut_ad(len <= sizeof(ulonglong)); + +#ifdef WORDS_BIGENDIAN + memcpy(dest, ptr + (sizeof(src) - len), len); +#else + mach_swap_byte_order(dest, reinterpret_cast(ptr), len); +#endif /* WORDS_BIGENDIAN */ + + if (!usign) { + *dest ^= 0x80; + } +} + +#endif /* !UNIV_INNOCHECKSUM */ diff --git a/storage/innobase/include/mariadb_stats.h b/storage/innobase/include/mariadb_stats.h new file mode 100644 index 00000000..e9051c0c --- /dev/null +++ b/storage/innobase/include/mariadb_stats.h @@ -0,0 +1,119 @@ +/***************************************************************************** + +Copyright (c) 2023, MariaDB Foundation + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#ifndef mariadb_stats_h
+#define mariadb_stats_h
+
+/* Include file to handle mariadbd handler specific stats */
+
+#include "ha_handler_stats.h"
+#include "my_rdtsc.h"
+
+/* Threads that are not active point to this structure */
+extern thread_local ha_handler_stats mariadb_dummy_stats;
+
+/* Points to either THD->handler_stats or mariadb_dummy_stats */
+extern thread_local ha_handler_stats *mariadb_stats;
+
+/*
+  Returns true if MariaDB wants engine statistics
+*/
+
+inline bool mariadb_stats_active()
+{
+  return mariadb_stats->active != 0;
+}
+
+inline bool mariadb_stats_active(ha_handler_stats *stats)
+{
+  return stats->active != 0;
+}
+
+/* The following functions increment different engine statistics */
+
+inline void mariadb_increment_pages_accessed()
+{
+  mariadb_stats->pages_accessed++;
+}
+
+inline void mariadb_increment_pages_updated(ulonglong count)
+{
+  mariadb_stats->pages_updated+= count;
+}
+
+inline void mariadb_increment_pages_read()
+{
+  mariadb_stats->pages_read_count++;
+}
+
+inline void mariadb_increment_undo_records_read()
+{
+  mariadb_stats->undo_records_read++;
+}
+
+/*
+  The following must be identical to the code of measure() in
+  sql_analyze_stmt.h
+
+  One should only call this if mariadb_stats_active() is true.
+*/
+
+inline ulonglong mariadb_measure()
+{
+#if (MY_TIMER_ROUTINE_CYCLES)
+  return my_timer_cycles();
+#else
+  return my_timer_microseconds();
+#endif
+}
+
+/*
+  Call this only if start_time != 0
+  See buf0rea.cc for an example of how to use it efficiently
+*/
+
+inline void mariadb_increment_pages_read_time(ulonglong start_time)
+{
+  ha_handler_stats *stats= mariadb_stats;
+  ulonglong end_time= mariadb_measure();
+  /* Check that we only call this if active, see example! */
+  DBUG_ASSERT(start_time);
+  DBUG_ASSERT(mariadb_stats_active(stats));
+
+  stats->pages_read_time+= (end_time - start_time);
+}
+
+
+/*
+  Helper class to set mariadb_stats temporarily for one call in handler.cc
+*/
+
+class mariadb_set_stats
+{
+public:
+  uint flag;
+  mariadb_set_stats(ha_handler_stats *stats)
+  {
+    mariadb_stats= stats ? stats : &mariadb_dummy_stats;
+  }
+  ~mariadb_set_stats()
+  {
+    mariadb_stats= &mariadb_dummy_stats;
+  }
+};
+
+#endif /* mariadb_stats_h */
diff --git a/storage/innobase/include/mem0mem.h b/storage/innobase/include/mem0mem.h
new file mode 100644
index 00000000..959147a6
--- /dev/null
+++ b/storage/innobase/include/mem0mem.h
@@ -0,0 +1,345 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/mem0mem.h +The memory management + +Created 6/9/1994 Heikki Tuuri +*******************************************************/ + +#ifndef mem0mem_h +#define mem0mem_h + +#include "ut0mem.h" +#include "ut0rnd.h" +#include "mach0data.h" + +#include + +/* -------------------- MEMORY HEAPS ----------------------------- */ + +/** A block of a memory heap consists of the info structure +followed by an area of memory */ +typedef struct mem_block_info_t mem_block_t; + +/** A memory heap is a nonempty linear list of memory blocks */ +typedef mem_block_t mem_heap_t; + +/** Types of allocation for memory heaps: DYNAMIC means allocation from the +dynamic memory pool of the C compiler, BUFFER means allocation from the +buffer pool; the latter method is used for very big heaps */ + +#define MEM_HEAP_DYNAMIC 0 /* the most common type */ +#define MEM_HEAP_BUFFER 1 +#define MEM_HEAP_BTR_SEARCH 2 /* this flag can optionally be + ORed to MEM_HEAP_BUFFER, in which + case heap->free_block is used in + some cases for memory allocations, + and if it's NULL, the memory + allocation functions can return + NULL. */ + +/** Different type of heaps in terms of which datastructure is using them */ +#define MEM_HEAP_FOR_BTR_SEARCH (MEM_HEAP_BTR_SEARCH | MEM_HEAP_BUFFER) +#define MEM_HEAP_FOR_LOCK_HEAP (MEM_HEAP_BUFFER) + +/** The following start size is used for the first block in the memory heap if +the size is not specified, i.e., 0 is given as the parameter in the call of +create. The standard size is the maximum (payload) size of the blocks used for +allocations of small buffers. */ + +#define MEM_BLOCK_START_SIZE 64 +#define MEM_BLOCK_STANDARD_SIZE \ + (srv_page_size >= 16384 ? 8000 : MEM_MAX_ALLOC_IN_BUF) + +/** If a memory heap is allowed to grow into the buffer pool, the following +is the maximum size for a single allocated buffer: */ +#define MEM_MAX_ALLOC_IN_BUF (srv_page_size - 200 + REDZONE_SIZE) + +/** Space needed when allocating for a user a field of length N. +The space is allocated only in multiples of UNIV_MEM_ALIGNMENT. */ +#define MEM_SPACE_NEEDED(N) UT_CALC_ALIGN((N), UNIV_MEM_ALIGNMENT) + +#ifdef UNIV_DEBUG +/** Macro for memory heap creation. +@param[in] size Desired start block size. */ +# define mem_heap_create(size) \ + mem_heap_create_func((size), __FILE__, __LINE__, MEM_HEAP_DYNAMIC) + +/** Macro for memory heap creation. +@param[in] size Desired start block size. +@param[in] type Heap type */ +# define mem_heap_create_typed(size, type) \ + mem_heap_create_func((size), __FILE__, __LINE__, (type)) + +#else /* UNIV_DEBUG */ +/** Macro for memory heap creation. +@param[in] size Desired start block size. */ +# define mem_heap_create(size) mem_heap_create_func((size), MEM_HEAP_DYNAMIC) + +/** Macro for memory heap creation. +@param[in] size Desired start block size. +@param[in] type Heap type */ +# define mem_heap_create_typed(size, type) \ + mem_heap_create_func((size), (type)) + +#endif /* UNIV_DEBUG */ + +/** Creates a memory heap. +NOTE: Use the corresponding macros instead of this function. +A single user buffer of 'size' will fit in the block. +0 creates a default size block. +@param[in] size Desired start block size. 
+@param[in] file_name File name where created +@param[in] line Line where created +@param[in] type Heap type +@return own: memory heap, NULL if did not succeed (only possible for +MEM_HEAP_BTR_SEARCH type heaps) */ +UNIV_INLINE +mem_heap_t* +mem_heap_create_func( + ulint size, +#ifdef UNIV_DEBUG + const char* file_name, + unsigned line, +#endif /* UNIV_DEBUG */ + ulint type); + +/** Frees the space occupied by a memory heap. +NOTE: Use the corresponding macro instead of this function. +@param[in] heap Heap to be freed */ +UNIV_INLINE +void +mem_heap_free( + mem_heap_t* heap); + +/** Allocates and zero-fills n bytes of memory from a memory heap. +@param[in] heap memory heap +@param[in] n number of bytes; if the heap is allowed to grow into +the buffer pool, this must be <= MEM_MAX_ALLOC_IN_BUF +@return allocated, zero-filled storage */ +UNIV_INLINE +void* +mem_heap_zalloc( + mem_heap_t* heap, + ulint n); + +/** Allocates n bytes of memory from a memory heap. +@param[in] heap memory heap +@param[in] n number of bytes; if the heap is allowed to grow into +the buffer pool, this must be <= MEM_MAX_ALLOC_IN_BUF +@return allocated storage, NULL if did not succeed (only possible for +MEM_HEAP_BTR_SEARCH type heaps) */ +UNIV_INLINE +void* +mem_heap_alloc( + mem_heap_t* heap, + ulint n); + +/** Returns a pointer to the heap top. +@param[in] heap memory heap +@return pointer to the heap top */ +UNIV_INLINE +byte* +mem_heap_get_heap_top( + mem_heap_t* heap); + +/** Frees the space in a memory heap exceeding the pointer given. +The pointer must have been acquired from mem_heap_get_heap_top. +The first memory block of the heap is not freed. +@param[in] heap heap from which to free +@param[in] old_top pointer to old top of heap */ +UNIV_INLINE +void +mem_heap_free_heap_top( + mem_heap_t* heap, + byte* old_top); + +/** Empties a memory heap. +The first memory block of the heap is not freed. +@param[in] heap heap to empty */ +UNIV_INLINE +void +mem_heap_empty( + mem_heap_t* heap); + +/** Returns a pointer to the topmost element in a memory heap. +The size of the element must be given. +@param[in] heap memory heap +@param[in] n size of the topmost element +@return pointer to the topmost element */ +UNIV_INLINE +void* +mem_heap_get_top( + mem_heap_t* heap, + ulint n); + +/*****************************************************************//** +Frees the topmost element in a memory heap. +The size of the element must be given. */ +UNIV_INLINE +void +mem_heap_free_top( +/*==============*/ + mem_heap_t* heap, /*!< in: memory heap */ + ulint n); /*!< in: size of the topmost element */ +/*****************************************************************//** +Returns the space in bytes occupied by a memory heap. */ +UNIV_INLINE +ulint +mem_heap_get_size( +/*==============*/ + mem_heap_t* heap); /*!< in: heap */ + +/**********************************************************************//** +Duplicates a NUL-terminated string. +@return own: a copy of the string, must be deallocated with ut_free */ +UNIV_INLINE +char* +mem_strdup( +/*=======*/ + const char* str); /*!< in: string to be copied */ +/**********************************************************************//** +Makes a NUL-terminated copy of a nonterminated string. +@return own: a copy of the string, must be deallocated with ut_free */ +UNIV_INLINE +char* +mem_strdupl( +/*========*/ + const char* str, /*!< in: string to be copied */ + ulint len); /*!< in: length of str, in bytes */ + +/** Duplicate a block of data, allocated from a memory heap. 
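+If data is NULL (which is only allowed when len is 0), nothing is
+allocated and NULL is returned.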
+@param[in] heap memory heap where string is allocated +@param[in] data block of data to be copied +@param[in] len length of data, in bytes +@return own: a copy of data */ +inline +void* +mem_heap_dup(mem_heap_t* heap, const void* data, size_t len) +{ + ut_ad(data || !len); + return UNIV_LIKELY(data != NULL) + ? memcpy(mem_heap_alloc(heap, len), data, len) + : NULL; +} + +/** Duplicate a NUL-terminated string, allocated from a memory heap. +@param[in] heap memory heap where string is allocated +@param[in] str string to be copied +@return own: a copy of the string */ +inline +char* +mem_heap_strdup(mem_heap_t* heap, const char* str) +{ + return(static_cast(mem_heap_dup(heap, str, strlen(str) + 1))); +} + +/** Duplicate a string, allocated from a memory heap. +@param[in] heap memory heap where string is allocated +@param[in] str string to be copied +@param[in] len length of str, in bytes +@return own: a NUL-terminated copy of str */ +inline +char* +mem_heap_strdupl(mem_heap_t* heap, const char* str, size_t len) +{ + char* s = static_cast(mem_heap_alloc(heap, len + 1)); + s[len] = 0; + return(static_cast(memcpy(s, str, len))); +} + +/**********************************************************************//** +Concatenate two strings and return the result, using a memory heap. +@return own: the result */ +char* +mem_heap_strcat( +/*============*/ + mem_heap_t* heap, /*!< in: memory heap where string is allocated */ + const char* s1, /*!< in: string 1 */ + const char* s2); /*!< in: string 2 */ + +/****************************************************************//** +A simple sprintf replacement that dynamically allocates the space for the +formatted string from the given heap. This supports a very limited set of +the printf syntax: types 's' and 'u' and length modifier 'l' (which is +required for the 'u' type). +@return heap-allocated formatted string */ +char* +mem_heap_printf( +/*============*/ + mem_heap_t* heap, /*!< in: memory heap */ + const char* format, /*!< in: format string */ + ...) MY_ATTRIBUTE ((format (printf, 2, 3))); + +#ifdef UNIV_DEBUG +/** Validates the contents of a memory heap. +Asserts that the memory heap is consistent +@param[in] heap Memory heap to validate */ +void +mem_heap_validate( + const mem_heap_t* heap); + +#endif /* UNIV_DEBUG */ + +/*#######################################################################*/ + +/** The info structure stored at the beginning of a heap block */ +struct mem_block_info_t { +#ifdef UNIV_DEBUG + char file_name[8];/* file name where the mem heap was created */ + unsigned line; /*!< line number where the mem heap was created */ +#endif /* UNIV_DEBUG */ + UT_LIST_BASE_NODE_T(mem_block_t) base; /* In the first block in the + the list this is the base node of the list of blocks; + in subsequent blocks this is undefined */ + UT_LIST_NODE_T(mem_block_t) list; /* This contains pointers to next + and prev in the list. The first block allocated + to the heap is also the first block in this list, + though it also contains the base node of the list. */ + ulint len; /*!< physical length of this block in bytes */ + ulint total_size; /*!< physical length in bytes of all blocks + in the heap. This is defined only in the base + node and is set to ULINT_UNDEFINED in others. 
*/ + ulint type; /*!< type of heap: MEM_HEAP_DYNAMIC, or + MEM_HEAP_BUF possibly ORed to MEM_HEAP_BTR_SEARCH */ + ulint free; /*!< offset in bytes of the first free position for + user data in the block */ + ulint start; /*!< the value of the struct field 'free' at the + creation of the block */ + + void* free_block; + /* if the MEM_HEAP_BTR_SEARCH bit is set in type, + and this is the heap root, this can contain an + allocated buffer frame, which can be appended as a + free block to the heap, if we need more space; + otherwise, this is NULL */ + void* buf_block; + /* if this block has been allocated from the buffer + pool, this contains the buf_block_t handle; + otherwise, this is NULL */ +}; + +/* Header size for a memory heap block */ +#define MEM_BLOCK_HEADER_SIZE UT_CALC_ALIGN(sizeof(mem_block_info_t),\ + UNIV_MEM_ALIGNMENT) + +#include "mem0mem.inl" +#endif diff --git a/storage/innobase/include/mem0mem.inl b/storage/innobase/include/mem0mem.inl new file mode 100644 index 00000000..9906daf3 --- /dev/null +++ b/storage/innobase/include/mem0mem.inl @@ -0,0 +1,468 @@ +/***************************************************************************** + +Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/mem0mem.ic +The memory management + +Created 6/8/1994 Heikki Tuuri +*************************************************************************/ + +#include "ut0new.h" + +#ifdef UNIV_DEBUG +# define mem_heap_create_block(heap, n, type, file_name, line) \ + mem_heap_create_block_func(heap, n, file_name, line, type) +# define mem_heap_create_at(N, file_name, line) \ + mem_heap_create_func(N, file_name, line, MEM_HEAP_DYNAMIC) +#else /* UNIV_DEBUG */ +# define mem_heap_create_block(heap, n, type, file_name, line) \ + mem_heap_create_block_func(heap, n, type) +# define mem_heap_create_at(N, file_name, line) \ + mem_heap_create_func(N, MEM_HEAP_DYNAMIC) +#endif /* UNIV_DEBUG */ +/***************************************************************//** +Creates a memory heap block where data can be allocated. 
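+Depending on the heap type, the block is allocated either from the C heap
+(MEM_HEAP_DYNAMIC) or from the buffer pool (MEM_HEAP_BUFFER).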
+@return own: memory heap block, NULL if did not succeed (only possible +for MEM_HEAP_BTR_SEARCH type heaps) */ +mem_block_t* +mem_heap_create_block_func( +/*=======================*/ + mem_heap_t* heap, /*!< in: memory heap or NULL if first block + should be created */ + ulint n, /*!< in: number of bytes needed for user data */ +#ifdef UNIV_DEBUG + const char* file_name,/*!< in: file name where created */ + unsigned line, /*!< in: line where created */ +#endif /* UNIV_DEBUG */ + ulint type); /*!< in: type of heap: MEM_HEAP_DYNAMIC or + MEM_HEAP_BUFFER */ + +/******************************************************************//** +Frees a block from a memory heap. */ +void +mem_heap_block_free( +/*================*/ + mem_heap_t* heap, /*!< in: heap */ + mem_block_t* block); /*!< in: block to free */ + +/******************************************************************//** +Frees the free_block field from a memory heap. */ +void +mem_heap_free_block_free( +/*=====================*/ + mem_heap_t* heap); /*!< in: heap */ + +/***************************************************************//** +Adds a new block to a memory heap. +@param[in] heap memory heap +@param[in] n number of bytes needed +@return created block, NULL if did not succeed (only possible for +MEM_HEAP_BTR_SEARCH type heaps) */ +mem_block_t* +mem_heap_add_block( + mem_heap_t* heap, + ulint n); + +UNIV_INLINE +void +mem_block_set_len(mem_block_t* block, ulint len) +{ + ut_ad(len > 0); + + block->len = len; +} + +UNIV_INLINE +ulint +mem_block_get_len(mem_block_t* block) +{ + return(block->len); +} + +UNIV_INLINE +void +mem_block_set_type(mem_block_t* block, ulint type) +{ + ut_ad((type == MEM_HEAP_DYNAMIC) || (type == MEM_HEAP_BUFFER) + || (type == MEM_HEAP_BUFFER + MEM_HEAP_BTR_SEARCH)); + + block->type = type; +} + +UNIV_INLINE +ulint +mem_block_get_type(mem_block_t* block) +{ + return(block->type); +} + +UNIV_INLINE +void +mem_block_set_free(mem_block_t* block, ulint free) +{ + ut_ad(free > 0); + ut_ad(free <= mem_block_get_len(block)); + + block->free = free; +} + +UNIV_INLINE +ulint +mem_block_get_free(mem_block_t* block) +{ + return(block->free); +} + +UNIV_INLINE +void +mem_block_set_start(mem_block_t* block, ulint start) +{ + ut_ad(start > 0); + + block->start = start; +} + +UNIV_INLINE +ulint +mem_block_get_start(mem_block_t* block) +{ + return(block->start); +} + +/** Allocates and zero-fills n bytes of memory from a memory heap. +@param[in] heap memory heap +@param[in] n number of bytes; if the heap is allowed to grow into +the buffer pool, this must be <= MEM_MAX_ALLOC_IN_BUF +@return allocated, zero-filled storage */ +UNIV_INLINE +void* +mem_heap_zalloc( + mem_heap_t* heap, + ulint n) +{ + ut_ad(heap); + ut_ad(!(heap->type & MEM_HEAP_BTR_SEARCH)); + return(memset(mem_heap_alloc(heap, n), 0, n)); +} + +/** Allocates n bytes of memory from a memory heap. +@param[in] heap memory heap +@param[in] n number of bytes; if the heap is allowed to grow into +the buffer pool, this must be <= MEM_MAX_ALLOC_IN_BUF +@return allocated storage, NULL if did not succeed (only possible for +MEM_HEAP_BTR_SEARCH type heaps) */ +UNIV_INLINE +void* +mem_heap_alloc( + mem_heap_t* heap, + ulint n) +{ + mem_block_t* block; + byte* buf; + ulint free; + + block = UT_LIST_GET_LAST(heap->base); + + n += REDZONE_SIZE; + + ut_ad(!(block->type & MEM_HEAP_BUFFER) || (n <= MEM_MAX_ALLOC_IN_BUF)); + + /* Check if there is enough space in block. 
If not, create a new + block to the heap */ + + if (mem_block_get_len(block) + < mem_block_get_free(block) + MEM_SPACE_NEEDED(n)) { + + block = mem_heap_add_block(heap, n); + + if (block == NULL) { + + return(NULL); + } + } + + free = mem_block_get_free(block); + + buf = (byte*) block + free; + + mem_block_set_free(block, free + MEM_SPACE_NEEDED(n)); + + buf = buf + REDZONE_SIZE; + MEM_MAKE_ADDRESSABLE(buf, n - REDZONE_SIZE); + return(buf); +} + +/** Returns a pointer to the heap top. +@param[in] heap memory heap +@return pointer to the heap top */ +UNIV_INLINE +byte* +mem_heap_get_heap_top( + mem_heap_t* heap) +{ + mem_block_t* block; + byte* buf; + + block = UT_LIST_GET_LAST(heap->base); + + buf = (byte*) block + mem_block_get_free(block); + + return(buf); +} + +/** Frees the space in a memory heap exceeding the pointer given. +The pointer must have been acquired from mem_heap_get_heap_top. +The first memory block of the heap is not freed. +@param[in] heap heap from which to free +@param[in] old_top pointer to old top of heap */ +UNIV_INLINE +void +mem_heap_free_heap_top( + mem_heap_t* heap, + byte* old_top) +{ + mem_block_t* block; + mem_block_t* prev_block; + + ut_d(mem_heap_validate(heap)); + + block = UT_LIST_GET_LAST(heap->base); + + while (block != NULL) { + if (((byte*) block + mem_block_get_free(block) >= old_top) + && ((byte*) block <= old_top)) { + /* Found the right block */ + + break; + } + + /* Store prev_block value before freeing the current block + (the current block will be erased in freeing) */ + + prev_block = UT_LIST_GET_PREV(list, block); + + mem_heap_block_free(heap, block); + + block = prev_block; + } + + ut_ad(block); + + /* Set the free field of block */ + mem_block_set_free(block, + ulint(old_top - reinterpret_cast(block))); + + ut_ad(mem_block_get_start(block) <= mem_block_get_free(block)); + MEM_NOACCESS(old_top, (byte*) block + block->len - old_top); + + /* If free == start, we may free the block if it is not the first + one */ + + if ((heap != block) && (mem_block_get_free(block) + == mem_block_get_start(block))) { + mem_heap_block_free(heap, block); + } +} + +/** Empties a memory heap. +The first memory block of the heap is not freed. +@param[in] heap heap to empty */ +UNIV_INLINE +void +mem_heap_empty( + mem_heap_t* heap) +{ + mem_heap_free_heap_top(heap, (byte*) heap + mem_block_get_start(heap)); + + if (heap->free_block) { + mem_heap_free_block_free(heap); + } +} + +/** Returns a pointer to the topmost element in a memory heap. +The size of the element must be given. +@param[in] heap memory heap +@param[in] n size of the topmost element +@return pointer to the topmost element */ +UNIV_INLINE +void* +mem_heap_get_top( + mem_heap_t* heap, + ulint n) +{ + mem_block_t* block; + byte* buf; + + block = UT_LIST_GET_LAST(heap->base); + + buf = (byte*) block + mem_block_get_free(block) - MEM_SPACE_NEEDED(n); + + return((void*) buf); +} + +/*****************************************************************//** +Frees the topmost element in a memory heap. The size of the element must be +given. 
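+The free offset of the last block is simply moved back by
+MEM_SPACE_NEEDED(n), so n must match the size used in the most recent
+allocation from this heap.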
*/ +UNIV_INLINE +void +mem_heap_free_top( +/*==============*/ + mem_heap_t* heap, /*!< in: memory heap */ + ulint n) /*!< in: size of the topmost element */ +{ + mem_block_t* block; + + n += REDZONE_SIZE; + + block = UT_LIST_GET_LAST(heap->base); + + /* Subtract the free field of block */ + mem_block_set_free(block, mem_block_get_free(block) + - MEM_SPACE_NEEDED(n)); + + /* If free == start, we may free the block if it is not the first + one */ + + if ((heap != block) && (mem_block_get_free(block) + == mem_block_get_start(block))) { + mem_heap_block_free(heap, block); + } else { + MEM_NOACCESS((byte*) block + mem_block_get_free(block), n); + } +} + +/** Creates a memory heap. +NOTE: Use the corresponding macros instead of this function. +A single user buffer of 'size' will fit in the block. +0 creates a default size block. +@param[in] size Desired start block size. +@param[in] file_name File name where created +@param[in] line Line where created +@param[in] type Heap type +@return own: memory heap, NULL if did not succeed (only possible for +MEM_HEAP_BTR_SEARCH type heaps) */ +UNIV_INLINE +mem_heap_t* +mem_heap_create_func( + ulint size, +#ifdef UNIV_DEBUG + const char* file_name, + unsigned line, +#endif /* UNIV_DEBUG */ + ulint type) +{ + mem_block_t* block; + + if (!size) { + size = MEM_BLOCK_START_SIZE; + } + + block = mem_heap_create_block(NULL, size, type, file_name, line); + + if (block == NULL) { + + return(NULL); + } + + /* The first block should not be in buffer pool, + because it might be relocated to resize buffer pool. */ + ut_ad(block->buf_block == NULL); + + UT_LIST_INIT(block->base, &mem_block_t::list); + + /* Add the created block itself as the first block in the list */ + UT_LIST_ADD_FIRST(block->base, block); + + return(block); +} + +/** Frees the space occupied by a memory heap. +NOTE: Use the corresponding macro instead of this function. +@param[in] heap Heap to be freed */ +UNIV_INLINE +void +mem_heap_free( + mem_heap_t* heap) +{ + mem_block_t* block; + mem_block_t* prev_block; + + block = UT_LIST_GET_LAST(heap->base); + + if (heap->free_block) { + mem_heap_free_block_free(heap); + } + + while (block != NULL) { + /* Store the contents of info before freeing current block + (it is erased in freeing) */ + + prev_block = UT_LIST_GET_PREV(list, block); + + mem_heap_block_free(heap, block); + + block = prev_block; + } +} + +/*****************************************************************//** +Returns the space in bytes occupied by a memory heap. */ +UNIV_INLINE +ulint +mem_heap_get_size( +/*==============*/ + mem_heap_t* heap) /*!< in: heap */ +{ + ulint size = heap->total_size; + + if (heap->free_block) { + size += srv_page_size; + } + + return(size); +} + +/**********************************************************************//** +Duplicates a NUL-terminated string. +@return own: a copy of the string, must be deallocated with ut_free */ +UNIV_INLINE +char* +mem_strdup( +/*=======*/ + const char* str) /*!< in: string to be copied */ +{ + ulint len = strlen(str) + 1; + return(static_cast(memcpy(ut_malloc_nokey(len), str, len))); +} + +/**********************************************************************//** +Makes a NUL-terminated copy of a nonterminated string. 
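+len bytes are copied and a terminating NUL byte is appended.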
+@return own: a copy of the string, must be deallocated with ut_free */
+UNIV_INLINE
+char*
+mem_strdupl(
+/*========*/
+	const char*	str,	/*!< in: string to be copied */
+	ulint		len)	/*!< in: length of str, in bytes */
+{
+	char*	s = static_cast<char*>(ut_malloc_nokey(len + 1));
+	s[len] = 0;
+	return(static_cast<char*>(memcpy(s, str, len)));
+}
diff --git a/storage/innobase/include/mtr0log.h b/storage/innobase/include/mtr0log.h
new file mode 100644
index 00000000..e2419309
--- /dev/null
+++ b/storage/innobase/include/mtr0log.h
@@ -0,0 +1,637 @@
+/*****************************************************************************
+
+Copyright (c) 2019, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**
+@file include/mtr0log.h
+Mini-transaction log record encoding and decoding
+*******************************************************/
+
+#pragma once
+#include "mtr0mtr.h"
+
+/** The smallest invalid page identifier for persistent tablespaces */
+constexpr page_id_t end_page_id{SRV_SPACE_ID_UPPER_BOUND, 0};
+
+/** The minimum 2-byte integer (0b10xxxxxx xxxxxxxx) */
+constexpr uint32_t MIN_2BYTE= 1 << 7;
+/** The minimum 3-byte integer (0b110xxxxx xxxxxxxx xxxxxxxx) */
+constexpr uint32_t MIN_3BYTE= MIN_2BYTE + (1 << 14);
+/** The minimum 4-byte integer (0b1110xxxx xxxxxxxx xxxxxxxx xxxxxxxx) */
+constexpr uint32_t MIN_4BYTE= MIN_3BYTE + (1 << 21);
+/** Minimum 5-byte integer (0b11110000 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx) */
+constexpr uint32_t MIN_5BYTE= MIN_4BYTE + (1 << 28);
+
+/** Error from mlog_decode_varint() */
+constexpr uint32_t MLOG_DECODE_ERROR= ~0U;
+
+/** Decode the length of a variable-length encoded integer.
+@param first  first byte of the encoded integer
+@return the length, in bytes */
+inline uint8_t mlog_decode_varint_length(byte first)
+{
+  uint8_t len= 1;
+  for (; first & 0x80; len++, first= static_cast<byte>(first << 1));
+  return len;
+}
+
+/** Decode an integer in a redo log record.
+@param log    redo log record buffer
+@return the decoded integer
+@retval MLOG_DECODE_ERROR on error */
+template<typename byte_pointer>
+inline uint32_t mlog_decode_varint(const byte_pointer log)
+{
+  uint32_t i= *log;
+  if (i < MIN_2BYTE)
+    return i;
+  if (i < 0xc0)
+    return MIN_2BYTE + ((i & ~0x80) << 8 | log[1]);
+  if (i < 0xe0)
+    return MIN_3BYTE + ((i & ~0xc0) << 16 | uint32_t{log[1]} << 8 | log[2]);
+  if (i < 0xf0)
+    return MIN_4BYTE + ((i & ~0xe0) << 24 | uint32_t{log[1]} << 16 |
+                        uint32_t{log[2]} << 8 | log[3]);
+  if (i == 0xf0)
+  {
+    i= uint32_t{log[1]} << 24 | uint32_t{log[2]} << 16 |
+      uint32_t{log[3]} << 8 | log[4];
+    if (i <= ~MIN_5BYTE)
+      return MIN_5BYTE + i;
+  }
+  return MLOG_DECODE_ERROR;
+}
+
+/** Encode an integer in a redo log record.
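+This is the inverse of mlog_decode_varint(): values below MIN_2BYTE take
+a single byte, and each larger size class adds one more byte, up to five
+bytes in total.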
+@param log  redo log record buffer
+@param i    the integer to encode
+@return end of the encoded integer */
+inline byte *mlog_encode_varint(byte *log, size_t i)
+{
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wconversion" /* GCC 4 and 5 need this here */
+#endif
+  if (i < MIN_2BYTE)
+  {
+  }
+  else if (i < MIN_3BYTE)
+  {
+    i-= MIN_2BYTE;
+    static_assert(MIN_3BYTE - MIN_2BYTE == 1 << 14, "compatibility");
+    *log++= 0x80 | static_cast<byte>(i >> 8);
+  }
+  else if (i < MIN_4BYTE)
+  {
+    i-= MIN_3BYTE;
+    static_assert(MIN_4BYTE - MIN_3BYTE == 1 << 21, "compatibility");
+    *log++= 0xc0 | static_cast<byte>(i >> 16);
+    goto last2;
+  }
+  else if (i < MIN_5BYTE)
+  {
+    i-= MIN_4BYTE;
+    static_assert(MIN_5BYTE - MIN_4BYTE == 1 << 28, "compatibility");
+    *log++= 0xe0 | static_cast<byte>(i >> 24);
+    goto last3;
+  }
+  else
+  {
+    ut_ad(i < MLOG_DECODE_ERROR);
+    i-= MIN_5BYTE;
+    *log++= 0xf0;
+    *log++= static_cast<byte>(i >> 24);
+last3:
+    *log++= static_cast<byte>(i >> 16);
+last2:
+    *log++= static_cast<byte>(i >> 8);
+  }
+#if defined __GNUC__ && !defined __clang__ && __GNUC__ < 6
+# pragma GCC diagnostic pop
+#endif
+  *log++= static_cast<byte>(i);
+  return log;
+}
+
+/** Determine the length of a log record.
+@param log  start of log record
+@param end  end of the log record buffer
+@return the length of the record, in bytes
+@retval 0 if the log extends past the end
+@retval MLOG_DECODE_ERROR if the record is corrupted */
+inline uint32_t mlog_decode_len(const byte *log, const byte *end)
+{
+  ut_ad(log < end);
+  uint32_t i= *log;
+  if (!i)
+    return 0; /* end of mini-transaction */
+  if (~i & 15)
+    return (i & 15) + 1; /* 1..16 bytes */
+  if (UNIV_UNLIKELY(++log == end))
+    return 0; /* end of buffer */
+  i= *log;
+  if (UNIV_LIKELY(i < MIN_2BYTE)) /* 1 additional length byte: 16..143 bytes */
+    return 16 + i;
+  if (i < 0xc0) /* 2 additional length bytes: 144..16,527 bytes */
+  {
+    if (UNIV_UNLIKELY(log + 1 == end))
+      return 0; /* end of buffer */
+    return 16 + MIN_2BYTE + ((i & ~0xc0) << 8 | log[1]);
+  }
+  if (i < 0xe0) /* 3 additional length bytes: 16528..1065103 bytes */
+  {
+    if (UNIV_UNLIKELY(log + 2 == end))
+      return 0; /* end of buffer */
+    return 16 + MIN_3BYTE + ((i & ~0xe0) << 16 |
+                             static_cast<uint32_t>(log[1]) << 8 | log[2]);
+  }
+  /* 1,065,103 bytes per log record ought to be enough for everyone */
+  return MLOG_DECODE_ERROR;
+}
+
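
To make the 1-to-5-byte encoding concrete, here is a round-trip check at the encoding-class boundaries of mlog_encode_varint() and mlog_decode_varint(); a sketch that assumes this header and its InnoDB dependencies are on the include path:

    #include "mtr0log.h"
    #include <cassert>

    static void varint_roundtrip_demo()
    {
      /* one representative per encoding length, 1 to 5 bytes */
      const uint32_t samples[]= {0, MIN_2BYTE - 1, MIN_2BYTE, MIN_3BYTE,
                                 MIN_4BYTE, MIN_5BYTE, MLOG_DECODE_ERROR - 1};
      for (uint32_t v : samples)
      {
        byte buf[5];
        byte *end= mlog_encode_varint(buf, v);
        /* the first byte alone determines the encoded length */
        assert(end - buf == mlog_decode_varint_length(buf[0]));
        assert(mlog_decode_varint(buf) == v);
      }
    }
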
+/** Write 1, 2, 4, or 8 bytes to a file page.
+@param[in]      block   file page
+@param[in,out]  ptr     pointer in file page
+@param[in]      val     value to write
+@tparam l       number of bytes to write
+@tparam w       write request type
+@tparam V       type of val
+@return whether any log was written */
+template<unsigned l,mtr_t::write_type w,typename V>
+inline bool mtr_t::write(const buf_block_t &block, void *ptr, V val)
+{
+  ut_ad(ut_align_down(ptr, srv_page_size) == block.page.frame);
+  static_assert(l == 1 || l == 2 || l == 4 || l == 8, "wrong length");
+  byte buf[l];
+
+  switch (l) {
+  case 1:
+    ut_ad(val == static_cast<byte>(val));
+    buf[0]= static_cast<byte>(val);
+    break;
+  case 2:
+    ut_ad(val == static_cast<uint16_t>(val));
+    mach_write_to_2(buf, static_cast<uint16_t>(val));
+    break;
+  case 4:
+    ut_ad(val == static_cast<uint32_t>(val));
+    mach_write_to_4(buf, static_cast<uint32_t>(val));
+    break;
+  case 8:
+    mach_write_to_8(buf, val);
+    break;
+  }
+  byte *p= static_cast<byte*>(ptr);
+  const byte *const end= p + l;
+  if (w != FORCED && is_logged())
+  {
+    const byte *b= buf;
+    while (*p++ == *b++)
+    {
+      if (p == end)
+      {
+        ut_ad(w == MAYBE_NOP);
+        return false;
+      }
+    }
+    p--;
+  }
+  ::memcpy(ptr, buf, l);
+  memcpy_low(block, static_cast<uint16_t>
+             (ut_align_offset(p, srv_page_size)), p, end - p);
+  return true;
+}
+
+/** Log an initialization of a string of bytes.
+@param[in]      b       buffer page
+@param[in]      ofs     byte offset from b->frame
+@param[in]      len     length of the data to write
+@param[in]      val     the data byte to write */
+inline void mtr_t::memset(const buf_block_t &b, ulint ofs, ulint len, byte val)
+{
+  ut_ad(len);
+  set_modified(b);
+  if (!is_logged())
+    return;
+
+  static_assert(MIN_4BYTE > UNIV_PAGE_SIZE_MAX, "consistency");
+  size_t lenlen= (len < MIN_2BYTE ? 1 + 1 : len < MIN_3BYTE ? 2 + 1 : 3 + 1);
+  byte *l= log_write<MEMSET>(b.page.id(), &b.page, lenlen, true, ofs);
+  l= mlog_encode_varint(l, len);
+  *l++= val;
+  m_log.close(l);
+  m_last_offset= static_cast<uint16_t>(ofs + len);
+}
+
+/** Initialize a string of bytes.
+@param[in,out]  b       buffer page
+@param[in]      ofs     byte offset from block->frame
+@param[in]      len     length of the data to write
+@param[in]      val     the data byte to write */
+inline void mtr_t::memset(const buf_block_t *b, ulint ofs, ulint len, byte val)
+{
+  ut_ad(ofs <= ulint(srv_page_size));
+  ut_ad(ofs + len <= ulint(srv_page_size));
+  ::memset(ofs + b->page.frame, val, len);
+  memset(*b, ofs, len, val);
+}
+
+/** Log an initialization of a repeating string of bytes.
+@param[in]      b       buffer page
+@param[in]      ofs     byte offset from b->frame
+@param[in]      len     length of the data to write, in bytes
+@param[in]      str     the string to write
+@param[in]      size    size of str, in bytes */
+inline void mtr_t::memset(const buf_block_t &b, ulint ofs, size_t len,
+                          const void *str, size_t size)
+{
+  ut_ad(size);
+  ut_ad(len > size); /* use mtr_t::memcpy() for shorter writes */
+  set_modified(b);
+  if (!is_logged())
+    return;
+
+  static_assert(MIN_4BYTE > UNIV_PAGE_SIZE_MAX, "consistency");
+  size_t lenlen= (len < MIN_2BYTE ? 1 : len < MIN_3BYTE ? 2 : 3);
+  byte *l= log_write<MEMSET>(b.page.id(), &b.page, lenlen + size, true, ofs);
+  l= mlog_encode_varint(l, len);
+  ::memcpy(l, str, size);
+  l+= size;
+  m_log.close(l);
+  m_last_offset= static_cast<uint16_t>(ofs + len);
+}
+
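
The MAYBE_NOP branch above skips the longest matching prefix so that an unchanged page write is not logged at all. The same idea in standalone form (plain C++, names invented for illustration; note that mtr_t::write() trims only the prefix, while this sketch also trims the suffix):

    #include <cassert>
    #include <cstddef>
    #include <utility>

    /* Return the half-open byte range of dst that changes when src
       is written over it; an empty range means the write is a no-op. */
    static std::pair<std::size_t, std::size_t>
    changed_span(const unsigned char *dst, const unsigned char *src,
                 std::size_t len)
    {
      std::size_t begin= 0;
      while (begin < len && dst[begin] == src[begin])
        begin++;
      if (begin == len)
        return {0, 0};            /* identical: nothing to write or log */
      std::size_t end= len;
      while (end > begin + 1 && dst[end - 1] == src[end - 1])
        end--;
      return {begin, end};
    }

    int main()
    {
      const unsigned char dst[]= {1, 2, 3, 4, 5};
      const unsigned char src[]= {1, 2, 9, 4, 5};
      const auto span= changed_span(dst, src, 5);
      assert(span.first == 2 && span.second == 3); /* only byte 2 differs */
    }
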
+/** Initialize a repeating string of bytes.
+@param[in,out]  b       buffer page
+@param[in]      ofs     byte offset from b->frame
+@param[in]      len     length of the data to write, in bytes
+@param[in]      str     the string to write
+@param[in]      size    size of str, in bytes */
+inline void mtr_t::memset(const buf_block_t *b, ulint ofs, size_t len,
+                          const void *str, size_t size)
+{
+  ut_ad(ofs <= ulint(srv_page_size));
+  ut_ad(ofs + len <= ulint(srv_page_size));
+  ut_ad(len > size); /* use mtr_t::memcpy() for shorter writes */
+  size_t s= 0;
+  while (s < len - size)
+  {
+    ::memcpy(ofs + s + b->page.frame, str, size);
+    s+= size;
+  }
+  ::memcpy(ofs + s + b->page.frame, str, len - s);
+  memset(*b, ofs, len, str, size);
+}
+
+/** Log a write of a byte string to a page.
+@param[in]      b       buffer page
+@param[in]      offset  byte offset from b->frame
+@param[in]      len     length of the data to write */
+inline void mtr_t::memcpy(const buf_block_t &b, ulint offset, ulint len)
+{
+  ut_ad(len);
+  ut_ad(offset <= ulint(srv_page_size));
+  ut_ad(offset + len <= ulint(srv_page_size));
+  memcpy_low(b, uint16_t(offset), &b.page.frame[offset], len);
+}
+
+/** Log a write of a byte string to a page.
+@param block   page
+@param offset  byte offset within page
+@param data    data to be written
+@param len     length of the data, in bytes */
+inline void mtr_t::memcpy_low(const buf_block_t &block, uint16_t offset,
+                              const void *data, size_t len)
+{
+  ut_ad(len);
+  set_modified(block);
+  if (!is_logged())
+    return;
+  if (len < mtr_buf_t::MAX_DATA_SIZE - (1 + 3 + 3 + 5 + 5))
+  {
+    byte *end= log_write<WRITE>(block.page.id(), &block.page, len, true,
+                                offset);
+    ::memcpy(end, data, len);
+    m_log.close(end + len);
+  }
+  else
+  {
+    m_log.close(log_write<WRITE>(block.page.id(), &block.page, len, false,
+                                 offset));
+    m_log.push(static_cast<const byte*>(data), static_cast<uint32_t>(len));
+  }
+  m_last_offset= static_cast<uint16_t>(offset + len);
+}
+
+/** Log that a string of bytes was copied from the same page.
+@param[in]      b       buffer page
+@param[in]      d       destination offset within the page
+@param[in]      s       source offset within the page
+@param[in]      len     length of the data to copy */
+inline void mtr_t::memmove(const buf_block_t &b, ulint d, ulint s, ulint len)
+{
+  ut_ad(d >= 8);
+  ut_ad(s >= 8);
+  ut_ad(len);
+  ut_ad(s <= ulint(srv_page_size));
+  ut_ad(s + len <= ulint(srv_page_size));
+  ut_ad(s != d);
+  ut_ad(d <= ulint(srv_page_size));
+  ut_ad(d + len <= ulint(srv_page_size));
+
+  set_modified(b);
+  if (!is_logged())
+    return;
+  static_assert(MIN_4BYTE > UNIV_PAGE_SIZE_MAX, "consistency");
+  size_t lenlen= (len < MIN_2BYTE ? 1 : len < MIN_3BYTE ? 2 : 3);
+  /* The source offset is encoded relative to the destination offset,
+  with the sign in the least significant bit. */
+  if (s > d)
+    s= (s - d) << 1;
+  else
+    s= (d - s) << 1 | 1;
+  /* The source offset 0 is not possible. */
+  s-= 1 << 1;
+  size_t slen= (s < MIN_2BYTE ? 1 : s < MIN_3BYTE ? 2 : 3);
+  byte *l= log_write<MEMMOVE>(b.page.id(), &b.page, lenlen + slen, true, d);
+  l= mlog_encode_varint(l, len);
+  l= mlog_encode_varint(l, s);
+  m_log.close(l);
+  m_last_offset= static_cast<uint16_t>(d + len);
+}
+
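
The relative source-offset encoding used by memmove() above (the sign in the least significant bit, with offset 0 excluded so the code space stays dense) round-trips as follows; a minimal standalone sketch, not InnoDB code:

    #include <cassert>

    /* +x is encoded as (x-1)<<1, -x as (x-1)<<1 | 1 */
    static unsigned long encode_rel(long delta)
    {
      assert(delta != 0);
      return delta > 0 ? (static_cast<unsigned long>(delta) - 1) << 1
                       : ((static_cast<unsigned long>(-delta) - 1) << 1 | 1);
    }

    static long decode_rel(unsigned long enc)
    {
      const long x= static_cast<long>(enc >> 1) + 1;
      return (enc & 1) ? -x : x;
    }

    int main()
    {
      const long deltas[]= {1, -1, 2, -2, 37, -512};
      for (long delta : deltas)
        assert(decode_rel(encode_rel(delta)) == delta);
    }
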
+/**
+Write a log record.
+@tparam type   redo log record type
+@param id      persistent page identifier
+@param bpage   buffer pool page, or nullptr
+@param len     number of additional bytes to write
+@param alloc   whether to allocate the additional bytes
+@param offset  byte offset, or 0 if the record type does not allow one
+@return end of mini-transaction log, minus len */
+template<byte type>
+inline byte *mtr_t::log_write(const page_id_t id, const buf_page_t *bpage,
+                              size_t len, bool alloc, size_t offset)
+{
+  static_assert(!(type & 15) && type != RESERVED &&
+                type <= FILE_CHECKPOINT, "invalid type");
+  ut_ad(type >= FILE_CREATE || is_named_space(id.space()));
+  ut_ad(!bpage || bpage->id() == id);
+  ut_ad(id < end_page_id);
+  constexpr bool have_len= type != INIT_PAGE && type != FREE_PAGE;
+  constexpr bool have_offset= type == WRITE || type == MEMSET ||
+    type == MEMMOVE;
+  static_assert(!have_offset || have_len, "consistency");
+  ut_ad(have_len || len == 0);
+  ut_ad(have_len || !alloc);
+  ut_ad(have_offset || offset == 0);
+  ut_ad(offset + len <= srv_page_size);
+  static_assert(MIN_4BYTE >= UNIV_PAGE_SIZE_MAX, "consistency");
+  ut_ad(type == FREE_PAGE || type == OPTION || (type == EXTENDED && !bpage) ||
+        memo_contains_flagged(bpage, MTR_MEMO_MODIFY));
+  size_t max_len;
+  if (!have_len)
+    max_len= 1 + 5 + 5;
+  else if (!have_offset)
+    max_len= bpage && m_last == bpage
+      ? 1 + 3
+      : 1 + 3 + 5 + 5;
+  else if (bpage && m_last == bpage && m_last_offset <= offset)
+  {
+    /* Encode the offset relative from m_last_offset. */
+    offset-= m_last_offset;
+    max_len= 1 + 3 + 3;
+  }
+  else
+    max_len= 1 + 3 + 5 + 5 + 3;
+  byte *const log_ptr= m_log.open(alloc ? max_len + len : max_len);
+  byte *end= log_ptr + 1;
+  const byte same_page= max_len < 1 + 5 + 5 ? 0x80 : 0;
+  if (!same_page)
+  {
+    end= mlog_encode_varint(end, id.space());
+    end= mlog_encode_varint(end, id.page_no());
+    m_last= bpage;
+  }
+  if (have_offset)
+  {
+    byte* oend= mlog_encode_varint(end, offset);
+    if (oend + len > &log_ptr[16])
+    {
+      len+= oend - log_ptr - 15;
+      if (len >= MIN_3BYTE - 1)
+        len+= 2;
+      else if (len >= MIN_2BYTE)
+        len++;
+
+      *log_ptr= type | same_page;
+      end= mlog_encode_varint(log_ptr + 1, len);
+      if (!same_page)
+      {
+        end= mlog_encode_varint(end, id.space());
+        end= mlog_encode_varint(end, id.page_no());
+      }
+      end= mlog_encode_varint(end, offset);
+      return end;
+    }
+    else
+      end= oend;
+  }
+  else if (len >= 3 && end + len > &log_ptr[16])
+  {
+    len+= end - log_ptr - 15;
+    if (len >= MIN_3BYTE - 1)
+      len+= 2;
+    else if (len >= MIN_2BYTE)
+      len++;
+
+    end= log_ptr;
+    *end++= type | same_page;
+    end= mlog_encode_varint(end, len);
+
+    if (!same_page)
+    {
+      end= mlog_encode_varint(end, id.space());
+      end= mlog_encode_varint(end, id.page_no());
+    }
+    return end;
+  }
+
+  ut_ad(end + len >= &log_ptr[1] + !same_page);
+  ut_ad(end + len <= &log_ptr[16]);
+  ut_ad(end <= &log_ptr[max_len]);
+  *log_ptr= type | same_page | static_cast<byte>(end + len - log_ptr - 1);
+  ut_ad(*log_ptr & 15);
+  return end;
+}
+
+/** Write a byte string to a page.
+@param[in]      b       buffer page
+@param[in]      dest    destination within b.frame
+@param[in]      str     the data to write
+@param[in]      len     length of the data to write
+@tparam w       write request type */
+template<mtr_t::write_type w>
+inline void mtr_t::memcpy(const buf_block_t &b, void *dest, const void *str,
+                          ulint len)
+{
+  ut_ad(ut_align_down(dest, srv_page_size) == b.page.frame);
+  char *d= static_cast<char*>(dest);
+  const char *s= static_cast<const char*>(str);
+  if (w != FORCED && is_logged())
+  {
+    ut_ad(len);
+    const char *const end= d + len;
+    while (*d++ == *s++)
+    {
+      if (d == end)
+      {
+        ut_ad(w == MAYBE_NOP);
+        return;
+      }
+    }
+    s--;
+    d--;
+    len= static_cast<ulint>(end - d);
+  }
+  ::memcpy(d, s, len);
+  memcpy(b, ut_align_offset(d, srv_page_size), len);
+}
+
+/** Write an EXTENDED log record.
+@param block  buffer pool page
+@param type   extended record subtype; @see mrec_ext_t */
+inline void mtr_t::log_write_extended(const buf_block_t &block, byte type)
+{
+  set_modified(block);
+  if (!is_logged())
+    return;
+  byte *l= log_write<EXTENDED>(block.page.id(), &block.page, 1, true);
+  *l++= type;
+  m_log.close(l);
+  m_last_offset= FIL_PAGE_TYPE;
+}
+
+/** Write log for partly initializing a B-tree or R-tree page.
+@param block  B-tree or R-tree page
+@param comp   false=ROW_FORMAT=REDUNDANT, true=COMPACT or DYNAMIC */
+inline void mtr_t::page_create(const buf_block_t &block, bool comp)
+{
+  static_assert(false == INIT_ROW_FORMAT_REDUNDANT, "encoding");
+  static_assert(true == INIT_ROW_FORMAT_DYNAMIC, "encoding");
+  log_write_extended(block, comp);
+}
+
+/** Write log for deleting a B-tree or R-tree record in ROW_FORMAT=REDUNDANT.
+@param block     B-tree or R-tree page
+@param prev_rec  byte offset of the predecessor of the record to delete,
+                 starting from PAGE_OLD_INFIMUM */
+inline void mtr_t::page_delete(const buf_block_t &block, ulint prev_rec)
+{
+  ut_ad(!block.zip_size());
+  ut_ad(prev_rec < block.physical_size());
+  set_modified(block);
+  if (!is_logged())
+    return;
+  size_t len= (prev_rec < MIN_2BYTE ? 2 : prev_rec < MIN_3BYTE ? 3 : 4);
+  byte *l= log_write<EXTENDED>(block.page.id(), &block.page, len, true);
+  ut_d(byte *end= l + len);
+  *l++= DELETE_ROW_FORMAT_REDUNDANT;
+  l= mlog_encode_varint(l, prev_rec);
+  ut_ad(end == l);
+  m_log.close(l);
+  m_last_offset= FIL_PAGE_TYPE;
+}
+
+/** Write log for deleting a COMPACT or DYNAMIC B-tree or R-tree record.
+@param block      B-tree or R-tree page
+@param prev_rec   byte offset of the predecessor of the record to delete,
+                  starting from PAGE_NEW_INFIMUM
+@param hdr_size   record header size, excluding REC_N_NEW_EXTRA_BYTES
+@param data_size  data payload size, in bytes */
+inline void mtr_t::page_delete(const buf_block_t &block, ulint prev_rec,
+                               size_t hdr_size, size_t data_size)
+{
+  ut_ad(!block.zip_size());
+  set_modified(block);
+  ut_ad(hdr_size < MIN_3BYTE);
+  ut_ad(prev_rec < block.physical_size());
+  ut_ad(data_size < block.physical_size());
+  if (!is_logged())
+    return;
+  size_t len= prev_rec < MIN_2BYTE ? 2 : prev_rec < MIN_3BYTE ? 3 : 4;
+  len+= hdr_size < MIN_2BYTE ? 1 : 2;
+  len+= data_size < MIN_2BYTE ? 1 : data_size < MIN_3BYTE ? 2 : 3;
+  byte *l= log_write<EXTENDED>(block.page.id(), &block.page, len, true);
+  ut_d(byte *end= l + len);
+  *l++= DELETE_ROW_FORMAT_DYNAMIC;
+  l= mlog_encode_varint(l, prev_rec);
+  l= mlog_encode_varint(l, hdr_size);
+  l= mlog_encode_varint(l, data_size);
+  ut_ad(end == l);
+  m_log.close(l);
+  m_last_offset= FIL_PAGE_TYPE;
+}
+
+/** Write log for initializing an undo log page.
+@param block  undo page */
+inline void mtr_t::undo_create(const buf_block_t &block)
+{
+  log_write_extended(block, UNDO_INIT);
+}
+
+/** Write log for appending an undo log record.
+@param block  undo page
+@param data   record within the undo page
+@param len    length of the undo record, in bytes */
+inline void mtr_t::undo_append(const buf_block_t &block,
+                               const void *data, size_t len)
+{
+  ut_ad(len > 2);
+  set_modified(block);
+  if (!is_logged())
+    return;
+  const bool small= len + 1 < mtr_buf_t::MAX_DATA_SIZE - (1 + 3 + 3 + 5 + 5);
+  byte *end= log_write<EXTENDED>(block.page.id(), &block.page, len + 1, small);
+  if (UNIV_LIKELY(small))
+  {
+    *end++= UNDO_APPEND;
+    ::memcpy(end, data, len);
+    m_log.close(end + len);
+  }
+  else
+  {
+    m_log.close(end);
+    *m_log.push<byte*>(1)= UNDO_APPEND;
+    m_log.push(static_cast<const byte*>(data), static_cast<uint32_t>(len));
+  }
+  m_last_offset= FIL_PAGE_TYPE;
+}
+
+/** Trim the end of a tablespace.
+@param id  first page identifier that will not be in the file */
+inline void mtr_t::trim_pages(const page_id_t id)
+{
+  if (!is_logged())
+    return;
+  byte *l= log_write<EXTENDED>(id, nullptr, 1, true);
+  *l++= TRIM_PAGES;
+  m_log.close(l);
+  set_trim_pages();
+}
diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h
new file mode 100644
index 00000000..841cfab1
--- /dev/null
+++ b/storage/innobase/include/mtr0mtr.h
@@ -0,0 +1,780 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mtr0mtr.h
+Mini-transaction buffer
+
+Created 11/26/1995 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "fil0fil.h"
+#include "dyn0buf.h"
+#include "buf0buf.h"
+#include "small_vector.h"
+
+/** Start a mini-transaction. */
+#define mtr_start(m)		(m)->start()
+
+/** Commit a mini-transaction. */
+#define mtr_commit(m)		(m)->commit()
+
+/** Change the logging mode of a mini-transaction.
+@return old mode */
+#define mtr_set_log_mode(m, d)	(m)->set_log_mode((d))
+
+#ifdef UNIV_PFS_RWLOCK
+# define mtr_s_lock_index(i,m)	(m)->s_lock(__FILE__, __LINE__, &(i)->lock)
+# define mtr_x_lock_index(i,m)	(m)->x_lock(__FILE__, __LINE__, &(i)->lock)
+# define mtr_sx_lock_index(i,m)	(m)->u_lock(__FILE__, __LINE__, &(i)->lock)
+#else
+# define mtr_s_lock_index(i,m)	(m)->s_lock(&(i)->lock)
+# define mtr_x_lock_index(i,m)	(m)->x_lock(&(i)->lock)
+# define mtr_sx_lock_index(i,m)	(m)->u_lock(&(i)->lock)
+#endif
+
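
Before the class definition, a hedged sketch of the lifecycle that these macros wrap; the page acquisition, latching and memo registration that must precede a write (e.g. via buf_page_get_gen()) are elided, and the byte offset is arbitrary:

    #include "mtr0mtr.h"
    #include "mtr0log.h"

    static void mtr_usage_sketch(buf_block_t *block)
    {
      mtr_t mtr;
      mtr.start();            /* begin: collect latches and log records */
      if (block)
        /* block must already be latched and registered in the memo;
        this logs at most 4 bytes, skipping leading unchanged ones */
        mtr.write<4>(*block, block->page.frame + 24, 0x12345678U);
      mtr.commit();           /* append the log, release all latches */
    }
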
+/** Mini-transaction memo stack slot. */
+struct mtr_memo_slot_t
+{
+  /** pointer to the object */
+  void *object;
+  /** type of the stored object */
+  mtr_memo_type_t type;
+
+  /** Release the object */
+  void release() const;
+};
+
+/** Mini-transaction handle and buffer */
+struct mtr_t {
+  mtr_t();
+  ~mtr_t();
+
+  /** Start a mini-transaction. */
+  void start();
+
+  /** Commit the mini-transaction. */
+  void commit();
+
+  /** Release latches of unmodified buffer pages.
+  @param begin  first slot to release
+  @param end    last slot to release, or get_savepoint() */
+  void rollback_to_savepoint(ulint begin, ulint end);
+
+  /** Release latches of unmodified buffer pages.
+  @param begin  first slot to release */
+  void rollback_to_savepoint(ulint begin)
+  { rollback_to_savepoint(begin, m_memo.size()); }
+
+  /** Release the last acquired buffer page latch. */
+  void release_last_page()
+  { auto s= m_memo.size(); rollback_to_savepoint(s - 1, s); }
+
+  /** Commit a mini-transaction that is shrinking a tablespace.
+  @param space  tablespace that is being shrunk */
+  ATTRIBUTE_COLD void commit_shrink(fil_space_t &space);
+
+  /** Commit a mini-transaction that is deleting or renaming a file.
+  @param space  tablespace that is being renamed or deleted
+  @param name   new file name (nullptr=the file will be deleted)
+  @return whether the operation succeeded */
+  ATTRIBUTE_COLD bool commit_file(fil_space_t &space, const char *name);
+
+  /** Commit a mini-transaction that did not modify any pages,
+  but generated some redo log on a higher level, such as
+  FILE_MODIFY records and an optional FILE_CHECKPOINT marker.
+  The caller must hold exclusive log_sys.latch.
+  This is to be used at log_checkpoint().
+  @param checkpoint_lsn  the log sequence number of a checkpoint, or 0
+  @return current LSN */
+  lsn_t commit_files(lsn_t checkpoint_lsn= 0);
+
+  /** @return mini-transaction savepoint (current size of m_memo) */
+  ulint get_savepoint() const
+  {
+    ut_ad(is_active());
+    return m_memo.size();
+  }
+
+  /** Get the block at a savepoint */
+  buf_block_t *at_savepoint(ulint savepoint) const
+  {
+    ut_ad(is_active());
+    const mtr_memo_slot_t &slot= m_memo[savepoint];
+    ut_ad(slot.type < MTR_MEMO_S_LOCK);
+    ut_ad(slot.object);
+    return static_cast<buf_block_t*>(slot.object);
+  }
+
+  /** Try to get a block at a savepoint.
+  @param savepoint  the savepoint right before the block was acquired
+  @return the block at the savepoint
+  @retval nullptr if no buffer block was registered at that savepoint */
+  buf_block_t *block_at_savepoint(ulint savepoint) const
+  {
+    ut_ad(is_active());
+    const mtr_memo_slot_t &slot= m_memo[savepoint];
+    return slot.type < MTR_MEMO_S_LOCK
+      ? static_cast<buf_block_t*>(slot.object)
+      : nullptr;
+  }
+
+  /** Retrieve a page that has already been latched.
+  @param id    page identifier
+  @param type  page latch type
+  @return block
+  @retval nullptr if the block had not been latched yet */
+  buf_block_t *get_already_latched(const page_id_t id, mtr_memo_type_t type)
+    const;
+
+  /** @return the logging mode */
+  mtr_log_t get_log_mode() const
+  {
+    static_assert(MTR_LOG_ALL == 0, "efficiency");
+    return static_cast<mtr_log_t>(m_log_mode);
+  }
+
+  /** @return whether log is to be written for changes */
+  bool is_logged() const
+  {
+    static_assert(MTR_LOG_ALL == 0, "efficiency");
+    static_assert(MTR_LOG_NONE & MTR_LOG_NO_REDO, "efficiency");
+    static_assert(!(MTR_LOG_NONE & MTR_LOG_SUB), "efficiency");
+    return !(m_log_mode & MTR_LOG_NONE);
+  }
+
+  /** Change the logging mode.
+  @param mode  logging mode
+  @return old mode */
+  mtr_log_t set_log_mode(mtr_log_t mode)
+  {
+    const mtr_log_t old_mode= get_log_mode();
+    m_log_mode= mode & 3;
+    return old_mode;
+  }
+
+  /** Set the log mode of a sub-mini-transaction
+  @param mtr  parent mini-transaction */
+  void set_log_mode_sub(const mtr_t &mtr)
+  {
+    ut_ad(mtr.m_log_mode == MTR_LOG_ALL || mtr.m_log_mode == MTR_LOG_NO_REDO);
+    m_log_mode= mtr.m_log_mode | MTR_LOG_SUB;
+    static_assert((MTR_LOG_SUB | MTR_LOG_NO_REDO) == MTR_LOG_NO_REDO, "");
+  }
+
+  /** Check if we are holding a block latch in exclusive mode
+  @param block  buffer pool block to search for */
+  bool have_x_latch(const buf_block_t &block) const;
+
+  /** Check if we are holding a block latch in S or U mode
+  @param block  buffer pool block to search for */
+  bool have_u_or_x_latch(const buf_block_t &block) const;
+
+  /** Copy the tablespaces associated with the mini-transaction
+  (needed for generating FILE_MODIFY records)
+  @param[in]	mtr	mini-transaction that may modify
+  the same set of tablespaces as this one */
+  void set_spaces(const mtr_t& mtr)
+  {
+    ut_ad(!m_user_space_id);
+    ut_ad(!m_user_space);
+
+    ut_d(m_user_space_id = mtr.m_user_space_id);
+    m_user_space = mtr.m_user_space;
+  }
+
+  /** Set the tablespace associated with the mini-transaction
+  (needed for generating a FILE_MODIFY record)
+  @param[in]	space_id	user or system tablespace ID
+  @return the tablespace */
+  fil_space_t* set_named_space_id(uint32_t space_id)
+  {
+    ut_ad(!m_user_space_id);
+    ut_d(m_user_space_id = space_id);
+    if (!space_id) {
+      return fil_system.sys_space;
+    } else {
+      ut_ad(m_user_space_id == space_id);
+      ut_ad(!m_user_space);
+      m_user_space = fil_space_get(space_id);
+      ut_ad(m_user_space);
+      return m_user_space;
+    }
+  }
+
+  /** Set the tablespace associated with the mini-transaction
+  (needed for generating a FILE_MODIFY record)
+  @param[in]	space	user or system tablespace */
+  void set_named_space(fil_space_t* space)
+  {
+    ut_ad(!m_user_space_id);
+    ut_d(m_user_space_id = space->id);
+    if (space->id) {
+      m_user_space = space;
+    }
+  }
+
+#ifdef UNIV_DEBUG
+  /** Check the tablespace associated with the mini-transaction
+  (needed for generating a FILE_MODIFY record)
+  @param[in]	space	tablespace
+  @return whether the mini-transaction is associated with the space */
+  bool is_named_space(uint32_t space) const;
+  /** Check the tablespace associated with the mini-transaction
+  (needed for generating a FILE_MODIFY record)
+  @param[in]	space	tablespace
+  @return whether the mini-transaction is associated with the space */
+  bool is_named_space(const fil_space_t* space) const;
+#endif /* UNIV_DEBUG */
+
+  /** Acquire a tablespace X-latch.
+  @param space_id  tablespace ID
+  @return the tablespace object (never NULL) */
+  fil_space_t *x_lock_space(uint32_t space_id);
+
+  /** Acquire a shared rw-latch. */
+  void s_lock(
+#ifdef UNIV_PFS_RWLOCK
+    const char *file, unsigned line,
+#endif
+    index_lock *lock)
+  {
+    lock->s_lock(SRW_LOCK_ARGS(file, line));
+    memo_push(lock, MTR_MEMO_S_LOCK);
+  }
+
+  /** Acquire an exclusive rw-latch. */
+  void x_lock(
+#ifdef UNIV_PFS_RWLOCK
+    const char *file, unsigned line,
+#endif
+    index_lock *lock)
+  {
+    lock->x_lock(SRW_LOCK_ARGS(file, line));
+    memo_push(lock, MTR_MEMO_X_LOCK);
+  }
+
+  /** Acquire an update latch.
*/ + void u_lock( +#ifdef UNIV_PFS_RWLOCK + const char *file, unsigned line, +#endif + index_lock *lock) + { + lock->u_lock(SRW_LOCK_ARGS(file, line)); + memo_push(lock, MTR_MEMO_SX_LOCK); + } + + /** Acquire an exclusive tablespace latch. + @param space tablespace */ + void x_lock_space(fil_space_t *space); + + /** Release an index latch. */ + void release(const index_lock &lock) { release(&lock); } + /** Release a latch to an unmodified page. */ + void release(const buf_block_t &block) { release(&block); } +private: + /** Release an unmodified object. */ + void release(const void *object); +public: + /** Mark the given latched page as modified. + @param block page that will be modified */ + void set_modified(const buf_block_t &block); + + /** Set the state to not-modified. This will not log the changes. + This is only used during redo log apply, to avoid logging the changes. */ + void discard_modifications() { m_modifications= false; } + + /** Get the LSN of commit(). + @return the commit LSN + @retval 0 if the transaction only modified temporary tablespaces */ + lsn_t commit_lsn() const { ut_ad(has_committed()); return m_commit_lsn; } + + /** Note that we are inside the change buffer code. */ + void enter_ibuf() { m_inside_ibuf= true; } + + /** Note that we have exited from the change buffer code. */ + void exit_ibuf() { m_inside_ibuf= false; } + + /** @return true if we are inside the change buffer code */ + bool is_inside_ibuf() const { return m_inside_ibuf; } + + /** Note that some pages have been freed */ + void set_trim_pages() { m_trim_pages= true; } + + /** Latch a buffer pool block. + @param block block to be latched + @param rw_latch RW_S_LATCH, RW_SX_LATCH, RW_X_LATCH, RW_NO_LATCH */ + void page_lock(buf_block_t *block, ulint rw_latch); + + /** Acquire a latch on a buffer-fixed buffer pool block. + @param savepoint savepoint location of the buffer-fixed block + @param rw_latch latch to acquire */ + void upgrade_buffer_fix(ulint savepoint, rw_lock_type_t rw_latch); + + /** Register a change to the page latch state. */ + void lock_register(ulint savepoint, mtr_memo_type_t type) + { + mtr_memo_slot_t &slot= m_memo[savepoint]; + ut_ad(slot.type <= MTR_MEMO_BUF_FIX); + ut_ad(type < MTR_MEMO_S_LOCK); + slot.type= type; + } + + /** Upgrade U locks on a block to X */ + void page_lock_upgrade(const buf_block_t &block); + + /** Upgrade index U lock to X */ + ATTRIBUTE_COLD void index_lock_upgrade(); + + /** Check if we are holding tablespace latch + @param space tablespace to search for + @return whether space.latch is being held */ + bool memo_contains(const fil_space_t& space) const + MY_ATTRIBUTE((warn_unused_result)); +#ifdef UNIV_DEBUG + /** Check if we are holding an rw-latch in this mini-transaction + @param lock latch to search for + @param type held latch type + @return whether (lock,type) is contained */ + bool memo_contains(const index_lock &lock, mtr_memo_type_t type) const + MY_ATTRIBUTE((warn_unused_result)); + + /** Check if memo contains an index or buffer block latch. + @param object object to search + @param flags specify types of object latches + @return true if contains */ + bool memo_contains_flagged(const void *object, ulint flags) const + MY_ATTRIBUTE((warn_unused_result, nonnull)); + + /** Check if memo contains the given page. 
+  @param ptr    pointer to within page frame
+  @param flags  latch types to look for
+  @return the block
+  @retval nullptr if not found */
+  buf_block_t *memo_contains_page_flagged(const byte *ptr, ulint flags) const;
+
+  /** @return whether this mini-transaction modifies persistent data */
+  bool has_modifications() const { return m_modifications; }
+#endif /* UNIV_DEBUG */
+
+  /** Push a buffer page to the memo.
+  @param block  buffer block
+  @param type   object type: MTR_MEMO_S_LOCK, ... */
+  void memo_push(buf_block_t *block, mtr_memo_type_t type)
+    __attribute__((nonnull))
+  {
+    ut_ad(is_active());
+    ut_ad(type <= MTR_MEMO_PAGE_SX_MODIFY);
+    ut_ad(block->page.buf_fix_count());
+    ut_ad(block->page.in_file());
+#ifdef UNIV_DEBUG
+    switch (type) {
+    case MTR_MEMO_PAGE_S_FIX:
+      ut_ad(block->page.lock.have_s());
+      break;
+    case MTR_MEMO_PAGE_X_FIX: case MTR_MEMO_PAGE_X_MODIFY:
+      ut_ad(block->page.lock.have_x());
+      break;
+    case MTR_MEMO_PAGE_SX_FIX: case MTR_MEMO_PAGE_SX_MODIFY:
+      ut_ad(block->page.lock.have_u_or_x());
+      break;
+    case MTR_MEMO_BUF_FIX:
+      break;
+    case MTR_MEMO_MODIFY:
+    case MTR_MEMO_S_LOCK: case MTR_MEMO_X_LOCK: case MTR_MEMO_SX_LOCK:
+    case MTR_MEMO_SPACE_X_LOCK:
+      ut_ad("invalid type" == 0);
+    }
+#endif
+    if (!(type & MTR_MEMO_MODIFY));
+    else if (block->page.id().space() >= SRV_TMP_SPACE_ID)
+    {
+      block->page.set_temp_modified();
+      type= mtr_memo_type_t(type & ~MTR_MEMO_MODIFY);
+    }
+    else
+    {
+      m_modifications= true;
+      if (!m_made_dirty)
+        /* If we are going to modify a previously clean persistent page,
+        we must set m_made_dirty, so that commit() will acquire
+        log_sys.flush_order_mutex and insert the block into
+        buf_pool.flush_list. */
+        m_made_dirty= block->page.oldest_modification() <= 1;
+    }
+    m_memo.emplace_back(mtr_memo_slot_t{block, type});
+  }
+
+  /** Push an index lock or tablespace latch to the memo.
+  @param object  index lock or tablespace latch
+  @param type    object type: MTR_MEMO_S_LOCK, ... */
+  void memo_push(void *object, mtr_memo_type_t type) __attribute__((nonnull))
+  {
+    ut_ad(is_active());
+    ut_ad(type >= MTR_MEMO_S_LOCK);
+    m_memo.emplace_back(mtr_memo_slot_t{object, type});
+  }
+
+  /** @return the size of the mini-transaction log, in bytes */
+  size_t get_log_size() const { return m_log.size(); }
+  /** @return whether the log and memo are empty */
+  bool is_empty() const { return !get_savepoint() && !get_log_size(); }
+
+  /** Write an OPT_PAGE_CHECKSUM record. */
+  inline void page_checksum(const buf_page_t &bpage);
+
+  /** Write request types */
+  enum write_type
+  {
+    /** the page is guaranteed to always change */
+    NORMAL= 0,
+    /** optional: the page contents might not change */
+    MAYBE_NOP,
+    /** force a write, even if the page contents is not changing */
+    FORCED
+  };
+
+  /** Write 1, 2, 4, or 8 bytes to a file page.
+  @param[in]      block   file page
+  @param[in,out]  ptr     pointer in file page
+  @param[in]      val     value to write
+  @tparam l       number of bytes to write
+  @tparam w       write request type
+  @tparam V       type of val
+  @return whether any log was written */
+  template<unsigned l,write_type w= NORMAL,typename V>
+  inline bool write(const buf_block_t &block, void *ptr, V val)
+    MY_ATTRIBUTE((nonnull));
+
+  /** Log a write of a byte string to a page.
+  @param[in]      b       buffer page
+  @param[in]      ofs     byte offset from b->frame
+  @param[in]      len     length of the data to write */
+  inline void memcpy(const buf_block_t &b, ulint ofs, ulint len);
+
+  /** Write a byte string to a page.
+  @param[in,out]  b       buffer page
+  @param[in]      dest    destination within b.frame
+  @param[in]      str     the data to write
+  @param[in]      len     length of the data to write
+  @tparam w       write request type */
+  template<write_type w= NORMAL>
+  inline void memcpy(const buf_block_t &b, void *dest, const void *str,
+                     ulint len);
+
+  /** Log a write of a byte string to a ROW_FORMAT=COMPRESSED page.
+  @param[in]      b       ROW_FORMAT=COMPRESSED index page
+  @param[in]      offset  byte offset from b.zip.data
+  @param[in]      len     length of the data to write */
+  inline void zmemcpy(const buf_block_t &b, ulint offset, ulint len);
+
+  /** Write a byte string to a ROW_FORMAT=COMPRESSED page.
+  @param[in]      b       ROW_FORMAT=COMPRESSED index page
+  @param[in]      dest    destination within b.zip.data
+  @param[in]      str     the data to write
+  @param[in]      len     length of the data to write
+  @tparam w       write request type */
+  template<write_type w= NORMAL>
+  inline void zmemcpy(const buf_block_t &b, void *dest, const void *str,
+                      ulint len);
+
+  /** Log an initialization of a string of bytes.
+  @param[in]      b       buffer page
+  @param[in]      ofs     byte offset from b->frame
+  @param[in]      len     length of the data to write
+  @param[in]      val     the data byte to write */
+  inline void memset(const buf_block_t &b, ulint ofs, ulint len, byte val);
+
+  /** Initialize a string of bytes.
+  @param[in,out]  b       buffer page
+  @param[in]      ofs     byte offset from b->frame
+  @param[in]      len     length of the data to write
+  @param[in]      val     the data byte to write */
+  inline void memset(const buf_block_t *b, ulint ofs, ulint len, byte val);
+
+  /** Log an initialization of a repeating string of bytes.
+  @param[in]      b       buffer page
+  @param[in]      ofs     byte offset from b->frame
+  @param[in]      len     length of the data to write, in bytes
+  @param[in]      str     the string to write
+  @param[in]      size    size of str, in bytes */
+  inline void memset(const buf_block_t &b, ulint ofs, size_t len,
+                     const void *str, size_t size);
+
+  /** Initialize a repeating string of bytes.
+  @param[in,out]  b       buffer page
+  @param[in]      ofs     byte offset from b->frame
+  @param[in]      len     length of the data to write, in bytes
+  @param[in]      str     the string to write
+  @param[in]      size    size of str, in bytes */
+  inline void memset(const buf_block_t *b, ulint ofs, size_t len,
+                     const void *str, size_t size);
+
+  /** Log that a string of bytes was copied from the same page.
+  @param[in]      b       buffer page
+  @param[in]      d       destination offset within the page
+  @param[in]      s       source offset within the page
+  @param[in]      len     length of the data to copy */
+  inline void memmove(const buf_block_t &b, ulint d, ulint s, ulint len);
+
+  /** Initialize an entire page.
+  @param[in,out]  b       buffer page */
+  void init(buf_block_t *b);
+  /** Free a page.
+  @param space   tablespace
+  @param offset  offset of the page to be freed */
+  void free(const fil_space_t &space, uint32_t offset);
+  /** Write log for partly initializing a B-tree or R-tree page.
+  @param block  B-tree or R-tree page
+  @param comp   false=ROW_FORMAT=REDUNDANT, true=COMPACT or DYNAMIC */
+  inline void page_create(const buf_block_t &block, bool comp);
+
+  /** Write log for inserting a B-tree or R-tree record in
+  ROW_FORMAT=REDUNDANT.
+ @param block B-tree or R-tree page + @param reuse false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE + @param prev_rec byte offset of the predecessor of the record to insert, + starting from PAGE_OLD_INFIMUM + @param info_bits info_bits of the record + @param n_fields_s number of fields << 1 | rec_get_1byte_offs_flag() + @param hdr_c number of common record header bytes with prev_rec + @param data_c number of common data bytes with prev_rec + @param hdr record header bytes to copy to the log + @param hdr_l number of copied record header bytes + @param data record payload bytes to copy to the log + @param data_l number of copied record data bytes */ + inline void page_insert(const buf_block_t &block, bool reuse, + ulint prev_rec, byte info_bits, + ulint n_fields_s, size_t hdr_c, size_t data_c, + const byte *hdr, size_t hdr_l, + const byte *data, size_t data_l); + /** Write log for inserting a B-tree or R-tree record in + ROW_FORMAT=COMPACT or ROW_FORMAT=DYNAMIC. + @param block B-tree or R-tree page + @param reuse false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE + @param prev_rec byte offset of the predecessor of the record to insert, + starting from PAGE_NEW_INFIMUM + @param info_status rec_get_info_and_status_bits() + @param shift unless !reuse: number of bytes the PAGE_FREE is moving + @param hdr_c number of common record header bytes with prev_rec + @param data_c number of common data bytes with prev_rec + @param hdr record header bytes to copy to the log + @param hdr_l number of copied record header bytes + @param data record payload bytes to copy to the log + @param data_l number of copied record data bytes */ + inline void page_insert(const buf_block_t &block, bool reuse, + ulint prev_rec, byte info_status, + ssize_t shift, size_t hdr_c, size_t data_c, + const byte *hdr, size_t hdr_l, + const byte *data, size_t data_l); + /** Write log for deleting a B-tree or R-tree record in ROW_FORMAT=REDUNDANT. + @param block B-tree or R-tree page + @param prev_rec byte offset of the predecessor of the record to delete, + starting from PAGE_OLD_INFIMUM */ + inline void page_delete(const buf_block_t &block, ulint prev_rec); + /** Write log for deleting a COMPACT or DYNAMIC B-tree or R-tree record. + @param block B-tree or R-tree page + @param prev_rec byte offset of the predecessor of the record to delete, + starting from PAGE_NEW_INFIMUM + @param hdr_size record header size, excluding REC_N_NEW_EXTRA_BYTES + @param data_size data payload size, in bytes */ + inline void page_delete(const buf_block_t &block, ulint prev_rec, + size_t hdr_size, size_t data_size); + + /** Write log for initializing an undo log page. + @param block undo page */ + inline void undo_create(const buf_block_t &block); + /** Write log for appending an undo log record. + @param block undo page + @param data record within the undo page + @param len length of the undo record, in bytes */ + inline void undo_append(const buf_block_t &block, + const void *data, size_t len); + /** Trim the end of a tablespace. + @param id first page identifier that will not be in the file */ + inline void trim_pages(const page_id_t id); + + /** Write a log record about a file operation. 
+  @param type      file operation
+  @param space_id  tablespace identifier
+  @param path      file path
+  @param new_path  new file path for type=FILE_RENAME */
+  inline void log_file_op(mfile_type_t type, uint32_t space_id,
+                          const char *path,
+                          const char *new_path= nullptr);
+
+  /** Add freed page numbers to freed_pages */
+  void add_freed_offset(fil_space_t *space, uint32_t page)
+  {
+    ut_ad(is_named_space(space));
+    if (!m_freed_pages)
+    {
+      m_freed_pages= new range_set();
+      ut_ad(!m_freed_space);
+      m_freed_space= space;
+    }
+    else
+      ut_ad(m_freed_space == space);
+    m_freed_pages->add_value(page);
+  }
+
+  /** Determine the added buffer fix count of a block.
+  @param block  block to be checked
+  @return number of buffer fixes added by this mtr */
+  uint32_t get_fix_count(const buf_block_t *block) const;
+
+  /** Note that log_sys.latch is no longer being held exclusively. */
+  void flag_wr_unlock() noexcept { ut_ad(m_latch_ex); m_latch_ex= false; }
+
+  /** type of page flushing that is needed during commit() */
+  enum page_flush_ahead
+  {
+    /** no need to trigger page cleaner */
+    PAGE_FLUSH_NO= 0,
+    /** asynchronous flushing is needed */
+    PAGE_FLUSH_ASYNC,
+    /** furious flushing is needed */
+    PAGE_FLUSH_SYNC
+  };
+
+private:
+  /** Handle any pages that were freed during the mini-transaction. */
+  void process_freed_pages();
+  /** Release modified pages when no log was written. */
+  void release_unlogged();
+
+  /** Log a write of a byte string to a page.
+  @param block   buffer page
+  @param offset  byte offset within page
+  @param data    data to be written
+  @param len     length of the data, in bytes */
+  inline void memcpy_low(const buf_block_t &block, uint16_t offset,
+                         const void *data, size_t len);
+  /**
+  Write a log record.
+  @tparam type   redo log record type
+  @param id      persistent page identifier
+  @param bpage   buffer pool page, or nullptr
+  @param len     number of additional bytes to write
+  @param alloc   whether to allocate the additional bytes
+  @param offset  byte offset, or 0 if the record type does not allow one
+  @return end of mini-transaction log, minus len */
+  template<byte type>
+  inline byte *log_write(const page_id_t id, const buf_page_t *bpage,
+                         size_t len= 0, bool alloc= false, size_t offset= 0);
+
+  /** Write an EXTENDED log record.
+  @param block  buffer pool page
+  @param type   extended record subtype; @see mrec_ext_t */
+  inline void log_write_extended(const buf_block_t &block, byte type);
+
+  /** Write a FILE_MODIFY record when a non-predefined persistent
+  tablespace was modified for the first time since fil_names_clear(). */
+  ATTRIBUTE_NOINLINE ATTRIBUTE_COLD void name_write();
+
+  /** Encrypt the log */
+  ATTRIBUTE_NOINLINE void encrypt();
+
+  /** Append the redo log records to the redo log buffer.
+  @return {start_lsn,flush_ahead} */
+  std::pair<lsn_t,page_flush_ahead> do_write();
+
+  /** Append the redo log records to the redo log buffer.
+  @param len  number of bytes to write
+  @return {start_lsn,flush_ahead} */
+  std::pair<lsn_t,page_flush_ahead> finish_write(size_t len);
+
+  /** Release all latches. */
+  void release();
+  /** Release the resources */
+  inline void release_resources();
+
+#ifdef UNIV_DEBUG
+public:
+  /** @return whether the mini-transaction is active */
+  bool is_active() const
+  { ut_ad(!m_commit || m_start); return m_start && !m_commit; }
+  /** @return whether the mini-transaction has been committed */
+  bool has_committed() const { ut_ad(!m_commit || m_start); return m_commit; }
+  /** @return whether the mini-transaction is freeing an index tree */
+  bool is_freeing_tree() const { return m_freeing_tree; }
+  /** Notify that the mini-transaction is freeing an index tree */
+  void freeing_tree() { m_freeing_tree= true; }
+private:
+  /** whether start() has been called */
+  bool m_start= false;
+  /** whether commit() has been called */
+  bool m_commit= false;
+  /** whether freeing_tree() has been called */
+  bool m_freeing_tree= false;
+#endif
+private:
+  /** The page of the most recent m_log record written, or NULL */
+  const buf_page_t* m_last;
+  /** The current byte offset in m_last, or 0 */
+  uint16_t m_last_offset;
+
+  /** specifies which operations should be logged; default MTR_LOG_ALL */
+  uint16_t m_log_mode:2;
+
+  /** whether at least one persistent page was written to */
+  uint16_t m_modifications:1;
+
+  /** whether at least one previously clean buffer pool page was written to */
+  uint16_t m_made_dirty:1;
+
+  /** whether log_sys.latch is locked exclusively */
+  uint16_t m_latch_ex:1;
+
+  /** whether the change buffer is latched; only needed in non-debug builds
+  to suppress some read-ahead operations, @see ibuf_inside() */
+  uint16_t m_inside_ibuf:1;
+
+  /** whether the pages have been trimmed */
+  uint16_t m_trim_pages:1;
+
+  /** CRC-32C of m_log */
+  uint32_t m_crc;
+
+#ifdef UNIV_DEBUG
+  /** Persistent user tablespace associated with the
+  mini-transaction, or 0 (TRX_SYS_SPACE) if none yet */
+  uint32_t m_user_space_id;
+#endif /* UNIV_DEBUG */
+
+  /** acquired dict_index_t::lock, fil_space_t::latch, buf_block_t */
+  small_vector<mtr_memo_slot_t, 16> m_memo;
+
+  /** mini-transaction log */
+  mtr_buf_t m_log;
+
+  /** user tablespace that is being modified by the mini-transaction */
+  fil_space_t* m_user_space;
+
+  /** LSN at commit time */
+  lsn_t m_commit_lsn;
+
+  /** tablespace where pages have been freed */
+  fil_space_t *m_freed_space= nullptr;
+  /** set of freed page ids */
+  range_set *m_freed_pages= nullptr;
+};
diff --git a/storage/innobase/include/mtr0types.h b/storage/innobase/include/mtr0types.h
new file mode 100644
index 00000000..19db13a1
--- /dev/null
+++ b/storage/innobase/include/mtr0types.h
@@ -0,0 +1,347 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/mtr0types.h +Mini-transaction buffer global types + +Created 11/26/1995 Heikki Tuuri +*******************************************************/ + +#pragma once + +#include "buf0types.h" + +#include "ut0byte.h" + +struct mtr_t; + +/** Logging modes for a mini-transaction */ +enum mtr_log_t { + /** Default mode: log all operations modifying disk-based data */ + MTR_LOG_ALL = 0, + + /** Log no operations and dirty pages are not added to the flush list. + Set for attempting modification of a ROW_FORMAT=COMPRESSED page. */ + MTR_LOG_NONE, + + /** Log all operations, but do not write any OPT_PAGE_CHECKSUM + records because some of the modified pages were also modified + by another mini-transaction that did not write its log yet. */ + MTR_LOG_SUB, + + /** Don't generate REDO log but add dirty pages to flush list */ + MTR_LOG_NO_REDO +}; + +/* +A mini-transaction is a stream of records that is always terminated by +a byte 0x00 or 0x01. The first byte of a mini-transaction record is +never one of these bytes, but these bytes can occur within mini-transaction +records. + +The first byte of the record would contain a record type, flags, and a +part of length. The optional second byte of the record will contain +more length. (Not needed for short records.) + +For example, because the length of an INIT_PAGE record is 3 to 11 bytes, +the first byte will be 0x02 to 0x0a, indicating the number of subsequent bytes. + +Bit 7 of the first byte of a redo log record is the same_page flag. +If same_page=1, the record is referring to the same page as the +previous record. Records that do not refer to data pages but to file +operations are identified by setting the same_page=1 in the very first +record(s) of the mini-transaction. A mini-transaction record that +carries same_page=0 must only be followed by page-oriented records. + +Bits 6..4 of the first byte of a redo log record identify the redo log +type. The following record types refer to data pages: + + FREE_PAGE (0): corresponds to MLOG_INIT_FREE_PAGE + INIT_PAGE (1): corresponds to MLOG_INIT_FILE_PAGE2 + EXTENDED (2): extended record; followed by subtype code @see mrec_ext_t + WRITE (3): replaces MLOG_nBYTES, MLOG_WRITE_STRING, MLOG_ZIP_* + MEMSET (4): extends the 10.4 MLOG_MEMSET record + MEMMOVE (5): copy data within the page (avoids logging redundant data) + RESERVED (6): reserved for future use; a subtype code + (encoded immediately after the length) would be written + to reserve code space for further extensions + OPTION (7): optional record that may be ignored; a subtype @see mrec_opt + (encoded immediately after the length) would distinguish actual usage + +Bits 3..0 indicate the redo log record length, excluding the first +byte, but including additional length bytes and any other bytes, +such as the optional tablespace identifier and page number. +Values 1..15 represent lengths of 1 to 15 bytes. The special value 0 +indicates that 1 to 3 length bytes will follow to encode the remaining +length that exceeds 16 bytes. 
+ +Additional length bytes if length>16: 0 to 3 bytes +0xxxxxxx for 0 to 127 (total: 16 to 143 bytes) +10xxxxxx xxxxxxxx for 128 to 16511 (total: 144 to 16527) +110xxxxx xxxxxxxx xxxxxxxx for 16512 to 2113663 (total: 16528 to 2113679) +111xxxxx reserved (corrupted record, and file!) + +If same_page=0, the tablespace identifier and page number will use +similar 1-to-5-byte variable-length encoding: +0xxxxxxx for 0 to 127 +10xxxxxx xxxxxxxx for 128 to 16,511 +110xxxxx xxxxxxxx xxxxxxxx for 16,512 to 2,113,663 +1110xxxx xxxxxxxx xxxxxxxx xxxxxxxx for 2,113,664 to 270,549,119 +11110xxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx for 270,549,120 to 34,630,287,487 +11111xxx reserved (corrupted record) +Note: Some 5-byte values are reserved, because the tablespace identifier +and page number can only be up to 4,294,967,295. + +If same_page=1 is set in a record that follows a same_page=0 record +in a mini-transaction, the tablespace identifier and page number +fields will be omitted. + +For FILE_ records (if same_page=1 for the first record +of a mini-transaction), we will write a tablespace identifier and +a page number (always 0) using the same 1-to-5-byte encoding. + +For FREE_PAGE or INIT_PAGE, if same_page=1, the record will be treated +as corrupted (or reserved for future extension). The type code must +be followed by 1+1 to 5+5 bytes (to encode the tablespace identifier +and page number). If the record length does not match the encoded +lengths of the tablespace identifier and page number, the record will +be treated as corrupted. This allows future expansion of the format. + +If there is a FREE_PAGE record in a mini-transaction, it must be the +only record for that page in the mini-transaction. If there is an +INIT_PAGE record for a page in a mini-transaction, it must be the +first record for that page in the mini-transaction. + +An EXTENDED record must be followed by 1+1 to 5+5 bytes for the page +identifier (unless the same_page flag is set) and a subtype; @see mrec_ext_t + +For WRITE, MEMSET, MEMMOVE, the next 1 to 3 bytes are the byte offset +on the page, relative from the previous offset. If same_page=0, the +"previous offset" is 0. If same_page=1, the "previous offset" is where +the previous operation ended (FIL_PAGE_TYPE for INIT_PAGE). +0xxxxxxx for 0 to 127 +10xxxxxx xxxxxxxx for 128 to 16,511 +110xxxxx xxxxxxxx xxxxxxxx for 16,512 to 2,113,663 +111xxxxx reserved (corrupted record) +If the sum of the "previous offset" and the current offset exceeds the +page size, the record is treated as corrupted. Negative relative offsets +cannot be written. Instead, a record with same_page=0 can be written. + +For MEMSET and MEMMOVE, the target length will follow, encoded in 1 to +3 bytes. If the length+offset exceeds the page size, the record will +be treated as corrupted. + +For MEMMOVE, the source offset will follow, encoded in 1 to 3 bytes, +relative to the current offset. The offset 0 is not possible, and +the sign bit is the least significant bit. That is, ++x is encoded as (x-1)<<1 (+1,+2,+3,... is 0,2,4,...) and +-x is encoded as (x-1)<<1|1 (-1,-2,-3,... is 1,3,5,...). +The source offset must be within the page size, or else the record +will be treated as corrupted. + +For MEMSET or WRITE, the byte(s) to be written will follow. For +MEMSET, it usually is a single byte, but it could also be a multi-byte +string, which would be copied over and over until the target length is +reached. The length of the remaining bytes is implied by the length +bytes at the start of the record. 
+
+For MEMMOVE, if any bytes follow, the record is treated as corrupted
+(future expansion).
+
+As mentioned at the start of this comment, the type byte 0 would be
+special, marking the end of a mini-transaction. We could use the
+corresponding value 0x80 (with same_page=1) for something special,
+such as a future extension when more type codes are needed, or for
+encoding rarely needed redo log records.
+
+Examples:
+
+INIT could be logged as 0x12 0x34 0x56, meaning "type code 1 (INIT), 2
+bytes to follow" and "tablespace ID 0x34", "page number 0x56".
+The first byte must be between 0x12 and 0x1a, and the total length of
+the record must match the lengths of the encoded tablespace ID and
+page number.
+
+WRITE could be logged as 0x36 0x40 0x57 0x60 0x12 0x34 0x56, meaning
+"type code 3 (WRITE), 6 bytes to follow" and "tablespace ID 0x40",
+"page number 0x57", "byte offset 0x60", data 0x12,0x34,0x56.
+
+A subsequent WRITE to the same page could be logged 0xb5 0x7f 0x23
+0x34 0x56 0x78, meaning "same page, type code 3 (WRITE), 5 bytes to
+follow", "byte offset 0x7f"+0x60+2, bytes 0x23,0x34,0x56,0x78.
+
+The end of the mini-transaction would be indicated by the end byte
+0x00 or 0x01; @see log_sys.get_sequence_bit().
+If log_sys.is_encrypted(), that is followed by 8 bytes of nonce
+(part of initialization vector). That will be followed by 4 bytes
+of CRC-32C of the entire mini-transaction, excluding the end byte. */
+
+/** Redo log record types. These bit patterns (3 bits) will be written
+to the redo log file, so the existing codes or their interpretation on
+crash recovery must not be changed. */
+enum mrec_type_t
+{
+  /** Free a page. On recovery, it is unnecessary to read the page.
+  The next record for the page (if any) must be INIT_PAGE.
+  After this record has been written, the page may be
+  overwritten with zeros, or discarded or trimmed. */
+  FREE_PAGE= 0,
+  /** Zero-initialize a page. The current byte offset (for subsequent
+  records) will be reset to FIL_PAGE_TYPE. */
+  INIT_PAGE= 0x10,
+  /** Extended record; @see mrec_ext_t */
+  EXTENDED= 0x20,
+  /** Write a string of bytes. Followed by the byte offset (unsigned,
+  relative to the current byte offset, encoded in 1 to 3 bytes) and
+  the bytes to write (at least one). The current byte offset will be
+  set after the last byte written. */
+  WRITE= 0x30,
+  /** Like WRITE, but before the bytes to write, the data_length-1
+  (encoded in 1 to 3 bytes) will be encoded, and it must be more
+  than the length of the following data bytes to write.
+  The data byte(s) will be repeatedly copied to the output until
+  the data_length is reached. */
+  MEMSET= 0x40,
+  /** Like MEMSET, but instead of the bytes to write, a source byte
+  offset (signed, nonzero, relative to the target byte offset, encoded
+  in 1 to 3 bytes, with the sign bit in the least significant bit)
+  will be written.
+  That is, +x is encoded as (x-1)<<1 (+1,+2,+3,... is 0,2,4,...)
+  and -x is encoded as (x-1)<<1|1 (-1,-2,-3,... is 1,3,5,...).
+  The source offset and data_length must be within the page size, or
+  else the record will be treated as corrupted. The data will be
+  copied from the page as it was at the start of the
+  mini-transaction. */
+  MEMMOVE= 0x50,
+  /** Reserved for future use. */
+  RESERVED= 0x60,
+  /** Optional record that may be ignored in crash recovery.
+  A subtype (@see mrec_opt) will be encoded after the page identifier. */
+  OPTION= 0x70
+};
+
+
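
The WRITE example above can be verified mechanically. A minimal standalone decoder for exactly that record shape (single-byte varints only, no error handling), independent of the InnoDB headers:

    #include <cassert>
    #include <cstdint>

    int main()
    {
      const uint8_t rec[]= {0x36, 0x40, 0x57, 0x60, 0x12, 0x34, 0x56};
      assert(!(rec[0] & 0x80));        /* same_page=0: page id follows */
      assert((rec[0] >> 4) == 3);      /* bits 6..4: type code 3, WRITE */
      const unsigned len= rec[0] & 15; /* low nibble: 6 bytes follow */
      assert(len == 6);
      /* all fields below are < 0x80, i.e. single-byte varints */
      assert(rec[1] == 0x40);          /* tablespace ID */
      assert(rec[2] == 0x57);          /* page number */
      assert(rec[3] == 0x60);          /* byte offset, relative to 0 */
      /* the remaining 6 - 3 = 3 bytes are the payload */
      assert(rec[4] == 0x12 && rec[5] == 0x34 && rec[6] == 0x56);
    }
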
+/** Supported EXTENDED record subtypes. */
+enum mrec_ext_t
+{
+  /** Partly initialize a ROW_FORMAT=REDUNDANT B-tree or R-tree index page,
+  including writing the "infimum" and "supremum" pseudo-records.
+  The current byte offset will be reset to FIL_PAGE_TYPE. */
+  INIT_ROW_FORMAT_REDUNDANT= 0,
+  /** Partly initialize a ROW_FORMAT=COMPACT or DYNAMIC index page,
+  including writing the "infimum" and "supremum" pseudo-records.
+  The current byte offset will be reset to FIL_PAGE_TYPE. */
+  INIT_ROW_FORMAT_DYNAMIC= 1,
+  /** Initialize an undo log page.
+  This is roughly (not exactly) equivalent to the old MLOG_UNDO_INIT record.
+  The current byte offset will be reset to FIL_PAGE_TYPE. */
+  UNDO_INIT= 2,
+  /** Append a record to an undo log page.
+  This is equivalent to the old MLOG_UNDO_INSERT record.
+  The current byte offset will be reset to FIL_PAGE_TYPE. */
+  UNDO_APPEND= 3,
+  /** Insert a ROW_FORMAT=REDUNDANT record, extending PAGE_HEAP_TOP.
+  The current byte offset will be reset to FIL_PAGE_TYPE. */
+  INSERT_HEAP_REDUNDANT= 4,
+  /** Insert a ROW_FORMAT=REDUNDANT record, reusing PAGE_FREE.
+  The current byte offset will be reset to FIL_PAGE_TYPE. */
+  INSERT_REUSE_REDUNDANT= 5,
+  /** Insert a ROW_FORMAT=COMPACT or DYNAMIC record, extending PAGE_HEAP_TOP.
+  The current byte offset will be reset to FIL_PAGE_TYPE. */
+  INSERT_HEAP_DYNAMIC= 6,
+  /** Insert a ROW_FORMAT=COMPACT or DYNAMIC record, reusing PAGE_FREE.
+  The current byte offset will be reset to FIL_PAGE_TYPE. */
+  INSERT_REUSE_DYNAMIC= 7,
+  /** Delete a record on a ROW_FORMAT=REDUNDANT page.
+  We point to the predecessor of the record to be deleted.
+  The current byte offset will be reset to FIL_PAGE_TYPE.
+  This is similar to the old MLOG_REC_DELETE record. */
+  DELETE_ROW_FORMAT_REDUNDANT= 8,
+  /** Delete a record on a ROW_FORMAT=COMPACT or DYNAMIC page.
+  We point to the predecessor of the record to be deleted
+  and include the total size of the record being deleted.
+  The current byte offset will be reset to FIL_PAGE_TYPE.
+  This is similar to the old MLOG_COMP_REC_DELETE record. */
+  DELETE_ROW_FORMAT_DYNAMIC= 9,
+  /** Truncate a data file. */
+  TRIM_PAGES= 10
+};
+
+
+/** Recognized OPTION record subtypes. */
+enum mrec_opt
+{
+  /** page checksum at the end of the mini-transaction */
+  OPT_PAGE_CHECKSUM= 0
+  /* Other possible subtypes: a binlog record, or an SQL statement. */
+};
+
+
+/** Redo log record types for file-level operations. These bit
+patterns will be written to redo log files, so the existing codes or
+their interpretation on crash recovery must not be changed. */
+enum mfile_type_t
+{
+  /** Create a file. Followed by tablespace ID and the file name. */
+  FILE_CREATE = 0x80,
+  /** Delete a file. Followed by tablespace ID and the file name. */
+  FILE_DELETE = 0x90,
+  /** Rename a file. Followed by tablespace ID and the old file name,
+  NUL, and the new file name. */
+  FILE_RENAME = 0xa0,
+  /** Modify a file. Followed by tablespace ID and the file name. */
+  FILE_MODIFY = 0xb0,
+  /** End-of-checkpoint marker, at the end of a mini-transaction.
+  Followed by 2 NUL bytes of page identifier and 8 bytes of LSN;
+  @see SIZE_OF_FILE_CHECKPOINT.
+  When all bytes are NUL, this is a dummy padding record. */
+  FILE_CHECKPOINT = 0xf0
+};
+
+/** Size of a FILE_CHECKPOINT record, including the trailing byte to
+terminate the mini-transaction and the CRC-32C. */
+*/
+constexpr byte SIZE_OF_FILE_CHECKPOINT= 3/*type,page_id*/ + 8/*LSN*/ + 1 + 4;
+
+#ifndef UNIV_INNOCHECKSUM
+/** Types for the mlock objects to store in the mtr_t::m_memo */
+enum mtr_memo_type_t {
+	MTR_MEMO_PAGE_S_FIX = RW_S_LATCH,
+
+	MTR_MEMO_PAGE_X_FIX = RW_X_LATCH,
+
+	MTR_MEMO_PAGE_SX_FIX = RW_SX_LATCH,
+
+	MTR_MEMO_BUF_FIX = RW_NO_LATCH,
+
+	MTR_MEMO_MODIFY = 16,
+
+	MTR_MEMO_PAGE_X_MODIFY = MTR_MEMO_PAGE_X_FIX | MTR_MEMO_MODIFY,
+	MTR_MEMO_PAGE_SX_MODIFY = MTR_MEMO_PAGE_SX_FIX | MTR_MEMO_MODIFY,
+
+	MTR_MEMO_S_LOCK = RW_S_LATCH << 5,
+
+	MTR_MEMO_X_LOCK = RW_X_LATCH << 5,
+
+	MTR_MEMO_SX_LOCK = RW_SX_LATCH << 5,
+
+	/** wr_lock() on fil_space_t::latch */
+	MTR_MEMO_SPACE_X_LOCK = MTR_MEMO_SX_LOCK << 1
+};
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h
new file mode 100644
index 00000000..c9db6a1f
--- /dev/null
+++ b/storage/innobase/include/os0file.h
@@ -0,0 +1,1188 @@
+/***********************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2009, Percona Inc.
+Copyright (c) 2013, 2022, MariaDB Corporation.
+
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
+This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+***********************************************************************/
+
+/**************************************************//**
+@file include/os0file.h
+The interface to the operating system file io
+
+Created 10/21/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef os0file_h
+#define os0file_h
+
+#include "fsp0types.h"
+#include "tpool.h"
+#include "my_counter.h"
+
+#ifndef _WIN32
+#include <dirent.h>
+#include <sys/stat.h>
+#include <time.h>
+#endif /* !_WIN32 */
+
+extern bool	os_has_said_disk_full;
+
+/** File offset in bytes */
+typedef ib_uint64_t os_offset_t;
+
+class buf_tmp_buffer_t;
+
+#ifdef _WIN32
+
+/** We always define WIN_ASYNC_IO, and check at run-time whether
+the OS actually supports it: Win 95 does not, NT does. */
+# define WIN_ASYNC_IO
+
+/** Use unbuffered I/O */
+# define UNIV_NON_BUFFERED_IO
+
+/** File handle */
+typedef native_file_handle os_file_t;
+
+
+#else /* _WIN32 */
+
+/** File handle */
+typedef int	os_file_t;
+
+#endif /* _WIN32 */
+
+static const os_file_t OS_FILE_CLOSED = IF_WIN(os_file_t(INVALID_HANDLE_VALUE),-1);
+
+/** File descriptor with optional PERFORMANCE_SCHEMA instrumentation */
+struct pfs_os_file_t
+{
+	/** Default constructor */
+	pfs_os_file_t(os_file_t file = OS_FILE_CLOSED) : m_file(file)
+#ifdef UNIV_PFS_IO
+	, m_psi(NULL)
+#endif
+	{}
+
+	/** The wrapped file handle */
+	os_file_t	m_file;
+#ifdef UNIV_PFS_IO
+	/** PERFORMANCE_SCHEMA descriptor */
+	struct PSI_file	*m_psi;
+#endif
+	/** Implicit type conversion.
+	@return the wrapped file handle */
+	operator os_file_t() const { return m_file; }
+	/** Assignment operator.
+	@param[in]	file	file handle to be assigned */
+	void operator=(os_file_t file) { m_file = file; }
+	bool operator==(os_file_t file) const { return m_file == file; }
+	bool operator!=(os_file_t file) const { return !(*this == file); }
+#ifndef DBUG_OFF
+	friend std::ostream& operator<<(std::ostream& os, pfs_os_file_t f){
+		os << os_file_t(f);
+		return os;
+	}
+#endif
+};
+
+/** Options for os_file_create_func @{ */
+enum os_file_create_t {
+	OS_FILE_OPEN = 51,		/*!< to open an existing file (if
+					doesn't exist, error) */
+	OS_FILE_CREATE,			/*!< to create new file (if
+					exists, error) */
+	OS_FILE_OVERWRITE,		/*!< to create a new file; if it
+					exists, overwrite the old file */
+	OS_FILE_OPEN_RAW,		/*!< to open a raw device or disk
+					partition */
+	OS_FILE_CREATE_PATH,		/*!< to create the directories */
+	OS_FILE_OPEN_RETRY,		/*!< open with retry */
+
+	/** Flags that can be combined with the above values. Please ensure
+	that the above values stay below 128.
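+	For example, a caller that wants to create a file but handle an
+	"already exists" condition itself could pass create_mode
+	OS_FILE_CREATE | OS_FILE_ON_ERROR_NO_EXIT | OS_FILE_ON_ERROR_SILENT;
+	keeping the base values below 128 is what makes such bitwise
+	combination unambiguous.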
+	*/
+
+	OS_FILE_ON_ERROR_NO_EXIT = 128,	/*!< do not exit on unknown errors */
+	OS_FILE_ON_ERROR_SILENT = 256	/*!< don't print diagnostic messages to
+					the log unless it is a fatal error,
+					this flag is only used if
+					ON_ERROR_NO_EXIT is set */
+};
+
+static const ulint OS_FILE_READ_ONLY = 333;
+static const ulint OS_FILE_READ_WRITE = 444;
+
+/** Used by MySQLBackup */
+static const ulint OS_FILE_READ_ALLOW_DELETE = 555;
+
+/* Options for file_create */
+static const ulint OS_FILE_AIO = 61;
+static const ulint OS_FILE_NORMAL = 62;
+/* @} */
+
+/** Types for file create @{ */
+static const ulint OS_DATA_FILE = 100;
+static const ulint OS_LOG_FILE = 101;
+static const ulint OS_DATA_FILE_NO_O_DIRECT = 103;
+/* @} */
+
+/** Error codes from os_file_get_last_error @{ */
+static const ulint OS_FILE_NAME_TOO_LONG = 36;
+static const ulint OS_FILE_NOT_FOUND = 71;
+static const ulint OS_FILE_DISK_FULL = 72;
+static const ulint OS_FILE_ALREADY_EXISTS = 73;
+static const ulint OS_FILE_PATH_ERROR = 74;
+
+/** wait for OS aio resources to become available again */
+static const ulint OS_FILE_AIO_RESOURCES_RESERVED = 75;
+
+static const ulint OS_FILE_SHARING_VIOLATION = 76;
+static const ulint OS_FILE_ERROR_NOT_SPECIFIED = 77;
+static const ulint OS_FILE_INSUFFICIENT_RESOURCE = 78;
+static const ulint OS_FILE_AIO_INTERRUPTED = 79;
+static const ulint OS_FILE_OPERATION_ABORTED = 80;
+static const ulint OS_FILE_ACCESS_VIOLATION = 81;
+static const ulint OS_FILE_OPERATION_NOT_SUPPORTED = 125;
+static const ulint OS_FILE_ERROR_MAX = 200;
+/* @} */
+
+/**
+The I/O context that is passed down to the low level IO code */
+class IORequest
+{
+public:
+  enum Type
+  {
+    /** Synchronous read */
+    READ_SYNC= 2,
+    /** Asynchronous read; some errors will be ignored */
+    READ_ASYNC= READ_SYNC | 1,
+    /** Possibly partial read; only used with
+    os_file_read_no_error_handling() */
+    READ_MAYBE_PARTIAL= READ_SYNC | 4,
+    /** Read for doublewrite buffer recovery */
+    DBLWR_RECOVER= READ_SYNC | 8,
+    /** Synchronous write */
+    WRITE_SYNC= 16,
+    /** Asynchronous write */
+    WRITE_ASYNC= WRITE_SYNC | 1,
+    /** A doublewrite batch */
+    DBLWR_BATCH= WRITE_ASYNC | 8,
+    /** Write data; evict the block on write completion */
+    WRITE_LRU= WRITE_ASYNC | 32,
+    /** Write data and punch hole for the rest */
+    PUNCH= WRITE_ASYNC | 64,
+    /** Write data and punch hole; evict the block on write completion */
+    PUNCH_LRU= PUNCH | WRITE_LRU,
+    /** Zero out a range of bytes in fil_space_t::io() */
+    PUNCH_RANGE= WRITE_SYNC | 128,
+  };
+
+  constexpr IORequest(buf_page_t *bpage, buf_tmp_buffer_t *slot,
+                      fil_node_t *node, Type type) :
+    bpage(bpage), slot(slot), node(node), type(type) {}
+
+  constexpr IORequest(Type type= READ_SYNC, buf_page_t *bpage= nullptr,
+                      buf_tmp_buffer_t *slot= nullptr) :
+    bpage(bpage), slot(slot), type(type) {}
+
+  bool is_read() const { return (type & READ_SYNC) != 0; }
+  bool is_write() const { return (type & WRITE_SYNC) != 0; }
+  bool is_LRU() const { return (type & (WRITE_LRU ^ WRITE_ASYNC)) != 0; }
+  bool is_async() const { return (type & (READ_SYNC ^ READ_ASYNC)) != 0; }
+
+  void write_complete(int io_error) const;
+  void read_complete(int io_error) const;
+  void fake_read_complete(os_offset_t offset) const;
+
+  /** If requested, free storage space associated with a section of the file.
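+  (As an aside on the Type bit layout above: PUNCH ^ WRITE_ASYNC
+  isolates the punch-hole bit, 64, which is set only in PUNCH and
+  PUNCH_LRU, so the test below fires exactly for those requests.)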
+
+  @param off  byte offset from the start (SEEK_SET)
+  @param len  size of the hole in bytes
+  @return DB_SUCCESS or error code */
+  dberr_t maybe_punch_hole(os_offset_t off, ulint len)
+  {
+    return off && len && node && (type & (PUNCH ^ WRITE_ASYNC))
+      ? punch_hole(off, len)
+      : DB_SUCCESS;
+  }
+
+private:
+  /** Free storage space associated with a section of the file.
+  @param off  byte offset from the start (SEEK_SET)
+  @param len  size of the hole in bytes
+  @return DB_SUCCESS or error code */
+  dberr_t punch_hole(os_offset_t off, ulint len) const;
+
+public:
+  /** Page to be written on write operation */
+  buf_page_t *const bpage= nullptr;
+
+  /** Memory to be used for encrypted or page_compressed pages */
+  buf_tmp_buffer_t *const slot= nullptr;
+
+  /** File descriptor */
+  fil_node_t *const node= nullptr;
+
+  /** Request type bit flags */
+  const Type type;
+};
+
+constexpr IORequest IORequestRead(IORequest::READ_SYNC);
+constexpr IORequest IORequestReadPartial(IORequest::READ_MAYBE_PARTIAL);
+constexpr IORequest IORequestWrite(IORequest::WRITE_SYNC);
+
+/** Sparse file size information. */
+struct os_file_size_t {
+	/** Total size of file in bytes */
+	os_offset_t	m_total_size;
+
+	/** If it is a sparse file then this is the number of bytes
+	actually allocated for the file. */
+	os_offset_t	m_alloc_size;
+};
+
+constexpr ulint OS_AIO_N_PENDING_IOS_PER_THREAD= 256;
+
+extern Atomic_counter<ulint> os_n_file_reads;
+extern Atomic_counter<ulint> os_n_file_writes;
+extern Atomic_counter<ulint> os_n_fsyncs;
+
+/* File types for directory entry data type */
+
+enum os_file_type_t {
+	OS_FILE_TYPE_UNKNOWN = 0,
+	OS_FILE_TYPE_FILE,			/* regular file */
+	OS_FILE_TYPE_DIR,			/* directory */
+	OS_FILE_TYPE_LINK,			/* symbolic link */
+	OS_FILE_TYPE_BLOCK			/* block device */
+};
+
+/* Maximum path string length in bytes when referring to tables in the
+'./databasename/tablename.ibd' path format; we can allocate at least 2 buffers
+of this size from the thread stack; that is why this should not be made much
+bigger than 4000 bytes. The maximum path length used by any storage engine
+in the server must be at least this big. */
+
+/* MySQL 5.7 my_global.h */
+#ifndef FN_REFLEN_SE
+#define FN_REFLEN_SE 4000
+#endif
+
+#define OS_FILE_MAX_PATH	4000
+#if (FN_REFLEN_SE < OS_FILE_MAX_PATH)
+# error "(FN_REFLEN_SE < OS_FILE_MAX_PATH)"
+#endif
+
+/** Struct used in fetching information of a file in a directory */
+struct os_file_stat_t {
+	char		name[OS_FILE_MAX_PATH];	/*!< path to a file */
+	os_file_type_t	type;			/*!< file type */
+	os_offset_t	size;			/*!< file size in bytes */
+	os_offset_t	alloc_size;		/*!< Allocated size for
+						sparse files in bytes */
+	size_t		block_size;		/*!< Block size to use for IO
+						in bytes*/
+	time_t		ctime;			/*!< creation time */
+	time_t		mtime;			/*!< modification time */
+	time_t		atime;			/*!< access time */
+	bool		rw_perm;		/*!< true if can be opened
+						in read-write mode. Only valid
+						if type == OS_FILE_TYPE_FILE */
+};
+
+/** Create a temporary file. This function is like tmpfile(3), but
+the temporary file is created in the directory specified by the MySQL
+server configuration parameter (--tmpdir).
+@return temporary file handle, or NULL on error */
+FILE*
+os_file_create_tmpfile();
+
+/**
+This function attempts to create a directory named pathname. The new directory
+gets default permissions. On Unix, the permissions are (0770 & ~umask). If the
+directory exists already, nothing is done and the call succeeds, unless the
+fail_if_exists argument is true.
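+(A behavioural sketch, assuming POSIX: this amounts to
+mkdir(pathname, 0770) with an EEXIST result reported as success
+unless fail_if_exists is set; the exact logic lives in os0file.cc.)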
+
+@param[in]	pathname	directory name as null-terminated string
+@param[in]	fail_if_exists	if true, pre-existing directory is treated
+				as an error.
+@return true if call succeeds, false on error */
+bool
+os_file_create_directory(
+	const char*	pathname,
+	bool		fail_if_exists);
+
+/** NOTE! Use the corresponding macro os_file_create_simple(), not directly
+this function!
+A simple function to open or create a file.
+@param[in]	name		name of the file or path as a null-terminated
+				string
+@param[in]	create_mode	create mode
+@param[in]	access_type	OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
+@param[in]	read_only	if true read only mode checks are enforced
+@param[out]	success		true if succeeded, false on error
+@return own: handle to the file, not defined if error, error number
+	can be retrieved with os_file_get_last_error */
+pfs_os_file_t
+os_file_create_simple_func(
+	const char*	name,
+	ulint		create_mode,
+	ulint		access_type,
+	bool		read_only,
+	bool*		success);
+
+/** NOTE! Use the corresponding macro
+os_file_create_simple_no_error_handling(), not directly this function!
+A simple function to open or create a file.
+@param[in]	name		name of the file or path as a null-terminated string
+@param[in]	create_mode	create mode
+@param[in]	access_type	OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
+				OS_FILE_READ_ALLOW_DELETE; the last option
+				is used by a backup program reading the file
+@param[in]	read_only	if true read only mode checks are enforced
+@param[out]	success		true if succeeded
+@return own: handle to the file, not defined if error, error number
+	can be retrieved with os_file_get_last_error */
+pfs_os_file_t
+os_file_create_simple_no_error_handling_func(
+	const char*	name,
+	ulint		create_mode,
+	ulint		access_type,
+	bool		read_only,
+	bool*		success)
+	MY_ATTRIBUTE((warn_unused_result));
+
+#ifdef _WIN32
+#define os_file_set_nocache(fd, file_name, operation_name) do{}while(0)
+#else
+/** Tries to disable OS caching on an opened file descriptor.
+@param[in]	fd		file descriptor to alter
+@param[in]	file_name	file name, used in the diagnostic message
+@param[in]	operation_name	"open" or "create"; used in the diagnostic
+				message */
+void
+os_file_set_nocache(
+/*================*/
+	int		fd,	/*!< in: file descriptor to alter */
+	const char*	file_name,
+	const char*	operation_name);
+#endif
+
+#ifndef _WIN32 /* On Microsoft Windows, mandatory locking is used */
+/** Obtain an exclusive lock on a file.
+@param fd      file descriptor
+@param name    file name
+@return 0 on success */
+int os_file_lock(int fd, const char *name);
+#endif
+
+/** NOTE! Use the corresponding macro os_file_create(), not directly
+this function!
+Opens an existing file or creates a new one.
+@param[in]	name		name of the file or path as a null-terminated
+				string
+@param[in]	create_mode	create mode
+@param[in]	purpose		OS_FILE_AIO, if asynchronous, non-buffered I/O
+				is desired, OS_FILE_NORMAL, if any normal file;
+				NOTE that it also depends on type, os_aio_..
+				and srv_.. variables whether we really use
+				async I/O or unbuffered I/O: look in the
+				function source code for the exact rules
+@param[in]	type		OS_DATA_FILE or OS_LOG_FILE
+@param[in]	read_only	if true read only mode checks are enforced
+@param[out]	success		true if succeeded
+@return own: handle to the file, not defined if error, error number
+	can be retrieved with os_file_get_last_error */
+pfs_os_file_t
+os_file_create_func(
+	const char*	name,
+	ulint		create_mode,
+	ulint		purpose,
+	ulint		type,
+	bool		read_only,
+	bool*		success)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Deletes a file. The file has to be closed before calling this.
+@param[in]	name		file path as a null-terminated string
+@return true if success */
+bool
+os_file_delete_func(const char* name);
+
+/** Deletes a file if it exists. The file has to be closed before calling this.
+@param[in]	name		file path as a null-terminated string
+@param[out]	exist		indicate if file pre-exist
+@return true if success */
+bool
+os_file_delete_if_exists_func(const char* name, bool* exist);
+
+/** NOTE! Use the corresponding macro os_file_rename(), not directly
+this function!
+Renames a file (can also move it to another directory). It is safest that the
+file is closed before calling this function.
+@param[in]	oldpath		old file path as a null-terminated string
+@param[in]	newpath		new file path
+@return true if success */
+bool
+os_file_rename_func(const char* oldpath, const char* newpath);
+
+/** NOTE! Use the corresponding macro os_file_close(), not directly this
+function!
+Closes a file handle. In case of error, error number can be retrieved with
+os_file_get_last_error.
+@param[in]	file		own: handle to a file
+@return true if success */
+bool os_file_close_func(os_file_t file);
+
+#ifdef UNIV_PFS_IO
+
+/* Keys to register InnoDB I/O with performance schema */
+extern mysql_pfs_key_t	innodb_data_file_key;
+extern mysql_pfs_key_t	innodb_temp_file_key;
+
+/* The following four pairs of macros are instrumentation points that
+register various file I/O operations with performance schema.
+1) register_pfs_file_open_begin() and register_pfs_file_open_end() are
+used to register file creation and opening.
+2) register_pfs_file_rename_begin() and register_pfs_file_rename_end()
+are used to register file renaming.
+3) register_pfs_file_io_begin() and register_pfs_file_io_end() are
+used to register actual file read, write and flush.
+4) register_pfs_file_close_begin() and register_pfs_file_close_end()
+are used to register file deletion operations. */
+# define register_pfs_file_open_begin(state, locker, key, op, name,	\
+				      src_file, src_line)		\
+do {									\
+	locker = PSI_FILE_CALL(get_thread_file_name_locker)(		\
+		state, key, op, name, &locker);				\
+	if (locker != NULL) {						\
+		PSI_FILE_CALL(start_file_open_wait)(			\
+			locker, src_file, src_line);			\
+	}								\
+} while (0)
+
+# define register_pfs_file_open_end(locker, file, result)		\
+do {									\
+	if (locker != NULL) {						\
+		file.m_psi = PSI_FILE_CALL(end_file_open_wait)(		\
+			locker, result);				\
+	}								\
+} while (0)
+
+# define register_pfs_file_rename_begin(state, locker, key, op, name,	\
+					src_file, src_line)		\
+	register_pfs_file_open_begin(state, locker, key, op, name,	\
+				     src_file, src_line)		\
+
+# define register_pfs_file_rename_end(locker, from, to, result)	\
+do {									\
+	if (locker != NULL) {						\
+		PSI_FILE_CALL(						\
+			end_file_rename_wait)(				\
+			locker, from, to, result);			\
+	}								\
+} while (0)
+
+# define register_pfs_file_close_begin(state, locker, key, op, name,	\
+				       src_file, src_line)		\
+do {									\
+	locker = PSI_FILE_CALL(get_thread_file_name_locker)(		\
+		state, key, op, name, &locker);				\
+	if (locker != NULL) {						\
+		PSI_FILE_CALL(start_file_close_wait)(			\
+			locker, src_file, src_line);			\
+	}								\
+} while (0)
+
+# define register_pfs_file_close_end(locker, result)			\
+do {									\
+	if (locker != NULL) {						\
+		PSI_FILE_CALL(end_file_close_wait)(			\
+			locker, result);				\
+	}								\
+} while (0)
+
+# define register_pfs_file_io_begin(state, locker, file, count, op,	\
+				    src_file, src_line)			\
+do {									\
+	locker = PSI_FILE_CALL(get_thread_file_stream_locker)(		\
+		state, file.m_psi, op);					\
+	if (locker != NULL) {						\
+		PSI_FILE_CALL(start_file_wait)(				\
+			locker, count, src_file, src_line);		\
+	}								\
+} while (0)
+
+# define register_pfs_file_io_end(locker, count)			\
+do {									\
+	if (locker != NULL) {						\
+		PSI_FILE_CALL(end_file_wait)(locker, count);		\
+	}								\
+} while (0)
+
+/* Following macros/functions are file I/O APIs that would be performance
+schema instrumented if "UNIV_PFS_IO" is defined. They would point to
+wrapper functions with performance schema instrumentation in such case.
+
+os_file_create
+os_file_create_simple
+os_file_create_simple_no_error_handling
+os_file_close
+os_file_rename
+os_aio
+os_file_read
+os_file_read_no_error_handling
+os_file_write
+
+The wrapper functions have the prefix of "pfs_".
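+
+For example, with UNIV_PFS_IO defined, os_file_close(f) expands to
+pfs_os_file_close_func(f, __FILE__, __LINE__), which brackets the
+plain os_file_close_func(f) between PSI begin/end events; without
+UNIV_PFS_IO it expands directly to os_file_close_func(f).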
+*/
+
+# define os_file_create(key, name, create, purpose, type, read_only,	\
+			success)					\
+	pfs_os_file_create_func(key, name, create, purpose, type,	\
+				read_only, success, __FILE__, __LINE__)
+
+# define os_file_create_simple(key, name, create, access,		\
+			       read_only, success)			\
+	pfs_os_file_create_simple_func(key, name, create, access,	\
+				       read_only, success, __FILE__, __LINE__)
+
+# define os_file_create_simple_no_error_handling(			\
+		key, name, create_mode, access, read_only, success)	\
+	pfs_os_file_create_simple_no_error_handling_func(		\
+		key, name, create_mode, access,				\
+		read_only, success, __FILE__, __LINE__)
+
+# define os_file_close(file)						\
+	pfs_os_file_close_func(file, __FILE__, __LINE__)
+
+# define os_file_read(type, file, buf, offset, n, o)			\
+	pfs_os_file_read_func(type, file, buf, offset, n, o, __FILE__, __LINE__)
+
+# define os_file_write(type, name, file, buf, offset, n)		\
+	pfs_os_file_write_func(type, name, file, buf, offset,		\
+			       n, __FILE__, __LINE__)
+
+# define os_file_flush(file)						\
+	pfs_os_file_flush_func(file, __FILE__, __LINE__)
+
+# define os_file_rename(key, oldpath, newpath)				\
+	pfs_os_file_rename_func(key, oldpath, newpath, __FILE__, __LINE__)
+
+# define os_file_delete(key, name)					\
+	pfs_os_file_delete_func(key, name, __FILE__, __LINE__)
+
+# define os_file_delete_if_exists(key, name, exist)			\
+	pfs_os_file_delete_if_exists_func(key, name, exist, __FILE__, __LINE__)
+
+/** NOTE! Please use the corresponding macro os_file_create_simple(),
+not directly this function!
+A performance schema instrumented wrapper function for
+os_file_create_simple() which opens or creates a file.
+@param[in]	key		Performance Schema Key
+@param[in]	name		name of the file or path as a null-terminated
+				string
+@param[in]	create_mode	create mode
+@param[in]	access_type	OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
+@param[in]	read_only	if true read only mode checks are enforced
+@param[out]	success		true if succeeded
+@param[in]	src_file	file name where func invoked
+@param[in]	src_line	line where the func invoked
+@return own: handle to the file, not defined if error, error number
+	can be retrieved with os_file_get_last_error */
+UNIV_INLINE
+pfs_os_file_t
+pfs_os_file_create_simple_func(
+	mysql_pfs_key_t key,
+	const char*	name,
+	ulint		create_mode,
+	ulint		access_type,
+	bool		read_only,
+	bool*		success,
+	const char*	src_file,
+	uint		src_line)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** NOTE! Please use the corresponding macro
+os_file_create_simple_no_error_handling(), not directly this function!
+A performance schema instrumented wrapper function for
+os_file_create_simple_no_error_handling(). Add instrumentation to
+monitor file creation/open.
+@param[in]	key		Performance Schema Key
+@param[in]	name		name of the file or path as a null-terminated
+				string
+@param[in]	create_mode	create mode
+@param[in]	access_type	OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
+				OS_FILE_READ_ALLOW_DELETE; the last option is
+				used by a backup program reading the file
+@param[in]	read_only	if true read only mode checks are enforced
+@param[out]	success		true if succeeded
+@param[in]	src_file	file name where func invoked
+@param[in]	src_line	line where the func invoked
+@return own: handle to the file, not defined if error, error number
+	can be retrieved with os_file_get_last_error */
+UNIV_INLINE
+pfs_os_file_t
+pfs_os_file_create_simple_no_error_handling_func(
+	mysql_pfs_key_t key,
+	const char*	name,
+	ulint		create_mode,
+	ulint		access_type,
+	bool		read_only,
+	bool*		success,
+	const char*	src_file,
+	uint		src_line)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** NOTE! Please use the corresponding macro os_file_create(), not directly
+this function!
+A performance schema wrapper function for os_file_create().
+Add instrumentation to monitor file creation/open.
+@param[in]	key		Performance Schema Key
+@param[in]	name		name of the file or path as a null-terminated
+				string
+@param[in]	create_mode	create mode
+@param[in]	purpose		OS_FILE_AIO, if asynchronous, non-buffered I/O
+				is desired, OS_FILE_NORMAL, if any normal file;
+				NOTE that it also depends on type, os_aio_..
+				and srv_.. variables whether we really use
+				async I/O or unbuffered I/O: look in the
+				function source code for the exact rules
+@param[in]	type		OS_DATA_FILE or OS_LOG_FILE
+@param[in]	read_only	if true read only mode checks are enforced
+@param[out]	success		true if succeeded
+@param[in]	src_file	file name where func invoked
+@param[in]	src_line	line where the func invoked
+@return own: handle to the file, not defined if error, error number
+	can be retrieved with os_file_get_last_error */
+UNIV_INLINE
+pfs_os_file_t
+pfs_os_file_create_func(
+	mysql_pfs_key_t key,
+	const char*	name,
+	ulint		create_mode,
+	ulint		purpose,
+	ulint		type,
+	bool		read_only,
+	bool*		success,
+	const char*	src_file,
+	uint		src_line)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** NOTE! Please use the corresponding macro os_file_close(), not directly
+this function!
+A performance schema instrumented wrapper function for os_file_close().
+@param[in]	file		handle to a file
+@param[in]	src_file	file name where func invoked
+@param[in]	src_line	line where the func invoked
+@return true if success */
+UNIV_INLINE
+bool
+pfs_os_file_close_func(
+	pfs_os_file_t	file,
+	const char*	src_file,
+	uint		src_line);
+
+/** NOTE! Please use the corresponding macro os_file_read(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_read() which requests a synchronous read operation.
+@param[in]	type		IO request context
+@param[in]	file		Open file handle
+@param[out]	buf		buffer where to read
+@param[in]	offset		file offset where to read
+@param[in]	n		number of bytes to read
+@param[out]	o		number of bytes actually read
+@param[in]	src_file	file name where func invoked
+@param[in]	src_line	line where the func invoked
+@return DB_SUCCESS if request was successful */
+UNIV_INLINE
+dberr_t
+pfs_os_file_read_func(
+	const IORequest&	type,
+	pfs_os_file_t		file,
+	void*			buf,
+	os_offset_t		offset,
+	ulint			n,
+	ulint*			o,
+	const char*		src_file,
+	uint			src_line);
+
+/** NOTE! Please use the corresponding macro os_file_write(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_write() which requests a synchronous write operation.
+@param[in]	type		IO request context
+@param[in]	name		Name of the file or path as NUL terminated
+				string
+@param[in]	file		Open file handle
+@param[in]	buf		buffer from which to write
+@param[in]	offset		file offset where to write
+@param[in]	n		number of bytes to write
+@param[in]	src_file	file name where func invoked
+@param[in]	src_line	line where the func invoked
+@return DB_SUCCESS if request was successful */
+UNIV_INLINE
+dberr_t
+pfs_os_file_write_func(
+	const IORequest&	type,
+	const char*		name,
+	pfs_os_file_t		file,
+	const void*		buf,
+	os_offset_t		offset,
+	ulint			n,
+	const char*		src_file,
+	uint			src_line);
+
+/** NOTE! Please use the corresponding macro os_file_flush(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_flush() which flushes the write buffers of a given file to the disk.
+@param[in]	file		Open file handle
+@param[in]	src_file	file name where func invoked
+@param[in]	src_line	line where the func invoked
+@return TRUE if success */
+UNIV_INLINE
+bool
+pfs_os_file_flush_func(
+	pfs_os_file_t	file,
+	const char*	src_file,
+	uint		src_line);
+
+
+/** NOTE! Please use the corresponding macro os_file_rename(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_rename().
+@param[in]	key		Performance Schema Key
+@param[in]	oldpath		old file path as a null-terminated string
+@param[in]	newpath		new file path
+@param[in]	src_file	file name where func invoked
+@param[in]	src_line	line where the func invoked
+@return true if success */
+UNIV_INLINE
+bool
+pfs_os_file_rename_func(
+	mysql_pfs_key_t	key,
+	const char*	oldpath,
+	const char*	newpath,
+	const char*	src_file,
+	uint		src_line);
+
+/**
+NOTE! Please use the corresponding macro os_file_delete(), not directly
+this function!
+This is the performance schema instrumented wrapper function for
+os_file_delete().
+@param[in]	key		Performance Schema Key
+@param[in]	name		old file path as a null-terminated string
+@param[in]	src_file	file name where func invoked
+@param[in]	src_line	line where the func invoked
+@return true if success */
+UNIV_INLINE
+bool
+pfs_os_file_delete_func(
+	mysql_pfs_key_t	key,
+	const char*	name,
+	const char*	src_file,
+	uint		src_line);
+
+/**
+NOTE! Please use the corresponding macro os_file_delete_if_exists(), not
+directly this function!
+This is the performance schema instrumented wrapper function for
+os_file_delete_if_exists().
+@param[in]	key		Performance Schema Key
+@param[in]	name		old file path as a null-terminated string
+@param[in]	exist		indicate if file pre-exist
+@param[in]	src_file	file name where func invoked
+@param[in]	src_line	line where the func invoked
+@return true if success */
+UNIV_INLINE
+bool
+pfs_os_file_delete_if_exists_func(
+	mysql_pfs_key_t	key,
+	const char*	name,
+	bool*		exist,
+	const char*	src_file,
+	uint		src_line);
+
+#else /* UNIV_PFS_IO */
+
+/* If UNIV_PFS_IO is not defined, these I/O APIs point
+to original un-instrumented file I/O APIs */
+# define os_file_create(key, name, create, purpose, type, read_only,	\
+			success)					\
+	os_file_create_func(name, create, purpose, type, read_only,	\
+			    success)
+
+# define os_file_create_simple(key, name, create_mode, access,		\
+			       read_only, success)			\
+	os_file_create_simple_func(name, create_mode, access,		\
+				   read_only, success)
+
+# define os_file_create_simple_no_error_handling(			\
+		key, name, create_mode, access, read_only, success)	\
+	os_file_create_simple_no_error_handling_func(			\
+		name, create_mode, access, read_only, success)
+
+# define os_file_close(file)	os_file_close_func(file)
+
+# define os_file_read(type, file, buf, offset, n, o)			\
+	os_file_read_func(type, file, buf, offset, n, o)
+
+# define os_file_write(type, name, file, buf, offset, n)		\
+	os_file_write_func(type, name, file, buf, offset, n)
+
+# define os_file_flush(file)	os_file_flush_func(file)
+
+# define os_file_rename(key, oldpath, newpath)				\
+	os_file_rename_func(oldpath, newpath)
+
+# define os_file_delete(key, name)	os_file_delete_func(name)
+
+# define os_file_delete_if_exists(key, name, exist)			\
+	os_file_delete_if_exists_func(name, exist)
+
+#endif	/* UNIV_PFS_IO */
+
+/** Gets a file size.
+@param[in]	filename	path to the file
+@return file size if OK, else set m_total_size to ~0 and m_alloc_size
+	to errno */
+os_file_size_t
+os_file_get_size(
+	const char*	filename)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Gets a file size.
+@param[in]	file		handle to a file
+@return file size, or (os_offset_t) -1 on failure */
+os_offset_t
+os_file_get_size(
+	os_file_t	file)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Extend a file.
+
+On Windows, extending a file allocates blocks for the file,
+unless the file is sparse.
+
+On Unix, we will extend the file with ftruncate(), if
+file needs to be sparse. Otherwise posix_fallocate() is used
+when available, and if not, binary zeroes are added to the end
+of file.
+
+@param[in]	name		file name
+@param[in]	file		file handle
+@param[in]	size		desired file size
+@param[in]	is_sparse	whether to create a sparse file (no preallocating)
+@return whether the operation succeeded */
+bool
+os_file_set_size(
+	const char*	name,
+	os_file_t	file,
+	os_offset_t	size,
+	bool		is_sparse = false)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Truncates a file at its current position.
+@param[in,out]	file	file to be truncated
+@return true if success */
+bool
+os_file_set_eof(
+	FILE*		file);	/*!< in: file to be truncated */
+
+/** Truncate a file to a specified size in bytes.
+@param[in]	pathname	file path
+@param[in]	file		file to be truncated
+@param[in]	size		size preserved in bytes
+@param[in]	allow_shrink	whether to allow the file to become smaller
+@return true if success */
+bool
+os_file_truncate(
+	const char*	pathname,
+	os_file_t	file,
+	os_offset_t	size,
+	bool		allow_shrink = false);
+
+/** NOTE! Use the corresponding macro os_file_flush(), not directly this
+function!
+Flushes the write buffers of a given file to the disk.
+@param[in]	file		handle to a file
+@return true if success */
+bool
+os_file_flush_func(
+	os_file_t	file);
+
+/** Retrieves the last error number if an error occurs in a file io function.
+The number should be retrieved before any other OS calls (because they may
+overwrite the error number). If the number is not known to this program,
+the OS error number + OS_FILE_ERROR_MAX is returned.
+@param[in]	report_all_errors	true if we want an error message
+					printed of all errors
+@param[in]	on_error_silent		if true, then do not print any
+					diagnostic to the log
+@return error number, or OS error number + OS_FILE_ERROR_MAX */
+ulint os_file_get_last_error(bool report_all_errors,
+                             bool on_error_silent= false);
+
+/** NOTE! Use the corresponding macro os_file_read(), not directly this
+function!
+Requests a synchronous read operation.
+@param[in]	type		IO request context
+@param[in]	file		Open file handle
+@param[out]	buf		buffer where to read
+@param[in]	offset		file offset where to read
+@param[in]	n		number of bytes to read
+@param[out]	o		number of bytes actually read
+@return DB_SUCCESS if request was successful */
+dberr_t
+os_file_read_func(
+	const IORequest&	type,
+	os_file_t		file,
+	void*			buf,
+	os_offset_t		offset,
+	ulint			n,
+	ulint*			o)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Rewind file to its start, read at most size - 1 bytes from it to str, and
+NUL-terminate str. All errors are silently ignored. This function is
+mostly meant to be used with temporary files.
+@param[in,out]	file		file to read from
+@param[in,out]	str		buffer where to read
+@param[in]	size		size of buffer */
+void
+os_file_read_string(
+	FILE*		file,
+	char*		str,
+	ulint		size);
+
+/** NOTE! Use the corresponding macro os_file_write(), not directly this
+function!
+Requests a synchronous write operation.
+@param[in]	type		IO request context
+@param[in]	name		name of the file or path as a null-terminated
+				string
+@param[in]	file		Open file handle
+@param[in]	buf		buffer from which to write
+@param[in]	offset		file offset where to write
+@param[in]	n		number of bytes to write
+@return DB_SUCCESS if request was successful */
+dberr_t
+os_file_write_func(
+	const IORequest&	type,
+	const char*		name,
+	os_file_t		file,
+	const void*		buf,
+	os_offset_t		offset,
+	ulint			n)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Check the existence and type of the given file.
+@param[in]	path		pathname of the file
+@param[out]	exists		true if file exists
+@param[out]	type		type of the file (if it exists)
+@return true if call succeeded */
+bool
+os_file_status(
+	const char*	path,
+	bool*		exists,
+	os_file_type_t* type);
+
+/** This function reduces a null-terminated full remote path name into
+the path that is sent by MySQL for DATA DIRECTORY clause. It replaces
+the 'databasename/tablename.ibd' found at the end of the path with just
+'tablename'.
+
+Since the result is always smaller than the path sent in, no new memory
+is allocated. The caller should allocate memory for the path sent in.
+This function manipulates that path in place.
+
+If the path format is not as expected, just return. The result is used
+to inform a SHOW CREATE TABLE command.
+@param[in,out]	data_dir_path	Full path/data_dir_path */
+void
+os_file_make_data_dir_path(
+	char*	data_dir_path);
+
+/** Create all missing subdirectories along the given path.
+@return DB_SUCCESS if OK, otherwise error code. */
+dberr_t
+os_file_create_subdirs_if_needed(
+	const char*	path);
+
+#ifdef UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR
+/* Test the function os_file_get_parent_dir. */
+void
+unit_test_os_file_get_parent_dir();
+#endif /* UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR */
+
+/**
+Initializes the asynchronous io system. */
+int os_aio_init();
+
+/**
+Frees the asynchronous io system. */
+void os_aio_free();
+
+/** Submit a fake read request during crash recovery.
+@param type  fake read request
+@param offset  additional context */
+void os_fake_read(const IORequest &type, os_offset_t offset);
+
+/** Request a read or write.
+@param type		I/O request
+@param buf		buffer
+@param offset		file offset
+@param n		number of bytes
+@retval DB_SUCCESS if request was queued successfully
+@retval DB_IO_ERROR on I/O error */
+dberr_t os_aio(const IORequest &type, void *buf, os_offset_t offset, size_t n);
+
+/** @return number of pending reads */
+size_t os_aio_pending_reads();
+/** @return approximate number of pending reads */
+size_t os_aio_pending_reads_approx();
+/** @return number of pending writes */
+size_t os_aio_pending_writes();
+
+/** Wait until there are no pending asynchronous writes.
+@param declare  whether the wait will be declared in tpool */
+void os_aio_wait_until_no_pending_writes(bool declare);
+
+/** Wait until all pending asynchronous reads have completed.
+@param declare  whether the wait will be declared in tpool */
+void os_aio_wait_until_no_pending_reads(bool declare);
+
+/** Prints info of the aio arrays.
+@param[in,out]	file	file where to print */
+void
+os_aio_print(FILE* file);
+
+/** Refreshes the statistics used to print per-second averages. */
+void
+os_aio_refresh_stats();
+
+/** Checks that all slots in the system have been freed, that is, there are
+no pending io operations. */
+bool
+os_aio_all_slots_free();
+
+
+/** This function returns information about the specified file
+@param[in]	path		pathname of the file
+@param[out]	stat_info	information of a file in a directory
+@param[in]	check_rw_perm	for testing whether the file can be opened
+				in RW mode
+@param[in]	read_only	if true read only mode checks are enforced
+@return DB_SUCCESS if all OK */
+dberr_t
+os_file_get_status(
+	const char*	path,
+	os_file_stat_t* stat_info,
+	bool		check_rw_perm,
+	bool		read_only);
+
+/** Set the file create umask
+@param[in]	umask		The umask to use for file creation. */
+void
+os_file_set_umask(ulint umask);
+
+#ifdef _WIN32
+
+/**
+Make file sparse, on Windows.
+
+@param[in]	file		file handle
+@param[in]	is_sparse	if true, make file sparse,
+				otherwise "unsparse" the file
+@return true on success, false on error */
+bool os_file_set_sparse_win32(os_file_t file, bool is_sparse = true);
+
+/**
+Changes file size on Windows.
+
+If the file is extended, the bytes between the old and new EOF
+are zeroed.
+
+If file is sparse, "virtual" block is added at the end of
+allocated area.
+
+If file is normal, file system allocates storage.
+
+@param[in]	pathname	file path
+@param[in]	file		file handle
+@param[in]	size		size to preserve in bytes
+@return true if success */
+bool
+os_file_change_size_win32(
+	const char*	pathname,
+	os_file_t	file,
+	os_offset_t	size);
+
+#endif /*_WIN32 */
+
+/** Free storage space associated with a section of the file.
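+(On Linux this is typically fallocate() with
+FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, and on Windows
+FSCTL_SET_ZERO_DATA on a sparse file; an assumption about the
+implementation, whose authoritative logic lives in os0file.cc.)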
+@param[in] fh Open file handle +@param[in] off Starting offset (SEEK_SET) +@param[in] len Size of the hole +@return DB_SUCCESS or error code */ +dberr_t +os_file_punch_hole( + os_file_t fh, + os_offset_t off, + os_offset_t len) + MY_ATTRIBUTE((warn_unused_result)); + +/* Determine if a path is an absolute path or not. +@param[in] OS directory or file path to evaluate +@retval true if an absolute path +@retval false if a relative path */ +inline bool is_absolute_path(const char *path) +{ + switch (path[0]) { +#ifdef _WIN32 + case '\0': + return false; + case '\\': +#endif + case '/': + return true; + } + +#ifdef _WIN32 + if (path[1] == ':') + { + switch (path[2]) { + case '/': + case '\\': + return true; + } + } +#endif /* _WIN32 */ + + return false; +} + +#include "os0file.inl" + +#endif /* os0file_h */ diff --git a/storage/innobase/include/os0file.inl b/storage/innobase/include/os0file.inl new file mode 100644 index 00000000..7de31505 --- /dev/null +++ b/storage/innobase/include/os0file.inl @@ -0,0 +1,412 @@ +/***************************************************************************** + +Copyright (c) 2010, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/os0file.ic +The interface to the operating system file io + +Created 2/20/2010 Jimmy Yang +*******************************************************/ + +#ifdef UNIV_PFS_IO +/** NOTE! Please use the corresponding macro os_file_create_simple(), +not directly this function! +A performance schema instrumented wrapper function for +os_file_create_simple() which opens or creates a file. +@param[in] key Performance Schema Key +@param[in] name name of the file or path as a null-terminated + string +@param[in] create_mode create mode +@param[in] access_type OS_FILE_READ_ONLY or OS_FILE_READ_WRITE +@param[in] read_only if true read only mode checks are enforced +@param[out] success true if succeeded +@param[in] src_file file name where func invoked +@param[in] src_line line where the func invoked +@return own: handle to the file, not defined if error, error number +can be retrieved with os_file_get_last_error */ +UNIV_INLINE +pfs_os_file_t +pfs_os_file_create_simple_func( + mysql_pfs_key_t key, + const char* name, + ulint create_mode, + ulint access_type, + bool read_only, + bool* success, + const char* src_file, + uint src_line) +{ + PSI_file_locker_state state; + struct PSI_file_locker* locker = NULL; + + /* register a file open or creation depending on "create_mode" */ + register_pfs_file_open_begin( + &state, locker, key, + (create_mode == OS_FILE_CREATE) + ? 
PSI_FILE_CREATE : PSI_FILE_OPEN, + name, src_file, src_line); + + pfs_os_file_t file = os_file_create_simple_func( + name, create_mode, access_type, read_only, success); + + /* Register psi value for the file */ + register_pfs_file_open_end(locker, file, + (*success == TRUE ? success : 0)); + + return(file); +} + +/** NOTE! Please use the corresponding macro +os_file_create_simple_no_error_handling(), not directly this function! +A performance schema instrumented wrapper function for +os_file_create_simple_no_error_handling(). Add instrumentation to +monitor file creation/open. +@param[in] key Performance Schema Key +@param[in] name name of the file or path as a null-terminated + string +@param[in] create_mode create mode +@param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or + OS_FILE_READ_ALLOW_DELETE; the last option is + used by a backup program reading the file +@param[in] read_only if true read only mode checks are enforced +@param[out] success true if succeeded +@param[in] src_file file name where func invoked +@param[in] src_line line where the func invoked +@return own: handle to the file, not defined if error, error number +can be retrieved with os_file_get_last_error */ +UNIV_INLINE +pfs_os_file_t +pfs_os_file_create_simple_no_error_handling_func( + mysql_pfs_key_t key, + const char* name, + ulint create_mode, + ulint access_type, + bool read_only, + bool* success, + const char* src_file, + uint src_line) +{ + PSI_file_locker_state state; + struct PSI_file_locker* locker = NULL; + + /* register a file open or creation depending on "create_mode" */ + register_pfs_file_open_begin( + &state, locker, key, + create_mode == OS_FILE_CREATE + ? PSI_FILE_CREATE : PSI_FILE_OPEN, + name, src_file, src_line); + + pfs_os_file_t file = os_file_create_simple_no_error_handling_func( + name, create_mode, access_type, read_only, success); + + register_pfs_file_open_end(locker, file, + (*success == TRUE ? success : 0)); + + return(file); +} + +/** NOTE! Please use the corresponding macro os_file_create(), not directly +this function! +A performance schema wrapper function for os_file_create(). +Add instrumentation to monitor file creation/open. +@param[in] key Performance Schema Key +@param[in] name name of the file or path as a null-terminated + string +@param[in] create_mode create mode +@param[in] purpose OS_FILE_AIO, if asynchronous, non-buffered I/O + is desired, OS_FILE_NORMAL, if any normal file; + NOTE that it also depends on type, os_aio_.. + and srv_.. variables whether we really us + async I/O or unbuffered I/O: look in the + function source code for the exact rules +@param[in] read_only if true read only mode checks are enforced +@param[out] success true if succeeded +@param[in] src_file file name where func invoked +@param[in] src_line line where the func invoked +@return own: handle to the file, not defined if error, error number +can be retrieved with os_file_get_last_error */ +UNIV_INLINE +pfs_os_file_t +pfs_os_file_create_func( + mysql_pfs_key_t key, + const char* name, + ulint create_mode, + ulint purpose, + ulint type, + bool read_only, + bool* success, + const char* src_file, + uint src_line) +{ + PSI_file_locker_state state; + struct PSI_file_locker* locker = NULL; + + /* register a file open or creation depending on "create_mode" */ + register_pfs_file_open_begin( + &state, locker, key, + create_mode == OS_FILE_CREATE + ? 
PSI_FILE_CREATE : PSI_FILE_OPEN, + name, src_file, src_line); + + pfs_os_file_t file = os_file_create_func( + name, create_mode, purpose, type, read_only, success); + + register_pfs_file_open_end(locker, file, + (*success == TRUE ? success : 0)); + + return(file); +} +/** +NOTE! Please use the corresponding macro os_file_close(), not directly +this function! +A performance schema instrumented wrapper function for os_file_close(). +@param[in] file handle to a file +@param[in] src_file file name where func invoked +@param[in] src_line line where the func invoked +@return true if success */ +UNIV_INLINE +bool +pfs_os_file_close_func( + pfs_os_file_t file, + const char* src_file, + uint src_line) +{ + PSI_file_locker_state state; + struct PSI_file_locker* locker = NULL; + + /* register the file close */ + register_pfs_file_io_begin( + &state, locker, file, 0, PSI_FILE_CLOSE, src_file, src_line); + + bool result = os_file_close_func(file); + + register_pfs_file_io_end(locker, 0); + + return(result); +} + +/** NOTE! Please use the corresponding macro os_file_read(), not directly +this function! +This is the performance schema instrumented wrapper function for +os_file_read() which requests a synchronous read operation. +@param[in] type IO request context +@param[in] file Open file handle +@param[out] buf buffer where to read +@param[in] offset file offset where to read +@param[in] n number of bytes to read +@param[out] o number of bytes actually read +@param[in] src_file file name where func invoked +@param[in] src_line line where the func invoked +@return DB_SUCCESS if request was successful */ +UNIV_INLINE +dberr_t +pfs_os_file_read_func( + const IORequest& type, + pfs_os_file_t file, + void* buf, + os_offset_t offset, + ulint n, + ulint* o, + const char* src_file, + uint src_line) +{ + PSI_file_locker_state state; + struct PSI_file_locker* locker = NULL; + + register_pfs_file_io_begin( + &state, locker, file, n, PSI_FILE_READ, src_file, src_line); + + dberr_t result; + + result = os_file_read_func(type, file, buf, offset, n, o); + + register_pfs_file_io_end(locker, n); + + return(result); +} + +/** NOTE! Please use the corresponding macro os_file_write(), not directly +this function! +This is the performance schema instrumented wrapper function for +os_file_write() which requests a synchronous write operation. +@param[in] type IO request context +@param[in] name Name of the file or path as NUL terminated + string +@param[in] file Open file handle +@param[out] buf buffer where to read +@param[in] offset file offset where to read +@param[in] n number of bytes to read +@param[in] src_file file name where func invoked +@param[in] src_line line where the func invoked +@return error code +@retval DB_SUCCESS if the request was successfully fulfilled */ +UNIV_INLINE +dberr_t +pfs_os_file_write_func( + const IORequest& type, + const char* name, + pfs_os_file_t file, + const void* buf, + os_offset_t offset, + ulint n, + const char* src_file, + uint src_line) +{ + PSI_file_locker_state state; + struct PSI_file_locker* locker = NULL; + + register_pfs_file_io_begin( + &state, locker, file, n, PSI_FILE_WRITE, src_file, src_line); + + dberr_t result; + + result = os_file_write_func(type, name, file, buf, offset, n); + + register_pfs_file_io_end(locker, n); + + return(result); +} + + +/** NOTE! Please use the corresponding macro os_file_flush(), not directly +this function! 
+This is the performance schema instrumented wrapper function for +os_file_flush() which flushes the write buffers of a given file to the disk. +Flushes the write buffers of a given file to the disk. +@param[in] file Open file handle +@param[in] src_file file name where func invoked +@param[in] src_line line where the func invoked +@return TRUE if success */ +UNIV_INLINE +bool +pfs_os_file_flush_func( + pfs_os_file_t file, + const char* src_file, + uint src_line) +{ + PSI_file_locker_state state; + struct PSI_file_locker* locker = NULL; + + register_pfs_file_io_begin( + &state, locker, file, 0, PSI_FILE_SYNC, src_file, src_line); + + bool result = os_file_flush_func(file); + + register_pfs_file_io_end(locker, 0); + + return(result); +} + +/** NOTE! Please use the corresponding macro os_file_rename(), not directly +this function! +This is the performance schema instrumented wrapper function for +os_file_rename() +@param[in] key Performance Schema Key +@param[in] oldpath old file path as a null-terminated string +@param[in] newpath new file path +@param[in] src_file file name where func invoked +@param[in] src_line line where the func invoked +@return true if success */ +UNIV_INLINE +bool +pfs_os_file_rename_func( + mysql_pfs_key_t key, + const char* oldpath, + const char* newpath, + const char* src_file, + uint src_line) + +{ + PSI_file_locker_state state; + struct PSI_file_locker* locker = NULL; + + register_pfs_file_rename_begin( + &state, locker, key, PSI_FILE_RENAME, newpath, + src_file, src_line); + + bool result = os_file_rename_func(oldpath, newpath); + + register_pfs_file_rename_end(locker, oldpath, newpath, !result); + + return(result); +} + +/** NOTE! Please use the corresponding macro os_file_delete(), not directly +this function! +This is the performance schema instrumented wrapper function for +os_file_delete() +@param[in] key Performance Schema Key +@param[in] name old file path as a null-terminated string +@param[in] src_file file name where func invoked +@param[in] src_line line where the func invoked +@return true if success */ +UNIV_INLINE +bool +pfs_os_file_delete_func( + mysql_pfs_key_t key, + const char* name, + const char* src_file, + uint src_line) +{ + PSI_file_locker_state state; + struct PSI_file_locker* locker = NULL; + + register_pfs_file_close_begin( + &state, locker, key, PSI_FILE_DELETE, name, src_file, src_line); + + bool result = os_file_delete_func(name); + + register_pfs_file_close_end(locker, 0); + + return(result); +} + +/** +NOTE! Please use the corresponding macro os_file_delete_if_exists(), not +directly this function! 
+This is the performance schema instrumented wrapper function for +os_file_delete_if_exists() +@param[in] key Performance Schema Key +@param[in] name old file path as a null-terminated string +@param[in] exist indicate if file pre-exist +@param[in] src_file file name where func invoked +@param[in] src_line line where the func invoked +@return true if success */ +UNIV_INLINE +bool +pfs_os_file_delete_if_exists_func( + mysql_pfs_key_t key, + const char* name, + bool* exist, + const char* src_file, + uint src_line) +{ + PSI_file_locker_state state; + struct PSI_file_locker* locker = NULL; + + register_pfs_file_close_begin( + &state, locker, key, PSI_FILE_DELETE, name, src_file, src_line); + + bool result = os_file_delete_if_exists_func(name, exist); + + register_pfs_file_close_end(locker, 0); + + return(result); +} +#endif /* UNIV_PFS_IO */ diff --git a/storage/innobase/include/page0cur.h b/storage/innobase/include/page0cur.h new file mode 100644 index 00000000..28aa3056 --- /dev/null +++ b/storage/innobase/include/page0cur.h @@ -0,0 +1,303 @@ +/***************************************************************************** + +Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/page0cur.h +The page cursor + +Created 10/4/1994 Heikki Tuuri +*************************************************************************/ + +#ifndef page0cur_h +#define page0cur_h + +#include "page0page.h" + +#ifdef UNIV_DEBUG +/*********************************************************//** +Gets pointer to the page frame where the cursor is positioned. +@return page */ +UNIV_INLINE +page_t* +page_cur_get_page( +/*==============*/ + page_cur_t* cur); /*!< in: page cursor */ +/*********************************************************//** +Gets pointer to the buffer block where the cursor is positioned. +@return page */ +UNIV_INLINE +buf_block_t* +page_cur_get_block( +/*===============*/ + page_cur_t* cur); /*!< in: page cursor */ +/*********************************************************//** +Gets pointer to the page frame where the cursor is positioned. +@return page */ +UNIV_INLINE +page_zip_des_t* +page_cur_get_page_zip( +/*==================*/ + page_cur_t* cur); /*!< in: page cursor */ +/* Gets the record where the cursor is positioned. 
+@param cur page cursor +@return record */ +UNIV_INLINE +rec_t *page_cur_get_rec(const page_cur_t *cur); +#else /* UNIV_DEBUG */ +# define page_cur_get_page(cur) page_align((cur)->rec) +# define page_cur_get_block(cur) (cur)->block +# define page_cur_get_page_zip(cur) buf_block_get_page_zip((cur)->block) +# define page_cur_get_rec(cur) (cur)->rec +#endif /* UNIV_DEBUG */ +# define is_page_cur_get_page_zip(cur) is_buf_block_get_page_zip((cur)->block) +/*********************************************************//** +Sets the cursor object to point before the first user record +on the page. */ +UNIV_INLINE +void +page_cur_set_before_first( +/*======================*/ + const buf_block_t* block, /*!< in: index page */ + page_cur_t* cur); /*!< in: cursor */ +/*********************************************************//** +Sets the cursor object to point after the last user record on +the page. */ +UNIV_INLINE +void +page_cur_set_after_last( +/*====================*/ + const buf_block_t* block, /*!< in: index page */ + page_cur_t* cur); /*!< in: cursor */ +/*********************************************************//** +Returns TRUE if the cursor is before first user record on page. +@return TRUE if at start */ +UNIV_INLINE +ibool +page_cur_is_before_first( +/*=====================*/ + const page_cur_t* cur); /*!< in: cursor */ +/*********************************************************//** +Returns TRUE if the cursor is after last user record. +@return TRUE if at end */ +UNIV_INLINE +ibool +page_cur_is_after_last( +/*===================*/ + const page_cur_t* cur); /*!< in: cursor */ +/**********************************************************//** +Positions the cursor on the given record. */ +UNIV_INLINE +void +page_cur_position( +/*==============*/ + const rec_t* rec, /*!< in: record on a page */ + const buf_block_t* block, /*!< in: buffer block containing + the record */ + page_cur_t* cur); /*!< out: page cursor */ + +/***********************************************************//** +Inserts a record next to page cursor. Returns pointer to inserted record if +succeed, i.e., enough space available, NULL otherwise. The cursor stays at +the same logical position, but the physical position may change if it is +pointing to a compressed page that was reorganized. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return pointer to record if succeed, NULL otherwise */ +UNIV_INLINE +rec_t* +page_cur_tuple_insert( +/*==================*/ + page_cur_t* cursor, /*!< in/out: a page cursor */ + const dtuple_t* tuple, /*!< in: pointer to a data tuple */ + rec_offs** offsets,/*!< out: offsets on *rec */ + mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */ + ulint n_ext, /*!< in: number of externally stored columns */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/***********************************************************//** +Inserts a record next to page cursor on an uncompressed page. 
+@return pointer to record
+@retval nullptr if not enough space was available */
+rec_t*
+page_cur_insert_rec_low(
+/*====================*/
+	const page_cur_t*cur,	/*!< in: page cursor */
+	const rec_t*	rec,	/*!< in: record to insert after cur */
+	rec_offs*	offsets,/*!< in/out: rec_get_offsets(rec, index) */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/***********************************************************//**
+Inserts a record next to page cursor on a compressed page, keeping the
+compressed and the uncompressed copy in sync.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return pointer to inserted record
+@retval nullptr on failure */
+rec_t*
+page_cur_insert_rec_zip(
+/*====================*/
+	page_cur_t*	cursor,	/*!< in/out: page cursor,
+				logical position unchanged */
+	const rec_t*	rec,	/*!< in: pointer to a physical record */
+	rec_offs*	offsets,/*!< in/out: rec_get_offsets(rec, index) */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/***********************************************************//**
+Deletes a record at the page cursor. The cursor is moved to the
+next record after the deleted one. */
+void
+page_cur_delete_rec(
+/*================*/
+	page_cur_t*	cursor,	/*!< in/out: a page cursor */
+	const rec_offs*	offsets,/*!< in: rec_get_offsets(
+				cursor->rec, index) */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull));
+
+/** Apply an INSERT_HEAP_REDUNDANT or INSERT_REUSE_REDUNDANT record that was
+written by page_cur_insert_rec_low() for a ROW_FORMAT=REDUNDANT page.
+@param block	B-tree or R-tree page in ROW_FORMAT=REDUNDANT
+@param reuse	false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE
+@param prev	byte offset of the predecessor, relative to PAGE_OLD_INFIMUM
+@param enc_hdr	encoded fixed-size header bits
+@param hdr_c	number of common record header bytes with prev
+@param data_c	number of common data bytes with prev
+@param data	literal header and data bytes
+@param data_len	length of the literal data, in bytes
+@return whether the operation failed (an inconsistency was noticed) */
+bool page_apply_insert_redundant(const buf_block_t &block, bool reuse,
+                                 ulint prev, ulint enc_hdr,
+                                 size_t hdr_c, size_t data_c,
+                                 const void *data, size_t data_len);
+
+/** Apply an INSERT_HEAP_DYNAMIC or INSERT_REUSE_DYNAMIC record that was
+written by page_cur_insert_rec_low() for a ROW_FORMAT=COMPACT or DYNAMIC page.
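+(These page_apply_* functions are the redo-log replay counterparts of
+page_cur_insert_rec_low() and page_cur_delete_rec(): during recovery the
+log parser decodes the physical record payload and calls them to
+reconstruct the page, so they validate every offset and report failure
+instead of asserting.)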
+@param block	B-tree or R-tree page in ROW_FORMAT=COMPACT or DYNAMIC
+@param reuse	false=allocate from PAGE_HEAP_TOP; true=reuse PAGE_FREE
+@param prev	byte offset of the predecessor, relative to PAGE_NEW_INFIMUM
+@param shift	if reuse: number of bytes the PAGE_FREE is moving
+@param enc_hdr_l	number of copied record header bytes, plus record type bits
+@param hdr_c	number of common record header bytes with prev
+@param data_c	number of common data bytes with prev
+@param data	literal header and data bytes
+@param data_len	length of the literal data, in bytes
+@return whether the operation failed (an inconsistency was noticed) */
+bool page_apply_insert_dynamic(const buf_block_t &block, bool reuse,
+                               ulint prev, ulint shift, ulint enc_hdr_l,
+                               size_t hdr_c, size_t data_c,
+                               const void *data, size_t data_len);
+
+/** Apply a DELETE_ROW_FORMAT_REDUNDANT record that was written by
+page_cur_delete_rec() for a ROW_FORMAT=REDUNDANT page.
+@param block	B-tree or R-tree page in ROW_FORMAT=REDUNDANT
+@param prev	byte offset of the predecessor, relative to PAGE_OLD_INFIMUM
+@return whether the operation failed (an inconsistency was noticed) */
+bool page_apply_delete_redundant(const buf_block_t &block, ulint prev);
+
+/** Apply a DELETE_ROW_FORMAT_DYNAMIC record that was written by
+page_cur_delete_rec() for a ROW_FORMAT=COMPACT or DYNAMIC page.
+@param block	B-tree or R-tree page in ROW_FORMAT=COMPACT or DYNAMIC
+@param prev	byte offset of the predecessor, relative to PAGE_NEW_INFIMUM
+@param hdr_size	record header size, excluding REC_N_NEW_EXTRA_BYTES
+@param data_size	data payload size, in bytes
+@return whether the operation failed (an inconsistency was noticed) */
+bool page_apply_delete_dynamic(const buf_block_t &block, ulint prev,
                               size_t hdr_size, size_t data_size);
+
+MY_ATTRIBUTE((warn_unused_result))
+/****************************************************************//**
+Searches the right position for a page cursor. */
+bool
+page_cur_search_with_match(
+/*=======================*/
+	const dtuple_t*	tuple,	/*!< in: data tuple */
+	page_cur_mode_t	mode,	/*!< in: PAGE_CUR_L,
+				PAGE_CUR_LE, PAGE_CUR_G, or
+				PAGE_CUR_GE */
+	ulint*		iup_matched_fields,
+				/*!< in/out: already matched
+				fields in upper limit record */
+	ulint*		ilow_matched_fields,
+				/*!< in/out: already matched
+				fields in lower limit record */
+	page_cur_t*	cursor,	/*!< in/out: page cursor */
+	rtr_info_t*	rtr_info);/*!< in/out: rtree search stack */
+#ifdef BTR_CUR_HASH_ADAPT
+MY_ATTRIBUTE((warn_unused_result))
+/** Search the right position for a page cursor.
+@param[in]	tuple			key to be searched for
+@param[in]	mode			search mode
+@param[in,out]	iup_matched_fields	already matched fields in the
+upper limit record
+@param[in,out]	iup_matched_bytes	already matched bytes in the
+first partially matched field in the upper limit record
+@param[in,out]	ilow_matched_fields	already matched fields in the
+lower limit record
+@param[in,out]	ilow_matched_bytes	already matched bytes in the
+first partially matched field in the lower limit record
+@param[in,out]	cursor			page cursor */
+bool
+page_cur_search_with_match_bytes(
+	const dtuple_t*	tuple,
+	page_cur_mode_t	mode,
+	ulint*		iup_matched_fields,
+	ulint*		iup_matched_bytes,
+	ulint*		ilow_matched_fields,
+	ulint*		ilow_matched_bytes,
+	page_cur_t*	cursor);
+#endif /* BTR_CUR_HASH_ADAPT */
+/***********************************************************//**
+Positions a page cursor on a randomly chosen user record on a page. If there
+are no user records, sets the cursor on the infimum record.
 */
+void page_cur_open_on_rnd_user_rec(page_cur_t *cursor);
+
+/** Index page cursor */
+
+struct page_cur_t{
+	dict_index_t*	index;
+	rec_t*		rec;	/*!< pointer to a record on page */
+	rec_offs*	offsets;
+	buf_block_t*	block;	/*!< pointer to the block containing rec */
+};
+
+
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+inline rec_t *page_cur_move_to_next(page_cur_t *cur)
+{
+  return cur->rec= page_rec_get_next(cur->rec);
+}
+
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+inline rec_t *page_cur_move_to_prev(page_cur_t *cur)
+{
+  return cur->rec= page_rec_get_prev(cur->rec);
+}
+
+#include "page0cur.inl"
+
+#endif
diff --git a/storage/innobase/include/page0cur.inl b/storage/innobase/include/page0cur.inl
new file mode 100644
index 00000000..7c4eafa2
--- /dev/null
+++ b/storage/innobase/include/page0cur.inl
@@ -0,0 +1,203 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/page0cur.ic
+The page cursor
+
+Created 10/4/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifdef UNIV_DEBUG
+/*********************************************************//**
+Gets pointer to the page frame where the cursor is positioned.
+@return page */
+UNIV_INLINE
+page_t*
+page_cur_get_page(
+/*==============*/
+	page_cur_t*	cur)	/*!< in: page cursor */
+{
+	return page_align(page_cur_get_rec(cur));
+}
+
+/*********************************************************//**
+Gets pointer to the buffer block where the cursor is positioned.
+@return buffer block */
+UNIV_INLINE
+buf_block_t*
+page_cur_get_block(
+/*===============*/
+	page_cur_t*	cur)	/*!< in: page cursor */
+{
+	ut_ad(cur);
+	ut_ad(!cur->rec || page_align(cur->rec) == cur->block->page.frame);
+	return cur->block;
+}
+
+/*********************************************************//**
+Gets pointer to the compressed page descriptor where the cursor
+is positioned.
+@return compressed page descriptor, or NULL */
+UNIV_INLINE
+page_zip_des_t*
+page_cur_get_page_zip(
+/*==================*/
+	page_cur_t*	cur)	/*!< in: page cursor */
+{
+	return(buf_block_get_page_zip(page_cur_get_block(cur)));
+}
+
+/* Gets the record where the cursor is positioned.
+@param cur page cursor
+@return record */
+UNIV_INLINE
+rec_t *page_cur_get_rec(const page_cur_t *cur)
+{
+	ut_ad(cur);
+	ut_ad(!cur->rec || page_align(cur->rec) == cur->block->page.frame);
+	return cur->rec;
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************//**
+Sets the cursor object to point before the first user record
+on the page.
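+
+A typical left-to-right scan of all user records, sketched here under the
+assumption that the caller holds a latch on block for the duration:
+
+  page_cur_t cur;
+  page_cur_set_before_first(block, &cur);
+  while (rec_t *rec= page_cur_move_to_next(&cur))
+  {
+    if (page_rec_is_supremum(rec))
+      break;
+    ... process the user record rec ...
+  }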
 */
+UNIV_INLINE
+void
+page_cur_set_before_first(
+/*======================*/
+	const buf_block_t*	block,	/*!< in: index page */
+	page_cur_t*		cur)	/*!< in: cursor */
+{
+	cur->block = const_cast<buf_block_t*>(block);
+	cur->rec = page_get_infimum_rec(buf_block_get_frame(cur->block));
+}
+
+/*********************************************************//**
+Sets the cursor object to point after the last user record on
+the page. */
+UNIV_INLINE
+void
+page_cur_set_after_last(
+/*====================*/
+	const buf_block_t*	block,	/*!< in: index page */
+	page_cur_t*		cur)	/*!< in: cursor */
+{
+	cur->block = const_cast<buf_block_t*>(block);
+	cur->rec = page_get_supremum_rec(buf_block_get_frame(cur->block));
+}
+
+/*********************************************************//**
+Returns TRUE if the cursor is before first user record on page.
+@return TRUE if at start */
+UNIV_INLINE
+ibool
+page_cur_is_before_first(
+/*=====================*/
+	const page_cur_t*	cur)	/*!< in: cursor */
+{
+	ut_ad(cur);
+	ut_ad(page_align(cur->rec) == cur->block->page.frame);
+	return(page_rec_is_infimum(cur->rec));
+}
+
+/*********************************************************//**
+Returns TRUE if the cursor is after last user record.
+@return TRUE if at end */
+UNIV_INLINE
+ibool
+page_cur_is_after_last(
+/*===================*/
+	const page_cur_t*	cur)	/*!< in: cursor */
+{
+	ut_ad(cur);
+	ut_ad(page_align(cur->rec) == cur->block->page.frame);
+	return(page_rec_is_supremum(cur->rec));
+}
+
+/**********************************************************//**
+Positions the cursor on the given record. */
+UNIV_INLINE
+void
+page_cur_position(
+/*==============*/
+	const rec_t*		rec,	/*!< in: record on a page */
+	const buf_block_t*	block,	/*!< in: buffer block containing
+					the record */
+	page_cur_t*		cur)	/*!< out: page cursor */
+{
+	ut_ad(rec && block && cur);
+	ut_ad(page_align(rec) == block->page.frame);
+
+	cur->rec = (rec_t*) rec;
+	cur->block = (buf_block_t*) block;
+}
+
+/***********************************************************//**
+Inserts a record next to page cursor. Returns a pointer to the inserted
+record if it succeeds, i.e., if enough space is available; NULL otherwise.
+The cursor stays at the same logical position, but the physical position
+may change if it is pointing to a compressed page that was reorganized.
+
+IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
+if this is a compressed leaf page in a secondary index.
+This has to be done either within the same mini-transaction,
+or by invoking ibuf_reset_free_bits() before mtr_commit().
+
+@return pointer to record if successful, NULL otherwise */
+UNIV_INLINE
+rec_t*
+page_cur_tuple_insert(
+/*==================*/
+	page_cur_t*	cursor,	/*!< in/out: a page cursor */
+	const dtuple_t*	tuple,	/*!< in: pointer to a data tuple */
+	rec_offs**	offsets,/*!< out: offsets on *rec */
+	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap, or NULL */
+	ulint		n_ext,	/*!< in: number of externally stored columns */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+{
+	ulint	size = rec_get_converted_size(cursor->index, tuple, n_ext);
+
+	if (!*heap) {
+		*heap = mem_heap_create(size
+					+ (4 + REC_OFFS_HEADER_SIZE
+					   + dtuple_get_n_fields(tuple))
+					* sizeof **offsets);
+	}
+
+	rec_t*	rec = rec_convert_dtuple_to_rec(
+		static_cast<byte*>(mem_heap_alloc(*heap, size)),
+		cursor->index, tuple, n_ext);
+
+	*offsets = rec_get_offsets(rec, cursor->index, *offsets,
+				   page_is_leaf(cursor->block->page.frame)
+				   ?
cursor->index->n_core_fields : 0, + ULINT_UNDEFINED, heap); + ut_ad(size == rec_offs_size(*offsets)); + + if (is_buf_block_get_page_zip(cursor->block)) { + rec = page_cur_insert_rec_zip(cursor, rec, *offsets, mtr); + } else { + rec = page_cur_insert_rec_low(cursor, rec, *offsets, mtr); + } + + ut_ad(!rec || !cmp_dtuple_rec(tuple, rec, cursor->index, *offsets)); + return(rec); +} + diff --git a/storage/innobase/include/page0page.h b/storage/innobase/include/page0page.h new file mode 100644 index 00000000..2978656b --- /dev/null +++ b/storage/innobase/include/page0page.h @@ -0,0 +1,1101 @@ +/***************************************************************************** +Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/page0page.h +Index page routines + +Created 2/2/1994 Heikki Tuuri +*******************************************************/ + +#ifndef page0page_h +#define page0page_h + +#include "page0types.h" +#include "fsp0fsp.h" +#include "fil0fil.h" +#include "buf0buf.h" +#include "rem0rec.h" +#include "mach0data.h" +#ifndef UNIV_INNOCHECKSUM +#include "dict0dict.h" +#include "data0data.h" +#include "mtr0mtr.h" + +/* PAGE HEADER + =========== + +Index page header starts at the first offset left free by the FIL-module */ + +typedef byte page_header_t; +#endif /* !UNIV_INNOCHECKSUM */ + +#define PAGE_HEADER FSEG_PAGE_DATA /* index page header starts at this + offset */ +/*-----------------------------*/ +#define PAGE_N_DIR_SLOTS 0 /* number of slots in page directory */ +#define PAGE_HEAP_TOP 2 /* pointer to record heap top */ +#define PAGE_N_HEAP 4 /* number of records in the heap, + bit 15=flag: new-style compact page format */ +#define PAGE_FREE 6 /* pointer to start of page free record list */ +#define PAGE_GARBAGE 8 /* number of bytes in deleted records */ +#define PAGE_LAST_INSERT 10 /* pointer to the last inserted record, or + 0 if this info has been reset by a delete, + for example */ + +/** This 10-bit field is usually 0. In B-tree index pages of +ROW_FORMAT=REDUNDANT tables, this byte can contain garbage if the .ibd +file was created in MySQL 4.1.0 or if the table resides in the system +tablespace and was created before MySQL 4.1.1 or MySQL 4.0.14. +In this case, the FIL_PAGE_TYPE would be FIL_PAGE_INDEX. + +In ROW_FORMAT=COMPRESSED tables, this field is always 0, because +instant ADD COLUMN is not supported. + +In ROW_FORMAT=COMPACT and ROW_FORMAT=DYNAMIC tables, this field is +always 0, except in the root page of the clustered index after instant +ADD COLUMN. 
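+
+(Decoding sketch, following the bit layout described below: the value is
+recovered by reading two bytes at PAGE_INSTANT and discarding the low
+3 bits, which belong to PAGE_DIRECTION_B, e.g.
+
+  uint16_t n_core = page_header_get_field(page, PAGE_INSTANT) >> 3;
+
+where a nonzero value is the original dict_index_t::n_core_fields; see
+also page_get_instant().)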
+ +Instant ADD COLUMN will change FIL_PAGE_TYPE to FIL_PAGE_TYPE_INSTANT +and initialize the PAGE_INSTANT field to the original number of +fields in the clustered index (dict_index_t::n_core_fields). The most +significant bits are in the first byte, and the least significant 5 +bits are stored in the most significant 5 bits of PAGE_DIRECTION_B. + +These FIL_PAGE_TYPE_INSTANT and PAGE_INSTANT may be assigned even if +instant ADD COLUMN was not committed. Changes to these page header fields +are not undo-logged, but changes to the hidden metadata record are. +If the server is killed and restarted, the page header fields could +remain set even though no metadata record is present. + +When the table becomes empty, the PAGE_INSTANT field and the +FIL_PAGE_TYPE can be reset and any metadata record be removed. */ +#define PAGE_INSTANT 12 + +/** last insert direction: PAGE_LEFT, .... +In ROW_FORMAT=REDUNDANT tables created before MySQL 4.1.1 or MySQL 4.0.14, +this byte can be garbage. */ +#define PAGE_DIRECTION_B 13 +#define PAGE_N_DIRECTION 14 /* number of consecutive inserts to the same + direction */ +#define PAGE_N_RECS 16 /* number of user records on the page */ +/** The largest DB_TRX_ID that may have modified a record on the page; +Defined only in secondary index leaf pages and in change buffer leaf pages. +Otherwise written as 0. @see PAGE_ROOT_AUTO_INC */ +#define PAGE_MAX_TRX_ID 18 +/** The AUTO_INCREMENT value (on persistent clustered index root pages). */ +#define PAGE_ROOT_AUTO_INC PAGE_MAX_TRX_ID +#define PAGE_HEADER_PRIV_END 26 /* end of private data structure of the page + header which are set in a page create */ +/*----*/ +#define PAGE_LEVEL 26 /* level of the node in an index tree; the + leaf level is the level 0. This field should + not be written to after page creation. */ +#define PAGE_INDEX_ID 28 /* index id where the page belongs. + This field should not be written to after + page creation. 
 */
+
+#define PAGE_BTR_SEG_LEAF 36	/* file segment header for the leaf pages in
+				a B-tree: defined only on the root page of a
+				B-tree, but not in the root of an ibuf tree */
+#define PAGE_BTR_IBUF_FREE_LIST	PAGE_BTR_SEG_LEAF
+#define PAGE_BTR_IBUF_FREE_LIST_NODE PAGE_BTR_SEG_LEAF
+				/* in the place of PAGE_BTR_SEG_LEAF and _TOP
+				there is a free list base node if the page is
+				the root page of an ibuf tree, and at the same
+				place is the free list node if the page is in
+				a free list */
+#define PAGE_BTR_SEG_TOP (36 + FSEG_HEADER_SIZE)
+				/* file segment header for the non-leaf pages
+				in a B-tree: defined only on the root page of
+				a B-tree, but not in the root of an ibuf
+				tree */
+/*----*/
+#define PAGE_DATA	(PAGE_HEADER + 36 + 2 * FSEG_HEADER_SIZE)
+				/* start of data on the page */
+
+#define PAGE_OLD_INFIMUM	(PAGE_DATA + 1 + REC_N_OLD_EXTRA_BYTES)
+				/* offset of the page infimum record on an
+				old-style page */
+#define PAGE_OLD_SUPREMUM	(PAGE_DATA + 2 + 2 * REC_N_OLD_EXTRA_BYTES + 8)
+				/* offset of the page supremum record on an
+				old-style page */
+#define PAGE_OLD_SUPREMUM_END (PAGE_OLD_SUPREMUM + 9)
+				/* offset of the page supremum record end on
+				an old-style page */
+#define PAGE_NEW_INFIMUM	(PAGE_DATA + REC_N_NEW_EXTRA_BYTES)
+				/* offset of the page infimum record on a
+				new-style compact page */
+#define PAGE_NEW_SUPREMUM	(PAGE_DATA + 2 * REC_N_NEW_EXTRA_BYTES + 8)
+				/* offset of the page supremum record on a
+				new-style compact page */
+#define PAGE_NEW_SUPREMUM_END (PAGE_NEW_SUPREMUM + 8)
+				/* offset of the page supremum record end on
+				a new-style compact page */
+/*-----------------------------*/
+
+/* Heap numbers */
+#define PAGE_HEAP_NO_INFIMUM	0U	/* page infimum */
+#define PAGE_HEAP_NO_SUPREMUM	1U	/* page supremum */
+#define PAGE_HEAP_NO_USER_LOW	2U	/* first user record in
+					creation (insertion) order,
+					not necessarily collation order;
+					this record may have been deleted */
+
+/* Directions of cursor movement (stored in PAGE_DIRECTION field) */
+constexpr uint16_t PAGE_LEFT= 1;
+constexpr uint16_t PAGE_RIGHT= 2;
+constexpr uint16_t PAGE_SAME_REC= 3;
+constexpr uint16_t PAGE_SAME_PAGE= 4;
+constexpr uint16_t PAGE_NO_DIRECTION= 5;
+
+#ifndef UNIV_INNOCHECKSUM
+
+/*			PAGE DIRECTORY
+			==============
+*/
+
+typedef	byte	page_dir_slot_t;
+
+/* Offset of the directory start down from the page end. We call the
+slot with the highest file address directory start, as it points to
+the first record in the list of records. */
+#define	PAGE_DIR	FIL_PAGE_DATA_END
+
+/* We define a slot in the page directory as two bytes */
+constexpr uint16_t PAGE_DIR_SLOT_SIZE= 2;
+
+/* The offset of the physically lower end of the directory, counted from
+page end, when the page is empty */
+#define PAGE_EMPTY_DIR_START	(PAGE_DIR + 2 * PAGE_DIR_SLOT_SIZE)
+
+/* The maximum and minimum number of records owned by a directory slot. The
+number may drop below the minimum in the first and the last slot in the
+directory. */
+#define PAGE_DIR_SLOT_MAX_N_OWNED	8
+#define	PAGE_DIR_SLOT_MIN_N_OWNED	4
+
+extern my_bool srv_immediate_scrub_data_uncompressed;
+#endif /* UNIV_INNOCHECKSUM */
+
+/** Get the start of a page frame.
+@param[in]	ptr	pointer within a page frame
+@return start of the page frame */
+MY_ATTRIBUTE((const))
+inline page_t* page_align(void *ptr)
+{
+  return my_assume_aligned<UNIV_ZIP_SIZE_MIN>
+    (reinterpret_cast<page_t*>(ut_align_down(ptr, srv_page_size)));
+}
+inline const page_t *page_align(const void *ptr)
+{
+  return page_align(const_cast<void*>(ptr));
+}
+
+/** Gets the byte offset within a page frame.
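+For any pointer p inside a page frame, page_align() is the complementary
+operation: page_align(p) + page_offset(p) == p.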
+@param[in]	ptr	pointer within a page frame
+@return offset from the start of the page */
+MY_ATTRIBUTE((const))
+inline uint16_t page_offset(const void* ptr)
+{
+  return static_cast<uint16_t>(ut_align_offset(ptr, srv_page_size));
+}
+
+/** Determine whether an index page is not in ROW_FORMAT=REDUNDANT.
+@param[in]	page	index page
+@return nonzero if ROW_FORMAT is one of COMPACT,DYNAMIC,COMPRESSED
+@retval 0 if ROW_FORMAT=REDUNDANT */
+inline
+byte
+page_is_comp(const page_t* page)
+{
+	ut_ad(!ut_align_offset(page, UNIV_ZIP_SIZE_MIN));
+	return(page[PAGE_HEADER + PAGE_N_HEAP] & 0x80);
+}
+
+/** Determine whether an index page is empty.
+@param[in]	page	index page
+@return whether the page is empty (PAGE_N_RECS = 0) */
+inline
+bool
+page_is_empty(const page_t* page)
+{
+	ut_ad(!ut_align_offset(page, UNIV_ZIP_SIZE_MIN));
+	return !*reinterpret_cast<const uint16_t*>(PAGE_HEADER + PAGE_N_RECS
+						   + page);
+}
+
+/** Determine whether an index page contains garbage.
+@param[in]	page	index page
+@return whether the page contains garbage (PAGE_GARBAGE is not 0) */
+inline
+bool
+page_has_garbage(const page_t* page)
+{
+	ut_ad(!ut_align_offset(page, UNIV_ZIP_SIZE_MIN));
+	return *reinterpret_cast<const uint16_t*>(PAGE_HEADER + PAGE_GARBAGE
+						  + page);
+}
+
+/** Determine whether a B-tree or R-tree index page is a leaf page.
+@param[in]	page	index page
+@return true if the page is a leaf (PAGE_LEVEL = 0) */
+inline
+bool
+page_is_leaf(const page_t* page)
+{
+	ut_ad(!ut_align_offset(page, UNIV_ZIP_SIZE_MIN));
+	return !*reinterpret_cast<const uint16_t*>(PAGE_HEADER + PAGE_LEVEL
+						   + page);
+}
+
+#ifndef UNIV_INNOCHECKSUM
+/** Determine whether an index page record is not in ROW_FORMAT=REDUNDANT.
+@param[in]	rec	record in an index page frame (not a copy)
+@return nonzero if ROW_FORMAT is one of COMPACT,DYNAMIC,COMPRESSED
+@retval 0 if ROW_FORMAT=REDUNDANT */
+inline
+byte
+page_rec_is_comp(const byte* rec)
+{
+	return(page_is_comp(page_align(rec)));
+}
+
+# ifdef UNIV_DEBUG
+/** Determine if the record is the metadata pseudo-record
+in the clustered index.
+@param[in]	rec	leaf page record on an index page
+@return whether the record is the metadata pseudo-record */
+inline bool page_rec_is_metadata(const rec_t* rec)
+{
+	return rec_get_info_bits(rec, page_rec_is_comp(rec))
+		& REC_INFO_MIN_REC_FLAG;
+}
+# endif /* UNIV_DEBUG */
+
+/** Determine the offset of the infimum record on the page.
+@param[in]	page	index page
+@return offset of the infimum record in record list, relative from page */
+inline
+unsigned
+page_get_infimum_offset(const page_t* page)
+{
+	ut_ad(!page_offset(page));
+	return page_is_comp(page) ? PAGE_NEW_INFIMUM : PAGE_OLD_INFIMUM;
+}
+
+/** Determine the offset of the supremum record on the page.
+@param[in]	page	index page
+@return offset of the supremum record in record list, relative from page */
+inline
+unsigned
+page_get_supremum_offset(const page_t* page)
+{
+	ut_ad(!page_offset(page));
+	return page_is_comp(page) ? PAGE_NEW_SUPREMUM : PAGE_OLD_SUPREMUM;
+}
+
+/** Determine whether an index page record is a user record.
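+(Concretely, PAGE_DATA = 38 + 36 + 2 * 10 = 94, so the pseudo-record
+offsets rejected below are fixed: PAGE_NEW_INFIMUM = 99,
+PAGE_NEW_SUPREMUM = 112, PAGE_OLD_INFIMUM = 101 and
+PAGE_OLD_SUPREMUM = 116, independent of the page size.)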
+@param[in]	offset	record offset in the page
+@retval true	if a user record
+@retval false	if the infimum or supremum pseudo-record */
+inline
+bool
+page_rec_is_user_rec_low(ulint offset)
+{
+	compile_time_assert(PAGE_OLD_INFIMUM >= PAGE_NEW_INFIMUM);
+	compile_time_assert(PAGE_OLD_SUPREMUM >= PAGE_NEW_SUPREMUM);
+	compile_time_assert(PAGE_NEW_INFIMUM < PAGE_OLD_SUPREMUM);
+	compile_time_assert(PAGE_OLD_INFIMUM < PAGE_NEW_SUPREMUM);
+	compile_time_assert(PAGE_NEW_SUPREMUM < PAGE_OLD_SUPREMUM_END);
+	compile_time_assert(PAGE_OLD_SUPREMUM < PAGE_NEW_SUPREMUM_END);
+	ut_ad(offset >= PAGE_NEW_INFIMUM);
+	ut_ad(offset <= srv_page_size - PAGE_EMPTY_DIR_START);
+
+	return(offset != PAGE_NEW_SUPREMUM
+	       && offset != PAGE_NEW_INFIMUM
+	       && offset != PAGE_OLD_INFIMUM
+	       && offset != PAGE_OLD_SUPREMUM);
+}
+
+/** Determine if a record is the supremum record on an index page.
+@param[in]	offset	record offset in an index page
+@return true if the supremum record */
+inline
+bool
+page_rec_is_supremum_low(ulint offset)
+{
+	ut_ad(offset >= PAGE_NEW_INFIMUM);
+	ut_ad(offset <= srv_page_size - PAGE_EMPTY_DIR_START);
+	return(offset == PAGE_NEW_SUPREMUM || offset == PAGE_OLD_SUPREMUM);
+}
+
+/** Determine if a record is the infimum record on an index page.
+@param[in]	offset	record offset in an index page
+@return true if the infimum record */
+inline
+bool
+page_rec_is_infimum_low(ulint offset)
+{
+	ut_ad(offset >= PAGE_NEW_INFIMUM);
+	ut_ad(offset <= srv_page_size - PAGE_EMPTY_DIR_START);
+	return(offset == PAGE_NEW_INFIMUM || offset == PAGE_OLD_INFIMUM);
+}
+
+/** Determine whether a B-tree or R-tree index record is in a leaf page.
+@param[in]	rec	index record in an index page
+@return true if the record is in a leaf page */
+inline
+bool
+page_rec_is_leaf(const page_t* rec)
+{
+	const page_t* page = page_align(rec);
+	ut_ad(ulint(rec - page) >= page_get_infimum_offset(page));
+	bool leaf = page_is_leaf(page);
+	ut_ad(!page_rec_is_comp(rec)
+	      || !page_rec_is_user_rec_low(ulint(rec - page))
+	      || leaf == !rec_get_node_ptr_flag(rec));
+	return leaf;
+}
+
+/** Determine whether an index page record is a user record.
+@param[in]	rec	record in an index page
+@return true if a user record */
+inline
+bool
+page_rec_is_user_rec(const rec_t* rec);
+
+/** Determine whether an index page record is the supremum record.
+@param[in]	rec	record in an index page
+@return true if the supremum record */
+inline
+bool
+page_rec_is_supremum(const rec_t* rec);
+
+/** Determine whether an index page record is the infimum record.
+@param[in]	rec	record in an index page
+@return true if the infimum record */
+inline
+bool
+page_rec_is_infimum(const rec_t* rec);
+
+/** Read PAGE_MAX_TRX_ID.
+@param[in]	page	index page
+@return the value of PAGE_MAX_TRX_ID or PAGE_ROOT_AUTO_INC */
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+inline trx_id_t page_get_max_trx_id(const page_t *page)
+{
+  ut_ad(fil_page_index_page_check(page));
+  static_assert((PAGE_HEADER + PAGE_MAX_TRX_ID) % 8 == 0, "alignment");
+  const auto *p= my_assume_aligned<8>(page + PAGE_HEADER + PAGE_MAX_TRX_ID);
+  return mach_read_from_8(p);
+}
+
+/**
+Set the number of owned records.
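+(Context: each sparse page directory slot owns between
+PAGE_DIR_SLOT_MIN_N_OWNED = 4 and PAGE_DIR_SLOT_MAX_N_OWNED = 8 records;
+the count is kept in the header of the slot's owning record and is
+adjusted here whenever records are inserted or deleted, or slots are
+rebalanced.)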
+@tparam compressed whether to update any ROW_FORMAT=COMPRESSED page as well
+@param[in,out]	block	index page
+@param[in,out]	rec	record in block.frame
+@param[in]	n_owned	number of records skipped in the sparse page directory
+@param[in]	comp	whether ROW_FORMAT is one of COMPACT,DYNAMIC,COMPRESSED
+@param[in,out]	mtr	mini-transaction */
+template<bool compressed= false>
+inline void page_rec_set_n_owned(buf_block_t *block, rec_t *rec, ulint n_owned,
+                                 bool comp, mtr_t *mtr)
+{
+  ut_ad(block->page.frame == page_align(rec));
+  ut_ad(comp == (page_is_comp(block->page.frame) != 0));
+
+  if (page_zip_des_t *page_zip= compressed
+      ? buf_block_get_page_zip(block) : nullptr)
+  {
+    ut_ad(comp);
+    rec_set_bit_field_1(rec, n_owned, REC_NEW_N_OWNED,
+                        REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+    if (rec_get_status(rec) != REC_STATUS_SUPREMUM)
+      page_zip_rec_set_owned(block, rec, n_owned, mtr);
+  }
+  else
+  {
+    rec-= comp ? REC_NEW_N_OWNED : REC_OLD_N_OWNED;
+    mtr->write<1,mtr_t::MAYBE_NOP>(*block, rec, (*rec & ~REC_N_OWNED_MASK) |
+                                   (n_owned << REC_N_OWNED_SHIFT));
+  }
+}
+
+/*************************************************************//**
+Sets the max trx id field value. */
+void
+page_set_max_trx_id(
+/*================*/
+	buf_block_t*	block,	/*!< in/out: page */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
+	trx_id_t	trx_id,	/*!< in: transaction id */
+	mtr_t*		mtr);	/*!< in/out: mini-transaction, or NULL */
+/*************************************************************//**
+Sets the max trx id field value if trx_id is bigger than the previous
+value. */
+UNIV_INLINE
+void
+page_update_max_trx_id(
+/*===================*/
+	buf_block_t*	block,	/*!< in/out: page */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose
+				uncompressed part will be updated, or NULL */
+	trx_id_t	trx_id,	/*!< in: transaction id */
+	mtr_t*		mtr);	/*!< in/out: mini-transaction */
+
+/** Persist the AUTO_INCREMENT value on a clustered index root page.
+@param[in,out]	block	clustered index root page
+@param[in]	autoinc	next available AUTO_INCREMENT value
+@param[in,out]	mtr	mini-transaction
+@param[in]	reset	whether to reset the AUTO_INCREMENT
+			to a possibly smaller value than currently
+			exists in the page */
+void
+page_set_autoinc(
+	buf_block_t*	block,
+	ib_uint64_t	autoinc,
+	mtr_t*		mtr,
+	bool		reset)
+	MY_ATTRIBUTE((nonnull));
+
+/*************************************************************//**
+Returns the RTREE SPLIT SEQUENCE NUMBER (FIL_RTREE_SPLIT_SEQ_NUM).
+@return SPLIT SEQUENCE NUMBER */
+UNIV_INLINE
+node_seq_t
+page_get_ssn_id(
+/*============*/
+	const page_t*	page);	/*!< in: page */
+/*************************************************************//**
+Sets the RTREE SPLIT SEQUENCE NUMBER field value */
+UNIV_INLINE
+void
+page_set_ssn_id(
+/*============*/
+	buf_block_t*	block,	/*!< in/out: page */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose
+				uncompressed part will be updated, or NULL */
+	node_seq_t	ssn_id,	/*!< in: split sequence id */
+	mtr_t*		mtr);	/*!< in/out: mini-transaction */
+
+#endif /* !UNIV_INNOCHECKSUM */
+/** Read a page header field. */
+inline uint16_t page_header_get_field(const page_t *page, ulint field)
+{
+  ut_ad(field <= PAGE_INDEX_ID);
+  ut_ad(!(field & 1));
+  return mach_read_from_2(my_assume_aligned<2>(PAGE_HEADER + field + page));
+}
+
+#ifndef UNIV_INNOCHECKSUM
+/*************************************************************//**
+Returns the offset stored in the given header field.
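+For example, page_header_get_offs(page, PAGE_FREE) returns the byte offset
+of the head of the free (deleted) record list, or 0 when that list is
+empty; the page_header_get_ptr() macro below wraps this as a pointer.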
+@return offset from the start of the page, or 0 */
+UNIV_INLINE
+uint16_t
+page_header_get_offs(
+/*=================*/
+	const page_t*	page,	/*!< in: page */
+	ulint		field)	/*!< in: PAGE_FREE, ... */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/*************************************************************//**
+Returns the pointer stored in the given header field, or NULL. */
+#define page_header_get_ptr(page, field)		\
+	(page_header_get_offs(page, field)		\
+	 ? page + page_header_get_offs(page, field) : NULL)
+
+/**
+Reset PAGE_LAST_INSERT.
+@param[in,out]  block    file page
+@param[in,out]  mtr      mini-transaction */
+inline void page_header_reset_last_insert(buf_block_t *block, mtr_t *mtr)
+  MY_ATTRIBUTE((nonnull));
+#define page_get_infimum_rec(page) ((page) + page_get_infimum_offset(page))
+#define page_get_supremum_rec(page) ((page) + page_get_supremum_offset(page))
+
+/************************************************************//**
+Returns the nth record of the record list.
+This is the inverse function of page_rec_get_n_recs_before().
+@return nth record
+@retval nullptr on corrupted page */
+const rec_t*
+page_rec_get_nth_const(
+/*===================*/
+	const page_t*	page,	/*!< in: page */
+	ulint		nth)	/*!< in: nth record */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/************************************************************//**
+Returns the nth record of the record list.
+This is the inverse function of page_rec_get_n_recs_before().
+@return nth record
+@retval nullptr on corrupted page */
+inline rec_t *page_rec_get_nth(page_t* page, ulint nth)
+{
+  return const_cast<rec_t*>(page_rec_get_nth_const(page, nth));
+}
+
+/************************************************************//**
+Returns the middle record of the records on the page. If there is an
+even number of records in the list, returns the first record of the
+upper half-list.
+@return middle record */
+UNIV_INLINE
+rec_t*
+page_get_middle_rec(
+/*================*/
+	page_t*	page)	/*!< in: page */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*************************************************************//**
+Gets the page number.
+@return page number */
+UNIV_INLINE
+uint32_t
+page_get_page_no(
+/*=============*/
+	const page_t*	page);	/*!< in: page */
+
+/*************************************************************//**
+Gets the tablespace identifier.
+@return space id */
+UNIV_INLINE
+uint32_t
+page_get_space_id(
+/*==============*/
+	const page_t*	page);	/*!< in: page */
+
+/*************************************************************//**
+Gets the number of user records on page (the infimum and supremum records
+are not user records).
+@return number of user records */
+UNIV_INLINE
+uint16_t
+page_get_n_recs(
+/*============*/
+	const page_t*	page);	/*!< in: index page */
+
+/** Return the number of preceding records in an index page.
+@param rec index record
+@return number of preceding records, including the infimum pseudo-record
+@retval ULINT_UNDEFINED on corrupted page */
+ulint page_rec_get_n_recs_before(const rec_t *rec);
+/*************************************************************//**
+Gets the number of records in the heap.
+@return number of user records */
+UNIV_INLINE
+uint16_t
+page_dir_get_n_heap(
+/*================*/
+	const page_t*	page);	/*!< in: index page */
+/*************************************************************//**
+Gets the number of dir slots in directory.
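+(A freshly created page has exactly two slots, one owning the infimum and
+one the supremum pseudo-record; see PAGE_EMPTY_DIR_START above.)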
+@return number of slots */
+UNIV_INLINE
+uint16_t
+page_dir_get_n_slots(
+/*=================*/
+	const page_t*	page);	/*!< in: index page */
+/** Gets the pointer to a directory slot.
+@param n  sparse directory slot number
+@return pointer to the sparse directory slot */
+inline page_dir_slot_t *page_dir_get_nth_slot(page_t *page, ulint n)
+{
+  ut_ad(page_dir_get_n_slots(page) > n);
+  static_assert(PAGE_DIR_SLOT_SIZE == 2, "compatibility");
+  return my_assume_aligned<2>(page + srv_page_size - (PAGE_DIR + 2) - n * 2);
+}
+inline const page_dir_slot_t *page_dir_get_nth_slot(const page_t *page,ulint n)
+{
+  return page_dir_get_nth_slot(const_cast<page_t*>(page), n);
+}
+/**************************************************************//**
+Used to check the consistency of a record on a page.
+@return TRUE on success */
+UNIV_INLINE
+ibool
+page_rec_check(
+/*===========*/
+	const rec_t*	rec);	/*!< in: record */
+/** Get the record pointed to by a directory slot.
+@param[in] slot   directory slot
+@return pointer to record */
+inline rec_t *page_dir_slot_get_rec(page_dir_slot_t *slot)
+{
+  return page_align(slot) + mach_read_from_2(my_assume_aligned<2>(slot));
+}
+inline const rec_t *page_dir_slot_get_rec(const page_dir_slot_t *slot)
+{
+  return page_dir_slot_get_rec(const_cast<page_dir_slot_t*>(slot));
+}
+
+inline rec_t *page_dir_slot_get_rec_validate(page_dir_slot_t *slot)
+{
+  const size_t s= mach_read_from_2(my_assume_aligned<2>(slot));
+  page_t *page= page_align(slot);
+
+  return UNIV_LIKELY(s >= PAGE_NEW_INFIMUM &&
+                     s <= page_header_get_field(page, PAGE_HEAP_TOP))
+    ? page + s
+    : nullptr;
+}
+inline const rec_t *page_dir_slot_get_rec_validate(const page_dir_slot_t *slot)
+{
+  return page_dir_slot_get_rec_validate(const_cast<page_dir_slot_t*>(slot));
+}
+
+
+/***************************************************************//**
+Gets the number of records owned by a directory slot.
+@return number of records */
+UNIV_INLINE
+ulint
+page_dir_slot_get_n_owned(
+/*======================*/
+	const page_dir_slot_t*	slot);	/*!< in: page directory slot */
+/************************************************************//**
+Calculates the space reserved for directory slots of a given
+number of records. The exact value is a fraction number
+n * PAGE_DIR_SLOT_SIZE / PAGE_DIR_SLOT_MIN_N_OWNED, and it is
+rounded upwards to an integer. */
+UNIV_INLINE
+ulint
+page_dir_calc_reserved_space(
+/*=========================*/
+	ulint	n_recs);	/*!< in: number of records */
+/***************************************************************//**
+Looks for the directory slot which owns the given record.
+@return the directory slot number
+@retval ULINT_UNDEFINED on corruption */
+ulint
+page_dir_find_owner_slot(
+/*=====================*/
+	const rec_t*	rec);	/*!< in: the physical record */
+
+/***************************************************************//**
+Returns the heap number of a record.
+@return heap number */
+UNIV_INLINE
+ulint
+page_rec_get_heap_no(
+/*=================*/
+	const rec_t*	rec);	/*!< in: the physical record */
+/** Determine whether a page has any siblings.
+@param[in]	page	page frame
+@return true if the page has any siblings */
+inline bool page_has_siblings(const page_t* page)
+{
+	compile_time_assert(!(FIL_PAGE_PREV % 8));
+	compile_time_assert(FIL_PAGE_NEXT == FIL_PAGE_PREV + 4);
+	compile_time_assert(FIL_NULL == 0xffffffff);
+	return *reinterpret_cast<const uint64_t*>(page + FIL_PAGE_PREV)
+		!= ~uint64_t(0);
+}
+
+/** Determine whether a page has a predecessor.
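+(page_has_siblings() above folds the two 4-byte checks into a single
+aligned 64-bit load: a page without neighbours stores FIL_NULL =
+0xffffffff in both FIL_PAGE_PREV and FIL_PAGE_NEXT, i.e. all 64 bits set.)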
+@param[in]	page	page frame
+@return true if the page has a predecessor */
+inline bool page_has_prev(const page_t* page)
+{
+	return *reinterpret_cast<const uint32_t*>(page + FIL_PAGE_PREV)
+		!= FIL_NULL;
+}
+
+/** Determine whether a page has a successor.
+@param[in]	page	page frame
+@return true if the page has a successor */
+inline bool page_has_next(const page_t* page)
+{
+	return *reinterpret_cast<const uint32_t*>(page + FIL_PAGE_NEXT)
+		!= FIL_NULL;
+}
+
+/** Read the AUTO_INCREMENT value from a clustered index root page.
+@param[in]	page	clustered index root page
+@return	the persisted AUTO_INCREMENT value */
+MY_ATTRIBUTE((nonnull, warn_unused_result))
+inline uint64_t page_get_autoinc(const page_t *page)
+{
+  ut_d(uint16_t page_type= fil_page_get_type(page));
+  ut_ad(page_type == FIL_PAGE_INDEX || page_type == FIL_PAGE_TYPE_INSTANT);
+  ut_ad(!page_has_siblings(page));
+  const auto *p= my_assume_aligned<8>(page + PAGE_HEADER + PAGE_ROOT_AUTO_INC);
+  return mach_read_from_8(p);
+}
+
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return pointer to next record */
+UNIV_INLINE
+const rec_t*
+page_rec_get_next_low(
+/*==================*/
+	const rec_t*	rec,	/*!< in: pointer to record */
+	ulint		comp);	/*!< in: nonzero=compact page layout */
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return pointer to next record */
+UNIV_INLINE
+rec_t*
+page_rec_get_next(
+/*==============*/
+	rec_t*	rec);	/*!< in: pointer to record */
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return pointer to next record */
+UNIV_INLINE
+const rec_t*
+page_rec_get_next_const(
+/*====================*/
+	const rec_t*	rec);	/*!< in: pointer to record */
+/************************************************************//**
+Gets the pointer to the previous record.
+@return pointer to previous record
+@retval nullptr on error */
+const rec_t*
+page_rec_get_prev_const(
+/*====================*/
+	const rec_t*	rec);	/*!< in: pointer to record, must not be page
+				infimum */
+/************************************************************//**
+Gets the pointer to the previous record.
+@param rec record (not page infimum)
+@return pointer to previous record
+@retval nullptr on error */
+inline rec_t *page_rec_get_prev(rec_t *rec)
+{
+  return const_cast<rec_t*>(page_rec_get_prev_const(rec));
+}
+
+/************************************************************//**
+true if the record is the first user record on a page.
+@return true if the first user record */
+UNIV_INLINE
+bool
+page_rec_is_first(
+/*==============*/
+	const rec_t*	rec,	/*!< in: record */
+	const page_t*	page)	/*!< in: page */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/************************************************************//**
+true if the record is the last user record on a page.
+@return true if the last user record */
+UNIV_INLINE
+bool
+page_rec_is_last(
+/*=============*/
+	const rec_t*	rec,	/*!< in: record */
+	const page_t*	page)	/*!< in: page */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/************************************************************//**
+Returns the maximum combined size of records which can be inserted on top
+of record heap.
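+(Roughly: this counts only the space above PAGE_HEAP_TOP, while the
+_after_reorganize variant below also counts space that a reorganization
+would reclaim from deleted records on the PAGE_FREE list.)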
+@return maximum combined size for inserted records */ +UNIV_INLINE +ulint +page_get_max_insert_size( +/*=====================*/ + const page_t* page, /*!< in: index page */ + ulint n_recs);/*!< in: number of records */ +/************************************************************//** +Returns the maximum combined size of records which can be inserted on top +of record heap if page is first reorganized. +@return maximum combined size for inserted records */ +UNIV_INLINE +ulint +page_get_max_insert_size_after_reorganize( +/*======================================*/ + const page_t* page, /*!< in: index page */ + ulint n_recs);/*!< in: number of records */ +/*************************************************************//** +Calculates free space if a page is emptied. +@return free space */ +UNIV_INLINE +ulint +page_get_free_space_of_empty( +/*=========================*/ + ulint comp) /*!< in: nonzero=compact page format */ + MY_ATTRIBUTE((const)); +/************************************************************//** +Returns the sum of the sizes of the records in the record list +excluding the infimum and supremum records. +@return data in bytes */ +UNIV_INLINE +uint16_t +page_get_data_size( +/*===============*/ + const page_t* page); /*!< in: index page */ +/** Read the PAGE_DIRECTION field from a byte. +@param[in] ptr pointer to PAGE_DIRECTION_B +@return the value of the PAGE_DIRECTION field */ +inline +byte +page_ptr_get_direction(const byte* ptr); + +/** Read the PAGE_DIRECTION field. +@param[in] page index page +@return the value of the PAGE_DIRECTION field */ +inline +byte +page_get_direction(const page_t* page) +{ + return page_ptr_get_direction(PAGE_HEADER + PAGE_DIRECTION_B + page); +} + +/** Read the PAGE_INSTANT field. +@param[in] page index page +@return the value of the PAGE_INSTANT field */ +inline +uint16_t +page_get_instant(const page_t* page); + +/** Create an uncompressed index page. +@param[in,out] block buffer block +@param[in,out] mtr mini-transaction +@param[in] comp set unless ROW_FORMAT=REDUNDANT */ +void page_create(buf_block_t *block, mtr_t *mtr, bool comp); +/**********************************************************//** +Create a compressed B-tree index page. */ +void +page_create_zip( +/*============*/ + buf_block_t* block, /*!< in/out: a buffer frame + where the page is created */ + dict_index_t* index, /*!< in: the index of the + page */ + ulint level, /*!< in: the B-tree level of + the page */ + trx_id_t max_trx_id, /*!< in: PAGE_MAX_TRX_ID */ + mtr_t* mtr); /*!< in/out: mini-transaction + handle */ +/**********************************************************//** +Empty a previously created B-tree index page. */ +void +page_create_empty( +/*==============*/ + buf_block_t* block, /*!< in/out: B-tree block */ + dict_index_t* index, /*!< in: the index of the page */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + MY_ATTRIBUTE((nonnull(1,2))); + +MY_ATTRIBUTE((nonnull, warn_unused_result)) +/*************************************************************//** +Differs from page_copy_rec_list_end, because this function does not +touch the lock table and max trx id on page or compress the page. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_t::commit(). 
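+
+(Caller-side sketch of the IBUF_BITMAP_FREE rule above, assuming new_block
+is a leaf page of a ROW_FORMAT=COMPRESSED secondary index and mtr is the
+same mini-transaction:
+
+  if (page_copy_rec_list_end_no_locks(new_block, block, rec, index, &mtr)
+      == DB_SUCCESS)
+    ibuf_reset_free_bits(new_block);  ... before mtr.commit() ...
+)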
+ +@return error code */ +dberr_t +page_copy_rec_list_end_no_locks( +/*============================*/ + buf_block_t* new_block, /*!< in: index page to copy to */ + buf_block_t* block, /*!< in: index page of rec */ + rec_t* rec, /*!< in: record on page */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr); /*!< in: mtr */ +/*************************************************************//** +Copies records from page to new_page, from the given record onward, +including that record. Infimum and supremum records are not copied. +The records are copied to the start of the record list on new_page. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_t::commit(). + +@return pointer to the original successor of the infimum record on new_block +@retval nullptr on ROW_FORMAT=COMPRESSED page overflow */ +rec_t* +page_copy_rec_list_end( +/*===================*/ + buf_block_t* new_block, /*!< in/out: index page to copy to */ + buf_block_t* block, /*!< in: index page containing rec */ + rec_t* rec, /*!< in: record on page */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + dberr_t* err) /*!< out: error code */ + MY_ATTRIBUTE((nonnull(1,2,3,4,5), warn_unused_result)); +/*************************************************************//** +Copies records from page to new_page, up to the given record, NOT +including that record. Infimum and supremum records are not copied. +The records are copied to the end of the record list on new_page. + +IMPORTANT: The caller will have to update IBUF_BITMAP_FREE +if new_block is a compressed leaf page in a secondary index. +This has to be done either within the same mini-transaction, +or by invoking ibuf_reset_free_bits() before mtr_commit(). + +@return pointer to the original predecessor of the supremum record on new_block +@retval nullptr on ROW_FORMAT=COMPRESSED page overflow */ +rec_t* +page_copy_rec_list_start( +/*=====================*/ + buf_block_t* new_block, /*!< in/out: index page to copy to */ + buf_block_t* block, /*!< in: index page containing rec */ + rec_t* rec, /*!< in: record on page */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + dberr_t* err) /*!< out: error code */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*************************************************************//** +Deletes records from a page from a given record onward, including that record. +The infimum and supremum records are not deleted. */ +dberr_t +page_delete_rec_list_end( +/*=====================*/ + rec_t* rec, /*!< in: pointer to record on page */ + buf_block_t* block, /*!< in: buffer block of the page */ + dict_index_t* index, /*!< in: record descriptor */ + ulint n_recs, /*!< in: number of records to delete, + or ULINT_UNDEFINED if not known */ + ulint size, /*!< in: the sum of the sizes of the + records in the end of the chain to + delete, or ULINT_UNDEFINED if not known */ + mtr_t* mtr) /*!< in: mtr */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*************************************************************//** +Deletes records from page, up to the given record, NOT including +that record. Infimum and supremum records are not deleted. 
*/ +void +page_delete_rec_list_start( +/*=======================*/ + rec_t* rec, /*!< in: record on page */ + buf_block_t* block, /*!< in: buffer block of the page */ + dict_index_t* index, /*!< in: record descriptor */ + mtr_t* mtr) /*!< in: mtr */ + MY_ATTRIBUTE((nonnull)); +/** Create an index page. +@param[in,out] block buffer block +@param[in] comp nonzero=compact page format */ +void page_create_low(const buf_block_t* block, bool comp); + +/************************************************************//** +Prints record contents including the data relevant only in +the index page context. */ +void +page_rec_print( +/*===========*/ + const rec_t* rec, /*!< in: physical record */ + const rec_offs* offsets);/*!< in: record descriptor */ +# ifdef UNIV_BTR_PRINT +/***************************************************************//** +This is used to print the contents of the directory for +debugging purposes. */ +void +page_dir_print( +/*===========*/ + page_t* page, /*!< in: index page */ + ulint pr_n); /*!< in: print n first and n last entries */ +/***************************************************************//** +This is used to print the contents of the page record list for +debugging purposes. */ +void +page_print_list( +/*============*/ + buf_block_t* block, /*!< in: index page */ + dict_index_t* index, /*!< in: dictionary index of the page */ + ulint pr_n); /*!< in: print n first and n last entries */ +/***************************************************************//** +Prints the info in a page header. */ +void +page_header_print( +/*==============*/ + const page_t* page); /*!< in: index page */ +/***************************************************************//** +This is used to print the contents of the page for +debugging purposes. */ +void +page_print( +/*=======*/ + buf_block_t* block, /*!< in: index page */ + dict_index_t* index, /*!< in: dictionary index of the page */ + ulint dn, /*!< in: print dn first and last entries + in directory */ + ulint rn); /*!< in: print rn first and last records + in directory */ +# endif /* UNIV_BTR_PRINT */ +/***************************************************************//** +The following is used to validate a record on a page. This function +differs from rec_validate as it can also check the n_owned field and +the heap_no field. +@return TRUE if ok */ +ibool +page_rec_validate( +/*==============*/ + const rec_t* rec, /*!< in: physical record */ + const rec_offs* offsets);/*!< in: array returned by rec_get_offsets() */ +#ifdef UNIV_DEBUG +/***************************************************************//** +Checks that the first directory slot points to the infimum record and +the last to the supremum. This function is intended to track if the +bug fixed in 4.0.14 has caused corruption to users' databases. */ +void +page_check_dir( +/*===========*/ + const page_t* page); /*!< in: index page */ +#endif /* UNIV_DEBUG */ +/***************************************************************//** +This function checks the consistency of an index page when we do not +know the index. This is also resilient so that this should never crash +even if the page is total garbage. +@return TRUE if ok */ +ibool +page_simple_validate_old( +/*=====================*/ + const page_t* page); /*!< in: index page in ROW_FORMAT=REDUNDANT */ +/***************************************************************//** +This function checks the consistency of an index page when we do not +know the index. 
This is also resilient so that this should never crash +even if the page is total garbage. +@return TRUE if ok */ +ibool +page_simple_validate_new( +/*=====================*/ + const page_t* page); /*!< in: index page in ROW_FORMAT!=REDUNDANT */ +/** Check the consistency of an index page. +@param[in] page index page +@param[in] index B-tree or R-tree index +@return whether the page is valid */ +bool page_validate(const page_t* page, const dict_index_t* index) + MY_ATTRIBUTE((nonnull)); +/***************************************************************//** +Looks in the page record list for a record with the given heap number. +@return record, NULL if not found */ +const rec_t* +page_find_rec_with_heap_no( +/*=======================*/ + const page_t* page, /*!< in: index page */ + ulint heap_no);/*!< in: heap number */ +/** Get the last non-delete-marked record on a page. +@param[in] page index tree leaf page +@return the last record, not delete-marked +@retval infimum record if all records are delete-marked */ +const rec_t *page_find_rec_last_not_deleted(const page_t *page); + +#endif /* !UNIV_INNOCHECKSUM */ + +#include "page0page.inl" + +#endif diff --git a/storage/innobase/include/page0page.inl b/storage/innobase/include/page0page.inl new file mode 100644 index 00000000..6c0167ed --- /dev/null +++ b/storage/innobase/include/page0page.inl @@ -0,0 +1,550 @@ +/***************************************************************************** + +Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2016, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/page0page.ic +Index page routines + +Created 2/2/1994 Heikki Tuuri +*******************************************************/ + +#ifndef UNIV_INNOCHECKSUM +#include "rem0cmp.h" +#include "mtr0log.h" +#include "page0zip.h" + +/*************************************************************//** +Sets the max trx id field value if trx_id is bigger than the previous +value. */ +UNIV_INLINE +void +page_update_max_trx_id( +/*===================*/ + buf_block_t* block, /*!< in/out: page */ + page_zip_des_t* page_zip,/*!< in/out: compressed page whose + uncompressed part will be updated, or NULL */ + trx_id_t trx_id, /*!< in: transaction id */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ut_ad(block); + ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX)); + ut_ad(trx_id); + ut_ad(page_is_leaf(buf_block_get_frame(block))); + + if (page_get_max_trx_id(buf_block_get_frame(block)) < trx_id) { + + page_set_max_trx_id(block, page_zip, trx_id, mtr); + } +} + +/*************************************************************//** +Returns the RTREE SPLIT SEQUENCE NUMBER (FIL_RTREE_SPLIT_SEQ_NUM). 
+@return SPLIT SEQUENCE NUMBER */
+UNIV_INLINE
+node_seq_t
+page_get_ssn_id(
+/*============*/
+	const page_t*	page)	/*!< in: page */
+{
+	ut_ad(page);
+
+	return(static_cast<node_seq_t>(
+		mach_read_from_8(page + FIL_RTREE_SPLIT_SEQ_NUM)));
+}
+
+/*************************************************************//**
+Sets the RTREE SPLIT SEQUENCE NUMBER field value */
+UNIV_INLINE
+void
+page_set_ssn_id(
+/*============*/
+	buf_block_t*	block,	/*!< in/out: page */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose
+				uncompressed part will be updated, or NULL */
+	node_seq_t	ssn_id,	/*!< in: split sequence id */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+{
+	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_SX_FIX |
+					 MTR_MEMO_PAGE_X_FIX));
+	ut_ad(!page_zip || page_zip == &block->page.zip);
+	constexpr uint16_t field= FIL_RTREE_SPLIT_SEQ_NUM;
+	byte *b= my_assume_aligned<2>(&block->page.frame[field]);
+	if (mtr->write<8,mtr_t::MAYBE_NOP>(*block, b, ssn_id) &&
+	    UNIV_LIKELY_NULL(page_zip))
+		memcpy_aligned<2>(&page_zip->data[field], b, 8);
+}
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+#ifndef UNIV_INNOCHECKSUM
+/*************************************************************//**
+Returns the offset stored in the given header field.
+@return offset from the start of the page, or 0 */
+UNIV_INLINE
+uint16_t
+page_header_get_offs(
+/*=================*/
+	const page_t*	page,	/*!< in: page */
+	ulint		field)	/*!< in: PAGE_FREE, ... */
+{
+	ut_ad((field == PAGE_FREE)
+	      || (field == PAGE_LAST_INSERT)
+	      || (field == PAGE_HEAP_TOP));
+
+	uint16_t offs = page_header_get_field(page, field);
+
+	ut_ad((field != PAGE_HEAP_TOP) || offs);
+
+	return(offs);
+}
+
+
+/**
+Reset PAGE_LAST_INSERT.
+@param[in,out]  block    file page
+@param[in,out]  mtr      mini-transaction */
+inline void page_header_reset_last_insert(buf_block_t *block, mtr_t *mtr)
+{
+  constexpr uint16_t field= PAGE_HEADER + PAGE_LAST_INSERT;
+  byte *b= my_assume_aligned<2>(&block->page.frame[field]);
+  if (mtr->write<2,mtr_t::MAYBE_NOP>(*block, b, 0U) &&
+      UNIV_LIKELY_NULL(block->page.zip.data))
+    memset_aligned<2>(&block->page.zip.data[field], 0, 2);
+}
+
+/***************************************************************//**
+Returns the heap number of a record.
+@return heap number */
+UNIV_INLINE
+ulint
+page_rec_get_heap_no(
+/*=================*/
+	const rec_t*	rec)	/*!< in: the physical record */
+{
+	if (page_rec_is_comp(rec)) {
+		return(rec_get_heap_no_new(rec));
+	} else {
+		return(rec_get_heap_no_old(rec));
+	}
+}
+
+/** Determine whether an index page record is a user record.
+@param[in]	rec	record in an index page
+@return true if a user record */
+inline
+bool
+page_rec_is_user_rec(const rec_t* rec)
+{
+	ut_ad(page_rec_check(rec));
+	return(page_rec_is_user_rec_low(page_offset(rec)));
+}
+
+/** Determine whether an index page record is the supremum record.
+@param[in]	rec	record in an index page
+@return true if the supremum record */
+inline
+bool
+page_rec_is_supremum(const rec_t* rec)
+{
+	ut_ad(page_rec_check(rec));
+	return(page_rec_is_supremum_low(page_offset(rec)));
+}
+
+/** Determine whether an index page record is the infimum record.
+@param[in]	rec	record in an index page
+@return true if the infimum record */
+inline
+bool
+page_rec_is_infimum(const rec_t* rec)
+{
+	ut_ad(page_rec_check(rec));
+	return(page_rec_is_infimum_low(page_offset(rec)));
+}
+
+/************************************************************//**
+true if the record is the first user record on a page.
+
+/************************************************************//**
+true if the record is the first user record on a page.
+@return true if the first user record */
+UNIV_INLINE
+bool
+page_rec_is_first(
+/*==============*/
+	const rec_t*	rec,	/*!< in: record */
+	const page_t*	page)	/*!< in: page */
+{
+	ut_ad(page_get_n_recs(page) > 0);
+
+	return(page_rec_get_next_const(page_get_infimum_rec(page)) == rec);
+}
+
+/************************************************************//**
+true if the record is the last user record on a page.
+@return true if the last user record */
+UNIV_INLINE
+bool
+page_rec_is_last(
+/*=============*/
+	const rec_t*	rec,	/*!< in: record */
+	const page_t*	page)	/*!< in: page */
+{
+	ut_ad(page_get_n_recs(page) > 0);
+
+	return(page_rec_get_next_const(rec) == page_get_supremum_rec(page));
+}
+
+/************************************************************//**
+Returns the middle record of the records on the page. If there is an
+even number of records in the list, returns the first record of the
+upper half-list.
+@return middle record */
+UNIV_INLINE
+rec_t*
+page_get_middle_rec(
+/*================*/
+	page_t*	page)	/*!< in: page */
+{
+	ulint	middle = (ulint(page_get_n_recs(page))
+			  + PAGE_HEAP_NO_USER_LOW) / 2;
+
+	return(page_rec_get_nth(page, middle));
+}
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/*************************************************************//**
+Gets the page number.
+@return page number */
+UNIV_INLINE
+uint32_t
+page_get_page_no(
+/*=============*/
+	const page_t*	page)	/*!< in: page */
+{
+	ut_ad(page == page_align((page_t*) page));
+	return mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_OFFSET));
+}
+
+#ifndef UNIV_INNOCHECKSUM
+/*************************************************************//**
+Gets the tablespace identifier.
+@return space id */
+UNIV_INLINE
+uint32_t
+page_get_space_id(
+/*==============*/
+	const page_t*	page)	/*!< in: page */
+{
+	ut_ad(page == page_align((page_t*) page));
+	return mach_read_from_4(my_assume_aligned<2>
+				(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID));
+}
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/*************************************************************//**
+Gets the number of user records on page (infimum and supremum records
+are not user records).
+@return number of user records */
+UNIV_INLINE
+uint16_t
+page_get_n_recs(
+/*============*/
+	const page_t*	page)	/*!< in: index page */
+{
+	return(page_header_get_field(page, PAGE_N_RECS));
+}
+
+#ifndef UNIV_INNOCHECKSUM
+/*************************************************************//**
+Gets the number of dir slots in directory.
+@return number of slots */
+UNIV_INLINE
+uint16_t
+page_dir_get_n_slots(
+/*=================*/
+	const page_t*	page)	/*!< in: index page */
+{
+	return(page_header_get_field(page, PAGE_N_DIR_SLOTS));
+}
+
+/*************************************************************//**
+Gets the number of records in the heap.
+@return number of records in the record heap, including the infimum
+and supremum records and any deleted (free-list) records */
+UNIV_INLINE
+uint16_t
+page_dir_get_n_heap(
+/*================*/
+	const page_t*	page)	/*!< in: index page */
+{
+	return(page_header_get_field(page, PAGE_N_HEAP) & 0x7fff);
+}
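A small consistency sketch (editorial; it holds only under the stated assumption): the record heap counts the infimum and supremum (PAGE_HEAP_NO_USER_LOW == 2) as well as any free-list records, so a page whose free list is empty satisfies:

    /* assuming PAGE_FREE == 0, i.e. no deleted records in the heap */
    ut_ad(ulint(page_dir_get_n_heap(page))
          == ulint(page_get_n_recs(page)) + PAGE_HEAP_NO_USER_LOW);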
+
+/**************************************************************//**
+Used to check the consistency of a record on a page.
+@return TRUE if the check succeeds */
+UNIV_INLINE
+ibool
+page_rec_check(
+/*===========*/
+	const rec_t*	rec)	/*!< in: record */
+{
+	const page_t*	page = page_align(rec);
+
+	ut_a(rec);
+
+	ut_a(page_offset(rec) <= page_header_get_field(page, PAGE_HEAP_TOP));
+	ut_a(page_offset(rec) >= PAGE_DATA);
+
+	return(TRUE);
+}
+
+/***************************************************************//**
+Gets the number of records owned by a directory slot.
+@return number of records */
+UNIV_INLINE
+ulint
+page_dir_slot_get_n_owned(
+/*======================*/
+	const page_dir_slot_t*	slot)	/*!< in: page directory slot */
+{
+	const rec_t*	rec = page_dir_slot_get_rec(slot);
+	if (page_rec_is_comp(slot)) {
+		return(rec_get_n_owned_new(rec));
+	} else {
+		return(rec_get_n_owned_old(rec));
+	}
+}
+
+/************************************************************//**
+Calculates the space reserved for directory slots of a given number of
+records. The exact value is a fraction number n * PAGE_DIR_SLOT_SIZE /
+PAGE_DIR_SLOT_MIN_N_OWNED, and it is rounded upwards to an integer. */
+UNIV_INLINE
+ulint
+page_dir_calc_reserved_space(
+/*=========================*/
+	ulint	n_recs)	/*!< in: number of records */
+{
+	return((PAGE_DIR_SLOT_SIZE * n_recs + PAGE_DIR_SLOT_MIN_N_OWNED - 1)
+	       / PAGE_DIR_SLOT_MIN_N_OWNED);
+}
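A worked example of the rounding (editorial; PAGE_DIR_SLOT_SIZE == 2 and PAGE_DIR_SLOT_MIN_N_OWNED == 4 in current sources):

    /* n_recs = 100: (2 * 100 + 3) / 4 = 50 bytes reserved
       n_recs = 101: (2 * 101 + 3) / 4 = 51 bytes; the
       '+ PAGE_DIR_SLOT_MIN_N_OWNED - 1' term implements the round-up. */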
+
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return pointer to next record */
+UNIV_INLINE
+const rec_t*
+page_rec_get_next_low(
+/*==================*/
+	const rec_t*	rec,	/*!< in: pointer to record */
+	ulint		comp)	/*!< in: nonzero=compact page layout */
+{
+  const page_t *page= page_align(rec);
+  ut_ad(page_rec_check(rec));
+  ulint offs= rec_get_next_offs(rec, comp);
+  if (!offs)
+    return nullptr;
+  if (UNIV_UNLIKELY(offs < (comp ? PAGE_NEW_SUPREMUM : PAGE_OLD_SUPREMUM)))
+    return nullptr;
+  if (UNIV_UNLIKELY(offs > page_header_get_field(page, PAGE_HEAP_TOP)))
+    return nullptr;
+  ut_ad(page_rec_is_infimum(rec) ||
+        (!page_is_leaf(page) && !page_has_prev(page)) ||
+        !(rec_get_info_bits(page + offs, comp) & REC_INFO_MIN_REC_FLAG));
+  return page + offs;
+}
+
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return pointer to next record */
+UNIV_INLINE
+rec_t*
+page_rec_get_next(
+/*==============*/
+	rec_t*	rec)	/*!< in: pointer to record */
+{
+	return((rec_t*) page_rec_get_next_low(rec, page_rec_is_comp(rec)));
+}
+
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return pointer to next record */
+UNIV_INLINE
+const rec_t*
+page_rec_get_next_const(
+/*====================*/
+	const rec_t*	rec)	/*!< in: pointer to record */
+{
+	return(page_rec_get_next_low(rec, page_rec_is_comp(rec)));
+}
+#endif /* !UNIV_INNOCHECKSUM */
+
+/************************************************************//**
+Returns the sum of the sizes of the records in the record list, excluding
+the infimum and supremum records.
+@return data in bytes */
+UNIV_INLINE
+uint16_t
+page_get_data_size(
+/*===============*/
+	const page_t*	page)	/*!< in: index page */
+{
+	unsigned ret = page_header_get_field(page, PAGE_HEAP_TOP)
+		- (page_is_comp(page)
+		   ? PAGE_NEW_SUPREMUM_END
+		   : PAGE_OLD_SUPREMUM_END)
+		- page_header_get_field(page, PAGE_GARBAGE);
+	ut_ad(ret < srv_page_size);
+	return static_cast<uint16_t>(ret);
+}
+
+#ifndef UNIV_INNOCHECKSUM
+/*************************************************************//**
+Calculates free space if a page is emptied.
+@return free space */
+UNIV_INLINE
+ulint
+page_get_free_space_of_empty(
+/*=========================*/
+	ulint	comp)	/*!< in: nonzero=compact page layout */
+{
+	if (comp) {
+		return((ulint)(srv_page_size
+			       - PAGE_NEW_SUPREMUM_END
+			       - PAGE_DIR
+			       - 2 * PAGE_DIR_SLOT_SIZE));
+	}
+
+	return((ulint)(srv_page_size
+		       - PAGE_OLD_SUPREMUM_END
+		       - PAGE_DIR
+		       - 2 * PAGE_DIR_SLOT_SIZE));
+}
+
+/************************************************************//**
+Each user record on a page, and also each deleted user record in the heap,
+takes its size plus the fraction of the dir cell size /
+PAGE_DIR_SLOT_MIN_N_OWNED bytes for it. If the sum of these exceeds the
+value of page_get_free_space_of_empty, the insert is impossible, otherwise
+it is allowed. This function returns the maximum combined size of records
+which can be inserted on top of the record heap.
+@return maximum combined size for inserted records */
+UNIV_INLINE
+ulint
+page_get_max_insert_size(
+/*=====================*/
+	const page_t*	page,	/*!< in: index page */
+	ulint		n_recs)	/*!< in: number of records */
+{
+	ulint	occupied;
+	ulint	free_space;
+
+	if (page_is_comp(page)) {
+		occupied = page_header_get_field(page, PAGE_HEAP_TOP)
+			- PAGE_NEW_SUPREMUM_END
+			+ page_dir_calc_reserved_space(
+				n_recs + page_dir_get_n_heap(page) - 2);
+
+		free_space = page_get_free_space_of_empty(TRUE);
+	} else {
+		occupied = page_header_get_field(page, PAGE_HEAP_TOP)
+			- PAGE_OLD_SUPREMUM_END
+			+ page_dir_calc_reserved_space(
+				n_recs + page_dir_get_n_heap(page) - 2);
+
+		free_space = page_get_free_space_of_empty(FALSE);
+	}
+
+	/* Above, the 'n_recs +' part reserves directory space for the
+	newly inserted records; the '- 2' excludes the page infimum and
+	supremum records */
+
+	if (occupied > free_space) {
+
+		return(0);
+	}
+
+	return(free_space - occupied);
+}
+
+/************************************************************//**
+Returns the maximum combined size of records which can be inserted on top
+of the record heap if a page is first reorganized.
+@return maximum combined size for inserted records */
+UNIV_INLINE
+ulint
+page_get_max_insert_size_after_reorganize(
+/*======================================*/
+	const page_t*	page,	/*!< in: index page */
+	ulint		n_recs)	/*!< in: number of records */
+{
+	ulint	occupied;
+	ulint	free_space;
+
+	occupied = page_get_data_size(page)
+		+ page_dir_calc_reserved_space(n_recs + page_get_n_recs(page));
+
+	free_space = page_get_free_space_of_empty(page_is_comp(page));
+
+	if (occupied > free_space) {
+
+		return(0);
+	}
+
+	return(free_space - occupied);
+}
+
+/** Read the PAGE_DIRECTION field from a byte.
+@param[in]	ptr	pointer to PAGE_DIRECTION_B
+@return the value of the PAGE_DIRECTION field */
+inline
+byte
+page_ptr_get_direction(const byte* ptr)
+{
+	ut_ad(page_offset(ptr) == PAGE_HEADER + PAGE_DIRECTION_B);
+	return *ptr & ((1U << 3) - 1);
+}
+
+/** Read the PAGE_INSTANT field.
+@param[in]	page	index page
+@return the value of the PAGE_INSTANT field */
+inline
+uint16_t
+page_get_instant(const page_t* page)
+{
+	uint16_t i = page_header_get_field(page, PAGE_INSTANT);
+#ifdef UNIV_DEBUG
+	switch (fil_page_get_type(page)) {
+	case FIL_PAGE_TYPE_INSTANT:
+		ut_ad(page_get_direction(page) <= PAGE_NO_DIRECTION);
+		ut_ad(i >> 3);
+		break;
+	case FIL_PAGE_INDEX:
+		ut_ad(i <= PAGE_NO_DIRECTION || !page_is_comp(page));
+		break;
+	case FIL_PAGE_RTREE:
+		ut_ad(i <= PAGE_NO_DIRECTION);
+		break;
+	default:
+		ut_ad("invalid page type" == 0);
+		break;
+	}
+#endif /* UNIV_DEBUG */
+	return static_cast<uint16_t>(i >> 3); /* i / 8 */
+}
+#endif /* !UNIV_INNOCHECKSUM */
diff --git a/storage/innobase/include/page0types.h b/storage/innobase/include/page0types.h
new file mode 100644
index 00000000..83fc45cd
--- /dev/null
+++ b/storage/innobase/include/page0types.h
@@ -0,0 +1,188 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2019, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/page0types.h
+Index page routines
+
+Created 2/2/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef page0types_h
+#define page0types_h
+
+#include "dict0types.h"
+#include "mtr0types.h"
+#include "rem0types.h"
+#include "ut0new.h"
+
+#include <map>
+
+/** Eliminates a name collision on HP-UX */
+#define page_t	ib_page_t
+/** Type of the index page */
+typedef byte	page_t;
+#ifndef UNIV_INNOCHECKSUM
+/** Index page cursor */
+struct page_cur_t;
+/** Buffer pool block */
+struct buf_block_t;
+
+/** Compressed index page */
+typedef byte	page_zip_t;
+
+/* The following definitions would better belong to page0zip.h,
+but we cannot include page0zip.h from rem0rec.ic, because
+page0*.h includes rem0rec.h and may include rem0rec.ic. */
+
+/** Number of bits needed for representing different compressed page sizes */
+#define PAGE_ZIP_SSIZE_BITS 3
+
+/** Maximum compressed page shift size */
+#define PAGE_ZIP_SSIZE_MAX	\
+	(UNIV_ZIP_SIZE_SHIFT_MAX - UNIV_ZIP_SIZE_SHIFT_MIN + 1)
+
+/* Make sure there are enough bits available to store the maximum zip
+ssize, which is the number of shifts from 512. */
+#if PAGE_ZIP_SSIZE_MAX >= (1 << PAGE_ZIP_SSIZE_BITS)
+# error "PAGE_ZIP_SSIZE_MAX >= (1 << PAGE_ZIP_SSIZE_BITS)"
+#endif
+
+/* Page cursor search modes; the values must be in this order! */
+enum page_cur_mode_t {
+	PAGE_CUR_UNSUPP	= 0,
+	PAGE_CUR_G	= 1,
+	PAGE_CUR_GE	= 2,
+	PAGE_CUR_L	= 3,
+	PAGE_CUR_LE	= 4,
+
+/*	PAGE_CUR_LE_OR_EXTENDS = 5,*/ /* This is a search mode used in
+				"column LIKE 'abc%' ORDER BY column DESC";
+				we have to find strings which are <= 'abc' or
+				which extend it */
+
+/* These search modes are for searching R-tree indexes. */
+	PAGE_CUR_CONTAIN		= 7,
+	PAGE_CUR_INTERSECT		= 8,
+	PAGE_CUR_WITHIN			= 9,
+	PAGE_CUR_DISJOINT		= 10,
+	PAGE_CUR_MBR_EQUAL		= 11,
+	PAGE_CUR_RTREE_INSERT		= 12,
+	PAGE_CUR_RTREE_LOCATE		= 13,
+	PAGE_CUR_RTREE_GET_FATHER	= 14
+};
+
+class buf_pool_t;
+class buf_page_t;
+
+/** Compressed page descriptor */
+struct page_zip_des_t
+{
+	page_zip_t*	data;		/*!< compressed page data */
+
+	uint32_t	m_end:16;	/*!< end offset of modification log */
+	uint32_t	m_nonempty:1;	/*!< TRUE if the modification log
+					is not empty */
+	uint32_t	n_blobs:12;	/*!< number of externally stored
+					columns on the page; the maximum
+					is 744 on a 16 KiB page */
+	uint32_t	ssize:PAGE_ZIP_SSIZE_BITS;
+					/*!< 0 or compressed page shift size;
+					the size in bytes is
+					(UNIV_ZIP_SIZE_MIN >> 1) << ssize. */
+#ifdef UNIV_DEBUG
+	uint16_t	m_start;	/*!< start offset of modification log */
+	bool		m_external;	/*!< Allocated externally, not from the
+					buffer pool */
+#endif /* UNIV_DEBUG */
+
+	void clear() {
+		/* Clear everything except the member "fix". */
+		memset((void*) this, 0,
+		       reinterpret_cast<char*>(&fix)
+		       - reinterpret_cast<char*>(this));
+	}
+
+	page_zip_des_t() = default;
+	page_zip_des_t(const page_zip_des_t&) = default;
+
+	/* Initialize everything except the member "fix". */
+	page_zip_des_t(const page_zip_des_t& old, bool) {
+		memcpy((void*) this, (void*) &old,
+		       reinterpret_cast<char*>(&fix)
+		       - reinterpret_cast<char*>(this));
+	}
+
+private:
+	friend buf_pool_t;
+	friend buf_page_t;
+	/** fix count and state used in buf_page_t */
+	Atomic_relaxed<uint32_t> fix;
+};
+
+/** Compression statistics for a given page size */
+struct page_zip_stat_t {
+	/** Number of page compressions */
+	ulint		compressed;
+	/** Number of successful page compressions */
+	ulint		compressed_ok;
+	/** Number of page decompressions */
+	ulint		decompressed;
+	/** Duration of page compressions in microseconds */
+	ib_uint64_t	compressed_usec;
+	/** Duration of page decompressions in microseconds */
+	ib_uint64_t	decompressed_usec;
+	page_zip_stat_t() :
+		/* Initialize members to 0 so that when we do
+		stlmap[key].compressed++ and element with "key" does not
+		exist it gets inserted with zeroed members. */
+		compressed(0),
+		compressed_ok(0),
+		decompressed(0),
+		compressed_usec(0),
+		decompressed_usec(0)
+	{ }
+};
+
+/** Compression statistics types */
+typedef std::map<
+	index_id_t,
+	page_zip_stat_t,
+	std::less<index_id_t>,
+	ut_allocator<std::pair<const index_id_t, page_zip_stat_t> > >
+	page_zip_stat_per_index_t;
+
+/** Statistics on compression, indexed by page_zip_des_t::ssize - 1 */
+extern page_zip_stat_t			page_zip_stat[PAGE_ZIP_SSIZE_MAX];
+/** Statistics on compression, indexed by dict_index_t::id */
+extern page_zip_stat_per_index_t	page_zip_stat_per_index;
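A worked example of the ssize encoding in page_zip_des_t above (editorial; UNIV_ZIP_SIZE_MIN is 1024 in current sources):

    /* size in bytes = (UNIV_ZIP_SIZE_MIN >> 1) << ssize
       ssize == 0 -> not a compressed page
       ssize == 1 -> (1024 >> 1) << 1 =  1024 bytes (1 KiB)
       ssize == 2 ->                     2048 bytes (2 KiB)
       ssize == 5 ->                    16384 bytes (16 KiB) */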
+
+/**********************************************************************//**
+Write the "owned" flag of a record on a compressed page. The n_owned field
+must already have been written on the uncompressed page. */
+void
+page_zip_rec_set_owned(
+/*===================*/
+	buf_block_t*	block,	/*!< in/out: ROW_FORMAT=COMPRESSED page */
+	const byte*	rec,	/*!< in: record on the uncompressed page */
+	ulint		flag,	/*!< in: the owned flag (nonzero=TRUE) */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull));
+#endif /* !UNIV_INNOCHECKSUM */
+#endif
diff --git a/storage/innobase/include/page0zip.h b/storage/innobase/include/page0zip.h
new file mode 100644
index 00000000..43329906
--- /dev/null
+++ b/storage/innobase/include/page0zip.h
@@ -0,0 +1,383 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/page0zip.h
+Compressed page interface
+
+Created June 2005 by Marko Makela
+*******************************************************/
+
+#ifndef page0zip_h
+#define page0zip_h
+
+#include "buf0types.h"
+
+#ifndef UNIV_INNOCHECKSUM
+#include "mtr0types.h"
+#include "page0types.h"
+#include "dict0types.h"
+#include "srv0srv.h"
+#include "trx0types.h"
+#include "mem0mem.h"
+
+/* Compression level to be used by zlib. Settable by user. */
+extern uint	page_zip_level;
+
+/* Default compression level. */
+#define DEFAULT_COMPRESSION_LEVEL	6
+/** Start offset of the area that will be compressed */
+#define PAGE_ZIP_START		PAGE_NEW_SUPREMUM_END
+/** Size of a compressed page directory entry */
+#define PAGE_ZIP_DIR_SLOT_SIZE	2
+/** Predefine the sum of DIR_SLOT, TRX_ID & ROLL_PTR */
+#define PAGE_ZIP_CLUST_LEAF_SLOT_SIZE	\
+	(PAGE_ZIP_DIR_SLOT_SIZE		\
+	 + DATA_TRX_ID_LEN		\
+	 + DATA_ROLL_PTR_LEN)
+/** Mask of record offsets */
+#define PAGE_ZIP_DIR_SLOT_MASK	0x3fffU
+/** 'owned' flag */
+#define PAGE_ZIP_DIR_SLOT_OWNED	0x4000U
+/** 'deleted' flag */
+#define PAGE_ZIP_DIR_SLOT_DEL	0x8000U
+
+/**********************************************************************//**
+Determine the size of a compressed page in bytes.
+@return size in bytes */
+UNIV_INLINE
+ulint
+page_zip_get_size(
+/*==============*/
+	const page_zip_des_t*	page_zip)	/*!< in: compressed page */
+	MY_ATTRIBUTE((warn_unused_result));
+/**********************************************************************//**
+Set the size of a compressed page in bytes. */
+UNIV_INLINE
+void
+page_zip_set_size(
+/*==============*/
+	page_zip_des_t*	page_zip,	/*!< in/out: compressed page */
+	ulint		size);		/*!< in: size in bytes */
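An editorial sketch of how one 2-byte entry of the dense page directory decodes under the masks defined above ('slot' is an illustrative pointer into the page trailer):

    const uint16_t entry   = mach_read_from_2(slot);
    const uint16_t offs    = entry & PAGE_ZIP_DIR_SLOT_MASK;  /* record offset */
    const bool     owned   = (entry & PAGE_ZIP_DIR_SLOT_OWNED) != 0;
    const bool     deleted = (entry & PAGE_ZIP_DIR_SLOT_DEL) != 0;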
+
+/** Determine if a record is so big that it needs to be stored externally.
+@param[in]	rec_size	length of the record in bytes
+@param[in]	comp		nonzero=compact format
+@param[in]	n_fields	number of fields in the record; ignored if
+tablespace is not compressed
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@return false if the entire record can be stored locally on the page */
+inline bool page_zip_rec_needs_ext(ulint rec_size, ulint comp, ulint n_fields,
+				    ulint zip_size)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/**********************************************************************//**
+Determine the guaranteed free space on an empty page.
+@return minimum payload size on the page */
+ulint
+page_zip_empty_size(
+/*================*/
+	ulint	n_fields,	/*!< in: number of columns in the index */
+	ulint	zip_size)	/*!< in: compressed page size in bytes */
+	MY_ATTRIBUTE((const));
+
+/** Check whether a tuple is too big for compressed table
+@param[in]	index	dict index object
+@param[in]	entry	entry for the index
+@return true if it's too big, otherwise false */
+bool
+page_zip_is_too_big(
+	const dict_index_t*	index,
+	const dtuple_t*		entry);
+
+/**********************************************************************//**
+Initialize a compressed page descriptor. */
+#define page_zip_des_init(page_zip) (page_zip)->clear()
+
+/**********************************************************************//**
+Configure the zlib allocator to use the given memory heap. */
+void
+page_zip_set_alloc(
+/*===============*/
+	void*		stream,		/*!< in/out: zlib stream */
+	mem_heap_t*	heap);		/*!< in: memory heap to use */
+
+/** Attempt to compress a ROW_FORMAT=COMPRESSED page.
+@retval true on success
+@retval false on failure; block->page.zip will be left intact. */
+bool
+page_zip_compress(
+	buf_block_t*	block,	/*!< in/out: buffer block */
+	dict_index_t*	index,	/*!< in: index of the B-tree node */
+	ulint		level,	/*!< in: compression level */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull));
+
+/**********************************************************************//**
+Write the index information for the compressed page.
+@return used size of buf */
+ulint
+page_zip_fields_encode(
+/*===================*/
+	ulint			n,	/*!< in: number of fields
+					to compress */
+	const dict_index_t*	index,	/*!< in: index comprising
+					at least n fields */
+	ulint			trx_id_pos,
+					/*!< in: position of the trx_id column
+					in the index, or ULINT_UNDEFINED if
+					this is a non-leaf page */
+	byte*			buf);	/*!< out: buffer of (n + 1) * 2 bytes */
+
+/**********************************************************************//**
+Decompress a page. This function should tolerate errors on the compressed
+page. Instead of letting assertions fail, it will return FALSE if an
+inconsistency is detected.
+@return TRUE on success, FALSE on failure */
+ibool
+page_zip_decompress(
+/*================*/
+	page_zip_des_t*	page_zip,/*!< in: data, ssize;
+				out: m_start, m_end, m_nonempty, n_blobs */
+	page_t*		page,	/*!< out: uncompressed page, may be trashed */
+	ibool		all)	/*!< in: TRUE=decompress the whole page;
+				FALSE=verify but do not copy some
+				page header fields that should not change
+				after page creation */
+	MY_ATTRIBUTE((nonnull(1,2)));
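An editorial sketch of the failure contract of page_zip_compress() above (cf. the B-tree insert path; variable names illustrative): because block->page.zip survives a failed compression, the caller can fall back without undoing anything:

    if (!page_zip_compress(block, index, page_zip_level, mtr)) {
        /* out of space in compressed format: try page_zip_reorganize()
        or split the page */
    }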
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Validate a compressed page descriptor.
+@return TRUE if ok */
+UNIV_INLINE
+ibool
+page_zip_simple_validate(
+/*=====================*/
+	const page_zip_des_t*	page_zip);	/*!< in: compressed page
+						descriptor */
+#endif /* UNIV_DEBUG */
+
+#ifdef UNIV_ZIP_DEBUG
+/**********************************************************************//**
+Check that the compressed and decompressed pages match.
+@return TRUE if valid, FALSE if not */
+ibool
+page_zip_validate_low(
+/*==================*/
+	const page_zip_des_t*	page_zip,/*!< in: compressed page */
+	const page_t*		page,	/*!< in: uncompressed page */
+	const dict_index_t*	index,	/*!< in: index of the page, if known */
+	ibool			sloppy)	/*!< in: FALSE=strict,
+					TRUE=ignore the MIN_REC_FLAG */
+	MY_ATTRIBUTE((nonnull(1,2)));
+/**********************************************************************//**
+Check that the compressed and decompressed pages match. */
+ibool
+page_zip_validate(
+/*==============*/
+	const page_zip_des_t*	page_zip,/*!< in: compressed page */
+	const page_t*		page,	/*!< in: uncompressed page */
+	const dict_index_t*	index)	/*!< in: index of the page, if known */
+	MY_ATTRIBUTE((nonnull(1,2)));
+#endif /* UNIV_ZIP_DEBUG */
+
+/**********************************************************************//**
+Determine how big a record can be inserted without recompressing the page.
+@return a positive number indicating the maximum size of a record
+whose insertion is guaranteed to succeed, or zero or negative */
+UNIV_INLINE
+lint
+page_zip_max_ins_size(
+/*==================*/
+	const page_zip_des_t*	page_zip,/*!< in: compressed page */
+	ibool			is_clust)/*!< in: TRUE if clustered index */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/**********************************************************************//**
+Determine if enough space is available in the modification log.
+@return TRUE if page_zip_write_rec() will succeed */
+UNIV_INLINE
+ibool
+page_zip_available(
+/*===============*/
+	const page_zip_des_t*	page_zip,/*!< in: compressed page */
+	ibool			is_clust,/*!< in: TRUE if clustered index */
+	ulint			length,	/*!< in: combined size of the record */
+	ulint			create)	/*!< in: nonzero=add the record to
+					the heap */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Write an entire record to the ROW_FORMAT=COMPRESSED page.
+The data must already have been written to the uncompressed page.
+@param[in,out]	block	ROW_FORMAT=COMPRESSED page
+@param[in]	rec	record in the uncompressed page
+@param[in]	index	the index that the page belongs to
+@param[in]	offsets	rec_get_offsets(rec, index)
+@param[in]	create	nonzero=insert, zero=update
+@param[in,out]	mtr	mini-transaction */
+void page_zip_write_rec(buf_block_t *block, const byte *rec,
+			const dict_index_t *index, const rec_offs *offsets,
+			ulint create, mtr_t *mtr)
+	MY_ATTRIBUTE((nonnull));
+
+/**********************************************************************//**
+Write a BLOB pointer of a record on the leaf page of a clustered index.
+The information must already have been updated on the uncompressed page. */
+void
+page_zip_write_blob_ptr(
+/*====================*/
+	buf_block_t*	block,	/*!< in/out: ROW_FORMAT=COMPRESSED page */
+	const byte*	rec,	/*!< in/out: record whose data is being
+				written */
+	dict_index_t*	index,	/*!< in: index of the page */
+	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec, index) */
+	ulint		n,	/*!< in: column index */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull));
+
+/**********************************************************************//**
+Write the node pointer of a record on a non-leaf compressed page. */
+void
+page_zip_write_node_ptr(
+/*====================*/
+	buf_block_t*	block,	/*!< in/out: compressed page */
+	byte*		rec,	/*!< in/out: record */
+	ulint		size,	/*!< in: data size of rec */
+	ulint		ptr,	/*!< in: node pointer */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull));
+
+/** Write the DB_TRX_ID,DB_ROLL_PTR into a clustered index leaf page record.
+@param[in,out]	block		ROW_FORMAT=COMPRESSED page
+@param[in,out]	rec		record
+@param[in]	offsets		rec_get_offsets(rec, index)
+@param[in]	trx_id_col	field number of DB_TRX_ID (number of PK fields)
+@param[in]	trx_id		DB_TRX_ID value (transaction identifier)
+@param[in]	roll_ptr	DB_ROLL_PTR value (undo log pointer)
+@param[in,out]	mtr		mini-transaction */
+void
+page_zip_write_trx_id_and_roll_ptr(
+	buf_block_t*	block,
+	byte*		rec,
+	const rec_offs*	offsets,
+	ulint		trx_id_col,
+	trx_id_t	trx_id,
+	roll_ptr_t	roll_ptr,
+	mtr_t*		mtr)
+	MY_ATTRIBUTE((nonnull));
+
+/** Modify the delete-mark flag of a ROW_FORMAT=COMPRESSED record.
+@param[in,out]	block	buffer block
+@param[in,out]	rec	record on a physical index page
+@param[in]	flag	the value of the delete-mark flag
+@param[in,out]	mtr	mini-transaction */
+void page_zip_rec_set_deleted(buf_block_t *block, rec_t *rec, bool flag,
+			      mtr_t *mtr)
+	MY_ATTRIBUTE((nonnull));
+
+/**********************************************************************//**
+Insert a record to the dense page directory. */
+void
+page_zip_dir_insert(
+/*================*/
+	page_cur_t*	cursor,	/*!< in/out: page cursor */
+	uint16_t	free_rec,/*!< in: record from which rec was
+				allocated, or 0 */
+	byte*		rec,	/*!< in: record to insert */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+	MY_ATTRIBUTE((nonnull(1,3,4)));
+
+/** Shift the dense page directory and the array of BLOB pointers
+when a record is deleted.
+@param[in,out]	block	index page
+@param[in,out]	rec	record being deleted
+@param[in]	index	the index that the page belongs to
+@param[in]	offsets	rec_get_offsets(rec, index)
+@param[in]	free	previous start of the free list
+@param[in,out]	mtr	mini-transaction */
+void page_zip_dir_delete(buf_block_t *block, byte *rec,
+			 const dict_index_t *index, const rec_offs *offsets,
+			 const byte *free, mtr_t *mtr)
+	MY_ATTRIBUTE((nonnull(1,2,3,4,6)));
+
+/**********************************************************************//**
+Reorganize and compress a page. This is a low-level operation for
+compressed pages, to be used when page_zip_compress() fails.
+On success, redo log will be written.
+The function btr_page_reorganize() should be preferred whenever possible.
+IMPORTANT: if page_zip_reorganize() is invoked on a leaf page of a
+non-clustered index, the caller must update the insert buffer free
+bits in the same mini-transaction in such a way that the modification
+will be redo-logged.
+@return error code +@retval DB_FAIL on overflow; the block_zip will be left intact */ +dberr_t +page_zip_reorganize( + buf_block_t* block, /*!< in/out: page with compressed page; + on the compressed page, in: size; + out: data, n_blobs, + m_start, m_end, m_nonempty */ + dict_index_t* index, /*!< in: index of the B-tree node */ + ulint z_level,/*!< in: compression level */ + mtr_t* mtr, /*!< in: mini-transaction */ + bool restore = false)/*!< whether to restore on failure */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/**********************************************************************//** +Copy the records of a page byte for byte. Do not copy the page header +or trailer, except those B-tree header fields that are directly +related to the storage of records. Also copy PAGE_MAX_TRX_ID. +NOTE: The caller must update the lock table and the adaptive hash index. */ +void +page_zip_copy_recs( + buf_block_t* block, /*!< in/out: buffer block */ + const page_zip_des_t* src_zip, /*!< in: compressed page */ + const page_t* src, /*!< in: page */ + dict_index_t* index, /*!< in: index of the B-tree */ + mtr_t* mtr); /*!< in: mini-transaction */ +#endif /* !UNIV_INNOCHECKSUM */ + +/** Calculate the compressed page checksum. +@param data compressed page +@param size size of compressed page +@param use_adler whether to use Adler32 instead of a XOR of 3 CRC-32C +@return page checksum */ +uint32_t page_zip_calc_checksum(const void *data, size_t size, bool use_adler); + +/** Validate the checksum on a ROW_FORMAT=COMPRESSED page. +@param data ROW_FORMAT=COMPRESSED page +@param size size of the page, in bytes +@return whether the stored checksum matches innodb_checksum_algorithm */ +bool page_zip_verify_checksum(const byte *data, size_t size); + +#ifndef UNIV_INNOCHECKSUM +/**********************************************************************//** +Reset the counters used for filling +INFORMATION_SCHEMA.innodb_cmp_per_index. */ +UNIV_INLINE +void +page_zip_reset_stat_per_index(); +/*===========================*/ + +#include "page0zip.inl" +#endif /* !UNIV_INNOCHECKSUM */ + +#endif /* page0zip_h */ diff --git a/storage/innobase/include/page0zip.inl b/storage/innobase/include/page0zip.inl new file mode 100644 index 00000000..afc877c3 --- /dev/null +++ b/storage/innobase/include/page0zip.inl @@ -0,0 +1,317 @@ +/***************************************************************************** + +Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/page0zip.inl
+Compressed page interface
+
+Created June 2005 by Marko Makela
+*******************************************************/
+
+#include "page0page.h"
+
+/* The format of compressed pages is as follows.
+
+The header and trailer of the uncompressed pages, excluding the page
+directory in the trailer, are copied as is to the header and trailer
+of the compressed page.
+
+At the end of the compressed page, there is a dense page directory
+pointing to every user record contained on the page, including deleted
+records on the free list. The dense directory is indexed in the
+collation order, i.e., in the order in which the record list is
+linked on the uncompressed page. The infimum and supremum records are
+excluded. The two most significant bits of the entries are allocated
+for the delete-mark and an n_owned flag indicating the last record in
+a chain of records pointed to from the sparse page directory on the
+uncompressed page.
+
+The data between PAGE_ZIP_START and the last page directory entry will
+be written in compressed format, starting at offset PAGE_DATA.
+Infimum and supremum records are not stored. We exclude the
+REC_N_NEW_EXTRA_BYTES in every record header. These can be recovered
+from the dense page directory stored at the end of the compressed
+page.
+
+The fields node_ptr (in non-leaf B-tree nodes; level>0), trx_id and
+roll_ptr (in leaf B-tree nodes; level=0), and BLOB pointers of
+externally stored columns are stored separately, in ascending order of
+heap_no and column index, starting backwards from the dense page
+directory.
+
+The compressed data stream may be followed by a modification log
+covering the compressed portion of the page, as follows.
+
+MODIFICATION LOG ENTRY FORMAT
+- write record:
+  - (heap_no - 1) << 1 (1..2 bytes)
+  - extra bytes backwards
+  - data bytes
+- clear record:
+  - (heap_no - 1) << 1 | 1 (1..2 bytes)
+
+The integer values are stored in a variable-length format:
+- 0xxxxxxx: 0..127
+- 1xxxxxxx xxxxxxxx: 0..32767
+
+The end of the modification log is marked by a 0 byte.
+
+In summary, the compressed page looks like this:
+
+(1) Uncompressed page header (PAGE_DATA bytes)
+(2) Compressed index information
+(3) Compressed page data
+(4) Page modification log (page_zip->m_start..page_zip->m_end)
+(5) Empty zero-filled space
+(6) BLOB pointers (on leaf pages)
+  - BTR_EXTERN_FIELD_REF_SIZE for each externally stored column
+  - in descending collation order
+(7) Uncompressed columns of user records, n_dense * uncompressed_size bytes,
+  - indexed by heap_no
+  - DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN for leaf pages of clustered indexes
+  - REC_NODE_PTR_SIZE for non-leaf pages
+  - 0 otherwise
+(8) dense page directory, stored backwards
+  - n_dense = n_heap - 2
+  - existing records in ascending collation order
+  - deleted records (free list) in link order
+*/
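An editorial sketch of the variable-length integer format described above (an illustration of the format only, not the exact encoder in page0zip.cc):

    static inline byte* zip_varint_write(byte* p, uint16_t v)
    {
        ut_ad(v <= 32767);
        if (v < 128) {
            *p++ = static_cast<byte>(v);               /* 0xxxxxxx */
        } else {
            *p++ = static_cast<byte>(0x80 | (v >> 8)); /* 1xxxxxxx */
            *p++ = static_cast<byte>(v & 0xff);        /* xxxxxxxx */
        }
        return p;
    }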
+
+/**********************************************************************//**
+Determine the size of a compressed page in bytes.
+@return size in bytes */
+UNIV_INLINE
+ulint
+page_zip_get_size(
+/*==============*/
+	const page_zip_des_t*	page_zip)	/*!< in: compressed page */
+{
+	ulint	size;
+
+	if (!page_zip->ssize) {
+		return(0);
+	}
+
+	size = (UNIV_ZIP_SIZE_MIN >> 1) << page_zip->ssize;
+
+	ut_ad(size >= UNIV_ZIP_SIZE_MIN);
+	ut_ad(size <= srv_page_size);
+
+	return(size);
+}
+/**********************************************************************//**
+Set the size of a compressed page in bytes. */
+UNIV_INLINE
+void
+page_zip_set_size(
+/*==============*/
+	page_zip_des_t*	page_zip,	/*!< in/out: compressed page */
+	ulint		size)		/*!< in: size in bytes */
+{
+	if (size) {
+		unsigned	ssize;
+
+		ut_ad(ut_is_2pow(size));
+
+		for (ssize = 1; size > (512U << ssize); ssize++) {
+		}
+
+		page_zip->ssize = ssize & ((1U << PAGE_ZIP_SSIZE_BITS) - 1);
+	} else {
+		page_zip->ssize = 0;
+	}
+
+	ut_ad(page_zip_get_size(page_zip) == size);
+}
+
+/** Determine if a record is so big that it needs to be stored externally.
+@param[in]	rec_size	length of the record in bytes
+@param[in]	comp		nonzero=compact format
+@param[in]	n_fields	number of fields in the record; ignored if
+tablespace is not compressed
+@param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
+@return false if the entire record can be stored locally on the page */
+inline bool page_zip_rec_needs_ext(ulint rec_size, ulint comp, ulint n_fields,
+				    ulint zip_size)
+{
+	/* FIXME: the row size check in this function seems to be the most
+	correct. Put it in a separate function and use it in more places of
+	InnoDB */
+
+	ut_ad(rec_size
+	      > ulint(comp ? REC_N_NEW_EXTRA_BYTES : REC_N_OLD_EXTRA_BYTES));
+	ut_ad(comp || !zip_size);
+
+#if UNIV_PAGE_SIZE_MAX > COMPRESSED_REC_MAX_DATA_SIZE
+	if (comp ? rec_size >= COMPRESSED_REC_MAX_DATA_SIZE :
+	    rec_size >= REDUNDANT_REC_MAX_DATA_SIZE) {
+		return true;
+	}
+#endif
+
+	if (zip_size) {
+		ut_ad(comp);
+		/* On a compressed page, there is a two-byte entry in
+		the dense page directory for every record. But there
+		is no record header. There should be enough room for
+		one record on an empty leaf page. Subtract 1 byte for
+		the encoded heap number. Check also the available space
+		on the uncompressed page. */
+		return(rec_size - (REC_N_NEW_EXTRA_BYTES - 2 - 1)
+		       >= page_zip_empty_size(n_fields, zip_size)
+		       || rec_size >= page_get_free_space_of_empty(TRUE) / 2);
+	}
+
+	return(rec_size >= page_get_free_space_of_empty(comp) / 2);
+}
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Validate a compressed page descriptor.
+@return TRUE if ok */
+UNIV_INLINE
+ibool
+page_zip_simple_validate(
+/*=====================*/
+	const page_zip_des_t*	page_zip)/*!< in: compressed page descriptor */
+{
+	ut_ad(page_zip);
+	ut_ad(page_zip->data);
+	ut_ad(page_zip->ssize <= PAGE_ZIP_SSIZE_MAX);
+	ut_ad(page_zip_get_size(page_zip)
+	      > PAGE_DATA + PAGE_ZIP_DIR_SLOT_SIZE);
+	ut_ad(page_zip->m_start <= page_zip->m_end);
+	ut_ad(page_zip->m_end < page_zip_get_size(page_zip));
+	ut_ad(page_zip->n_blobs
+	      < page_zip_get_size(page_zip) / BTR_EXTERN_FIELD_REF_SIZE);
+	return(TRUE);
+}
+#endif /* UNIV_DEBUG */
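A worked example for page_zip_get_trailer_len() below (editorial): on a clustered index leaf page, each heap record beyond the infimum and supremum costs PAGE_ZIP_DIR_SLOT_SIZE (2) + DATA_TRX_ID_LEN (6) + DATA_ROLL_PTR_LEN (7) = 15 bytes, so with n_heap == 10 and n_blobs == 3 the trailer occupies (10 - 2) * 15 + 3 * BTR_EXTERN_FIELD_REF_SIZE (20) = 120 + 60 = 180 bytes.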
+
+/**********************************************************************//**
+Determine the length of the page trailer.
+@return length of the page trailer, in bytes, not including the
+terminating zero byte of the modification log */
+UNIV_INLINE
+ulint
+page_zip_get_trailer_len(
+/*=====================*/
+	const page_zip_des_t*	page_zip,/*!< in: compressed page */
+	ibool			is_clust)/*!< in: TRUE if clustered index */
+{
+	ulint	uncompressed_size;
+
+	ut_ad(page_zip_simple_validate(page_zip));
+	MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
+
+	if (!page_is_leaf(page_zip->data)) {
+		uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE
+			+ REC_NODE_PTR_SIZE;
+		ut_ad(!page_zip->n_blobs);
+	} else if (is_clust) {
+		uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE
+			+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+	} else {
+		uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE;
+		ut_ad(!page_zip->n_blobs);
+	}
+
+	return (ulint(page_dir_get_n_heap(page_zip->data)) - 2)
+		* uncompressed_size
+		+ ulint(page_zip->n_blobs) * BTR_EXTERN_FIELD_REF_SIZE;
+}
+
+/**********************************************************************//**
+Determine how big a record can be inserted without recompressing the page.
+@return a positive number indicating the maximum size of a record
+whose insertion is guaranteed to succeed, or zero or negative */
+UNIV_INLINE
+lint
+page_zip_max_ins_size(
+/*==================*/
+	const page_zip_des_t*	page_zip,/*!< in: compressed page */
+	ibool			is_clust)/*!< in: TRUE if clustered index */
+{
+	ulint	trailer_len;
+
+	trailer_len = page_zip_get_trailer_len(page_zip, is_clust);
+
+	/* When a record is created, a pointer may be added to
+	the dense directory.
+	Likewise, space for the columns that will not be
+	compressed will be allocated from the page trailer.
+	Also the BLOB pointers will be allocated from there, but
+	we may as well count them in the length of the record. */
+
+	trailer_len += PAGE_ZIP_DIR_SLOT_SIZE;
+
+	return(lint(page_zip_get_size(page_zip)
+		    - trailer_len - page_zip->m_end
+		    - (REC_N_NEW_EXTRA_BYTES - 2)));
+}
+
+/**********************************************************************//**
+Determine if enough space is available in the modification log.
+@return TRUE if enough space is available */
+UNIV_INLINE
+ibool
+page_zip_available(
+/*===============*/
+	const page_zip_des_t*	page_zip,/*!< in: compressed page */
+	ibool			is_clust,/*!< in: TRUE if clustered index */
+	ulint			length,	/*!< in: combined size of the record */
+	ulint			create)	/*!< in: nonzero=add the record to
+					the heap */
+{
+	ulint	trailer_len;
+
+	ut_ad(length > REC_N_NEW_EXTRA_BYTES);
+
+	trailer_len = page_zip_get_trailer_len(page_zip, is_clust);
+
+	/* Subtract the fixed extra bytes and add the maximum
+	space needed for identifying the record (encoded heap_no). */
+	length -= REC_N_NEW_EXTRA_BYTES - 2;
+
+	if (create > 0) {
+		/* When a record is created, a pointer may be added to
+		the dense directory.
+		Likewise, space for the columns that will not be
+		compressed will be allocated from the page trailer.
+		Also the BLOB pointers will be allocated from there, but
+		we may as well count them in the length of the record. */
+
+		trailer_len += PAGE_ZIP_DIR_SLOT_SIZE;
+	}
+
+	return(length + trailer_len + page_zip->m_end
+	       < page_zip_get_size(page_zip));
+}
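An editorial sketch of how an insert path might consult the predicate above before writing (cf. page_cur_insert_rec_zip() in page0cur.cc; variable names illustrative):

    if (page_zip_available(page_zip, dict_index_is_clust(index),
                           rec_size, 1 /* create */)) {
        /* page_zip_write_rec() is guaranteed to fit in the
        modification log; no recompression is needed */
    } else {
        /* recompress via page_zip_reorganize(), or split the page */
    }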
+
+/**********************************************************************//**
+Reset the counters used for filling
+INFORMATION_SCHEMA.innodb_cmp_per_index. */
+UNIV_INLINE
+void
+page_zip_reset_stat_per_index()
+/*===========================*/
+{
+	mysql_mutex_lock(&page_zip_stat_per_index_mutex);
+	page_zip_stat_per_index.clear();
+	mysql_mutex_unlock(&page_zip_stat_per_index_mutex);
+}
diff --git a/storage/innobase/include/pars0grm.h b/storage/innobase/include/pars0grm.h
new file mode 100644
index 00000000..e7112d99
--- /dev/null
+++ b/storage/innobase/include/pars0grm.h
@@ -0,0 +1,151 @@
+/* A Bison parser, made by GNU Bison 3.7.6. */
+
+/* Bison interface for Yacc-like parsers in C
+
+   Copyright (C) 1984, 1989-1990, 2000-2015, 2018-2021 Free Software Foundation,
+   Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
+
+/* As a special exception, you may create a larger work that contains
+   part or all of the Bison parser skeleton and distribute that work
+   under terms of your choice, so long as that work isn't itself a
+   parser generator using the skeleton or a modified version thereof
+   as a parser skeleton.  Alternatively, if you modify or redistribute
+   the parser skeleton itself, you may (at your option) remove this
+   special exception, which will cause the skeleton and the resulting
+   Bison output files to be licensed under the GNU General Public
+   License without this special exception.
+
+   This special exception was added by the Free Software Foundation in
+   version 2.2 of Bison.  */
+
+/* DO NOT RELY ON FEATURES THAT ARE NOT DOCUMENTED in the manual,
+   especially those whose name start with YY_ or yy_.  They are
+   private implementation details that can be changed or removed.  */
+
+#ifndef YY_YY_PARS0GRM_TAB_H_INCLUDED
+# define YY_YY_PARS0GRM_TAB_H_INCLUDED
+/* Debug traces.  */
+#ifndef YYDEBUG
+# define YYDEBUG 0
+#endif
+#if YYDEBUG
+extern int yydebug;
+#endif
+
+/* Token kinds.
*/ +#ifndef YYTOKENTYPE +# define YYTOKENTYPE + enum yytokentype + { + YYEMPTY = -2, + YYEOF = 0, /* "end of file" */ + YYerror = 256, /* error */ + YYUNDEF = 257, /* "invalid token" */ + PARS_INT_LIT = 258, /* PARS_INT_LIT */ + PARS_FLOAT_LIT = 259, /* PARS_FLOAT_LIT */ + PARS_STR_LIT = 260, /* PARS_STR_LIT */ + PARS_NULL_LIT = 261, /* PARS_NULL_LIT */ + PARS_ID_TOKEN = 262, /* PARS_ID_TOKEN */ + PARS_AND_TOKEN = 263, /* PARS_AND_TOKEN */ + PARS_OR_TOKEN = 264, /* PARS_OR_TOKEN */ + PARS_NOT_TOKEN = 265, /* PARS_NOT_TOKEN */ + PARS_GE_TOKEN = 266, /* PARS_GE_TOKEN */ + PARS_LE_TOKEN = 267, /* PARS_LE_TOKEN */ + PARS_NE_TOKEN = 268, /* PARS_NE_TOKEN */ + PARS_PROCEDURE_TOKEN = 269, /* PARS_PROCEDURE_TOKEN */ + PARS_IN_TOKEN = 270, /* PARS_IN_TOKEN */ + PARS_INT_TOKEN = 271, /* PARS_INT_TOKEN */ + PARS_CHAR_TOKEN = 272, /* PARS_CHAR_TOKEN */ + PARS_IS_TOKEN = 273, /* PARS_IS_TOKEN */ + PARS_BEGIN_TOKEN = 274, /* PARS_BEGIN_TOKEN */ + PARS_END_TOKEN = 275, /* PARS_END_TOKEN */ + PARS_IF_TOKEN = 276, /* PARS_IF_TOKEN */ + PARS_THEN_TOKEN = 277, /* PARS_THEN_TOKEN */ + PARS_ELSE_TOKEN = 278, /* PARS_ELSE_TOKEN */ + PARS_ELSIF_TOKEN = 279, /* PARS_ELSIF_TOKEN */ + PARS_LOOP_TOKEN = 280, /* PARS_LOOP_TOKEN */ + PARS_WHILE_TOKEN = 281, /* PARS_WHILE_TOKEN */ + PARS_RETURN_TOKEN = 282, /* PARS_RETURN_TOKEN */ + PARS_SELECT_TOKEN = 283, /* PARS_SELECT_TOKEN */ + PARS_COUNT_TOKEN = 284, /* PARS_COUNT_TOKEN */ + PARS_FROM_TOKEN = 285, /* PARS_FROM_TOKEN */ + PARS_WHERE_TOKEN = 286, /* PARS_WHERE_TOKEN */ + PARS_FOR_TOKEN = 287, /* PARS_FOR_TOKEN */ + PARS_DDOT_TOKEN = 288, /* PARS_DDOT_TOKEN */ + PARS_ORDER_TOKEN = 289, /* PARS_ORDER_TOKEN */ + PARS_BY_TOKEN = 290, /* PARS_BY_TOKEN */ + PARS_ASC_TOKEN = 291, /* PARS_ASC_TOKEN */ + PARS_DESC_TOKEN = 292, /* PARS_DESC_TOKEN */ + PARS_INSERT_TOKEN = 293, /* PARS_INSERT_TOKEN */ + PARS_INTO_TOKEN = 294, /* PARS_INTO_TOKEN */ + PARS_VALUES_TOKEN = 295, /* PARS_VALUES_TOKEN */ + PARS_UPDATE_TOKEN = 296, /* PARS_UPDATE_TOKEN */ + PARS_SET_TOKEN = 297, /* PARS_SET_TOKEN */ + PARS_DELETE_TOKEN = 298, /* PARS_DELETE_TOKEN */ + PARS_CURRENT_TOKEN = 299, /* PARS_CURRENT_TOKEN */ + PARS_OF_TOKEN = 300, /* PARS_OF_TOKEN */ + PARS_CREATE_TOKEN = 301, /* PARS_CREATE_TOKEN */ + PARS_TABLE_TOKEN = 302, /* PARS_TABLE_TOKEN */ + PARS_INDEX_TOKEN = 303, /* PARS_INDEX_TOKEN */ + PARS_UNIQUE_TOKEN = 304, /* PARS_UNIQUE_TOKEN */ + PARS_CLUSTERED_TOKEN = 305, /* PARS_CLUSTERED_TOKEN */ + PARS_ON_TOKEN = 306, /* PARS_ON_TOKEN */ + PARS_ASSIGN_TOKEN = 307, /* PARS_ASSIGN_TOKEN */ + PARS_DECLARE_TOKEN = 308, /* PARS_DECLARE_TOKEN */ + PARS_CURSOR_TOKEN = 309, /* PARS_CURSOR_TOKEN */ + PARS_SQL_TOKEN = 310, /* PARS_SQL_TOKEN */ + PARS_OPEN_TOKEN = 311, /* PARS_OPEN_TOKEN */ + PARS_FETCH_TOKEN = 312, /* PARS_FETCH_TOKEN */ + PARS_CLOSE_TOKEN = 313, /* PARS_CLOSE_TOKEN */ + PARS_NOTFOUND_TOKEN = 314, /* PARS_NOTFOUND_TOKEN */ + PARS_TO_BINARY_TOKEN = 315, /* PARS_TO_BINARY_TOKEN */ + PARS_SUBSTR_TOKEN = 316, /* PARS_SUBSTR_TOKEN */ + PARS_CONCAT_TOKEN = 317, /* PARS_CONCAT_TOKEN */ + PARS_INSTR_TOKEN = 318, /* PARS_INSTR_TOKEN */ + PARS_LENGTH_TOKEN = 319, /* PARS_LENGTH_TOKEN */ + PARS_COMMIT_TOKEN = 320, /* PARS_COMMIT_TOKEN */ + PARS_ROLLBACK_TOKEN = 321, /* PARS_ROLLBACK_TOKEN */ + PARS_WORK_TOKEN = 322, /* PARS_WORK_TOKEN */ + PARS_EXIT_TOKEN = 323, /* PARS_EXIT_TOKEN */ + PARS_FUNCTION_TOKEN = 324, /* PARS_FUNCTION_TOKEN */ + PARS_LOCK_TOKEN = 325, /* PARS_LOCK_TOKEN */ + PARS_SHARE_TOKEN = 326, /* PARS_SHARE_TOKEN */ + PARS_MODE_TOKEN = 327, /* PARS_MODE_TOKEN */ + 
PARS_LIKE_TOKEN = 328,             /* PARS_LIKE_TOKEN */
+    PARS_LIKE_TOKEN_EXACT = 329,   /* PARS_LIKE_TOKEN_EXACT */
+    PARS_LIKE_TOKEN_PREFIX = 330,  /* PARS_LIKE_TOKEN_PREFIX */
+    PARS_LIKE_TOKEN_SUFFIX = 331,  /* PARS_LIKE_TOKEN_SUFFIX */
+    PARS_LIKE_TOKEN_SUBSTR = 332,  /* PARS_LIKE_TOKEN_SUBSTR */
+    PARS_TABLE_NAME_TOKEN = 333,   /* PARS_TABLE_NAME_TOKEN */
+    PARS_BIGINT_TOKEN = 334,       /* PARS_BIGINT_TOKEN */
+    NEG = 335                      /* NEG */
+  };
+  typedef enum yytokentype yytoken_kind_t;
+#endif
+
+/* Value type.  */
+#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
+typedef int YYSTYPE;
+# define YYSTYPE_IS_TRIVIAL 1
+# define YYSTYPE_IS_DECLARED 1
+#endif
+
+
+extern YYSTYPE yylval;
+
+int yyparse (void);
+
+#endif /* !YY_YY_PARS0GRM_TAB_H_INCLUDED */
diff --git a/storage/innobase/include/pars0opt.h b/storage/innobase/include/pars0opt.h
new file mode 100644
index 00000000..07a726ea
--- /dev/null
+++ b/storage/innobase/include/pars0opt.h
@@ -0,0 +1,68 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2018, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0opt.h
+Simple SQL optimizer
+
+Created 12/21/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef pars0opt_h
+#define pars0opt_h
+
+#include "que0types.h"
+#include "pars0sym.h"
+#include "row0sel.h"
+
+/*******************************************************************//**
+Optimizes a select. Decides which indexes to use for the tables. The tables
+are accessed in the order that they were written to the FROM part in the
+select statement. */
+void
+opt_search_plan(
+/*============*/
+	sel_node_t*	sel_node);	/*!< in: parsed select node */
+/*******************************************************************//**
+Looks for occurrences of the columns of the table in the query subgraph and
+adds them to the list of columns if an occurrence of the same column does not
+already exist in the list. If the column is already in the list, puts a value
+indirection to point to the occurrence in the column list, except if the
+column occurrence we are looking at is in the column list, in which case
+nothing is done. */
+void
+opt_find_all_cols(
+/*==============*/
+	ibool		copy_val,	/*!< in: if TRUE, new found columns are
+					added as columns to copy */
+	dict_index_t*	index,		/*!< in: index to use */
+	sym_node_list_t* col_list,	/*!< in: base node of a list where
+					to add new found columns */
+	plan_t*		plan,		/*!< in: plan or NULL */
+	que_node_t*	exp);		/*!< in: expression or condition */
+#ifdef UNIV_SQL_DEBUG
+/********************************************************************//**
+Prints info of a query plan. */
+void
+opt_print_query_plan(
+/*=================*/
+	sel_node_t*	sel_node);	/*!< in: select node */
+#endif /* UNIV_SQL_DEBUG */
+
+#endif
diff --git a/storage/innobase/include/pars0pars.h b/storage/innobase/include/pars0pars.h
new file mode 100644
index 00000000..16823ce1
--- /dev/null
+++ b/storage/innobase/include/pars0pars.h
@@ -0,0 +1,695 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0pars.h
+SQL parser
+
+Created 11/19/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef pars0pars_h
+#define pars0pars_h
+
+#include "que0types.h"
+#include "pars0types.h"
+#include "row0types.h"
+#include "trx0types.h"
+#include "ut0vec.h"
+#include "row0mysql.h"
+
+/** Type of the user functions. The first argument is always InnoDB-supplied
+and varies in type, while 'user_arg' is a user-supplied argument. The
+meaning of the return type also varies. See the individual use cases, e.g.
+the FETCH statement, for details on them. */
+typedef ibool	(*pars_user_func_cb_t)(void* arg, void* user_arg);
+
+/** If the following is set TRUE, the parser will emit debugging
+information */
+extern int	yydebug;
+
+/* Global variable used while parsing a single procedure or query: the code is
+NOT re-entrant */
+extern sym_tab_t*	pars_sym_tab_global;
+
+extern pars_res_word_t	pars_to_binary_token;
+extern pars_res_word_t	pars_substr_token;
+extern pars_res_word_t	pars_concat_token;
+extern pars_res_word_t	pars_length_token;
+extern pars_res_word_t	pars_instr_token;
+extern pars_res_word_t	pars_count_token;
+extern pars_res_word_t	pars_int_token;
+extern pars_res_word_t	pars_bigint_token;
+extern pars_res_word_t	pars_char_token;
+extern pars_res_word_t	pars_update_token;
+extern pars_res_word_t	pars_asc_token;
+extern pars_res_word_t	pars_desc_token;
+extern pars_res_word_t	pars_open_token;
+extern pars_res_word_t	pars_close_token;
+extern pars_res_word_t	pars_share_token;
+extern pars_res_word_t	pars_unique_token;
+extern pars_res_word_t	pars_clustered_token;
+
+extern ulint	pars_star_denoter;
+
+/* Procedure parameter types */
+#define PARS_INPUT	0
+#define PARS_OUTPUT	1
+#define PARS_NOT_PARAM	2
+
+int
+yyparse(void);
+
+/*************************************************************//**
+Parses an SQL string returning the query graph.
+@return own: the query graph */
+que_t*
+pars_sql(
+/*=====*/
+	pars_info_t*	info,	/*!< in: extra information, or NULL */
+	const char*	str);	/*!< in: SQL string */
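An editorial sketch of how InnoDB internals typically drive this parser (cf. dict0crea.cc and fts0sql.cc); the table name SYS_FOO is purely illustrative, and the que_eval_sql() helper from que0que.h is assumed to parse, execute and free the graph:

    pars_info_t* info = pars_info_create();
    pars_info_add_ull_literal(info, "id", 42);      /* binds :id */
    dberr_t err = que_eval_sql(info,
        "PROCEDURE P () IS\n"
        "BEGIN\n"
        "DELETE FROM SYS_FOO WHERE ID = :id;\n"
        "END;\n", trx);                             /* frees info */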
+/*************************************************************//**
+Retrieves characters for the lexical analyzer.
+@return number of characters copied or 0 on EOF */
+int
+pars_get_lex_chars(
+/*===============*/
+	char*	buf,		/*!< in/out: buffer where to copy */
+	size_t	max_size);	/*!< in: maximum number of characters which fit
+				in the buffer */
+/*************************************************************//**
+Called by yyparse on error. */
+void
+yyerror(
+/*====*/
+	const char*	s);	/*!< in: error message string */
+/*********************************************************************//**
+Parses a variable declaration.
+@return own: symbol table node of type SYM_VAR */
+sym_node_t*
+pars_variable_declaration(
+/*======================*/
+	sym_node_t*		node,	/*!< in: symbol table node allocated for the
+					id of the variable */
+	pars_res_word_t*	type);	/*!< in: pointer to a type token */
+/*********************************************************************//**
+Parses a function expression.
+@return own: function node in a query tree */
+func_node_t*
+pars_func(
+/*======*/
+	que_node_t*	res_word,/*!< in: function name reserved word */
+	que_node_t*	arg);	/*!< in: first argument in the argument list */
+/*************************************************************************
+Rebind a LIKE search string. NOTE: We ignore any '%' characters embedded
+within the search string.
+@return own: function node in a query tree */
+int
+pars_like_rebind(
+/*=============*/
+	sym_node_t*	node,	/* in: The search string node.*/
+	const byte*	ptr,	/* in: literal to (re) bind */
+	ulint		len);	/* in: length of literal to (re) bind*/
+/*********************************************************************//**
+Parses an operator expression.
+@return own: function node in a query tree */
+func_node_t*
+pars_op(
+/*====*/
+	int		func,	/*!< in: operator token code */
+	que_node_t*	arg1,	/*!< in: first argument */
+	que_node_t*	arg2);	/*!< in: second argument or NULL for a unary
+				operator */
+/*********************************************************************//**
+Parses an ORDER BY clause. Order by a single column only is supported.
+@return own: order-by node in a query tree */
+order_node_t*
+pars_order_by(
+/*==========*/
+	sym_node_t*		column,	/*!< in: column name */
+	pars_res_word_t*	asc);	/*!< in: &pars_asc_token or
+					&pars_desc_token */
+/*********************************************************************//**
+Parses a select list; creates a query graph node for the whole SELECT
+statement.
+@return own: select node in a query tree */
+sel_node_t*
+pars_select_list(
+/*=============*/
+	que_node_t*	select_list,	/*!< in: select list */
+	sym_node_t*	into_list);	/*!< in: variables list or NULL */
+/*********************************************************************//**
+Parses a cursor declaration.
+@return sym_node */
+que_node_t*
+pars_cursor_declaration(
+/*====================*/
+	sym_node_t*	sym_node,	/*!< in: cursor id node in the symbol
+					table */
+	sel_node_t*	select_node);	/*!< in: select node */
+/*********************************************************************//**
+Parses a function declaration.
+@return sym_node */
+que_node_t*
+pars_function_declaration(
+/*======================*/
+	sym_node_t*	sym_node);	/*!< in: function id node in the symbol
+					table */
+/*********************************************************************//**
+Parses a select statement.
+@return own: select node in a query tree */ +sel_node_t* +pars_select_statement( +/*==================*/ + sel_node_t* select_node, /*!< in: select node already containing + the select list */ + sym_node_t* table_list, /*!< in: table list */ + que_node_t* search_cond, /*!< in: search condition or NULL */ + pars_res_word_t* for_update, /*!< in: NULL or &pars_update_token */ + pars_res_word_t* consistent_read,/*!< in: NULL or + &pars_consistent_token */ + order_node_t* order_by); /*!< in: NULL or an order-by node */ +/*********************************************************************//** +Parses a column assignment in an update. +@return column assignment node */ +col_assign_node_t* +pars_column_assignment( +/*===================*/ + sym_node_t* column, /*!< in: column to assign */ + que_node_t* exp); /*!< in: value to assign */ +/*********************************************************************//** +Parses a delete or update statement start. +@return own: update node in a query tree */ +upd_node_t* +pars_update_statement_start( +/*========================*/ + ibool is_delete, /*!< in: TRUE if delete */ + sym_node_t* table_sym, /*!< in: table name node */ + col_assign_node_t* col_assign_list);/*!< in: column assignment list, NULL + if delete */ +/*********************************************************************//** +Parses an update or delete statement. +@return own: update node in a query tree */ +upd_node_t* +pars_update_statement( +/*==================*/ + upd_node_t* node, /*!< in: update node */ + sym_node_t* cursor_sym, /*!< in: pointer to a cursor entry in + the symbol table or NULL */ + que_node_t* search_cond); /*!< in: search condition or NULL */ +/*********************************************************************//** +Parses an insert statement. +@return own: insert node in a query tree */ +ins_node_t* +pars_insert_statement( +/*==================*/ + sym_node_t* table_sym, /*!< in: table name node */ + que_node_t* values_list, /*!< in: value expression list or NULL */ + sel_node_t* select); /*!< in: select condition or NULL */ +/*********************************************************************//** +Parses an elsif element. +@return elsif node */ +elsif_node_t* +pars_elsif_element( +/*===============*/ + que_node_t* cond, /*!< in: if-condition */ + que_node_t* stat_list); /*!< in: statement list */ +/*********************************************************************//** +Parses an if-statement. +@return if-statement node */ +if_node_t* +pars_if_statement( +/*==============*/ + que_node_t* cond, /*!< in: if-condition */ + que_node_t* stat_list, /*!< in: statement list */ + que_node_t* else_part); /*!< in: else-part statement list */ +/*********************************************************************//** +Parses a for-loop-statement. +@return for-statement node */ +for_node_t* +pars_for_statement( +/*===============*/ + sym_node_t* loop_var, /*!< in: loop variable */ + que_node_t* loop_start_limit,/*!< in: loop start expression */ + que_node_t* loop_end_limit, /*!< in: loop end expression */ + que_node_t* stat_list); /*!< in: statement list */ +/*********************************************************************//** +Parses a while-statement. +@return while-statement node */ +while_node_t* +pars_while_statement( +/*=================*/ + que_node_t* cond, /*!< in: while-condition */ + que_node_t* stat_list); /*!< in: statement list */ +/*********************************************************************//** +Parses an exit statement.
+@return exit statement node */ +exit_node_t* +pars_exit_statement(void); +/*=====================*/ +/*********************************************************************//** +Parses a return-statement. +@return return-statement node */ +return_node_t* +pars_return_statement(void); +/*=======================*/ +/*********************************************************************//** +Parses a procedure call. +@return function node */ +func_node_t* +pars_procedure_call( +/*================*/ + que_node_t* res_word,/*!< in: procedure name reserved word */ + que_node_t* args); /*!< in: argument list */ +/*********************************************************************//** +Parses an assignment statement. +@return assignment statement node */ +assign_node_t* +pars_assignment_statement( +/*======================*/ + sym_node_t* var, /*!< in: variable to assign */ + que_node_t* val); /*!< in: value to assign */ +/*********************************************************************//** +Parses a fetch statement. into_list or user_func (but not both) must be +non-NULL. +@return fetch statement node */ +fetch_node_t* +pars_fetch_statement( +/*=================*/ + sym_node_t* cursor, /*!< in: cursor node */ + sym_node_t* into_list, /*!< in: variables to set, or NULL */ + sym_node_t* user_func); /*!< in: user function name, or NULL */ +/*********************************************************************//** +Parses an open or close cursor statement. +@return open or close statement node */ +open_node_t* +pars_open_statement( +/*================*/ + ulint type, /*!< in: ROW_SEL_OPEN_CURSOR + or ROW_SEL_CLOSE_CURSOR */ + sym_node_t* cursor); /*!< in: cursor node */ +/*********************************************************************//** +Parses a row_printf-statement. +@return row_printf-statement node */ +row_printf_node_t* +pars_row_printf_statement( +/*======================*/ + sel_node_t* sel_node); /*!< in: select node */ +/*********************************************************************//** +Parses a commit statement. +@return own: commit node struct */ +commit_node_t* +pars_commit_statement(void); +/*=======================*/ +/*********************************************************************//** +Parses a rollback statement. +@return own: rollback node struct */ +roll_node_t* +pars_rollback_statement(void); +/*=========================*/ +/*********************************************************************//** +Parses a column definition at a table creation. +@return column sym table node */ +sym_node_t* +pars_column_def( +/*============*/ + sym_node_t* sym_node, /*!< in: column node in the + symbol table */ + pars_res_word_t* type, /*!< in: data type */ + sym_node_t* len, /*!< in: length of column, or + NULL */ + void* is_not_null); /*!< in: if not NULL, column + is of type NOT NULL. */ +/*********************************************************************//** +Parses a table creation operation. +@return table create subgraph */ +tab_node_t* +pars_create_table( +/*==============*/ + sym_node_t* table_sym, /*!< in: table name node in the symbol + table */ + sym_node_t* column_defs); /*!< in: list of column definitions */ +/*********************************************************************//** +Parses an index creation operation.
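+(Editorial example, not part of the upstream comment: this builds the
+subgraph for internal SQL such as
+CREATE UNIQUE CLUSTERED INDEX ID_IND ON SYS_FOREIGN (ID);
+with the UNIQUE and CLUSTERED keywords arriving as unique_def and
+clustered_def below.)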
+@return index create subgraph */ +ind_node_t* +pars_create_index( +/*==============*/ + pars_res_word_t* unique_def, /*!< in: not NULL if a unique index */ + pars_res_word_t* clustered_def, /*!< in: not NULL if a clustered index */ + sym_node_t* index_sym, /*!< in: index name node in the symbol + table */ + sym_node_t* table_sym, /*!< in: table name node in the symbol + table */ + sym_node_t* column_list); /*!< in: list of column names */ +/*********************************************************************//** +Parses a procedure definition. +@return query fork node */ +que_fork_t* +pars_procedure_definition( +/*======================*/ + sym_node_t* sym_node, /*!< in: procedure id node in the symbol + table */ + que_node_t* stat_list); /*!< in: statement list */ + +/** Completes a query graph by adding query thread and fork nodes +above it and prepares the graph for running. +@param[in] node root node for an incomplete query + graph, or NULL for dummy graph +@param[in] trx transaction handle +@param[in] heap memory heap from which allocated +@param[in] prebuilt row prebuilt structure +@return query thread node to run */ +que_thr_t* +pars_complete_graph_for_exec( + que_node_t* node, + trx_t* trx, + mem_heap_t* heap, + row_prebuilt_t* prebuilt) + MY_ATTRIBUTE((nonnull(2,3), warn_unused_result)); + +/****************************************************************//** +Create parser info struct. +@return own: info struct */ +pars_info_t* +pars_info_create(void); +/*==================*/ + +/****************************************************************//** +Add bound literal. */ +void +pars_info_add_literal( +/*==================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + const void* address, /*!< in: address */ + ulint length, /*!< in: length of data */ + ulint type, /*!< in: type, e.g. DATA_FIXBINARY */ + ulint prtype); /*!< in: precise type, e.g. + DATA_UNSIGNED */ + +/****************************************************************//** +Equivalent to pars_info_add_literal(info, name, str, strlen(str), +DATA_VARCHAR, DATA_ENGLISH). */ +void +pars_info_add_str_literal( +/*======================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + const char* str); /*!< in: string */ +/******************************************************************** +If the literal value already exists then it rebinds otherwise it +creates a new entry.*/ +void +pars_info_bind_literal( +/*===================*/ + pars_info_t* info, /* in: info struct */ + const char* name, /* in: name */ + const void* address, /* in: address */ + ulint length, /* in: length of data */ + ulint type, /* in: type, e.g. DATA_FIXBINARY */ + ulint prtype); /* in: precise type, e.g. DATA_UNSIGNED */ +/******************************************************************** +If the literal value already exists then it rebinds otherwise it +creates a new entry.*/ +void +pars_info_bind_varchar_literal( +/*===========================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + const byte* str, /*!< in: string */ + ulint str_len); /*!< in: string length */ +/****************************************************************//** +Equivalent to: + +char buf[4]; +mach_write_to_4(buf, val); +pars_info_add_literal(info, name, buf, 4, DATA_INT, 0); + +except that the buffer is dynamically allocated from the info struct's +heap.
*/ +void +pars_info_bind_int4_literal( +/*=======================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + const ib_uint32_t* val); /*!< in: value */ +/******************************************************************** +If the literal value already exists then it rebinds otherwise it +creates a new entry. */ +void +pars_info_bind_int8_literal( +/*=======================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + const ib_uint64_t* val); /*!< in: value */ +/****************************************************************//** +Add user function. */ +void +pars_info_bind_function( +/*===================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: function name */ + pars_user_func_cb_t func, /*!< in: function address */ + void* arg); /*!< in: user-supplied argument */ +/****************************************************************//** +Add bound id. */ +void +pars_info_bind_id( +/*=============*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + const char* id); /*!< in: id */ +/****************************************************************//** +Equivalent to: + +char buf[4]; +mach_write_to_4(buf, val); +pars_info_add_literal(info, name, buf, 4, DATA_INT, 0); + +except that the buffer is dynamically allocated from the info struct's +heap. */ +void +pars_info_add_int4_literal( +/*=======================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + ulint val); /*!< in: value */ + +/****************************************************************//** +Equivalent to: + +char buf[8]; +mach_write_to_8(buf, val); +pars_info_add_literal(info, name, buf, 8, DATA_FIXBINARY, 0); + +except that the buffer is dynamically allocated from the info struct's +heap. */ +void +pars_info_add_ull_literal( +/*======================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + ib_uint64_t val); /*!< in: value */ + +/****************************************************************//** +If the literal value already exists then it rebinds otherwise it +creates a new entry. */ +void +pars_info_bind_ull_literal( +/*=======================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name, /*!< in: name */ + const ib_uint64_t* val) /*!< in: value */ + MY_ATTRIBUTE((nonnull)); + +/****************************************************************//** +Get bound literal with the given name. +@return bound literal, or NULL if not found */ +pars_bound_lit_t* +pars_info_get_bound_lit( +/*====================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name); /*!< in: bound literal name to find */ + +/****************************************************************//** +Get bound id with the given name. +@return bound id, or NULL if not found */ +pars_bound_id_t* +pars_info_get_bound_id( +/*===================*/ + pars_info_t* info, /*!< in: info struct */ + const char* name); /*!< in: bound id name to find */ + +/******************************************************************//** +Release any resources used by the lexer. */ +void +pars_lexer_close(void); +/*==================*/ + +/** Extra information supplied for pars_sql(). 
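+
+An editorial usage sketch, not part of the upstream header (error
+handling and graph ownership omitted): callers create the struct, bind
+values by name, and execute internal SQL that refers to them as :name
+placeholders, e.g. with que_eval_sql() from que0que.h inside an active
+transaction trx:
+
+	pars_info_t*	info = pars_info_create();
+	pars_info_add_str_literal(info, "name", "test/t1");
+	dberr_t	err = que_eval_sql(info,
+		"PROCEDURE P () IS\n"
+		"BEGIN\n"
+		"DELETE FROM SYS_TABLES WHERE NAME = :name;\n"
+		"END;\n", trx);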
*/ +struct pars_info_t { + mem_heap_t* heap; /*!< our own memory heap */ + + ib_vector_t* funcs; /*!< user functions, or NULL + (pars_user_func_t*) */ + ib_vector_t* bound_lits; /*!< bound literals, or NULL + (pars_bound_lit_t*) */ + ib_vector_t* bound_ids; /*!< bound ids, or NULL + (pars_bound_id_t*) */ +}; + +inline void pars_info_free(pars_info_t *info) { mem_heap_free(info->heap); } + +/** User-supplied function and argument. */ +struct pars_user_func_t { + const char* name; /*!< function name */ + pars_user_func_cb_t func; /*!< function address */ + void* arg; /*!< user-supplied argument */ +}; + +/** Bound literal. */ +struct pars_bound_lit_t { + const char* name; /*!< name */ + const void* address; /*!< address */ + ulint length; /*!< length of data */ + ulint type; /*!< type, e.g. DATA_FIXBINARY */ + ulint prtype; /*!< precise type, e.g. DATA_UNSIGNED */ + sym_node_t* node; /*!< symbol node */ +}; + +/** Bound identifier. */ +struct pars_bound_id_t { + const char* name; /*!< name */ + const char* id; /*!< identifier */ +}; + +/** Struct used to denote a reserved word in a parsing tree */ +struct pars_res_word_t{ + int code; /*!< the token code for the reserved word from + pars0grm.h */ +}; + +/** A predefined function or operator node in a parsing tree; this construct +is also used for some non-functions like the assignment ':=' */ +struct func_node_t{ + que_common_t common; /*!< type: QUE_NODE_FUNC */ + int func; /*!< token code of the function name */ + ulint fclass; /*!< class of the function */ + que_node_t* args; /*!< argument(s) of the function */ + UT_LIST_NODE_T(func_node_t) cond_list; + /*!< list of comparison conditions; defined + only for comparison operator nodes except, + presently, for OPT_SCROLL_TYPE ones */ + UT_LIST_NODE_T(func_node_t) func_node_list; + /*!< list of function nodes in a parsed + query graph */ +}; + +/** An order-by node in a select */ +struct order_node_t{ + que_common_t common; /*!< type: QUE_NODE_ORDER */ + sym_node_t* column; /*!< order-by column */ + ibool asc; /*!< TRUE if ascending, FALSE if descending */ +}; + +/** Procedure definition node */ +struct proc_node_t{ + que_common_t common; /*!< type: QUE_NODE_PROC */ + sym_node_t* proc_id; /*!< procedure name symbol in the symbol + table of this same procedure */ + que_node_t* stat_list; /*!< statement list */ + sym_tab_t* sym_tab; /*!< symbol table of this procedure */ +}; + +/** elsif-element node */ +struct elsif_node_t{ + que_common_t common; /*!< type: QUE_NODE_ELSIF */ + que_node_t* cond; /*!< if condition */ + que_node_t* stat_list; /*!< statement list */ +}; + +/** if-statement node */ +struct if_node_t{ + que_common_t common; /*!< type: QUE_NODE_IF */ + que_node_t* cond; /*!< if condition */ + que_node_t* stat_list; /*!< statement list */ + que_node_t* else_part; /*!< else-part statement list */ + elsif_node_t* elsif_list; /*!< elsif element list */ +}; + +/** while-statement node */ +struct while_node_t{ + que_common_t common; /*!< type: QUE_NODE_WHILE */ + que_node_t* cond; /*!< while condition */ + que_node_t* stat_list; /*!< statement list */ +}; + +/** for-loop-statement node */ +struct for_node_t{ + que_common_t common; /*!< type: QUE_NODE_FOR */ + sym_node_t* loop_var; /*!< loop variable: this is the + dereferenced symbol from the + variable declarations, not the + symbol occurrence in the for loop + definition */ + que_node_t* loop_start_limit;/*!< initial value of loop variable */ + que_node_t* loop_end_limit; /*!< end value of loop variable */ + lint loop_end_value; /*!<
evaluated value for the end value: + it is calculated only when the loop + is entered, and will not change within + the loop */ + que_node_t* stat_list; /*!< statement list */ +}; + +/** exit statement node */ +struct exit_node_t{ + que_common_t common; /*!< type: QUE_NODE_EXIT */ +}; + +/** return-statement node */ +struct return_node_t{ + que_common_t common; /*!< type: QUE_NODE_RETURN */ +}; + +/** Assignment statement node */ +struct assign_node_t{ + que_common_t common; /*!< type: QUE_NODE_ASSIGNMENT */ + sym_node_t* var; /*!< variable to set */ + que_node_t* val; /*!< value to assign */ +}; + +/** Column assignment node */ +struct col_assign_node_t{ + que_common_t common; /*!< type: QUE_NODE_COL_ASSIGN */ + sym_node_t* col; /*!< column to set */ + que_node_t* val; /*!< value to assign */ +}; + +/** Classes of functions */ +/* @{ */ +#define PARS_FUNC_ARITH 1 /*!< +, -, *, / */ +#define PARS_FUNC_LOGICAL 2 /*!< AND, OR, NOT */ +#define PARS_FUNC_CMP 3 /*!< comparison operators */ +#define PARS_FUNC_PREDEFINED 4 /*!< TO_NUMBER, SUBSTR, ... */ +#define PARS_FUNC_AGGREGATE 5 /*!< COUNT */ +#define PARS_FUNC_OTHER 6 /*!< these are not real functions, + e.g., := */ +/* @} */ + +#endif diff --git a/storage/innobase/include/pars0sym.h b/storage/innobase/include/pars0sym.h new file mode 100644 index 00000000..59f6cc31 --- /dev/null +++ b/storage/innobase/include/pars0sym.h @@ -0,0 +1,243 @@ +/***************************************************************************** + +Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2019, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/pars0sym.h +SQL parser symbol table + +Created 12/15/1997 Heikki Tuuri +*******************************************************/ + +#ifndef pars0sym_h +#define pars0sym_h + +#include "que0types.h" +#include "pars0types.h" +#include "row0types.h" + +/******************************************************************//** +Creates a symbol table for a single stored procedure or query. +@return own: symbol table */ +sym_tab_t* +sym_tab_create( +/*===========*/ + mem_heap_t* heap); /*!< in: memory heap where to create */ +/******************************************************************//** +Frees the memory allocated dynamically AFTER parsing phase for variables +etc. in the symbol table. Does not free the mem heap where the table was +originally created. Frees also SQL explicit cursor definitions. */ +void +sym_tab_free_private( +/*=================*/ + sym_tab_t* sym_tab); /*!< in, own: symbol table */ +/******************************************************************//** +Adds an integer literal to a symbol table. 
+@return symbol table node */ +sym_node_t* +sym_tab_add_int_lit( +/*================*/ + sym_tab_t* sym_tab, /*!< in: symbol table */ + ulint val); /*!< in: integer value */ +/******************************************************************//** +Adds a string literal to a symbol table. +@return symbol table node */ +sym_node_t* +sym_tab_add_str_lit( +/*================*/ + sym_tab_t* sym_tab, /*!< in: symbol table */ + const byte* str, /*!< in: string with no quotes around + it */ + ulint len); /*!< in: string length */ +/******************************************************************//** +Add a bound literal to a symbol table. +@return symbol table node */ +sym_node_t* +sym_tab_add_bound_lit( +/*==================*/ + sym_tab_t* sym_tab, /*!< in: symbol table */ + const char* name, /*!< in: name of bound literal */ + ulint* lit_type); /*!< out: type of literal (PARS_*_LIT) */ +/********************************************************************** +Rebind literal to a node in the symbol table. */ +sym_node_t* +sym_tab_rebind_lit( +/*===============*/ + /* out: symbol table node */ + sym_node_t* node, /* in: node that is bound to literal*/ + const void* address, /* in: pointer to data */ + ulint length); /* in: length of data */ +/******************************************************************//** +Adds an SQL null literal to a symbol table. +@return symbol table node */ +sym_node_t* +sym_tab_add_null_lit( +/*=================*/ + sym_tab_t* sym_tab); /*!< in: symbol table */ +/******************************************************************//** +Adds an identifier to a symbol table. +@return symbol table node */ +sym_node_t* +sym_tab_add_id( +/*===========*/ + sym_tab_t* sym_tab, /*!< in: symbol table */ + byte* name, /*!< in: identifier name */ + ulint len); /*!< in: identifier length */ + +/******************************************************************//** +Add a bound identifier to a symbol table. +@return symbol table node */ +sym_node_t* +sym_tab_add_bound_id( +/*===========*/ + sym_tab_t* sym_tab, /*!< in: symbol table */ + const char* name); /*!< in: name of bound id */ + +/** Index of sym_node_t::field_nos corresponding to the clustered index */ +#define SYM_CLUST_FIELD_NO 0 +/** Index of sym_node_t::field_nos corresponding to a secondary index */ +#define SYM_SEC_FIELD_NO 1 + +/** Types of a symbol table node */ +enum sym_tab_entry { + SYM_UNSET, /*!< Unset entry. */ + SYM_VAR = 91, /*!< declared parameter or local + variable of a procedure */ + SYM_IMPLICIT_VAR, /*!< storage for an intermediate result + of a calculation */ + SYM_LIT, /*!< literal */ + SYM_TABLE_REF_COUNTED, /*!< database table name, ref counted. Must + be closed explicitly. */ + SYM_TABLE, /*!< database table name */ + SYM_COLUMN, /*!< database table column */ + SYM_CURSOR, /*!< named cursor */ + SYM_PROCEDURE_NAME, /*!< stored procedure name */ + SYM_INDEX, /*!< database index name */ + SYM_FUNCTION /*!< user function name */ +}; + +/** Symbol table node */ +struct sym_node_t{ + que_common_t common; /*!< node type: + QUE_NODE_SYMBOL */ + /* NOTE: if the data field in 'common.val' is not NULL and the symbol + table node is not for a temporary column, the memory for the value has + been allocated from dynamic memory and it should be freed when the + symbol table is discarded */ + + /* 'alias' and 'indirection' are almost the same, but not quite.
+ 'alias' always points to the primary instance of the variable, while + 'indirection' does the same only if we should use the primary + instance's values for the node's data. This is usually the case, but + when initializing a cursor (e.g., "DECLARE CURSOR c IS SELECT * FROM + t WHERE id = x;"), we copy the values from the primary instance to + the cursor's instance so that they are fixed for the duration of the + cursor, and set 'indirection' to NULL. If we did not, the value of + 'x' could change between fetches and things would break horribly. + + TODO: It would be cleaner to make 'indirection' a boolean field and + always use 'alias' to refer to the primary node. */ + + sym_node_t* indirection; /*!< pointer to + another symbol table + node which contains + the value for this + node, NULL otherwise */ + sym_node_t* alias; /*!< pointer to + another symbol table + node for which this + node is an alias, + NULL otherwise */ + UT_LIST_NODE_T(sym_node_t) col_var_list; /*!< list of table + columns or a list of + input variables for an + explicit cursor */ + ibool copy_val; /*!< TRUE if a column + and its value should + be copied to dynamic + memory when fetched */ + ulint field_nos[2]; /*!< if a column, in + the position + SYM_CLUST_FIELD_NO is + the field number in the + clustered index; in + the position + SYM_SEC_FIELD_NO + the field number in the + non-clustered index to + use first; if not found + from the index, then + ULINT_UNDEFINED */ + ibool resolved; /*!< TRUE if the + meaning of a variable + or a column has been + resolved; for literals + this is always TRUE */ + enum sym_tab_entry token_type; /*!< type of the + parsed token */ + const char* name; /*!< name of an id */ + ulint name_len; /*!< id name length */ + dict_table_t* table; /*!< table definition + if a table id or a + column id */ + ulint col_no; /*!< column number if a + column */ + sel_buf_t* prefetch_buf; /*!< NULL, or a buffer + for cached column + values for prefetched + rows */ + sel_node_t* cursor_def; /*!< cursor definition + select node if a + named cursor */ + ulint param_type; /*!< PARS_INPUT, + PARS_OUTPUT, or + PARS_NOT_PARAM if not a + procedure parameter */ + sym_tab_t* sym_table; /*!< back pointer to + the symbol table */ + UT_LIST_NODE_T(sym_node_t) sym_list; /*!< list of symbol + nodes */ + sym_node_t* like_node; /* LIKE operator node*/ +}; + +/** Symbol table */ +struct sym_tab_t{ + que_t* query_graph; + /*!< query graph generated by the + parser */ + const char* sql_string; + /*!< SQL string to parse */ + size_t string_len; + /*!< SQL string length */ + size_t next_char_pos; + /*!< position of the next character in + sql_string to give to the lexical + analyzer */ + pars_info_t* info; /*!< extra information, or NULL */ + sym_node_list_t sym_list; + /*!< list of symbol nodes in the symbol + table */ + UT_LIST_BASE_NODE_T(func_node_t) + func_node_list; + /*!< list of function nodes in the + parsed query graph */ + mem_heap_t* heap; /*!< memory heap from which we can + allocate space */ +}; + +#endif diff --git a/storage/innobase/include/pars0types.h b/storage/innobase/include/pars0types.h new file mode 100644 index 00000000..f5b69522 --- /dev/null +++ b/storage/innobase/include/pars0types.h @@ -0,0 +1,50 @@ +/***************************************************************************** + +Copyright (c) 1998, 2009, Oracle and/or its affiliates. All Rights Reserved. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/pars0types.h +SQL parser global types + +Created 1/11/1998 Heikki Tuuri +*******************************************************/ + +#ifndef pars0types_h +#define pars0types_h + +struct pars_info_t; +struct pars_user_func_t; +struct pars_bound_lit_t; +struct pars_bound_id_t; +struct sym_node_t; +struct sym_tab_t; +struct pars_res_word_t; +struct func_node_t; +struct order_node_t; +struct proc_node_t; +struct elsif_node_t; +struct if_node_t; +struct while_node_t; +struct for_node_t; +struct exit_node_t; +struct return_node_t; +struct assign_node_t; +struct col_assign_node_t; + +typedef UT_LIST_BASE_NODE_T(sym_node_t) sym_node_list_t; + +#endif diff --git a/storage/innobase/include/que0que.h b/storage/innobase/include/que0que.h new file mode 100644 index 00000000..c60f390a --- /dev/null +++ b/storage/innobase/include/que0que.h @@ -0,0 +1,314 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/que0que.h +Query graph + +Created 5/27/1996 Heikki Tuuri +*******************************************************/ + +#ifndef que0que_h +#define que0que_h + +#include "data0data.h" +#include "trx0trx.h" +#include "trx0roll.h" +#include "srv0srv.h" +#include "que0types.h" +#include "row0types.h" +#include "pars0types.h" + +/***********************************************************************//** +Creates a query graph fork node. +@return own: fork node */ +que_fork_t *que_fork_create(mem_heap_t* heap); +/***********************************************************************//** +Gets the first thr in a fork. */ +UNIV_INLINE +que_thr_t* +que_fork_get_first_thr( +/*===================*/ + que_fork_t* fork); /*!< in: query fork */ +/***********************************************************************//** +Gets the child node of the first thr in a fork. 
*/ +UNIV_INLINE +que_node_t* +que_fork_get_child( +/*===============*/ + que_fork_t* fork); /*!< in: query fork */ +/***********************************************************************//** +Sets the parent of a graph node. */ +UNIV_INLINE +void +que_node_set_parent( +/*================*/ + que_node_t* node, /*!< in: graph node */ + que_node_t* parent);/*!< in: parent */ +/** Creates a query graph thread node. +@param[in] parent parent node, i.e., a fork node +@param[in] heap memory heap where created +@param[in] prebuilt row prebuilt structure +@return own: query thread node */ +que_thr_t* +que_thr_create( + que_fork_t* parent, + mem_heap_t* heap, + row_prebuilt_t* prebuilt); +/**********************************************************************//** +Frees a query graph, but not the heap where it was created. Does not free +explicit cursor declarations, they are freed in que_graph_free. */ +void +que_graph_free_recursive( +/*=====================*/ + que_node_t* node); /*!< in: query graph node */ +/**********************************************************************//** +Frees a query graph. */ +void +que_graph_free( +/*===========*/ + que_t* graph); /*!< in: query graph; we assume that the memory + heap where this graph was created is private + to this graph: if not, then use + que_graph_free_recursive and free the heap + afterwards! */ + +/**********************************************************************//** +Run a query thread. Handles lock waits. */ +void +que_run_threads( +/*============*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Starts execution of a command in a query fork. Picks a query thread which +is not in the QUE_THR_RUNNING state and moves it to that state. If none +can be chosen, a situation which may arise in parallelized fetches, NULL +is returned. +@return a query thread of the graph moved to QUE_THR_RUNNING state, or +NULL; the query thread should be executed by que_run_threads by the +caller */ +que_thr_t* +que_fork_start_command( +/*===================*/ + que_fork_t* fork); /*!< in: a query fork */ +/***********************************************************************//** +Gets the trx of a query thread. */ +UNIV_INLINE +trx_t* +thr_get_trx( +/*========*/ + que_thr_t* thr); /*!< in: query thread */ +/***********************************************************************//** +Gets the type of a graph node. */ +UNIV_INLINE +ulint +que_node_get_type( +/*==============*/ + const que_node_t* node); /*!< in: graph node */ +/***********************************************************************//** +Gets pointer to the value data type field of a graph node. */ +UNIV_INLINE +dtype_t* +que_node_get_data_type( +/*===================*/ + que_node_t* node); /*!< in: graph node */ +/***********************************************************************//** +Gets pointer to the value dfield of a graph node. */ +UNIV_INLINE +dfield_t* +que_node_get_val( +/*=============*/ + que_node_t* node); /*!< in: graph node */ +/***********************************************************************//** +Gets the value buffer size of a graph node. +@return val buffer size, not defined if val.data == NULL in node */ +UNIV_INLINE +ulint +que_node_get_val_buf_size( +/*======================*/ + que_node_t* node); /*!< in: graph node */ +/***********************************************************************//** +Sets the value buffer size of a graph node. 
*/ +UNIV_INLINE +void +que_node_set_val_buf_size( +/*======================*/ + que_node_t* node, /*!< in: graph node */ + ulint size); /*!< in: size */ +/*********************************************************************//** +Gets the next list node in a list of query graph nodes. */ +UNIV_INLINE +que_node_t* +que_node_get_next( +/*==============*/ + que_node_t* node); /*!< in: node in a list */ +/*********************************************************************//** +Gets the parent node of a query graph node. +@return parent node or NULL */ +UNIV_INLINE +que_node_t* +que_node_get_parent( +/*================*/ + que_node_t* node); /*!< in: node */ +/****************************************************************//** +Get the first containing loop node (e.g. while_node_t or for_node_t) for the +given node, or NULL if the node is not within a loop. +@return containing loop node, or NULL. */ +que_node_t* +que_node_get_containing_loop_node( +/*==============================*/ + que_node_t* node); /*!< in: node */ +/*********************************************************************//** +Catenates a query graph node to a list of them, possibly empty list. +@return one-way list of nodes */ +UNIV_INLINE +que_node_t* +que_node_list_add_last( +/*===================*/ + que_node_t* node_list, /*!< in: node list, or NULL */ + que_node_t* node); /*!< in: node */ +/************************************************************************* +Get the last node from the list.*/ +UNIV_INLINE +que_node_t* +que_node_list_get_last( +/*===================*/ + /* out: last node from list.*/ + que_node_t* node_list); /* in: node list, or NULL */ +/*********************************************************************//** +Gets a query graph node list length. +@return length, for NULL list 0 */ +UNIV_INLINE +ulint +que_node_list_get_len( +/*==================*/ + que_node_t* node_list); /*!< in: node list, or NULL */ +/*********************************************************************//** +Evaluate the given SQL. +@return error code or DB_SUCCESS */ +dberr_t +que_eval_sql( +/*=========*/ + pars_info_t* info, /*!< in: info struct, or NULL */ + const char* sql, /*!< in: SQL string */ + trx_t* trx); /*!< in: trx */ + +/**********************************************************************//** +Round robin scheduler.
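+(Editorial note, not part of the upstream comment: the caller passes the
+previously returned thread in thr, or NULL on the first call; threads
+are handed out in the order of the fork's thread list.)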
+@return a query thread of the graph moved to QUE_THR_RUNNING state, or +NULL; the query thread should be executed by que_run_threads by the +caller */ +que_thr_t* +que_fork_scheduler_round_robin( +/*===========================*/ + que_fork_t* fork, /*!< in: a query fork */ + que_thr_t* thr); /*!< in: current pos */ + +/** Query thread states */ +enum que_thr_state_t { + /** in selects this means that the thread is at the end of its + result set (or start, in case of a scroll cursor); in other + statements, this means the thread has done its task */ + QUE_THR_COMPLETED, + QUE_THR_RUNNING +}; + +/** Query thread lock states */ +enum que_thr_lock_t { + QUE_THR_LOCK_NOLOCK, + QUE_THR_LOCK_ROW, + QUE_THR_LOCK_TABLE +}; + +/* Query graph query thread node: the fields are protected by the +trx_t::mutex with the exceptions named below */ + +struct que_thr_t{ + que_common_t common; /*!< type: QUE_NODE_THR */ + que_node_t* child; /*!< graph child node */ + que_t* graph; /*!< graph where this node belongs */ + que_thr_state_t state; /*!< state of the query thread */ + /*------------------------------*/ + /* The following fields are private to the OS thread executing the + query thread, and are not protected by any mutex: */ + + que_node_t* run_node; /*!< pointer to the node where the + subgraph down from this node is + currently executed */ + que_node_t* prev_node; /*!< pointer to the node from which + the control came */ + ulint resource; /*!< resource usage of the query thread + thus far */ + ulint lock_state; /*!< lock state of thread (table or + row) */ + /*------------------------------*/ + /* The following fields are links for the various lists that + this type can be on. */ + UT_LIST_NODE_T(que_thr_t) + thrs; /*!< list of thread nodes of the fork + node */ + UT_LIST_NODE_T(que_thr_t) + queue; /*!< list of runnable thread nodes in + the server task queue */ + ulint fk_cascade_depth; /*!< maximum cascading call depth + supported for foreign key constraint + related delete/updates */ + row_prebuilt_t* prebuilt; /*!< prebuilt structure processed by + the query thread */ +}; + +/* Query graph fork node: its fields are protected by the query thread mutex */ +struct que_fork_t{ + que_common_t common; /*!< type: QUE_NODE_FORK */ + que_t* graph; /*!< query graph of this node */ + trx_t* trx; /*!< transaction: this is set only in + the root node */ + ulint state; /*!< state of the fork node */ + que_thr_t* caller; /*!< pointer to a possible calling query + thread */ + UT_LIST_BASE_NODE_T(que_thr_t) + thrs; /*!< list of query threads */ + /*------------------------------*/ + /* The fields in this section are defined only in the root node */ + sym_tab_t* sym_tab; /*!< symbol table of the query, + generated by the parser, or NULL + if the graph was created 'by hand' */ + pars_info_t* info; /*!< info struct, or NULL */ + + sel_node_t* last_sel_node; /*!< last executed select node, or NULL + if none */ + UT_LIST_NODE_T(que_fork_t) + graphs; /*!< list of query graphs of a session + or a stored procedure */ + /*------------------------------*/ + mem_heap_t* heap; /*!< memory heap where the fork was + created */ + +}; + +/* Query fork (or graph) states */ +#define QUE_FORK_ACTIVE 1 +#define QUE_FORK_COMMAND_WAIT 2 + +/* Flag which is ORed to control structure statement node types */ +#define QUE_NODE_CONTROL_STAT 1024 + +#include "que0que.inl" + +#endif diff --git a/storage/innobase/include/que0que.inl b/storage/innobase/include/que0que.inl new file mode 100644 index 00000000..e21cbad3 --- /dev/null +++ 
b/storage/innobase/include/que0que.inl @@ -0,0 +1,245 @@ +/***************************************************************************** + +Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2020, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/que0que.inl +Query graph + +Created 5/27/1996 Heikki Tuuri +*******************************************************/ + +/***********************************************************************//** +Gets the trx of a query thread. */ +UNIV_INLINE +trx_t* +thr_get_trx( +/*========*/ + que_thr_t* thr) /*!< in: query thread */ +{ + ut_ad(thr); + + return(thr->graph->trx); +} + +/***********************************************************************//** +Gets the first thr in a fork. */ +UNIV_INLINE +que_thr_t* +que_fork_get_first_thr( +/*===================*/ + que_fork_t* fork) /*!< in: query fork */ +{ + return(UT_LIST_GET_FIRST(fork->thrs)); +} + +/***********************************************************************//** +Gets the child node of the first thr in a fork. */ +UNIV_INLINE +que_node_t* +que_fork_get_child( +/*===============*/ + que_fork_t* fork) /*!< in: query fork */ +{ + que_thr_t* thr; + + thr = UT_LIST_GET_FIRST(fork->thrs); + + return(thr->child); +} + +/***********************************************************************//** +Gets the type of a graph node. */ +UNIV_INLINE +ulint +que_node_get_type( +/*==============*/ + const que_node_t* node) /*!< in: graph node */ +{ + return(reinterpret_cast<const que_common_t*>(node)->type); +} + +/***********************************************************************//** +Gets pointer to the value dfield of a graph node. */ +UNIV_INLINE +dfield_t* +que_node_get_val( +/*=============*/ + que_node_t* node) /*!< in: graph node */ +{ + ut_ad(node); + + return(&(((que_common_t*) node)->val)); +} + +/***********************************************************************//** +Gets the value buffer size of a graph node. +@return val buffer size, not defined if val.data == NULL in node */ +UNIV_INLINE +ulint +que_node_get_val_buf_size( +/*======================*/ + que_node_t* node) /*!< in: graph node */ +{ + ut_ad(node); + + return(((que_common_t*) node)->val_buf_size); +} + +/***********************************************************************//** +Sets the value buffer size of a graph node. */ +UNIV_INLINE +void +que_node_set_val_buf_size( +/*======================*/ + que_node_t* node, /*!< in: graph node */ + ulint size) /*!< in: size */ +{ + ut_ad(node); + + ((que_common_t*) node)->val_buf_size = size; +} + +/***********************************************************************//** +Sets the parent of a graph node.
*/ +UNIV_INLINE +void +que_node_set_parent( +/*================*/ + que_node_t* node, /*!< in: graph node */ + que_node_t* parent) /*!< in: parent */ +{ + ut_ad(node); + + ((que_common_t*) node)->parent = parent; +} + +/***********************************************************************//** +Gets pointer to the value data type field of a graph node. */ +UNIV_INLINE +dtype_t* +que_node_get_data_type( +/*===================*/ + que_node_t* node) /*!< in: graph node */ +{ + ut_ad(node); + + return(dfield_get_type(&((que_common_t*) node)->val)); +} + +/*********************************************************************//** +Catenates a query graph node to a list of them, possibly empty list. +@return one-way list of nodes */ +UNIV_INLINE +que_node_t* +que_node_list_add_last( +/*===================*/ + que_node_t* node_list, /*!< in: node list, or NULL */ + que_node_t* node) /*!< in: node */ +{ + que_common_t* cnode; + que_common_t* cnode2; + + cnode = (que_common_t*) node; + + cnode->brother = NULL; + + if (node_list == NULL) { + + return(node); + } + + cnode2 = (que_common_t*) node_list; + + while (cnode2->brother != NULL) { + cnode2 = (que_common_t*) cnode2->brother; + } + + cnode2->brother = node; + + return(node_list); +} + +/************************************************************************* +Gets the last node from the list.*/ +UNIV_INLINE +que_node_t* +que_node_list_get_last( +/*===================*/ + /* out: last node in list.*/ + que_node_t* node_list) /* in: node list */ +{ + que_common_t* node; + + ut_a(node_list != NULL); + + node = (que_common_t*) node_list; + + /* We need the last element */ + while (node->brother != NULL) { + node = (que_common_t*) node->brother; + } + + return(node); +} +/*********************************************************************//** +Gets the next list node in a list of query graph nodes. +@return next node in a list of nodes */ +UNIV_INLINE +que_node_t* +que_node_get_next( +/*==============*/ + que_node_t* node) /*!< in: node in a list */ +{ + return(((que_common_t*) node)->brother); +} + +/*********************************************************************//** +Gets a query graph node list length. +@return length, for NULL list 0 */ +UNIV_INLINE +ulint +que_node_list_get_len( +/*==================*/ + que_node_t* node_list) /*!< in: node list, or NULL */ +{ + const que_common_t* cnode; + ulint len; + + cnode = (const que_common_t*) node_list; + len = 0; + + while (cnode != NULL) { + len++; + cnode = (const que_common_t*) cnode->brother; + } + + return(len); +} + +/*********************************************************************//** +Gets the parent node of a query graph node. +@return parent node or NULL */ +UNIV_INLINE +que_node_t* +que_node_get_parent( +/*================*/ + que_node_t* node) /*!< in: node */ +{ + return(((que_common_t*) node)->parent); +} diff --git a/storage/innobase/include/que0types.h b/storage/innobase/include/que0types.h new file mode 100644 index 00000000..38f6e380 --- /dev/null +++ b/storage/innobase/include/que0types.h @@ -0,0 +1,97 @@ +/***************************************************************************** + +Copyright (c) 1996, 2009, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License.
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/que0types.h +Query graph global types + +Created 5/27/1996 Heikki Tuuri +*******************************************************/ + +#ifndef que0types_h +#define que0types_h + +#include "data0data.h" + +/* Pseudotype for all graph nodes */ +typedef void que_node_t; + +/* Query graph root is a fork node */ +typedef struct que_fork_t que_t; + +struct row_prebuilt_t; +struct que_thr_t; + +/* Query graph node types */ +#define QUE_NODE_LOCK 1 +#define QUE_NODE_INSERT 2 +#define QUE_NODE_UPDATE 4 +#define QUE_NODE_CURSOR 5 +#define QUE_NODE_SELECT 6 +#define QUE_NODE_AGGREGATE 7 +#define QUE_NODE_FORK 8 +#define QUE_NODE_THR 9 +#define QUE_NODE_UNDO 10 +#define QUE_NODE_COMMIT 11 +#define QUE_NODE_ROLLBACK 12 +#define QUE_NODE_PURGE 13 +#define QUE_NODE_CREATE_TABLE 14 +#define QUE_NODE_CREATE_INDEX 15 +#define QUE_NODE_SYMBOL 16 +#define QUE_NODE_RES_WORD 17 +#define QUE_NODE_FUNC 18 +#define QUE_NODE_ORDER 19 +#define QUE_NODE_PROC (20 + QUE_NODE_CONTROL_STAT) +#define QUE_NODE_IF (21 + QUE_NODE_CONTROL_STAT) +#define QUE_NODE_WHILE (22 + QUE_NODE_CONTROL_STAT) +#define QUE_NODE_ASSIGNMENT 23 +#define QUE_NODE_FETCH 24 +#define QUE_NODE_OPEN 25 +#define QUE_NODE_COL_ASSIGNMENT 26 +#define QUE_NODE_FOR (27 + QUE_NODE_CONTROL_STAT) +#define QUE_NODE_RETURN 28 +#define QUE_NODE_ROW_PRINTF 29 +#define QUE_NODE_ELSIF 30 +#define QUE_NODE_CALL 31 +#define QUE_NODE_EXIT 32 + +/* Common struct at the beginning of each query graph node; the name of this +substruct must be 'common' */ + +struct que_common_t{ + ulint type; /*!< query node type */ + que_node_t* parent; /*!< back pointer to parent node, or NULL */ + que_node_t* brother;/* pointer to a possible brother node */ + dfield_t val; /*!< evaluated value for an expression */ + ulint val_buf_size; + /* buffer size for the evaluated value data, + if the buffer has been allocated dynamically: + if this field is != 0, and the node is a + symbol node or a function node, then we + have to free the data field in val + explicitly */ + + /** Constructor */ + que_common_t(ulint type, que_node_t* parent) : + type(type), parent(parent), brother(NULL), + val(), val_buf_size(0) + {} +}; + +#endif diff --git a/storage/innobase/include/read0types.h b/storage/innobase/include/read0types.h new file mode 100644 index 00000000..e002f1b7 --- /dev/null +++ b/storage/innobase/include/read0types.h @@ -0,0 +1,275 @@ +/***************************************************************************** + +Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/read0types.h +Cursor read + +Created 2/16/1997 Heikki Tuuri +*******************************************************/ + +#pragma once + +#include "dict0mem.h" +#include "trx0types.h" +#include "srw_lock.h" +#include <algorithm> + +/** + Read view lists the trx ids of those transactions for which a consistent read + should not see the modifications to the database. +*/ +class ReadViewBase +{ + /** + The read should not see any transaction with trx id >= this value. + In other words, this is the "high water mark". + */ + trx_id_t m_low_limit_id= 0; + + /** + The read should see all trx ids which are strictly + smaller (<) than this value. In other words, this is the + "low water mark". + */ + trx_id_t m_up_limit_id; + + /** Set of RW transactions that were active when this snapshot was taken */ + trx_ids_t m_ids; + + /** + The view does not need to see the undo logs for transactions whose + transaction number is strictly smaller (<) than this value: they can be + removed in purge if not needed by other views. + */ + trx_id_t m_low_limit_no; + +protected: + bool empty() { return m_ids.empty(); } + + /** @return the up limit id */ + trx_id_t up_limit_id() const { return m_up_limit_id; } + +public: + /** + Append state from another view. + + This method is used to find min(m_low_limit_no), min(m_low_limit_id) and + all transaction ids below min(m_low_limit_id). These values effectively + form the oldest view. + + @param other view to copy from + */ + void append(const ReadViewBase &other) + { + ut_ad(&other != this); + if (m_low_limit_no > other.m_low_limit_no) + m_low_limit_no= other.m_low_limit_no; + if (m_low_limit_id > other.m_low_limit_id) + m_low_limit_id= other.m_low_limit_id; + + trx_ids_t::iterator dst= m_ids.begin(); + for (const trx_id_t id : other.m_ids) + { + if (id >= m_low_limit_id) + break; +loop: + if (dst == m_ids.end()) + { + m_ids.push_back(id); + dst= m_ids.end(); + continue; + } + if (*dst < id) + { + dst++; + goto loop; + } + else if (*dst > id) + dst= m_ids.insert(dst, id) + 1; + } + m_ids.erase(std::lower_bound(dst, m_ids.end(), m_low_limit_id), + m_ids.end()); + + m_up_limit_id= m_ids.empty() ? m_low_limit_id : m_ids.front(); + ut_ad(m_up_limit_id <= m_low_limit_id); + } + + + /** + Creates a snapshot where exactly the transactions serialized before this + point in time are seen in the view. + + @param[in,out] trx transaction + */ + inline void snapshot(trx_t *trx); + + + /** + Check whether the changes by id are visible. + @param[in] id transaction id to check against the view + @return whether the view sees the modifications of id.
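+
+ (Editorial worked example, not part of the upstream comment: with
+ m_up_limit_id=90, m_low_limit_id=100 and m_ids={90,95}, id 85 is
+ visible since it committed before the snapshot; id 93 is visible
+ since it is below the high water mark and absent from m_ids; ids 90
+ and 95 are not visible since they were active at snapshot time; id
+ 100 is not visible since it began at or after the snapshot.)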
+ */ + bool changes_visible(trx_id_t id) const + MY_ATTRIBUTE((warn_unused_result)) + { + if (id >= m_low_limit_id) + return false; + return id < m_up_limit_id || + m_ids.empty() || + !std::binary_search(m_ids.begin(), m_ids.end(), id); + } + + /** + @param id transaction to check + @return true if view sees transaction id + */ + bool sees(trx_id_t id) const { return id < m_up_limit_id; } + + /** @return the low limit no */ + trx_id_t low_limit_no() const { return m_low_limit_no; } + + /** @return the low limit id */ + trx_id_t low_limit_id() const { return m_low_limit_id; } + + /** Clamp the low limit id for purge_sys.end_view */ + void clamp_low_limit_id(trx_id_t limit) + { + if (m_low_limit_id > limit) + m_low_limit_id= limit; + } +}; + + +/** A ReadView with extra members required for trx_t::read_view. */ +class ReadView: public ReadViewBase +{ + /** + View state. + + Implemented as atomic to allow mutex-free view close and re-use. + Non-owner thread is allowed to call is_open() alone without mutex + protection as well. E.g. trx_sys.view_count() does this. + + If non-owner thread intends to access other members as well, both + is_open() and other members accesses must be protected by m_mutex. + E.g. copy_to(). + */ + std::atomic<bool> m_open; + + /** For synchronisation with purge coordinator. */ + mutable srw_mutex m_mutex; + + /** + trx id of creating transaction. + Used exclusively by the read view owner thread. + */ + trx_id_t m_creator_trx_id; + +public: + ReadView() + { + memset(reinterpret_cast<void*>(this), 0, sizeof *this); + m_mutex.init(); + } + ~ReadView() { m_mutex.destroy(); } + + + /** + Opens a read view where exactly the transactions serialized before this + point in time are seen in the view. + + View becomes visible to purge thread. Intended to be called by the ReadView + owner thread. + + @param[in,out] trx transaction + */ + void open(trx_t *trx); + + + /** + Closes the view. + + View becomes not visible to purge thread. Intended to be called by the + ReadView owner thread. + */ + void close() { m_open.store(false, std::memory_order_relaxed); } + + + /** Returns true if view is open. */ + bool is_open() const { return m_open.load(std::memory_order_relaxed); } + + + /** + Sets the creator transaction id. + + This should be set only for views created by RW transactions. + Intended to be called by the ReadView owner thread. + */ + void set_creator_trx_id(trx_id_t id) + { + ut_ad(m_creator_trx_id == 0); + m_creator_trx_id= id; + } + + + /** + Writes the limits to the file. + @param file file to write to + */ + void print_limits(FILE *file) const + { + m_mutex.wr_lock(); + if (is_open()) + fprintf(file, "Trx read view will not see trx with" + " id >= " TRX_ID_FMT ", sees < " TRX_ID_FMT "\n", + low_limit_id(), up_limit_id()); + m_mutex.wr_unlock(); + } + + + /** + A wrapper around ReadViewBase::changes_visible(). + Intended to be called by the ReadView owner thread. + */ + bool changes_visible(trx_id_t id) const + { return id == m_creator_trx_id || ReadViewBase::changes_visible(id); } + + /** + A wrapper around ReadViewBase::append(). + Intended to be called by the purge coordinator task. + */ + void append_to(ReadViewBase *to) const + { + m_mutex.wr_lock(); + if (is_open()) + to->append(*this); + m_mutex.wr_unlock(); + } + + /** + Declare the object mostly inaccessible.
diff --git a/storage/innobase/include/rem0cmp.h b/storage/innobase/include/rem0cmp.h
new file mode 100644
index 00000000..3a30f5a9
--- /dev/null
+++ b/storage/innobase/include/rem0cmp.h
@@ -0,0 +1,286 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/rem0cmp.h
+Comparison services for records
+
+Created 7/1/1994 Heikki Tuuri
+************************************************************************/
+
+#pragma once
+
+#include "data0data.h"
+#include "data0type.h"
+#include "rem0types.h"
+#include "page0types.h"
+
+/*************************************************************//**
+Returns TRUE if two columns are equal for comparison purposes.
+@return TRUE if the columns are considered equal in comparisons */
+ibool
+cmp_cols_are_equal(
+/*===============*/
+	const dict_col_t*	col1,	/*!< in: column 1 */
+	const dict_col_t*	col2,	/*!< in: column 2 */
+	ibool			check_charsets);
+				/*!< in: whether to check charsets */
+/** Compare two data fields.
+@param mtype       main type
+@param prtype      precise type
+@param descending  whether to use descending order
+@param data1       data field
+@param len1        length of data1 in bytes, or UNIV_SQL_NULL
+@param data2       data field
+@param len2        length of data2 in bytes, or UNIV_SQL_NULL
+@return the comparison result of data1 and data2
+@retval 0 if data1 is equal to data2
+@retval negative if data1 is less than data2
+@retval positive if data1 is greater than data2 */
+int cmp_data(ulint mtype, ulint prtype, bool descending,
+             const byte *data1, size_t len1, const byte *data2, size_t len2)
+  MY_ATTRIBUTE((warn_unused_result));
+
+/** Compare two data fields.
+@param dfield1     data field; must have type field set
+@param dfield2     data field
+@param descending  whether to use descending order
+@return the comparison result of dfield1 and dfield2
+@retval 0 if dfield1 is equal to dfield2
+@retval negative if dfield1 is less than dfield2
+@retval positive if dfield1 is greater than dfield2 */
+inline int cmp_dfield_dfield(const dfield_t *dfield1, const dfield_t *dfield2,
+                             bool descending= false)
+{
+  ut_ad(dfield_check_typed(dfield1));
+  const dtype_t *type= dfield_get_type(dfield1);
+  return cmp_data(type->mtype, type->prtype, descending,
+                  static_cast<const byte*>(dfield_get_data(dfield1)),
+                  dfield_get_len(dfield1),
+                  static_cast<const byte*>(dfield_get_data(dfield2)),
+                  dfield_get_len(dfield2));
+}
+
+#ifdef UNIV_DEBUG
+/** Compare a GIS data tuple to a physical record.
+@param[in] dtuple  data tuple
+@param[in] rec     R-tree record
+@param[in] mode    compare mode
+@retval negative if dtuple is less than rec */
+int cmp_dtuple_rec_with_gis(const dtuple_t *dtuple, const rec_t *rec,
+                            page_cur_mode_t mode)
+  MY_ATTRIBUTE((nonnull));
+#endif
+
+/** Compare two minimum bounding rectangles.
+@return 1, 0, -1, if a is greater, equal, less than b, respectively */
+inline int cmp_geometry_field(const void *a, const void *b)
+{
+  const byte *mbr1= static_cast<const byte*>(a);
+  const byte *mbr2= static_cast<const byte*>(b);
+
+  static_assert(SPDIMS == 2, "compatibility");
+  static_assert(DATA_MBR_LEN == SPDIMS * 2 * sizeof(double), "compatibility");
+
+  /* Try to compare mbr left lower corner (xmin, ymin) */
+  double x1= mach_double_read(mbr1);
+  double x2= mach_double_read(mbr2);
+  if (x1 > x2)
+    return 1;
+  if (x1 < x2)
+    return -1;
+
+  x1= mach_double_read(mbr1 + sizeof(double) * SPDIMS);
+  x2= mach_double_read(mbr2 + sizeof(double) * SPDIMS);
+
+  if (x1 > x2)
+    return 1;
+  if (x1 < x2)
+    return -1;
+
+  /* left lower corner (xmin, ymin) overlaps, now right upper corner */
+  x1= mach_double_read(mbr1 + sizeof(double));
+  x2= mach_double_read(mbr2 + sizeof(double));
+
+  if (x1 > x2)
+    return 1;
+  if (x1 < x2)
+    return -1;
+
+  x1= mach_double_read(mbr1 + sizeof(double) * 2 + sizeof(double));
+  x2= mach_double_read(mbr2 + sizeof(double) * 2 + sizeof(double));
+
+  if (x1 > x2)
+    return 1;
+  if (x1 < x2)
+    return -1;
+
+  return 0;
+}
+
+/** Compare a data tuple to a physical record.
+@param dtuple          data tuple
+@param rec             B-tree index record
+@param index           B-tree index
+@param offsets         rec_get_offsets(rec,index)
+@param n_cmp           number of fields to compare
+@param matched_fields  number of completely matched fields
+@return the comparison result of dtuple and rec
+@retval 0 if dtuple is equal to rec
+@retval negative if dtuple is less than rec
+@retval positive if dtuple is greater than rec */
+int cmp_dtuple_rec_with_match_low(const dtuple_t *dtuple, const rec_t *rec,
+                                  const dict_index_t *index,
+                                  const rec_offs *offsets,
+                                  ulint n_cmp, ulint *matched_fields)
+  MY_ATTRIBUTE((nonnull));
+#define cmp_dtuple_rec_with_match(tuple,rec,index,offsets,fields) \
+	cmp_dtuple_rec_with_match_low( \
+		tuple,rec,index,offsets,dtuple_get_n_fields_cmp(tuple),fields)
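+/* To make the MBR layout concrete: cmp_geometry_field() above reads the
+four doubles as [xmin][xmax][ymin][ymax] and compares them in the order
+xmin, ymin, xmax, ymax. A hedged sketch of the matching writer follows;
+the helper name is hypothetical, and mach_double_write() is assumed to
+be the counterpart of mach_double_read():
+
+	static void example_pack_mbr(byte *mbr, double xmin, double xmax,
+				     double ymin, double ymax)
+	{
+		mach_double_write(mbr, xmin);
+		mach_double_write(mbr + sizeof(double), xmax);
+		mach_double_write(mbr + 2 * sizeof(double), ymin);
+		mach_double_write(mbr + 3 * sizeof(double), ymax);
+	}
+*/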
+/** Compare a data tuple to a physical record.
+@param[in] dtuple   data tuple
+@param[in] rec      B-tree or R-tree index record
+@param[in] index    index tree
+@param[in] offsets  rec_get_offsets(rec)
+@param[in,out] matched_fields  number of completely matched fields
+@param[in,out] matched_bytes   number of matched bytes in the first
+field that is not matched
+@return the comparison result of dtuple and rec
+@retval 0 if dtuple is equal to rec
+@retval negative if dtuple is less than rec
+@retval positive if dtuple is greater than rec */
+int
+cmp_dtuple_rec_with_match_bytes(
+	const dtuple_t*		dtuple,
+	const rec_t*		rec,
+	const dict_index_t*	index,
+	const rec_offs*		offsets,
+	ulint*			matched_fields,
+	ulint*			matched_bytes)
+	MY_ATTRIBUTE((warn_unused_result));
+/** Compare a data tuple to a physical record.
+@see cmp_dtuple_rec_with_match
+@param dtuple   data tuple
+@param rec      index record
+@param index    index
+@param offsets  rec_get_offsets(rec, index)
+@return the comparison result of dtuple and rec
+@retval 0 if dtuple is equal to rec
+@retval negative if dtuple is less than rec
+@retval positive if dtuple is greater than rec */
+inline int cmp_dtuple_rec(const dtuple_t *dtuple, const rec_t *rec,
+                          const dict_index_t *index, const rec_offs *offsets)
+{
+  ulint matched= 0;
+  return cmp_dtuple_rec_with_match(dtuple, rec, index, offsets, &matched);
+}
+
+/** Check if a dtuple is a prefix of a record.
+@param dtuple   data tuple
+@param rec      index record
+@param index    index
+@param offsets  rec_get_offsets(rec)
+@return whether dtuple is a prefix of rec */
+bool cmp_dtuple_is_prefix_of_rec(const dtuple_t *dtuple, const rec_t *rec,
+                                 const dict_index_t *index,
+                                 const rec_offs *offsets)
+  MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Compare two physical records that contain the same number of columns,
+none of which are stored externally.
+@retval positive if rec1 (including non-ordering columns) is greater than rec2
+@retval negative if rec1 (including non-ordering columns) is less than rec2
+@retval 0 if rec1 is a duplicate of rec2 */
+int
+cmp_rec_rec_simple(
+/*===============*/
+	const rec_t*		rec1,	/*!< in: physical record */
+	const rec_t*		rec2,	/*!< in: physical record */
+	const rec_offs*		offsets1,/*!< in: rec_get_offsets(rec1, ...) */
+	const rec_offs*		offsets2,/*!< in: rec_get_offsets(rec2, ...) */
+	const dict_index_t*	index,	/*!< in: data dictionary index */
+	struct TABLE*		table)	/*!< in: MySQL table, for reporting
+					duplicate key value if applicable,
+					or NULL */
+	MY_ATTRIBUTE((nonnull(1,2,3,4), warn_unused_result));
+
+/** Compare two B-tree or R-tree records.
+Only the common first fields are compared, and externally stored fields
+are treated as equal.
+@param[in] rec1           record (possibly not on an index page)
+@param[in] rec2           B-tree or R-tree record in an index page
+@param[in] offsets1       rec_get_offsets(rec1, index)
+@param[in] offsets2       rec_get_offsets(rec2, index)
+@param[in] index          B-tree or R-tree index
+@param[in] nulls_unequal  true if this is for index cardinality
+                          statistics estimation with
+                          innodb_stats_method=nulls_unequal
+                          or innodb_stats_method=nulls_ignored
+@param[out] matched_fields number of completely matched fields
+                          within the first field not completely matched
+@retval 0 if rec1 is equal to rec2
+@retval negative if rec1 is less than rec2
+@retval positive if rec1 is greater than rec2 */
+int
+cmp_rec_rec(
+	const rec_t*		rec1,
+	const rec_t*		rec2,
+	const rec_offs*		offsets1,
+	const rec_offs*		offsets2,
+	const dict_index_t*	index,
+	bool			nulls_unequal = false,
+	ulint*			matched_fields = NULL)
+	MY_ATTRIBUTE((nonnull(1,2,3,4,5)));
+
+/** Compare two data fields.
+@param dfield1  data field
+@param dfield2  data field
+@return the comparison result of dfield1 and dfield2
+@retval true if dfield1 is equal to dfield2, or dfield2 is a prefix of dfield1
+@retval false otherwise */
+inline bool cmp_dfield_dfield_eq_prefix(const dfield_t *dfield1,
+                                        const dfield_t *dfield2)
+{
+  ut_ad(dfield_check_typed(dfield1));
+  ut_ad(dfield_check_typed(dfield2));
+  const dtype_t *type= dfield_get_type(dfield1);
+
+#ifdef UNIV_DEBUG
+  switch (type->prtype & DATA_MYSQL_TYPE_MASK) {
+  case MYSQL_TYPE_BIT:
+  case MYSQL_TYPE_STRING:
+  case MYSQL_TYPE_VAR_STRING:
+  case MYSQL_TYPE_TINY_BLOB:
+  case MYSQL_TYPE_MEDIUM_BLOB:
+  case MYSQL_TYPE_BLOB:
+  case MYSQL_TYPE_LONG_BLOB:
+  case MYSQL_TYPE_VARCHAR:
+    break;
+  default:
+    ut_error;
+  }
+#endif /* UNIV_DEBUG */
+
+  uint cs_num= dtype_get_charset_coll(type->prtype);
+  CHARSET_INFO *cs= get_charset(cs_num, MYF(MY_WME));
+  ut_a(cs);
+  return !cs->strnncoll(static_cast<const uchar*>(dfield_get_data(dfield1)),
+                        dfield_get_len(dfield1),
+                        static_cast<const uchar*>(dfield_get_data(dfield2)),
+                        dfield_get_len(dfield2), 1);
+}
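As a usage sketch for the comparison entry point above: cmp_data() orders two raw column values given their main and precise type. The fragment below is illustrative only; the literals, lengths, and the DATA_VARCHAR/DATA_ENGLISH type combination are example assumptions, not part of the patch.

/* Sketch: "apple" sorts before "apricot", so the result is negative. */
int example_cmp_varchar()
{
  const char *a= "apple";
  const char *b= "apricot";
  return cmp_data(DATA_VARCHAR, DATA_ENGLISH, false /* ascending */,
                  reinterpret_cast<const byte*>(a), 5,
                  reinterpret_cast<const byte*>(b), 7);
}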
diff --git a/storage/innobase/include/rem0rec.h b/storage/innobase/include/rem0rec.h
new file mode 100644
index 00000000..2f038ab3
--- /dev/null
+++ b/storage/innobase/include/rem0rec.h
@@ -0,0 +1,1276 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/rem0rec.h
+Record manager
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifndef rem0rec_h
+#define rem0rec_h
+
+#ifndef UNIV_INNOCHECKSUM
+#include "data0data.h"
+#include "rem0types.h"
+#include "mtr0types.h"
+#include "page0types.h"
+#include "dict0dict.h"
+#include "trx0types.h"
+#endif /* !UNIV_INNOCHECKSUM */
+#include <ostream>
+#include <sstream>
+
+/* Number of extra bytes in an old-style record,
+in addition to the data and the offsets */
+#define REC_N_OLD_EXTRA_BYTES	6
+/* Number of extra bytes in a new-style record,
+in addition to the data and the offsets */
+#define REC_N_NEW_EXTRA_BYTES	5
+
+#define REC_NEW_STATUS		3	/* This is a single byte bit-field */
+#define REC_NEW_STATUS_MASK	0x7UL
+#define REC_NEW_STATUS_SHIFT	0
+
+/* The following four constants are needed in page0zip.cc in order to
+efficiently compress and decompress pages. */
+
+/* The offset of heap_no in a compact record */
+#define REC_NEW_HEAP_NO		4
+/* The shift of heap_no in a compact record.
+The status is stored in the low-order bits. */
+#define REC_HEAP_NO_SHIFT	3
+
+/* Length of a B-tree node pointer, in bytes */
+#define REC_NODE_PTR_SIZE	4
+
+#ifndef UNIV_INNOCHECKSUM
+/** SQL null flag in a 1-byte offset of ROW_FORMAT=REDUNDANT records */
+constexpr rec_offs REC_1BYTE_SQL_NULL_MASK= 0x80;
+/** SQL null flag in a 2-byte offset of ROW_FORMAT=REDUNDANT records */
+constexpr rec_offs REC_2BYTE_SQL_NULL_MASK= 0x8000;
+
+/** In a 2-byte offset of ROW_FORMAT=REDUNDANT records, the second most
+significant bit denotes that the tail of a field is stored off-page. */
+constexpr rec_offs REC_2BYTE_EXTERN_MASK= 0x4000;
+
+constexpr size_t RECORD_OFFSET= 2;
+constexpr size_t INDEX_OFFSET=
+    RECORD_OFFSET + sizeof(rec_t *) / sizeof(rec_offs);
+#endif /* UNIV_INNOCHECKSUM */
+
+/* Length of the rec_get_offsets() header */
+constexpr size_t REC_OFFS_HEADER_SIZE=
+#ifdef UNIV_DEBUG
+#ifndef UNIV_INNOCHECKSUM
+    sizeof(rec_t *) / sizeof(rec_offs) +
+    sizeof(dict_index_t *) / sizeof(rec_offs) +
+#endif /* UNIV_INNOCHECKSUM */
+#endif /* UNIV_DEBUG */
+    2;
+
+/* Number of elements that should be initially allocated for the
+offsets[] array, first passed to rec_get_offsets() */
+constexpr size_t REC_OFFS_NORMAL_SIZE= 300;
+constexpr size_t REC_OFFS_SMALL_SIZE= 18;
+constexpr size_t REC_OFFS_SEC_INDEX_SIZE=
+    /* PK max key parts */ 16 + /* sec idx max key parts */ 16 +
+    /* child page number for non-leaf pages */ 1;
+
+/** Get the base address of offsets. The extra_size is stored at
+this position, and following positions hold the end offsets of
+the fields. */
+#define rec_offs_base(offsets) (offsets + REC_OFFS_HEADER_SIZE)
+
+#ifndef UNIV_INNOCHECKSUM
+/* An offset consists of two parts: the 2 upper bits are the type,
+all other bits are the value. */
+
+/** Only 4 different values are possible! */
+enum field_type_t
+{
+  /** normal field */
+  STORED_IN_RECORD= 0 << 14,
+  /** this field is stored off-page */
+  STORED_OFFPAGE= 1 << 14,
+  /** just an SQL NULL */
+  SQL_NULL= 2 << 14,
+  /** instantly added field */
+  DEFAULT= 3 << 14,
+};
+
+/** without 2 upper bits */
+static constexpr rec_offs DATA_MASK= 0x3fff;
+/** 2 upper bits */
+static constexpr rec_offs TYPE_MASK= ~DATA_MASK;
+inline field_type_t get_type(rec_offs n)
+{
+  return static_cast<field_type_t>(n & TYPE_MASK);
+}
+inline void set_type(rec_offs &n, field_type_t type)
+{
+  n= static_cast<rec_offs>((n & DATA_MASK) | type);
+}
+inline rec_offs get_value(rec_offs n) { return n & DATA_MASK; }
+inline rec_offs combine(rec_offs value, field_type_t type)
+{
+  return static_cast<rec_offs>(get_value(value) | type);
+}
+
+/** Compact flag ORed to the extra size returned by rec_get_offsets() */
+constexpr rec_offs REC_OFFS_COMPACT= rec_offs(~(rec_offs(~0) >> 1));
+/** External flag in offsets returned by rec_get_offsets() */
+constexpr rec_offs REC_OFFS_EXTERNAL= REC_OFFS_COMPACT >> 1;
+/** Default value flag in offsets returned by rec_get_offsets() */
+constexpr rec_offs REC_OFFS_DEFAULT= REC_OFFS_COMPACT >> 2;
+constexpr rec_offs REC_OFFS_MASK= REC_OFFS_DEFAULT - 1;
+
+/******************************************************//**
+The following function is used to get the offset of the
+next chained record on the same page.
+@return the page offset of the next chained record, or 0 if none */
+UNIV_INLINE
+ulint
+rec_get_next_offs(
+/*==============*/
+	const rec_t*	rec,	/*!< in: physical record */
+	ulint		comp)	/*!< in: nonzero=compact page format */
+	MY_ATTRIBUTE((warn_unused_result));
+/******************************************************//**
+The following function is used to set the next record offset field
+of an old-style record. */
+UNIV_INLINE
+void
+rec_set_next_offs_old(
+/*==================*/
+	rec_t*	rec,	/*!< in: old-style physical record */
+	ulint	next)	/*!< in: offset of the next record */
+	MY_ATTRIBUTE((nonnull));
+/******************************************************//**
+The following function is used to set the next record offset field
+of a new-style record. */
+UNIV_INLINE
+void
+rec_set_next_offs_new(
+/*==================*/
+	rec_t*	rec,	/*!< in/out: new-style physical record */
+	ulint	next)	/*!< in: offset of the next record */
+	MY_ATTRIBUTE((nonnull));
+/******************************************************//**
+The following function is used to get the number of fields
+in an old-style record.
+@return number of data fields */
+UNIV_INLINE
+ulint
+rec_get_n_fields_old(
+/*=================*/
+	const rec_t*	rec)	/*!< in: physical record */
+	MY_ATTRIBUTE((warn_unused_result));
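+/* A short round trip through the encoding helpers above illustrates the
+14-bit value plus 2-bit type split that rec_get_offsets() stores in each
+element. The function name is hypothetical; only combine(), get_type(),
+get_value() and set_type() from this header are used:
+
+	static void example_offset_encoding()
+	{
+		rec_offs n= combine(120, STORED_OFFPAGE);
+		ut_ad(get_type(n) == STORED_OFFPAGE);
+		ut_ad(get_value(n) == 120);
+		set_type(n, SQL_NULL);
+		ut_ad(get_value(n) == 120);
+	}
+*/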
+/******************************************************//**
+The following function is used to get the number of fields
+in a record.
+@return number of data fields */
+UNIV_INLINE
+ulint
+rec_get_n_fields(
+/*=============*/
+	const rec_t*		rec,	/*!< in: physical record */
+	const dict_index_t*	index)	/*!< in: record descriptor */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Confirm that the n_fields of the entry is sane by comparing it with
+another record on the same page.
+@param[in] index  index
+@param[in] rec    record on the same page
+@param[in] entry  index entry
+@return true if n_fields is sane */
+UNIV_INLINE
+bool
+rec_n_fields_is_sane(
+	dict_index_t*	index,
+	const rec_t*	rec,
+	const dtuple_t*	entry)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/******************************************************//**
+The following function is used to get the number of records owned by the
+previous directory record.
+@return number of owned records */
+UNIV_INLINE
+ulint
+rec_get_n_owned_old(
+/*================*/
+	const rec_t*	rec)	/*!< in: old-style physical record */
+	MY_ATTRIBUTE((warn_unused_result));
+/******************************************************//**
+The following function is used to get the number of records owned by the
+previous directory record.
+@return number of owned records */
+UNIV_INLINE
+ulint
+rec_get_n_owned_new(
+/*================*/
+	const rec_t*	rec)	/*!< in: new-style physical record */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/******************************************************//**
+The following function is used to retrieve the info bits of
+a record.
+@return info bits */
+UNIV_INLINE
+byte
+rec_get_info_bits(
+/*==============*/
+	const rec_t*	rec,	/*!< in: physical record */
+	ulint		comp)	/*!< in: nonzero=compact page format */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Determine the status bits of a non-REDUNDANT record.
+@param[in] rec ROW_FORMAT=COMPACT,DYNAMIC,COMPRESSED record
+@return status bits */
+inline
+rec_comp_status_t
+rec_get_status(const rec_t* rec)
+{
+	byte bits = rec[-REC_NEW_STATUS] & REC_NEW_STATUS_MASK;
+	ut_ad(bits <= REC_STATUS_INSTANT);
+	return static_cast<rec_comp_status_t>(bits);
+}
+
+/** Set the status bits of a non-REDUNDANT record.
+@param[in,out] rec ROW_FORMAT=COMPACT,DYNAMIC,COMPRESSED record
+@param[in] bits status bits */
+inline void rec_set_status(rec_t *rec, byte bits)
+{
+  ut_ad(bits <= REC_STATUS_INSTANT);
+  rec[-REC_NEW_STATUS]= static_cast<byte>((rec[-REC_NEW_STATUS] &
+                                           ~REC_NEW_STATUS_MASK) | bits);
+}
+
+/** Get the length of the added field count in a REC_STATUS_INSTANT record.
+@param[in] n_add_field number of added fields, minus one
+@return storage size of the field count, in bytes */
+inline unsigned rec_get_n_add_field_len(ulint n_add_field)
+{
+  ut_ad(n_add_field < REC_MAX_N_FIELDS);
+  return n_add_field < 0x80 ? 1 : 2;
+}
+
+/** Get the added field count in a REC_STATUS_INSTANT record.
+@param[in,out] header variable header of a REC_STATUS_INSTANT record
+@return number of added fields */
+inline unsigned rec_get_n_add_field(const byte*& header)
+{
+	unsigned n_fields_add = *--header;
+	if (n_fields_add < 0x80) {
+		ut_ad(rec_get_n_add_field_len(n_fields_add) == 1);
+		return n_fields_add;
+	}
+
+	n_fields_add &= 0x7f;
+	n_fields_add |= unsigned(*--header) << 7;
+	ut_ad(n_fields_add < REC_MAX_N_FIELDS);
+	ut_ad(rec_get_n_add_field_len(n_fields_add) == 2);
+	return n_fields_add;
+}
+
+/** Set the added field count in a REC_STATUS_INSTANT record.
+@param[in,out] header variable header of a REC_STATUS_INSTANT record +@param[in] n_add number of added fields, minus 1 +@return record header before the number of added fields */ +inline void rec_set_n_add_field(byte*& header, ulint n_add) +{ + ut_ad(n_add < REC_MAX_N_FIELDS); + + if (n_add < 0x80) { + *header-- = byte(n_add); + } else { + *header-- = byte(byte(n_add) | 0x80); + *header-- = byte(n_add >> 7); + } +} + +/******************************************************//** +The following function is used to retrieve the info and status +bits of a record. (Only compact records have status bits.) +@return info and status bits */ +UNIV_INLINE +byte +rec_get_info_and_status_bits( +/*=========================*/ + const rec_t* rec, /*!< in: physical record */ + ulint comp) /*!< in: nonzero=compact page format */ + MY_ATTRIBUTE((warn_unused_result)); +/******************************************************//** +The following function is used to set the info and status +bits of a record. (Only compact records have status bits.) */ +UNIV_INLINE +void +rec_set_info_and_status_bits( +/*=========================*/ + rec_t* rec, /*!< in/out: compact physical record */ + ulint bits) /*!< in: info bits */ + MY_ATTRIBUTE((nonnull)); + +/******************************************************//** +The following function tells if record is delete marked. +@return nonzero if delete marked */ +UNIV_INLINE +ulint +rec_get_deleted_flag( +/*=================*/ + const rec_t* rec, /*!< in: physical record */ + ulint comp) /*!< in: nonzero=compact page format */ + MY_ATTRIBUTE((warn_unused_result)); +/******************************************************//** +The following function tells if a new-style record is a node pointer. +@return TRUE if node pointer */ +UNIV_INLINE +bool +rec_get_node_ptr_flag( +/*==================*/ + const rec_t* rec) /*!< in: physical record */ + MY_ATTRIBUTE((warn_unused_result)); +/******************************************************//** +The following function is used to get the order number +of an old-style record in the heap of the index page. +@return heap order number */ +UNIV_INLINE +ulint +rec_get_heap_no_old( +/*================*/ + const rec_t* rec) /*!< in: physical record */ + MY_ATTRIBUTE((warn_unused_result)); +/******************************************************//** +The following function is used to get the order number +of a new-style record in the heap of the index page. +@return heap order number */ +UNIV_INLINE +ulint +rec_get_heap_no_new( +/*================*/ + const rec_t* rec) /*!< in: physical record */ + MY_ATTRIBUTE((warn_unused_result)); +/******************************************************//** +The following function is used to test whether the data offsets +in the record are stored in one-byte or two-byte format. +@return TRUE if 1-byte form */ +UNIV_INLINE +ibool +rec_get_1byte_offs_flag( +/*====================*/ + const rec_t* rec) /*!< in: physical record */ + MY_ATTRIBUTE((warn_unused_result)); + +/******************************************************//** +The following function is used to set the 1-byte offsets flag. */ +UNIV_INLINE +void +rec_set_1byte_offs_flag( +/*====================*/ + rec_t* rec, /*!< in: physical record */ + ibool flag) /*!< in: TRUE if 1byte form */ + MY_ATTRIBUTE((nonnull)); + +/******************************************************//** +Returns the offset of nth field end if the record is stored in the 1-byte +offsets form. If the field is SQL null, the flag is ORed in the returned +value. 
+@return offset of the start of the field, SQL null flag ORed */ +UNIV_INLINE +uint8_t +rec_1_get_field_end_info( +/*=====================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ + MY_ATTRIBUTE((warn_unused_result)); + +/******************************************************//** +Returns the offset of nth field end if the record is stored in the 2-byte +offsets form. If the field is SQL null, the flag is ORed in the returned +value. +@return offset of the start of the field, SQL null flag and extern +storage flag ORed */ +UNIV_INLINE +uint16_t +rec_2_get_field_end_info( +/*=====================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ + MY_ATTRIBUTE((warn_unused_result)); + +/******************************************************//** +Returns nonzero if the field is stored off-page. +@retval 0 if the field is stored in-page +@retval REC_2BYTE_EXTERN_MASK if the field is stored externally */ +UNIV_INLINE +ulint +rec_2_is_field_extern( +/*==================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ + MY_ATTRIBUTE((warn_unused_result)); + +/******************************************************//** +Determine how many of the first n columns in a compact +physical record are stored externally. +@return number of externally stored columns */ +ulint +rec_get_n_extern_new( +/*=================*/ + const rec_t* rec, /*!< in: compact physical record */ + const dict_index_t* index, /*!< in: record descriptor */ + ulint n) /*!< in: number of columns to scan */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Determine the offsets to each field in an index record. +@param[in] rec physical record +@param[in] index the index that the record belongs to +@param[in,out] offsets array comprising offsets[0] allocated elements, + or an array from rec_get_offsets(), or NULL +@param[in] n_core 0, or index->n_core_fields for leaf page +@param[in] n_fields maximum number of offsets to compute + (ULINT_UNDEFINED to compute all offsets) +@param[in,out] heap memory heap +@return the new offsets */ +rec_offs* +rec_get_offsets_func( + const rec_t* rec, + const dict_index_t* index, + rec_offs* offsets, + ulint n_core, + ulint n_fields, +#ifdef UNIV_DEBUG + const char* file, /*!< in: file name where called */ + unsigned line, /*!< in: line number where called */ +#endif /* UNIV_DEBUG */ + mem_heap_t** heap) /*!< in/out: memory heap */ +#ifdef UNIV_DEBUG + MY_ATTRIBUTE((nonnull(1,2,6,8),warn_unused_result)); +#else /* UNIV_DEBUG */ + MY_ATTRIBUTE((nonnull(1,2,6),warn_unused_result)); +#endif /* UNIV_DEBUG */ + +#ifdef UNIV_DEBUG +# define rec_get_offsets(rec, index, offsets, leaf, n, heap) \ + rec_get_offsets_func(rec,index,offsets,leaf,n,__FILE__,__LINE__,heap) +#else /* UNIV_DEBUG */ +# define rec_get_offsets(rec, index, offsets, leaf, n, heap) \ + rec_get_offsets_func(rec, index, offsets, leaf, n, heap) +#endif /* UNIV_DEBUG */ + +/******************************************************//** +The following function determines the offsets to each field +in the record. It can reuse a previously allocated array. 
*/ +void +rec_get_offsets_reverse( +/*====================*/ + const byte* extra, /*!< in: the extra bytes of a + compact record in reverse order, + excluding the fixed-size + REC_N_NEW_EXTRA_BYTES */ + const dict_index_t* index, /*!< in: record descriptor */ + ulint node_ptr,/*!< in: nonzero=node pointer, + 0=leaf node */ + rec_offs* offsets)/*!< in/out: array consisting of + offsets[0] allocated elements */ + MY_ATTRIBUTE((nonnull)); +#ifdef UNIV_DEBUG +/** Validate offsets returned by rec_get_offsets(). +@param[in] rec record, or NULL +@param[in] index the index that the record belongs in, or NULL +@param[in,out] offsets the offsets of the record +@return true */ +bool +rec_offs_validate( + const rec_t* rec, + const dict_index_t* index, + const rec_offs* offsets) + MY_ATTRIBUTE((nonnull(3), warn_unused_result)); +/** Update debug data in offsets, in order to tame rec_offs_validate(). +@param[in] rec record +@param[in] index the index that the record belongs in +@param[in] leaf whether the record resides in a leaf page +@param[in,out] offsets offsets from rec_get_offsets() to adjust */ +void +rec_offs_make_valid( + const rec_t* rec, + const dict_index_t* index, + bool leaf, + rec_offs* offsets) + MY_ATTRIBUTE((nonnull)); +#else +# define rec_offs_make_valid(rec, index, leaf, offsets) +#endif /* UNIV_DEBUG */ + +/************************************************************//** +The following function is used to get the offset to the nth +data field in an old-style record. +@return offset to the field */ +ulint +rec_get_nth_field_offs_old( +/*=======================*/ + const rec_t* rec, /*!< in: record */ + ulint n, /*!< in: index of the field */ + ulint* len) /*!< out: length of the field; UNIV_SQL_NULL + if SQL null */ + MY_ATTRIBUTE((nonnull)); +#define rec_get_nth_field_old(rec, n, len) \ +((rec) + rec_get_nth_field_offs_old(rec, n, len)) +/************************************************************//** +Gets the physical size of an old-style field. +Also an SQL null may have a field of size > 0, +if the data type is of a fixed size. +@return field size in bytes */ +UNIV_INLINE +ulint +rec_get_nth_field_size( +/*===================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: index of the field */ + MY_ATTRIBUTE((warn_unused_result)); +/************************************************************//** +The following function is used to get an offset to the nth +data field in a record. +@return offset from the origin of rec */ +UNIV_INLINE +rec_offs +rec_get_nth_field_offs( +/*===================*/ + const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n, /*!< in: index of the field */ + ulint* len) /*!< out: length of the field; UNIV_SQL_NULL + if SQL null */ + MY_ATTRIBUTE((nonnull)); +#define rec_get_nth_field(rec, offsets, n, len) \ +((rec) + rec_get_nth_field_offs(offsets, n, len)) +/******************************************************//** +Determine if the offsets are for a record containing null BLOB pointers. +@return first field containing a null BLOB pointer, or NULL if none found */ +UNIV_INLINE +const byte* +rec_offs_any_null_extern( +/*=====================*/ + const rec_t* rec, /*!< in: record */ + const rec_offs* offsets) /*!< in: rec_get_offsets(rec) */ + MY_ATTRIBUTE((warn_unused_result)); + +/** Mark the nth field as externally stored. 
+@param[in] offsets array returned by rec_get_offsets() +@param[in] n nth field */ +void +rec_offs_make_nth_extern( + rec_offs* offsets, + const ulint n); + +MY_ATTRIBUTE((nonnull)) +/** Determine the number of allocated elements for an array of offsets. +@param[in] offsets offsets after rec_offs_set_n_alloc() +@return number of elements */ +inline ulint rec_offs_get_n_alloc(const rec_offs *offsets) +{ + ut_ad(offsets); + ulint n_alloc= offsets[0]; + ut_ad(n_alloc > REC_OFFS_HEADER_SIZE); + MEM_CHECK_ADDRESSABLE(offsets, n_alloc * sizeof *offsets); + return n_alloc; +} + +/** Determine the number of fields for which offsets have been initialized. +@param[in] offsets rec_get_offsets() +@return number of fields */ +inline +ulint +rec_offs_n_fields(const rec_offs* offsets) +{ + ulint n_fields; + ut_ad(offsets); + n_fields = offsets[1]; + ut_ad(n_fields > 0); + ut_ad(n_fields <= REC_MAX_N_FIELDS); + ut_ad(n_fields + REC_OFFS_HEADER_SIZE + <= rec_offs_get_n_alloc(offsets)); + return(n_fields); +} + +/** Get a flag of a record field. +@param[in] offsets rec_get_offsets() +@param[in] n nth field +@param[in] flag flag to extract +@return type of the record field */ +inline field_type_t rec_offs_nth_type(const rec_offs *offsets, ulint n) +{ + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + ut_ad(n < rec_offs_n_fields(offsets)); + return get_type(rec_offs_base(offsets)[1 + n]); +} + +/** Determine if a record field is missing +(should be replaced by dict_index_t::instant_field_value()). +@param[in] offsets rec_get_offsets() +@param[in] n nth field +@return nonzero if default bit is set */ +inline ulint rec_offs_nth_default(const rec_offs *offsets, ulint n) +{ + return rec_offs_nth_type(offsets, n) == DEFAULT; +} + +/** Determine if a record field is SQL NULL +(should be replaced by dict_index_t::instant_field_value()). +@param[in] offsets rec_get_offsets() +@param[in] n nth field +@return nonzero if SQL NULL set */ +inline ulint rec_offs_nth_sql_null(const rec_offs *offsets, ulint n) +{ + return rec_offs_nth_type(offsets, n) == SQL_NULL; +} + +/** Determine if a record field is stored off-page. +@param[in] offsets rec_get_offsets() +@param[in] n nth field +Returns nonzero if the extern bit is set in nth field of rec. +@return nonzero if externally stored */ +inline ulint rec_offs_nth_extern(const rec_offs *offsets, ulint n) +{ + return rec_offs_nth_type(offsets, n) == STORED_OFFPAGE; +} + +/** Get a global flag of a record. +@param[in] offsets rec_get_offsets() +@param[in] flag flag to extract +@return the flag of the record field */ +inline ulint rec_offs_any_flag(const rec_offs *offsets, ulint flag) +{ + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + return *rec_offs_base(offsets) & flag; +} + +/** Determine if the offsets are for a record containing off-page columns. +@param[in] offsets rec_get_offsets() +@return nonzero if any off-page columns exist */ +inline bool rec_offs_any_extern(const rec_offs *offsets) +{ + return rec_offs_any_flag(offsets, REC_OFFS_EXTERNAL); +} + +/** Determine if the offsets are for a record that is missing fields. +@param[in] offsets rec_get_offsets() +@return nonzero if any fields need to be replaced with + dict_index_t::instant_field_value() */ +inline ulint rec_offs_any_default(const rec_offs *offsets) +{ + return rec_offs_any_flag(offsets, REC_OFFS_DEFAULT); +} + +/** Determine if the offsets are for other than ROW_FORMAT=REDUNDANT. 
+@param[in] offsets rec_get_offsets() +@return nonzero if ROW_FORMAT is COMPACT,DYNAMIC or COMPRESSED +@retval 0 if ROW_FORMAT=REDUNDANT */ +inline ulint rec_offs_comp(const rec_offs *offsets) +{ + ut_ad(rec_offs_validate(NULL, NULL, offsets)); + return (*rec_offs_base(offsets) & REC_OFFS_COMPACT); +} + +/** Determine if the record is the metadata pseudo-record +in the clustered index for instant ADD COLUMN or ALTER TABLE. +@param[in] rec leaf page record +@param[in] comp 0 if ROW_FORMAT=REDUNDANT, else nonzero +@return whether the record is the metadata pseudo-record */ +inline bool rec_is_metadata(const rec_t* rec, ulint comp) +{ + bool is = !!(rec_get_info_bits(rec, comp) & REC_INFO_MIN_REC_FLAG); + ut_ad(!is || !comp || rec_get_status(rec) == REC_STATUS_INSTANT); + return is; +} + +/** Determine if the record is the metadata pseudo-record +in the clustered index for instant ADD COLUMN or ALTER TABLE. +@param[in] rec leaf page record +@param[in] index index of the record +@return whether the record is the metadata pseudo-record */ +inline bool rec_is_metadata(const rec_t *rec, const dict_index_t &index) +{ + return rec_is_metadata(rec, index.table->not_redundant()); +} + +/** Determine if the record is the metadata pseudo-record +in the clustered index for instant ADD COLUMN (not other ALTER TABLE). +@param[in] rec leaf page record +@param[in] comp 0 if ROW_FORMAT=REDUNDANT, else nonzero +@return whether the record is the metadata pseudo-record */ +inline bool rec_is_add_metadata(const rec_t* rec, ulint comp) +{ + bool is = rec_get_info_bits(rec, comp) == REC_INFO_MIN_REC_FLAG; + ut_ad(!is || !comp || rec_get_status(rec) == REC_STATUS_INSTANT); + return is; +} + +/** Determine if the record is the metadata pseudo-record +in the clustered index for instant ADD COLUMN (not other ALTER TABLE). +@param[in] rec leaf page record +@param[in] index index of the record +@return whether the record is the metadata pseudo-record */ +inline bool rec_is_add_metadata(const rec_t* rec, const dict_index_t& index) +{ + bool is = rec_is_add_metadata(rec, dict_table_is_comp(index.table)); + ut_ad(!is || index.is_instant()); + return is; +} + +/** Determine if the record is the metadata pseudo-record +in the clustered index for instant ALTER TABLE (not plain ADD COLUMN). +@param[in] rec leaf page record +@param[in] comp 0 if ROW_FORMAT=REDUNDANT, else nonzero +@return whether the record is the ALTER TABLE metadata pseudo-record */ +inline bool rec_is_alter_metadata(const rec_t* rec, ulint comp) +{ + bool is = !(~rec_get_info_bits(rec, comp) + & (REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG)); + ut_ad(!is || rec_is_metadata(rec, comp)); + return is; +} + +/** Determine if the record is the metadata pseudo-record +in the clustered index for instant ALTER TABLE (not plain ADD COLUMN). +@param[in] rec leaf page record +@param[in] index index of the record +@return whether the record is the ALTER TABLE metadata pseudo-record */ +inline bool rec_is_alter_metadata(const rec_t* rec, const dict_index_t& index) +{ + bool is = rec_is_alter_metadata(rec, dict_table_is_comp(index.table)); + ut_ad(!is || index.is_dummy || index.is_instant()); + return is; +} + +/** Determine if a record is delete-marked (not a metadata pseudo-record). 
+@param[in] rec record +@param[in] comp nonzero if ROW_FORMAT!=REDUNDANT +@return whether the record is a delete-marked user record */ +inline bool rec_is_delete_marked(const rec_t* rec, ulint comp) +{ + return (rec_get_info_bits(rec, comp) + & (REC_INFO_MIN_REC_FLAG | REC_INFO_DELETED_FLAG)) + == REC_INFO_DELETED_FLAG; +} + +/** Get the nth field from an index. +@param[in] rec index record +@param[in] index index +@param[in] offsets rec_get_offsets(rec, index) +@param[in] n field number +@param[out] len length of the field in bytes, or UNIV_SQL_NULL +@return a read-only copy of the index field */ +inline +const byte* +rec_get_nth_cfield( + const rec_t* rec, + const dict_index_t* index, + const rec_offs* offsets, + ulint n, + ulint* len) +{ + /* Because this function may be invoked by innobase_rec_to_mysql() + for reporting a duplicate key during ALTER TABLE or + CREATE UNIQUE INDEX, and in that case the rec omit the fixed-size + header of 5 or 6 bytes, the check + rec_offs_validate(rec, index, offsets) must be avoided here. */ + if (!rec_offs_nth_default(offsets, n)) { + return rec_get_nth_field(rec, offsets, n, len); + } + return index->instant_field_value(n, len); +} + +/******************************************************//** +Gets the physical size of a field. +@return length of field */ +UNIV_INLINE +ulint +rec_offs_nth_size( +/*==============*/ + const rec_offs* offsets,/*!< in: array returned by rec_get_offsets() */ + ulint n) /*!< in: nth field */ + MY_ATTRIBUTE((warn_unused_result)); + +/******************************************************//** +Returns the number of extern bits set in a record. +@return number of externally stored fields */ +UNIV_INLINE +ulint +rec_offs_n_extern( +/*==============*/ + const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */ + MY_ATTRIBUTE((warn_unused_result)); +/**********************************************************//** +The following function returns the data size of an old-style physical +record, that is the sum of field lengths. SQL null fields +are counted as length 0 fields. The value returned by the function +is the distance from record origin to record end in bytes. +@return size */ +UNIV_INLINE +ulint +rec_get_data_size_old( +/*==================*/ + const rec_t* rec) /*!< in: physical record */ + MY_ATTRIBUTE((warn_unused_result)); +/**********************************************************//** +The following function sets the number of allocated elements +for an array of offsets. */ +UNIV_INLINE +void +rec_offs_set_n_alloc( +/*=================*/ + rec_offs*offsets, /*!< out: array for rec_get_offsets(), + must be allocated */ + ulint n_alloc) /*!< in: number of elements */ + MY_ATTRIBUTE((nonnull)); +#define rec_offs_init(offsets) \ + rec_offs_set_n_alloc(offsets, (sizeof offsets) / sizeof *offsets) +/**********************************************************//** +The following function returns the data size of a physical +record, that is the sum of field lengths. SQL null fields +are counted as length 0 fields. The value returned by the function +is the distance from record origin to record end in bytes. +@return size */ +UNIV_INLINE +ulint +rec_offs_data_size( +/*===============*/ + const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */ + MY_ATTRIBUTE((warn_unused_result)); +/**********************************************************//** +Returns the total size of record minus data size of record. 
+The value returned by the function is the distance from the record
+start to the record origin in bytes.
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_extra_size(
+/*================*/
+	const rec_offs*	offsets)/*!< in: array returned by rec_get_offsets() */
+	MY_ATTRIBUTE((warn_unused_result));
+/**********************************************************//**
+Returns the total size of a physical record.
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_size(
+/*==========*/
+	const rec_offs*	offsets)/*!< in: array returned by rec_get_offsets() */
+	MY_ATTRIBUTE((warn_unused_result));
+#ifdef UNIV_DEBUG
+/**********************************************************//**
+Returns a pointer to the start of the record.
+@return pointer to start */
+UNIV_INLINE
+byte*
+rec_get_start(
+/*==========*/
+	const rec_t*	rec,	/*!< in: pointer to record */
+	const rec_offs*	offsets)/*!< in: array returned by rec_get_offsets() */
+	MY_ATTRIBUTE((warn_unused_result));
+/**********************************************************//**
+Returns a pointer to the end of the record.
+@return pointer to end */
+UNIV_INLINE
+byte*
+rec_get_end(
+/*========*/
+	const rec_t*	rec,	/*!< in: pointer to record */
+	const rec_offs*	offsets)/*!< in: array returned by rec_get_offsets() */
+	MY_ATTRIBUTE((warn_unused_result));
+#else /* UNIV_DEBUG */
+# define rec_get_start(rec, offsets) ((rec) - rec_offs_extra_size(offsets))
+# define rec_get_end(rec, offsets) ((rec) + rec_offs_data_size(offsets))
+#endif /* UNIV_DEBUG */
+
+/** Copy a physical record to a buffer.
+@param[in] buf     buffer
+@param[in] rec     physical record
+@param[in] offsets array returned by rec_get_offsets()
+@return pointer to the origin of the copy */
+UNIV_INLINE
+rec_t*
+rec_copy(
+	void*		buf,
+	const rec_t*	rec,
+	const rec_offs*	offsets);
+
+/** Determine the size of a data tuple prefix in a temporary file.
+@tparam redundant_temp whether to use the ROW_FORMAT=REDUNDANT format
+@param[in]  index    clustered or secondary index
+@param[in]  fields   data fields
+@param[in]  n_fields number of data fields
+@param[out] extra    record header size
+@param[in]  status   REC_STATUS_ORDINARY or REC_STATUS_INSTANT
+@return total size, in bytes */
+template<bool redundant_temp>
+ulint
+rec_get_converted_size_temp(
+	const dict_index_t*	index,
+	const dfield_t*		fields,
+	ulint			n_fields,
+	ulint*			extra,
+	rec_comp_status_t	status = REC_STATUS_ORDINARY)
+	MY_ATTRIBUTE((warn_unused_result, nonnull));
+
+/** Determine the offset to each field in a temporary file.
+@param[in]     rec     temporary file record
+@param[in]     index   index that the record belongs to
+@param[in,out] offsets offsets to the fields; in: rec_offs_n_fields(offsets)
+@param[in]     n_core  number of core fields (index->n_core_fields)
+@param[in]     def_val default values for non-core fields
+@param[in]     status  REC_STATUS_ORDINARY or REC_STATUS_INSTANT */
+void
+rec_init_offsets_temp(
+	const rec_t*		rec,
+	const dict_index_t*	index,
+	rec_offs*		offsets,
+	ulint			n_core,
+	const dict_col_t::def_t*def_val,
+	rec_comp_status_t	status = REC_STATUS_ORDINARY)
+	MY_ATTRIBUTE((nonnull(1,2,3)));
+/** Determine the offset to each field in a temporary file.
+@param[in]     rec     temporary file record
+@param[in]     index   index that the record belongs to
+@param[in,out] offsets offsets to the fields; in: rec_offs_n_fields(offsets)
+*/
+void
+rec_init_offsets_temp(
+	const rec_t*		rec,
+	const dict_index_t*	index,
+	rec_offs*		offsets)
+	MY_ATTRIBUTE((nonnull));
+
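+/* rec_get_converted_size_temp() above and rec_convert_dtuple_to_temp()
+below are normally used as a pair: size the record first, then write it
+with the origin placed "extra" bytes into the buffer, so the variable
+header goes before the origin and the data after it. A hedged sketch;
+the helper name and the mem_heap_alloc() buffer handling are example
+assumptions:
+
+	static void example_write_temp_rec(const dict_index_t *index,
+					   const dfield_t *fields,
+					   ulint n_fields, mem_heap_t *heap)
+	{
+		ulint	extra;
+		ulint	size = rec_get_converted_size_temp<false>(
+			index, fields, n_fields, &extra);
+		byte*	buf = static_cast<byte*>(
+			mem_heap_alloc(heap, size));
+
+		rec_convert_dtuple_to_temp<false>(buf + extra, index,
+						  fields, n_fields);
+	}
+*/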
+/** Convert a data tuple prefix to the temporary file format.
+@tparam redundant_temp whether to use the ROW_FORMAT=REDUNDANT format
+@param[out] rec      record in temporary file format
+@param[in]  index    clustered or secondary index
+@param[in]  fields   data fields
+@param[in]  n_fields number of data fields
+@param[in]  status   REC_STATUS_ORDINARY or REC_STATUS_INSTANT */
+template<bool redundant_temp>
+void
+rec_convert_dtuple_to_temp(
+	rec_t*			rec,
+	const dict_index_t*	index,
+	const dfield_t*		fields,
+	ulint			n_fields,
+	rec_comp_status_t	status = REC_STATUS_ORDINARY)
+	MY_ATTRIBUTE((nonnull));
+
+/**************************************************************//**
+Copies the first n fields of a physical record to a new physical record in
+a buffer.
+@return own: copied record */
+rec_t*
+rec_copy_prefix_to_buf(
+/*===================*/
+	const rec_t*		rec,		/*!< in: physical record */
+	const dict_index_t*	index,		/*!< in: record descriptor */
+	ulint			n_fields,	/*!< in: number of fields
+						to copy */
+	byte**			buf,		/*!< in/out: memory buffer
+						for the copied prefix,
+						or NULL */
+	ulint*			buf_size)	/*!< in/out: buffer size */
+	MY_ATTRIBUTE((nonnull));
+/*********************************************************//**
+Builds a physical record out of a data tuple and
+stores it into the given buffer.
+@return pointer to the origin of physical record */
+rec_t*
+rec_convert_dtuple_to_rec(
+/*======================*/
+	byte*			buf,	/*!< in: start address of the
+					physical record */
+	const dict_index_t*	index,	/*!< in: record descriptor */
+	const dtuple_t*		dtuple,	/*!< in: data tuple */
+	ulint			n_ext)	/*!< in: number of
+					externally stored columns */
+	MY_ATTRIBUTE((warn_unused_result));
+/**********************************************************//**
+Returns the extra size of an old-style physical record if we know its
+data size and number of fields.
+@return extra size */
+UNIV_INLINE
+ulint
+rec_get_converted_extra_size(
+/*=========================*/
+	ulint	data_size,	/*!< in: data size */
+	ulint	n_fields,	/*!< in: number of fields */
+	ulint	n_ext)		/*!< in: number of externally stored columns */
+	MY_ATTRIBUTE((const));
+/**********************************************************//**
+Determines the size of a data tuple prefix in ROW_FORMAT=COMPACT.
+@return total size */
+ulint
+rec_get_converted_size_comp_prefix(
+/*===============================*/
+	const dict_index_t*	index,	/*!< in: record descriptor */
+	const dfield_t*		fields,	/*!< in: array of data fields */
+	ulint			n_fields,/*!< in: number of data fields */
+	ulint*			extra)	/*!< out: extra size */
+	MY_ATTRIBUTE((warn_unused_result, nonnull(1,2)));
+
+/** Determine the size of a record in ROW_FORMAT=COMPACT.
+@param[in]  index record descriptor. dict_table_is_comp()
+                  is assumed to hold, even if it doesn't
+@param[in]  tuple logical record
+@param[out] extra extra size
+@return total size */
+ulint
+rec_get_converted_size_comp(
+	const dict_index_t*	index,
+	const dtuple_t*		tuple,
+	ulint*			extra)
+	MY_ATTRIBUTE((nonnull(1,2)));
+
+/**********************************************************//**
+The following function returns the size of a data tuple when converted to
+a physical record.
+@return size */
+UNIV_INLINE
+ulint
+rec_get_converted_size(
+/*===================*/
+	dict_index_t*	index,	/*!< in: record descriptor */
+	const dtuple_t*	dtuple,	/*!< in: data tuple */
+	ulint		n_ext)	/*!< in: number of externally stored columns */
+	MY_ATTRIBUTE((warn_unused_result, nonnull));
+/** Copy the first n fields of a (copy of a) physical record to a data tuple.
+The fields are copied into the memory heap.
+@param[out] tuple data tuple +@param[in] rec index record, or a copy thereof +@param[in] index index of rec +@param[in] n_core index->n_core_fields at the time rec was + copied, or 0 if non-leaf page record +@param[in] n_fields number of fields to copy +@param[in,out] heap memory heap */ +void +rec_copy_prefix_to_dtuple( + dtuple_t* tuple, + const rec_t* rec, + const dict_index_t* index, + ulint n_core, + ulint n_fields, + mem_heap_t* heap) + MY_ATTRIBUTE((nonnull)); +/***************************************************************//** +Validates the consistency of a physical record. +@return TRUE if ok */ +ibool +rec_validate( +/*=========*/ + const rec_t* rec, /*!< in: physical record */ + const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */ + MY_ATTRIBUTE((nonnull)); +/***************************************************************//** +Prints an old-style physical record. */ +void +rec_print_old( +/*==========*/ + FILE* file, /*!< in: file where to print */ + const rec_t* rec) /*!< in: physical record */ + MY_ATTRIBUTE((nonnull)); +/***************************************************************//** +Prints a spatial index record. */ +void +rec_print_mbr_rec( +/*==========*/ + FILE* file, /*!< in: file where to print */ + const rec_t* rec, /*!< in: physical record */ + const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */ + MY_ATTRIBUTE((nonnull)); +/***************************************************************//** +Prints a physical record. */ +void +rec_print_new( +/*==========*/ + FILE* file, /*!< in: file where to print */ + const rec_t* rec, /*!< in: physical record */ + const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */ + MY_ATTRIBUTE((nonnull)); +/***************************************************************//** +Prints a physical record. */ +void +rec_print( +/*======*/ + FILE* file, /*!< in: file where to print */ + const rec_t* rec, /*!< in: physical record */ + const dict_index_t* index) /*!< in: record descriptor */ + MY_ATTRIBUTE((nonnull)); + +/** Pretty-print a record. +@param[in,out] o output stream +@param[in] rec physical record +@param[in] info rec_get_info_bits(rec) +@param[in] offsets rec_get_offsets(rec) */ +void +rec_print( + std::ostream& o, + const rec_t* rec, + ulint info, + const rec_offs* offsets); + +/** Wrapper for pretty-printing a record */ +struct rec_index_print +{ + /** Constructor */ + rec_index_print(const rec_t* rec, const dict_index_t* index) : + m_rec(rec), m_index(index) + {} + + /** Record */ + const rec_t* m_rec; + /** Index */ + const dict_index_t* m_index; +}; + +/** Display a record. +@param[in,out] o output stream +@param[in] r record to display +@return the output stream */ +std::ostream& +operator<<(std::ostream& o, const rec_index_print& r); + +/** Wrapper for pretty-printing a record */ +struct rec_offsets_print +{ + /** Constructor */ + rec_offsets_print(const rec_t* rec, const rec_offs* offsets) : + m_rec(rec), m_offsets(offsets) + {} + + /** Record */ + const rec_t* m_rec; + /** Offsets to each field */ + const rec_offs* m_offsets; +}; + +/** Display a record. +@param[in,out] o output stream +@param[in] r record to display +@return the output stream */ +ATTRIBUTE_COLD +std::ostream& +operator<<(std::ostream& o, const rec_offsets_print& r); + +/** Pretty-printer of records and tuples */ +class rec_printer : public std::ostringstream { +public: + /** Construct a pretty-printed record. + @param rec record with header + @param offsets rec_get_offsets(rec, ...) 
*/ + ATTRIBUTE_COLD + rec_printer(const rec_t* rec, const rec_offs* offsets) + : + std::ostringstream () + { + rec_print(*this, rec, + rec_get_info_bits(rec, rec_offs_comp(offsets)), + offsets); + } + + /** Construct a pretty-printed record. + @param rec record, possibly lacking header + @param info rec_get_info_bits(rec) + @param offsets rec_get_offsets(rec, ...) */ + ATTRIBUTE_COLD + rec_printer(const rec_t* rec, ulint info, const rec_offs* offsets) + : + std::ostringstream () + { + rec_print(*this, rec, info, offsets); + } + + /** Construct a pretty-printed tuple. + @param tuple data tuple */ + ATTRIBUTE_COLD + rec_printer(const dtuple_t* tuple) + : + std::ostringstream () + { + dtuple_print(*this, tuple); + } + + /** Construct a pretty-printed tuple. + @param field array of data tuple fields + @param n number of fields */ + ATTRIBUTE_COLD + rec_printer(const dfield_t* field, ulint n) + : + std::ostringstream () + { + dfield_print(*this, field, n); + } + + /** Destructor */ + ~rec_printer() override = default; + +private: + /** Copy constructor */ + rec_printer(const rec_printer& other); + /** Assignment operator */ + rec_printer& operator=(const rec_printer& other); +}; + + +# ifdef UNIV_DEBUG +/** Read the DB_TRX_ID of a clustered index record. +@param[in] rec clustered index record +@param[in] index clustered index +@return the value of DB_TRX_ID */ +trx_id_t +rec_get_trx_id( + const rec_t* rec, + const dict_index_t* index) + MY_ATTRIBUTE((nonnull, warn_unused_result)); +# endif /* UNIV_DEBUG */ + +/* Maximum lengths for the data in a physical record if the offsets +are given in one byte (resp. two byte) format. */ +#define REC_1BYTE_OFFS_LIMIT 0x7FUL +#define REC_2BYTE_OFFS_LIMIT 0x7FFFUL + +/* The data size of record must not be larger than this on +REDUNDANT row format because we reserve two upmost bits in a +two byte offset for special purposes */ +#define REDUNDANT_REC_MAX_DATA_SIZE (16383) + +/* The data size of record must be smaller than this on +COMPRESSED row format because we reserve two upmost bits in a +two byte offset for special purposes */ +#define COMPRESSED_REC_MAX_DATA_SIZE (16384) + +#ifdef WITH_WSREP +int wsrep_rec_get_foreign_key( + byte *buf, /* out: extracted key */ + ulint *buf_len, /* in/out: length of buf */ + const rec_t* rec, /* in: physical record */ + dict_index_t* index_for, /* in: index for foreign table */ + dict_index_t* index_ref, /* in: index for referenced table */ + ibool new_protocol); /* in: protocol > 1 */ +#endif /* WITH_WSREP */ + +#include "rem0rec.inl" + +#endif /* !UNIV_INNOCHECKSUM */ +#endif /* rem0rec_h */ diff --git a/storage/innobase/include/rem0rec.inl b/storage/innobase/include/rem0rec.inl new file mode 100644 index 00000000..46c209cb --- /dev/null +++ b/storage/innobase/include/rem0rec.inl @@ -0,0 +1,1134 @@ +/***************************************************************************** + +Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/rem0rec.ic +Record manager + +Created 5/30/1994 Heikki Tuuri +*************************************************************************/ + +#include "mach0data.h" +#include "ut0byte.h" +#include "dict0boot.h" +#include "btr0types.h" + +/* Offsets of the bit-fields in an old-style record. NOTE! In the table the +most significant bytes and bits are written below less significant. + + (1) byte offset (2) bit usage within byte + downward from + origin -> 1 8 bits pointer to next record + 2 8 bits pointer to next record + 3 1 bit short flag + 7 bits number of fields + 4 3 bits number of fields + 5 bits heap number + 5 8 bits heap number + 6 4 bits n_owned + 4 bits info bits +*/ + +/* Offsets of the bit-fields in a new-style record. NOTE! In the table the +most significant bytes and bits are written below less significant. + + (1) byte offset (2) bit usage within byte + downward from + origin -> 1 8 bits relative offset of next record + 2 8 bits relative offset of next record + the relative offset is an unsigned 16-bit + integer: + (offset_of_next_record + - offset_of_this_record) mod 64Ki, + where mod is the modulo as a non-negative + number; + we can calculate the offset of the next + record with the formula: + relative_offset + offset_of_this_record + mod srv_page_size + 3 3 bits status: + 000=REC_STATUS_ORDINARY + 001=REC_STATUS_NODE_PTR + 010=REC_STATUS_INFIMUM + 011=REC_STATUS_SUPREMUM + 100=REC_STATUS_INSTANT + 1xx=reserved + 5 bits heap number + 4 8 bits heap number + 5 4 bits n_owned + 4 bits info bits +*/ + +/* We list the byte offsets from the origin of the record, the mask, +and the shift needed to obtain each bit-field of the record. 
*/
+
+#define REC_NEXT		2
+#define REC_NEXT_MASK		0xFFFFUL
+#define REC_NEXT_SHIFT		0
+
+#define REC_OLD_SHORT		3	/* This is single byte bit-field */
+#define REC_OLD_SHORT_MASK	0x1UL
+#define REC_OLD_SHORT_SHIFT	0
+
+#define REC_OLD_N_FIELDS	4
+#define REC_OLD_N_FIELDS_MASK	0x7FEUL
+#define REC_OLD_N_FIELDS_SHIFT	1
+
+#define REC_OLD_HEAP_NO		5
+#define REC_HEAP_NO_MASK	0xFFF8UL
+#if 0 /* defined in rem0rec.h for use of page0zip.cc */
+#define REC_NEW_HEAP_NO		4
+#define REC_HEAP_NO_SHIFT	3
+#endif
+
+#define REC_OLD_N_OWNED		6	/* This is single byte bit-field */
+#define REC_NEW_N_OWNED		5	/* This is single byte bit-field */
+#define REC_N_OWNED_MASK	0xFUL
+#define REC_N_OWNED_SHIFT	0
+
+#define REC_OLD_INFO_BITS	6	/* This is single byte bit-field */
+#define REC_NEW_INFO_BITS	5	/* This is single byte bit-field */
+#define REC_INFO_BITS_MASK	0xF0UL
+#define REC_INFO_BITS_SHIFT	0
+
+#if REC_OLD_SHORT_MASK << (8 * (REC_OLD_SHORT - 3)) \
+		^ REC_OLD_N_FIELDS_MASK << (8 * (REC_OLD_N_FIELDS - 4)) \
+		^ REC_HEAP_NO_MASK << (8 * (REC_OLD_HEAP_NO - 4)) \
+		^ REC_N_OWNED_MASK << (8 * (REC_OLD_N_OWNED - 3)) \
+		^ REC_INFO_BITS_MASK << (8 * (REC_OLD_INFO_BITS - 3)) \
+		^ 0xFFFFFFFFUL
+# error "sum of old-style masks != 0xFFFFFFFFUL"
+#endif
+#if REC_NEW_STATUS_MASK << (8 * (REC_NEW_STATUS - 3)) \
+		^ REC_HEAP_NO_MASK << (8 * (REC_NEW_HEAP_NO - 4)) \
+		^ REC_N_OWNED_MASK << (8 * (REC_NEW_N_OWNED - 3)) \
+		^ REC_INFO_BITS_MASK << (8 * (REC_NEW_INFO_BITS - 3)) \
+		^ 0xFFFFFFUL
+# error "sum of new-style masks != 0xFFFFFFUL"
+#endif
+
+/******************************************************//**
+Gets a bit field from within 1 byte. */
+UNIV_INLINE
+byte
+rec_get_bit_field_1(
+/*================*/
+	const rec_t*	rec,	/*!< in: pointer to record origin */
+	ulint		offs,	/*!< in: offset from the origin down */
+	ulint		mask,	/*!< in: mask used to filter bits */
+	ulint		shift)	/*!< in: shift right applied after masking */
+{
+	return static_cast<byte>((*(rec - offs) & mask) >> shift);
+}
+
+/******************************************************//**
+Sets a bit field within 1 byte. */
+UNIV_INLINE
+void
+rec_set_bit_field_1(
+/*================*/
+	rec_t*	rec,	/*!< in: pointer to record origin */
+	ulint	val,	/*!< in: value to set */
+	ulint	offs,	/*!< in: offset from the origin down */
+	ulint	mask,	/*!< in: mask used to filter bits */
+	ulint	shift)	/*!< in: shift right applied after masking */
+{
+	ut_ad(rec);
+	ut_ad(offs <= REC_N_OLD_EXTRA_BYTES);
+	ut_ad(mask);
+	ut_ad(mask <= 0xFFUL);
+	ut_ad(((mask >> shift) << shift) == mask);
+	ut_ad(((val << shift) & mask) == (val << shift));
+
+	mach_write_to_1(rec - offs,
+			(mach_read_from_1(rec - offs) & ~mask)
+			| (val << shift));
+}
+
+/******************************************************//**
+Gets a bit field from within 2 bytes. */
+UNIV_INLINE
+ulint
+rec_get_bit_field_2(
+/*================*/
+	const rec_t*	rec,	/*!< in: pointer to record origin */
+	ulint		offs,	/*!< in: offset from the origin down */
+	ulint		mask,	/*!< in: mask used to filter bits */
+	ulint		shift)	/*!< in: shift right applied after masking */
+{
+	ut_ad(rec);
+
+	return((mach_read_from_2(rec - offs) & mask) >> shift);
+}
+
+/******************************************************//**
+Sets a bit field within 2 bytes.
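+The setter is the inverse of rec_get_bit_field_2(); for example (an
+illustrative round trip, not part of this header):
+@code
+rec_set_bit_field_2(rec, 10, REC_OLD_N_FIELDS,
+		    REC_OLD_N_FIELDS_MASK, REC_OLD_N_FIELDS_SHIFT);
+ut_ad(rec_get_bit_field_2(rec, REC_OLD_N_FIELDS,
+			  REC_OLD_N_FIELDS_MASK,
+			  REC_OLD_N_FIELDS_SHIFT) == 10);
+@endcode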
*/ +UNIV_INLINE +void +rec_set_bit_field_2( +/*================*/ + rec_t* rec, /*!< in: pointer to record origin */ + ulint val, /*!< in: value to set */ + ulint offs, /*!< in: offset from the origin down */ + ulint mask, /*!< in: mask used to filter bits */ + ulint shift) /*!< in: shift right applied after masking */ +{ + ut_ad(rec); + ut_ad(offs <= REC_N_OLD_EXTRA_BYTES); + ut_ad(mask > 0xFFUL); + ut_ad(mask <= 0xFFFFUL); + ut_ad((mask >> shift) & 1); + ut_ad(0 == ((mask >> shift) & ((mask >> shift) + 1))); + ut_ad(((mask >> shift) << shift) == mask); + ut_ad(((val << shift) & mask) == (val << shift)); + + mach_write_to_2(rec - offs, + (mach_read_from_2(rec - offs) & ~mask) + | (val << shift)); +} + +/******************************************************//** +The following function is used to get the offset of the next chained record +on the same page. +@return the page offset of the next chained record, or 0 if none */ +UNIV_INLINE +ulint +rec_get_next_offs( +/*==============*/ + const rec_t* rec, /*!< in: physical record */ + ulint comp) /*!< in: nonzero=compact page format */ +{ + ulint field_value; + compile_time_assert(REC_NEXT_MASK == 0xFFFFUL); + compile_time_assert(REC_NEXT_SHIFT == 0); + + field_value = mach_read_from_2(rec - REC_NEXT); + + if (comp) { +#if UNIV_PAGE_SIZE_MAX <= 32768 + /* Note that for 64 KiB pages, field_value can 'wrap around' + and the debug assertion is not valid */ + + /* In the following assertion, field_value is interpreted + as signed 16-bit integer in 2's complement arithmetics. + If all platforms defined int16_t in the standard headers, + the expression could be written simpler as + (int16_t) field_value + ut_align_offset(...) < srv_page_size + */ + ut_ad((field_value >= 32768 + ? field_value - 65536 + : field_value) + + ut_align_offset(rec, srv_page_size) + < srv_page_size); +#endif + if (field_value == 0) { + + return(0); + } + + /* There must be at least REC_N_NEW_EXTRA_BYTES + 1 + between each record. */ + ut_ad((field_value > REC_N_NEW_EXTRA_BYTES + && field_value < 32768) + || field_value < (uint16) -REC_N_NEW_EXTRA_BYTES); + + return(ut_align_offset(rec + field_value, srv_page_size)); + } else { + ut_ad(field_value < srv_page_size); + + return(field_value); + } +} + +/******************************************************//** +The following function is used to set the next record offset field +of an old-style record. */ +UNIV_INLINE +void +rec_set_next_offs_old( +/*==================*/ + rec_t* rec, /*!< in: old-style physical record */ + ulint next) /*!< in: offset of the next record */ +{ + ut_ad(srv_page_size > next); + compile_time_assert(REC_NEXT_MASK == 0xFFFFUL); + compile_time_assert(REC_NEXT_SHIFT == 0); + mach_write_to_2(rec - REC_NEXT, next); +} + +/******************************************************//** +The following function is used to set the next record offset field +of a new-style record. 
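+For example (illustrative only; assumes a 16KiB page and a record origin at
+page offset 8000), linking to a record at page offset 120 stores
+(120 - 8000) mod 65536 = 57656 in the two bytes below the origin, and
+rec_get_next_offs() folds that value back to 120 modulo the page size.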
*/ +UNIV_INLINE +void +rec_set_next_offs_new( +/*==================*/ + rec_t* rec, /*!< in/out: new-style physical record */ + ulint next) /*!< in: offset of the next record */ +{ + ulint field_value; + + ut_ad(srv_page_size > next); + + if (!next) { + field_value = 0; + } else { + /* The following two statements calculate + next - offset_of_rec mod 64Ki, where mod is the modulo + as a non-negative number */ + + field_value = (ulint) + ((lint) next + - (lint) ut_align_offset(rec, srv_page_size)); + field_value &= REC_NEXT_MASK; + } + + mach_write_to_2(rec - REC_NEXT, field_value); +} + +/******************************************************//** +The following function is used to get the number of fields +in an old-style record. +@return number of data fields */ +UNIV_INLINE +ulint +rec_get_n_fields_old( +/*=================*/ + const rec_t* rec) /*!< in: physical record */ +{ + ulint ret; + + ut_ad(rec); + + ret = rec_get_bit_field_2(rec, REC_OLD_N_FIELDS, + REC_OLD_N_FIELDS_MASK, + REC_OLD_N_FIELDS_SHIFT); + ut_ad(ret <= REC_MAX_N_FIELDS); + ut_ad(ret > 0); + + return(ret); +} + +/******************************************************//** +The following function is used to set the number of fields +in an old-style record. */ +UNIV_INLINE +void +rec_set_n_fields_old( +/*=================*/ + rec_t* rec, /*!< in: physical record */ + ulint n_fields) /*!< in: the number of fields */ +{ + ut_ad(rec); + ut_ad(n_fields <= REC_MAX_N_FIELDS); + ut_ad(n_fields > 0); + + rec_set_bit_field_2(rec, n_fields, REC_OLD_N_FIELDS, + REC_OLD_N_FIELDS_MASK, REC_OLD_N_FIELDS_SHIFT); +} + +/******************************************************//** +The following function is used to get the number of fields +in a record. +@return number of data fields */ +UNIV_INLINE +ulint +rec_get_n_fields( +/*=============*/ + const rec_t* rec, /*!< in: physical record */ + const dict_index_t* index) /*!< in: record descriptor */ +{ + ut_ad(rec); + ut_ad(index); + + if (!dict_table_is_comp(index->table)) { + return(rec_get_n_fields_old(rec)); + } + + switch (rec_get_status(rec)) { + case REC_STATUS_INSTANT: + case REC_STATUS_ORDINARY: + return(dict_index_get_n_fields(index)); + case REC_STATUS_NODE_PTR: + return(dict_index_get_n_unique_in_tree(index) + 1); + case REC_STATUS_INFIMUM: + case REC_STATUS_SUPREMUM: + return(1); + } + + ut_error; + return(ULINT_UNDEFINED); +} + +/** Confirms the n_fields of the entry is sane with comparing the other +record in the same page specified +@param[in] index index +@param[in] rec record of the same page +@param[in] entry index entry +@return true if n_fields is sane */ +UNIV_INLINE +bool +rec_n_fields_is_sane( + dict_index_t* index, + const rec_t* rec, + const dtuple_t* entry) +{ + const ulint n_fields = rec_get_n_fields(rec, index); + + return(n_fields == dtuple_get_n_fields(entry) + || (index->is_instant() + && n_fields >= index->n_core_fields) + /* a record for older SYS_INDEXES table + (missing merge_threshold column) is acceptable. */ + || (index->table->id == DICT_INDEXES_ID + && n_fields == dtuple_get_n_fields(entry) - 1)); +} + +/******************************************************//** +The following function is used to get the number of records owned by the +previous directory record. 
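+Only a record that owns a page directory slot has a nonzero count; the
+4-bit field limits it to REC_MAX_N_OWNED (15). An illustrative check:
+@code
+ut_ad(rec_get_n_owned_old(rec) <= REC_MAX_N_OWNED);
+@endcode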
+@return number of owned records */
+UNIV_INLINE
+ulint
+rec_get_n_owned_old(
+/*================*/
+	const rec_t*	rec)	/*!< in: old-style physical record */
+{
+	return(rec_get_bit_field_1(rec, REC_OLD_N_OWNED,
+				   REC_N_OWNED_MASK, REC_N_OWNED_SHIFT));
+}
+
+/******************************************************//**
+The following function is used to get the number of records owned by the
+previous directory record.
+@return number of owned records */
+UNIV_INLINE
+ulint
+rec_get_n_owned_new(
+/*================*/
+	const rec_t*	rec)	/*!< in: new-style physical record */
+{
+	return(rec_get_bit_field_1(rec, REC_NEW_N_OWNED,
+				   REC_N_OWNED_MASK, REC_N_OWNED_SHIFT));
+}
+
+/******************************************************//**
+The following function is used to retrieve the info bits of a record.
+@return info bits */
+UNIV_INLINE
+byte
+rec_get_info_bits(
+/*==============*/
+	const rec_t*	rec,	/*!< in: physical record */
+	ulint		comp)	/*!< in: nonzero=compact page format */
+{
+	return rec_get_bit_field_1(
+		rec, comp ? REC_NEW_INFO_BITS : REC_OLD_INFO_BITS,
+		REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT);
+}
+
+/******************************************************//**
+The following function is used to retrieve the info and status
+bits of a record.  (Only compact records have status bits.)
+@return info and status bits */
+UNIV_INLINE
+byte
+rec_get_info_and_status_bits(
+/*=========================*/
+	const rec_t*	rec,	/*!< in: physical record */
+	ulint		comp)	/*!< in: nonzero=compact page format */
+{
+	compile_time_assert(!((REC_NEW_STATUS_MASK >> REC_NEW_STATUS_SHIFT)
+			      & (REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT)));
+	if (comp)
+		return static_cast<byte>(rec_get_info_bits(rec, TRUE) |
+					 rec_get_status(rec));
+	else
+		return rec_get_info_bits(rec, FALSE);
+}
+/******************************************************//**
+The following function is used to set the info and status
+bits of a record.  (Only compact records have status bits.) */
+UNIV_INLINE
+void
+rec_set_info_and_status_bits(
+/*=========================*/
+	rec_t*	rec,	/*!< in/out: physical record */
+	ulint	bits)	/*!< in: info bits */
+{
+	compile_time_assert(!((REC_NEW_STATUS_MASK >> REC_NEW_STATUS_SHIFT)
+			      & (REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT)));
+	rec_set_status(rec, bits & REC_NEW_STATUS_MASK);
+	rec_set_bit_field_1(rec, bits & ~REC_NEW_STATUS_MASK,
+			    REC_NEW_INFO_BITS,
+			    REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT);
+}
+
+/******************************************************//**
+The following function tells if record is delete marked.
+@return nonzero if delete marked */
+UNIV_INLINE
+ulint
+rec_get_deleted_flag(
+/*=================*/
+	const rec_t*	rec,	/*!< in: physical record */
+	ulint		comp)	/*!< in: nonzero=compact page format */
+{
+	if (comp) {
+		return(rec_get_bit_field_1(rec, REC_NEW_INFO_BITS,
+					   REC_INFO_DELETED_FLAG,
+					   REC_INFO_BITS_SHIFT));
+	} else {
+		return(rec_get_bit_field_1(rec, REC_OLD_INFO_BITS,
+					   REC_INFO_DELETED_FLAG,
+					   REC_INFO_BITS_SHIFT));
+	}
+}
+
+/******************************************************//**
+The following function tells if a new-style record is a node pointer.
+@return TRUE if node pointer */
+UNIV_INLINE
+bool
+rec_get_node_ptr_flag(
+/*==================*/
+	const rec_t*	rec)	/*!< in: physical record */
+{
+	return(REC_STATUS_NODE_PTR == rec_get_status(rec));
+}
+
+/******************************************************//**
+The following function is used to get the order number
+of an old-style record in the heap of the index page.
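+The infimum and supremum records have heap numbers 0 and 1; user records
+are numbered upwards from 2 (PAGE_HEAP_NO_USER_LOW in page0page.h).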
+@return heap order number */ +UNIV_INLINE +ulint +rec_get_heap_no_old( +/*================*/ + const rec_t* rec) /*!< in: physical record */ +{ + return(rec_get_bit_field_2(rec, REC_OLD_HEAP_NO, + REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT)); +} + +/******************************************************//** +The following function is used to get the order number +of a new-style record in the heap of the index page. +@return heap order number */ +UNIV_INLINE +ulint +rec_get_heap_no_new( +/*================*/ + const rec_t* rec) /*!< in: physical record */ +{ + return(rec_get_bit_field_2(rec, REC_NEW_HEAP_NO, + REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT)); +} + +/******************************************************//** +The following function is used to test whether the data offsets in the record +are stored in one-byte or two-byte format. +@return TRUE if 1-byte form */ +UNIV_INLINE +ibool +rec_get_1byte_offs_flag( +/*====================*/ + const rec_t* rec) /*!< in: physical record */ +{ + return(rec_get_bit_field_1(rec, REC_OLD_SHORT, REC_OLD_SHORT_MASK, + REC_OLD_SHORT_SHIFT)); +} + +/******************************************************//** +The following function is used to set the 1-byte offsets flag. */ +UNIV_INLINE +void +rec_set_1byte_offs_flag( +/*====================*/ + rec_t* rec, /*!< in: physical record */ + ibool flag) /*!< in: TRUE if 1byte form */ +{ + ut_ad(flag <= 1); + + rec_set_bit_field_1(rec, flag, REC_OLD_SHORT, REC_OLD_SHORT_MASK, + REC_OLD_SHORT_SHIFT); +} + +/******************************************************//** +Returns the offset of nth field end if the record is stored in the 1-byte +offsets form. If the field is SQL null, the flag is ORed in the returned +value. +@return offset of the start of the field, SQL null flag ORed */ +UNIV_INLINE +uint8_t +rec_1_get_field_end_info( +/*=====================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ +{ + ut_ad(rec_get_1byte_offs_flag(rec)); + ut_ad(n < rec_get_n_fields_old(rec)); + + return(mach_read_from_1(rec - (REC_N_OLD_EXTRA_BYTES + n + 1))); +} + +/******************************************************//** +Returns the offset of nth field end if the record is stored in the 2-byte +offsets form. If the field is SQL null, the flag is ORed in the returned +value. +@return offset of the start of the field, SQL null flag and extern +storage flag ORed */ +UNIV_INLINE +uint16_t +rec_2_get_field_end_info( +/*=====================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ +{ + ut_ad(!rec_get_1byte_offs_flag(rec)); + ut_ad(n < rec_get_n_fields_old(rec)); + + return(mach_read_from_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n + 2))); +} + +/******************************************************//** +Returns nonzero if the field is stored off-page. +@retval 0 if the field is stored in-page +@retval REC_2BYTE_EXTERN_MASK if the field is stored externally */ +UNIV_INLINE +ulint +rec_2_is_field_extern( +/*==================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ +{ + return(rec_2_get_field_end_info(rec, n) & REC_2BYTE_EXTERN_MASK); +} + +/**********************************************************//** +The following function sets the number of allocated elements +for an array of offsets. 
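+Callers normally reach this through the rec_offs_init() macro of rem0rec.h;
+an illustrative stack-based pattern:
+@code
+rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+rec_offs_init(offsets_);	/* records the allocated length */
+@endcode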
*/
+UNIV_INLINE
+void
+rec_offs_set_n_alloc(
+/*=================*/
+	rec_offs*offsets,	/*!< out: array for rec_get_offsets(),
+				must be allocated */
+	ulint	n_alloc)	/*!< in: number of elements */
+{
+	ut_ad(n_alloc > REC_OFFS_HEADER_SIZE);
+	MEM_UNDEFINED(offsets, n_alloc * sizeof *offsets);
+	offsets[0] = static_cast<rec_offs>(n_alloc);
+}
+
+/************************************************************//**
+The following function is used to get an offset to the nth
+data field in a record.
+@return offset from the origin of rec */
+UNIV_INLINE
+rec_offs
+rec_get_nth_field_offs(
+/*===================*/
+	const rec_offs*	offsets,/*!< in: array returned by rec_get_offsets() */
+	ulint		n,	/*!< in: index of the field */
+	ulint*		len)	/*!< out: length of the field; UNIV_SQL_NULL
+				if SQL null; UNIV_SQL_DEFAULT is default value */
+{
+	ut_ad(n < rec_offs_n_fields(offsets));
+
+	rec_offs offs = n == 0 ? 0 : get_value(rec_offs_base(offsets)[n]);
+	rec_offs next_offs = rec_offs_base(offsets)[1 + n];
+
+	if (get_type(next_offs) == SQL_NULL) {
+		*len = UNIV_SQL_NULL;
+	} else if (get_type(next_offs) == DEFAULT) {
+		*len = UNIV_SQL_DEFAULT;
+	} else {
+		*len = get_value(next_offs) - offs;
+	}
+
+	return(offs);
+}
+
+/******************************************************//**
+Determine if the offsets are for a record containing null BLOB pointers.
+@return first field containing a null BLOB pointer, or NULL if none found */
+UNIV_INLINE
+const byte*
+rec_offs_any_null_extern(
+/*=====================*/
+	const rec_t*	rec,		/*!< in: record */
+	const rec_offs*	offsets)	/*!< in: rec_get_offsets(rec) */
+{
+	ulint	i;
+	ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+	if (!rec_offs_any_extern(offsets)) {
+		return(NULL);
+	}
+
+	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+		if (rec_offs_nth_extern(offsets, i)) {
+			ulint		len;
+			const byte*	field
+				= rec_get_nth_field(rec, offsets, i, &len);
+
+			ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+			if (!memcmp(field + len
+				    - BTR_EXTERN_FIELD_REF_SIZE,
+				    field_ref_zero,
+				    BTR_EXTERN_FIELD_REF_SIZE)) {
+				return(field);
+			}
+		}
+	}
+
+	return(NULL);
+}
+
+/******************************************************//**
+Gets the physical size of a field.
+@return length of field */
+UNIV_INLINE
+ulint
+rec_offs_nth_size(
+/*==============*/
+	const rec_offs*	offsets,/*!< in: array returned by rec_get_offsets() */
+	ulint		n)	/*!< in: nth field */
+{
+	ut_ad(rec_offs_validate(NULL, NULL, offsets));
+	ut_ad(n < rec_offs_n_fields(offsets));
+	if (!n) {
+		return get_value(rec_offs_base(offsets)[1 + n]);
+	}
+	return get_value((rec_offs_base(offsets)[1 + n]))
+	       - get_value(rec_offs_base(offsets)[n]);
+}
+
+/******************************************************//**
+Returns the number of extern bits set in a record.
+@return number of externally stored fields */
+UNIV_INLINE
+ulint
+rec_offs_n_extern(
+/*==============*/
+	const rec_offs*	offsets)/*!< in: array returned by rec_get_offsets() */
+{
+	ulint	n = 0;
+
+	if (rec_offs_any_extern(offsets)) {
+		ulint	i;
+
+		for (i = rec_offs_n_fields(offsets); i--; ) {
+			if (rec_offs_nth_extern(offsets, i)) {
+				n++;
+			}
+		}
+	}
+
+	return(n);
+}
+
+/******************************************************//**
+Returns the offset of n - 1th field end if the record is stored in the 1-byte
+offsets form. If the field is SQL null, the flag is ORed in the returned
+value.
This function and the 2-byte counterpart are defined here because the +C-compiler was not able to sum negative and positive constant offsets, and +warned of constant arithmetic overflow within the compiler. +@return offset of the start of the PREVIOUS field, SQL null flag ORed */ +UNIV_INLINE +ulint +rec_1_get_prev_field_end_info( +/*==========================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ +{ + ut_ad(rec_get_1byte_offs_flag(rec)); + ut_ad(n <= rec_get_n_fields_old(rec)); + + return(mach_read_from_1(rec - (REC_N_OLD_EXTRA_BYTES + n))); +} + +/******************************************************//** +Returns the offset of n - 1th field end if the record is stored in the 2-byte +offsets form. If the field is SQL null, the flag is ORed in the returned +value. +@return offset of the start of the PREVIOUS field, SQL null flag ORed */ +UNIV_INLINE +ulint +rec_2_get_prev_field_end_info( +/*==========================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ +{ + ut_ad(!rec_get_1byte_offs_flag(rec)); + ut_ad(n <= rec_get_n_fields_old(rec)); + + return(mach_read_from_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n))); +} + +/******************************************************//** +Sets the field end info for the nth field if the record is stored in the +1-byte format. */ +UNIV_INLINE +void +rec_1_set_field_end_info( +/*=====================*/ + rec_t* rec, /*!< in: record */ + ulint n, /*!< in: field index */ + ulint info) /*!< in: value to set */ +{ + ut_ad(rec_get_1byte_offs_flag(rec)); + ut_ad(n < rec_get_n_fields_old(rec)); + + mach_write_to_1(rec - (REC_N_OLD_EXTRA_BYTES + n + 1), info); +} + +/******************************************************//** +Sets the field end info for the nth field if the record is stored in the +2-byte format. */ +UNIV_INLINE +void +rec_2_set_field_end_info( +/*=====================*/ + rec_t* rec, /*!< in: record */ + ulint n, /*!< in: field index */ + ulint info) /*!< in: value to set */ +{ + ut_ad(!rec_get_1byte_offs_flag(rec)); + ut_ad(n < rec_get_n_fields_old(rec)); + + mach_write_to_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n + 2), info); +} + +/******************************************************//** +Returns the offset of nth field start if the record is stored in the 1-byte +offsets form. +@return offset of the start of the field */ +UNIV_INLINE +ulint +rec_1_get_field_start_offs( +/*=======================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ +{ + ut_ad(rec_get_1byte_offs_flag(rec)); + ut_ad(n <= rec_get_n_fields_old(rec)); + + if (n == 0) { + + return(0); + } + + return(rec_1_get_prev_field_end_info(rec, n) + & ~REC_1BYTE_SQL_NULL_MASK); +} + +/******************************************************//** +Returns the offset of nth field start if the record is stored in the 2-byte +offsets form. +@return offset of the start of the field */ +UNIV_INLINE +ulint +rec_2_get_field_start_offs( +/*=======================*/ + const rec_t* rec, /*!< in: record */ + ulint n) /*!< in: field index */ +{ + ut_ad(!rec_get_1byte_offs_flag(rec)); + ut_ad(n <= rec_get_n_fields_old(rec)); + + if (n == 0) { + + return(0); + } + + return(rec_2_get_prev_field_end_info(rec, n) + & ~(REC_2BYTE_SQL_NULL_MASK | REC_2BYTE_EXTERN_MASK)); +} + +/******************************************************//** +The following function is used to read the offset of the start of a data field +in the record. 
The start of an SQL null field is the end offset of the
+previous non-null field, or 0, if none exists. If n is the number of the last
+field + 1, then the end offset of the last field is returned.
+@return offset of the start of the field */
+UNIV_INLINE
+ulint
+rec_get_field_start_offs(
+/*=====================*/
+	const rec_t*	rec,	/*!< in: record */
+	ulint		n)	/*!< in: field index */
+{
+	ut_ad(rec);
+	ut_ad(n <= rec_get_n_fields_old(rec));
+
+	if (n == 0) {
+
+		return(0);
+	}
+
+	if (rec_get_1byte_offs_flag(rec)) {
+
+		return(rec_1_get_field_start_offs(rec, n));
+	}
+
+	return(rec_2_get_field_start_offs(rec, n));
+}
+
+/************************************************************//**
+Gets the physical size of an old-style field.
+Also an SQL null may have a field of size > 0,
+if the data type is of a fixed size.
+@return field size in bytes */
+UNIV_INLINE
+ulint
+rec_get_nth_field_size(
+/*===================*/
+	const rec_t*	rec,	/*!< in: record */
+	ulint		n)	/*!< in: index of the field */
+{
+	ulint	os;
+	ulint	next_os;
+
+	os = rec_get_field_start_offs(rec, n);
+	next_os = rec_get_field_start_offs(rec, n + 1);
+
+	ut_ad(next_os - os < srv_page_size);
+
+	return(next_os - os);
+}
+
+/**********************************************************//**
+The following function returns the data size of an old-style physical
+record, that is the sum of field lengths. SQL null fields
+are counted as length 0 fields. The value returned by the function
+is the distance from record origin to record end in bytes.
+@return size */
+UNIV_INLINE
+ulint
+rec_get_data_size_old(
+/*==================*/
+	const rec_t*	rec)	/*!< in: physical record */
+{
+	ut_ad(rec);
+
+	return(rec_get_field_start_offs(rec, rec_get_n_fields_old(rec)));
+}
+
+/**********************************************************//**
+The following function sets the number of fields in offsets. */
+UNIV_INLINE
+void
+rec_offs_set_n_fields(
+/*==================*/
+	rec_offs*	offsets,	/*!< in/out: array returned by
+					rec_get_offsets() */
+	ulint		n_fields)	/*!< in: number of fields */
+{
+	ut_ad(offsets);
+	ut_ad(n_fields > 0);
+	ut_ad(n_fields <= REC_MAX_N_FIELDS);
+	ut_ad(n_fields + REC_OFFS_HEADER_SIZE
+	      <= rec_offs_get_n_alloc(offsets));
+	offsets[1] = static_cast<rec_offs>(n_fields);
+}
+
+/**********************************************************//**
+The following function returns the data size of a physical
+record, that is the sum of field lengths. SQL null fields
+are counted as length 0 fields. The value returned by the function
+is the distance from record origin to record end in bytes.
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_data_size(
+/*===============*/
+	const rec_offs*	offsets)/*!< in: array returned by rec_get_offsets() */
+{
+	ulint	size;
+
+	ut_ad(rec_offs_validate(NULL, NULL, offsets));
+	size = get_value(rec_offs_base(offsets)[rec_offs_n_fields(offsets)]);
+	ut_ad(size < srv_page_size);
+	return(size);
+}
+
+/**********************************************************//**
+Returns the total size of record minus data size of record. The value
+returned by the function is the distance from record start to record origin
+in bytes.
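+Together with rec_offs_data_size() this gives the full footprint of the
+record on the page; rec_offs_size() below returns exactly that sum:
+@code
+ulint total = rec_offs_extra_size(offsets) + rec_offs_data_size(offsets);
+ut_ad(total == rec_offs_size(offsets));
+@endcode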
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_extra_size(
+/*================*/
+	const rec_offs*	offsets)/*!< in: array returned by rec_get_offsets() */
+{
+	ulint	size;
+	ut_ad(rec_offs_validate(NULL, NULL, offsets));
+	size = *rec_offs_base(offsets) & REC_OFFS_MASK;
+	ut_ad(size < srv_page_size);
+	return(size);
+}
+
+/**********************************************************//**
+Returns the total size of a physical record.
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_size(
+/*==========*/
+	const rec_offs*	offsets)/*!< in: array returned by rec_get_offsets() */
+{
+	return(rec_offs_data_size(offsets) + rec_offs_extra_size(offsets));
+}
+
+#ifdef UNIV_DEBUG
+/**********************************************************//**
+Returns a pointer to the end of the record.
+@return pointer to end */
+UNIV_INLINE
+byte*
+rec_get_end(
+/*========*/
+	const rec_t*	rec,	/*!< in: pointer to record */
+	const rec_offs*	offsets)/*!< in: array returned by rec_get_offsets() */
+{
+	ut_ad(rec_offs_validate(rec, NULL, offsets));
+	return(const_cast<rec_t*>(rec + rec_offs_data_size(offsets)));
+}
+
+/**********************************************************//**
+Returns a pointer to the start of the record.
+@return pointer to start */
+UNIV_INLINE
+byte*
+rec_get_start(
+/*==========*/
+	const rec_t*	rec,	/*!< in: pointer to record */
+	const rec_offs*	offsets)/*!< in: array returned by rec_get_offsets() */
+{
+	ut_ad(rec_offs_validate(rec, NULL, offsets));
+	return(const_cast<rec_t*>(rec - rec_offs_extra_size(offsets)));
+}
+#endif /* UNIV_DEBUG */
+
+/** Copy a physical record to a buffer.
+@param[in]	buf	buffer
+@param[in]	rec	physical record
+@param[in]	offsets	array returned by rec_get_offsets()
+@return pointer to the origin of the copy */
+UNIV_INLINE
+rec_t*
+rec_copy(
+	void*		buf,
+	const rec_t*	rec,
+	const rec_offs*	offsets)
+{
+	ulint	extra_len;
+	ulint	data_len;
+
+	ut_ad(rec != NULL);
+	ut_ad(buf != NULL);
+	ut_ad(rec_offs_validate(rec, NULL, offsets));
+	ut_ad(rec_validate(rec, offsets));
+
+	extra_len = rec_offs_extra_size(offsets);
+	data_len = rec_offs_data_size(offsets);
+
+	memcpy(buf, rec - extra_len, extra_len + data_len);
+
+	return((byte*) buf + extra_len);
+}
+
+/**********************************************************//**
+Returns the extra size of an old-style physical record if we know its
+data size and number of fields.
+@return extra size */
+UNIV_INLINE
+ulint
+rec_get_converted_extra_size(
+/*=========================*/
+	ulint	data_size,	/*!< in: data size */
+	ulint	n_fields,	/*!< in: number of fields */
+	ulint	n_ext)		/*!< in: number of externally stored columns */
+{
+	if (!n_ext && data_size <= REC_1BYTE_OFFS_LIMIT) {
+
+		return(REC_N_OLD_EXTRA_BYTES + n_fields);
+	}
+
+	return(REC_N_OLD_EXTRA_BYTES + 2 * n_fields);
+}
+
+/**********************************************************//**
+The following function returns the size of a data tuple when converted to
+a physical record.
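+For example, a caller could size a buffer before converting the tuple
+(illustrative only; assumes no externally stored columns, hence n_ext = 0):
+@code
+ulint	size = rec_get_converted_size(index, entry, 0);
+byte*	buf = static_cast<byte*>(mem_heap_alloc(heap, size));
+@endcode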
+@return size */ +UNIV_INLINE +ulint +rec_get_converted_size( +/*===================*/ + dict_index_t* index, /*!< in: record descriptor */ + const dtuple_t* dtuple, /*!< in: data tuple */ + ulint n_ext) /*!< in: number of externally stored columns */ +{ + ulint data_size; + ulint extra_size; + + ut_ad(dtuple_check_typed(dtuple)); +#ifdef UNIV_DEBUG + if (dict_index_is_ibuf(index)) { + ut_ad(dtuple->n_fields > 1); + } else if ((dtuple_get_info_bits(dtuple) & REC_NEW_STATUS_MASK) + == REC_STATUS_NODE_PTR) { + ut_ad(dtuple->n_fields - 1 + == dict_index_get_n_unique_in_tree_nonleaf(index)); + } else if (index->table->id == DICT_INDEXES_ID) { + /* The column SYS_INDEXES.MERGE_THRESHOLD was + instantly added in MariaDB 10.2.2 (MySQL 5.7). */ + ut_ad(!index->table->is_temporary()); + ut_ad(index->n_fields == DICT_NUM_FIELDS__SYS_INDEXES); + ut_ad(dtuple->n_fields == DICT_NUM_FIELDS__SYS_INDEXES + || dtuple->n_fields + == DICT_FLD__SYS_INDEXES__MERGE_THRESHOLD); + } else { + ut_ad(dtuple->n_fields >= index->n_core_fields); + ut_ad(dtuple->n_fields <= index->n_fields + || dtuple->is_alter_metadata()); + } +#endif + + if (dict_table_is_comp(index->table)) { + return rec_get_converted_size_comp(index, dtuple, NULL); + } + + data_size = dtuple_get_data_size(dtuple, 0); + + /* If primary key is being updated then the new record inherits + externally stored fields from the delete-marked old record. + In that case, n_ext may be less value than + dtuple_get_n_ext(tuple). */ + ut_ad(n_ext <= dtuple_get_n_ext(dtuple)); + extra_size = rec_get_converted_extra_size( + data_size, dtuple_get_n_fields(dtuple), n_ext); + + return(data_size + extra_size); +} diff --git a/storage/innobase/include/rem0types.h b/storage/innobase/include/rem0types.h new file mode 100644 index 00000000..0e4075a9 --- /dev/null +++ b/storage/innobase/include/rem0types.h @@ -0,0 +1,78 @@ +/***************************************************************************** + +Copyright (c) 1994, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/rem0types.h +Record manager global types + +Created 5/30/1994 Heikki Tuuri +*************************************************************************/ + +#ifndef rem0types_h +#define rem0types_h + +/* We define the physical record simply as an array of bytes */ +typedef byte rec_t; + +/** This type represents a field offset in a rec_t* */ +typedef unsigned short int rec_offs; + +/* Maximum values for various fields (for non-blob tuples) */ +#define REC_MAX_N_FIELDS (1024 - 1) +#define REC_MAX_HEAP_NO (2 * 8192 - 1) +#define REC_MAX_N_OWNED (16 - 1) + +/* Maximum number of user defined fields/columns. 
The reserved columns +are the ones InnoDB adds internally: DB_ROW_ID, DB_TRX_ID, DB_ROLL_PTR. +Before MariaDB Server 10.5, we needed "* 2" because mlog_parse_index() +created a dummy table object possibly, with some of the system columns +in it, and then adds the 3 system columns (again) using +dict_table_add_system_columns(). +For now, we will keep this limitation to maintain file format compatibility +with older versions. */ +#define REC_MAX_N_USER_FIELDS (REC_MAX_N_FIELDS - DATA_N_SYS_COLS * 2) + +/* REC_ANTELOPE_MAX_INDEX_COL_LEN is measured in bytes and is the maximum +indexed field length (or indexed prefix length) for indexes on tables of +ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT format. +Before we support UTF-8 encodings with mbmaxlen = 4, a UTF-8 character +may take at most 3 bytes. So the limit was set to 3*256, so that one +can create a column prefix index on 256 characters of a TEXT or VARCHAR +column also in the UTF-8 charset. +This constant MUST NOT BE CHANGED, or the compatibility of InnoDB data +files would be at risk! */ +#define REC_ANTELOPE_MAX_INDEX_COL_LEN 768 + +/** Maximum indexed field length for tables that have atomic BLOBs. +This (3072) is the maximum index row length allowed, so we cannot create index +prefix column longer than that. */ +#define REC_VERSION_56_MAX_INDEX_COL_LEN 3072 + +/** Innodb row types are a subset of the MySQL global enum row_type. +They are made into their own enum so that switch statements can account +for each of them. */ +enum rec_format_enum { + REC_FORMAT_REDUNDANT = 0, /*!< REDUNDANT row format */ + REC_FORMAT_COMPACT = 1, /*!< COMPACT row format */ + REC_FORMAT_COMPRESSED = 2, /*!< COMPRESSED row format */ + REC_FORMAT_DYNAMIC = 3 /*!< DYNAMIC row format */ +}; +typedef enum rec_format_enum rec_format_t; + +#endif diff --git a/storage/innobase/include/row0ext.h b/storage/innobase/include/row0ext.h new file mode 100644 index 00000000..78886332 --- /dev/null +++ b/storage/innobase/include/row0ext.h @@ -0,0 +1,101 @@ +/***************************************************************************** + +Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0ext.h +Caching of externally stored column prefixes + +Created September 2006 Marko Makela +*******************************************************/ + +#ifndef row0ext_h +#define row0ext_h + +#include "data0types.h" +#include "mem0mem.h" +#include "dict0types.h" +#include "fsp0types.h" +#include "row0types.h" + +/********************************************************************//** +Creates a cache of column prefixes of externally stored columns. 
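+An illustrative call (n_ext, ext_cols, row and heap stand for values the
+caller has prepared as documented alongside the parameters below):
+@code
+row_ext_t*	ext = row_ext_create(n_ext, ext_cols, *table, row, heap);
+@endcode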
+@return own: column prefix cache */ +row_ext_t* +row_ext_create( +/*===========*/ + ulint n_ext, /*!< in: number of externally stored columns */ + const ulint* ext, /*!< in: col_no's of externally stored columns + in the InnoDB table object, as reported by + dict_col_get_no(); NOT relative to the records + in the clustered index */ + const dict_table_t& table, /*!< in: table */ + const dtuple_t* tuple, /*!< in: data tuple containing the field + references of the externally stored + columns; must be indexed by col_no; + the clustered index record must be + covered by a lock or a page latch + to prevent deletion (rollback or purge). */ + mem_heap_t* heap); /*!< in: heap where created */ + +/********************************************************************//** +Looks up a column prefix of an externally stored column. +@return column prefix, or NULL if the column is not stored externally, +or pointer to field_ref_zero if the BLOB pointer is unset */ +UNIV_INLINE +const byte* +row_ext_lookup_ith( +/*===============*/ + const row_ext_t* ext, /*!< in/out: column prefix cache */ + ulint i, /*!< in: index of ext->ext[] */ + ulint* len); /*!< out: length of prefix, in bytes, + at most the length determined by + DICT_MAX_FIELD_LEN_BY_FORMAT() */ +/********************************************************************//** +Looks up a column prefix of an externally stored column. +@return column prefix, or NULL if the column is not stored externally, +or pointer to field_ref_zero if the BLOB pointer is unset */ +UNIV_INLINE +const byte* +row_ext_lookup( +/*===========*/ + const row_ext_t* ext, /*!< in: column prefix cache */ + ulint col, /*!< in: column number in the InnoDB + table object, as reported by + dict_col_get_no(); NOT relative to the + records in the clustered index */ + ulint* len); /*!< out: length of prefix, in bytes, + at most the length determined by + DICT_MAX_FIELD_LEN_BY_FORMAT() */ + +/** Prefixes of externally stored columns */ +struct row_ext_t{ + ulint n_ext; /*!< number of externally stored columns */ + const ulint* ext; /*!< col_no's of externally stored columns */ + byte* buf; /*!< backing store of the column prefix cache */ + ulint max_len;/*!< maximum prefix length, it could be + REC_ANTELOPE_MAX_INDEX_COL_LEN or + REC_VERSION_56_MAX_INDEX_COL_LEN depending + on row format */ + ulint zip_size;/*!< ROW_FORMAT=COMPRESSED page size, or 0 */ + ulint len[1]; /*!< prefix lengths; 0 if not cached */ +}; + +#include "row0ext.inl" + +#endif diff --git a/storage/innobase/include/row0ext.inl b/storage/innobase/include/row0ext.inl new file mode 100644 index 00000000..913b51b3 --- /dev/null +++ b/storage/innobase/include/row0ext.inl @@ -0,0 +1,87 @@ +/***************************************************************************** + +Copyright (c) 2006, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0ext.ic +Caching of externally stored column prefixes + +Created September 2006 Marko Makela +*******************************************************/ + +#include "rem0types.h" +#include "btr0types.h" + +/********************************************************************//** +Looks up a column prefix of an externally stored column. +@return column prefix, or NULL if the column is not stored externally, +or pointer to field_ref_zero if the BLOB pointer is unset */ +UNIV_INLINE +const byte* +row_ext_lookup_ith( +/*===============*/ + const row_ext_t* ext, /*!< in/out: column prefix cache */ + ulint i, /*!< in: index of ext->ext[] */ + ulint* len) /*!< out: length of prefix, in bytes, + at most ext->max_len */ +{ + ut_ad(ext); + ut_ad(len); + ut_ad(i < ext->n_ext); + + *len = ext->len[i]; + + ut_ad(*len <= ext->max_len); + ut_ad(ext->max_len > 0); + + if (*len == 0) { + /* The BLOB could not be fetched to the cache. */ + return(field_ref_zero); + } else { + return(ext->buf + i * ext->max_len); + } +} + +/********************************************************************//** +Looks up a column prefix of an externally stored column. +@return column prefix, or NULL if the column is not stored externally, +or pointer to field_ref_zero if the BLOB pointer is unset */ +UNIV_INLINE +const byte* +row_ext_lookup( +/*===========*/ + const row_ext_t* ext, /*!< in: column prefix cache */ + ulint col, /*!< in: column number in the InnoDB + table object, as reported by + dict_col_get_no(); NOT relative to the + records in the clustered index */ + ulint* len) /*!< out: length of prefix, in bytes, + at most ext->max_len */ +{ + ulint i; + + ut_ad(ext); + ut_ad(len); + + for (i = 0; i < ext->n_ext; i++) { + if (col == ext->ext[i]) { + return(row_ext_lookup_ith(ext, i, len)); + } + } + + return(NULL); +} diff --git a/storage/innobase/include/row0ftsort.h b/storage/innobase/include/row0ftsort.h new file mode 100644 index 00000000..3ffa8243 --- /dev/null +++ b/storage/innobase/include/row0ftsort.h @@ -0,0 +1,268 @@ +/***************************************************************************** + +Copyright (c) 2010, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2015, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0ftsort.h +Create Full Text Index with (parallel) merge sort + +Created 10/13/2010 Jimmy Yang +*******************************************************/ + +#ifndef row0ftsort_h +#define row0ftsort_h + +#include "data0data.h" +#include "fts0fts.h" +#include "fts0priv.h" +#include "rem0types.h" +#include "row0merge.h" +#include "btr0bulk.h" +#include "srv0srv.h" + +/** This structure defineds information the scan thread will fetch +and put to the linked list for parallel tokenization/sort threads +to process */ +typedef struct fts_doc_item fts_doc_item_t; + +/** Information about temporary files used in merge sort */ +struct fts_doc_item { + dfield_t* field; /*!< field contains document string */ + doc_id_t doc_id; /*!< document ID */ + UT_LIST_NODE_T(fts_doc_item_t) doc_list; + /*!< list of doc items */ +}; + +/** This defines the list type that scan thread would feed the parallel +tokenization threads and sort threads. */ +typedef UT_LIST_BASE_NODE_T(fts_doc_item_t) fts_doc_list_t; + +#define FTS_PLL_MERGE 1 + +/** Sort information passed to each individual parallel sort thread */ +struct fts_psort_t; + +/** Common info passed to each parallel sort thread */ +struct fts_psort_common_t { + row_merge_dup_t* dup; /*!< descriptor of FTS index */ + dict_table_t* new_table; /*!< source table */ + /** Old table page size */ + ulint old_zip_size; + trx_t* trx; /*!< transaction */ + fts_psort_t* all_info; /*!< all parallel sort info */ + pthread_cond_t sort_cond; /*!< sort completion */ + ibool opt_doc_id_size;/*!< whether to use 4 bytes + instead of 8 bytes integer to + store Doc ID during sort, if + Doc ID will not be big enough + to use 8 bytes value */ +}; + +struct fts_psort_t { + ulint psort_id; /*!< Parallel sort ID */ + row_merge_buf_t* merge_buf[FTS_NUM_AUX_INDEX]; + /*!< sort buffer */ + merge_file_t* merge_file[FTS_NUM_AUX_INDEX]; + /*!< sort file */ + row_merge_block_t* merge_block[FTS_NUM_AUX_INDEX]; + /*!< buffer to write to file */ + row_merge_block_t* crypt_block[FTS_NUM_AUX_INDEX]; + /*!< buffer to crypt data */ + ulint child_status; /*!< child task status */ + ulint state; /*!< parent state */ + fts_doc_list_t fts_doc_list; /*!< doc list to process */ + fts_psort_common_t* psort_common; /*!< ptr to all psort info */ + tpool::waitable_task* task; /*!< threadpool task */ + dberr_t error; /*!< db error during psort */ + ulint memory_used; /*!< memory used by fts_doc_list */ + mysql_mutex_t mutex; /*!< mutex for fts_doc_list */ +}; + +/** Row fts token for plugin parser */ +struct row_fts_token_t { + fts_string_t* text; /*!< token */ + UT_LIST_NODE_T(row_fts_token_t) + token_list; /*!< next token link */ +}; + +typedef UT_LIST_BASE_NODE_T(row_fts_token_t) fts_token_list_t; + +/** Structure stores information from string tokenization operation */ +struct fts_tokenize_ctx { + /** the processed string length in bytes + (when using the built-in tokenizer), + or the number of row_merge_fts_doc_tokenize_by_parser() calls */ + ulint processed_len; + ulint init_pos; /*!< doc start position */ + ulint buf_used; /*!< the sort buffer (ID) when + tokenization stops, which + could due to sort buffer full */ + ulint 
rows_added[FTS_NUM_AUX_INDEX]; + /*!< number of rows added for + each FTS index partition */ + ib_rbt_t* cached_stopword;/*!< in: stopword list */ + dfield_t sort_field[FTS_NUM_FIELDS_SORT]; + /*!< in: sort field */ + /** parsed tokens (when using an external parser) */ + fts_token_list_t fts_token_list; + + fts_tokenize_ctx() : + processed_len(0), init_pos(0), buf_used(0), + rows_added(), cached_stopword(NULL), sort_field(), + fts_token_list() + { + memset(rows_added, 0, sizeof rows_added); + memset(sort_field, 0, sizeof sort_field); + UT_LIST_INIT(fts_token_list, &row_fts_token_t::token_list); + } +}; + +typedef struct fts_tokenize_ctx fts_tokenize_ctx_t; + +/** Structure stores information needed for the insertion phase of FTS +parallel sort. */ +struct fts_psort_insert { + CHARSET_INFO* charset; /*!< charset info */ + mem_heap_t* heap; /*!< heap */ + ibool opt_doc_id_size;/*!< Whether to use smaller (4 bytes) + integer for Doc ID */ + BtrBulk* btr_bulk; /*!< Bulk load instance */ + dtuple_t* tuple; /*!< Tuple to insert */ + +#ifdef UNIV_DEBUG + ulint aux_index_id; /*!< Auxiliary index id */ +#endif +}; + +typedef struct fts_psort_insert fts_psort_insert_t; + + +/** status bit used for communication between parent and child thread */ +#define FTS_PARENT_COMPLETE 1 +#define FTS_PARENT_EXITING 2 +#define FTS_CHILD_COMPLETE 1 + +/** Print some debug information */ +#define FTSORT_PRINT + +#ifdef FTSORT_PRINT +#define DEBUG_FTS_SORT_PRINT(str) \ + do { \ + ut_print_timestamp(stderr); \ + fprintf(stderr, str); \ + } while (0) +#else +#define DEBUG_FTS_SORT_PRINT(str) +#endif /* FTSORT_PRINT */ + +/*************************************************************//** +Create a temporary "fts sort index" used to merge sort the +tokenized doc string. The index has three "fields": + +1) Tokenized word, +2) Doc ID +3) Word's position in original 'doc'. + +@return dict_index_t structure for the fts sort index */ +dict_index_t* +row_merge_create_fts_sort_index( +/*============================*/ + dict_index_t* index, /*!< in: Original FTS index + based on which this sort index + is created */ + dict_table_t* table, /*!< in,out: table that FTS index + is being created on */ + ibool* opt_doc_id_size); + /*!< out: whether to use 4 bytes + instead of 8 bytes integer to + store Doc ID during sort */ + +/** Initialize FTS parallel sort structures. 
+@param[in] trx transaction +@param[in,out] dup descriptor of FTS index being created +@param[in] new_table table where indexes are created +@param[in] opt_doc_id_size whether to use 4 bytes instead of 8 bytes + integer to store Doc ID during sort +@param[in] old_zip_size page size of the old table during alter +@param[out] psort parallel sort info to be instantiated +@param[out] merge parallel merge info to be instantiated +@return true if all successful */ +bool +row_fts_psort_info_init( + trx_t* trx, + row_merge_dup_t*dup, + dict_table_t* new_table, + bool opt_doc_id_size, + ulint old_zip_size, + fts_psort_t** psort, + fts_psort_t** merge) + MY_ATTRIBUTE((nonnull)); + +/********************************************************************//** +Clean up and deallocate FTS parallel sort structures, and close +temparary merge sort files */ +void +row_fts_psort_info_destroy( +/*=======================*/ + fts_psort_t* psort_info, /*!< parallel sort info */ + fts_psort_t* merge_info); /*!< parallel merge info */ +/********************************************************************//** +Free up merge buffers when merge sort is done */ +void +row_fts_free_pll_merge_buf( +/*=======================*/ + fts_psort_t* psort_info); /*!< in: parallel sort info */ + +/*********************************************************************//** +Start the parallel tokenization and parallel merge sort */ +void +row_fts_start_psort( +/*================*/ + fts_psort_t* psort_info); /*!< in: parallel sort info */ +/*********************************************************************//** +Kick off the parallel merge and insert thread */ +void +row_fts_start_parallel_merge( +/*=========================*/ + fts_psort_t* merge_info); /*!< in: parallel sort info */ +/********************************************************************//** +Propagate a newly added record up one level in the selection tree +@return parent where this value propagated to */ +int +row_merge_fts_sel_propagate( +/*========================*/ + int propogated, /* + +/***************************************************************//** +Checks if foreign key constraint fails for an index entry. Sets shared locks +which lock either the success or the failure of the constraint. NOTE that +the caller must have a shared latch on dict_foreign_key_check_lock. +@return DB_SUCCESS, DB_LOCK_WAIT, DB_NO_REFERENCED_ROW, or +DB_ROW_IS_REFERENCED */ +dberr_t +row_ins_check_foreign_constraint( +/*=============================*/ + ibool check_ref,/*!< in: TRUE If we want to check that + the referenced table is ok, FALSE if we + want to check the foreign key table */ + dict_foreign_t* foreign,/*!< in: foreign constraint; NOTE that the + tables mentioned in it must be in the + dictionary cache if they exist at all */ + dict_table_t* table, /*!< in: if check_ref is TRUE, then the foreign + table, else the referenced table */ + dtuple_t* entry, /*!< in: index entry for index */ + que_thr_t* thr) /*!< in: query thread */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/*********************************************************************//** +Sets a new row to insert for an INS_DIRECT node. This function is only used +if we have constructed the row separately, which is a rare case; this +function is quite slow. 
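+(INS_DIRECT nodes are used internally, for example by dict0crea, when the
+caller builds the row itself instead of evaluating a VALUES list.)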
*/ +void +ins_node_set_new_row( +/*=================*/ + ins_node_t* node, /*!< in: insert node */ + dtuple_t* row); /*!< in: new row (or first row) for the node */ +/***************************************************************//** +Tries to insert an entry into a clustered index, ignoring foreign key +constraints. If a record with the same unique key is found, the other +record is necessarily marked deleted by a committed transaction, or a +unique key violation error occurs. The delete marked record is then +updated to an existing record, and we must write an undo log record on +the delete marked record. +@retval DB_SUCCESS on success +@retval DB_LOCK_WAIT on lock wait when !(flags & BTR_NO_LOCKING_FLAG) +@retval DB_FAIL if retry with BTR_MODIFY_TREE is needed +@return error code */ +dberr_t +row_ins_clust_index_entry_low( +/*==========================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE, + depending on whether we wish optimistic or + pessimistic descent down the index tree */ + dict_index_t* index, /*!< in: clustered index */ + ulint n_uniq, /*!< in: 0 or index->n_uniq */ + dtuple_t* entry, /*!< in/out: index entry to insert */ + ulint n_ext, /*!< in: number of externally stored columns */ + que_thr_t* thr) /*!< in: query thread or NULL */ + MY_ATTRIBUTE((warn_unused_result)); + +/***************************************************************//** +Tries to insert an entry into a secondary index. If a record with exactly the +same fields is found, the other record is necessarily marked deleted. +It is then unmarked. Otherwise, the entry is just inserted to the index. +@retval DB_SUCCESS on success +@retval DB_LOCK_WAIT on lock wait when !(flags & BTR_NO_LOCKING_FLAG) +@retval DB_FAIL if retry with BTR_INSERT_TREE is needed +@return error code */ +dberr_t +row_ins_sec_index_entry_low( +/*========================*/ + ulint flags, /*!< in: undo logging and locking flags */ + btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF or BTR_INSERT_TREE, + depending on whether we wish optimistic or + pessimistic descent down the index tree */ + dict_index_t* index, /*!< in: secondary index */ + mem_heap_t* offsets_heap, + /*!< in/out: memory heap that can be emptied */ + mem_heap_t* heap, /*!< in/out: memory heap */ + dtuple_t* entry, /*!< in/out: index entry to insert */ + trx_id_t trx_id, /*!< in: PAGE_MAX_TRX_ID during + row_log_table_apply(), or 0 */ + que_thr_t* thr) /*!< in: query thread */ + MY_ATTRIBUTE((warn_unused_result)); + +/***************************************************************//** +Inserts an entry into a clustered index. Tries first optimistic, +then pessimistic descent down the tree. If the entry matches enough +to a delete marked record, performs the insert by updating or delete +unmarking the delete marked record. +@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */ +dberr_t +row_ins_clust_index_entry( +/*======================*/ + dict_index_t* index, /*!< in: clustered index */ + dtuple_t* entry, /*!< in/out: index entry to insert */ + que_thr_t* thr, /*!< in: query thread */ + ulint n_ext) /*!< in: number of externally stored columns */ + MY_ATTRIBUTE((warn_unused_result)); +/***************************************************************//** +Inserts an entry into a secondary index. Tries first optimistic, +then pessimistic descent down the tree. 
If the entry matches enough
+to a delete marked record, performs the insert by updating or delete
+unmarking the delete marked record.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */
+dberr_t
+row_ins_sec_index_entry(
+/*====================*/
+	dict_index_t*	index,	/*!< in: secondary index */
+	dtuple_t*	entry,	/*!< in/out: index entry to insert */
+	que_thr_t*	thr,	/*!< in: query thread */
+	bool		check_foreign = true) /*!< in: true if check
+				foreign table is needed, false otherwise */
+	MY_ATTRIBUTE((warn_unused_result));
+/***********************************************************//**
+Inserts a row to a table. This is a high-level function used in
+SQL execution graphs.
+@return query thread to run next or NULL */
+que_thr_t*
+row_ins_step(
+/*=========*/
+	que_thr_t*	thr);	/*!< in: query thread */
+
+/* Insert node types */
+#define	INS_SEARCHED	0	/* INSERT INTO ... SELECT ... */
+#define	INS_VALUES	1	/* INSERT INTO ... VALUES ... */
+#define	INS_DIRECT	2	/* this is for internal use in dict0crea:
+				insert the row directly */
+
+/* Node execution states */
+#define	INS_NODE_SET_IX_LOCK	1	/* we should set an IX lock on table */
+#define	INS_NODE_ALLOC_ROW_ID	2	/* row id should be allocated */
+#define	INS_NODE_INSERT_ENTRIES	3	/* index entries should be built and
+					inserted */
+
+struct row_prebuilt_t;
+
+/** Insert node structure */
+struct ins_node_t
+{
+	explicit ins_node_t(ulint ins_type, dict_table_t *table) :
+		common(QUE_NODE_INSERT, NULL),
+		ins_type(ins_type),
+		row(NULL), table(table), select(NULL), values_list(NULL),
+		state(INS_NODE_SET_IX_LOCK), index(NULL),
+		entry_list(), entry(entry_list.end()),
+		trx_id(0), entry_sys_heap(mem_heap_create(128))
+	{
+	}
+	~ins_node_t() { mem_heap_free(entry_sys_heap); }
+	que_common_t	common;	/*!< node type: QUE_NODE_INSERT */
+	ulint		ins_type;/* INS_VALUES, INS_SEARCHED, or INS_DIRECT */
+	dtuple_t*	row;	/*!< row to insert */
+	dict_table_t*	table;	/*!< table where to insert */
+	sel_node_t*	select;	/*!< select in searched insert */
+	que_node_t*	values_list;/* list of expressions to evaluate and
+				insert in an INS_VALUES insert */
+	ulint		state;	/*!< node execution state */
+	dict_index_t*	index;	/*!< NULL, or the next index where the index
+				entry should be inserted */
+	std::vector<dtuple_t*>
+			entry_list;/* list of entries, one for each index */
+	std::vector<dtuple_t*>::iterator
+			entry;	/*!< NULL, or entry to insert in the index;
+				after a successful insert of the entry,
+				this should be reset to NULL */
+	/** buffer for the system columns */
+	byte		sys_buf[DATA_ROW_ID_LEN
+				+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN];
+	trx_id_t	trx_id;	/*!< trx id or the last trx which executed the
+				node */
+	byte		vers_start_buf[8]; /* Buffers for System Versioning */
+	byte		vers_end_buf[8];   /* system fields. */
+	mem_heap_t*	entry_sys_heap;
+				/* memory heap used as auxiliary storage;
+				entry_list and sys fields are stored here;
+				if this is NULL, entry list should be created
+				and buffers for sys fields in row allocated */
+	void vers_update_end(row_prebuilt_t *prebuilt, bool history_row);
+};
+
+/** Create an insert object.
+@param ins_type	INS_VALUES, ...
+@param table table where to insert +@param heap memory heap +@return the created object */ +inline ins_node_t *ins_node_create(ulint ins_type, dict_table_t *table, + mem_heap_t *heap) +{ + return new (mem_heap_alloc(heap, sizeof(ins_node_t))) + ins_node_t(ins_type, table); +} + +#endif diff --git a/storage/innobase/include/row0log.h b/storage/innobase/include/row0log.h new file mode 100644 index 00000000..469f1f8a --- /dev/null +++ b/storage/innobase/include/row0log.h @@ -0,0 +1,239 @@ +/***************************************************************************** + +Copyright (c) 2011, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0log.h +Modification log for online index creation and online table rebuild + +Created 2011-05-26 Marko Makela +*******************************************************/ + +#pragma once + +#include "que0types.h" +#include "mtr0types.h" +#include "row0types.h" +#include "rem0types.h" +#include "dict0dict.h" +#include "trx0types.h" +#include "trx0undo.h" + +class ut_stage_alter_t; + +extern Atomic_counter onlineddl_rowlog_rows; +extern ulint onlineddl_rowlog_pct_used; +extern ulint onlineddl_pct_progress; + +/******************************************************//** +Allocate the row log for an index and flag the index +for online creation. +@retval true if success, false if not */ +bool +row_log_allocate( +/*=============*/ + const trx_t* trx, /*!< in: the ALTER TABLE transaction */ + dict_index_t* index, /*!< in/out: index */ + dict_table_t* table, /*!< in/out: new table being rebuilt, + or NULL when creating a secondary index */ + bool same_pk,/*!< in: whether the definition of the + PRIMARY KEY has remained the same */ + const dtuple_t* defaults, + /*!< in: default values of + added, changed columns, or NULL */ + const ulint* col_map,/*!< in: mapping of old column + numbers to new ones, or NULL if !table */ + const char* path, /*!< in: where to create temporary file */ + const TABLE* old_table, /*!< in:table definition before alter */ + bool allow_not_null) /*!< in: allow null to non-null + conversion */ + MY_ATTRIBUTE((nonnull(1), warn_unused_result)); + +/******************************************************//** +Free the row log for an index that was being created online. */ +void +row_log_free( +/*=========*/ + row_log_t* log) /*!< in,own: row log */ + MY_ATTRIBUTE((nonnull)); + +/******************************************************//** +Free the row log for an index on which online creation was aborted. 
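+A sketch of the allocate/abort pairing for a secondary index build
+(hypothetical caller; assumes index->lock is held in X mode and
+arguments matching a plain secondary index, i.e. no table rebuild):
+@code
+  if (!row_log_allocate(trx, index, NULL, true, NULL, NULL,
+                        path, old_table, false)) {
+    error = DB_OUT_OF_MEMORY;   // the log could not be allocated
+  }
+  // ... on any later error, abandon the index build:
+  row_log_abort_sec(index);     // frees index->online_log
+@endcode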
 */
+inline void row_log_abort_sec(dict_index_t *index)
+{
+	ut_ad(index->lock.have_u_or_x());
+	ut_ad(!index->is_clust());
+	dict_index_set_online_status(index, ONLINE_INDEX_ABORTED);
+	row_log_free(index->online_log);
+	index->online_log= nullptr;
+}
+
+/** Logs an operation to a secondary index that is (or was) being created.
+@param index	index, S or X latched
+@param tuple	index tuple
+@param trx_id	transaction ID for insert, or 0 for delete
+@retval false	if a row_log_apply() failure happened
+@retval true	otherwise */
+bool row_log_online_op(dict_index_t *index, const dtuple_t *tuple,
+                       trx_id_t trx_id) ATTRIBUTE_COLD;
+
+/******************************************************//**
+Gets the error status of the online index rebuild log.
+@return DB_SUCCESS or error code */
+dberr_t
+row_log_table_get_error(
+/*====================*/
+	const dict_index_t*	index)	/*!< in: clustered index of a table
+					that is being rebuilt online */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Check whether a virtual column is indexed in the new table being
+created during alter table
+@param[in]	index	cluster index
+@param[in]	v_no	virtual column number
+@return true if it is indexed, else false */
+bool
+row_log_col_is_indexed(
+	const dict_index_t*	index,
+	ulint			v_no);
+
+/******************************************************//**
+Logs a delete operation to a table that is being rebuilt.
+This will be merged in row_log_table_apply_delete(). */
+void
+row_log_table_delete(
+/*=================*/
+	const rec_t*	rec,	/*!< in: clustered index leaf page record,
+				page X-latched */
+	dict_index_t*	index,	/*!< in/out: clustered index, S-latched
+				or X-latched */
+	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec,index) */
+	const byte*	sys)	/*!< in: DB_TRX_ID,DB_ROLL_PTR that should
+				be logged, or NULL to use those in rec */
+	ATTRIBUTE_COLD __attribute__((nonnull(1,2,3)));
+
+/******************************************************//**
+Logs an update operation to a table that is being rebuilt.
+This will be merged in row_log_table_apply_update(). */
+void
+row_log_table_update(
+/*=================*/
+	const rec_t*	rec,	/*!< in: clustered index leaf page record,
+				page X-latched */
+	dict_index_t*	index,	/*!< in/out: clustered index, S-latched
+				or X-latched */
+	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec,index) */
+	const dtuple_t*	old_pk);/*!< in: row_log_table_get_pk()
+				before the update */
+
+/******************************************************//**
+Constructs the old PRIMARY KEY and DB_TRX_ID,DB_ROLL_PTR
+of a table that is being rebuilt.
+@return tuple of PRIMARY KEY,DB_TRX_ID,DB_ROLL_PTR in the rebuilt table,
+or NULL if the PRIMARY KEY definition does not change */
+const dtuple_t*
+row_log_table_get_pk(
+/*=================*/
+	const rec_t*	rec,	/*!< in: clustered index leaf page record,
+				page X-latched */
+	dict_index_t*	index,	/*!< in/out: clustered index, S-latched
+				or X-latched */
+	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec,index),
+				or NULL */
+	byte*		sys,	/*!< out: DB_TRX_ID,DB_ROLL_PTR for
+				row_log_table_delete(), or NULL */
+	mem_heap_t**	heap)	/*!< in/out: memory heap where allocated */
+	ATTRIBUTE_COLD __attribute__((nonnull(1,2,5), warn_unused_result));
+
+/******************************************************//**
+Logs an insert to a table that is being rebuilt.
+This will be merged in row_log_table_apply_insert().
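+A sketch of how this pairs with the writers above during a rebuild
+(hypothetical caller; rec is a clustered index leaf page record and
+offsets comes from rec_get_offsets()):
+@code
+  const dtuple_t* old_pk = row_log_table_get_pk(rec, index, offsets,
+                                                NULL, &heap);
+  // ... modify rec in place ...
+  row_log_table_update(rec, index, offsets, old_pk);
+  // a plain insert needs no old PRIMARY KEY:
+  row_log_table_insert(rec, index, offsets);
+@endcode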
*/ +void +row_log_table_insert( +/*=================*/ + const rec_t* rec, /*!< in: clustered index leaf page record, + page X-latched */ + dict_index_t* index, /*!< in/out: clustered index, S-latched + or X-latched */ + const rec_offs* offsets);/*!< in: rec_get_offsets(rec,index) */ + +/** Apply the row_log_table log to a table upon completing rebuild. +@param[in] thr query graph +@param[in] old_table old table +@param[in,out] table MySQL table (for reporting duplicates) +@param[in,out] stage performance schema accounting object, used by +ALTER TABLE. stage->begin_phase_log_table() will be called initially and then +stage->inc() will be called for each block of log that is applied. +@param[in] new_table Altered table +@return DB_SUCCESS, or error code on failure */ +dberr_t +row_log_table_apply( + que_thr_t* thr, + dict_table_t* old_table, + struct TABLE* table, + ut_stage_alter_t* stage, + dict_table_t* new_table) + MY_ATTRIBUTE((warn_unused_result)); + +/******************************************************//** +Get the latest transaction ID that has invoked row_log_online_op() +during online creation. +@return latest transaction ID, or 0 if nothing was logged */ +trx_id_t +row_log_get_max_trx( +/*================*/ + dict_index_t* index) /*!< in: index, must be locked */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Apply the row log to the index upon completing index creation. +@param[in] trx transaction (for checking if the operation was +interrupted) +@param[in,out] index secondary index +@param[in,out] table MySQL table (for reporting duplicates) +@param[in,out] stage performance schema accounting object, used by +ALTER TABLE. stage->begin_phase_log_index() will be called initially and then +stage->inc() will be called for each block of log that is applied. +@return DB_SUCCESS, or error code on failure */ +dberr_t +row_log_apply( + const trx_t* trx, + dict_index_t* index, + struct TABLE* table, + ut_stage_alter_t* stage) + MY_ATTRIBUTE((warn_unused_result)); + +/** Get the n_core_fields of online log for the index +@param index index whose n_core_fields of log to be accessed +@return number of n_core_fields */ +unsigned row_log_get_n_core_fields(const dict_index_t *index); + +/** Get the error code of online log for the index +@param index online index +@return error code present in online log */ +dberr_t row_log_get_error(const dict_index_t *index); + +#ifdef HAVE_PSI_STAGE_INTERFACE +/** Estimate how much work is to be done by the log apply phase +of an ALTER TABLE for this index. +@param[in] index index whose log to assess +@return work to be done by log-apply in abstract units +*/ +ulint +row_log_estimate_work( + const dict_index_t* index); +#endif /* HAVE_PSI_STAGE_INTERFACE */ diff --git a/storage/innobase/include/row0merge.h b/storage/innobase/include/row0merge.h new file mode 100644 index 00000000..93ea650d --- /dev/null +++ b/storage/innobase/include/row0merge.h @@ -0,0 +1,496 @@ +/***************************************************************************** + +Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2015, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0merge.h
+Index build routines using a merge sort
+
+Created 13/06/2005 Jan Lindstrom
+*******************************************************/
+
+#pragma once
+
+#include "que0types.h"
+#include "trx0types.h"
+#include "mtr0mtr.h"
+#include "rem0types.h"
+#include "rem0rec.h"
+#include "btr0types.h"
+#include "row0mysql.h"
+#include "lock0types.h"
+#include "srv0srv.h"
+
+class ut_stage_alter_t;
+
+/* Reserve free space from every block for key_version */
+#define ROW_MERGE_RESERVE_SIZE 4
+
+/* Clustered index read task is mandatory */
+#define COST_READ_CLUSTERED_INDEX	1.0
+
+/* Basic fixed cost to build any type of index */
+#define COST_BUILD_INDEX_STATIC		0.5
+/* Dynamic cost to build any type of index; it is redistributed
+based on the page count ratio of each index */
+#define COST_BUILD_INDEX_DYNAMIC	0.5
+
+/* Sum of the following two must be 1.0 */
+#define PCT_COST_MERGESORT_INDEX	0.4
+#define PCT_COST_INSERT_INDEX		0.6
+
+// Forward declaration
+struct ib_sequence_t;
+
+/** @brief Block size for I/O operations in merge sort.
+
+The minimum is srv_page_size, or page_get_free_space_of_empty()
+rounded to a power of 2.
+
+When not creating a PRIMARY KEY that contains column prefixes, this
+can be set as small as srv_page_size / 2. */
+typedef byte	row_merge_block_t;
+
+/** @brief Secondary buffer for I/O operations of merge records.
+
+This buffer is used for writing or reading a record that spans two
+row_merge_block_t. Thus, it must be able to hold one merge record,
+whose maximum size is the same as the minimum size of
+row_merge_block_t. */
+typedef byte	mrec_buf_t[UNIV_PAGE_SIZE_MAX];
+
+/** @brief Merge record in row_merge_block_t.
+
+The format is the same as a record in ROW_FORMAT=COMPACT with the
+exception that the REC_N_NEW_EXTRA_BYTES are omitted. */
+typedef byte	mrec_t;
+
+/** Merge record in row_merge_buf_t */
+struct mtuple_t {
+	dfield_t*	fields;		/*!< data fields */
+};
+
+/** Buffer for sorting in main memory.
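+The expected lifecycle, using the functions declared later in this
+file (sketch; duplicate reporting and error handling omitted, and in
+debug builds row_merge_buf_write() also takes the output file):
+@code
+  row_merge_buf_t* buf = row_merge_buf_create(index);
+  // ... fill buf with tuples until it is full ...
+  row_merge_buf_sort(buf, NULL);     // NULL: non-unique index
+  row_merge_buf_write(buf, block);   // flush into a merge block
+  buf = row_merge_buf_empty(buf);    // recycle for the next batch
+  row_merge_buf_free(buf);
+@endcode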
 */
+struct row_merge_buf_t {
+	mem_heap_t*	heap;		/*!< memory heap where allocated */
+	dict_index_t*	index;		/*!< the index the tuples belong to */
+	ulint		total_size;	/*!< total amount of data bytes */
+	ulint		n_tuples;	/*!< number of data tuples */
+	ulint		max_tuples;	/*!< maximum number of data tuples */
+	mtuple_t*	tuples;		/*!< array of data tuples */
+	mtuple_t*	tmp_tuples;	/*!< temporary copy of tuples,
+					for sorting */
+};
+
+/** Information about temporary files used in merge sort */
+struct merge_file_t {
+	pfs_os_file_t	fd;		/*!< file descriptor */
+	ulint		offset;		/*!< file offset (end of file) */
+	ib_uint64_t	n_rec;		/*!< number of records in the file */
+};
+
+/** Index field definition */
+struct index_field_t {
+	ulint		col_no;		/*!< column offset */
+	ulint		prefix_len;	/*!< column prefix length, or 0
+					if indexing the whole column */
+	bool		is_v_col;	/*!< whether this is a virtual column */
+	bool		descending;	/*!< whether to use DESC order */
+};
+
+/** Definition of an index being created */
+struct index_def_t {
+	const char*	name;		/*!< index name */
+	bool		rebuild;	/*!< whether the table is rebuilt */
+	ulint		ind_type;	/*!< 0, DICT_UNIQUE,
+					or DICT_CLUSTERED */
+	ulint		key_number;	/*!< MySQL key number,
+					or ULINT_UNDEFINED if none */
+	ulint		n_fields;	/*!< number of fields in index */
+	index_field_t*	fields;		/*!< field definitions */
+	st_mysql_ftparser*
+			parser;		/*!< fulltext parser plugin */
+};
+
+/** Structure for reporting duplicate records. */
+struct row_merge_dup_t {
+	dict_index_t*	index;	/*!< index being sorted */
+	struct TABLE*	table;	/*!< MySQL table object */
+	const ulint*	col_map;/*!< mapping of column numbers
+				in table to the rebuilt table
+				(index->table), or NULL if not
+				rebuilding table */
+	ulint		n_dup;	/*!< number of duplicates */
+};
+
+/*************************************************************//**
+Report a duplicate key. */
+void
+row_merge_dup_report(
+/*=================*/
+	row_merge_dup_t*	dup,	/*!< in/out: for reporting duplicates */
+	const dfield_t*		entry)	/*!< in: duplicate index entry */
+	MY_ATTRIBUTE((nonnull));
+
+/** Drop indexes that were created before an error occurred.
+The data dictionary must have been locked exclusively by the caller,
+because the transaction will not be committed.
+@param trx        dictionary transaction
+@param table      table containing the indexes
+@param locked     true if the table is locked;
+                  false if a lazy drop may be needed
+@param alter_trx  Alter table transaction */
+void
+row_merge_drop_indexes(
+	trx_t*		trx,
+	dict_table_t*	table,
+	bool		locked,
+	const trx_t*	alter_trx=NULL);
+
+/** During recovery, drop recovered index stubs that were created in
+prepare_inplace_alter_table_dict(). */
+void row_merge_drop_temp_indexes();
+
+/** Create temporary merge files in the given parameter path, and if
+UNIV_PFS_IO is defined, register the file descriptor with Performance Schema.
+@param[in]	path	location for creating temporary merge files, or NULL
+@return File descriptor */
+pfs_os_file_t
+row_merge_file_create_low(
+	const char*	path)
+	MY_ATTRIBUTE((warn_unused_result));
+/*********************************************************************//**
+Destroy a merge file, and de-register it from Performance Schema
+if UNIV_PFS_IO is defined. */
+void
+row_merge_file_destroy_low(
+/*=======================*/
+	const pfs_os_file_t&	fd);	/*!< in: merge file descriptor */
+
+/*********************************************************************//**
+Rename an index in the dictionary that was created.
+The data dictionary must have been locked exclusively by the caller,
+because the transaction will not be committed.
+@return DB_SUCCESS if all OK */
+dberr_t
+row_merge_rename_index_to_add(
+/*==========================*/
+	trx_t*		trx,		/*!< in/out: transaction */
+	table_id_t	table_id,	/*!< in: table identifier */
+	index_id_t	index_id)	/*!< in: index identifier */
+	MY_ATTRIBUTE((nonnull(1), warn_unused_result));
+
+/** Create the index and load in to the dictionary.
+@param[in,out]	table		the index is on this table
+@param[in]	index_def	the index definition
+@param[in]	add_v		new virtual columns added along with add
+				index call
+@return index, or NULL on error */
+dict_index_t*
+row_merge_create_index(
+	dict_table_t*		table,
+	const index_def_t*	index_def,
+	const dict_add_v_col_t*	add_v)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************************//**
+Check if a transaction can use an index.
+@return whether the index can be used by the transaction */
+bool
+row_merge_is_index_usable(
+/*======================*/
+	const trx_t*		trx,	/*!< in: transaction */
+	const dict_index_t*	index)	/*!< in: index to check */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Map from column numbers to column definitions that include
+changes to the collation, when the encoding is compatible with
+the original column and no table rebuild is needed */
+typedef std::map<unsigned, CHARSET_INFO*> col_collations;
+
+/** Build indexes on a table by reading a clustered index, creating a temporary
+file containing index entries, merge sorting these index entries and inserting
+sorted index entries to indexes.
+@param[in]	trx		transaction
+@param[in]	old_table	table where rows are read from
+@param[in]	new_table	table where indexes are created; identical to
+old_table unless creating a PRIMARY KEY
+@param[in]	online		true if creating indexes online
+@param[in]	indexes		indexes to be created
+@param[in]	key_numbers	MySQL key numbers
+@param[in]	n_indexes	size of indexes[]
+@param[in,out]	table		MySQL table, for reporting erroneous key value
+if applicable
+@param[in]	defaults	default values of added, changed columns, or NULL
+@param[in]	col_map		mapping of old column numbers to new ones, or
+NULL if old_table == new_table
+@param[in]	add_autoinc	number of added AUTO_INCREMENT columns, or
+ULINT_UNDEFINED if none is added
+@param[in,out]	sequence	autoinc sequence
+@param[in]	skip_pk_sort	whether the new PRIMARY KEY will follow
+existing order
+@param[in,out]	stage		performance schema accounting object, used by
+ALTER TABLE. stage->begin_phase_read_pk() will be called at the beginning of
+this function and it will be passed to other functions for further accounting.
+@param[in]	add_v		new virtual columns added along with indexes
+@param[in]	eval_table	mysql table used to evaluate virtual column
+				value, see innobase_get_computed_value().
+@param[in]	allow_non_null	allow the conversion from null to not-null
+@param[in]	col_collate	columns whose collations changed, or nullptr
+@return DB_SUCCESS or error code */
+dberr_t
+row_merge_build_indexes(
+	trx_t*			trx,
+	dict_table_t*		old_table,
+	dict_table_t*		new_table,
+	bool			online,
+	dict_index_t**		indexes,
+	const ulint*		key_numbers,
+	ulint			n_indexes,
+	struct TABLE*		table,
+	const dtuple_t*		defaults,
+	const ulint*		col_map,
+	ulint			add_autoinc,
+	ib_sequence_t&		sequence,
+	bool			skip_pk_sort,
+	ut_stage_alter_t*	stage,
+	const dict_add_v_col_t*	add_v,
+	struct TABLE*		eval_table,
+	bool			allow_non_null,
+	const col_collations*	col_collate)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Write a buffer to a block.
+@param buf		sorted buffer
+@param block		buffer for writing to file
+@param blob_file	blob file handle for doing bulk insert operation */
+dberr_t row_merge_buf_write(const row_merge_buf_t *buf,
+#ifndef DBUG_OFF
+                            const merge_file_t *of, /*!< output file */
+#endif
+                            row_merge_block_t *block,
+                            merge_file_t *blob_file= nullptr);
+
+/********************************************************************//**
+Sort a buffer. */
+void
+row_merge_buf_sort(
+/*===============*/
+	row_merge_buf_t*	buf,	/*!< in/out: sort buffer */
+	row_merge_dup_t*	dup)	/*!< in/out: reporter of duplicates
+					(NULL if non-unique index) */
+	MY_ATTRIBUTE((nonnull(1)));
+
+/********************************************************************//**
+Write a merge block to the file system.
+@return whether the request was completed successfully
+@retval false on error
+@retval true on success */
+bool
+row_merge_write(
+	const pfs_os_file_t&	fd,	/*!< in: file descriptor */
+	ulint			offset,	/*!< in: offset where to write,
+					in number of row_merge_block_t
+					elements */
+	const void*		buf,	/*!< in: data */
+	void*			crypt_buf,	/*!< in: crypt buf or NULL */
+	ulint			space)		/*!< in: space id */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/********************************************************************//**
+Empty a sort buffer.
+@return sort buffer */
+row_merge_buf_t*
+row_merge_buf_empty(
+/*================*/
+	row_merge_buf_t*	buf)	/*!< in,own: sort buffer */
+	MY_ATTRIBUTE((warn_unused_result, nonnull));
+
+/** Create a merge file in the given location.
+@param[out]	merge_file	merge file structure
+@param[in]	path		location for creating temporary file, or NULL
+@return file descriptor, or -1 on failure */
+pfs_os_file_t
+row_merge_file_create(
+	merge_file_t*	merge_file,
+	const char*	path)
+	MY_ATTRIBUTE((warn_unused_result, nonnull(1)));
+
+/** Merge disk files.
+@param[in]	trx		transaction
+@param[in]	dup		descriptor of index being created
+@param[in,out]	file		file containing index entries
+@param[in,out]	block		3 buffers
+@param[in,out]	tmpfd		temporary file handle
+@param[in]	update_progress	true, if we should update progress status
+@param[in]	pct_progress	total progress percent until now
+@param[in]	pct_cost	current progress percent
+@param[in]	crypt_block	crypt buf or NULL
+@param[in]	space		space_id
+@param[in,out]	stage		performance schema accounting object, used by
+ALTER TABLE. If not NULL, stage->begin_phase_sort() will be called initially
+and then stage->inc() will be called for each record processed.
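+A sketch of a typical invocation after the index entries have been
+written to the merge file (hypothetical progress figures; encryption
+and stage accounting omitted):
+@code
+  error = row_merge_sort(trx, &dup, &file, block, &tmpfd,
+                         true, 0.0, PCT_COST_MERGESORT_INDEX,
+                         NULL, space);
+@endcode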
+@return DB_SUCCESS or error code */
+dberr_t
+row_merge_sort(
+/*===========*/
+	trx_t*			trx,
+	const row_merge_dup_t*	dup,
+	merge_file_t*		file,
+	row_merge_block_t*	block,
+	pfs_os_file_t*		tmpfd,
+	const bool		update_progress,
+	const double		pct_progress,
+	const double		pct_cost,
+	row_merge_block_t*	crypt_block,
+	ulint			space,
+	ut_stage_alter_t*	stage = NULL)
+	MY_ATTRIBUTE((warn_unused_result));
+
+/*********************************************************************//**
+Allocate a sort buffer.
+@return own: sort buffer */
+row_merge_buf_t*
+row_merge_buf_create(
+/*=================*/
+	dict_index_t*	index)	/*!< in: secondary index */
+	MY_ATTRIBUTE((warn_unused_result, nonnull, malloc));
+
+/*********************************************************************//**
+Deallocate a sort buffer. */
+void
+row_merge_buf_free(
+/*===============*/
+	row_merge_buf_t*	buf)	/*!< in,own: sort buffer to be freed */
+	MY_ATTRIBUTE((nonnull));
+
+/*********************************************************************//**
+Destroy a merge file. */
+void
+row_merge_file_destroy(
+/*===================*/
+	merge_file_t*	merge_file)	/*!< in/out: merge file structure */
+	MY_ATTRIBUTE((nonnull));
+
+/** Read a merge block from the file system.
+@return whether the request was completed successfully */
+bool
+row_merge_read(
+/*===========*/
+	const pfs_os_file_t&	fd,	/*!< in: file descriptor */
+	ulint			offset,	/*!< in: offset where to read
+					in number of row_merge_block_t
+					elements */
+	row_merge_block_t*	buf,	/*!< out: data */
+	row_merge_block_t*	crypt_buf,	/*!< in: crypt buf or NULL */
+	ulint			space)		/*!< in: space id */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/********************************************************************//**
+Read a merge record.
+@return pointer to next record, or NULL on I/O error or end of list */
+const byte*
+row_merge_read_rec(
+/*===============*/
+	row_merge_block_t*	block,	/*!< in/out: file buffer */
+	mrec_buf_t*		buf,	/*!< in/out: secondary buffer */
+	const byte*		b,	/*!< in: pointer to record */
+	const dict_index_t*	index,	/*!< in: index of the record */
+	const pfs_os_file_t&	fd,	/*!< in: file descriptor */
+	ulint*			foffs,	/*!< in/out: file offset */
+	const mrec_t**		mrec,	/*!< out: pointer to merge record,
+					or NULL on end of list
+					(non-NULL on I/O error) */
+	rec_offs*		offsets,/*!< out: offsets of mrec */
+	row_merge_block_t*	crypt_block,	/*!< in: crypt buf or NULL */
+	ulint			space)		/*!< in: space id */
+	MY_ATTRIBUTE((warn_unused_result));
+
+/** Buffer for bulk insert */
+class row_merge_bulk_t
+{
+  /** Buffer for each index in the table: main memory
+  buffer for sorting the index */
+  row_merge_buf_t *m_merge_buf;
+  /** Block for IO operation */
+  row_merge_block_t *m_block= nullptr;
+  /** File to store the buffer and used for merge sort */
+  merge_file_t *m_merge_files= nullptr;
+  /** Temporary file to be used for merge sort */
+  pfs_os_file_t m_tmpfd;
+  /** Allocate memory for merge file data structure */
+  ut_allocator<row_merge_block_t> m_alloc;
+  /** Storage for description for the m_alloc */
+  ut_new_pfx_t m_block_pfx;
+  /** Temporary file to store the blob */
+  merge_file_t m_blob_file;
+  /** Storage for description for the crypt_block */
+  ut_new_pfx_t m_crypt_pfx;
+  /** Block for encryption */
+  row_merge_block_t *m_crypt_block= nullptr;
+public:
+  /** Constructor.
+  Create all merge files and merge buffers for all table indexes,
+  except fts indexes.
+  Create a merge block which is used for write I/O operations.
+  @param table  table which undergoes the bulk insert operation */
+  row_merge_bulk_t(dict_table_t *table);
+
+  /** Destructor.
+  Remove all merge files and merge buffers for all table indexes. */
+  ~row_merge_bulk_t();
+
+  /** Remove all buffers for the table indexes */
+  void remove_all_bulk_buffer();
+
+  /** Clean the merge buffer for the given index number */
+  void clean_bulk_buffer(ulint index_no);
+
+  /** Create the temporary file for the given index number
+  @retval true if temporary file creation went well */
+  bool create_tmp_file(ulint index_no);
+
+  /** Write the merge buffer to the tmp file for the given
+  index number.
+  @param index_no  buffer to be written for the index */
+  dberr_t write_to_tmp_file(ulint index_no);
+
+  /** Add the tuple to the merge buffer for the given index.
+  If the buffer runs out of memory, write the buffer into
+  the temporary file and insert the tuple again.
+  @param row  tuple to be inserted
+  @param ind  index to be buffered
+  @param trx  bulk transaction */
+  dberr_t bulk_insert_buffered(const dtuple_t &row, const dict_index_t &ind,
+                               trx_t *trx);
+
+  /** Do the bulk insert operation into the index tree from
+  the buffer, or from the merge file if one exists
+  @param index_no  index to be inserted
+  @param trx       bulk transaction */
+  dberr_t write_to_index(ulint index_no, trx_t *trx);
+
+  /** Do the bulk insert of the buffered entries for the table.
+  @param table  table which undergoes the bulk insert operation
+  @param trx    bulk transaction */
+  dberr_t write_to_table(dict_table_t *table, trx_t *trx);
+
+  /** Allocate a block for writing the buffer to disk */
+  dberr_t alloc_block();
+
+  /** Init temporary files for each index */
+  void init_tmp_file();
+};
diff --git a/storage/innobase/include/row0mysql.h b/storage/innobase/include/row0mysql.h
new file mode 100644
index 00000000..878d9c9f
--- /dev/null
+++ b/storage/innobase/include/row0mysql.h
@@ -0,0 +1,841 @@
+/*****************************************************************************
+
+Copyright (c) 2000, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0mysql.h
+Interface between Innobase row operations and MySQL.
+Also contains create table and other data dictionary operations.
+ +Created 9/17/2000 Heikki Tuuri +*******************************************************/ + +#ifndef row0mysql_h +#define row0mysql_h + +#include "que0types.h" +#include "trx0types.h" +#include "row0types.h" +#include "btr0types.h" +#include "lock0types.h" +#include "fil0fil.h" +#include "fts0fts.h" +#include "gis0type.h" + +struct row_prebuilt_t; +class ha_innobase; +class ha_handler_stats; + +/*******************************************************************//** +Frees the blob heap in prebuilt when no longer needed. */ +void +row_mysql_prebuilt_free_blob_heap( +/*==============================*/ + row_prebuilt_t* prebuilt); /*!< in: prebuilt struct of a + ha_innobase:: table handle */ +/*******************************************************************//** +Stores a >= 5.0.3 format true VARCHAR length to dest, in the MySQL row +format. +@return pointer to the data, we skip the 1 or 2 bytes at the start +that are used to store the len */ +byte* +row_mysql_store_true_var_len( +/*=========================*/ + byte* dest, /*!< in: where to store */ + ulint len, /*!< in: length, must fit in two bytes */ + ulint lenlen);/*!< in: storage length of len: either 1 or 2 bytes */ +/*******************************************************************//** +Reads a >= 5.0.3 format true VARCHAR length, in the MySQL row format, and +returns a pointer to the data. +@return pointer to the data, we skip the 1 or 2 bytes at the start +that are used to store the len */ +const byte* +row_mysql_read_true_varchar( +/*========================*/ + ulint* len, /*!< out: variable-length field length */ + const byte* field, /*!< in: field in the MySQL format */ + ulint lenlen);/*!< in: storage length of len: either 1 + or 2 bytes */ +/*******************************************************************//** +Stores a reference to a BLOB in the MySQL format. */ +void +row_mysql_store_blob_ref( +/*=====================*/ + byte* dest, /*!< in: where to store */ + ulint col_len,/*!< in: dest buffer size: determines into + how many bytes the BLOB length is stored, + the space for the length may vary from 1 + to 4 bytes */ + const void* data, /*!< in: BLOB data; if the value to store + is SQL NULL this should be NULL pointer */ + ulint len); /*!< in: BLOB length; if the value to store + is SQL NULL this should be 0; remember + also to set the NULL bit in the MySQL record + header! */ +/*******************************************************************//** +Reads a reference to a BLOB in the MySQL format. +@return pointer to BLOB data */ +const byte* +row_mysql_read_blob_ref( +/*====================*/ + ulint* len, /*!< out: BLOB length */ + const byte* ref, /*!< in: BLOB reference in the + MySQL format */ + ulint col_len); /*!< in: BLOB reference length + (not BLOB length) */ +/*******************************************************************//** +Converts InnoDB geometry data format to MySQL data format. */ +void +row_mysql_store_geometry( +/*=====================*/ + byte* dest, /*!< in/out: where to store */ + ulint dest_len, /*!< in: dest buffer size: determines into + how many bytes the geometry length is stored, + the space for the length may vary from 1 + to 4 bytes */ + const byte* src, /*!< in: geometry data; if the value to store + is SQL NULL this should be NULL pointer */ + ulint src_len); /*!< in: geometry length; if the value to store + is SQL NULL this should be 0; remember + also to set the NULL bit in the MySQL record + header! 
*/ +/**************************************************************//** +Pad a column with spaces. */ +void +row_mysql_pad_col( +/*==============*/ + ulint mbminlen, /*!< in: minimum size of a character, + in bytes */ + byte* pad, /*!< out: padded buffer */ + ulint len); /*!< in: number of bytes to pad */ + +/**************************************************************//** +Stores a non-SQL-NULL field given in the MySQL format in the InnoDB format. +The counterpart of this function is row_sel_field_store_in_mysql_format() in +row0sel.cc. +@return up to which byte we used buf in the conversion */ +byte* +row_mysql_store_col_in_innobase_format( +/*===================================*/ + dfield_t* dfield, /*!< in/out: dfield where dtype + information must be already set when + this function is called! */ + byte* buf, /*!< in/out: buffer for a converted + integer value; this must be at least + col_len long then! NOTE that dfield + may also get a pointer to 'buf', + therefore do not discard this as long + as dfield is used! */ + ibool row_format_col, /*!< TRUE if the mysql_data is from + a MySQL row, FALSE if from a MySQL + key value; + in MySQL, a true VARCHAR storage + format differs in a row and in a + key value: in a key value the length + is always stored in 2 bytes! */ + const byte* mysql_data, /*!< in: MySQL column value, not + SQL NULL; NOTE that dfield may also + get a pointer to mysql_data, + therefore do not discard this as long + as dfield is used! */ + ulint col_len, /*!< in: MySQL column length; NOTE that + this is the storage length of the + column in the MySQL format row, not + necessarily the length of the actual + payload data; if the column is a true + VARCHAR then this is irrelevant */ + ulint comp); /*!< in: nonzero=compact format */ +/****************************************************************//** +Handles user errors and lock waits detected by the database engine. +@return true if it was a lock wait and we should continue running the +query thread */ +bool +row_mysql_handle_errors( +/*====================*/ + dberr_t* new_err,/*!< out: possible new error encountered in + rollback, or the old error which was + during the function entry */ + trx_t* trx, /*!< in: transaction */ + que_thr_t* thr, /*!< in: query thread, or NULL */ + trx_savept_t* savept) /*!< in: savepoint, or NULL */ + MY_ATTRIBUTE((nonnull(1,2))); +/********************************************************************//** +Create a prebuilt struct for a MySQL table handle. +@return own: a prebuilt struct */ +row_prebuilt_t* +row_create_prebuilt( +/*================*/ + dict_table_t* table, /*!< in: Innobase table handle */ + ulint mysql_row_len); /*!< in: length in bytes of a row in + the MySQL format */ +/** Free a prebuilt struct for a TABLE handle. */ +void row_prebuilt_free(row_prebuilt_t *prebuilt); +/*********************************************************************//** +Updates the transaction pointers in query graphs stored in the prebuilt +struct. */ +void +row_update_prebuilt_trx( +/*====================*/ + row_prebuilt_t* prebuilt, /*!< in/out: prebuilt struct + in MySQL handle */ + trx_t* trx); /*!< in: transaction handle */ + +/*********************************************************************//** +Sets an AUTO_INC type lock on the table mentioned in prebuilt. The +AUTO_INC lock gives exclusive access to the auto-inc counter of the +table. The lock is reserved only for the duration of an SQL statement. +It is not compatible with another AUTO_INC or exclusive lock on the +table. 
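+A sketch of the usual pairing with row_insert_for_mysql(), which is
+declared below (simplified; error handling and retry omitted):
+@code
+  dberr_t err = row_lock_table_autoinc_for_mysql(prebuilt);
+  if (err == DB_SUCCESS) {
+    err = row_insert_for_mysql(mysql_rec, prebuilt, ROW_INS_NORMAL);
+  }
+@endcode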
+@return error code or DB_SUCCESS */ +dberr_t +row_lock_table_autoinc_for_mysql( +/*=============================*/ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in the MySQL + table handle */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Lock a table. +@param[in,out] prebuilt table handle +@return error code or DB_SUCCESS */ +dberr_t +row_lock_table(row_prebuilt_t* prebuilt); + +/** System Versioning: row_insert_for_mysql() modes */ +enum ins_mode_t { + /* plain row (without versioning) */ + ROW_INS_NORMAL = 0, + /* row_start = TRX_ID, row_end = MAX */ + ROW_INS_VERSIONED, + /* row_end = TRX_ID */ + ROW_INS_HISTORICAL +}; + +/** Does an insert for MySQL. +@param[in] mysql_rec row in the MySQL format +@param[in,out] prebuilt prebuilt struct in MySQL handle +@param[in] ins_mode what row type we're inserting +@return error code or DB_SUCCESS*/ +dberr_t +row_insert_for_mysql( + const byte* mysql_rec, + row_prebuilt_t* prebuilt, + ins_mode_t ins_mode) + MY_ATTRIBUTE((warn_unused_result)); + +/*********************************************************************//** +Builds a dummy query graph used in selects. */ +void +row_prebuild_sel_graph( +/*===================*/ + row_prebuilt_t* prebuilt); /*!< in: prebuilt struct in MySQL + handle */ +/*********************************************************************//** +Gets pointer to a prebuilt update vector used in updates. If the update +graph has not yet been built in the prebuilt struct, then this function +first builds it. +@return prebuilt update vector */ +upd_t* +row_get_prebuilt_update_vector( +/*===========================*/ + row_prebuilt_t* prebuilt); /*!< in: prebuilt struct in MySQL + handle */ +/** Does an update or delete of a row for MySQL. +@param[in,out] prebuilt prebuilt struct in MySQL handle +@return error code or DB_SUCCESS */ +dberr_t +row_update_for_mysql( + row_prebuilt_t* prebuilt) + MY_ATTRIBUTE((warn_unused_result)); + +/** This can only be used when the current transaction is at +READ COMMITTED or READ UNCOMMITTED isolation level. +Before calling this function row_search_mvcc() must have +initialized prebuilt->new_rec_locks to store the information which new +record locks really were set. This function removes a newly set +clustered index record lock under prebuilt->pcur or +prebuilt->clust_pcur. Thus, this implements a 'mini-rollback' that +releases the latest clustered index record lock we set. +@param[in,out] prebuilt prebuilt struct in MySQL handle +@param[in] has_latches_on_recs TRUE if called so that we have the + latches on the records under pcur + and clust_pcur, and we do not need + to reposition the cursors. */ +void +row_unlock_for_mysql( + row_prebuilt_t* prebuilt, + ibool has_latches_on_recs); + +/*********************************************************************//** +Creates an query graph node of 'update' type to be used in the MySQL +interface. +@return own: update node */ +upd_node_t* +row_create_update_node_for_mysql( +/*=============================*/ + dict_table_t* table, /*!< in: table to update */ + mem_heap_t* heap); /*!< in: mem heap from which allocated */ + +/**********************************************************************//** +Does a cascaded delete or set null in a foreign key operation. 
+@return error code or DB_SUCCESS */ +dberr_t +row_update_cascade_for_mysql( +/*=========================*/ + que_thr_t* thr, /*!< in: query thread */ + upd_node_t* node, /*!< in: update node used in the cascade + or set null operation */ + dict_table_t* table) /*!< in: table where we do the operation */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Lock the data dictionary cache exclusively. */ +#define row_mysql_lock_data_dictionary(trx) \ + do { \ + ut_ad(!trx->dict_operation_lock_mode); \ + dict_sys.lock(SRW_LOCK_CALL); \ + trx->dict_operation_lock_mode = true; \ + } while (0) + +/** Unlock the data dictionary. */ +#define row_mysql_unlock_data_dictionary(trx) \ + do { \ + ut_ad(!lock_trx_has_sys_table_locks(trx)); \ + ut_ad(trx->dict_operation_lock_mode); \ + trx->dict_operation_lock_mode = false; \ + dict_sys.unlock(); \ + } while (0) + +/*********************************************************************//** +Creates a table for MySQL. On failure the transaction will be rolled back +and the 'table' object will be freed. +@return error code or DB_SUCCESS */ +dberr_t +row_create_table_for_mysql( +/*=======================*/ + dict_table_t* table, /*!< in, own: table definition + (will be freed, or on DB_SUCCESS + added to the data dictionary cache) */ + trx_t* trx) /*!< in/out: transaction */ + MY_ATTRIBUTE((warn_unused_result)); + +/*********************************************************************//** +Create an index when creating a table. +On failure, the caller must drop the table! +@return error number or DB_SUCCESS */ +dberr_t +row_create_index_for_mysql( +/*=======================*/ + dict_index_t* index, /*!< in, own: index definition + (will be freed) */ + trx_t* trx, /*!< in: transaction handle */ + const ulint* field_lengths, /*!< in: if not NULL, must contain + dict_index_get_n_fields(index) + actual field lengths for the + index columns, which are + then checked for not being too + large. */ + fil_encryption_t mode, /*!< in: encryption mode */ + uint32_t key_id) /*!< in: encryption key_id */ + MY_ATTRIBUTE((warn_unused_result)); + +/*********************************************************************//** +Discards the tablespace of a table which stored in an .ibd file. Discarding +means that this function deletes the .ibd file and assigns a new table id for +the table. Also the file_unreadable flag is set. +@return error code or DB_SUCCESS */ +dberr_t row_discard_tablespace_for_mysql(dict_table_t *table, trx_t *trx) + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*****************************************************************//** +Imports a tablespace. The space id in the .ibd file must match the space id +of the table in the data dictionary. +@return error code or DB_SUCCESS */ +dberr_t +row_import_tablespace_for_mysql( +/*============================*/ + dict_table_t* table, /*!< in/out: table */ + row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/*********************************************************************//** +Renames a table for MySQL. 
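+A caller is expected to bracket this kind of DDL with the dictionary
+latch macros defined above (sketch, simplified):
+@code
+  row_mysql_lock_data_dictionary(trx);
+  err = row_rename_table_for_mysql(old_name, new_name, trx, false);
+  row_mysql_unlock_data_dictionary(trx);
+@endcode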
+@return error code or DB_SUCCESS */ +dberr_t +row_rename_table_for_mysql( +/*=======================*/ + const char* old_name, /*!< in: old table name */ + const char* new_name, /*!< in: new table name */ + trx_t* trx, /*!< in/out: transaction */ + bool use_fk) /*!< in: whether to parse and enforce + FOREIGN KEY constraints */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/* A struct describing a place for an individual column in the MySQL +row format which is presented to the table handler in ha_innobase. +This template struct is used to speed up row transformations between +Innobase and MySQL. */ + +struct mysql_row_templ_t { + ulint col_no; /*!< column number of the column */ + ulint rec_field_no; /*!< field number of the column in an + Innobase record in the current index; + not defined if template_type is + ROW_MYSQL_WHOLE_ROW */ + ibool rec_field_is_prefix; /* is this field in a prefix index? */ + ulint rec_prefix_field_no; /* record field, even if just a + prefix; same as rec_field_no when not a + prefix, otherwise rec_field_no is + ULINT_UNDEFINED but this is the true + field number*/ + ulint clust_rec_field_no; /*!< field number of the column in an + Innobase record in the clustered index; + not defined if template_type is + ROW_MYSQL_WHOLE_ROW */ + ulint icp_rec_field_no; /*!< field number of the column in an + Innobase record in the current index; + not defined unless + index condition pushdown is used */ + ulint mysql_col_offset; /*!< offset of the column in the MySQL + row format */ + ulint mysql_col_len; /*!< length of the column in the MySQL + row format */ + ulint mysql_null_byte_offset; /*!< MySQL NULL bit byte offset in a + MySQL record */ + ulint mysql_null_bit_mask; /*!< bit mask to get the NULL bit, + zero if column cannot be NULL */ + ulint type; /*!< column type in Innobase mtype + numbers DATA_CHAR... */ + ulint mysql_type; /*!< MySQL type code; this is always + < 256 */ + ulint mysql_length_bytes; /*!< if mysql_type + == DATA_MYSQL_TRUE_VARCHAR, this tells + whether we should use 1 or 2 bytes to + store the MySQL true VARCHAR data + length at the start of row in the MySQL + format (NOTE that the MySQL key value + format always uses 2 bytes for the data + len) */ + ulint charset; /*!< MySQL charset-collation code + of the column, or zero */ + ulint mbminlen; /*!< minimum length of a char, in bytes, + or zero if not a char type */ + ulint mbmaxlen; /*!< maximum length of a char, in bytes, + or zero if not a char type */ + ulint is_unsigned; /*!< if a column type is an integer + type and this field is != 0, then + it is an unsigned integer type */ + ulint is_virtual; /*!< if a column is a virtual column */ +}; + +#define MYSQL_FETCH_CACHE_SIZE 8 +/* After fetching this many rows, we start caching them in fetch_cache */ +#define MYSQL_FETCH_CACHE_THRESHOLD 4 + +#define ROW_PREBUILT_ALLOCATED 78540783 +#define ROW_PREBUILT_FREED 26423527 + +/** A struct for (sometimes lazily) prebuilt structures in an Innobase table +handle used within MySQL; these are used to save CPU time. 
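+The lifecycle as seen from the handler (sketch; see
+row_create_prebuilt(), row_update_prebuilt_trx() and
+row_prebuilt_free() declared above):
+@code
+  row_prebuilt_t* prebuilt = row_create_prebuilt(table, mysql_row_len);
+  row_update_prebuilt_trx(prebuilt, trx);   // bind the transaction
+  // ... row_insert_for_mysql(), row_update_for_mysql(), ...
+  row_prebuilt_free(prebuilt);
+@endcode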
*/ + +struct row_prebuilt_t { + ulint magic_n; /*!< this magic number is set to + ROW_PREBUILT_ALLOCATED when created, + or ROW_PREBUILT_FREED when the + struct has been freed */ + dict_table_t* table; /*!< Innobase table handle */ + dict_index_t* index; /*!< current index for a search, if + any */ + trx_t* trx; /*!< current transaction handle */ + unsigned sql_stat_start:1;/*!< TRUE when we start processing of + an SQL statement: we may have to set + an intention lock on the table, + create a consistent read view etc. */ + unsigned clust_index_was_generated:1; + /*!< if the user did not define a + primary key in MySQL, then Innobase + automatically generated a clustered + index where the ordering column is + the row id: in this case this flag + is set to TRUE */ + unsigned index_usable:1; /*!< caches the value of + row_merge_is_index_usable(trx,index) */ + unsigned read_just_key:1;/*!< set to 1 when MySQL calls + ha_innobase::extra with the + argument HA_EXTRA_KEYREAD; it is enough + to read just columns defined in + the index (i.e., no read of the + clustered index record necessary) */ + unsigned used_in_HANDLER:1;/*!< TRUE if we have been using this + handle in a MySQL HANDLER low level + index cursor command: then we must + store the pcur position even in a + unique search from a clustered index, + because HANDLER allows NEXT and PREV + in such a situation */ + unsigned template_type:2;/*!< ROW_MYSQL_WHOLE_ROW, + ROW_MYSQL_REC_FIELDS, + ROW_MYSQL_DUMMY_TEMPLATE, or + ROW_MYSQL_NO_TEMPLATE */ + unsigned n_template:10; /*!< number of elements in the + template */ + unsigned null_bitmap_len:10;/*!< number of bytes in the SQL NULL + bitmap at the start of a row in the + MySQL format */ + unsigned need_to_access_clustered:1; /*!< if we are fetching + columns through a secondary index + and at least one column is not in + the secondary index, then this is + set to TRUE; note that sometimes this + is set but we later optimize out the + clustered index lookup */ + unsigned templ_contains_blob:1;/*!< TRUE if the template contains + a column with DATA_LARGE_MTYPE( + get_innobase_type_from_mysql_type()) + is TRUE; + not to be confused with InnoDB + externally stored columns + (VARCHAR can be off-page too) */ + unsigned versioned_write:1;/*!< whether this is + a versioned write */ + mysql_row_templ_t* mysql_template;/*!< template used to transform + rows fast between MySQL and Innobase + formats; memory for this template + is not allocated from 'heap' */ + mem_heap_t* heap; /*!< memory heap from which + these auxiliary structures are + allocated when needed */ + ins_node_t* ins_node; /*!< Innobase SQL insert node + used to perform inserts + to the table */ + byte* ins_upd_rec_buff;/*!< buffer for storing data converted + to the Innobase format from the MySQL + format */ + const byte* default_rec; /*!< the default values of all columns + (a "default row") in MySQL format */ + ulint hint_need_to_fetch_extra_cols; + /*!< normally this is set to 0; if this + is set to ROW_RETRIEVE_PRIMARY_KEY, + then we should at least retrieve all + columns in the primary key; if this + is set to ROW_RETRIEVE_ALL_COLS, then + we must retrieve all columns in the + key (if read_just_key == 1), or all + columns in the table */ + upd_node_t* upd_node; /*!< Innobase SQL update node used + to perform updates and deletes */ + trx_id_t trx_id; /*!< The table->def_trx_id when + ins_graph was built */ + que_fork_t* ins_graph; /*!< Innobase SQL query graph used + in inserts. Will be rebuilt on + trx_id or n_indexes mismatch. 
*/ + que_fork_t* upd_graph; /*!< Innobase SQL query graph used + in updates or deletes */ + btr_pcur_t* pcur; /*!< persistent cursor used in selects + and updates */ + btr_pcur_t* clust_pcur; /*!< persistent cursor used in + some selects and updates */ + que_fork_t* sel_graph; /*!< dummy query graph used in + selects */ + dtuple_t* search_tuple; /*!< prebuilt dtuple used in selects */ + byte row_id[DATA_ROW_ID_LEN]; + /*!< if the clustered index was + generated, the row id of the + last row fetched is stored + here */ + doc_id_t fts_doc_id; /* if the table has an FTS index on + it then we fetch the doc_id. + FTS-FIXME: Currently we fetch it always + but in the future we must only fetch + it when FTS columns are being + updated */ + dtuple_t* clust_ref; /*!< prebuilt dtuple used in + sel/upd/del */ + lock_mode select_lock_type;/*!< LOCK_NONE, LOCK_S, or LOCK_X */ + bool skip_locked; /*!< TL_{READ,WRITE}_SKIP_LOCKED */ + lock_mode stored_select_lock_type;/*!< this field is used to + remember the original select_lock_type + that was decided in ha_innodb.cc, + ::store_lock(), ::external_lock(), + etc. */ + ulint row_read_type; /*!< ROW_READ_WITH_LOCKS if row locks + should be the obtained for records + under an UPDATE or DELETE cursor. + At READ UNCOMMITTED or + READ COMMITTED isolation level, + this can be set to + ROW_READ_TRY_SEMI_CONSISTENT, so that + if the row under an UPDATE or DELETE + cursor was locked by another + transaction, InnoDB will resort + to reading the last committed value + ('semi-consistent read'). Then, + this field will be set to + ROW_READ_DID_SEMI_CONSISTENT to + indicate that. If the row does not + match the WHERE condition, MySQL will + invoke handler::unlock_row() to + clear the flag back to + ROW_READ_TRY_SEMI_CONSISTENT and + to simply skip the row. If + the row matches, the next call to + row_search_mvcc() will lock + the row. + This eliminates lock waits in some + cases; note that this breaks + serializability. 
 */
+	ulint		new_rec_locks;	/*!< normally 0; if
+					the session is using READ
+					COMMITTED or READ UNCOMMITTED
+					isolation level, set in
+					row_search_mvcc() if we set a new
+					record lock on the secondary
+					or clustered index; this is
+					used in row_unlock_for_mysql()
+					when releasing the lock under
+					the cursor if we determine
+					after retrieving the row that
+					it does not need to be locked
+					('mini-rollback') */
+	ulint		mysql_prefix_len;/*!< byte offset of the end of
+					the last requested column */
+	ulint		mysql_row_len;	/*!< length in bytes of a row in the
+					MySQL format */
+	ulint		n_rows_fetched;	/*!< number of rows fetched after
+					positioning the current cursor */
+	ulint		fetch_direction;/*!< ROW_SEL_NEXT or ROW_SEL_PREV */
+	byte*		fetch_cache[MYSQL_FETCH_CACHE_SIZE];
+					/*!< a cache for fetched rows if we
+					fetch many rows from the same cursor:
+					it saves CPU time to fetch them in a
+					batch; we reserve mysql_row_len
+					bytes for each such row; these
+					pointers point 4 bytes past the
+					allocated mem buf start, because
+					there is a 4 byte magic number at the
+					start and at the end */
+	bool		keep_other_fields_on_keyread; /*!< when using fetch
+					cache with HA_EXTRA_KEYREAD, don't
+					overwrite other fields in the MySQL
+					row buffer.*/
+	ulint		fetch_cache_first;/*!< position of the first not yet
+					fetched row in fetch_cache */
+	ulint		n_fetch_cached;	/*!< number of not yet fetched rows
+					in fetch_cache */
+	mem_heap_t*	blob_heap;	/*!< in SELECTs, BLOB fields are copied
+					to this heap */
+	mem_heap_t*	old_vers_heap;	/*!< memory heap where a previous
+					version is built in consistent read */
+	bool		in_fts_query;	/*!< Whether we are in a FTS query */
+	bool		fts_doc_id_in_read_set; /*!< true if table has externally
+					defined FTS_DOC_ID column. */
+	/*----------------------*/
+	ulonglong	autoinc_last_value;
+					/*!< last value of AUTO-INC interval */
+	ulonglong	autoinc_increment;/*!< The increment step of the auto
+					increment column. Value must be
+					greater than or equal to 1. Required to
+					calculate the next value */
+	ulonglong	autoinc_offset;	/*!< The offset passed to
+					get_auto_increment() by MySQL. Required
+					to calculate the next value */
+	dberr_t		autoinc_error;	/*!< The actual error code encountered
+					while trying to init or read the
+					autoinc value from the table. We
+					store it here so that we can return
+					it to MySQL */
+	/*----------------------*/
+
+	/** Argument of handler_rowid_filter_check(),
+	or NULL if no PRIMARY KEY filter is pushed */
+	ha_innobase*	pk_filter;
+
+	/** Argument to handler_index_cond_check(),
+	or NULL if no index condition pushdown (ICP) is used. */
+	ha_innobase*	idx_cond;
+	ulint		idx_cond_n_cols;/*!< Number of fields in idx_cond_cols.
+					0 if and only if idx_cond == NULL.
*/ + /*----------------------*/ + + /*----------------------*/ + rtr_info_t* rtr_info; /*!< R-tree Search Info */ + /*----------------------*/ + + ulint magic_n2; /*!< this should be the same as + magic_n */ + + byte* srch_key_val1; /*!< buffer used in converting + search key values from MySQL format + to InnoDB format.*/ + byte* srch_key_val2; /*!< buffer used in converting + search key values from MySQL format + to InnoDB format.*/ + uint srch_key_val_len; /*!< Size of search key */ + /** The MySQL table object */ + TABLE* m_mysql_table; + + /** Get template by dict_table_t::cols[] number */ + const mysql_row_templ_t* get_template_by_col(ulint col) const + { + ut_ad(col < n_template); + ut_ad(mysql_template); + for (ulint i = col; i < n_template; ++i) { + const mysql_row_templ_t* templ = &mysql_template[i]; + if (!templ->is_virtual && templ->col_no == col) { + return templ; + } + } + return NULL; + } +}; + +/** Callback for row_mysql_sys_index_iterate() */ +struct SysIndexCallback { + virtual ~SysIndexCallback() = default; + + /** Callback method + @param mtr current mini transaction + @param pcur persistent cursor. */ + virtual void operator()(mtr_t* mtr, btr_pcur_t* pcur) throw() = 0; +}; + + +/** Storage for calculating virtual columns */ + +class String; +struct VCOL_STORAGE +{ + TABLE *maria_table; + byte *innobase_record; + byte *maria_record; + String *blob_value_storage; + VCOL_STORAGE(): maria_table(NULL), innobase_record(NULL), + maria_record(NULL), blob_value_storage(NULL) {} +}; + +/** + Allocate a heap and record for calculating virtual fields + Used mainly for virtual fields in indexes + +@param[in] thd MariaDB THD +@param[in] index Index in use +@param[out] heap Heap that holds temporary row +@param[in,out] mysql_table MariaDB table +@param[out] rec Pointer to allocated MariaDB record +@param[out] storage Internal storage for blobs etc + +@return FALSE ok +@return TRUE malloc failure +*/ + +bool innobase_allocate_row_for_vcol(THD *thd, + const dict_index_t* index, + mem_heap_t** heap, + TABLE** table, + VCOL_STORAGE* storage); + +/** Free memory allocated by innobase_allocate_row_for_vcol() */ +void innobase_free_row_for_vcol(VCOL_STORAGE *storage); + +class ib_vcol_row +{ + VCOL_STORAGE storage; +public: + mem_heap_t *heap; + + ib_vcol_row(mem_heap_t *heap) : heap(heap) {} + + byte *record(THD *thd, const dict_index_t *index, TABLE **table) + { + if (!storage.innobase_record && + !innobase_allocate_row_for_vcol(thd, index, &heap, table, &storage)) + return nullptr; + return storage.innobase_record; + } + + ~ib_vcol_row() + { + if (heap) + { + if (storage.innobase_record) + innobase_free_row_for_vcol(&storage); + mem_heap_free(heap); + } + } +}; + +/** Report virtual value computation failure in ib::error +@param[in] row the data row +*/ +ATTRIBUTE_COLD +void innobase_report_computed_value_failed(dtuple_t *row); + +/** Get the computed value by supplying the base column values. +@param[in,out] row the data row +@param[in] col virtual column +@param[in] index index on the virtual column +@param[in,out] local_heap heap memory for processing large data etc. +@param[in,out] heap memory heap that copies the actual index row +@param[in] ifield index field +@param[in] thd connection handle +@param[in,out] mysql_table MariaDB table handle +@param[in,out] mysql_rec MariaDB record buffer +@param[in] old_table during ALTER TABLE, this is the old table + or NULL. +@param[in] update update vector for the parent row +@param[in] ignore_warnings ignore warnings during calculation. 
Usually + means that a calculation is internal and + should have no side effects. +@return the field filled with computed value */ +dfield_t* +innobase_get_computed_value( + dtuple_t* row, + const dict_v_col_t* col, + const dict_index_t* index, + mem_heap_t** local_heap, + mem_heap_t* heap, + const dict_field_t* ifield, + THD* thd, + TABLE* mysql_table, + byte* mysql_rec, + const dict_table_t* old_table=NULL, + const upd_t* update=NULL, + bool ignore_warnings=false); + +/** Change dbname and table name in table->vc_templ. +@param[in,out] table the table whose virtual column template +dbname and tbname are to be renamed. */ +void +innobase_rename_vc_templ( + dict_table_t* table); + +#define ROW_PREBUILT_FETCH_MAGIC_N 465765687 + +#define ROW_MYSQL_WHOLE_ROW 0 +#define ROW_MYSQL_REC_FIELDS 1 +#define ROW_MYSQL_NO_TEMPLATE 2 +#define ROW_MYSQL_DUMMY_TEMPLATE 3 /* dummy template used in + row_check_index() */ + +/* Values for hint_need_to_fetch_extra_cols */ +#define ROW_RETRIEVE_PRIMARY_KEY 1 +#define ROW_RETRIEVE_ALL_COLS 2 + +/* Values for row_read_type */ +#define ROW_READ_WITH_LOCKS 0 +#define ROW_READ_TRY_SEMI_CONSISTENT 1 +#define ROW_READ_DID_SEMI_CONSISTENT 2 + +#endif /* row0mysql.h */ diff --git a/storage/innobase/include/row0purge.h b/storage/innobase/include/row0purge.h new file mode 100644 index 00000000..1daf4d4a --- /dev/null +++ b/storage/innobase/include/row0purge.h @@ -0,0 +1,149 @@ +/***************************************************************************** + +Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2019, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0purge.h +Purge obsolete records + +Created 3/14/1997 Heikki Tuuri +*******************************************************/ + +#pragma once + +#include "que0types.h" +#include "btr0types.h" +#include "btr0pcur.h" +#include "trx0types.h" +#include "row0types.h" +#include "row0mysql.h" +#include "mysqld.h" +#include <queue> +#include <unordered_map> + +class MDL_ticket; +/** Determines if it is possible to remove a secondary index entry. +Removal is possible if the secondary index entry does not refer to any +not delete marked version of a clustered index record where DB_TRX_ID +is newer than the purge view. + +NOTE: This function should only be called by the purge thread, only +while holding a latch on the leaf page of the secondary index entry +(or keeping the buffer pool watch on the page). It is possible that +this function first returns true and then false, if a user transaction +inserts a record that the secondary index entry would refer to. +However, in that case, the user transaction would also re-insert the +secondary index entry after purge has removed it and released the leaf +page latch.
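To make that contract concrete, a hedged sketch of a purge-thread caller follows; the helper name and surrounding logic are illustrative, not upstream code, and the leaf-page latch is assumed to be held via mtr:

    /* Sketch only: consult row_purge_poss_sec() while the secondary index
       leaf-page latch is held, and act on the answer within the same mtr. */
    static bool try_purge_sec_entry(purge_node_t* node, dict_index_t* index,
                                    const dtuple_t* entry,
                                    btr_pcur_t* pcur, mtr_t* mtr)
    {
        if (!row_purge_poss_sec(node, index, entry, pcur, mtr, false)) {
            /* A not-delete-marked version newer than the purge view
               still refers to this entry: leave it in place. */
            return false;
        }
        /* ...remove or delete-mark the record here, before the
           leaf-page latch is released... */
        return true;
    }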
+@param[in,out] node row purge node +@param[in] index secondary index +@param[in] entry secondary index entry +@param[in,out] sec_pcur secondary index cursor or NULL + if it is called for purge buffering + operation. +@param[in,out] sec_mtr mini-transaction which holds + secondary index entry or NULL if it is + called for purge buffering operation. +@param[in] is_tree true=pessimistic purge, + false=optimistic (leaf-page only) +@return true if the secondary index record can be purged */ +bool +row_purge_poss_sec( + purge_node_t* node, + dict_index_t* index, + const dtuple_t* entry, + btr_pcur_t* sec_pcur=NULL, + mtr_t* sec_mtr=NULL, + bool is_tree=false); + +/*************************************************************** +Does the purge operation. +@return query thread to run next */ +que_thr_t* +row_purge_step( +/*===========*/ + que_thr_t* thr) /*!< in: query thread */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Purge worker context */ +struct purge_node_t +{ + /** node type: QUE_NODE_PURGE */ + que_common_t common; + + /** DB_TRX_ID of the undo log record */ + trx_id_t trx_id; + /** DB_ROLL_PTR pointing to undo log record */ + roll_ptr_t roll_ptr; + + /** undo number of the record */ + undo_no_t undo_no; + + /** record type: TRX_UNDO_INSERT_REC, ... */ + byte rec_type; + /** compiler analysis info of an update */ + byte cmpl_info; + /** whether the clustered index record determined by ref was found + in the clustered index of the table, and we were able to position + pcur on it */ + bool found_clust; +#ifdef UNIV_DEBUG + /** whether the operation is in progress */ + bool in_progress= false; +#endif + /** table where purge is done */ + dict_table_t *table= nullptr; + /** update vector for a clustered index record */ + upd_t *update; + /** row reference to the next row to handle, or nullptr */ + const dtuple_t *ref; + /** nullptr, or a deep copy of the indexed fields of the row to handle */ + dtuple_t *row; + /** nullptr, or the next index of table whose record should be handled */ + dict_index_t *index; + /** memory heap used as auxiliary storage; must be emptied between rows */ + mem_heap_t *heap; + /** persistent cursor to the clustered index record */ + btr_pcur_t pcur; + + /** Undo recs to purge */ + std::queue<trx_purge_rec_t> undo_recs; + + /** map of table identifiers to table handles and meta-data locks */ + std::unordered_map<table_id_t, std::pair<dict_table_t*, MDL_ticket*>> tables; + + /** Constructor */ + explicit purge_node_t(que_thr_t *parent) : + common(QUE_NODE_PURGE, parent), heap(mem_heap_create(256)), + tables(TRX_PURGE_TABLE_BUCKETS) {} + +#ifdef UNIV_DEBUG + /** Validate the persistent cursor. The purge node has two references + to the clustered index record: ref and pcur, which must match + each other if found_clust. + @return whether pcur is consistent with ref */ + bool validate_pcur(); +#endif + + /** Start processing an undo log record. */ + inline void start(); + + /** Reset the state at end + @return the query graph parent */ + inline que_node_t *end(THD *); +}; diff --git a/storage/innobase/include/row0quiesce.h b/storage/innobase/include/row0quiesce.h new file mode 100644 index 00000000..b05b7666 --- /dev/null +++ b/storage/innobase/include/row0quiesce.h @@ -0,0 +1,67 @@ +/***************************************************************************** + +Copyright (c) 2012, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, MariaDB Corporation.
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0quiesce.h + +Header file for tablespace quiesce functions. + +Created 2012-02-08 by Sunny Bains +*******************************************************/ + +#ifndef row0quiesce_h +#define row0quiesce_h + +#include "dict0types.h" + +struct trx_t; + +/** The version number of the export meta-data text file. */ +#define IB_EXPORT_CFG_VERSION_V1 0x1UL + +/*********************************************************************//** +Quiesce the tablespace that the table resides in. */ +void +row_quiesce_table_start( +/*====================*/ + dict_table_t* table, /*!< in: quiesce this table */ + trx_t* trx) /*!< in/out: transaction/session */ + MY_ATTRIBUTE((nonnull)); + +/*********************************************************************//** +Set a table's quiesce state. +@return DB_SUCCESS or error code. */ +dberr_t +row_quiesce_set_state( +/*==================*/ + dict_table_t* table, /*!< in: quiesce this table */ + ib_quiesce_t state, /*!< in: quiesce state to set */ + trx_t* trx) /*!< in/out: transaction */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/*********************************************************************//** +Cleanup after table quiesce. */ +void +row_quiesce_table_complete( +/*=======================*/ + dict_table_t* table, /*!< in: quiesce this table */ + trx_t* trx) /*!< in/out: transaction/session */ + MY_ATTRIBUTE((nonnull)); + +#endif /* row0quiesce_h */ diff --git a/storage/innobase/include/row0row.h b/storage/innobase/include/row0row.h new file mode 100644 index 00000000..a1350740 --- /dev/null +++ b/storage/innobase/include/row0row.h @@ -0,0 +1,431 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2016, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0row.h +General row routines + +Created 4/20/1996 Heikki Tuuri +*******************************************************/ + +#ifndef row0row_h +#define row0row_h + +#include "que0types.h" +#include "ibuf0ibuf.h" +#include "trx0types.h" +#include "mtr0mtr.h" +#include "rem0types.h" +#include "row0types.h" +#include "btr0types.h" + +/*********************************************************************//** +Gets the offset of the DB_TRX_ID field, in bytes relative to the origin of +a clustered index record. +@return offset of DATA_TRX_ID */ +UNIV_INLINE +ulint +row_get_trx_id_offset( +/*==================*/ + const dict_index_t* index, /*!< in: clustered index */ + const rec_offs* offsets)/*!< in: record offsets */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*********************************************************************//** +Reads the trx id field from a clustered index record. +@return value of the field */ +UNIV_INLINE +trx_id_t +row_get_rec_trx_id( +/*===============*/ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index, /*!< in: clustered index */ + const rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*********************************************************************//** +Reads the roll pointer field from a clustered index record. +@return value of the field */ +UNIV_INLINE +roll_ptr_t +row_get_rec_roll_ptr( +/*=================*/ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index, /*!< in: clustered index */ + const rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/* Flags for row build type. */ +#define ROW_BUILD_NORMAL 0 /*!< build index row */ +#define ROW_BUILD_FOR_PURGE 1 /*!< build row for purge. */ +#define ROW_BUILD_FOR_UNDO 2 /*!< build row for undo. */ +#define ROW_BUILD_FOR_INSERT 3 /*!< build row for insert. */ + +/*****************************************************************//** +When an insert or purge to a table is performed, this function builds +the entry to be inserted into or purged from an index on the table. +@return index entry which should be inserted or purged +@retval NULL if the externally stored columns in the clustered index record +are unavailable and ext != NULL, or row is missing some needed columns. */ +dtuple_t* +row_build_index_entry_low( +/*======================*/ + const dtuple_t* row, /*!< in: row which should be + inserted or purged */ + const row_ext_t* ext, /*!< in: externally stored column + prefixes, or NULL */ + const dict_index_t* index, /*!< in: index on the table */ + mem_heap_t* heap, /*!< in,out: memory heap from which + the memory for the index entry + is allocated */ + ulint flag) /*!< in: ROW_BUILD_NORMAL, + ROW_BUILD_FOR_PURGE + or ROW_BUILD_FOR_UNDO */ + MY_ATTRIBUTE((warn_unused_result, nonnull(1,3,4))); +/*****************************************************************//** +When an insert or purge to a table is performed, this function builds +the entry to be inserted into or purged from an index on the table. 
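As a usage note, a hedged sketch of how these entry builders are typically invoked when maintaining a secondary index; the variable names (row, ext, sec_index, heap) are illustrative and assumed to be in scope:

    /* Sketch: build the sec_index entry for a row that is being inserted
       or purged. A NULL result means required externally stored columns
       were unavailable (see the @retval note above). */
    dtuple_t* entry = row_build_index_entry(row, ext, sec_index, heap);
    if (!entry) {
        /* the caller must treat the entry as impossible to construct */
    }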
+@return index entry which should be inserted or purged, or NULL if the +externally stored columns in the clustered index record are +unavailable and ext != NULL */ +UNIV_INLINE +dtuple_t* +row_build_index_entry( +/*==================*/ + const dtuple_t* row, /*!< in: row which should be + inserted or purged */ + const row_ext_t* ext, /*!< in: externally stored column + prefixes, or NULL */ + const dict_index_t* index, /*!< in: index on the table */ + mem_heap_t* heap) /*!< in,out: memory heap from which + the memory for the index entry + is allocated */ + MY_ATTRIBUTE((warn_unused_result, nonnull(1,3,4))); +/*******************************************************************//** +An inverse function to row_build_index_entry. Builds a row from a +record in a clustered index. +@return own: row built; see the NOTE below! */ +dtuple_t* +row_build( +/*======*/ + ulint type, /*!< in: ROW_COPY_POINTERS or + ROW_COPY_DATA; the latter + copies also the data fields to + heap while the first only + places pointers to data fields + on the index page, and thus is + more efficient */ + const dict_index_t* index, /*!< in: clustered index */ + const rec_t* rec, /*!< in: record in the clustered + index; NOTE: in the case + ROW_COPY_POINTERS the data + fields in the row will point + directly into this record, + therefore, the buffer page of + this record must be at least + s-latched and the latch held + as long as the row dtuple is used! */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec,index) + or NULL, in which case this function + will invoke rec_get_offsets() */ + const dict_table_t* col_table, + /*!< in: table, to check which + externally stored columns + occur in the ordering columns + of an index, or NULL if + index->table should be + consulted instead; the user + columns in this table should be + the same columns as in index->table */ + const dtuple_t* defaults, + /*!< in: default values of + added, changed columns, or NULL */ + const ulint* col_map,/*!< in: mapping of old column + numbers to new ones, or NULL */ + row_ext_t** ext, /*!< out, own: cache of + externally stored column + prefixes, or NULL */ + mem_heap_t* heap); /*!< in: memory heap from which + the memory needed is allocated */ + +/** An inverse function to row_build_index_entry. Builds a row from a +record in a clustered index, with possible indexing on ongoing +addition of new virtual columns. +@param[in] type ROW_COPY_POINTERS or ROW_COPY_DATA; +@param[in] index clustered index +@param[in] rec record in the clustered index +@param[in] offsets rec_get_offsets(rec,index) or NULL +@param[in] col_table table, to check which + externally stored columns + occur in the ordering columns + of an index, or NULL if + index->table should be + consulted instead +@param[in] defaults default values of added, changed columns, or NULL +@param[in] add_v new virtual columns added + along with new indexes +@param[in] col_map mapping of old column + numbers to new ones, or NULL +@param[in] ext cache of externally stored column + prefixes, or NULL +@param[in] heap memory heap from which + the memory needed is allocated +@return own: row built */ +dtuple_t* +row_build_w_add_vcol( + ulint type, + const dict_index_t* index, + const rec_t* rec, + const rec_offs* offsets, + const dict_table_t* col_table, + const dtuple_t* defaults, + const dict_add_v_col_t* add_v, + const ulint* col_map, + row_ext_t** ext, + mem_heap_t* heap); + +/*******************************************************************//** +Converts an index record to a typed data tuple. 
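The ROW_COPY_DATA / ROW_COPY_POINTERS distinction above has a latching consequence worth spelling out; a hedged sketch with illustrative arguments (the optional parameters passed as NULL):

    /* ROW_COPY_DATA: field values are copied into heap, so the row
       remains valid after the buffer page latch is released. */
    row_ext_t* ext;
    dtuple_t* row = row_build(ROW_COPY_DATA, clust_index, rec, offsets,
                              NULL, NULL, NULL, &ext, heap);

    /* With ROW_COPY_POINTERS the fields would point into the page
       itself: the page must stay at least s-latched while row is used. */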
+@return index entry built; does not set info_bits, and the data fields +in the entry will point directly to rec */ +dtuple_t* +row_rec_to_index_entry_low( +/*=======================*/ + const rec_t* rec, /*!< in: record in the index */ + const dict_index_t* index, /*!< in: index */ + const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */ + mem_heap_t* heap) /*!< in: memory heap from which + the memory needed is allocated */ + MY_ATTRIBUTE((warn_unused_result)); +/*******************************************************************//** +Converts an index record to a typed data tuple. NOTE that externally +stored (often big) fields are NOT copied to heap. +@return own: index entry built */ +dtuple_t* +row_rec_to_index_entry( +/*===================*/ + const rec_t* rec, /*!< in: record in the index */ + const dict_index_t* index, /*!< in: index */ + const rec_offs* offsets,/*!< in/out: rec_get_offsets(rec) */ + mem_heap_t* heap) /*!< in: memory heap from which + the memory needed is allocated */ + MY_ATTRIBUTE((warn_unused_result)); + +/** Convert a metadata record to a data tuple. +@param[in] rec metadata record +@param[in] index clustered index after instant ALTER TABLE +@param[in] offsets rec_get_offsets(rec) +@param[in,out] heap memory heap for allocations +@param[in] info_bits the info_bits after an update +@param[in] pad whether to pad to index->n_fields */ +dtuple_t* +row_metadata_to_tuple( + const rec_t* rec, + const dict_index_t* index, + const rec_offs* offsets, + mem_heap_t* heap, + ulint info_bits, + bool pad) + MY_ATTRIBUTE((nonnull,warn_unused_result)); + +/*******************************************************************//** +Builds from a secondary index record a row reference with which we can +search the clustered index record. +@return own: row reference built; see the NOTE below! */ +dtuple_t* +row_build_row_ref( +/*==============*/ + ulint type, /*!< in: ROW_COPY_DATA, or ROW_COPY_POINTERS: + the former copies also the data fields to + heap, whereas the latter only places pointers + to data fields on the index page */ + dict_index_t* index, /*!< in: secondary index */ + const rec_t* rec, /*!< in: record in the index; + NOTE: in the case ROW_COPY_POINTERS + the data fields in the row will point + directly into this record, therefore, + the buffer page of this record must be + at least s-latched and the latch held + as long as the row reference is used! */ + mem_heap_t* heap) /*!< in: memory heap from which the memory + needed is allocated */ + MY_ATTRIBUTE((warn_unused_result)); +/*******************************************************************//** +Builds from a secondary index record a row reference with which we can +search the clustered index record. */ +void +row_build_row_ref_in_tuple( +/*=======================*/ + dtuple_t* ref, /*!< in/out: row reference built; + see the NOTE below! */ + const rec_t* rec, /*!< in: record in the index; + NOTE: the data fields in ref + will point directly into this + record, therefore, the buffer + page of this record must be at + least s-latched and the latch + held as long as the row + reference is used! */ + const dict_index_t* index, /*!< in: secondary index */ + rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) + or NULL */ + MY_ATTRIBUTE((nonnull(1,2,3))); +/*******************************************************************//** +Builds from a secondary index record a row reference with which we can +search the clustered index record. 
*/ +UNIV_INLINE +void +row_build_row_ref_fast( +/*===================*/ + dtuple_t* ref, /*!< in/out: typed data tuple where the + reference is built */ + const ulint* map, /*!< in: array of field numbers in rec + telling how ref should be built from + the fields of rec */ + const rec_t* rec, /*!< in: secondary index record; + must be preserved while ref is used, as we do + not copy field values to heap */ + const rec_offs* offsets);/*!< in: array returned by rec_get_offsets() */ +/***************************************************************//** +Searches the clustered index record for a row, if we have the row +reference. +@return true if found */ +bool +row_search_on_row_ref( +/*==================*/ + btr_pcur_t* pcur, /*!< out: persistent cursor, which must + be closed by the caller */ + btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF, ... */ + const dict_table_t* table, /*!< in: table */ + const dtuple_t* ref, /*!< in: row reference */ + mtr_t* mtr) /*!< in/out: mtr */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/*********************************************************************//** +Fetches the clustered index record for a secondary index record. The latches +on the secondary index record are preserved. +@return record or NULL, if no record found */ +rec_t* +row_get_clust_rec( +/*==============*/ + btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF, ... */ + const rec_t* rec, /*!< in: record in a secondary index */ + dict_index_t* index, /*!< in: secondary index */ + dict_index_t** clust_index,/*!< out: clustered index */ + mtr_t* mtr) /*!< in: mtr */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Parse the integer data from specified data, which could be +DATA_INT, DATA_FLOAT or DATA_DOUBLE. If the value is less than 0 +and the type is not unsigned then we reset the value to 0 +@param[in] data data to read +@param[in] len length of data +@param[in] mtype mtype of data +@param[in] unsigned_type if the data is unsigned +@return the integer value from the data */ +inline +ib_uint64_t +row_parse_int( + const byte* data, + ulint len, + ulint mtype, + bool unsigned_type); + +/** Result of row_search_index_entry */ +enum row_search_result { + ROW_FOUND = 0, /*!< the record was found */ + ROW_NOT_FOUND, /*!< record not found */ + ROW_BUFFERED, /*!< one of BTR_INSERT, BTR_DELETE, or + BTR_DELETE_MARK was specified, the + secondary index leaf page was not in + the buffer pool, and the operation was + enqueued in the insert/delete buffer */ + ROW_NOT_DELETED_REF /*!< BTR_DELETE was specified, and + row_purge_poss_sec() failed */ +}; + +/***************************************************************//** +Searches an index record. +@return whether the record was found or buffered */ +enum row_search_result +row_search_index_entry( +/*===================*/ + const dtuple_t* entry, /*!< in: index entry */ + btr_latch_mode mode, /*!< in: BTR_MODIFY_LEAF, ... */ + btr_pcur_t* pcur, /*!< in/out: persistent cursor, which must + be closed by the caller */ + mtr_t* mtr) /*!< in: mtr */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +#define ROW_COPY_DATA 1 +#define ROW_COPY_POINTERS 2 + +/* The allowed latching order of index records is the following: +(1) a secondary index record -> +(2) the clustered index record -> +(3) rollback segment data for the clustered index record. */ + +/*******************************************************************//** +Formats the raw data in "data" (in InnoDB on-disk format) using +"dict_field" and writes the result to "buf". 
+Not more than "buf_size" bytes are written to "buf". +The result is always NUL-terminated (provided buf_size is positive) and the +number of bytes that were written to "buf" is returned (including the +terminating NUL). +@return number of bytes that were written */ +ulint +row_raw_format( +/*===========*/ + const char* data, /*!< in: raw data */ + ulint data_len, /*!< in: raw data length + in bytes */ + const dict_field_t* dict_field, /*!< in: index field */ + char* buf, /*!< out: output buffer */ + ulint buf_size) /*!< in: output buffer size + in bytes */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Prepare to start a mini-transaction to modify an index. +@param[in,out] mtr mini-transaction +@param[in,out] index possibly secondary index +@param[in] pessimistic whether this is a pessimistic operation */ +inline +void +row_mtr_start(mtr_t* mtr, dict_index_t* index, bool pessimistic) +{ + mtr->start(); + + switch (index->table->space_id) { + case IBUF_SPACE_ID: + if (pessimistic + && !(index->type & (DICT_UNIQUE | DICT_SPATIAL))) { + ibuf_free_excess_pages(); + } + break; + case SRV_TMP_SPACE_ID: + mtr->set_log_mode(MTR_LOG_NO_REDO); + break; + default: + index->set_modified(*mtr); + break; + } + + log_free_check(); +} + +#include "row0row.inl" + +#endif diff --git a/storage/innobase/include/row0row.inl b/storage/innobase/include/row0row.inl new file mode 100644 index 00000000..e89adb58 --- /dev/null +++ b/storage/innobase/include/row0row.inl @@ -0,0 +1,221 @@ +/***************************************************************************** + +Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2018, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0row.ic +General row routines + +Created 4/20/1996 Heikki Tuuri +*******************************************************/ + +#include "dict0dict.h" +#include "rem0rec.h" +#include "trx0undo.h" + +/*********************************************************************//** +Gets the offset of the DB_TRX_ID field, in bytes relative to the origin of +a clustered index record. +@return offset of DATA_TRX_ID */ +UNIV_INLINE +ulint +row_get_trx_id_offset( +/*==================*/ + const dict_index_t* index, /*!< in: clustered index */ + const rec_offs* offsets)/*!< in: record offsets */ +{ + ulint offset; + ulint len; + + ut_ad(rec_offs_validate(NULL, index, offsets)); + + offset = rec_get_nth_field_offs(offsets, index->db_trx_id(), &len); + + ut_ad(len == DATA_TRX_ID_LEN); + + return(offset); +} + +/*********************************************************************//** +Reads the trx id field from a clustered index record. 
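A short hedged sketch of the two accessors declared earlier and defined in this file; rec must be a clustered index record and offsets must come from rec_get_offsets():

    /* Sketch: read the hidden system columns of a clustered index record. */
    trx_id_t   trx_id   = row_get_rec_trx_id(rec, clust_index, offsets);
    roll_ptr_t roll_ptr = row_get_rec_roll_ptr(rec, clust_index, offsets);
    /* Both accessors use the cached index->trx_id_offset when it is
       nonzero and fall back to row_get_trx_id_offset() otherwise. */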
+@return value of the field */ +UNIV_INLINE +trx_id_t +row_get_rec_trx_id( +/*===============*/ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index, /*!< in: clustered index */ + const rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) */ +{ + ulint offset; + + ut_ad(dict_index_is_clust(index)); + ut_ad(rec_offs_validate(rec, index, offsets)); + + offset = index->trx_id_offset; + + if (!offset) { + offset = row_get_trx_id_offset(index, offsets); + } + + return(trx_read_trx_id(rec + offset)); +} + +/*********************************************************************//** +Reads the roll pointer field from a clustered index record. +@return value of the field */ +UNIV_INLINE +roll_ptr_t +row_get_rec_roll_ptr( +/*=================*/ + const rec_t* rec, /*!< in: record */ + const dict_index_t* index, /*!< in: clustered index */ + const rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) */ +{ + ulint offset; + + ut_ad(dict_index_is_clust(index)); + ut_ad(rec_offs_validate(rec, index, offsets)); + + offset = index->trx_id_offset; + + if (!offset) { + offset = row_get_trx_id_offset(index, offsets); + } + + return(trx_read_roll_ptr(rec + offset + DATA_TRX_ID_LEN)); +} + +/*****************************************************************//** +When an insert or purge to a table is performed, this function builds +the entry to be inserted into or purged from an index on the table. +@return index entry which should be inserted or purged, or NULL if the +externally stored columns in the clustered index record are +unavailable and ext != NULL */ +UNIV_INLINE +dtuple_t* +row_build_index_entry( +/*==================*/ + const dtuple_t* row, /*!< in: row which should be + inserted or purged */ + const row_ext_t* ext, /*!< in: externally stored column + prefixes, or NULL */ + const dict_index_t* index, /*!< in: index on the table */ + mem_heap_t* heap) /*!< in,out: memory heap from which + the memory for the index entry + is allocated */ +{ + dtuple_t* entry; + + ut_ad(dtuple_check_typed(row)); + entry = row_build_index_entry_low(row, ext, index, heap, + ROW_BUILD_NORMAL); + ut_ad(!entry || dtuple_check_typed(entry)); + return(entry); +} + +/*******************************************************************//** +Builds from a secondary index record a row reference with which we can +search the clustered index record. */ +UNIV_INLINE +void +row_build_row_ref_fast( +/*===================*/ + dtuple_t* ref, /*!< in/out: typed data tuple where the + reference is built */ + const ulint* map, /*!< in: array of field numbers in rec + telling how ref should be built from + the fields of rec */ + const rec_t* rec, /*!< in: secondary index record; + must be preserved while ref is used, as we do + not copy field values to heap */ + const rec_offs* offsets)/*!< in: array returned by rec_get_offsets() */ +{ + dfield_t* dfield; + const byte* field; + ulint len; + ulint ref_len; + ulint field_no; + ulint i; + + ut_ad(rec_offs_validate(rec, NULL, offsets)); + ut_ad(!rec_offs_any_extern(offsets)); + ref_len = dtuple_get_n_fields(ref); + + for (i = 0; i < ref_len; i++) { + dfield = dtuple_get_nth_field(ref, i); + + field_no = *(map + i); + + if (field_no != ULINT_UNDEFINED) { + + field = rec_get_nth_field(rec, offsets, + field_no, &len); + dfield_set_data(dfield, field, len); + } + } +} + +/** Parse the integer data from specified data, which could be +DATA_INT, DATA_FLOAT or DATA_DOUBLE. 
If the value is less than 0 +and the type is not unsigned then we reset the value to 0 +@param[in] data data to read +@param[in] len length of data +@param[in] mtype mtype of data +@param[in] unsigned_type if the data is unsigned +@return the integer value from the data */ +ib_uint64_t +row_parse_int( + const byte* data, + ulint len, + ulint mtype, + bool unsigned_type) +{ + ib_uint64_t value = 0; + + switch (mtype) { + case DATA_INT: + + ut_a(len <= sizeof value); + value = mach_read_int_type(data, len, unsigned_type); + break; + + case DATA_FLOAT: + + ut_a(len == sizeof(float)); + value = static_cast<ib_uint64_t>(mach_float_read(data)); + break; + + case DATA_DOUBLE: + + ut_a(len == sizeof(double)); + value = static_cast<ib_uint64_t>(mach_double_read(data)); + break; + + default: + ut_error; + + } + + if (!unsigned_type && static_cast<int64_t>(value) < 0) { + value = 0; + } + + return(value); +} + diff --git a/storage/innobase/include/row0sel.h b/storage/innobase/include/row0sel.h new file mode 100644 index 00000000..8134c60f --- /dev/null +++ b/storage/innobase/include/row0sel.h @@ -0,0 +1,457 @@ +/***************************************************************************** + +Copyright (c) 1997, 2017, Oracle and/or its affiliates. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0sel.h +Select + +Created 12/19/1997 Heikki Tuuri +*******************************************************/ + +#pragma once + +#include "data0data.h" +#include "que0types.h" +#include "trx0types.h" +#include "read0types.h" +#include "row0types.h" +#include "que0types.h" +#include "pars0sym.h" +#include "btr0pcur.h" +#include "row0mysql.h" + +/*********************************************************************//** +Creates a select node struct. +@return own: select node struct */ +sel_node_t* +sel_node_create( +/*============*/ + mem_heap_t* heap); /*!< in: memory heap where created */ +/*********************************************************************//** +Frees the memory private to a select node when a query graph is freed, +does not free the heap where the node was originally created. */ +void +sel_node_free_private( +/*==================*/ + sel_node_t* node); /*!< in: select node struct */ +/*********************************************************************//** +Frees a prefetch buffer for a column, including the dynamically allocated +memory for data stored there. */ +void +sel_col_prefetch_buf_free( +/*======================*/ + sel_buf_t* prefetch_buf); /*!< in, own: prefetch buffer */ +/**********************************************************************//** +Performs a select step. This is a high-level function used in SQL execution +graphs.
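A worked example for row_parse_int() defined above; it assumes the usual InnoDB on-disk convention that DATA_INT is stored big-endian with the sign bit flipped for signed types, so 0x80000001 encodes +1:

    /* Sketch: parse a signed 4-byte DATA_INT value. */
    const byte data[4] = {0x80, 0x00, 0x00, 0x01};
    ib_uint64_t v = row_parse_int(data, 4, DATA_INT, false);
    /* v == 1; a stored negative value would be clamped to 0 by the
       final sign check in row_parse_int(). */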
+@return query thread to run next or NULL */ +que_thr_t* +row_sel_step( +/*=========*/ + que_thr_t* thr); /*!< in: query thread */ +/**********************************************************************//** +Performs a fetch for a cursor. +@return query thread to run next or NULL */ +que_thr_t* +fetch_step( +/*=======*/ + que_thr_t* thr); /*!< in: query thread */ +/***********************************************************//** +Prints a row in a select result. +@return query thread to run next or NULL */ +que_thr_t* +row_printf_step( +/*============*/ + que_thr_t* thr); /*!< in: query thread */ + +/** Copy used fields from cached row. +Copy cache record field by field, don't touch fields that +are not covered by the current key. +@param[out] buf Where to copy the MySQL row. +@param[in] cached_rec What to copy (in MySQL row format). +@param[in] prebuilt prebuilt struct. */ +void +row_sel_copy_cached_fields_for_mysql( + byte* buf, + const byte* cached_rec, + row_prebuilt_t* prebuilt); + +/****************************************************************//** +Converts a key value stored in MySQL format to an Innobase dtuple. The last +field of the key value may be just a prefix of a fixed length field: hence +the parameter key_len. Currently we do not allow search keys where the +last field is only a prefix of the full key field length; a warning is +printed if such a key appears. */ +void +row_sel_convert_mysql_key_to_innobase( +/*==================================*/ + dtuple_t* tuple, /*!< in/out: tuple where to build; + NOTE: we assume that the type info + in the tuple is already according + to index! */ + byte* buf, /*!< in: buffer to use in field + conversions; NOTE that dtuple->data + may end up pointing inside buf so + do not discard that buffer while + the tuple is being used. See + row_mysql_store_col_in_innobase_format() + in the case of DATA_INT */ + ulint buf_len, /*!< in: buffer length */ + dict_index_t* index, /*!< in: index of the key value */ + const byte* key_ptr, /*!< in: MySQL key value */ + ulint key_len); /*!< in: MySQL key value length */ + + +/** Search for rows in the database using a cursor. +The function is mainly used for tables that are shared across connections, +so it employs techniques that help re-construct the rows that the +transaction is supposed to see. +It also has optimizations such as pre-caching rows and using the adaptive +hash index (AHI). + +@param[out] buf buffer for the fetched row in MySQL format +@param[in] mode search mode, e.g. PAGE_CUR_L +@param[in,out] prebuilt prebuilt struct for the table handler; + this contains the info of search_tuple, + index; if the search tuple contains 0 fields, + then we position the cursor at the start or + the end of the index, depending on 'mode' +@param[in] match_mode 0 or ROW_SEL_EXACT or ROW_SEL_EXACT_PREFIX +@param[in] direction 0 or ROW_SEL_NEXT or ROW_SEL_PREV; + Note: if this is != 0, then prebuilt must have + a pcur with a stored position! When opening a + cursor, 'direction' should be 0. +@return DB_SUCCESS, DB_RECORD_NOT_FOUND, DB_END_OF_INDEX, DB_DEADLOCK, +DB_LOCK_TABLE_FULL, DB_CORRUPTION, or DB_TOO_BIG_RECORD */ +dberr_t +row_search_mvcc( + byte* buf, + page_cur_mode_t mode, + row_prebuilt_t* prebuilt, + ulint match_mode, + ulint direction) + MY_ATTRIBUTE((warn_unused_result)); + +/********************************************************************//** +Count rows in an R-Tree leaf level.
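A hedged sketch of the fetch pattern implied by the 'direction' rule above (the first call with direction 0 opens the cursor; later calls resume from the stored pcur position). The loop shape is illustrative, not the actual handler code; buf and prebuilt are assumed to be set up:

    dberr_t err = row_search_mvcc(buf, PAGE_CUR_GE, prebuilt,
                                  0 /* match_mode: range scan */,
                                  0 /* open the cursor */);
    while (err == DB_SUCCESS) {
        /* one row, in MySQL format, is now in buf */
        err = row_search_mvcc(buf, PAGE_CUR_GE, prebuilt, 0, ROW_SEL_NEXT);
    }
    /* on a normal end of scan err is DB_RECORD_NOT_FOUND or
       DB_END_OF_INDEX; the other codes listed above indicate errors */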
+@return DB_SUCCESS if successful */ +dberr_t +row_count_rtree_recs( +/*=================*/ + row_prebuilt_t* prebuilt, /*!< in: prebuilt struct for the + table handle; this contains the info + of search_tuple, index; if search + tuple contains 0 fields then we + position the cursor at the start or + the end of the index, depending on + 'mode' */ + ulint* n_rows); /*!< out: number of entries + seen in the consistent read */ + +/** +Check the index records in CHECK TABLE. +The index must contain entries in ascending order, +unique constraint must not be violated by duplicated keys, +and the number of index entries is counted according to the +current read view. + +@param prebuilt index and transaction +@param n_rows number of records counted + +@return error code +@retval DB_SUCCESS if no error was found */ +dberr_t row_check_index(row_prebuilt_t *prebuilt, ulint *n_rows) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Read the max AUTOINC value from an index. +@param[in] index index starting with an AUTO_INCREMENT column +@return the largest AUTO_INCREMENT value +@retval 0 if no records were found */ +ib_uint64_t +row_search_max_autoinc(dict_index_t* index) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** A structure for caching column values for prefetched rows */ +struct sel_buf_t{ + byte* data; /*!< data, or NULL; if not NULL, this field + has allocated memory which must be explicitly + freed; can be != NULL even when len is + UNIV_SQL_NULL */ + ulint len; /*!< data length or UNIV_SQL_NULL */ + ulint val_buf_size; + /*!< size of memory buffer allocated for data: + this can be more than len; this is defined + when data != NULL */ +}; + +/** Copy used fields from cached row. +Copy cache record field by field, don't touch fields that +are not covered by the current key. +@param[out] buf Where to copy the MySQL row. +@param[in] cached_rec What to copy (in MySQL row format). +@param[in] prebuilt prebuilt struct. */ +void +row_sel_copy_cached_fields_for_mysql( + byte* buf, + const byte* cached_rec, + row_prebuilt_t* prebuilt); + +/** Query plan */ +struct plan_t{ + dict_table_t* table; /*!< table struct in the dictionary + cache */ + dict_index_t* index; /*!< table index used in the search */ + btr_pcur_t pcur; /*!< persistent cursor used to search + the index */ + ibool asc; /*!< TRUE if the cursor is traveling + upwards */ + ibool pcur_is_open; /*!< TRUE if pcur has been positioned + and we can try to fetch new rows */ + ibool cursor_at_end; /*!< TRUE if the cursor is open but + we know that there are no more + qualifying rows left to retrieve from + the index tree; NOTE though, that + there may still be unprocessed rows in + the prefetch stack; always FALSE when + pcur_is_open is FALSE */ + ibool stored_cursor_rec_processed; + /*!< TRUE if the pcur position has been + stored and the record it is positioned + on has already been processed */ + que_node_t** tuple_exps; /*!< array of expressions + which are used to calculate + the field values in the search + tuple: there is one expression + for each field in the search + tuple */ + dtuple_t* tuple; /*!< search tuple */ + page_cur_mode_t mode; /*!< search mode: PAGE_CUR_G, ...
*/ + ulint n_exact_match; /*!< number of first fields in + the search tuple which must be + exactly matched */ + ibool unique_search; /*!< TRUE if we are searching an + index record with a unique key */ + ulint n_rows_fetched; /*!< number of rows fetched using pcur + after it was opened */ + ulint n_rows_prefetched;/*!< number of prefetched rows cached + for fetch: fetching several rows in + the same mtr saves CPU time */ + ulint first_prefetched;/*!< index of the first cached row in + select buffer arrays for each column */ + ibool no_prefetch; /*!< no prefetch for this table */ + sym_node_list_t columns; /*!< symbol table nodes for the columns + to retrieve from the table */ + UT_LIST_BASE_NODE_T(func_node_t) + end_conds; /*!< conditions which determine the + fetch limit of the index segment we + have to look at: when one of these + fails, the result set has been + exhausted for the cursor in this + index; these conditions are normalized + so that in a comparison the column + for this table is the first argument */ + UT_LIST_BASE_NODE_T(func_node_t) + other_conds; /*!< the rest of search conditions we can + test at this table in a join */ + ibool must_get_clust; /*!< TRUE if index is a non-clustered + index and we must also fetch the + clustered index record; this is the + case if the non-clustered record does + not contain all the needed columns, or + if this is a single-table explicit + cursor, or a searched update or + delete */ + ulint* clust_map; /*!< map telling how clust_ref is built + from the fields of a non-clustered + record */ + dtuple_t* clust_ref; /*!< the reference to the clustered + index entry is built here if index is + a non-clustered index */ + btr_pcur_t clust_pcur; /*!< if index is non-clustered, we use + this pcur to search the clustered + index */ + mem_heap_t* old_vers_heap; /*!< memory heap used in building an old + version of a row, or NULL */ +}; + +/** Select node states */ +enum sel_node_state { + SEL_NODE_CLOSED, /*!< it is a declared cursor which is not + currently open */ + SEL_NODE_OPEN, /*!< intention locks not yet set on tables */ + SEL_NODE_FETCH, /*!< intention locks have been set */ + SEL_NODE_NO_MORE_ROWS /*!< cursor has reached the result set end */ +}; + +/** Select statement node */ +struct sel_node_t{ + que_common_t common; /*!< node type: QUE_NODE_SELECT */ + enum sel_node_state + state; /*!< node state */ + que_node_t* select_list; /*!< select list */ + sym_node_t* into_list; /*!< variables list or NULL */ + sym_node_t* table_list; /*!< table list */ + ibool asc; /*!< TRUE if the rows should be fetched + in an ascending order */ + ibool set_x_locks; /*!< TRUE if the cursor is for update or + delete, which means that a row x-lock + should be placed on the cursor row */ + lock_mode row_lock_mode; /*!< LOCK_X or LOCK_S */ + ulint n_tables; /*!< number of tables */ + ulint fetch_table; /*!< number of the next table to access + in the join */ + plan_t* plans; /*!< array of n_tables many plan nodes + containing the search plan and the + search data structures */ + que_node_t* search_cond; /*!< search condition */ + ReadView* read_view; /*!< if the query is a non-locking + consistent read, its read view is + placed here, otherwise NULL */ + ibool consistent_read;/*!< TRUE if the select is a consistent, + non-locking read */ + order_node_t* order_by; /*!< order by column definition, or + NULL */ + ibool is_aggregate; /*!< TRUE if the select list consists of + aggregate functions */ + ibool aggregate_already_fetched; + /*!< TRUE if the aggregate row 
has + already been fetched for the current + cursor */ + ibool can_get_updated;/*!< this is TRUE if the select + is in a single-table explicit + cursor which can get updated + within the stored procedure, + or in a searched update or + delete; NOTE that to determine + whether an explicit cursor can + get updated, the parser checks + whether the stored procedure + contains positioned update or + delete statements */ + sym_node_t* explicit_cursor;/*!< not NULL if an explicit cursor */ + UT_LIST_BASE_NODE_T(sym_node_t) + copy_variables; /*!< variables whose values we have to + copy when an explicit cursor is opened, + so that they do not change between + fetches */ +}; + +/** +Get the plan node for a table in a join. +@param node query graph node for SELECT +@param i plan node element +@return ith plan node */ +inline plan_t *sel_node_get_nth_plan(sel_node_t *node, ulint i) +{ + ut_ad(i < node->n_tables); + return &node->plans[i]; +} + +/** Fetch statement node */ +struct fetch_node_t{ + que_common_t common; /*!< type: QUE_NODE_FETCH */ + sel_node_t* cursor_def; /*!< cursor definition */ + sym_node_t* into_list; /*!< variables to set */ + + pars_user_func_t* + func; /*!< User callback function or NULL. + The first argument to the function + is a sel_node_t*, containing the + results of the SELECT operation for + one row. If the function returns + NULL, it is not interested in + further rows and the cursor is + modified so (cursor % NOTFOUND) is + true. If it returns non-NULL, + processing continues normally. */ +}; + +/** Open or close cursor operation type */ +enum open_node_op { + ROW_SEL_OPEN_CURSOR, /*!< open cursor */ + ROW_SEL_CLOSE_CURSOR /*!< close cursor */ +}; + +/** Open or close cursor statement node */ +struct open_node_t{ + que_common_t common; /*!< type: QUE_NODE_OPEN */ + enum open_node_op + op_type; /*!< operation type: open or + close cursor */ + sel_node_t* cursor_def; /*!< cursor definition */ +}; + +/** Row printf statement node */ +struct row_printf_node_t{ + que_common_t common; /*!< type: QUE_NODE_ROW_PRINTF */ + sel_node_t* sel_node; /*!< select */ +}; + +/** Search direction for the MySQL interface */ +enum row_sel_direction { + ROW_SEL_NEXT = 1, /*!< ascending direction */ + ROW_SEL_PREV = 2 /*!< descending direction */ +}; + +/** Match mode for the MySQL interface */ +enum row_sel_match_mode { + ROW_SEL_EXACT = 1, /*!< search using a complete key value */ + ROW_SEL_EXACT_PREFIX /*!< search using a key prefix which + must match rows: the prefix may + contain an incomplete field (the last + field in prefix may be just a prefix + of a fixed length column) */ +}; + +#ifdef UNIV_DEBUG +/** Convert a non-SQL-NULL field from Innobase format to MySQL format. */ +# define row_sel_field_store_in_mysql_format(dest,templ,idx,field,src,len) \ + row_sel_field_store_in_mysql_format_func(dest,templ,idx,field,src,len) +#else /* UNIV_DEBUG */ +/** Convert a non-SQL-NULL field from Innobase format to MySQL format. */ +# define row_sel_field_store_in_mysql_format(dest,templ,idx,field,src,len) \ + row_sel_field_store_in_mysql_format_func(dest,templ,src,len) +#endif /* UNIV_DEBUG */ + +/**************************************************************//** +Stores a non-SQL-NULL field in the MySQL format. The counterpart of this +function is row_mysql_store_col_in_innobase_format() in row0mysql.cc.
*/ + +void +row_sel_field_store_in_mysql_format_func( +/*=====================================*/ + byte* dest, /*!< in/out: buffer where to store; NOTE + that BLOBs are not in themselves + stored here: the caller must allocate + and copy the BLOB into buffer before, + and pass the pointer to the BLOB in + 'data' */ + const mysql_row_templ_t* templ, + /*!< in: MySQL column template. + Its following fields are referenced: + type, is_unsigned, mysql_col_len, + mbminlen, mbmaxlen */ +#ifdef UNIV_DEBUG + const dict_index_t* index, + /*!< in: InnoDB index */ + ulint field_no, + /*!< in: templ->rec_field_no or + templ->clust_rec_field_no or + templ->icp_rec_field_no */ +#endif /* UNIV_DEBUG */ + const byte* data, /*!< in: data to store */ + ulint len); /*!< in: length of the data */ diff --git a/storage/innobase/include/row0types.h b/storage/innobase/include/row0types.h new file mode 100644 index 00000000..5e737c1c --- /dev/null +++ b/storage/innobase/include/row0types.h @@ -0,0 +1,54 @@ +/***************************************************************************** + +Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0types.h +Row operation global types + +Created 12/27/1996 Heikki Tuuri +*******************************************************/ + +#pragma once +#include "buf0types.h" + +struct plan_t; + +struct upd_t; +struct upd_field_t; +struct upd_node_t; +struct del_node_t; +struct ins_node_t; +struct sel_node_t; +struct open_node_t; +struct fetch_node_t; + +struct row_printf_node_t; +struct sel_buf_t; + +struct undo_node_t; + +struct purge_node_t; + +struct row_ext_t; + +/** Buffer for logging modifications during online index creation */ +struct row_log_t; + +/* MySQL data types */ +struct TABLE; diff --git a/storage/innobase/include/row0uins.h b/storage/innobase/include/row0uins.h new file mode 100644 index 00000000..a9877969 --- /dev/null +++ b/storage/innobase/include/row0uins.h @@ -0,0 +1,50 @@ +/***************************************************************************** + +Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0uins.h +Fresh insert undo + +Created 2/25/1997 Heikki Tuuri +*******************************************************/ + +#ifndef row0uins_h +#define row0uins_h + +#include "data0data.h" +#include "trx0types.h" +#include "que0types.h" +#include "row0types.h" +#include "mtr0mtr.h" + +/***********************************************************//** +Undoes a fresh insert of a row to a table. A fresh insert means that +the same clustered index unique key did not have any record, even delete +marked, at the time of the insert. InnoDB is eager in a rollback: +if it figures out that an index record will be removed in the purge +anyway, it will remove it in the rollback. +@return DB_SUCCESS */ +dberr_t +row_undo_ins( +/*=========*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr) /*!< in: query thread */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +#endif diff --git a/storage/innobase/include/row0umod.h b/storage/innobase/include/row0umod.h new file mode 100644 index 00000000..5032e103 --- /dev/null +++ b/storage/innobase/include/row0umod.h @@ -0,0 +1,46 @@ +/***************************************************************************** + +Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0umod.h +Undo modify of a row + +Created 2/27/1997 Heikki Tuuri +*******************************************************/ + +#ifndef row0umod_h +#define row0umod_h + +#include "data0data.h" +#include "trx0types.h" +#include "que0types.h" +#include "row0types.h" +#include "mtr0mtr.h" + +/***********************************************************//** +Undoes a modify operation on a row of a table. +@return DB_SUCCESS or error code */ +dberr_t +row_undo_mod( +/*=========*/ + undo_node_t* node, /*!< in: row undo node */ + que_thr_t* thr) /*!< in: query thread */ + MY_ATTRIBUTE((warn_unused_result)); + +#endif diff --git a/storage/innobase/include/row0undo.h b/storage/innobase/include/row0undo.h new file mode 100644 index 00000000..ae067a8a --- /dev/null +++ b/storage/innobase/include/row0undo.h @@ -0,0 +1,114 @@ +/***************************************************************************** + +Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, MariaDB Corporation. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0undo.h +Row undo + +Created 1/8/1997 Heikki Tuuri +*******************************************************/ + +#ifndef row0undo_h +#define row0undo_h + +#include "trx0sys.h" +#include "btr0types.h" +#include "btr0pcur.h" +#include "que0types.h" +#include "row0types.h" + +/********************************************************************//** +Creates a row undo node for a query graph. +@return own: undo node */ +undo_node_t* +row_undo_node_create( +/*=================*/ + trx_t* trx, /*!< in: transaction */ + que_thr_t* parent, /*!< in: parent node, i.e., a thr node */ + mem_heap_t* heap); /*!< in: memory heap where created */ +/***********************************************************//** +Looks for the clustered index record when node has the row reference. +The pcur in node is used in the search. If found, stores the row to node, +and stores the position of pcur, and detaches it. The pcur must be closed +by the caller in any case. +@return true if found; NOTE the node->pcur must be closed by the +caller, regardless of the return value */ +bool +row_undo_search_clust_to_pcur( +/*==========================*/ + undo_node_t* node) /*!< in/out: row undo node */ + MY_ATTRIBUTE((warn_unused_result)); +/***********************************************************//** +Undoes a row operation in a table. This is a high-level function used +in SQL execution graphs. +@return query thread to run next or NULL */ +que_thr_t* +row_undo_step( +/*==========*/ + que_thr_t* thr); /*!< in: query thread */ + +/* A single query thread will try to perform the undo for all successive +versions of a clustered index record, if the transaction has modified it +several times during the execution which is rolled back. It may happen +that the task is transferred to another query thread, if the other thread +is assigned to handle an undo log record in the chain of different versions +of the record, and the other thread happens to get the x-latch to the +clustered index record at the right time. + If a query thread notices that the clustered index record it is looking +for is missing, or the roll ptr field in the record does not point to the +undo log record the thread was assigned to handle, then it gives up the undo +task for that undo log record, and fetches the next. This situation can occur +only in the case where the transaction modified the same record several times +and another thread is currently doing the undo for successive versions of +that index record.
*/ + +/** Undo node structure */ +struct undo_node_t{ + que_common_t common; /*!< node type: QUE_NODE_UNDO */ + bool is_temp;/*!< whether this is a temporary table */ + trx_t* trx; /*!< trx for which undo is done */ + roll_ptr_t roll_ptr;/*!< roll pointer to undo log record */ + trx_undo_rec_t* undo_rec;/*!< undo log record */ + undo_no_t undo_no;/*!< undo number of the record */ + byte rec_type;/*!< undo log record type: TRX_UNDO_INSERT_REC, + ... */ + trx_id_t new_trx_id; /*!< trx id to restore to clustered index + record */ + btr_pcur_t pcur; /*!< persistent cursor used in searching the + clustered index record */ + dict_table_t* table; /*!< table where undo is done */ + ulint cmpl_info;/*!< compiler analysis of an update */ + upd_t* update; /*!< update vector for a clustered index + record */ + const dtuple_t* ref; /*!< row reference to the next row to handle */ + dtuple_t* row; /*!< a copy (also fields copied to heap) of the + row to handle */ + row_ext_t* ext; /*!< NULL, or prefixes of the externally + stored columns of the row */ + dtuple_t* undo_row;/*!< NULL, or the row after undo */ + row_ext_t* undo_ext;/*!< NULL, or prefixes of the externally + stored columns of undo_row */ + dict_index_t* index; /*!< the next index whose record should be + handled */ + mem_heap_t* heap; /*!< memory heap used as auxiliary storage for + row; this must be emptied after undo is tried + on a row */ +}; + +#endif diff --git a/storage/innobase/include/row0upd.h b/storage/innobase/include/row0upd.h new file mode 100644 index 00000000..f60fc359 --- /dev/null +++ b/storage/innobase/include/row0upd.h @@ -0,0 +1,559 @@ +/***************************************************************************** + +Copyright (c) 1996, 2018, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/row0upd.h +Update of a row + +Created 12/27/1996 Heikki Tuuri +*******************************************************/ + +#ifndef row0upd_h +#define row0upd_h + +#include "data0data.h" +#include "rem0types.h" +#include "row0types.h" +#include "btr0types.h" +#include "trx0types.h" +#include "btr0pcur.h" +#include "que0types.h" +#include "pars0types.h" + +/*********************************************************************//** +Creates an update vector object. +@return own: update vector object */ +UNIV_INLINE +upd_t* +upd_create( +/*=======*/ + ulint n, /*!< in: number of fields */ + mem_heap_t* heap); /*!< in: heap from which memory allocated */ +/*********************************************************************//** +Returns the number of fields in the update vector == number of columns +to be updated by an update vector. 
+@return number of fields */
+UNIV_INLINE
+ulint
+upd_get_n_fields(
+/*=============*/
+	const upd_t*	update);	/*!< in: update vector */
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Returns the nth field of an update vector.
+@return update vector field */
+UNIV_INLINE
+upd_field_t*
+upd_get_nth_field(
+/*==============*/
+	const upd_t*	update,	/*!< in: update vector */
+	ulint		n);	/*!< in: field position in update vector */
+#else
+# define upd_get_nth_field(update, n) ((update)->fields + (n))
+#endif
+
+/*********************************************************************//**
+Sets an index field number to be updated by an update vector field. */
+UNIV_INLINE
+void
+upd_field_set_field_no(
+/*===================*/
+	upd_field_t*	upd_field,	/*!< in: update vector field */
+	uint16_t	field_no,	/*!< in: field number in a clustered
+					index */
+	dict_index_t*	index);
+
+/** Set the field number of an update vector field, marking the field
+as updated.
+@param[in,out]	upd_field	update vector field
+@param[in]	field_no	virtual column sequence num
+@param[in]	index		index */
+UNIV_INLINE
+void
+upd_field_set_v_field_no(
+	upd_field_t*	upd_field,
+	uint16_t	field_no,
+	dict_index_t*	index);
+/*********************************************************************//**
+Returns a field of an update vector by field_no.
+@return update vector field, or NULL */
+UNIV_INLINE
+const upd_field_t*
+upd_get_field_by_field_no(
+/*======================*/
+	const upd_t*	update,	/*!< in: update vector */
+	uint16_t	no,	/*!< in: field_no */
+	bool		is_virtual)	/*!< in: if it is a virtual column */
+	MY_ATTRIBUTE((warn_unused_result));
+/*********************************************************************//**
+Creates an update node for a query graph.
+@return own: update node */
+upd_node_t*
+upd_node_create(
+/*============*/
+	mem_heap_t*	heap);	/*!< in: mem heap where created */
+/***********************************************************//**
+Returns TRUE if row update changes size of some field in index or if some
+field to be updated is stored externally in rec or update.
+@return TRUE if the update changes the size of some field in index or
+the field is external in rec or update */
+ibool
+row_upd_changes_field_size_or_external(
+/*===================================*/
+	dict_index_t*	index,	/*!< in: index */
+	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec, index) */
+	const upd_t*	update);/*!< in: update vector */
+
+/***************************************************************//**
+Builds an update vector from those fields which in a secondary index entry
+differ from a record that has the equal ordering fields. NOTE: we compare
+the fields as binary strings!
+@return own: update vector of differing fields */
+upd_t*
+row_upd_build_sec_rec_difference_binary(
+/*====================================*/
+	const rec_t*	rec,	/*!< in: secondary index record */
+	dict_index_t*	index,	/*!< in: index */
+	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec, index) */
+	const dtuple_t*	entry,	/*!< in: entry to insert */
+	mem_heap_t*	heap)	/*!< in: memory heap from which allocated */
+	MY_ATTRIBUTE((warn_unused_result, nonnull));
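(Editorial aside, not part of the upstream patch: a minimal sketch of how the
helpers above combine to build a one-field update vector. The dict_index_t*
"index" and the column number 2 are assumed for illustration; upd_create()
is defined in row0upd.inl below, and dfield_set_data()/mem_heap_create() are
the usual InnoDB data0data/mem0mem helpers.)

	mem_heap_t*	heap = mem_heap_create(256);
	upd_t*		update = upd_create(1, heap);
	upd_field_t*	uf = upd_get_nth_field(update, 0);

	upd_field_set_field_no(uf, 2, index);	 /* copies the column type */
	dfield_set_data(&uf->new_val, "abc", 3); /* new value for column 2 */
	ut_ad(upd_get_n_fields(update) == 1);
	/* ... apply the vector ... */
	mem_heap_free(heap);

+/** Builds an update vector from those fields, excluding the roll ptr and
+trx id fields, which in an index entry differ from a record that has
+the equal ordering fields. NOTE: we compare the fields as binary strings!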
+@param[in]	index		clustered index
+@param[in]	entry		clustered index entry to insert
+@param[in]	rec		clustered index record
+@param[in]	offsets		rec_get_offsets(rec,index), or NULL
+@param[in]	no_sys		skip the system columns
+				DB_TRX_ID and DB_ROLL_PTR
+@param[in]	ignore_warnings	ignore warnings during vcol calculation, which
+				means that this calculation is internal only
+@param[in]	trx		transaction (for diagnostics),
+				or NULL
+@param[in]	heap		memory heap from which allocated
+@param[in,out]	mysql_table	NULL, or mysql table object when
+				user thread invokes dml
+@param[out]	error		error number in case of failure
+@return own: update vector of differing fields, excluding roll ptr and
+trx id */
+upd_t*
+row_upd_build_difference_binary(
+	dict_index_t*	index,
+	const dtuple_t*	entry,
+	const rec_t*	rec,
+	const rec_offs*	offsets,
+	bool		no_sys,
+	bool		ignore_warnings,
+	trx_t*		trx,
+	mem_heap_t*	heap,
+	TABLE*		mysql_table,
+	dberr_t*	error)
+	MY_ATTRIBUTE((nonnull(1,2,3,8,10), warn_unused_result));
+/** Apply an update vector to an index entry.
+@param[in,out]	entry	index entry to be updated; the clustered index record
+			must be covered by a lock or a page latch to prevent
+			deletion (rollback or purge)
+@param[in]	index	index of the entry
+@param[in]	update	update vector built for the entry
+@param[in,out]	heap	memory heap for copying off-page columns */
+void
+row_upd_index_replace_new_col_vals_index_pos(
+	dtuple_t*		entry,
+	const dict_index_t*	index,
+	const upd_t*		update,
+	mem_heap_t*		heap)
+	MY_ATTRIBUTE((nonnull));
+/** Replace the new column values stored in the update vector,
+during trx_undo_prev_version_build().
+@param entry	clustered index tuple where the values are replaced
+		(the clustered index leaf page latch must be held)
+@param index	clustered index
+@param update	update vector for the clustered index
+@param heap	memory heap for allocating and copying values
+@return whether the previous version was built successfully */
+bool
+row_upd_index_replace_new_col_vals(dtuple_t *entry, const dict_index_t &index,
+				   const upd_t *update, mem_heap_t *heap)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/***********************************************************//**
+Replaces the new column values stored in the update vector. */
+void
+row_upd_replace(
+/*============*/
+	dtuple_t*		row,	/*!< in/out: row where replaced,
+					indexed by col_no;
+					the clustered index record must be
+					covered by a lock or a page latch to
+					prevent deletion (rollback or purge) */
+	row_ext_t**		ext,	/*!< out, own: NULL, or externally
+					stored column prefixes */
+	const dict_index_t*	index,	/*!< in: clustered index */
+	const upd_t*		update,	/*!< in: an update vector built for the
+					clustered index */
+	mem_heap_t*		heap);	/*!< in: memory heap */
+/** Replaces the virtual column values stored in a dtuple with those of
+an update vector.
+@param[in,out]	row	dtuple whose columns are to be updated
+@param[in]	table	table
+@param[in]	update	an update vector built for the clustered index
+@param[in]	upd_new	update to new or old value
+@param[in,out]	undo_row undo row (if it needs to be updated)
+@param[in]	ptr	remaining part in update undo log */
+void
+row_upd_replace_vcol(
+	dtuple_t*	row,
+	const dict_table_t*	table,
+	const upd_t*	update,
+	bool		upd_new,
+	dtuple_t*	undo_row,
+	const byte*	ptr);
+
+/***********************************************************//**
+Checks if an update vector changes an ordering field of an index record.
+ +This function is fast if the update vector is short or the number of ordering +fields in the index is small. Otherwise, this can be quadratic. +NOTE: we compare the fields as binary strings! +@return TRUE if update vector changes an ordering field in the index record */ +ibool +row_upd_changes_ord_field_binary_func( +/*==================================*/ + dict_index_t* index, /*!< in: index of the record */ + const upd_t* update, /*!< in: update vector for the row; NOTE: the + field numbers in this MUST be clustered index + positions! */ +#ifdef UNIV_DEBUG + const que_thr_t*thr, /*!< in: query thread */ +#endif /* UNIV_DEBUG */ + const dtuple_t* row, /*!< in: old value of row, or NULL if the + row and the data values in update are not + known when this function is called, e.g., at + compile time */ + const row_ext_t*ext, /*!< NULL, or prefixes of the externally + stored columns in the old row */ + ulint flag) /*!< in: ROW_BUILD_NORMAL, + ROW_BUILD_FOR_PURGE or ROW_BUILD_FOR_UNDO */ + MY_ATTRIBUTE((nonnull(1,2), warn_unused_result)); +#ifdef UNIV_DEBUG +# define row_upd_changes_ord_field_binary(index,update,thr,row,ext) \ + row_upd_changes_ord_field_binary_func(index,update,thr,row,ext,0) +#else /* UNIV_DEBUG */ +# define row_upd_changes_ord_field_binary(index,update,thr,row,ext) \ + row_upd_changes_ord_field_binary_func(index,update,row,ext,0) +#endif /* UNIV_DEBUG */ +/***********************************************************//** +Checks if an FTS indexed column is affected by an UPDATE. +@return offset within fts_t::indexes if FTS indexed column updated else +ULINT_UNDEFINED */ +ulint +row_upd_changes_fts_column( +/*=======================*/ + dict_table_t* table, /*!< in: table */ + upd_field_t* upd_field); /*!< in: field to check */ +/***********************************************************//** +Checks if an FTS Doc ID column is affected by an UPDATE. +@return whether Doc ID column is affected */ +bool +row_upd_changes_doc_id( +/*===================*/ + dict_table_t* table, /*!< in: table */ + upd_field_t* upd_field) /*!< in: field to check */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/***********************************************************//** +Checks if an update vector changes an ordering field of an index record. +This function is fast if the update vector is short or the number of ordering +fields in the index is small. Otherwise, this can be quadratic. +NOTE: we compare the fields as binary strings! +@return TRUE if update vector may change an ordering field in an index +record */ +ibool +row_upd_changes_some_index_ord_field_binary( +/*========================================*/ + const dict_table_t* table, /*!< in: table */ + const upd_t* update);/*!< in: update vector for the row */ +/***********************************************************//** +Updates a row in a table. This is a high-level function used +in SQL execution graphs. +@return query thread to run next or NULL */ +que_thr_t* +row_upd_step( +/*=========*/ + que_thr_t* thr); /*!< in: query thread */ + +/* Update vector field */ +struct upd_field_t{ + uint16_t field_no; /*!< field number in an index, usually + the clustered index, but in updating + a secondary index record in btr0cur.cc + this is the position in the secondary + index. 
If this field is a virtual
+					column, then field_no represents
+					the nth virtual column in the table */
+	uint16_t	orig_len;	/*!< original length of the locally
+					stored part of an externally stored
+					column, or 0 */
+	que_node_t*	exp;	/*!< expression for calculating a new
+					value: it refers to column values and
+					constants in the symbol table of the
+					query graph */
+	dfield_t	new_val;	/*!< new value for the column */
+	dfield_t*	old_v_val;	/*!< old value for the virtual column */
+};
+
+
+/* check whether an update field is on a virtual column */
+#define upd_fld_is_virtual_col(upd_fld) \
+	(((upd_fld)->new_val.type.prtype & DATA_VIRTUAL) == DATA_VIRTUAL)
+
+/* set the DATA_VIRTUAL bit on an update field to show it is a virtual column */
+#define upd_fld_set_virtual_col(upd_fld) \
+	((upd_fld)->new_val.type.prtype |= DATA_VIRTUAL)
+
+/* Update vector structure */
+struct upd_t{
+	mem_heap_t*	heap;		/*!< heap from which memory allocated */
+	byte		info_bits;	/*!< new value of info bits to record;
+					default is 0 */
+	dtuple_t*	old_vrow;	/*!< pointer to old row, used for
+					virtual column update now */
+	ulint		n_fields;	/*!< number of update fields */
+	upd_field_t*	fields;		/*!< array of update fields */
+	byte		vers_sys_value[8];	/*!< buffer for updating system fields */
+
+	/** Append an update field to the end of the array
+	@param[in]	field	an update field */
+	void append(const upd_field_t& field)
+	{
+		fields[n_fields++] = field;
+	}
+
+	void remove_element(ulint i)
+	{
+		ut_ad(n_fields > 0);
+		ut_ad(i < n_fields);
+		while (i < n_fields - 1)
+		{
+			fields[i]= fields[i + 1];
+			i++;
+		}
+		n_fields--;
+	}
+
+	bool remove(const ulint field_no)
+	{
+		for (ulint i= 0; i < n_fields; ++i)
+		{
+			if (field_no == fields[i].field_no)
+			{
+				remove_element(i);
+				return true;
+			}
+		}
+		return false;
+	}
+
+	/** Determine if the given field_no is modified.
+	@return true if modified, false otherwise. */
+	bool is_modified(uint16_t field_no) const
+	{
+		for (ulint i = 0; i < n_fields; ++i) {
+			if (field_no == fields[i].field_no) {
+				return(true);
+			}
+		}
+		return(false);
+	}
+
+	/** Determine if the update affects a system versioned column or row_end.
*/ + bool affects_versioned() const + { + for (ulint i = 0; i < n_fields; i++) { + dtype_t type = fields[i].new_val.type; + if (type.is_versioned()) { + return true; + } + // versioned DELETE is UPDATE SET row_end=NOW + if (type.vers_sys_end()) { + return true; + } + } + return false; + } + + /** @return whether this is for a hidden metadata record + for instant ALTER TABLE */ + bool is_metadata() const { return dtuple_t::is_metadata(info_bits); } + /** @return whether this is for a hidden metadata record + for instant ALTER TABLE (not only ADD COLUMN) */ + bool is_alter_metadata() const + { return dtuple_t::is_alter_metadata(info_bits); } + +#ifdef UNIV_DEBUG + bool validate() const + { + for (ulint i = 0; i < n_fields; ++i) { + dfield_t* field = &fields[i].new_val; + if (dfield_is_ext(field)) { + ut_ad(dfield_get_len(field) + >= BTR_EXTERN_FIELD_REF_SIZE); + } + } + return(true); + } +#endif // UNIV_DEBUG +}; + +/** Kinds of update operation */ +enum delete_mode_t { + NO_DELETE = 0, /*!< this operation does not delete */ + PLAIN_DELETE, /*!< ordinary delete */ + VERSIONED_DELETE /*!< update old and insert a new row */ +}; + +/* Update node structure which also implements the delete operation +of a row */ + +struct upd_node_t{ + que_common_t common; /*!< node type: QUE_NODE_UPDATE */ + delete_mode_t is_delete; /*!< kind of DELETE */ + ibool searched_update; + /* TRUE if searched update, FALSE if + positioned */ + bool in_mysql_interface; + /* whether the update node was created + for the MySQL interface */ + dict_foreign_t* foreign;/* NULL or pointer to a foreign key + constraint if this update node is used in + doing an ON DELETE or ON UPDATE operation */ + upd_node_t* cascade_node;/* NULL or an update node template which + is used to implement ON DELETE/UPDATE CASCADE + or ... 
SET NULL for foreign keys */
+	mem_heap_t*	cascade_heap;
+				/*!< NULL or a mem heap where cascade
+				node is created.*/
+	sel_node_t*	select;	/*!< query graph subtree implementing a base
+				table cursor: the rows returned will be
+				updated */
+	btr_pcur_t*	pcur;	/*!< persistent cursor placed on the clustered
+				index record which should be updated or
+				deleted; the cursor is stored in the graph
+				of 'select' field above, except in the case
+				of the MySQL interface */
+	dict_table_t*	table;	/*!< table where updated */
+	upd_t*		update;	/*!< update vector for the row */
+	ulint		update_n_fields;
+				/* when this struct is used to implement
+				a cascade operation for foreign keys, we store
+				here the size of the buffer allocated for use
+				as the update vector */
+	sym_node_list_t	columns;/* symbol table nodes for the columns
+				to retrieve from the table */
+	ibool		has_clust_rec_x_lock;
+				/* TRUE if the select which retrieves the
+				records to update already sets an x-lock on
+				the clustered record; note that it must always
+				set at least an s-lock */
+	ulint		cmpl_info;/* information extracted during query
+				compilation; speeds up execution:
+				UPD_NODE_NO_ORD_CHANGE and
+				UPD_NODE_NO_SIZE_CHANGE, ORed */
+	/*----------------------*/
+	/* Local storage for this graph node */
+	ulint		state;	/*!< node execution state */
+	dict_index_t*	index;	/*!< NULL, or the next index whose record should
+				be updated */
+	dtuple_t*	row;	/*!< NULL, or a copy (also fields copied to
+				heap) of the row to update; this must be reset
+				to NULL after a successful update */
+	dtuple_t*	historical_row;	/*!< historical row used in
+				CASCADE UPDATE/SET NULL;
+				allocated from historical_heap */
+	mem_heap_t*	historical_heap; /*!< heap for historical row insertion;
+				created when row to update is located;
+				freed right before row update */
+	row_ext_t*	ext;	/*!< NULL, or prefixes of the externally
+				stored columns in the old row */
+	dtuple_t*	upd_row;/* NULL, or a copy of the updated row */
+	row_ext_t*	upd_ext;/* NULL, or prefixes of the externally
+				stored columns in upd_row */
+	mem_heap_t*	heap;	/*!< memory heap used as auxiliary storage;
+				this must be emptied after a successful
+				update */
+	/*----------------------*/
+	sym_node_t*	table_sym;/* table node in symbol table */
+	que_node_t*	col_assign_list;
+				/* column assignment list */
+	ulint		magic_n;
+
+private:
+	/** Appends the row_start or row_end field to the update vector and
+	sets a CURRENT_TIMESTAMP/trx->id value for it.
+	Supposed to be called only by vers_make_update() and
+	vers_make_delete().
+	@param[in]	trx	transaction
+	@param[in]	idx	table->vers_start or table->vers_end */
+	void vers_update_fields(const trx_t *trx, ulint idx);
+
+public:
+	/** Also set row_start = CURRENT_TIMESTAMP/trx->id
+	@param[in]	trx	transaction */
+	void vers_make_update(const trx_t *trx)
+	{
+		vers_update_fields(trx, table->vers_start);
+	}
+
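(Editorial aside, not part of the upstream patch: conceptually, a
system-versioned DELETE keeps the row and merely stamps its end timestamp,
i.e. it behaves like "UPDATE t SET row_end = trx->id or CURRENT_TIMESTAMP".
Assuming an upd_node_t* node and a trx_t* trx, the two entry points here
are used roughly as follows.)

	node->vers_make_update(trx);	/* UPDATE path: stamp row_start */
	node->vers_make_delete(trx);	/* DELETE path: stamp row_end */

+	/** Prepare update vector for versioned delete.
+	Set row_end to CURRENT_TIMESTAMP or trx->id.
+	Initialize fts_next_doc_id for versioned delete.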
+	@param[in]	trx	transaction */
+	void vers_make_delete(trx_t *trx);
+};
+
+#define	UPD_NODE_MAGIC_N	1579975
+
+/* Node execution states */
+#define UPD_NODE_SET_IX_LOCK	   1	/* execution came to the node from
+					a node above and if the field
+					has_clust_rec_x_lock is FALSE, we
+					should set an intention x-lock on
+					the table */
+#define UPD_NODE_UPDATE_CLUSTERED  2	/* clustered index record should be
+					updated */
+#define UPD_NODE_INSERT_CLUSTERED  3	/* clustered index record should be
+					inserted, old record is already delete
+					marked */
+#define UPD_NODE_UPDATE_ALL_SEC	   5	/* an ordering field of the clustered
+					index record was changed, or this is
+					a delete operation: should update
+					all the secondary index records */
+#define UPD_NODE_UPDATE_SOME_SEC   6	/* secondary index entries should be
+					looked at and updated if an ordering
+					field changed */
+
+/* Compilation info flags: these must fit within 3 bits; see trx0rec.h */
+#define UPD_NODE_NO_ORD_CHANGE	1	/* no secondary index record will be
+					changed in the update and no ordering
+					field of the clustered index */
+#define UPD_NODE_NO_SIZE_CHANGE	2	/* no record field size will be
+					changed in the update */
+
+
+#include "row0upd.inl"
+
+#endif
diff --git a/storage/innobase/include/row0upd.inl b/storage/innobase/include/row0upd.inl
new file mode 100644
index 00000000..13aacf3f
--- /dev/null
+++ b/storage/innobase/include/row0upd.inl
@@ -0,0 +1,153 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0upd.inl
+Update of a row
+
+Created 12/27/1996 Heikki Tuuri
+*******************************************************/
+
+#include "mtr0log.h"
+#include "trx0trx.h"
+#include "trx0undo.h"
+#include "row0row.h"
+#include "lock0lock.h"
+#include "page0zip.h"
+
+/*********************************************************************//**
+Creates an update vector object.
+@return own: update vector object */
+UNIV_INLINE
+upd_t*
+upd_create(
+/*=======*/
+	ulint		n,	/*!< in: number of fields */
+	mem_heap_t*	heap)	/*!< in: heap from which memory allocated */
+{
+	upd_t*	update;
+
+	update = static_cast<upd_t*>(mem_heap_zalloc(
+			heap, sizeof(upd_t) + sizeof(upd_field_t) * n));
+
+	update->n_fields = n;
+	update->fields = reinterpret_cast<upd_field_t*>(&update[1]);
+	update->heap = heap;
+
+	return(update);
+}
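(Editorial aside: header and field array share one zero-initialized block, so
the whole vector is released together with its heap and needs no separate
free. A sanity check one could write against this layout, assuming a
mem_heap_t* heap:)

	upd_t*	u = upd_create(2, heap);
	ut_ad(reinterpret_cast<byte*>(u->fields)
	      == reinterpret_cast<byte*>(u) + sizeof(upd_t));
	ut_ad(!u->fields[0].new_val.data);	/* zeroed by mem_heap_zalloc() */

+/*********************************************************************//**
+Returns the number of fields in the update vector == number of columns
+to be updated by an update vector.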
+@return number of fields */
+UNIV_INLINE
+ulint
+upd_get_n_fields(
+/*=============*/
+	const upd_t*	update)	/*!< in: update vector */
+{
+	ut_ad(update);
+
+	return(update->n_fields);
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Returns the nth field of an update vector.
+@return update vector field */
+UNIV_INLINE
+upd_field_t*
+upd_get_nth_field(
+/*==============*/
+	const upd_t*	update,	/*!< in: update vector */
+	ulint		n)	/*!< in: field position in update vector */
+{
+	ut_ad(update);
+	ut_ad(n < update->n_fields);
+
+	return((upd_field_t*) update->fields + n);
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Sets an index field number to be updated by an update vector field. */
+UNIV_INLINE
+void
+upd_field_set_field_no(
+/*===================*/
+	upd_field_t*	upd_field,	/*!< in: update vector field */
+	uint16_t	field_no,	/*!< in: field number in a clustered
+					index */
+	dict_index_t*	index)	/*!< in: index */
+{
+	upd_field->field_no = field_no;
+	upd_field->orig_len = 0;
+	dict_col_copy_type(dict_index_get_nth_col(index, field_no),
+			   dfield_get_type(&upd_field->new_val));
+}
+
+/** Set the field number of an update vector field, marking the field
+as updated.
+@param[in,out]	upd_field	update vector field
+@param[in]	field_no	virtual column sequence num
+@param[in]	index		index */
+UNIV_INLINE
+void
+upd_field_set_v_field_no(
+	upd_field_t*	upd_field,
+	uint16_t	field_no,
+	dict_index_t*	index)
+{
+	ut_a(field_no < dict_table_get_n_v_cols(index->table));
+	upd_field->field_no = field_no;
+	upd_field->orig_len = 0;
+
+	dict_col_copy_type(&dict_table_get_nth_v_col(
+				   index->table, field_no)->m_col,
+			   dfield_get_type(&upd_field->new_val));
+}
+
+/*********************************************************************//**
+Returns a field of an update vector by field_no.
+@return update vector field, or NULL */
+UNIV_INLINE
+const upd_field_t*
+upd_get_field_by_field_no(
+/*======================*/
+	const upd_t*	update,	/*!< in: update vector */
+	uint16_t	no,	/*!< in: field_no */
+	bool		is_virtual)	/*!< in: if it is a virtual column */
+{
+	ulint	i;
+	for (i = 0; i < upd_get_n_fields(update); i++) {
+		const upd_field_t*	uf = upd_get_nth_field(update, i);
+
+		/* matches only if the field matches that of is_virtual */
+		if ((!is_virtual) != (!upd_fld_is_virtual_col(uf))) {
+			continue;
+		}
+
+		if (uf->field_no == no) {
+
+			return(uf);
+		}
+	}
+
+	return(NULL);
+}
diff --git a/storage/innobase/include/row0vers.h b/storage/innobase/include/row0vers.h
new file mode 100644
index 00000000..60f310e1
--- /dev/null
+++ b/storage/innobase/include/row0vers.h
@@ -0,0 +1,143 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0vers.h
+Row versions
+
+Created 2/6/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0vers_h
+#define row0vers_h
+
+#include "data0data.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "rem0types.h"
+#include "mtr0mtr.h"
+#include "dict0mem.h"
+#include "row0types.h"
+
+// Forward declaration
+class ReadView;
+
+/** Determine if an active transaction has inserted or modified a secondary
+index record.
+@param[in,out]	caller_trx	trx of current thread
+@param[in]	rec	secondary index record
+@param[in]	index	secondary index
+@param[in]	offsets	rec_get_offsets(rec, index)
+@return the active transaction; state must be rechecked after
+acquiring trx->mutex, and trx->release_reference() must be invoked
+@retval NULL if the record was committed */
+trx_t*
+row_vers_impl_x_locked(
+	trx_t*		caller_trx,
+	const rec_t*	rec,
+	dict_index_t*	index,
+	const rec_offs*	offsets);
+
+/** Finds out if a version of the record, where the version >= the current
+purge_sys.view, should have ientry as its secondary index entry. We check
+if there is any non-delete-marked version of the record where the trx
+id >= purge view, and the secondary index entry == ientry; exactly in
+this case we return TRUE.
+@param[in]	also_curr	TRUE if also rec is included in the versions
+				to search; otherwise only versions prior
+				to it are searched
+@param[in]	rec	record in the clustered index; the caller
+			must have a latch on the page
+@param[in]	mtr	mtr holding the latch on rec; it will
+			also hold the latch on purge_view
+@param[in]	index	secondary index
+@param[in]	ientry	secondary index entry
+@param[in]	roll_ptr	roll_ptr for the purge record
+@param[in]	trx_id	transaction ID on the purging record
+@return TRUE if some earlier version should have the index entry ientry */
+bool
+row_vers_old_has_index_entry(
+	bool		also_curr,
+	const rec_t*	rec,
+	mtr_t*		mtr,
+	dict_index_t*	index,
+	const dtuple_t*	ientry,
+	roll_ptr_t	roll_ptr,
+	trx_id_t	trx_id);
+
+/*****************************************************************//**
+Constructs the version of a clustered index record which a consistent
+read should see. We assume that the trx id stored in rec is such that
+the consistent read should not see rec in its present version.
+@return error code
+@retval DB_SUCCESS if a previous version was fetched
+@retval DB_MISSING_HISTORY if the history is missing (a sign of corruption) */
+dberr_t
+row_vers_build_for_consistent_read(
+/*===============================*/
+	const rec_t*	rec,	/*!< in: record in a clustered index; the
+				caller must have a latch on the page; this
+				latch locks the top of the stack of versions
+				of this record */
+	mtr_t*		mtr,	/*!< in: mtr holding the latch on rec; it will
+				also hold the latch on purge_view */
+	dict_index_t*	index,	/*!< in: the clustered index */
+	rec_offs**	offsets,/*!< in/out: offsets returned by
+				rec_get_offsets(rec, index) */
+	ReadView*	view,	/*!< in: the consistent read view */
+	mem_heap_t**	offset_heap,/*!< in/out: memory heap from which
+				the offsets are allocated */
+	mem_heap_t*	in_heap,/*!< in: memory heap from which the memory for
+				*old_vers is allocated; memory for possible
+				intermediate versions is allocated and freed
+				locally within the function */
+	rec_t**		old_vers,/*!< out, own: old version, or NULL
+				if the history is missing or the record
+				does not exist in the view, that is,
+				it was freshly inserted afterwards */
+	dtuple_t**	vrow);	/*!< out: reports virtual column info if any */
+
+/*****************************************************************//**
+Constructs the last committed version of a clustered index record,
+which should be seen by a semi-consistent read. */
+void
+row_vers_build_for_semi_consistent_read(
+/*====================================*/
+	trx_t*		caller_trx,/*!< in/out: trx of current thread */
+	const rec_t*	rec,	/*!< in: record in a clustered index; the
+				caller must have a latch on the page */
+	mtr_t*		mtr,	/*!< in: mtr holding the latch on rec */
+	dict_index_t*	index,	/*!< in: the clustered index */
+	rec_offs**	offsets,/*!< in/out: offsets returned by
+				rec_get_offsets(rec, index) */
+	mem_heap_t**	offset_heap,/*!< in/out: memory heap from which
+				the offsets are allocated */
+	mem_heap_t*	in_heap,/*!< in: memory heap from which the memory for
+				*old_vers is allocated; memory for possible
+				intermediate versions is allocated and freed
+				locally within the function */
+	const rec_t**	old_vers,/*!< out: rec, old version, or NULL if the
+				record does not exist in the view, that is,
+				it was freshly inserted afterwards */
+	dtuple_t**	vrow);	/*!< out: holds virtual column info if any
+				is updated in the view */
+
+#endif
diff --git a/storage/innobase/include/rw_lock.h b/storage/innobase/include/rw_lock.h
new file mode 100644
--- /dev/null
+++ b/storage/innobase/include/rw_lock.h
+/*****************************************************************************
+
+Copyright (c) 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#pragma once
+#include <atomic>
+#include "my_dbug.h"
+
+/** Simple read-write lock based on std::atomic */
+class rw_lock
+{
+  /** The lock word */
+  std::atomic<uint32_t> lock;
+
+protected:
+  /** Available lock */
+  static constexpr uint32_t UNLOCKED= 0;
+  /** Flag to indicate that write_lock() is being held */
+  static constexpr uint32_t WRITER= 1U << 31;
+  /** Flag to indicate that write_lock_wait() is pending */
+  static constexpr uint32_t WRITER_WAITING= 1U << 30;
+  /** Flag to indicate that write_lock() or write_lock_wait() is pending */
+  static constexpr uint32_t WRITER_PENDING= WRITER | WRITER_WAITING;
+
+  /** Start waiting for an exclusive lock. */
+  void write_lock_wait_start()
+  {
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+    static_assert(WRITER_WAITING == 1U << 30, "compatibility");
+    __asm__ __volatile__("lock btsl $30, %0" : "+m" (lock));
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+    static_assert(WRITER_WAITING == 1U << 30, "compatibility");
+    _interlockedbittestandset(reinterpret_cast<volatile long*>(&lock), 30);
+#else
+    lock.fetch_or(WRITER_WAITING, std::memory_order_relaxed);
+#endif
+  }
+  /** Start waiting for an exclusive lock.
+  @return current value of the lock word */
+  uint32_t write_lock_wait_start_read()
+  { return lock.fetch_or(WRITER_WAITING, std::memory_order_relaxed); }
+  /** Wait for an exclusive lock.
+  @param l the value of the lock word
+  @return whether the exclusive lock was acquired */
+  bool write_lock_wait_try(uint32_t &l)
+  {
+    return lock.compare_exchange_strong(l, WRITER, std::memory_order_acquire,
+                                        std::memory_order_relaxed);
+  }
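(Editorial aside, not part of the upstream patch: the 32-bit lock word encodes
the whole state. Bit 31 is WRITER, bit 30 is WRITER_WAITING, and bits 0..29
count the shared holders, so a word value of 3 means three concurrent readers.
Typical try-lock usage of the public interface defined below:)

	rw_lock latch;
	if (latch.read_trylock())	/* fails if either writer bit is set */
	{
		/* ... shared access ... */
		latch.read_unlock();
	}
	if (latch.write_trylock())	/* succeeds only from UNLOCKED */
	{
		/* ... exclusive access ... */
		latch.write_unlock();
	}

+  /** Try to acquire a shared lock.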
+ @param l the value of the lock word + @return whether the lock was acquired */ + bool read_trylock(uint32_t &l) + { + l= UNLOCKED; + while (!lock.compare_exchange_strong(l, l + 1, std::memory_order_acquire, + std::memory_order_relaxed)) + { + DBUG_ASSERT(!(WRITER & l) || !(~WRITER_PENDING & l)); + if (l & WRITER_PENDING) + return false; + } + return true; + } + + /** Wait for an exclusive lock. + @return whether the exclusive lock was acquired */ + bool write_lock_poll() + { + auto l= WRITER_WAITING; + if (write_lock_wait_try(l)) + return true; + if (!(l & WRITER_WAITING)) + /* write_lock() must have succeeded for another thread */ + write_lock_wait_start(); + return false; + } + /** @return the lock word value */ + uint32_t value() const { return lock.load(std::memory_order_acquire); } + +public: + /** Default constructor */ + rw_lock() : lock(UNLOCKED) {} + + /** Release a shared lock. + @return whether any writers may have to be woken up */ + bool read_unlock() + { + auto l= lock.fetch_sub(1, std::memory_order_release); + DBUG_ASSERT(!(l & WRITER)); /* no write lock must have existed */ + DBUG_ASSERT(~(WRITER_PENDING) & l); /* at least one read lock */ + return (~WRITER_PENDING & l) == 1; + } + /** Release an exclusive lock */ + void write_unlock() + { + /* Below, we use fetch_sub(WRITER) instead of fetch_and(~WRITER). + The reason is that on IA-32 and AMD64 it translates into the 80486 + instruction LOCK XADD, while fetch_and() translates into a loop + around LOCK CMPXCHG. For other ISA either form should be fine. */ + static_assert(WRITER == 1U << 31, "compatibility"); + IF_DBUG_ASSERT(auto l=,) lock.fetch_sub(WRITER, std::memory_order_release); + /* the write lock must have existed */ + DBUG_ASSERT(l & WRITER); + } + /** Try to acquire a shared lock. + @return whether the lock was acquired */ + bool read_trylock() { uint32_t l; return read_trylock(l); } + /** Try to acquire an exclusive lock. + @return whether the lock was acquired */ + bool write_trylock() + { + auto l= UNLOCKED; + return lock.compare_exchange_strong(l, WRITER, std::memory_order_acquire, + std::memory_order_relaxed); + } + + /** @return whether an exclusive lock is being held by any thread */ + bool is_write_locked() const { return !!(value() & WRITER); } + /** @return whether any lock is being held or waited for by any thread */ + bool is_locked_or_waiting() const { return value() != 0; } + /** @return whether any lock is being held by any thread */ + bool is_locked() const { return (value() & ~WRITER_WAITING) != 0; } +}; diff --git a/storage/innobase/include/small_vector.h b/storage/innobase/include/small_vector.h new file mode 100644 index 00000000..d28a3618 --- /dev/null +++ b/storage/innobase/include/small_vector.h @@ -0,0 +1,100 @@ +/***************************************************************************** + +Copyright (c) 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#pragma once
+/* A normally small vector, inspired by llvm::SmallVector */
+#include "my_global.h"
+#include <iterator>
+#include <algorithm>
+
+class small_vector_base
+{
+protected:
+  typedef uint32_t Size_T;
+  void *BeginX;
+  Size_T Size= 0, Capacity;
+  small_vector_base()= delete;
+  small_vector_base(void *small, size_t small_size)
+    : BeginX(small), Capacity(Size_T(small_size)) {}
+  ATTRIBUTE_COLD void grow_by_1(void *small, size_t element_size);
+public:
+  size_t size() const { return Size; }
+  size_t capacity() const { return Capacity; }
+  bool empty() const { return !Size; }
+  void clear() { Size= 0; }
+protected:
+  void set_size(size_t N) { Size= Size_T(N); }
+};
+
+template <typename T, size_t N>
+class small_vector : public small_vector_base
+{
+  /** The fixed storage allocation */
+  T small[N];
+
+  using small_vector_base::set_size;
+
+  void grow_if_needed()
+  {
+    if (unlikely(size() >= capacity()))
+      grow_by_1(small, sizeof *small);
+  }
+
+public:
+  small_vector() : small_vector_base(small, N)
+  {
+    TRASH_ALLOC(small, sizeof small);
+  }
+  ~small_vector()
+  {
+    if (small != begin())
+      my_free(begin());
+    MEM_MAKE_ADDRESSABLE(small, sizeof small);
+  }
+
+  using iterator= T *;
+  using const_iterator= const T *;
+  using reverse_iterator= std::reverse_iterator<iterator>;
+  using reference= T &;
+  using const_reference= const T&;
+
+  iterator begin() { return static_cast<iterator>(BeginX); }
+  const_iterator begin() const { return static_cast<const_iterator>(BeginX); }
+  iterator end() { return begin() + size(); }
+  const_iterator end() const { return begin() + size(); }
+
+  reverse_iterator rbegin() { return reverse_iterator(end()); }
+  reverse_iterator rend() { return reverse_iterator(begin()); }
+
+  reference operator[](size_t i) { assert(i < size()); return begin()[i]; }
+  const_reference operator[](size_t i) const
+  { return const_cast<small_vector&>(*this)[i]; }
+
+  void erase(const_iterator S, const_iterator E)
+  {
+    set_size(std::move(const_cast<iterator>(E), end(),
+                       const_cast<iterator>(S)) - begin());
+  }
+
+  void emplace_back(T &&arg)
+  {
+    grow_if_needed();
+    ::new (end()) T(arg);
+    set_size(size() + 1);
+  }
+};
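(Editorial aside, not part of the upstream patch: a usage sketch of the
container; the element type and sizes are arbitrary. The first N elements
live in the inline buffer, and grow_by_1() moves the contents to the heap on
the (N+1)th insertion.)

	small_vector<uint32_t, 4> v;
	for (uint32_t i= 0; i < 5; i++)
		v.emplace_back(uint32_t(i));	/* 5th call spills to the heap */
	size_t n= v.size();			/* 5; iterate via begin()/end() */
	v.clear();	/* resets the size; any heap block is freed
			only by the destructor */

diff --git a/storage/innobase/include/srv0mon.h b/storage/innobase/include/srv0mon.h
new file mode 100644
index 00000000..51f3049b
--- /dev/null
+++ b/storage/innobase/include/srv0mon.h
@@ -0,0 +1,846 @@
+/***********************************************************************
+
+Copyright (c) 2010, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2013, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+Public License for more details.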
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+***********************************************************************/
+
+/**************************************************//**
+@file include/srv0mon.h
+Server monitor counter related defines
+
+Created 12/15/2009 Jimmy Yang
+*******************************************************/
+
+#ifndef srv0mon_h
+#define srv0mon_h
+
+#include "univ.i"
+
+#ifndef __STDC_LIMIT_MACROS
+/* Required for FreeBSD so that INT64_MAX is defined. */
+#define __STDC_LIMIT_MACROS
+#endif /* __STDC_LIMIT_MACROS */
+
+#include <stdint.h>
+#include "my_atomic.h"
+#include "my_atomic_wrapper.h"
+
+/** Possible status values for "mon_status" in "struct monitor_value" */
+enum monitor_running_status {
+	MONITOR_STARTED = 1,	/*!< Monitor has been turned on */
+	MONITOR_STOPPED = 2	/*!< Monitor has been turned off */
+};
+
+typedef enum monitor_running_status	monitor_running_t;
+
+/** Monitor counter value type */
+typedef	int64_t	mon_type_t;
+
+/** Two monitor structures are defined in this file. One is
+"monitor_value_t" which contains dynamic counter values for each
+counter. The other is "monitor_info_t", which contains
+static information (counter name, desc etc.) for each counter.
+In addition, an enum datatype "monitor_id_t" is also defined,
+it identifies each monitor with an internally used symbol, whose
+integer value indexes into the above two structures for its dynamic
+and static information.
+Developers who intend to add new counters are required to fill in
+the counter information as described in "monitor_info_t" and to
+create the internal counter ID in "monitor_id_t". */
+
+/** Structure containing the actual values of a monitor counter. */
+struct monitor_value_t {
+	time_t	mon_start_time;	/*!< Start time of monitoring  */
+	time_t	mon_stop_time;	/*!< Stop time of monitoring */
+	time_t	mon_reset_time;	/*!< Time of resetting the counter */
+	mon_type_t	mon_value;	/*!< Current counter Value */
+	mon_type_t	mon_max_value;	/*!< Current Max value */
+	mon_type_t	mon_min_value;	/*!< Current Min value */
+	mon_type_t	mon_value_reset;/*!< value at last reset */
+	mon_type_t	mon_max_value_start; /*!< Max value since start */
+	mon_type_t	mon_min_value_start; /*!< Min value since start */
+	mon_type_t	mon_start_value;/*!< Value at the start time */
+	mon_type_t	mon_last_value;	/*!< Last set of values */
+	monitor_running_t mon_status;	/* whether monitor still running */
+};
+
+/** Following defines are possible values for "monitor_type" field in
+"struct monitor_info" */
+enum monitor_type_t {
+	MONITOR_NONE = 0,	/*!< No monitoring */
+	MONITOR_MODULE = 1,	/*!< This is a monitor module type,
+				not a counter */
+	MONITOR_EXISTING = 2,	/*!< The monitor carries information from
+				an existing system status variable */
+	MONITOR_NO_AVERAGE = 4,	/*!< Set this status if we don't want to
+				calculate the average value for the counter */
+	MONITOR_DISPLAY_CURRENT = 8, /*!< Display current value of the
+				counter, rather than incremental value
+				over the period. Mostly for counters
+				displaying current resource usage */
+	MONITOR_GROUP_MODULE = 16, /*!< Monitor can be turned on/off
+				only as a module, but not individually */
+	MONITOR_DEFAULT_ON = 32,/*!< Monitor will be turned on by default at
+				server start up */
+	MONITOR_SET_OWNER = 64,	/*!< Owner of "monitor set", a set of
+				monitor counters */
+	MONITOR_SET_MEMBER = 128,/*!< Being part of a "monitor set" */
+	MONITOR_HIDDEN = 256	/*!< Do not display this monitor in the
+				metrics table */
+};
+
+/** Counter minimum value is initialized to be max value of
+ mon_type_t (int64_t) */
+#ifndef INT64_MAX
+#define INT64_MAX		(9223372036854775807LL)
+#endif
+#ifndef INT64_MIN
+#define INT64_MIN		(-9223372036854775807LL-1)
+#endif
+#define	MIN_RESERVED		INT64_MAX
+#define	MAX_RESERVED		INT64_MIN
+
+/** This enumeration defines the internal monitor identifier used to
+identify each particular counter. Its value indexes into two arrays,
+one is the "innodb_counter_value" array which records actual monitor
+counter values, the other is "innodb_counter_info" array which describes
+each counter's basic information (name, desc etc.). A couple of
+naming rules here:
+1) If the monitor defines a module, it starts with MONITOR_MODULE
+2) If the monitor uses existing counters from "status variable", its ID
+name shall start with MONITOR_OVLD
+
+Please refer to "innodb_counter_info" in srv/srv0mon.cc for detailed
+information for each monitor counter */
+
+enum monitor_id_t {
+	/* This is to identify the default value set by the metrics
+	control global variables */
+	MONITOR_DEFAULT_START = 0,
+
+	/* Start of Metadata counter */
+	MONITOR_MODULE_METADATA,
+	MONITOR_TABLE_OPEN,
+
+	/* Lock manager related counters */
+	MONITOR_MODULE_LOCK,
+	MONITOR_DEADLOCK,
+	MONITOR_TIMEOUT,
+	MONITOR_LOCKREC_WAIT,
+	MONITOR_TABLELOCK_WAIT,
+	MONITOR_NUM_RECLOCK_REQ,
+	MONITOR_RECLOCK_CREATED,
+	MONITOR_RECLOCK_REMOVED,
+	MONITOR_NUM_RECLOCK,
+	MONITOR_TABLELOCK_CREATED,
+	MONITOR_TABLELOCK_REMOVED,
+	MONITOR_NUM_TABLELOCK,
+	MONITOR_OVLD_ROW_LOCK_CURRENT_WAIT,
+	MONITOR_OVLD_LOCK_WAIT_TIME,
+	MONITOR_OVLD_LOCK_MAX_WAIT_TIME,
+	MONITOR_OVLD_ROW_LOCK_WAIT,
+	MONITOR_OVLD_LOCK_AVG_WAIT_TIME,
+
+	/* Buffer and I/O related counters.
*/ + MONITOR_MODULE_BUFFER, + MONITOR_OVLD_BUFFER_POOL_SIZE, + MONITOR_OVLD_BUF_POOL_READS, + MONITOR_OVLD_BUF_POOL_READ_REQUESTS, + MONITOR_OVLD_BUF_POOL_WRITE_REQUEST, + MONITOR_OVLD_BUF_POOL_WAIT_FREE, + MONITOR_OVLD_BUF_POOL_READ_AHEAD, + MONITOR_OVLD_BUF_POOL_READ_AHEAD_EVICTED, + MONITOR_OVLD_BUF_POOL_PAGE_TOTAL, + MONITOR_OVLD_BUF_POOL_PAGE_MISC, + MONITOR_OVLD_BUF_POOL_PAGES_DATA, + MONITOR_OVLD_BUF_POOL_BYTES_DATA, + MONITOR_OVLD_BUF_POOL_PAGES_DIRTY, + MONITOR_OVLD_BUF_POOL_BYTES_DIRTY, + MONITOR_OVLD_BUF_POOL_PAGES_FREE, + MONITOR_OVLD_PAGE_CREATED, + MONITOR_OVLD_PAGES_WRITTEN, + MONITOR_OVLD_PAGES_READ, + MONITOR_OVLD_BYTE_READ, + MONITOR_OVLD_BYTE_WRITTEN, + MONITOR_FLUSH_BATCH_SCANNED, + MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL, + MONITOR_FLUSH_BATCH_SCANNED_PER_CALL, + MONITOR_FLUSH_BATCH_TOTAL_PAGE, + MONITOR_FLUSH_BATCH_COUNT, + MONITOR_FLUSH_BATCH_PAGES, + MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE, + MONITOR_FLUSH_NEIGHBOR_COUNT, + MONITOR_FLUSH_NEIGHBOR_PAGES, + MONITOR_FLUSH_N_TO_FLUSH_REQUESTED, + + MONITOR_FLUSH_N_TO_FLUSH_BY_AGE, + MONITOR_FLUSH_ADAPTIVE_AVG_TIME, + + MONITOR_FLUSH_ADAPTIVE_AVG_PASS, + + MONITOR_LRU_GET_FREE_LOOPS, + MONITOR_LRU_GET_FREE_WAITS, + + MONITOR_FLUSH_AVG_PAGE_RATE, + MONITOR_FLUSH_LSN_AVG_RATE, + MONITOR_FLUSH_PCT_FOR_DIRTY, + MONITOR_FLUSH_PCT_FOR_LSN, + MONITOR_FLUSH_SYNC_WAITS, + MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE, + MONITOR_FLUSH_ADAPTIVE_COUNT, + MONITOR_FLUSH_ADAPTIVE_PAGES, + MONITOR_FLUSH_SYNC_TOTAL_PAGE, + MONITOR_FLUSH_SYNC_COUNT, + MONITOR_FLUSH_SYNC_PAGES, + MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE, + MONITOR_FLUSH_BACKGROUND_COUNT, + MONITOR_FLUSH_BACKGROUND_PAGES, + MONITOR_LRU_BATCH_SCANNED, + MONITOR_LRU_BATCH_SCANNED_NUM_CALL, + MONITOR_LRU_BATCH_SCANNED_PER_CALL, + MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE, + MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE, + MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT, + MONITOR_LRU_GET_FREE_SEARCH, + MONITOR_LRU_SEARCH_SCANNED, + MONITOR_LRU_SEARCH_SCANNED_NUM_CALL, + MONITOR_LRU_SEARCH_SCANNED_PER_CALL, + MONITOR_LRU_UNZIP_SEARCH_SCANNED, + MONITOR_LRU_UNZIP_SEARCH_SCANNED_NUM_CALL, + MONITOR_LRU_UNZIP_SEARCH_SCANNED_PER_CALL, + + /* Buffer Page I/O specific counters. 
*/ + MONITOR_MODULE_BUF_PAGE, + MONITOR_INDEX_LEAF_PAGE_READ, + MONITOR_INDEX_NON_LEAF_PAGE_READ, + MONITOR_INDEX_IBUF_LEAF_PAGE_READ, + MONITOR_INDEX_IBUF_NON_LEAF_PAGE_READ, + MONITOR_UNDO_LOG_PAGE_READ, + MONITOR_INODE_PAGE_READ, + MONITOR_IBUF_FREELIST_PAGE_READ, + MONITOR_IBUF_BITMAP_PAGE_READ, + MONITOR_SYSTEM_PAGE_READ, + MONITOR_TRX_SYSTEM_PAGE_READ, + MONITOR_FSP_HDR_PAGE_READ, + MONITOR_XDES_PAGE_READ, + MONITOR_BLOB_PAGE_READ, + MONITOR_ZBLOB_PAGE_READ, + MONITOR_ZBLOB2_PAGE_READ, + MONITOR_OTHER_PAGE_READ, + MONITOR_INDEX_LEAF_PAGE_WRITTEN, + MONITOR_INDEX_NON_LEAF_PAGE_WRITTEN, + MONITOR_INDEX_IBUF_LEAF_PAGE_WRITTEN, + MONITOR_INDEX_IBUF_NON_LEAF_PAGE_WRITTEN, + MONITOR_UNDO_LOG_PAGE_WRITTEN, + MONITOR_INODE_PAGE_WRITTEN, + MONITOR_IBUF_FREELIST_PAGE_WRITTEN, + MONITOR_IBUF_BITMAP_PAGE_WRITTEN, + MONITOR_SYSTEM_PAGE_WRITTEN, + MONITOR_TRX_SYSTEM_PAGE_WRITTEN, + MONITOR_FSP_HDR_PAGE_WRITTEN, + MONITOR_XDES_PAGE_WRITTEN, + MONITOR_BLOB_PAGE_WRITTEN, + MONITOR_ZBLOB_PAGE_WRITTEN, + MONITOR_ZBLOB2_PAGE_WRITTEN, + MONITOR_OTHER_PAGE_WRITTEN, + + /* OS level counters (I/O) */ + MONITOR_MODULE_OS, + MONITOR_OVLD_OS_FILE_READ, + MONITOR_OVLD_OS_FILE_WRITE, + MONITOR_OVLD_OS_FSYNC, + MONITOR_OS_PENDING_READS, + MONITOR_OS_PENDING_WRITES, + MONITOR_OVLD_OS_LOG_WRITTEN, + + /* Transaction related counters */ + MONITOR_MODULE_TRX, + MONITOR_TRX_RW_COMMIT, + MONITOR_TRX_RO_COMMIT, + MONITOR_TRX_NL_RO_COMMIT, + MONITOR_TRX_COMMIT_UNDO, + MONITOR_TRX_ROLLBACK, + MONITOR_TRX_ROLLBACK_SAVEPOINT, + MONITOR_RSEG_HISTORY_LEN, + MONITOR_NUM_UNDO_SLOT_USED, + MONITOR_NUM_UNDO_SLOT_CACHED, + MONITOR_RSEG_CUR_SIZE, + + /* Purge related counters */ + MONITOR_MODULE_PURGE, + MONITOR_N_DEL_ROW_PURGE, + MONITOR_N_UPD_EXIST_EXTERN, + MONITOR_PURGE_INVOKED, + MONITOR_PURGE_N_PAGE_HANDLED, + MONITOR_DML_PURGE_DELAY, + MONITOR_PURGE_STOP_COUNT, + MONITOR_PURGE_RESUME_COUNT, + + /* Recovery related counters */ + MONITOR_MODULE_RECOVERY, + MONITOR_OVLD_CHECKPOINTS, + MONITOR_OVLD_LSN_FLUSHDISK, + MONITOR_OVLD_LSN_CHECKPOINT, + MONITOR_OVLD_LSN_CURRENT, + MONITOR_LSN_CHECKPOINT_AGE, + MONITOR_OVLD_BUF_OLDEST_LSN, + MONITOR_OVLD_MAX_AGE_ASYNC, + MONITOR_OVLD_LOG_WAITS, + MONITOR_OVLD_LOG_WRITE_REQUEST, + MONITOR_OVLD_LOG_WRITES, + + /* Page Manager related counters */ + MONITOR_MODULE_PAGE, + MONITOR_PAGE_COMPRESS, + MONITOR_PAGE_DECOMPRESS, + MONITOR_PAD_INCREMENTS, + MONITOR_PAD_DECREMENTS, + /* New monitor variables for page compression */ + MONITOR_OVLD_PAGE_COMPRESS_SAVED, + MONITOR_OVLD_PAGES_PAGE_COMPRESSED, + MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP, + MONITOR_OVLD_PAGES_PAGE_DECOMPRESSED, + MONITOR_OVLD_PAGES_PAGE_COMPRESSION_ERROR, + + /* New monitor variables for page encryption */ + MONITOR_OVLD_PAGES_ENCRYPTED, + MONITOR_OVLD_PAGES_DECRYPTED, + + /* Index related counters */ + MONITOR_MODULE_INDEX, + MONITOR_INDEX_SPLIT, + MONITOR_INDEX_MERGE_ATTEMPTS, + MONITOR_INDEX_MERGE_SUCCESSFUL, + MONITOR_INDEX_REORG_ATTEMPTS, + MONITOR_INDEX_REORG_SUCCESSFUL, + MONITOR_INDEX_DISCARD, + +#ifdef BTR_CUR_HASH_ADAPT + /* Adaptive Hash Index related counters */ + MONITOR_MODULE_ADAPTIVE_HASH, + MONITOR_OVLD_ADAPTIVE_HASH_SEARCH, + MONITOR_OVLD_ADAPTIVE_HASH_SEARCH_BTREE, + MONITOR_ADAPTIVE_HASH_PAGE_ADDED, + MONITOR_ADAPTIVE_HASH_PAGE_REMOVED, + MONITOR_ADAPTIVE_HASH_ROW_ADDED, + MONITOR_ADAPTIVE_HASH_ROW_REMOVED, + MONITOR_ADAPTIVE_HASH_ROW_REMOVE_NOT_FOUND, + MONITOR_ADAPTIVE_HASH_ROW_UPDATED, +#endif /* BTR_CUR_HASH_ADAPT */ + + /* Tablespace related counters */ + MONITOR_MODULE_FIL_SYSTEM, + 
MONITOR_OVLD_N_FILE_OPENED,
+
+	/* InnoDB Change Buffer related counters */
+	MONITOR_MODULE_IBUF_SYSTEM,
+	MONITOR_OVLD_IBUF_MERGE_INSERT,
+	MONITOR_OVLD_IBUF_MERGE_DELETE,
+	MONITOR_OVLD_IBUF_MERGE_PURGE,
+	MONITOR_OVLD_IBUF_MERGE_DISCARD_INSERT,
+	MONITOR_OVLD_IBUF_MERGE_DISCARD_DELETE,
+	MONITOR_OVLD_IBUF_MERGE_DISCARD_PURGE,
+	MONITOR_OVLD_IBUF_MERGES,
+	MONITOR_OVLD_IBUF_SIZE,
+
+	/* Counters for server operations */
+	MONITOR_MODULE_SERVER,
+	MONITOR_MASTER_THREAD_SLEEP,
+	MONITOR_OVLD_SERVER_ACTIVITY,
+	MONITOR_MASTER_ACTIVE_LOOPS,
+	MONITOR_MASTER_IDLE_LOOPS,
+	MONITOR_SRV_LOG_FLUSH_MICROSECOND,
+	MONITOR_SRV_DICT_LRU_MICROSECOND,
+	MONITOR_SRV_DICT_LRU_EVICT_COUNT_ACTIVE,
+	MONITOR_SRV_DICT_LRU_EVICT_COUNT_IDLE,
+	MONITOR_OVLD_SRV_DBLWR_WRITES,
+	MONITOR_OVLD_SRV_DBLWR_PAGES_WRITTEN,
+	MONITOR_OVLD_SRV_PAGE_SIZE,
+
+	/* Data DDL related counters */
+	MONITOR_MODULE_DDL_STATS,
+	MONITOR_BACKGROUND_DROP_INDEX,
+	MONITOR_ONLINE_CREATE_INDEX,
+	MONITOR_PENDING_ALTER_TABLE,
+	MONITOR_ALTER_TABLE_SORT_FILES,
+	MONITOR_ALTER_TABLE_LOG_FILES,
+
+	MONITOR_MODULE_ICP,
+	MONITOR_ICP_ATTEMPTS,
+	MONITOR_ICP_NO_MATCH,
+	MONITOR_ICP_OUT_OF_RANGE,
+	MONITOR_ICP_MATCH,
+
+	/* This is used only for control system to turn
+	on/off and reset all monitor counters */
+	MONITOR_ALL_COUNTER,
+
+	/* This must be the last member */
+	NUM_MONITOR
+};
+
+/** This informs the monitor control system to turn
+on/off and reset monitor counters through wild card match */
+#define	MONITOR_WILDCARD_MATCH		(NUM_MONITOR + 1)
+
+/** Cannot find monitor counter with a specified name */
+#define	MONITOR_NO_MATCH		(NUM_MONITOR + 2)
+
+/** struct monitor_info describes the basic/static information
+about each monitor counter. */
+struct monitor_info_t {
+	const char*	monitor_name;	/*!< Monitor name */
+	const char*	monitor_module;	/*!< Sub Module the monitor
+					belongs to */
+	const char*	monitor_desc;	/*!< Brief desc of monitor counter */
+	monitor_type_t	monitor_type;	/*!< Type of Monitor Info */
+	monitor_id_t	monitor_related_id;/*!< Monitor ID of counter that
+					related to this monitor. This is
+					set when the monitor belongs to
+					a "monitor set" */
+	monitor_id_t	monitor_id;	/*!< Monitor ID as defined in enum
+					monitor_id_t */
+};
+
+/** Following are the "set_option" values allowed for
+srv_mon_process_existing_counter() and the other monitor control
+functions, to turn on/off/reset the monitor counters. */
+enum mon_option_t {
+	MONITOR_TURN_ON = 1,		/*!< Turn on the counter */
+	MONITOR_TURN_OFF,		/*!< Turn off the counter */
+	MONITOR_RESET_VALUE,		/*!< Reset current values */
+	MONITOR_RESET_ALL_VALUE,	/*!< Reset all values */
+	MONITOR_GET_VALUE		/*!< Option for
+					srv_mon_process_existing_counter()
+					function */
+};
+
+/** Number of bits in a ulint datatype */
+#define	NUM_BITS_ULINT	(sizeof(ulint) * CHAR_BIT)
+
+/** This "monitor_set_tbl" is a bitmap that records whether a particular
+monitor counter has been turned on or off */
+extern Atomic_relaxed<ulint>
+	monitor_set_tbl[(NUM_MONITOR + NUM_BITS_ULINT - 1) / NUM_BITS_ULINT];
+
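(Editorial aside: each counter occupies one bit of a ulint word, so on a
64-bit build counter id 70 lives in monitor_set_tbl[70 / 64] == [1], at bit
70 % 64 == 6. With the macros defined just below, for some monitor id "id":)

	MONITOR_ON(id);			/* word |= 1 << bit */
	if (MONITOR_IS_ON(id)) { /* ... */ }
	MONITOR_OFF(id);		/* word &= ~(1 << bit) */

+/** Macros to turn on/off the control bit in monitor_set_tbl for a monitor
+counter option.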
 */
+#define MONITOR_ON(monitor)	\
+	(monitor_set_tbl[unsigned(monitor) / NUM_BITS_ULINT].fetch_or(	\
+		(ulint(1) << (unsigned(monitor) % NUM_BITS_ULINT))))
+
+#define MONITOR_OFF(monitor)	\
+	(monitor_set_tbl[unsigned(monitor) / NUM_BITS_ULINT].fetch_and(	\
+		~(ulint(1) << (unsigned(monitor) % NUM_BITS_ULINT))))
+
+/** Check whether the requested monitor is turned on/off */
+#define MONITOR_IS_ON(monitor)	\
+	(monitor_set_tbl[unsigned(monitor) / NUM_BITS_ULINT] &	\
+	 (ulint(1) << (unsigned(monitor) % NUM_BITS_ULINT)))
+
+/** The actual monitor counter array that records each monitor counter
+value */
+extern monitor_value_t	 innodb_counter_value[NUM_MONITOR];
+
+/** Following are macro defines for basic monitor counter manipulations.
+Please note we do not provide any synchronization for these monitor
+operations due to performance consideration. Most counters can
+be placed under existing mutex protections in respective code
+module. */
+
+/** Macros to access various fields of a monitor counters */
+#define MONITOR_FIELD(monitor, field)			\
+	(innodb_counter_value[monitor].field)
+
+#define MONITOR_VALUE(monitor)				\
+	MONITOR_FIELD(monitor, mon_value)
+
+#define MONITOR_MAX_VALUE(monitor)			\
+	MONITOR_FIELD(monitor, mon_max_value)
+
+#define MONITOR_MIN_VALUE(monitor)			\
+	MONITOR_FIELD(monitor, mon_min_value)
+
+#define MONITOR_VALUE_RESET(monitor)			\
+	MONITOR_FIELD(monitor, mon_value_reset)
+
+#define MONITOR_MAX_VALUE_START(monitor)		\
+	MONITOR_FIELD(monitor, mon_max_value_start)
+
+#define MONITOR_MIN_VALUE_START(monitor)		\
+	MONITOR_FIELD(monitor, mon_min_value_start)
+
+#define MONITOR_LAST_VALUE(monitor)			\
+	MONITOR_FIELD(monitor, mon_last_value)
+
+#define MONITOR_START_VALUE(monitor)			\
+	MONITOR_FIELD(monitor, mon_start_value)
+
+#define MONITOR_VALUE_SINCE_START(monitor)		\
+	(MONITOR_VALUE(monitor) + MONITOR_VALUE_RESET(monitor))
+
+#define MONITOR_STATUS(monitor)				\
+	MONITOR_FIELD(monitor, mon_status)
+
+#define	MONITOR_SET_START(monitor)					\
+	do {								\
+		MONITOR_STATUS(monitor) = MONITOR_STARTED;		\
+		MONITOR_FIELD((monitor), mon_start_time) = time(NULL);	\
+	} while (0)
+
+#define	MONITOR_SET_OFF(monitor)					\
+	do {								\
+		MONITOR_STATUS(monitor) = MONITOR_STOPPED;		\
+		MONITOR_FIELD((monitor), mon_stop_time) = time(NULL);	\
+	} while (0)
+
+#define	MONITOR_INIT_ZERO_VALUE		0
+
+/** Max and min values are initialized when we first turn on the monitor
+counter, and set the MONITOR_STATUS. */
+#define MONITOR_MAX_MIN_NOT_INIT(monitor)				\
+	(MONITOR_STATUS(monitor) == MONITOR_INIT_ZERO_VALUE		\
+	 && MONITOR_MIN_VALUE(monitor) == MONITOR_INIT_ZERO_VALUE	\
+	 && MONITOR_MAX_VALUE(monitor) == MONITOR_INIT_ZERO_VALUE)
+
+#define MONITOR_INIT(monitor)						\
+	if (MONITOR_MAX_MIN_NOT_INIT(monitor)) {			\
+		MONITOR_MIN_VALUE(monitor) = MIN_RESERVED;		\
+		MONITOR_MIN_VALUE_START(monitor) = MIN_RESERVED;	\
+		MONITOR_MAX_VALUE(monitor) = MAX_RESERVED;		\
+		MONITOR_MAX_VALUE_START(monitor) = MAX_RESERVED;	\
+	}
+
+/** Macros to increment/decrement the counters. The normal
+monitor counter operation expects appropriate synchronization
+already exists. No additional mutex is necessary when operating
+on the counters */
+#define	MONITOR_INC(monitor)						\
+	if (MONITOR_IS_ON(monitor)) {					\
+		MONITOR_VALUE(monitor)++;				\
+		if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) {  \
+			MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	}
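(Editorial aside, not part of the upstream patch: a typical call site. The
counter ids are from monitor_id_t above, and the mutex that makes the plain
macros safe is assumed to be held by the caller.)

	MONITOR_INC(MONITOR_DEADLOCK);		/* +1 if the counter is on */
	MONITOR_INC_VALUE(MONITOR_RSEG_CUR_SIZE, 4); /* add 4; defined below */
	if (MONITOR_IS_ON(MONITOR_TABLE_OPEN)) {
		MONITOR_INC_NOCHECK(MONITOR_TABLE_OPEN); /* bit already checked */
	}

+/** Atomically increment a monitor counter.
+Use MONITOR_INC if appropriate mutex protection exists.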
+/** Atomically increment a monitor counter.
+Use MONITOR_INC if appropriate mutex protection exists.
+@param monitor	monitor to be incremented by 1
+@param enabled	whether the monitor is enabled */
+#define MONITOR_ATOMIC_INC_LOW(monitor, enabled)			\
+	if (enabled) {							\
+		ib_uint64_t	value;					\
+		value  = my_atomic_add64_explicit(			\
+			(int64*) &MONITOR_VALUE(monitor), 1,		\
+			MY_MEMORY_ORDER_RELAXED) + 1;			\
+		/* Note: This is not 100% accurate because of the	\
+		inherent race; we ignore it due to performance. */	\
+		if (value > (ib_uint64_t) MONITOR_MAX_VALUE(monitor)) {	\
+			MONITOR_MAX_VALUE(monitor) = value;		\
+		}							\
+	}
+
+/** Atomically decrement a monitor counter.
+Use MONITOR_DEC if appropriate mutex protection exists.
+@param monitor	monitor to be decremented by 1
+@param enabled	whether the monitor is enabled */
+#define MONITOR_ATOMIC_DEC_LOW(monitor, enabled)			\
+	if (enabled) {							\
+		ib_uint64_t	value;					\
+		value = my_atomic_add64_explicit(			\
+			(int64*) &MONITOR_VALUE(monitor), -1,		\
+			MY_MEMORY_ORDER_RELAXED) - 1;			\
+		/* Note: This is not 100% accurate because of the	\
+		inherent race; we ignore it due to performance. */	\
+		if (value < (ib_uint64_t) MONITOR_MIN_VALUE(monitor)) {	\
+			MONITOR_MIN_VALUE(monitor) = value;		\
+		}							\
+	}
+
+/** Atomically increment a monitor counter if it is enabled.
+Use MONITOR_INC if appropriate mutex protection exists.
+@param monitor	monitor to be incremented by 1 */
+#define MONITOR_ATOMIC_INC(monitor)					\
+	MONITOR_ATOMIC_INC_LOW(monitor, MONITOR_IS_ON(monitor))
+/** Atomically decrement a monitor counter if it is enabled.
+Use MONITOR_DEC if appropriate mutex protection exists.
+@param monitor	monitor to be decremented by 1 */
+#define MONITOR_ATOMIC_DEC(monitor)					\
+	MONITOR_ATOMIC_DEC_LOW(monitor, MONITOR_IS_ON(monitor))
+
+#define	MONITOR_DEC(monitor)						\
+	if (MONITOR_IS_ON(monitor)) {					\
+		MONITOR_VALUE(monitor)--;				\
+		if (MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) {  \
+			MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	}
+
+#ifdef HAVE_MEM_CHECK
+# define MONITOR_CHECK_DEFINED(value) do {		\
+	mon_type_t m __attribute__((unused))= value;	\
+	MEM_CHECK_DEFINED(&m, sizeof m);		\
+} while (0)
+#else /* HAVE_MEM_CHECK */
+# define MONITOR_CHECK_DEFINED(value) (void) 0
+#endif /* HAVE_MEM_CHECK */
+
+#define	MONITOR_INC_VALUE(monitor, value)				\
+	MONITOR_CHECK_DEFINED(value);					\
+	if (MONITOR_IS_ON(monitor)) {					\
+		MONITOR_VALUE(monitor) += (mon_type_t) (value);		\
+		if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) {  \
+			MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	}
+
+#define	MONITOR_DEC_VALUE(monitor, value)				\
+	MONITOR_CHECK_DEFINED(value);					\
+	if (MONITOR_IS_ON(monitor)) {					\
+		ut_ad(MONITOR_VALUE(monitor) >= (mon_type_t) (value));	\
+		MONITOR_VALUE(monitor) -= (mon_type_t) (value);		\
+		if (MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) {  \
+			MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	}
+
+/* Increment/decrement a counter without checking the monitor on/off bit,
+which could already have been checked as a module group */
+#define	MONITOR_INC_NOCHECK(monitor)					\
+	do {								\
+		MONITOR_VALUE(monitor)++;				\
+		if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) {  \
+			MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	} while (0)
+
+#define	MONITOR_DEC_NOCHECK(monitor)					\
+	do {								\
+		MONITOR_VALUE(monitor)--;				\
+		if (MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) {  \
+			MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	} while (0)
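+/* Editor's illustrative sketch (not part of the upstream header) of the
+pattern MONITOR_ATOMIC_INC_LOW above uses: the counter itself is updated with
+a relaxed atomic add, while the max-value bookkeeping is left racy on purpose,
+trading exactness for speed.  Standalone stand-in types, compiled out. */
+#if 0
+# include <atomic>
+# include <cstdint>
+static std::atomic<std::uint64_t> counter{0};
+static std::uint64_t counter_max = 0;	/* intentionally unsynchronized */
+
+static void racy_inc()
+{
+	std::uint64_t value =
+		counter.fetch_add(1, std::memory_order_relaxed) + 1;
+	/* two threads can interleave here, so counter_max may lag;
+	the monitor code accepts that inaccuracy */
+	if (value > counter_max) {
+		counter_max = value;
+	}
+}
+#endif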
+/** Directly set a monitor counter's value */
+#define	MONITOR_SET(monitor, value)					\
+	MONITOR_CHECK_DEFINED(value);					\
+	if (MONITOR_IS_ON(monitor)) {					\
+		MONITOR_VALUE(monitor) = (mon_type_t) (value);		\
+		if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) {  \
+			MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+		if (MONITOR_VALUE(monitor) < MONITOR_MIN_VALUE(monitor)) {  \
+			MONITOR_MIN_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	}
+
+/** Add the time difference between now and the input "value" (in
+microseconds) to the monitor counter
+@param monitor	monitor to update for the time difference
+@param value	the start time value; it is updated to the current time */
+#define	MONITOR_INC_TIME_IN_MICRO_SECS(monitor, value)			\
+	MONITOR_CHECK_DEFINED(value);					\
+	if (MONITOR_IS_ON(monitor)) {					\
+		uintmax_t	old_time = value;			\
+		value = microsecond_interval_timer();			\
+		MONITOR_VALUE(monitor) += (mon_type_t) (value - old_time);\
+	}
+
+/** This macro updates 3 counters in one call. However, it only checks the
+main/first monitor counter 'monitor', to see whether it is on or off, to
+decide whether to do the update.
+@param monitor		the main monitor counter to update. It accounts for
+			the accumulative value for the counter.
+@param monitor_n_calls	counter that counts the number of times this macro is
+			called
+@param monitor_per_call	counter that records the current and max value of
+			each incremental value
+@param value		incremental value to record this time */
+#define MONITOR_INC_VALUE_CUMULATIVE(					\
+		monitor, monitor_n_calls, monitor_per_call, value)	\
+	MONITOR_CHECK_DEFINED(value);					\
+	if (MONITOR_IS_ON(monitor)) {					\
+		MONITOR_VALUE(monitor_n_calls)++;			\
+		MONITOR_VALUE(monitor_per_call) = (mon_type_t) (value);	\
+		if (MONITOR_VALUE(monitor_per_call)			\
+		    > MONITOR_MAX_VALUE(monitor_per_call)) {		\
+			MONITOR_MAX_VALUE(monitor_per_call) =		\
+				 (mon_type_t) (value);			\
+		}							\
+		MONITOR_VALUE(monitor) += (mon_type_t) (value);		\
+		if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) {  \
+			MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	}
+
+/** Directly set a monitor counter's value, and if the value
+is monotonically increasing, only the max value needs to be updated */
+#define	MONITOR_SET_UPD_MAX_ONLY(monitor, value)			\
+	MONITOR_CHECK_DEFINED(value);					\
+	if (MONITOR_IS_ON(monitor)) {					\
+		MONITOR_VALUE(monitor) = (mon_type_t) (value);		\
+		if (MONITOR_VALUE(monitor) > MONITOR_MAX_VALUE(monitor)) {  \
+			MONITOR_MAX_VALUE(monitor) = MONITOR_VALUE(monitor);\
+		}							\
+	}
+
+/** Some values, such as the log sequence number, are monotonically
+increasing numbers and do not need to record max/min values */
+#define MONITOR_SET_SIMPLE(monitor, value)				\
+	MONITOR_CHECK_DEFINED(value);					\
+	if (MONITOR_IS_ON(monitor)) {					\
+		MONITOR_VALUE(monitor) = (mon_type_t) (value);		\
+	}
+
+/** Reset the monitor value and max/min values to zero. The reset
+operation is only conducted when the counter is turned off */
+#define MONITOR_RESET_ALL(monitor)					\
+	do {								\
+		MONITOR_VALUE(monitor) = MONITOR_INIT_ZERO_VALUE;	\
+		MONITOR_MAX_VALUE(monitor) = MAX_RESERVED;		\
+		MONITOR_MIN_VALUE(monitor) = MIN_RESERVED;		\
+		MONITOR_VALUE_RESET(monitor) = MONITOR_INIT_ZERO_VALUE;	\
+		MONITOR_MAX_VALUE_START(monitor) = MAX_RESERVED;	\
+		MONITOR_MIN_VALUE_START(monitor) = MIN_RESERVED;	\
+		MONITOR_LAST_VALUE(monitor) = MONITOR_INIT_ZERO_VALUE;	\
+		MONITOR_FIELD(monitor, mon_start_time) =		\
+					MONITOR_INIT_ZERO_VALUE;	\
+		MONITOR_FIELD(monitor, mon_stop_time) =			\
+					MONITOR_INIT_ZERO_VALUE;	\
+		MONITOR_FIELD(monitor, mon_reset_time) =		\
+					MONITOR_INIT_ZERO_VALUE;	\
+	} while (0)
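+/* Editor's illustrative sketch (not part of the upstream header): how the
+time-accounting macro above is meant to be driven.  Note that it overwrites
+its "value" argument with the current timer reading, so the same variable can
+be reused across iterations.  The surrounding function is hypothetical. */
+#if 0
+static void example_timed_work()
+{
+	uintmax_t t = microsecond_interval_timer();	/* start stamp */
+	/* ... do the work being measured ... */
+	MONITOR_INC_TIME_IN_MICRO_SECS(
+		MONITOR_SRV_LOG_FLUSH_MICROSECOND, t);
+	/* t now holds the new timer reading for the next interval */
+}
+#endif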
+/** The following four macros define the operations needed to fetch and
+consolidate information from existing system status variables. */
+
+/** Save the passed-in value to the mon_start_value field of the monitor
+counter */
+#define MONITOR_SAVE_START(monitor, value) do {				\
+	MONITOR_CHECK_DEFINED(value);					\
+	(MONITOR_START_VALUE(monitor) =					\
+		(mon_type_t) (value) - MONITOR_VALUE_RESET(monitor));	\
+	} while (0)
+
+/** Save the current counter value to the mon_last_value field of the
+monitor counter, and accumulate it into mon_start_value */
+#define MONITOR_SAVE_LAST(monitor)					\
+	do {								\
+		MONITOR_LAST_VALUE(monitor) = MONITOR_VALUE(monitor);	\
+		MONITOR_START_VALUE(monitor) += MONITOR_VALUE(monitor);	\
+	} while (0)
+
+/** Set the monitor value to the difference of value and mon_start_value,
+compensated by mon_last_value if an accumulated value is required. */
+#define MONITOR_SET_DIFF(monitor, value)				\
+	MONITOR_SET_UPD_MAX_ONLY(monitor, ((value)			\
+	- MONITOR_VALUE_RESET(monitor)					\
+	- MONITOR_FIELD(monitor, mon_start_value)			\
+	+ MONITOR_FIELD(monitor, mon_last_value)))
+
+/****************************************************************//**
+Get a monitor's monitor_info_t by its monitor id (index into the
+innodb_counter_info array)
+@return pointer to the corresponding monitor_info_t, or NULL if no such
+monitor */
+monitor_info_t*
+srv_mon_get_info(
+/*=============*/
+	monitor_id_t	monitor_id);	/*!< id index into the
+					innodb_counter_info array */
+/****************************************************************//**
+Get a monitor's name by its monitor id (index into the
+innodb_counter_info array)
+@return corresponding monitor name, or NULL if no such
+monitor */
+const char*
+srv_mon_get_name(
+/*=============*/
+	monitor_id_t	monitor_id);	/*!< id index into the
+					innodb_counter_info array */
+
+/****************************************************************//**
+Turn on/off/reset monitor counters in a module. If module_id
+is set to NUM_MONITOR, this applies to all monitor counters. */
+void
+srv_mon_set_module_control(
+/*=======================*/
+	monitor_id_t	module_id,	/*!< in: Module ID as in
+					monitor_counter_id. If it is
+					set to NUM_MONITOR, this means
+					we shall turn on all the counters */
+	mon_option_t	set_option);	/*!< in: Turn on/off/reset the
+					counter */
+/****************************************************************//**
+This function consolidates some existing server counters used
+by "system status variables". These existing system variables do not have
+a mechanism to start/stop and reset the counters, so we simulate these
+controls by remembering the corresponding counter values when the
+corresponding monitors are turned on/off/reset, and doing the appropriate
+mathematics to deduce the actual value. */
+void
+srv_mon_process_existing_counter(
+/*=============================*/
+	monitor_id_t	monitor_id,	/*!< in: the monitor's ID as in
+					monitor_counter_id */
+	mon_option_t	set_option);	/*!< in: Turn on/off/reset the
+					counter */
+/*************************************************************//**
+This function is used to calculate the maximum counter value
+since the start of the monitor counter
+@return max counter value since start. */
+UNIV_INLINE
+mon_type_t
+srv_mon_calc_max_since_start(
+/*=========================*/
+	monitor_id_t	monitor);	/*!< in: monitor id */
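+/* Editor's worked example (not part of the upstream header) of the
+MONITOR_SAVE_START / MONITOR_SET_DIFF bookkeeping above.  Suppose a status
+variable that only ever grows reads 1000 when its monitor is turned on, and
+1500 at the next poll.  Hypothetical standalone arithmetic: */
+#if 0
+# include <cassert>
+int main()
+{
+	long start_value = 0, last_value = 0, value_reset = 0;
+	/* MONITOR_SAVE_START at turn-on time, raw reading = 1000 */
+	start_value = 1000 - value_reset;	/* = 1000 */
+	/* MONITOR_SET_DIFF at poll time, raw reading = 1500 */
+	long shown = 1500 - value_reset - start_value + last_value;
+	assert(shown == 500);	/* only the delta since turn-on is reported */
+	return 0;
+}
+#endif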
+/*************************************************************//**
+This function is used to calculate the minimum counter value
+since the start of the monitor counter
+@return min counter value since start. */
+UNIV_INLINE
+mon_type_t
+srv_mon_calc_min_since_start(
+/*=========================*/
+	monitor_id_t	monitor);	/*!< in: monitor id */
+/*************************************************************//**
+Reset a monitor, creating a new base line with the current monitor
+value. This baseline is recorded by MONITOR_VALUE_RESET(monitor) */
+void
+srv_mon_reset(
+/*==========*/
+	monitor_id_t	monitor);	/*!< in: monitor id */
+/*************************************************************//**
+This function resets all values of a monitor counter */
+UNIV_INLINE
+void
+srv_mon_reset_all(
+/*==============*/
+	monitor_id_t	monitor);	/*!< in: monitor id */
+/*************************************************************//**
+Turn on monitor counters that are marked as default ON. */
+void
+srv_mon_default_on(void);
+/*====================*/
+
+#include "srv0mon.inl"
+
+#endif
diff --git a/storage/innobase/include/srv0mon.inl b/storage/innobase/include/srv0mon.inl
new file mode 100644
index 00000000..158345b2
--- /dev/null
+++ b/storage/innobase/include/srv0mon.inl
@@ -0,0 +1,113 @@
+/*****************************************************************************
+
+Copyright (c) 2010, 2013, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/srv0mon.inl
+Server monitoring system
+
+Created 1/20/2010 Jimmy Yang
+************************************************************************/
+
+/*************************************************************//**
+This function is used to calculate the maximum counter value
+since the start of the monitor counter
+@return max counter value since start. */
+UNIV_INLINE
+mon_type_t
+srv_mon_calc_max_since_start(
+/*=========================*/
+	monitor_id_t	monitor)	/*!< in: monitor id */
+{
+	if (MONITOR_MAX_VALUE_START(monitor) == MAX_RESERVED) {
+
+		/* MONITOR_MAX_VALUE_START has not yet been
+		initialized; the max value since start is the
+		max count in MONITOR_MAX_VALUE */
+		MONITOR_MAX_VALUE_START(monitor) =
+			MONITOR_MAX_VALUE(monitor);
+
+	} else if (MONITOR_MAX_VALUE(monitor) != MAX_RESERVED
+		   && (MONITOR_MAX_VALUE(monitor)
+		       + MONITOR_VALUE_RESET(monitor)
+		       > MONITOR_MAX_VALUE_START(monitor))) {
+
+		/* If the max value since reset (as specified
+		in MONITOR_MAX_VALUE) plus the reset value is
+		larger than MONITOR_MAX_VALUE_START, reset
+		MONITOR_MAX_VALUE_START to this new max value */
+		MONITOR_MAX_VALUE_START(monitor) =
+			 MONITOR_MAX_VALUE(monitor)
+			 + MONITOR_VALUE_RESET(monitor);
+	}
+
+	return(MONITOR_MAX_VALUE_START(monitor));
+}
+
+/*************************************************************//**
+This function is used to calculate the minimum counter value
+since the start of the monitor counter
+@return min counter value since start.
*/ +UNIV_INLINE +mon_type_t +srv_mon_calc_min_since_start( +/*=========================*/ + monitor_id_t monitor) /*!< in: monitor id */ +{ + if (MONITOR_MIN_VALUE_START(monitor) == MIN_RESERVED) { + + /* MONITOR_MIN_VALUE_START has not yet been + initialized, the min value since start is the + min count in MONITOR_MIN_VALUE */ + MONITOR_MIN_VALUE_START(monitor) = + MONITOR_MIN_VALUE(monitor); + + } else if (MONITOR_MIN_VALUE(monitor) != MIN_RESERVED + && (MONITOR_MIN_VALUE(monitor) + + MONITOR_VALUE_RESET(monitor) + < MONITOR_MIN_VALUE_START(monitor))) { + + /* If the min value since reset (as specified + in MONITOR_MIN_VALUE) plus the reset value is + less than MONITOR_MIN_VALUE_START, reset + MONITOR_MIN_VALUE_START to this new min value */ + MONITOR_MIN_VALUE_START(monitor) = + MONITOR_MIN_VALUE(monitor) + + MONITOR_VALUE_RESET(monitor); + } + + return(MONITOR_MIN_VALUE_START(monitor)); +} + +/*************************************************************//** +This function resets all values of a monitor counter */ +UNIV_INLINE +void +srv_mon_reset_all( +/*==============*/ + monitor_id_t monitor) /*!< in: monitor id */ +{ + /* Do not reset all counter values if monitor is still on. */ + if (MONITOR_IS_ON(monitor)) { + fprintf(stderr, "InnoDB: Cannot reset all values for" + " monitor counter %s while it is on. Please" + " turn it off and retry.\n", + srv_mon_get_name(monitor)); + } else { + MONITOR_RESET_ALL(monitor); + } +} diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h new file mode 100644 index 00000000..db846795 --- /dev/null +++ b/storage/innobase/include/srv0srv.h @@ -0,0 +1,715 @@ +/***************************************************************************** + +Copyright (c) 1995, 2017, Oracle and/or its affiliates. All rights reserved. +Copyright (c) 2008, 2009, Google Inc. +Copyright (c) 2009, Percona Inc. +Copyright (c) 2013, 2023, MariaDB Corporation. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +Portions of this file contain modifications contributed and copyrighted +by Percona Inc.. Those modifications are +gratefully acknowledged and are described briefly in the InnoDB +documentation. The contributions by Percona Inc. are incorporated with +their permission, and subject to the conditions contained in the file +COPYING.Percona. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/srv0srv.h
+The server main program
+
+Created 10/10/1995 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "log0log.h"
+#include "que0types.h"
+#include "trx0types.h"
+#include "fil0fil.h"
+#include "ut0counter.h"
+
+#include "mysql/psi/mysql_stage.h"
+#include "mysql/psi/psi.h"
+#include <tpool.h>
+#include <memory>
+
+/** Simple non-atomic counter
+@tparam Type  the integer type of the counter */
+template <typename Type>
+struct alignas(CPU_LEVEL1_DCACHE_LINESIZE) simple_counter
+{
+  /** Increment the counter */
+  Type inc() { return add(1); }
+  /** Decrement the counter */
+  Type dec() { return add(Type(~0)); }
+
+  /** Add to the counter
+  @param i  amount to be added
+  @return the value of the counter after adding */
+  Type add(Type i) { return m_counter += i; }
+
+  /** @return the value of the counter */
+  operator Type() const { return m_counter; }
+
+private:
+  /** The counter */
+  Type m_counter;
+};
+
+/** Global counters used inside InnoDB. */
+struct srv_stats_t
+{
+	typedef ib_counter_t<ulint> ulint_ctr_n_t;
+	typedef simple_counter<lsn_t> lsn_ctr_1_t;
+	typedef simple_counter<ulint> ulint_ctr_1_t;
+	typedef simple_counter<int64_t> int64_ctr_1_t;
+
+	/** Count the amount of data written in total (in bytes) */
+	ulint_ctr_1_t		data_written;
+	/** Number of bytes saved by page compression */
+	ulint_ctr_n_t		page_compression_saved;
+	/** Number of pages compressed with page compression */
+	ulint_ctr_n_t		pages_page_compressed;
+	/** Number of TRIM operations induced by page compression */
+	ulint_ctr_n_t		page_compressed_trim_op;
+	/** Number of pages decompressed with page compression */
+	ulint_ctr_n_t		pages_page_decompressed;
+	/** Number of page compression errors */
+	ulint_ctr_n_t		pages_page_compression_error;
+	/** Number of pages encrypted */
+	ulint_ctr_n_t		pages_encrypted;
+	/** Number of pages decrypted */
+	ulint_ctr_n_t		pages_decrypted;
+	/** Number of merge blocks encrypted */
+	ulint_ctr_n_t		n_merge_blocks_encrypted;
+	/** Number of merge blocks decrypted */
+	ulint_ctr_n_t		n_merge_blocks_decrypted;
+	/** Number of row log blocks encrypted */
+	ulint_ctr_n_t		n_rowlog_blocks_encrypted;
+	/** Number of row log blocks decrypted */
+	ulint_ctr_n_t		n_rowlog_blocks_decrypted;
+
+	/** Number of data bytes read in total */
+	ulint_ctr_1_t		data_read;
+
+	/** Number of encryption_get_latest_key_version calls */
+	ulint_ctr_n_t		n_key_requests;
+
+	/** Number of temporary tablespace blocks encrypted */
+	ulint_ctr_n_t		n_temp_blocks_encrypted;
+
+	/** Number of temporary tablespace blocks decrypted */
+	ulint_ctr_n_t		n_temp_blocks_decrypted;
+};
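+/* Editor's illustrative sketch (not part of the upstream header): the
+simple_counter template above is a plain, cache-line-aligned accumulator with
+no atomicity; srv_stats_t pairs it with ib_counter_t where sharded counting
+is wanted.  A hypothetical standalone use: */
+#if 0
+static simple_counter<unsigned long> bytes_written;
+
+static void example_account_write(unsigned long n)
+{
+	bytes_written.add(n);	/* caller provides any needed locking */
+	unsigned long so_far = bytes_written;	/* operator Type() reads it */
+	(void) so_far;
+}
+#endif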
+/** We are prepared for a situation where we have this many threads waiting
+for a transactional lock inside InnoDB. srv_start() sets the value. */
+extern ulint	srv_max_n_threads;
+
+extern const char*	srv_main_thread_op_info;
+
+/** Prefix used by MySQL to indicate pre-5.1 table name encoding */
+extern const char	srv_mysql50_table_name_prefix[10];
+
+/** The buffer pool dump/load file name */
+#define SRV_BUF_DUMP_FILENAME_DEFAULT	"ib_buffer_pool"
+extern char*	srv_buf_dump_filename;
+
+/** Boolean config knobs that tell InnoDB to dump the buffer pool at shutdown
+and/or load it during startup. */
+extern char	srv_buffer_pool_dump_at_shutdown;
+extern char	srv_buffer_pool_load_at_startup;
+
+/* Whether to disable the file system cache, if one is defined */
+extern char	srv_disable_sort_file_cache;
+
+/* If the last data file is auto-extended, we add this many pages to it
+at a time */
+#define SRV_AUTO_EXTEND_INCREMENT	(srv_sys_space.get_autoextend_increment())
+
+/** Mutex protecting page_zip_stat_per_index */
+extern mysql_mutex_t	page_zip_stat_per_index_mutex;
+/** Mutex for locking srv_monitor_file */
+extern mysql_mutex_t	srv_monitor_file_mutex;
+/* Temporary file for innodb monitor output */
+extern FILE*	srv_monitor_file;
+/** Mutex for locking srv_misc_tmpfile */
+extern mysql_mutex_t	srv_misc_tmpfile_mutex;
+/* Temporary file for miscellaneous diagnostic output */
+extern FILE*	srv_misc_tmpfile;
+
+/* Server parameters which are read from the initfile */
+
+extern char*	srv_data_home;
+
+/** Set if InnoDB must operate in read-only mode. We don't do any
+recovery and open all tables in RO mode instead of RW mode. We don't
+sync the max trx id to disk either. */
+extern my_bool	srv_read_only_mode;
+/** Set if InnoDB operates in read-only mode or innodb-force-recovery
+is greater than SRV_FORCE_NO_IBUF_MERGE. */
+extern my_bool	high_level_read_only;
+/** Store each table created by a user in its own file; data
+dictionary tables are in the system tablespace 0 */
+extern my_bool	srv_file_per_table;
+
+/** Sort buffer size in index creation */
+extern ulong	srv_sort_buf_size;
+/** Maximum modification log file size for online index creation */
+extern unsigned long long	srv_online_max_size;
+
+/* If this flag is TRUE, then we will use the native aio of the
+OS (provided we compiled Innobase with it in), otherwise we will
+use simulated aio.
+Currently we support native aio on windows and linux */
+extern my_bool	srv_use_native_aio;
+extern my_bool	srv_numa_interleave;
+
+/* Use atomic writes, i.e. disable the doublewrite buffer */
+extern my_bool	srv_use_atomic_writes;
+
+/* Compression algorithm */
+extern ulong	innodb_compression_algorithm;
+
+/** TRUE if the server was successfully started */
+extern bool	srv_was_started;
+
+/** Server undo tablespaces directory, can be an absolute path. */
+extern char*	srv_undo_dir;
+
+/** Number of undo tablespaces to use. */
+extern uint	srv_undo_tablespaces;
+
+/** The number of UNDO tablespaces that are active (hosting some rollback
+segment). It is quite possible that some of the tablespaces do not host
+any rollback segment, depending on the configuration used. */
+extern uint32_t	srv_undo_tablespaces_active;
+
+/** Maximum size of an undo tablespace. */
+extern unsigned long long	srv_max_undo_log_size;
+
+extern uint	srv_n_fil_crypt_threads;
+extern uint	srv_n_fil_crypt_threads_started;
+
+/** Rate at which UNDO records should be purged. */
+extern ulong	srv_purge_rseg_truncate_frequency;
+
+/** Enable or Disable Truncate of UNDO tablespace. */
+extern my_bool	srv_undo_log_truncate;
+
+/** Default size of UNDO tablespace (10MiB for innodb_page_size=16k) */
+constexpr ulint SRV_UNDO_TABLESPACE_SIZE_IN_PAGES= (10U << 20) /
+  UNIV_PAGE_SIZE_DEF;
+
+extern char*	srv_log_group_home_dir;
+
+/** The InnoDB redo log file size, or 0 when changing the redo log format
+at startup (while disallowing writes to the redo log).
+*/
+extern ulonglong	srv_log_file_size;
+extern ulong	srv_flush_log_at_trx_commit;
+extern uint	srv_flush_log_at_timeout;
+extern my_bool	srv_adaptive_flushing;
+extern my_bool	srv_flush_sync;
+
+/** Requested size in bytes */
+extern ulint		srv_buf_pool_size;
+/** Requested buffer pool chunk size */
+extern size_t		srv_buf_pool_chunk_unit;
+/** Scan depth for LRU flush batch, i.e.: number of blocks scanned */
+extern ulong	srv_LRU_scan_depth;
+/** Whether or not to flush neighbors of a block */
+extern ulong	srv_flush_neighbors;
+/** Previously requested size */
+extern ulint	srv_buf_pool_old_size;
+/** Current size as scaling factor for the other components */
+extern ulint	srv_buf_pool_base_size;
+/** Current size in bytes */
+extern ulint	srv_buf_pool_curr_size;
+/** Dump this % of each buffer pool during BP dump */
+extern ulong	srv_buf_pool_dump_pct;
+#ifdef UNIV_DEBUG
+/** Abort load after this amount of pages */
+extern ulong	srv_buf_pool_load_pages_abort;
+#endif
+/** Lock table size in bytes */
+extern ulint	srv_lock_table_size;
+
+/** the value of innodb_checksum_algorithm */
+extern ulong	srv_checksum_algorithm;
+extern my_bool	srv_random_read_ahead;
+extern ulong	srv_read_ahead_threshold;
+extern uint	srv_n_read_io_threads;
+extern uint	srv_n_write_io_threads;
+
+/* Defragmentation. Originally Facebook's default value was 100, but
+that is too high */
+#define SRV_DEFRAGMENT_FREQUENCY_DEFAULT 40
+extern my_bool	srv_defragment;
+extern uint	srv_defragment_n_pages;
+extern uint	srv_defragment_stats_accuracy;
+extern uint	srv_defragment_fill_factor_n_recs;
+extern double	srv_defragment_fill_factor;
+extern uint	srv_defragment_frequency;
+extern ulonglong	srv_defragment_interval;
+
+extern uint	srv_change_buffer_max_size;
+
+/* Number of IO operations per second the server can do */
+extern ulong	srv_io_capacity;
+
+/* We use this dummy default value at startup for max_io_capacity.
+The real value is set based on the value of io_capacity. */
+#define SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT	(~0UL)
+#define SRV_MAX_IO_CAPACITY_LIMIT		(~0UL)
+extern ulong	srv_max_io_capacity;
+
+/* The "innodb_stats_method" setting decides how InnoDB is going
+to treat NULL values when collecting statistics. It is not defined
+as an enum type because the configure option takes an unsigned integer type. */
+extern ulong	srv_innodb_stats_method;
+
+extern ulint	srv_max_n_open_files;
+
+extern double	srv_max_buf_pool_modified_pct;
+extern double	srv_max_dirty_pages_pct_lwm;
+
+extern double	srv_adaptive_flushing_lwm;
+extern ulong	srv_flushing_avg_loops;
+
+extern ulong	srv_force_recovery;
+
+/** innodb_fast_shutdown=1 skips purge and change buffer merge.
+innodb_fast_shutdown=2 effectively crashes the server (no log checkpoint).
+innodb_fast_shutdown=3 is a clean shutdown that skips the rollback
+of active transactions (to be done on restart).
+*/
+extern uint	srv_fast_shutdown;
+
+extern ibool	srv_innodb_status;
+
+extern unsigned long long	srv_stats_transient_sample_pages;
+extern my_bool	srv_stats_persistent;
+extern unsigned long long	srv_stats_persistent_sample_pages;
+extern my_bool	srv_stats_auto_recalc;
+extern my_bool	srv_stats_include_delete_marked;
+extern unsigned long long	srv_stats_modified_counter;
+extern my_bool	srv_stats_sample_traditional;
+
+extern my_bool	srv_use_doublewrite_buf;
+
+extern my_bool	srv_force_primary_key;
+
+extern ulong	srv_max_purge_lag;
+extern ulong	srv_max_purge_lag_delay;
+
+extern my_bool	innodb_encrypt_temporary_tables;
+
+extern my_bool	srv_immediate_scrub_data_uncompressed;
+/*-------------------------------------------*/
+
+/** Modes of operation */
+enum srv_operation_mode {
+	/** Normal mode (MariaDB Server) */
+	SRV_OPERATION_NORMAL,
+	/** Mariabackup is executing the server to export already restored
+	tablespaces */
+	SRV_OPERATION_EXPORT_RESTORED,
+	/** Mariabackup taking a backup */
+	SRV_OPERATION_BACKUP,
+	/** Mariabackup restoring a backup for subsequent --copy-back */
+	SRV_OPERATION_RESTORE,
+	/** Mariabackup restoring the incremental part of a backup */
+	SRV_OPERATION_RESTORE_DELTA,
+	/** Mariabackup restoring a backup for subsequent --export */
+	SRV_OPERATION_RESTORE_EXPORT,
+	/** Mariabackup taking a backup and avoiding deferring
+	any tablespace */
+	SRV_OPERATION_BACKUP_NO_DEFER
+};
+
+/** Current mode of operation */
+extern enum srv_operation_mode srv_operation;
+
+/** whether this is the server's first start after mariabackup --prepare */
+extern bool srv_start_after_restore;
+
+extern my_bool	srv_print_innodb_monitor;
+extern my_bool	srv_print_innodb_lock_monitor;
+extern ibool	srv_print_verbose_log;
+
+extern bool	srv_monitor_active;
+
+
+extern ulong	srv_n_spin_wait_rounds;
+extern uint	srv_spin_wait_delay;
+
+/** Number of initialized rollback segments for persistent undo log */
+extern ulong	srv_available_undo_logs;
+/** Iterations of the loop bounded by the 'srv_active' label. */
+extern ulint	srv_main_active_loops;
+/** Iterations of the loop bounded by the 'srv_idle' label. */
+extern ulint	srv_main_idle_loops;
+/** Log writes involving flush.
+*/
+extern ulint srv_log_writes_and_flush;
+
+#ifdef UNIV_DEBUG
+extern my_bool	innodb_evict_tables_on_commit_debug;
+extern my_bool	srv_purge_view_update_only_debug;
+
+/** InnoDB system tablespace size to set during recovery */
+extern uint	srv_sys_space_size_debug;
+/** whether the redo log file has been created at startup */
+extern bool	srv_log_file_created;
+#endif /* UNIV_DEBUG */
+
+extern ulint	srv_dml_needed_delay;
+
+/** innodb_purge_threads; the number of purge tasks to use */
+extern uint srv_n_purge_threads;
+
+/* the number of pages to purge in one batch */
+extern ulong srv_purge_batch_size;
+
+/* print all user-level transaction deadlocks to mysqld stderr */
+extern my_bool srv_print_all_deadlocks;
+
+extern my_bool	srv_cmp_per_index_enabled;
+
+/** innodb_encrypt_log */
+extern my_bool	srv_encrypt_log;
+
+/* is encryption enabled */
+extern ulong	srv_encrypt_tables;
+
+
+/** Status variables to be passed to MySQL */
+extern struct export_var_t export_vars;
+
+/** Global counters */
+extern srv_stats_t	srv_stats;
+
+/** Fatal semaphore wait threshold = maximum number of seconds that a
+semaphore wait may last before InnoDB considers it a fatal timeout */
+#define DEFAULT_SRV_FATAL_SEMAPHORE_TIMEOUT 600
+extern ulong	srv_fatal_semaphore_wait_threshold;
+
+/** Buffer pool dump status frequency in percentages */
+extern ulong	srv_buf_dump_status_frequency;
+
+# ifdef UNIV_PFS_THREAD
+extern mysql_pfs_key_t	page_cleaner_thread_key;
+extern mysql_pfs_key_t	trx_rollback_clean_thread_key;
+extern mysql_pfs_key_t	thread_pool_thread_key;
+
+/* This macro registers the current thread and its key with the performance
+schema */
+# define pfs_register_thread(key)			\
+do {							\
+	struct PSI_thread* psi __attribute__((unused))	\
+		= PSI_CALL_new_thread(key, NULL, 0);	\
+	PSI_CALL_set_thread_os_id(psi);			\
+	PSI_CALL_set_thread(psi);			\
+} while (0)
+
+/* This macro delists the current thread from the performance schema */
+# define pfs_delete_thread()				\
+do {							\
+	PSI_CALL_delete_current_thread();		\
+} while (0)
+# else
+# define pfs_register_thread(key)
+# define pfs_delete_thread()
+# endif /* UNIV_PFS_THREAD */
+
+#ifdef HAVE_PSI_STAGE_INTERFACE
+/** Performance schema stage event for monitoring ALTER TABLE progress
+in ha_innobase::commit_inplace_alter_table(). */
+extern PSI_stage_info	srv_stage_alter_table_end;
+
+/** Performance schema stage event for monitoring ALTER TABLE progress
+in row_merge_insert_index_tuples(). */
+extern PSI_stage_info	srv_stage_alter_table_insert;
+
+/** Performance schema stage event for monitoring ALTER TABLE progress
+in row_log_apply(). */
+extern PSI_stage_info	srv_stage_alter_table_log_index;
+
+/** Performance schema stage event for monitoring ALTER TABLE progress
+in row_log_table_apply(). */
+extern PSI_stage_info	srv_stage_alter_table_log_table;
+
+/** Performance schema stage event for monitoring ALTER TABLE progress
+in row_merge_sort(). */
+extern PSI_stage_info	srv_stage_alter_table_merge_sort;
+
+/** Performance schema stage event for monitoring ALTER TABLE progress
+in row_merge_read_clustered_index(). */
+extern PSI_stage_info	srv_stage_alter_table_read_pk_internal_sort;
+
+/** Performance schema stage event for monitoring buffer pool load progress. */
+extern PSI_stage_info	srv_stage_buffer_pool_load;
+#endif /* HAVE_PSI_STAGE_INTERFACE */
+
+/** Alternatives for srv_force_recovery. Non-zero values are intended
+to help the user get a damaged database up so that intact tables
+and rows can be dumped with SELECT INTO OUTFILE. The database must not
+otherwise be used with these options!
A bigger number below means that all precautions +of lower numbers are included. */ +enum { + SRV_FORCE_IGNORE_CORRUPT = 1, /*!< let the server run even if it + detects a corrupt page */ + SRV_FORCE_NO_BACKGROUND = 2, /*!< prevent the main thread from + running: if a crash would occur + in purge, this prevents it */ + SRV_FORCE_NO_TRX_UNDO = 3, /*!< do not run DML rollback after + recovery */ + SRV_FORCE_NO_DDL_UNDO = 4, /*!< prevent also DDL rollback */ + SRV_FORCE_NO_UNDO_LOG_SCAN = 5, /*!< do not look at undo logs when + starting the database: InnoDB will + treat even incomplete transactions + as committed */ + SRV_FORCE_NO_LOG_REDO = 6 /*!< do not do the log roll-forward + in connection with recovery */ +}; + +/* Alternatives for srv_innodb_stats_method, which could be changed by +setting innodb_stats_method */ +enum srv_stats_method_name_enum { + SRV_STATS_NULLS_EQUAL, /* All NULL values are treated as + equal. This is the default setting + for innodb_stats_method */ + SRV_STATS_NULLS_UNEQUAL, /* All NULL values are treated as + NOT equal. */ + SRV_STATS_NULLS_IGNORED /* NULL values are ignored */ +}; + +typedef enum srv_stats_method_name_enum srv_stats_method_name_t; + +/*********************************************************************//** +Boots Innobase server. */ +void +srv_boot(void); +/*==========*/ +/*********************************************************************//** +Frees the data structures created in srv_init(). */ +void +srv_free(void); + +/******************************************************************//** +Outputs to a file the output of the InnoDB Monitor. +@return FALSE if not all information printed +due to failure to obtain necessary mutex */ +ibool +srv_printf_innodb_monitor( +/*======================*/ + FILE* file, /*!< in: output stream */ + ibool nowait, /*!< in: whether to wait for lock_sys.latch */ + ulint* trx_start, /*!< out: file position of the start of + the list of active transactions */ + ulint* trx_end); /*!< out: file position of the end of + the list of active transactions */ + +/******************************************************************//** +Function to pass InnoDB status variables to MySQL */ +void +srv_export_innodb_status(void); +/*==========================*/ +/*******************************************************************//** +Get current server activity count. +@return activity count. */ +ulint +srv_get_activity_count(void); +/*========================*/ + +/******************************************************************//** +Increment the server activity counter. */ +void +srv_inc_activity_count(void); +/*=========================*/ + +/**********************************************************************//** +Enqueues a task to server task queue and releases a worker thread, if there +is a suspended one. */ +void +srv_que_task_enqueue_low( +/*=====================*/ + que_thr_t* thr); /*!< in: query thread */ + +#ifdef UNIV_DEBUG +/** @return whether purge or master task is active */ +bool srv_any_background_activity(); +#endif + +extern "C" { + + +/** Periodic task which prints the info output by various InnoDB monitors.*/ +void srv_monitor_task(void*); + + +/** The periodic master task controlling the server. */ +void srv_master_callback(void*); + + +/** +Complete the shutdown tasks such as background DROP TABLE, +and optionally change buffer merge (on innodb_fast_shutdown=0). 
+*/
+void srv_shutdown(bool ibuf_merge);
+
+} /* extern "C" */
+
+#ifdef UNIV_DEBUG
+/** @return number of tasks in queue */
+ulint srv_get_task_queue_length();
+#endif
+
+/** Shut down the purge threads. */
+void srv_purge_shutdown();
+
+/** Init purge tasks */
+void srv_init_purge_tasks();
+
+/** Status variables to be passed to MySQL */
+struct export_var_t{
+#ifdef BTR_CUR_HASH_ADAPT
+	ulint innodb_ahi_hit;
+	ulint innodb_ahi_miss;
+#endif /* BTR_CUR_HASH_ADAPT */
+	char  innodb_buffer_pool_dump_status[OS_FILE_MAX_PATH + 128];/*!< Buf pool dump status */
+	char  innodb_buffer_pool_load_status[OS_FILE_MAX_PATH + 128];/*!< Buf pool load status */
+	char  innodb_buffer_pool_resize_status[512];/*!< Buf pool resize status */
+	my_bool innodb_buffer_pool_load_incomplete;/*!< Buf pool load incomplete */
+	ulint innodb_buffer_pool_pages_total;	/*!< Buffer pool size */
+	ulint innodb_buffer_pool_bytes_data;	/*!< File bytes used */
+	ulint innodb_buffer_pool_pages_misc;	/*!< Miscellaneous pages */
+#ifdef UNIV_DEBUG
+	ulint innodb_buffer_pool_pages_latched;	/*!< Latched pages */
+#endif /* UNIV_DEBUG */
+	/** buf_pool.stat.n_page_gets (a sharded counter) */
+	ulint innodb_buffer_pool_read_requests;
+	ulint innodb_checkpoint_age;
+	ulint innodb_checkpoint_max_age;
+	ulint innodb_data_pending_reads;	/*!< Pending reads */
+	ulint innodb_data_pending_writes;	/*!< Pending writes */
+	ulint innodb_data_read;			/*!< Data bytes read */
+	ulint innodb_data_writes;		/*!< I/O write requests */
+	ulint innodb_data_written;		/*!< Data bytes written */
+	ulint innodb_data_reads;		/*!< I/O read requests */
+	ulint innodb_dblwr_pages_written;	/*!< srv_dblwr_pages_written */
+	ulint innodb_dblwr_writes;		/*!< srv_dblwr_writes */
+	ulint innodb_deadlocks;
+	ulint innodb_history_list_length;
+	lsn_t innodb_lsn_current;
+	lsn_t innodb_lsn_flushed;
+	lsn_t innodb_lsn_last_checkpoint;
+	trx_id_t innodb_max_trx_id;
+#ifdef BTR_CUR_HASH_ADAPT
+	ulint innodb_mem_adaptive_hash;
+#endif
+	ulint innodb_mem_dictionary;
+	/** log_sys.get_lsn() - recv_sys.lsn */
+	lsn_t innodb_os_log_written;
+	ulint innodb_row_lock_waits;		/*!< srv_n_lock_wait_count */
+	ulint innodb_row_lock_current_waits;	/*!< srv_n_lock_wait_current_count */
+	int64_t innodb_row_lock_time;		/*!< srv_n_lock_wait_time
+						/ 1000 */
+	uint64_t innodb_row_lock_time_avg;	/*!< srv_n_lock_wait_time
+						/ srv_n_lock_wait_count */
+	uint64_t innodb_row_lock_time_max;	/*!< srv_n_lock_max_wait_time */
+
+	/** Number of undo tablespace truncation operations */
+	ulong innodb_undo_truncations;
+	ulint innodb_defragment_compression_failures; /*!< Number of
+						defragment re-compression
+						failures */
+
+	ulint innodb_defragment_failures;	/*!< Number of defragment
+						failures */
+	ulint innodb_defragment_count;		/*!< Number of defragment
+						operations */
+
+	/** Number of instant ALTER TABLE operations that affect columns */
+	ulong innodb_instant_alter_column;
+
+	ulint innodb_onlineddl_rowlog_rows;	/*!< Online alter rows */
+	ulint innodb_onlineddl_rowlog_pct_used;	/*!< Online alter percentage
+						of used row log buffer */
+	ulint innodb_onlineddl_pct_progress;	/*!< Online alter progress */
+
+	int64_t innodb_page_compression_saved;/*!< Number of bytes saved
+						by page compression */
+	int64_t innodb_pages_page_compressed;/*!< Number of pages
+						compressed by page compression */
+	int64_t innodb_page_compressed_trim_op;/*!< Number of TRIM operations
+						induced by page compression */
+	int64_t innodb_pages_page_decompressed;/*!< Number of pages
+						decompressed by page
+						compression */
+	int64_t innodb_pages_page_compression_error;/*!< Number of page
+						compression errors */
+	int64_t innodb_pages_encrypted;	/*!< Number of pages
+						encrypted */
+	int64_t innodb_pages_decrypted;	/*!< Number of pages
+						decrypted */
+
+	/*!< Number of merge blocks encrypted */
+	ib_int64_t innodb_n_merge_blocks_encrypted;
+	/*!< Number of merge blocks decrypted */
+	ib_int64_t innodb_n_merge_blocks_decrypted;
+	/*!< Number of row log blocks encrypted */
+	ib_int64_t innodb_n_rowlog_blocks_encrypted;
+	/*!< Number of row log blocks decrypted */
+	ib_int64_t innodb_n_rowlog_blocks_decrypted;
+
+	/* Number of temporary tablespace pages encrypted */
+	ib_int64_t innodb_n_temp_blocks_encrypted;
+
+	/* Number of temporary tablespace pages decrypted */
+	ib_int64_t innodb_n_temp_blocks_decrypted;
+
+	ulint innodb_encryption_rotation_pages_read_from_cache;
+	ulint innodb_encryption_rotation_pages_read_from_disk;
+	ulint innodb_encryption_rotation_pages_modified;
+	ulint innodb_encryption_rotation_pages_flushed;
+	ulint innodb_encryption_rotation_estimated_iops;
+	int64_t innodb_encryption_key_requests;
+};
+
+extern tpool::thread_pool *srv_thread_pool;
+extern std::unique_ptr<tpool::timer> srv_master_timer;
+extern std::unique_ptr<tpool::timer> srv_monitor_timer;
+
+/** The interval at which srv_monitor_task is invoked, in milliseconds */
+constexpr unsigned SRV_MONITOR_INTERVAL= 15000; /* 4 times per minute */
+
+static inline void srv_monitor_timer_schedule_now()
+{
+  srv_monitor_timer->set_time(0, SRV_MONITOR_INTERVAL);
+}
+static inline void srv_start_periodic_timer(std::unique_ptr<tpool::timer>& t,
+                                            void (*func)(void*), int period)
+{
+  t.reset(srv_thread_pool->create_timer(func));
+  t->set_time(0, period);
+}
+
+void srv_thread_pool_init();
+void srv_thread_pool_end();
diff --git a/storage/innobase/include/srv0start.h b/storage/innobase/include/srv0start.h
new file mode 100644
index 00000000..c18cf1ce
--- /dev/null
+++ b/storage/innobase/include/srv0start.h
@@ -0,0 +1,124 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/srv0start.h
+Starts the Innobase database server
+
+Created 10/10/1995 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "log0log.h"
+#include "ut0byte.h"
+
+// Forward declaration
+struct dict_table_t;
+
+/** Open the configured number of dedicated undo tablespaces.
+@param[in]	create_new_undo	whether the undo tablespaces have to be created
+@param[in,out]	mtr		mini-transaction
+@return DB_SUCCESS or error code */
+dberr_t srv_undo_tablespaces_init(bool create_new_undo, mtr_t *mtr);
+
+/** Start InnoDB.
+@param[in] create_new_db whether to create a new database +@return DB_SUCCESS or error code */ +dberr_t srv_start(bool create_new_db); + +/** + Shutdown purge to make sure that there is no possibility that we call any + plugin code (e.g., audit) inside virtual column computation. +*/ +void innodb_preshutdown(); + +/** Shut down InnoDB. */ +void innodb_shutdown(); + +/*************************************************************//** +Copy the file path component of the physical file to parameter. It will +copy up to and including the terminating path separator. +@return number of bytes copied or ULINT_UNDEFINED if destination buffer + is smaller than the path to be copied. */ +ulint +srv_path_copy( +/*==========*/ + char* dest, /*!< out: destination buffer */ + ulint dest_len, /*!< in: max bytes to copy */ + const char* basedir, /*!< in: base directory */ + const char* table_name) /*!< in: source table name */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Get the meta-data filename from the table name for a +single-table tablespace. +@param[in] table table object +@param[out] filename filename +@param[in] max_len filename max length */ +void +srv_get_meta_data_filename( + dict_table_t* table, + char* filename, + ulint max_len); + +/** Get the encryption-data filename from the table name for a +single-table tablespace. +@param[in] table table object +@param[out] filename filename +@param[in] max_len filename max length */ +void +srv_get_encryption_data_filename( + dict_table_t* table, + char* filename, + ulint max_len); + +/** Log sequence number at shutdown */ +extern lsn_t srv_shutdown_lsn; + +/** TRUE if the server is being started */ +extern bool srv_is_being_started; +/** TRUE if the server is being started, before rolling back any +incomplete transactions */ +extern bool srv_startup_is_before_trx_rollback_phase; + +/** TRUE if a raw partition is in use */ +extern ibool srv_start_raw_disk_in_use; + +/** Shutdown state */ +enum srv_shutdown_t { + SRV_SHUTDOWN_NONE = 0, /*!< Database running normally */ + /** Shutdown initiated in srv_shutdown_bg_undo_sources() */ + SRV_SHUTDOWN_INITIATED, + SRV_SHUTDOWN_CLEANUP, /*!< Cleaning up in + logs_empty_and_mark_files_at_shutdown() */ + SRV_SHUTDOWN_LAST_PHASE,/*!< Last phase after ensuring that + the buffer pool can be freed: flush + all file spaces and close all files */ + SRV_SHUTDOWN_EXIT_THREADS/*!< Exit all threads */ +}; + +/** Whether any undo log records can be generated */ +extern bool srv_undo_sources; + +/** At a shutdown this value climbs from SRV_SHUTDOWN_NONE to +SRV_SHUTDOWN_CLEANUP and then to SRV_SHUTDOWN_LAST_PHASE, and so on */ +extern enum srv_shutdown_t srv_shutdown_state; + +/** Files comprising the system tablespace */ +extern pfs_os_file_t files[1000]; diff --git a/storage/innobase/include/srw_lock.h b/storage/innobase/include/srw_lock.h new file mode 100644 index 00000000..1dca0cc1 --- /dev/null +++ b/storage/innobase/include/srw_lock.h @@ -0,0 +1,554 @@ +/***************************************************************************** + +Copyright (c) 2020, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#pragma once
+#include "univ.i"
+#include "rw_lock.h"
+
+#if defined __linux__
+/* futex(2): FUTEX_WAIT_PRIVATE, FUTEX_WAKE_PRIVATE */
+#elif defined __OpenBSD__ || defined __FreeBSD__ || defined __DragonFly__
+/* system calls similar to Linux futex(2) */
+#elif defined _WIN32
+/* SRWLOCK as well as WaitOnAddress(), WakeByAddressSingle() */
+#else
+# define SUX_LOCK_GENERIC /* fall back to generic synchronization primitives */
+#endif
+
+#if !defined SUX_LOCK_GENERIC && 0 /* defined SAFE_MUTEX */
+# define SUX_LOCK_GENERIC /* Use dummy implementation for debugging purposes */
+#endif
+
+#ifdef SUX_LOCK_GENERIC
+/** An exclusive-only variant of srw_lock */
+template<bool spinloop>
+class pthread_mutex_wrapper final
+{
+  pthread_mutex_t lock;
+public:
+  void init()
+  {
+    if (spinloop)
+      pthread_mutex_init(&lock, MY_MUTEX_INIT_FAST);
+    else
+      pthread_mutex_init(&lock, nullptr);
+  }
+  void destroy() { pthread_mutex_destroy(&lock); }
+# ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
+  void wr_lock() { pthread_mutex_lock(&lock); }
+# else
+private:
+  void wr_wait();
+public:
+  inline void wr_lock();
+# endif
+  void wr_unlock() { pthread_mutex_unlock(&lock); }
+  bool wr_lock_try() { return !pthread_mutex_trylock(&lock); }
+};
+
+# ifndef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
+template<> void pthread_mutex_wrapper<true>::wr_wait();
+template<>
+inline void pthread_mutex_wrapper<false>::wr_lock()
+{ pthread_mutex_lock(&lock); }
+template<>
+inline void pthread_mutex_wrapper<true>::wr_lock()
+{ if (!wr_lock_try()) wr_wait(); }
+# endif
+#endif
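+/* Editor's illustrative sketch (not part of the upstream header) of the
+lock-word protocol used by srw_mutex_impl below: bit 31 (HOLDER) marks the
+lock as held and the low bits count pending waiters, so one 32-bit word
+encodes both.  Hypothetical standalone walk-through: */
+#if 0
+# include <atomic>
+# include <cstdint>
+static std::atomic<std::uint32_t> word{0};
+static const std::uint32_t HOLDER = 1U << 31;
+
+static bool try_acquire()	/* mirrors wr_lock_try() */
+{
+  std::uint32_t expected = 0;
+  /* 0 -> HOLDER+1: acquired with no waiters; any other value means
+  somebody holds the lock or is queued behind it */
+  return word.compare_exchange_strong(expected, HOLDER + 1,
+                                      std::memory_order_acquire,
+                                      std::memory_order_relaxed);
+}
+#endif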
+/** Futex-based mutex */
+template<bool spinloop>
+class srw_mutex_impl final
+{
+  /** The lock word, containing HOLDER + 1 if the lock is being held,
+  plus the number of waiters */
+  std::atomic<uint32_t> lock;
+  /** Identifies that the lock is being held */
+  static constexpr uint32_t HOLDER= 1U << 31;
+
+#ifdef SUX_LOCK_GENERIC
+public:
+  /** The mutex for the condition variables. */
+  pthread_mutex_t mutex;
+private:
+  /** Condition variable for the lock word. Used with mutex. */
+  pthread_cond_t cond;
+#endif
+
+  /** Wait until the mutex has been acquired */
+  void wait_and_lock();
+  /** Wait for lock!=lk */
+  inline void wait(uint32_t lk);
+  /** Wake up one wait() thread */
+  void wake();
+public:
+  /** @return whether the mutex is being held or waited for */
+  bool is_locked_or_waiting() const
+  { return lock.load(std::memory_order_acquire) != 0; }
+  /** @return whether the mutex is being held by any thread */
+  bool is_locked() const
+  { return (lock.load(std::memory_order_acquire) & HOLDER) != 0; }
+
+  void init()
+  {
+    DBUG_ASSERT(!is_locked_or_waiting());
+#ifdef SUX_LOCK_GENERIC
+    pthread_mutex_init(&mutex, nullptr);
+    pthread_cond_init(&cond, nullptr);
+#endif
+  }
+  void destroy()
+  {
+    DBUG_ASSERT(!is_locked_or_waiting());
+#ifdef SUX_LOCK_GENERIC
+    pthread_mutex_destroy(&mutex);
+    pthread_cond_destroy(&cond);
+#endif
+  }
+
+  /** @return whether the mutex was acquired */
+  bool wr_lock_try()
+  {
+    uint32_t lk= 0;
+    return lock.compare_exchange_strong(lk, HOLDER + 1,
+                                        std::memory_order_acquire,
+                                        std::memory_order_relaxed);
+  }
+
+  void wr_lock() { if (!wr_lock_try()) wait_and_lock(); }
+  void wr_unlock()
+  {
+    const uint32_t lk= lock.fetch_sub(HOLDER + 1, std::memory_order_release);
+    if (lk != HOLDER + 1)
+    {
+      DBUG_ASSERT(lk & HOLDER);
+      wake();
+    }
+  }
+};
+
+#ifdef SUX_LOCK_GENERIC
+typedef pthread_mutex_wrapper<true> srw_spin_mutex;
+typedef pthread_mutex_wrapper<false> srw_mutex;
+#else
+typedef srw_mutex_impl<true> srw_spin_mutex;
+typedef srw_mutex_impl<false> srw_mutex;
+#endif
+
+template<bool spinloop> class srw_lock_impl;
+
+/** Slim shared-update-exclusive lock with no recursion */
+template<bool spinloop>
+class ssux_lock_impl final
+{
+#ifdef UNIV_PFS_RWLOCK
+  friend class ssux_lock;
+# ifdef SUX_LOCK_GENERIC
+# elif defined _WIN32
+# else
+  friend srw_lock_impl<spinloop>;
+# endif
+#endif
+  /** mutex for synchronization; held by U or X lock holders */
+  srw_mutex_impl<spinloop> writer;
+#ifdef SUX_LOCK_GENERIC
+  /** Condition variable for "readers"; used with writer.mutex. */
+  pthread_cond_t readers_cond;
+#endif
+  /** S or U holders, and WRITER flag for X holder or waiter */
+  std::atomic<uint32_t> readers;
+  /** indicates an X request; readers=WRITER indicates granted X lock */
+  static constexpr uint32_t WRITER= 1U << 31;
+
+  /** Wait for readers!=lk */
+  inline void wait(uint32_t lk);
+
+  /** Wait for readers!=lk|WRITER */
+  void wr_wait(uint32_t lk);
+  /** Wake up wait() on the last rd_unlock() */
+  void wake();
+  /** Acquire a read lock */
+  void rd_wait();
+public:
+  void init()
+  {
+    writer.init();
+    DBUG_ASSERT(is_vacant());
+#ifdef SUX_LOCK_GENERIC
+    pthread_cond_init(&readers_cond, nullptr);
+#endif
+  }
+  void destroy()
+  {
+    DBUG_ASSERT(is_vacant());
+    writer.destroy();
+#ifdef SUX_LOCK_GENERIC
+    pthread_cond_destroy(&readers_cond);
+#endif
+  }
+  /** @return whether any writer is waiting */
+  bool is_waiting() const
+  { return (readers.load(std::memory_order_relaxed) & WRITER) != 0; }
+#ifndef DBUG_OFF
+  /** @return whether the lock is being held or waited for */
+  bool is_vacant() const { return !is_locked_or_waiting(); }
+#endif /* !DBUG_OFF */
+
+  bool rd_lock_try()
+  {
+    uint32_t lk= 0;
+    while (!readers.compare_exchange_weak(lk, lk + 1,
+                                          std::memory_order_acquire,
+                                          std::memory_order_relaxed))
+      if (lk & WRITER)
+        return false;
+    return true;
+  }
+
+  bool u_lock_try()
+  {
+    if (!writer.wr_lock_try())
+      return false;
+    IF_DBUG_ASSERT(uint32_t lk=,)
+    readers.fetch_add(1, std::memory_order_acquire);
+    DBUG_ASSERT(lk < WRITER - 1);
+    return true;
+  }
+
+  bool wr_lock_try()
+  {
+    if (!writer.wr_lock_try())
+      return false;
+    uint32_t lk= 0;
+    if (readers.compare_exchange_strong(lk, WRITER,
+                                        std::memory_order_acquire,
+                                        std::memory_order_relaxed))
+      return true;
+    writer.wr_unlock();
+    return false;
+  }
+
+  void rd_lock() { if (!rd_lock_try()) rd_wait(); }
+  void u_lock()
+  {
+    writer.wr_lock();
+    IF_DBUG_ASSERT(uint32_t lk=,)
+    readers.fetch_add(1, std::memory_order_acquire);
+    DBUG_ASSERT(lk < WRITER - 1);
+  }
+  void wr_lock()
+  {
+    writer.wr_lock();
+#if defined __i386__||defined __x86_64__||defined _M_IX86||defined _M_X64
+    /* On IA-32 and AMD64, this type of fetch_or() can only be implemented
+    as a loop around LOCK CMPXCHG. In this particular case, setting the
+    most significant bit using fetch_add() is equivalent, and is
+    translated into a simple LOCK XADD. */
+    static_assert(WRITER == 1U << 31, "compatibility");
+    if (uint32_t lk= readers.fetch_add(WRITER, std::memory_order_acquire))
+      wr_wait(lk);
+#else
+    if (uint32_t lk= readers.fetch_or(WRITER, std::memory_order_acquire))
+      wr_wait(lk);
+#endif
+  }
+
+  void u_wr_upgrade()
+  {
+    DBUG_ASSERT(writer.is_locked());
+    uint32_t lk= readers.fetch_add(WRITER - 1, std::memory_order_acquire);
+    if (lk != 1)
+      wr_wait(lk - 1);
+  }
+  void wr_u_downgrade()
+  {
+    DBUG_ASSERT(writer.is_locked());
+    DBUG_ASSERT(is_write_locked());
+    readers.store(1, std::memory_order_release);
+    /* Note: Any pending rd_lock() will not be woken up until u_unlock() */
+  }
+
+  void rd_unlock()
+  {
+    uint32_t lk= readers.fetch_sub(1, std::memory_order_release);
+    ut_ad(~WRITER & lk);
+    if (lk == WRITER + 1)
+      wake();
+  }
+  void u_unlock()
+  {
+    IF_DBUG_ASSERT(uint32_t lk=,)
+    readers.fetch_sub(1, std::memory_order_release);
+    DBUG_ASSERT(lk);
+    DBUG_ASSERT(lk < WRITER);
+    writer.wr_unlock();
+  }
+  void wr_unlock()
+  {
+    DBUG_ASSERT(is_write_locked());
+    readers.store(0, std::memory_order_release);
+    writer.wr_unlock();
+  }
+  /** @return whether an exclusive lock may be held by any thread */
+  bool is_write_locked() const noexcept
+  { return readers.load(std::memory_order_acquire) == WRITER; }
+  /** @return whether any lock may be held by any thread */
+  bool is_locked() const noexcept
+  { return readers.load(std::memory_order_acquire) != 0; }
+  /** @return whether any lock may be held by any thread */
+  bool is_locked_or_waiting() const noexcept
+  { return is_locked() || writer.is_locked_or_waiting(); }
+
+  void lock_shared() { rd_lock(); }
+  void unlock_shared() { rd_unlock(); }
+  void lock() { wr_lock(); }
+  void unlock() { wr_unlock(); }
+};
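+/* Editor's worked example (not part of the upstream header) of the readers
+word in ssux_lock_impl above, with WRITER = 1<<31:
+  0x00000000  vacant
+  0x00000003  three S holders (or U + two S: the U holder also counts as 1)
+  0x80000002  X requested, two readers still to drain (wr_wait() runs here)
+  0x80000000  X granted
+A hypothetical standalone check of the encoding: */
+#if 0
+# include <cassert>
+# include <cstdint>
+int main()
+{
+  const std::uint32_t WRITER = 1U << 31;
+  std::uint32_t readers = 2;		/* two S holders */
+  readers += WRITER;			/* wr_lock(): raise the X request */
+  assert(readers == 0x80000002);	/* writer must wait for 2 unlocks */
+  readers -= 2;				/* both readers call rd_unlock() */
+  assert(readers == WRITER);		/* now exclusively locked */
+  return 0;
+}
+#endif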
+#if defined _WIN32 || defined SUX_LOCK_GENERIC
+/** Slim read-write lock */
+template<bool spinloop>
+class srw_lock_
+{
+# ifdef UNIV_PFS_RWLOCK
+  friend srw_lock_impl<spinloop>;
+# endif
+# ifdef _WIN32
+  SRWLOCK lk;
+# else
+  rw_lock_t lk;
+# endif
+
+  void rd_wait();
+  void wr_wait();
+public:
+  void init() { IF_WIN(,my_rwlock_init(&lk, nullptr)); }
+  void destroy() { IF_WIN(,rwlock_destroy(&lk)); }
+  inline void rd_lock();
+  inline void wr_lock();
+  bool rd_lock_try()
+  { return IF_WIN(TryAcquireSRWLockShared(&lk), !rw_tryrdlock(&lk)); }
+  void rd_unlock()
+  { IF_WIN(ReleaseSRWLockShared(&lk), rw_unlock(&lk)); }
+  bool wr_lock_try()
+  { return IF_WIN(TryAcquireSRWLockExclusive(&lk), !rw_trywrlock(&lk)); }
+  void wr_unlock()
+  { IF_WIN(ReleaseSRWLockExclusive(&lk), rw_unlock(&lk)); }
+#ifdef _WIN32
+  /** @return whether any lock may be held by any thread */
+  bool is_locked_or_waiting() const noexcept { return (size_t&)(lk) != 0; }
+  /** @return whether any lock may be held by any thread */
+  bool is_locked() const noexcept { return is_locked_or_waiting(); }
+  /** @return whether an exclusive lock may be held by any thread */
+  bool is_write_locked() const noexcept
+  {
+    // FIXME: this returns false positives for shared locks
+    return is_locked();
+  }
+
+  void lock_shared() { rd_lock(); }
+  void unlock_shared() { rd_unlock(); }
+  void lock() { wr_lock(); }
+  void unlock() { wr_unlock(); }
+#endif
+};
+
+template<> void srw_lock_<true>::rd_wait();
+template<> void srw_lock_<true>::wr_wait();
+
+template<>
+inline void srw_lock_<false>::rd_lock()
+{ IF_WIN(AcquireSRWLockShared(&lk), rw_rdlock(&lk)); }
+template<>
+inline void srw_lock_<false>::wr_lock()
+{ IF_WIN(AcquireSRWLockExclusive(&lk), rw_wrlock(&lk)); }
+
+template<>
+inline void srw_lock_<true>::rd_lock() { if (!rd_lock_try()) rd_wait(); }
+template<>
+inline void srw_lock_<true>::wr_lock() { if (!wr_lock_try()) wr_wait(); }
+
+typedef srw_lock_<false> srw_lock_low;
+typedef srw_lock_<true> srw_spin_lock_low;
+#else
+typedef ssux_lock_impl<false> srw_lock_low;
+typedef ssux_lock_impl<true> srw_spin_lock_low;
+#endif
+
+#ifndef UNIV_PFS_RWLOCK
+# define SRW_LOCK_INIT(key) init()
+# define SRW_LOCK_ARGS(file, line) /* nothing */
+# define SRW_LOCK_CALL /* nothing */
+typedef srw_lock_low srw_lock;
+typedef srw_spin_lock_low srw_spin_lock;
+#else
+# define SRW_LOCK_INIT(key) init(key)
+# define SRW_LOCK_ARGS(file, line) file, line
+# define SRW_LOCK_CALL __FILE__, __LINE__
+
+/** Slim shared-update-exclusive lock with PERFORMANCE_SCHEMA instrumentation */
+class ssux_lock
+{
+  PSI_rwlock *pfs_psi;
+  ssux_lock_impl<false> lock;
+
+  ATTRIBUTE_NOINLINE void psi_rd_lock(const char *file, unsigned line);
+  ATTRIBUTE_NOINLINE void psi_wr_lock(const char *file, unsigned line);
+  ATTRIBUTE_NOINLINE void psi_u_lock(const char *file, unsigned line);
+  ATTRIBUTE_NOINLINE void psi_u_wr_upgrade(const char *file, unsigned line);
+public:
+  void init(mysql_pfs_key_t key)
+  {
+    pfs_psi= PSI_RWLOCK_CALL(init_rwlock)(key, this);
+    lock.init();
+  }
+  void destroy()
+  {
+    if (psi_likely(pfs_psi != nullptr))
+    {
+      PSI_RWLOCK_CALL(destroy_rwlock)(pfs_psi);
+      pfs_psi= nullptr;
+    }
+    lock.destroy();
+  }
+  void rd_lock(const char *file, unsigned line)
+  {
+    if (psi_likely(pfs_psi != nullptr))
+      psi_rd_lock(file, line);
+    else
+      lock.rd_lock();
+  }
+  void rd_unlock()
+  {
+    if (psi_likely(pfs_psi != nullptr))
+      PSI_RWLOCK_CALL(unlock_rwlock)(pfs_psi);
+    lock.rd_unlock();
+  }
+  void u_lock(const char *file, unsigned line)
+  {
+    if (psi_likely(pfs_psi != nullptr))
+      psi_u_lock(file, line);
+    else
+      lock.u_lock();
+  }
+  void u_unlock()
+  {
+    if (psi_likely(pfs_psi != nullptr))
+      PSI_RWLOCK_CALL(unlock_rwlock)(pfs_psi);
+    lock.u_unlock();
+  }
+  void wr_lock(const char *file, unsigned line)
+  {
+    if (psi_likely(pfs_psi != nullptr))
+      psi_wr_lock(file, line);
+    else
+      lock.wr_lock();
+  }
+  void wr_unlock()
+  {
+    if (psi_likely(pfs_psi != nullptr))
+      PSI_RWLOCK_CALL(unlock_rwlock)(pfs_psi);
+    lock.wr_unlock();
+  }
+  void u_wr_upgrade(const char *file, unsigned line)
+  {
+    if (psi_likely(pfs_psi != nullptr))
+      psi_u_wr_upgrade(file, line);
+    else
+      lock.u_wr_upgrade();
+  }
+  bool rd_lock_try() { return lock.rd_lock_try(); }
+  bool u_lock_try() { return lock.u_lock_try(); }
+  bool wr_lock_try() { return lock.wr_lock_try(); }
+  bool is_waiting() const { return lock.is_waiting(); }
+};
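+/* Editor's illustrative sketch (not part of the upstream header): call sites
+pass the acquisition's file/line to the PFS-instrumented lock through
+SRW_LOCK_CALL, which expands to __FILE__, __LINE__ when UNIV_PFS_RWLOCK is
+defined and to nothing otherwise.  Hypothetical caller; init(key)/destroy()
+omitted for brevity: */
+#if 0
+static ssux_lock example_latch;
+
+static void example_critical_section()
+{
+  example_latch.wr_lock(SRW_LOCK_CALL);	/* records this file:line in PFS */
+  /* ... protected work ... */
+  example_latch.wr_unlock();
+}
+#endif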
+
+/** Slim reader-writer lock with PERFORMANCE_SCHEMA instrumentation */
+template<bool spinloop>
+class srw_lock_impl
+{
+  PSI_rwlock *pfs_psi;
+# if defined _WIN32 || defined SUX_LOCK_GENERIC
+  srw_lock_<spinloop> lock;
+# else
+  ssux_lock_impl<spinloop> lock;
+# endif
+
+  ATTRIBUTE_NOINLINE void psi_rd_lock(const char *file, unsigned line);
+  ATTRIBUTE_NOINLINE void psi_wr_lock(const char *file, unsigned line);
+public:
+  void init(mysql_pfs_key_t key)
+  {
+    pfs_psi= PSI_RWLOCK_CALL(init_rwlock)(key, this);
+    lock.init();
+  }
+  void destroy()
+  {
+    if (psi_likely(pfs_psi != nullptr))
+    {
+      PSI_RWLOCK_CALL(destroy_rwlock)(pfs_psi);
+      pfs_psi= nullptr;
+    }
+    lock.destroy();
+  }
+  void rd_lock(const char *file, unsigned line)
+  {
+    if (psi_likely(pfs_psi != nullptr))
+      psi_rd_lock(file, line);
+    else
+      lock.rd_lock();
+  }
+  void rd_unlock()
+  {
+    if (psi_likely(pfs_psi != nullptr))
+      PSI_RWLOCK_CALL(unlock_rwlock)(pfs_psi);
+    lock.rd_unlock();
+  }
+  void wr_lock(const char *file, unsigned line)
+  {
+    if (psi_likely(pfs_psi != nullptr))
+      psi_wr_lock(file, line);
+    else
+      lock.wr_lock();
+  }
+  void wr_unlock()
+  {
+    if (psi_likely(pfs_psi != nullptr))
+      PSI_RWLOCK_CALL(unlock_rwlock)(pfs_psi);
+    lock.wr_unlock();
+  }
+  bool rd_lock_try() { return lock.rd_lock_try(); }
+  bool wr_lock_try() { return lock.wr_lock_try(); }
+  void lock_shared() { return rd_lock(SRW_LOCK_CALL); }
+  void unlock_shared() { return rd_unlock(); }
+#ifndef SUX_LOCK_GENERIC
+  /** @return whether any lock may be held by any thread */
+  bool is_locked_or_waiting() const noexcept
+  { return lock.is_locked_or_waiting(); }
+  /** @return whether any lock may be held by any thread */
+  bool is_locked() const noexcept { return lock.is_locked(); }
+  /** @return whether an exclusive lock may be held by any thread */
+  bool is_write_locked() const noexcept { return lock.is_write_locked(); }
+#endif
+};
+
+typedef srw_lock_impl<false> srw_lock;
+typedef srw_lock_impl<true> srw_spin_lock;
+
+#endif
diff --git a/storage/innobase/include/sux_lock.h b/storage/innobase/include/sux_lock.h
new file mode 100644
index 00000000..2c0167ac
--- /dev/null
+++ b/storage/innobase/include/sux_lock.h
@@ -0,0 +1,472 @@
+/*****************************************************************************
+
+Copyright (c) 2020, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#pragma once
+#include "srw_lock.h"
+#include "my_atomic_wrapper.h"
+#ifdef UNIV_DEBUG
+# include <unordered_set>
+#endif
+
+/** A "fat" rw-lock that supports
+S (shared), U (update, or shared-exclusive), and X (exclusive) modes
+as well as recursive U and X latch acquisition
+@tparam ssux ssux_lock_impl or ssux_lock */
+template<typename ssux>
+class sux_lock final
+{
+  /** The underlying non-recursive lock */
+  ssux lock;
+  /** Numbers of U and X locks. Protected by lock.
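+  The X lock count is stored in the least significant 16 bits and the
+  U lock count in the next 16 bits (see RECURSIVE_X and RECURSIVE_U
+  below); for example, one U latch held together with two recursive X
+  latches is represented as recursive == RECURSIVE_U + 2 * RECURSIVE_X.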
+  */
+  uint32_t recursive;
+  /** The owner of the U or X lock (0 if none); protected by lock */
+  std::atomic<pthread_t> writer;
+  /** Special writer!=0 value to indicate that the lock is non-recursive
+  and will be released by an I/O thread */
+#if defined __linux__ || defined _WIN32
+  static constexpr pthread_t FOR_IO= pthread_t(~0UL);
+#else
+# define FOR_IO ((pthread_t) ~0UL) /* it could be a pointer */
+#endif
+#ifdef UNIV_DEBUG
+  /** Protects readers */
+  mutable srw_mutex readers_lock;
+  /** Threads that hold the lock in shared mode */
+  std::atomic<std::unordered_multiset<pthread_t>*> readers;
+#endif
+
+  /** The multiplier in recursive for X locks */
+  static constexpr uint32_t RECURSIVE_X= 1U;
+  /** The multiplier in recursive for U locks */
+  static constexpr uint32_t RECURSIVE_U= 1U << 16;
+  /** The maximum allowed level of recursion */
+  static constexpr uint32_t RECURSIVE_MAX= RECURSIVE_U - 1;
+
+public:
+#ifdef UNIV_PFS_RWLOCK
+  inline void init();
+#endif
+  void SRW_LOCK_INIT(mysql_pfs_key_t key)
+  {
+    lock.SRW_LOCK_INIT(key);
+    ut_ad(!writer.load(std::memory_order_relaxed));
+    ut_ad(!recursive);
+    ut_d(readers_lock.init());
+#ifdef UNIV_DEBUG
+    if (auto r= readers.load(std::memory_order_relaxed))
+      ut_ad(r->empty());
+#endif
+  }
+
+  /** Free the rw-lock after init() */
+  void free()
+  {
+    ut_ad(!writer.load(std::memory_order_relaxed));
+    ut_ad(!recursive);
+#ifdef UNIV_DEBUG
+    readers_lock.destroy();
+    if (auto r= readers.load(std::memory_order_relaxed))
+    {
+      ut_ad(r->empty());
+      delete r;
+      readers.store(nullptr, std::memory_order_relaxed);
+    }
+#endif
+    lock.destroy();
+  }
+
+  /** needed for dict_index_t::clone() */
+  inline void operator=(const sux_lock&);
+
+#ifdef UNIV_DEBUG
+  /** @return whether no recursive locks are being held */
+  bool not_recursive() const
+  {
+    ut_ad(recursive);
+    return recursive == RECURSIVE_X || recursive == RECURSIVE_U;
+  }
+
+  /** @return the number of X locks being held (by any thread) */
+  unsigned x_lock_count() const { return recursive & RECURSIVE_MAX; }
+#endif
+
+  /** Acquire a recursive lock */
+  template<bool allow_readers> void writer_recurse()
+  {
+    ut_ad(writer == pthread_self());
+    ut_d(auto rec= (recursive / (allow_readers ? RECURSIVE_U : RECURSIVE_X)) &
+         RECURSIVE_MAX);
+    ut_ad(allow_readers ? recursive : rec);
+    ut_ad(rec < RECURSIVE_MAX);
+    recursive+= allow_readers ? RECURSIVE_U : RECURSIVE_X;
+  }
+
+private:
+  /** Transfer the ownership of a write lock to another thread
+  @param id the new owner of the U or X lock */
+  void set_new_owner(pthread_t id)
+  {
+    IF_DBUG(DBUG_ASSERT(writer.exchange(id, std::memory_order_relaxed)),
+            writer.store(id, std::memory_order_relaxed));
+  }
+  /** Assign the ownership of a write lock to a thread
+  @param id the owner of the U or X lock */
+  void set_first_owner(pthread_t id)
+  {
+    IF_DBUG(DBUG_ASSERT(!writer.exchange(id, std::memory_order_relaxed)),
+            writer.store(id, std::memory_order_relaxed));
+  }
+#ifdef UNIV_DEBUG
+  /** Register the current thread as a holder of a shared lock */
+  void s_lock_register()
+  {
+    const pthread_t id= pthread_self();
+    readers_lock.wr_lock();
+    auto r= readers.load(std::memory_order_relaxed);
+    if (!r)
+    {
+      r= new std::unordered_multiset<pthread_t>();
+      readers.store(r, std::memory_order_relaxed);
+    }
+    r->emplace(id);
+    readers_lock.wr_unlock();
+  }
+#endif
+
+public:
+  /** In crash recovery or the change buffer, claim the ownership
+  of the exclusive block lock to the current thread */
+  void claim_ownership() { set_new_owner(pthread_self()); }
+
+  /** @return whether the current thread is holding X or U latch */
+  bool have_u_or_x() const
+  {
+    if (pthread_self() != writer.load(std::memory_order_relaxed))
+      return false;
+    ut_ad(recursive);
+    return true;
+  }
+  /** @return whether the current thread is holding U but not X latch */
+  bool have_u_not_x() const
+  { return have_u_or_x() && !((recursive / RECURSIVE_X) & RECURSIVE_MAX); }
+  /** @return whether the current thread is holding X latch */
+  bool have_x() const
+  { return have_u_or_x() && ((recursive / RECURSIVE_X) & RECURSIVE_MAX); }
+#ifdef UNIV_DEBUG
+  /** @return whether the current thread is holding S latch */
+  bool have_s() const
+  {
+    if (auto r= readers.load(std::memory_order_relaxed))
+    {
+      readers_lock.wr_lock();
+      bool found= r->find(pthread_self()) != r->end();
+      readers_lock.wr_unlock();
+      return found;
+    }
+    return false;
+  }
+  /** @return whether the current thread is holding the latch */
+  bool have_any() const { return have_u_or_x() || have_s(); }
+#endif
+
+  /** Acquire a shared lock */
+  inline void s_lock();
+  inline void s_lock(const char *file, unsigned line);
+  /** Acquire an update lock */
+  inline void u_lock();
+  inline void u_lock(const char *file, unsigned line);
+  /** Acquire an exclusive lock */
+  inline void x_lock(bool for_io= false);
+  inline void x_lock(const char *file, unsigned line);
+  /** Acquire a recursive exclusive lock */
+  void x_lock_recursive() { writer_recurse<false>(); }
+  /** Upgrade an update lock */
+  inline void u_x_upgrade();
+  inline void u_x_upgrade(const char *file, unsigned line);
+  /** Downgrade a single exclusive lock to an update lock */
+  void x_u_downgrade()
+  {
+    ut_ad(have_u_or_x());
+    ut_ad(recursive <= RECURSIVE_MAX);
+    recursive*= RECURSIVE_U;
+    lock.wr_u_downgrade();
+  }
+
+  /** Acquire an exclusive lock or upgrade an update lock
+  @return whether U locks were upgraded to X */
+  inline bool x_lock_upgraded();
+
+  /** @return whether a shared lock was acquired */
+  bool s_lock_try()
+  {
+    bool acquired= lock.rd_lock_try();
+    ut_d(if (acquired) s_lock_register());
+    return acquired;
+  }
+
+  /** Try to acquire an update lock
+  @param for_io whether the lock will be released by another thread
+  @return whether the update lock was acquired */
+  inline bool u_lock_try(bool for_io);
+
+  /** Try to acquire an exclusive lock
+  @return whether an exclusive lock was acquired */
+  inline bool x_lock_try();
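+
+  /* An illustrative latching sequence over these primitives (a sketch
+  only, not a verbatim caller):
+
+     l.u_lock();        // concurrent S latches remain possible
+     l.u_x_upgrade();   // block readers before modifying the contents
+     l.x_unlock();      // release the exclusive latch
+  */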
+
+  /** Release a shared lock */
+  void s_unlock()
+  {
+#ifdef UNIV_DEBUG
+    const pthread_t id= pthread_self();
+    auto r= readers.load(std::memory_order_relaxed);
+    ut_ad(r);
+    readers_lock.wr_lock();
+    auto i= r->find(id);
+    ut_ad(i != r->end());
+    r->erase(i);
+    readers_lock.wr_unlock();
+#endif
+    lock.rd_unlock();
+  }
+  /** Release an update or exclusive lock
+  @param allow_readers    whether we are releasing a U lock
+  @param claim_ownership  whether the lock was acquired by another thread */
+  void u_or_x_unlock(bool allow_readers, bool claim_ownership= false)
+  {
+    ut_d(auto owner= writer.load(std::memory_order_relaxed));
+    ut_ad(owner == pthread_self() ||
+          (owner == FOR_IO && claim_ownership &&
+           recursive == (allow_readers ? RECURSIVE_U : RECURSIVE_X)));
+    ut_d(auto rec= (recursive / (allow_readers ? RECURSIVE_U : RECURSIVE_X)) &
+         RECURSIVE_MAX);
+    ut_ad(rec);
+    if (!(recursive-= allow_readers ? RECURSIVE_U : RECURSIVE_X))
+    {
+      set_new_owner(0);
+      if (allow_readers)
+        lock.u_unlock();
+      else
+        lock.wr_unlock();
+    }
+  }
+  /** Release an update lock */
+  void u_unlock(bool claim_ownership= false)
+  { u_or_x_unlock(true, claim_ownership); }
+  /** Release an exclusive lock */
+  void x_unlock(bool claim_ownership= false)
+  { u_or_x_unlock(false, claim_ownership); }
+
+  /** @return whether any writer is waiting */
+  bool is_waiting() const { return lock.is_waiting(); }
+
+  bool is_write_locked() const { return lock.is_write_locked(); }
+
+  bool is_locked_or_waiting() const { return lock.is_locked_or_waiting(); }
+
+  inline void lock_shared();
+  inline void unlock_shared();
+};
+
+typedef sux_lock<ssux_lock_impl<true>> block_lock;
+
+#ifndef UNIV_PFS_RWLOCK
+typedef sux_lock<ssux_lock_impl<false>> index_lock;
+#else
+typedef sux_lock<ssux_lock> index_lock;
+
+template<> inline void sux_lock<ssux_lock_impl<true>>::init()
+{
+  lock.init();
+  ut_ad(!writer.load(std::memory_order_relaxed));
+  ut_ad(!recursive);
+  ut_d(readers_lock.init());
+#ifdef UNIV_DEBUG
+  if (auto r= readers.load(std::memory_order_relaxed))
+    ut_ad(r->empty());
+#endif
+}
+
+template<>
+inline void sux_lock<ssux_lock>::s_lock(const char *file, unsigned line)
+{
+  ut_ad(!have_x());
+  ut_ad(!have_s());
+  lock.rd_lock(file, line);
+  ut_d(s_lock_register());
+}
+
+template<>
+inline void sux_lock<ssux_lock>::u_lock(const char *file, unsigned line)
+{
+  pthread_t id= pthread_self();
+  if (writer.load(std::memory_order_relaxed) == id)
+    writer_recurse<true>();
+  else
+  {
+    lock.u_lock(file, line);
+    ut_ad(!recursive);
+    recursive= RECURSIVE_U;
+    set_first_owner(id);
+  }
+}
+
+template<>
+inline void sux_lock<ssux_lock>::x_lock(const char *file, unsigned line)
+{
+  pthread_t id= pthread_self();
+  if (writer.load(std::memory_order_relaxed) == id)
+    writer_recurse<false>();
+  else
+  {
+    lock.wr_lock(file, line);
+    ut_ad(!recursive);
+    recursive= RECURSIVE_X;
+    set_first_owner(id);
+  }
+}
+
+template<>
+inline void sux_lock<ssux_lock>::u_x_upgrade(const char *file, unsigned line)
+{
+  ut_ad(have_u_not_x());
+  lock.u_wr_upgrade(file, line);
+  recursive/= RECURSIVE_U;
+}
+#endif
+
+/** needed for dict_index_t::clone() */
+template<> inline void index_lock::operator=(const sux_lock&)
+{
+  memset((void*) this, 0, sizeof *this);
+}
+
+template<typename ssux> inline void sux_lock<ssux>::s_lock()
+{
+  ut_ad(!have_x());
+  ut_ad(!have_s());
+  lock.rd_lock();
+  ut_d(s_lock_register());
+}
+
+template<typename ssux>
+inline void sux_lock<ssux>::lock_shared() { s_lock(); }
+template<typename ssux>
+inline void sux_lock<ssux>::unlock_shared() { s_unlock(); }
+
+template<typename ssux> inline void sux_lock<ssux>::u_lock()
+{
+  pthread_t id= pthread_self();
+  if (writer.load(std::memory_order_relaxed) == id)
+    writer_recurse<true>();
+  else
+  {
+    lock.u_lock();
+    ut_ad(!recursive);
+    recursive= RECURSIVE_U;
+    set_first_owner(id);
+  }
+}
+
+template<typename ssux> inline void sux_lock<ssux>::x_lock(bool for_io)
+{
+  pthread_t id= pthread_self();
+  if (writer.load(std::memory_order_relaxed) == id)
+  {
+    ut_ad(!for_io);
+    writer_recurse<false>();
+  }
+  else
+  {
+    lock.wr_lock();
+    ut_ad(!recursive);
+    recursive= RECURSIVE_X;
+    set_first_owner(for_io ? FOR_IO : id);
+  }
+}
+
+template<typename ssux> inline void sux_lock<ssux>::u_x_upgrade()
+{
+  ut_ad(have_u_not_x());
+  lock.u_wr_upgrade();
+  recursive/= RECURSIVE_U;
+}
+
+template<typename ssux> inline bool sux_lock<ssux>::x_lock_upgraded()
+{
+  pthread_t id= pthread_self();
+  if (writer.load(std::memory_order_relaxed) == id)
+  {
+    ut_ad(recursive);
+    static_assert(RECURSIVE_X == 1, "compatibility");
+    if (recursive & RECURSIVE_MAX)
+    {
+      writer_recurse<false>();
+      return false;
+    }
+    /* Upgrade the lock. */
+    lock.u_wr_upgrade();
+    recursive/= RECURSIVE_U;
+    return true;
+  }
+  else
+  {
+    lock.wr_lock();
+    ut_ad(!recursive);
+    recursive= RECURSIVE_X;
+    set_first_owner(id);
+    return false;
+  }
+}
+
+template<typename ssux> inline bool sux_lock<ssux>::u_lock_try(bool for_io)
+{
+  pthread_t id= pthread_self();
+  if (writer.load(std::memory_order_relaxed) == id)
+  {
+    if (for_io)
+      return false;
+    writer_recurse<true>();
+    return true;
+  }
+  if (lock.u_lock_try())
+  {
+    ut_ad(!recursive);
+    recursive= RECURSIVE_U;
+    set_first_owner(for_io ? FOR_IO : id);
+    return true;
+  }
+  return false;
+}
+
+template<typename ssux> inline bool sux_lock<ssux>::x_lock_try()
+{
+  pthread_t id= pthread_self();
+  if (writer.load(std::memory_order_relaxed) == id)
+  {
+    writer_recurse<false>();
+    return true;
+  }
+  if (lock.wr_lock_try())
+  {
+    ut_ad(!recursive);
+    recursive= RECURSIVE_X;
+    set_first_owner(id);
+    return true;
+  }
+  return false;
+}
diff --git a/storage/innobase/include/transactional_lock_guard.h b/storage/innobase/include/transactional_lock_guard.h
new file mode 100644
index 00000000..168a6897
--- /dev/null
+++ b/storage/innobase/include/transactional_lock_guard.h
@@ -0,0 +1,174 @@
+/*****************************************************************************
+
+Copyright (c) 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#pragma once
+
+#if defined __powerpc64__
+#elif defined __s390__
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64) && !defined(__clang__)
+#elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+# if __GNUC__ >= 8
+# elif defined __clang_major__ && __clang_major__ > 6
+# else
+#  define NO_ELISION
+# endif
+#else /* Transactional memory has not been implemented for this ISA */
+# define NO_ELISION
+#endif
+
+#ifdef NO_ELISION
+constexpr bool have_transactional_memory= false;
+# ifdef UNIV_DEBUG
+static inline bool xtest() { return false; }
+# endif
+# define TRANSACTIONAL_TARGET /* nothing */
+# define TRANSACTIONAL_INLINE /* nothing */
+#else
+# if defined __i386__||defined __x86_64__||defined _M_IX86||defined _M_X64
+extern bool have_transactional_memory;
+bool transactional_lock_enabled();
+
+# include <immintrin.h>
+# if defined __GNUC__ && !defined __INTEL_COMPILER
+#  define TRANSACTIONAL_TARGET __attribute__((target("rtm"),hot))
+#  define TRANSACTIONAL_INLINE __attribute__((target("rtm"),hot,always_inline))
+# else
+#  define TRANSACTIONAL_TARGET /* nothing */
+#  define TRANSACTIONAL_INLINE /* nothing */
+# endif
+
+TRANSACTIONAL_INLINE static inline bool xbegin()
+{
+  return have_transactional_memory && _xbegin() == _XBEGIN_STARTED;
+}
+
+# ifdef UNIV_DEBUG
+#  ifdef __GNUC__
+/** @return whether a memory transaction is active */
+bool xtest();
+#  else
+static inline bool xtest() { return have_transactional_memory && _xtest(); }
+#  endif
+# endif
+
+TRANSACTIONAL_INLINE static inline void xabort() { _xabort(0); }
+
+TRANSACTIONAL_INLINE static inline void xend() { _xend(); }
+# elif defined __powerpc64__ || defined __s390__
+extern bool have_transactional_memory;
+bool transactional_lock_enabled();
+#  define TRANSACTIONAL_TARGET __attribute__((hot))
+#  define TRANSACTIONAL_INLINE __attribute__((hot,always_inline))
+
+/**
+  Newer gcc compilers only provide __builtin_{htm}
+  functions when the -mhtm CFLAG is actually provided. So
+  we've got the option of including it globally, or
+  pushing down the inclusion of htmxlintrin.h to one
+  file with -mhtm enabled and removing the inline
+  optimization.
+
+  Per FIXME in s390x's htmxlintrin.h, the __TM_simple_begin
+  isn't always_inline resulting in duplicate definitions if
+  it were included more than once. While xabort and xend
+  could be implemented here, we keep the implementation the
+  same as ppc64.
+ */
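+
+/* The elision pattern implemented by transactional_lock_guard below is,
+in outline (an illustrative sketch only):
+
+   if (xbegin())                        // memory transaction started
+   {
+     if (!lock.is_locked_or_waiting())  // lock appears free: elide it
+       return;
+     xabort();                          // contended: abort, take the lock
+   }
+   lock.lock();                         // normal acquisition
+
+with xend() invoked on destruction when the lock was elided. */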
+TRANSACTIONAL_TARGET bool xbegin();
+TRANSACTIONAL_TARGET void xabort();
+TRANSACTIONAL_TARGET void xend();
+# ifdef UNIV_DEBUG
+bool xtest();
+# endif
+
+# endif
+#endif
+
+template<class mutex>
+class transactional_lock_guard
+{
+  mutex &m;
+
+public:
+  TRANSACTIONAL_INLINE transactional_lock_guard(mutex &m) : m(m)
+  {
+#ifndef NO_ELISION
+    if (xbegin())
+    {
+      if (was_elided())
+        return;
+      xabort();
+    }
+#endif
+    m.lock();
+  }
+  transactional_lock_guard(const transactional_lock_guard &)= delete;
+  TRANSACTIONAL_INLINE ~transactional_lock_guard()
+  {
+#ifndef NO_ELISION
+    if (was_elided()) xend(); else
+#endif
+    m.unlock();
+  }
+
+#ifndef NO_ELISION
+  bool was_elided() const noexcept { return !m.is_locked_or_waiting(); }
+#else
+  bool was_elided() const noexcept { return false; }
+#endif
+};
+
+template<class mutex>
+class transactional_shared_lock_guard
+{
+  mutex &m;
+#ifndef NO_ELISION
+  bool elided;
+#else
+  static constexpr bool elided= false;
+#endif
+
+public:
+  TRANSACTIONAL_INLINE transactional_shared_lock_guard(mutex &m) : m(m)
+  {
+#ifndef NO_ELISION
+    if (xbegin())
+    {
+      if (!m.is_write_locked())
+      {
+        elided= true;
+        return;
+      }
+      xabort();
+    }
+    elided= false;
+#endif
+    m.lock_shared();
+  }
+  transactional_shared_lock_guard(const transactional_shared_lock_guard &)=
+    delete;
+  TRANSACTIONAL_INLINE ~transactional_shared_lock_guard()
+  {
+#ifndef NO_ELISION
+    if (was_elided()) xend(); else
+#endif
+    m.unlock_shared();
+  }
+
+  bool was_elided() const noexcept { return elided; }
+};
diff --git a/storage/innobase/include/trx0i_s.h b/storage/innobase/include/trx0i_s.h
new file mode 100644
index 00000000..caacfa09
--- /dev/null
+++ b/storage/innobase/include/trx0i_s.h
@@ -0,0 +1,277 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0i_s.h
+INFORMATION SCHEMA innodb_trx, innodb_locks and
+innodb_lock_waits tables cache structures and public
+functions.
+
+Created July 17, 2007 Vasil Dimov
+*******************************************************/
+
+#ifndef trx0i_s_h
+#define trx0i_s_h
+
+#include "trx0types.h"
+#include "dict0types.h"
+#include "buf0types.h"
+
+/** The maximum amount of memory that can be consumed by innodb_trx,
+innodb_locks and innodb_lock_waits information schema tables. */
+#define TRX_I_S_MEM_LIMIT	16777216 /* 16 MiB */
+
+/** The maximum length of a string that can be stored in
+i_s_locks_row_t::lock_data */
+#define TRX_I_S_LOCK_DATA_MAX_LEN	8192
+
+/** The maximum length of a string that can be stored in
+i_s_trx_row_t::trx_query */
+#define TRX_I_S_TRX_QUERY_MAX_LEN	1024
+
+/** The maximum length of a string that can be stored in
+i_s_trx_row_t::trx_foreign_key_error */
+#define TRX_I_S_TRX_FK_ERROR_MAX_LEN	256
+
+/** Safely copy strings into the INNODB_TRX table's
+string-based columns */
+#define TRX_I_S_STRING_COPY(data, field, constraint, tcache)	\
+do {								\
+	if (strlen(data) > constraint) {			\
+		char	buff[constraint + 1];			\
+		strncpy(buff, data, constraint);		\
+		buff[constraint] = '\0';			\
+								\
+		field = static_cast<const char*>(		\
+			ha_storage_put_memlim(			\
+			(tcache)->storage, buff, constraint + 1,\
+			MAX_ALLOWED_FOR_STORAGE(tcache)));	\
+	} else {						\
+		field = static_cast<const char*>(		\
+			ha_storage_put_str_memlim(		\
+			(tcache)->storage, data,		\
+			MAX_ALLOWED_FOR_STORAGE(tcache)));	\
+	}							\
+} while (0)
+
+/** A row of INFORMATION_SCHEMA.innodb_locks */
+struct i_s_locks_row_t;
+
+/** Objects of trx_i_s_cache_t::locks_hash */
+struct i_s_hash_chain_t;
+
+/** Objects of this type are added to the hash table
+trx_i_s_cache_t::locks_hash */
+struct i_s_hash_chain_t {
+	i_s_locks_row_t*	value;	/*!< row of
+					INFORMATION_SCHEMA.innodb_locks*/
+	i_s_hash_chain_t*	next;	/*!< next item in the hash chain */
+};
+
+/** This structure represents INFORMATION_SCHEMA.innodb_locks row */
+struct i_s_locks_row_t {
+	trx_id_t	lock_trx_id;	/*!< transaction identifier */
+	const char*	lock_table;	/*!< table name from
+					lock_get_table_name() */
+	/** index name of a record lock; NULL for table locks */
+	const char*	lock_index;
+	/** page identifier of the record; (0,0) if !lock_index */
+	page_id_t	lock_page;
+	/** heap number of the record; 0 if !lock_index */
+	uint16_t	lock_rec;
+	/** lock mode corresponding to lock_mode_values_typelib */
+	uint8_t		lock_mode;
+	/** (some) content of the record, if available in the buffer pool;
+	NULL if !lock_index */
+	const char*	lock_data;
+
+	/** The following are auxiliary and not included in the table */
+	/* @{ */
+	table_id_t	lock_table_id;
+					/*!< table identifier from
+					lock_get_table_id */
+	i_s_hash_chain_t hash_chain;	/*!< hash table chain node for
+					trx_i_s_cache_t::locks_hash */
+	/* @} */
+};
+
+/** This structure represents INFORMATION_SCHEMA.innodb_trx row */
+struct i_s_trx_row_t {
+	trx_id_t	trx_id;		/*!< transaction identifier */
+	const char*	trx_state;
+	time_t		trx_started;	/*!< trx_t::start_time */
+	const i_s_locks_row_t* requested_lock_row;
+					/*!< pointer to a row
+					in innodb_locks if trx
+					is waiting, or NULL */
+	time_t		trx_wait_started; /*!< trx_t->lock.wait_started */
+	uintmax_t	trx_weight;	/*!< TRX_WEIGHT() */
+	ulint		trx_mysql_thread_id; /*!< thd_get_thread_id() */
+	const char*	trx_query;	/*!< MySQL statement being
+					executed in the transaction */
+	CHARSET_INFO*	trx_query_cs;	/*!< the charset of trx_query */
+	const char*	trx_operation_state; /*!< trx_t::op_info */
+	ulint		trx_tables_in_use; /*!< n_mysql_tables_in_use in
+					trx_t */
+	ulint		trx_tables_locked;
+					/*!< mysql_n_tables_locked in
+					trx_t */
+	ulint		trx_lock_structs; /*!< list len of trx_locks in
+					trx_t */
+	ulint		trx_lock_memory_bytes;
+					/*!< mem_heap_get_size(
+					trx->lock_heap) */
+	ulint		trx_rows_locked; /*!< trx_lock_t::n_rec_locks */
+	uintmax_t	trx_rows_modified; /*!< trx_t::undo_no */
+	uint		trx_isolation_level;
+					/*!< trx_t::isolation_level */
+	bool		trx_unique_checks;
+					/*!< check_unique_secondary in trx_t*/
+	bool		trx_foreign_key_checks;
+					/*!< check_foreigns in trx_t */
+	const char*	trx_foreign_key_error;
+					/*!< detailed_error in trx_t */
+	bool		trx_is_read_only;
+					/*!< trx_t::read_only */
+	bool		trx_is_autocommit_non_locking;
+					/*!< trx_t::is_autocommit_non_locking()
+					*/
+};
+
+/** This structure represents INFORMATION_SCHEMA.innodb_lock_waits row */
+struct i_s_lock_waits_row_t {
+	const i_s_locks_row_t*	requested_lock_row;	/*!< requested lock */
+	const i_s_locks_row_t*	blocking_lock_row;	/*!< blocking lock */
+};
+
+/** Cache of INFORMATION_SCHEMA table data */
+struct trx_i_s_cache_t;
+
+/** Auxiliary enum used by functions that need to select one of the
+INFORMATION_SCHEMA tables */
+enum i_s_table {
+	I_S_INNODB_TRX,		/*!< INFORMATION_SCHEMA.innodb_trx */
+	I_S_INNODB_LOCKS,	/*!< INFORMATION_SCHEMA.innodb_locks */
+	I_S_INNODB_LOCK_WAITS	/*!< INFORMATION_SCHEMA.innodb_lock_waits */
+};
+
+/** This is the intermediate buffer where data needed to fill the
+INFORMATION SCHEMA tables is fetched and later retrieved by the C++
+code in handler/i_s.cc. */
+extern trx_i_s_cache_t*	trx_i_s_cache;
+
+/*******************************************************************//**
+Initialize INFORMATION SCHEMA trx related cache. */
+void
+trx_i_s_cache_init(
+/*===============*/
+	trx_i_s_cache_t*	cache);	/*!< out: cache to init */
+/*******************************************************************//**
+Free the INFORMATION SCHEMA trx related cache. */
+void
+trx_i_s_cache_free(
+/*===============*/
+	trx_i_s_cache_t*	cache);	/*!< in/out: cache to free */
+
+/*******************************************************************//**
+Issue a shared/read lock on the tables cache. */
+void
+trx_i_s_cache_start_read(
+/*=====================*/
+	trx_i_s_cache_t*	cache);	/*!< in: cache */
+
+/*******************************************************************//**
+Release a shared/read lock on the tables cache. */
+void
+trx_i_s_cache_end_read(
+/*===================*/
+	trx_i_s_cache_t*	cache);	/*!< in: cache */
+
+/*******************************************************************//**
+Issue an exclusive/write lock on the tables cache. */
+void
+trx_i_s_cache_start_write(
+/*======================*/
+	trx_i_s_cache_t*	cache);	/*!< in: cache */
+
+/*******************************************************************//**
+Release an exclusive/write lock on the tables cache. */
+void
+trx_i_s_cache_end_write(
+/*====================*/
+	trx_i_s_cache_t*	cache);	/*!< in: cache */
+
+
+/*******************************************************************//**
+Retrieves the number of used rows in the cache for a given
+INFORMATION SCHEMA table.
+@return number of rows */
+ulint
+trx_i_s_cache_get_rows_used(
+/*========================*/
+	trx_i_s_cache_t*	cache,	/*!< in: cache */
+	enum i_s_table		table);	/*!< in: which table */
+
+/*******************************************************************//**
+Retrieves the nth row in the cache for a given INFORMATION SCHEMA
+table.
+@return row */
+void*
+trx_i_s_cache_get_nth_row(
+/*======================*/
+	trx_i_s_cache_t*	cache,	/*!< in: cache */
+	enum i_s_table		table,	/*!< in: which table */
+	ulint			n);	/*!< in: row number */
+
+/*******************************************************************//**
+Update the transactions cache if it has not been read for some time.
+@return 0 - fetched, 1 - not */
+int
+trx_i_s_possibly_fetch_data_into_cache(
+/*===================================*/
+	trx_i_s_cache_t*	cache);	/*!< in/out: cache */
+
+/*******************************************************************//**
+Returns true if the data in the cache is truncated due to the memory
+limit posed by TRX_I_S_MEM_LIMIT.
+@return true if truncated */
+bool
+trx_i_s_cache_is_truncated(
+/*=======================*/
+	trx_i_s_cache_t*	cache);	/*!< in: cache */
+/** The maximum length of a resulting lock_id_size in
+trx_i_s_create_lock_id(), not including the terminating NUL.
+":%lu:%lu:%lu" -> 63 chars */
+#define TRX_I_S_LOCK_ID_MAX_LEN	(TRX_ID_MAX_LEN + 63)
+
+/*******************************************************************//**
+Crafts a lock id string from a i_s_locks_row_t object. Returns its
+second argument. This function aborts if there is not enough space in
+lock_id. Be sure to provide at least TRX_I_S_LOCK_ID_MAX_LEN + 1 if you
+want to be 100% sure that it will not abort.
+@return resulting lock id */
+char*
+trx_i_s_create_lock_id(
+/*===================*/
+	const i_s_locks_row_t*	row,	/*!< in: innodb_locks row */
+	char*			lock_id,/*!< out: resulting lock_id */
+	ulint			lock_id_size);/*!< in: size of the lock id
+					buffer */
+
+#endif /* trx0i_s_h */
diff --git a/storage/innobase/include/trx0purge.h b/storage/innobase/include/trx0purge.h
new file mode 100644
index 00000000..3ddd2e98
--- /dev/null
+++ b/storage/innobase/include/trx0purge.h
@@ -0,0 +1,427 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0purge.h
+Purge old versions
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "trx0sys.h"
+#include "que0types.h"
+#include "srw_lock.h"
+
+#include <queue>
+#include <unordered_map>
+
+/** Prepend the history list with an undo log.
+Remove the undo log segment from the rseg slot if it is too big for reuse.
+@param[in]	trx	transaction
+@param[in,out]	undo	undo log
+@param[in,out]	mtr	mini-transaction */
+void
+trx_purge_add_undo_to_history(const trx_t* trx, trx_undo_t*& undo, mtr_t* mtr);
+
+/**
+Remove unnecessary history data from rollback segments. NOTE that when this
+function is called, the caller (purge_coordinator_callback)
+must not have any latches on undo log pages!
+*/
+void trx_purge_truncate_history();
+
+/**
+Run a purge batch.
+@param n_tasks       number of purge tasks to submit to the queue
+@param history_size  trx_sys.history_size()
+@return number of undo log pages handled in the batch */
+ulint trx_purge(ulint n_tasks, ulint history_size);
+
+/** Rollback segments from a given transaction with trx-no
+scheduled for purge. */
+class TrxUndoRsegs {
+private:
+	typedef std::vector<trx_rseg_t*, ut_allocator<trx_rseg_t*> >
+		trx_rsegs_t;
+public:
+	typedef trx_rsegs_t::iterator iterator;
+	typedef trx_rsegs_t::const_iterator const_iterator;
+
+	TrxUndoRsegs() = default;
+
+	/** Constructor */
+	TrxUndoRsegs(trx_rseg_t& rseg)
+		: trx_no(rseg.last_trx_no()), m_rsegs(1, &rseg) {}
+	/** Constructor */
+	TrxUndoRsegs(trx_id_t trx_no, trx_rseg_t& rseg)
+		: trx_no(trx_no), m_rsegs(1, &rseg) {}
+
+	bool operator!=(const TrxUndoRsegs& other) const
+	{ return trx_no != other.trx_no; }
+	bool empty() const { return m_rsegs.empty(); }
+	void erase(iterator& it) { m_rsegs.erase(it); }
+	iterator begin() { return(m_rsegs.begin()); }
+	iterator end() { return(m_rsegs.end()); }
+	const_iterator begin() const { return m_rsegs.begin(); }
+	const_iterator end() const { return m_rsegs.end(); }
+
+	/** Compare two TrxUndoRsegs based on trx_no.
+	@param lhs first element to compare
+	@param rhs second element to compare
+	@return true if lhs > rhs else false.*/
+	bool operator()(const TrxUndoRsegs& lhs, const TrxUndoRsegs& rhs)
+	{
+		return(lhs.trx_no > rhs.trx_no);
+	}
+
+	/** Copy of trx_rseg_t::last_trx_no() */
+	trx_id_t	trx_no= 0;
+private:
+	/** Rollback segments of a transaction, scheduled for purge. */
+	trx_rsegs_t	m_rsegs{};
+};
+
+typedef std::priority_queue<
+	TrxUndoRsegs,
+	std::vector<TrxUndoRsegs, ut_allocator<TrxUndoRsegs> >,
+	TrxUndoRsegs>	purge_pq_t;
+
+/** Chooses the rollback segment with the oldest committed transaction */
+struct TrxUndoRsegsIterator {
+	/** Constructor */
+	TrxUndoRsegsIterator();
+	/** Sets the next rseg to purge in purge_sys.
+	Executed in the purge coordinator thread.
+	@retval false when nothing is to be purged
+	@retval true  when purge_sys.rseg->latch was locked */
+	inline bool set_next();
+
+private:
+	// Disable copying
+	TrxUndoRsegsIterator(const TrxUndoRsegsIterator&);
+	TrxUndoRsegsIterator& operator=(const TrxUndoRsegsIterator&);
+
+	/** The current element to process */
+	TrxUndoRsegs			m_rsegs;
+	/** Track the current element in m_rsegs */
+	TrxUndoRsegs::const_iterator	m_iter;
+};
+
+/** The control structure used in the purge operation */
+class purge_sys_t
+{
+  friend TrxUndoRsegsIterator;
+public:
+  /** latch protecting view, m_enabled */
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE) mutable srw_spin_lock latch;
+private:
+  /** Read view at the start of a purge batch. Any encountered index records
+  that are older than view will be removed. */
+  ReadViewBase view;
+  /** whether the subsystem has been initialized */
+  bool m_initialized{false};
+  /** whether purge is enabled; protected by latch and std::atomic */
+  std::atomic<bool> m_enabled{false};
+public:
+  /** whether purge is active (may hold table handles) */
+  std::atomic<bool> m_active{false};
+private:
+  /** number of pending stop() calls without resume() */
+  Atomic_counter<uint32_t> m_paused;
+  /** number of stop_SYS() calls without resume_SYS() */
+  Atomic_counter<uint32_t> m_SYS_paused;
+  /** number of stop_FTS() calls without resume_FTS() */
+  Atomic_counter<uint32_t> m_FTS_paused;
+
+  /** latch protecting end_view */
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE) srw_spin_lock_low end_latch;
+  /** Read view at the end of a purge batch (copied from view). Any undo pages
+  containing records older than end_view may be freed.
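+  (end_view is copied from view in batch_cleanup() at the end of each
+  batch; unlike view, it is protected by end_latch rather than latch.)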
+  */
+  ReadViewBase end_view;
+
+  struct hasher
+  {
+    size_t operator()(const page_id_t &id) const { return size_t(id.raw()); }
+  };
+
+  using unordered_map =
+    std::unordered_map<page_id_t, buf_block_t*, hasher,
+#if defined __GNUC__ && __GNUC__ >= 8
+                       std::equal_to<page_id_t>
+                       /* GCC 4.8.5 would fail to find a matching allocator */
+#else
+                       std::equal_to<page_id_t>,
+                       ut_allocator<std::pair<const page_id_t, buf_block_t*>>
+#endif
+                       >;
+  /** map of buffer-fixed undo log pages processed during a purge batch */
+  unordered_map pages;
+public:
+  /** @return the number of processed undo pages */
+  size_t n_pages_handled() const { return pages.size(); }
+
+  /** Look up an undo log page.
+  @param id  undo page identifier
+  @return undo page
+  @retval nullptr in case the page is corrupted */
+  buf_block_t *get_page(page_id_t id);
+
+  que_t*	query;	/*!< The query graph which will do the
+			parallelized purge operation */
+
+  /** Iterator to the undo log records of committed transactions */
+  struct iterator
+  {
+    bool operator<=(const iterator& other) const
+    {
+      if (trx_no < other.trx_no) return true;
+      if (trx_no > other.trx_no) return false;
+      return undo_no <= other.undo_no;
+    }
+
+    /** Free the undo pages up to this. */
+    dberr_t free_history() const;
+
+    /** trx_t::no of the committed transaction */
+    trx_id_t trx_no;
+    /** The record number within the committed transaction's undo
+    log, increasing, purged from 0 onwards */
+    undo_no_t undo_no;
+  };
+
+  /** The tail of the purge queue; the last parsed undo log of a
+  committed transaction. */
+  iterator tail;
+  /** The head of the purge queue; any older undo logs of committed
+  transactions may be discarded (history list truncation).
+  Protected by latch. */
+  iterator head;
+  /*-----------------------------*/
+  bool next_stored;	/*!< whether rseg holds the next record
+			to purge */
+  trx_rseg_t*	rseg;	/*!< Rollback segment for the next undo
+			record to purge */
+private:
+  uint32_t page_no;	/*!< Page number for the next undo
+			record to purge, page number of the
+			log header, if dummy record */
+  uint32_t hdr_page_no;	/*!< Header page of the undo log where
+			the next record to purge belongs */
+  uint16_t offset;	/*!< Page offset for the next undo
+			record to purge, 0 if the dummy
+			record */
+  uint16_t hdr_offset;	/*!< Header byte offset on the page */
+
+
+  TrxUndoRsegsIterator
+		rseg_iter;	/*!< Iterator to get the next rseg
+				to process */
+public:
+  purge_pq_t	purge_queue;	/*!< Binary min-heap, ordered on
+				TrxUndoRsegs::trx_no. It is protected
+				by the pq_mutex */
+  mysql_mutex_t	pq_mutex;	/*!< Mutex protecting purge_queue */
+
+  /** Undo tablespace file truncation (only accessed by the
+  srv_purge_coordinator_thread) */
+  struct {
+    /** The undo tablespace that is currently being truncated */
+    fil_space_t*	current;
+    /** The undo tablespace that was last truncated */
+    fil_space_t*	last;
+  } truncate;
+
+  /** Create the instance */
+  void create();
+
+  /** Close the purge system on shutdown */
+  void close();
+
+  /** @return whether purge is enabled */
+  bool enabled() { return m_enabled.load(std::memory_order_relaxed); }
+  /** @return whether the purge coordinator is paused */
+  bool paused()
+  { return m_paused != 0; }
+
+  /** Enable purge at startup. */
+  void coordinator_startup()
+  {
+    ut_ad(!enabled());
+    m_enabled.store(true, std::memory_order_relaxed);
+    wake_if_not_active();
+  }
+
+  /** Disable purge at shutdown */
+  void coordinator_shutdown()
+  {
+    ut_ad(enabled());
+    m_enabled.store(false, std::memory_order_relaxed);
+  }
+
+  /** @return whether the purge tasks are active */
+  static bool running();
+
+  /** Stop purge during FLUSH TABLES FOR EXPORT.
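+  Paired with resume(), which is invoked at UNLOCK TABLES; an
+  illustrative call sequence (not a verbatim caller):
+
+    purge_sys.stop();    // FLUSH TABLES ... FOR EXPORT
+    ...                  // the tablespace files are copied
+    purge_sys.resume();  // UNLOCK TABLES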
+  */
+  void stop();
+  /** Resume purge at UNLOCK TABLES after FLUSH TABLES FOR EXPORT */
+  void resume();
+
+  /** Close and reopen all tables in case of a MDL conflict with DDL */
+  dict_table_t *close_and_reopen(table_id_t id, THD *thd, MDL_ticket **mdl);
+private:
+  /** Suspend purge during a DDL operation on FULLTEXT INDEX tables */
+  void wait_FTS(bool also_sys);
+public:
+  /** Suspend purge in data dictionary tables */
+  void stop_SYS() { m_SYS_paused++; }
+  /** Resume purge in data dictionary tables */
+  static void resume_SYS(void *);
+
+  /** Pause purge during a DDL operation that could drop FTS_ tables. */
+  void stop_FTS();
+  /** Resume purge after stop_FTS(). */
+  void resume_FTS() { ut_d(const auto p=) m_FTS_paused--; ut_ad(p); }
+  /** @return whether stop_FTS() is in effect */
+  bool must_wait_FTS() const { return m_FTS_paused; }
+
+private:
+  /**
+  Get the next record to purge and update the info in the purge system.
+  @param roll_ptr  undo log pointer to the record
+  @return buffer-fixed reference to undo log record
+  @retval {nullptr,1} if the whole undo log can be skipped in purge
+  @retval {nullptr,0} if nothing is left, or on corruption */
+  inline trx_purge_rec_t get_next_rec(roll_ptr_t roll_ptr);
+
+  /** Choose the next undo log to purge.
+  @return whether anything is to be purged */
+  bool choose_next_log();
+
+  /** Update the last not yet purged history log info in rseg when
+  we have purged a whole undo log. Advances also purge_trx_no
+  past the purged log. */
+  void rseg_get_next_history_log();
+
+public:
+  /**
+  Fetch the next undo log record from the history list to purge.
+  @return buffer-fixed reference to undo log record
+  @retval {nullptr,1} if the whole undo log can be skipped in purge
+  @retval {nullptr,0} if nothing is left, or on corruption */
+  inline trx_purge_rec_t fetch_next_rec();
+
+  /** Determine if the history of a transaction is purgeable.
+  @param trx_id  transaction identifier
+  @return whether the history is purgeable */
+  TRANSACTIONAL_TARGET bool is_purgeable(trx_id_t trx_id) const;
+
+  /** A wrapper around ReadView::low_limit_no(). */
+  trx_id_t low_limit_no() const
+  {
+    /* This function may only be called by purge_coordinator_callback().
+
+    The purge coordinator task may call this without holding any latch,
+    because it is the only thread that may modify purge_sys.view.
+
+    Any other threads that access purge_sys.view must hold purge_sys.latch,
+    typically via purge_sys_t::view_guard. */
+    return view.low_limit_no();
+  }
+  /** A wrapper around ReadView::sees(). */
+  trx_id_t sees(trx_id_t id) const
+  {
+    /* This function may only be called by purge_coordinator_callback().
+
+    The purge coordinator task may call this without holding any latch,
+    because it is the only thread that may modify purge_sys.view.
+
+    Any other threads that access purge_sys.view must hold purge_sys.latch,
+    typically via purge_sys_t::view_guard. */
+    return view.sees(id);
+  }
+  /** A wrapper around trx_sys_t::clone_oldest_view(). */
+  template<bool also_end_view= false>
+  void clone_oldest_view()
+  {
+    if (!also_end_view)
+      wait_FTS(true);
+    latch.wr_lock(SRW_LOCK_CALL);
+    trx_sys.clone_oldest_view(&view);
+    if (also_end_view)
+      (end_view= view).
+        clamp_low_limit_id(head.trx_no ? head.trx_no : tail.trx_no);
+    latch.wr_unlock();
+  }
+
+  /** Wake up the purge threads if there is work to do. */
+  void wake_if_not_active();
+
+  /** Release undo pages and update end_view at the end of a purge batch. */
+  inline void batch_cleanup(const iterator &head);
+
+  struct view_guard
+  {
+    inline view_guard();
+    inline ~view_guard();
+
+    /** @return purge_sys.view */
+    inline const ReadViewBase &view() const;
+  };
+
+  struct end_view_guard
+  {
+    inline end_view_guard();
+    inline ~end_view_guard();
+
+    /** @return purge_sys.end_view */
+    inline const ReadViewBase &view() const;
+  };
+
+  /** Stop the purge thread and check n_ref_count of all auxiliary
+  and common tables associated with the FTS table.
+  @param table            parent FTS table
+  @param already_stopped  True indicates purge threads were
+                          already stopped */
+  void stop_FTS(const dict_table_t &table, bool already_stopped=false);
+};
+
+/** The global data structure coordinating a purge */
+extern purge_sys_t	purge_sys;
+
+purge_sys_t::view_guard::view_guard()
+{ purge_sys.latch.rd_lock(SRW_LOCK_CALL); }
+
+purge_sys_t::view_guard::~view_guard()
+{ purge_sys.latch.rd_unlock(); }
+
+const ReadViewBase &purge_sys_t::view_guard::view() const
+{ return purge_sys.view; }
+
+purge_sys_t::end_view_guard::end_view_guard()
+{ purge_sys.end_latch.rd_lock(); }
+
+purge_sys_t::end_view_guard::~end_view_guard()
+{ purge_sys.end_latch.rd_unlock(); }
+
+const ReadViewBase &purge_sys_t::end_view_guard::view() const
+{ return purge_sys.end_view; }
diff --git a/storage/innobase/include/trx0rec.h b/storage/innobase/include/trx0rec.h
new file mode 100644
index 00000000..3d9b1868
--- /dev/null
+++ b/storage/innobase/include/trx0rec.h
@@ -0,0 +1,299 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0rec.h
+Transaction undo log record
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+
+#include "trx0types.h"
+#include "row0types.h"
+#include "page0types.h"
+#include "que0types.h"
+
+/**********************************************************************//**
+Reads the undo log record number.
+@return undo no */
+inline undo_no_t trx_undo_rec_get_undo_no(const trx_undo_rec_t *undo_rec)
+{
+  return mach_u64_read_much_compressed(undo_rec + 3);
+}
+
+/**********************************************************************//**
+Returns the start of the undo record data area. */
+#define trx_undo_rec_get_ptr(undo_rec, undo_no)		\
+	((undo_rec) + trx_undo_rec_get_offset(undo_no))
+
+/**********************************************************************//**
+Reads from an undo log record the general parameters.
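+(These are the type, the compilation info, the "updated extern" flag,
+the undo number and the table id that precede the record payload.)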
+@return remaining part of undo log record after reading these values */
+const byte*
+trx_undo_rec_get_pars(
+/*==================*/
+	const trx_undo_rec_t*	undo_rec,	/*!< in: undo log record */
+	byte*		type,		/*!< out: undo record type:
+					TRX_UNDO_INSERT_REC, ... */
+	byte*		cmpl_info,	/*!< out: compiler info, relevant only
+					for update type records */
+	bool*		updated_extern,	/*!< out: true if we updated an
+					externally stored field */
+	undo_no_t*	undo_no,	/*!< out: undo log record number */
+	table_id_t*	table_id)	/*!< out: table id */
+	MY_ATTRIBUTE((nonnull));
+
+/*******************************************************************//**
+Builds a row reference from an undo log record.
+@return pointer to remaining part of undo record */
+const byte*
+trx_undo_rec_get_row_ref(
+/*=====================*/
+	const byte*	ptr,	/*!< in: remaining part of a copy of an undo log
+				record, at the start of the row reference;
+				NOTE that this copy of the undo log record must
+				be preserved as long as the row reference is
+				used, as we do NOT copy the data in the
+				record! */
+	dict_index_t*	index,	/*!< in: clustered index */
+	const dtuple_t**ref,	/*!< out, own: row reference */
+	mem_heap_t*	heap)	/*!< in: memory heap from which the memory
+				needed is allocated */
+	MY_ATTRIBUTE((nonnull));
+/**********************************************************************//**
+Reads from an undo log update record the system field values of the old
+version.
+@return remaining part of undo log record after reading these values */
+byte*
+trx_undo_update_rec_get_sys_cols(
+/*=============================*/
+	const byte*	ptr,		/*!< in: remaining part of undo
+					log record after reading
+					general parameters */
+	trx_id_t*	trx_id,		/*!< out: trx id */
+	roll_ptr_t*	roll_ptr,	/*!< out: roll ptr */
+	byte*		info_bits);	/*!< out: info bits state */
+/*******************************************************************//**
+Builds an update vector based on a remaining part of an undo log record.
+@return remaining part of the record, NULL if an error detected, which
+means that the record is corrupted */
+byte*
+trx_undo_update_rec_get_update(
+/*===========================*/
+	const byte*	ptr,	/*!< in: remaining part in update undo log
+				record, after reading the row reference
+				NOTE that this copy of the undo log record must
+				be preserved as long as the update vector is
+				used, as we do NOT copy the data in the
+				record! */
+	dict_index_t*	index,	/*!< in: clustered index */
+	ulint		type,	/*!< in: TRX_UNDO_UPD_EXIST_REC,
+				TRX_UNDO_UPD_DEL_REC, or
+				TRX_UNDO_DEL_MARK_REC; in the last case,
+				only trx id and roll ptr fields are added to
+				the update vector */
+	trx_id_t	trx_id,	/*!< in: transaction id from this undo record */
+	roll_ptr_t	roll_ptr,/*!< in: roll pointer from this undo record */
+	byte		info_bits,/*!< in: info bits from this undo record */
+	mem_heap_t*	heap,	/*!< in: memory heap from which the memory
+				needed is allocated */
+	upd_t**		upd);	/*!< out, own: update vector */
+/** Report a RENAME TABLE operation.
+@param[in,out]	trx	transaction
+@param[in]	table	table that is being renamed
+@return	DB_SUCCESS or error code */
+dberr_t trx_undo_report_rename(trx_t* trx, const dict_table_t* table)
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/***********************************************************************//**
+Writes information to an undo log about an insert, update, or a delete marking
+of a clustered index record.
+This information is used in a rollback of the transaction and in
+consistent reads that must look to the history of this transaction.
+@return DB_SUCCESS or error code */
+dberr_t
+trx_undo_report_row_operation(
+/*==========================*/
+	que_thr_t*	thr,		/*!< in: query thread */
+	dict_index_t*	index,		/*!< in: clustered index */
+	const dtuple_t*	clust_entry,	/*!< in: in the case of an insert,
+					index entry to insert into the
+					clustered index; in updates,
+					may contain a clustered index
+					record tuple that also contains
+					virtual columns of the table;
+					otherwise, NULL */
+	const upd_t*	update,		/*!< in: in the case of an update,
+					the update vector, otherwise NULL */
+	ulint		cmpl_info,	/*!< in: compiler info on secondary
+					index updates */
+	const rec_t*	rec,		/*!< in: case of an update or delete
+					marking, the record in the clustered
+					index; NULL if insert */
+	const rec_offs*	offsets,	/*!< in: rec_get_offsets(rec) */
+	roll_ptr_t*	roll_ptr)	/*!< out: DB_ROLL_PTR to the
+					undo log record */
+	MY_ATTRIBUTE((nonnull(1,2), warn_unused_result));
+
+/** status bits used for trx_undo_prev_version_build() */
+
+/** TRX_UNDO_PREV_IN_PURGE tells trx_undo_prev_version_build() that it
+is being called from purge, and that we would like to get the previous
+version even if it is in the purge view (in the normal case, it would
+return without fetching such a record). */
+static constexpr ulint	TRX_UNDO_PREV_IN_PURGE = 1;
+
+/** This tells trx_undo_prev_version_build() to fetch the old value in
+the undo log (which is the after image for an update) */
+static constexpr ulint	TRX_UNDO_GET_OLD_V_VALUE = 2;
+
+/** indicates a call from row_vers_old_has_index_entry() */
+static constexpr ulint	TRX_UNDO_CHECK_PURGEABILITY = 4;
+
+/** Build a previous version of a clustered index record. The caller
+must hold a latch on the index page of the clustered index record.
+@param rec      version of a clustered index record
+@param index    clustered index
+@param offsets  rec_get_offsets(rec, index)
+@param heap     memory heap from which the memory needed is
+                allocated
+@param old_vers previous version or NULL if rec is the
+                first inserted version, or if history data
+                has been deleted (an error), or if the purge
+                could have removed the version
+                though it has not yet done so
+@param v_heap   memory heap used to create vrow
+                dtuple if it is not yet created. This heap
+                differs from "heap" above in that it could be
+                prebuilt->old_vers_heap for selection
+@param vrow     virtual column info, if any
+@param v_status status flags determining whether this function is
+                being invoked by the purge thread, and whether the
+                "after image" of the undo log should be read
+@return error code
+@retval DB_SUCCESS if previous version was successfully built,
+or if it was an insert or the undo record refers to the table before rebuild
+@retval DB_MISSING_HISTORY if the history is missing */
+dberr_t
+trx_undo_prev_version_build(
+	const rec_t *rec,
+	dict_index_t *index,
+	rec_offs *offsets,
+	mem_heap_t *heap,
+	rec_t **old_vers,
+	mem_heap_t *v_heap,
+	dtuple_t **vrow,
+	ulint v_status);
+
+/** Read from an undo log record a non-virtual column value.
+@param ptr      pointer to remaining part of the undo record
+@param field    stored field
+@param len      length of the field, or UNIV_SQL_NULL
+@param orig_len original length of the locally stored part
+of an externally stored column, or 0
+@return remaining part of undo log record after reading these values */
+const byte *trx_undo_rec_get_col_val(const byte *ptr, const byte **field,
+                                     uint32_t *len, uint32_t *orig_len);
+
+/** Read virtual column value from undo log
+@param[in]	table		the table
+@param[in]	ptr		undo log pointer
+@param[in,out]	row		the dtuple to fill
+@param[in]	in_purge	whether this is called by purge */
+void
+trx_undo_read_v_cols(
+	const dict_table_t*	table,
+	const byte*		ptr,
+	dtuple_t*		row,
+	bool			in_purge);
+
+/** Read virtual column index from undo log if the undo log contains such
+info, and verify the column is still indexed, and output its position
+@param[in]	table		the table
+@param[in]	ptr		undo log pointer
+@param[in]	first_v_col	if this is the first virtual column, which
+				has the version marker
+@param[in,out]	is_undo_log	this function is used to parse both the undo
+				log and the online log for virtual columns;
+				whether we are parsing the undo log
+@param[out]	field_no	the column number, or FIL_NULL if not indexed
+@return remaining part of undo log record after reading these values */
+const byte*
+trx_undo_read_v_idx(
+	const dict_table_t*	table,
+	const byte*		ptr,
+	bool			first_v_col,
+	bool*			is_undo_log,
+	uint32_t*		field_no);
+
+/* Types of an undo log record: these have to be smaller than 16, as the
+compilation info multiplied by 16 is ORed to this value in an undo log
+record */
+
+/** Undo log records for DDL operations
+
+Note: special rollback and purge triggers exist for SYS_INDEXES records:
+@see dict_drop_index_tree() */
+enum trx_undo_ddl_type
+{
+  /** RENAME TABLE (logging the old table name).
+
+  Because SYS_TABLES has PRIMARY KEY(NAME), the row-level undo log records
+  for SYS_TABLES cannot be distinguished from DROP TABLE, CREATE TABLE. */
+  TRX_UNDO_RENAME_TABLE= 9,
+  /** insert a metadata pseudo-record for instant ALTER TABLE */
+  TRX_UNDO_INSERT_METADATA= 10
+};
+
+/* DML operations */
+#define	TRX_UNDO_INSERT_REC	11	/* fresh insert into clustered index */
+#define	TRX_UNDO_UPD_EXIST_REC	12	/* update of a non-delete-marked
+					record */
+#define	TRX_UNDO_UPD_DEL_REC	13	/* update of a delete marked record to
+					a not delete marked record; also the
+					fields of the record can change */
+#define	TRX_UNDO_DEL_MARK_REC	14	/* delete marking of a record; fields
+					do not change */
+/** Bulk insert operation. It is written only when the table is
+under exclusive lock and the clustered index root page latch is being held,
+and the clustered index is empty. Rollback will empty the table and
+free the leaf segment of all indexes, re-create the new
+leaf segment and re-initialize the root page alone. */
+#define	TRX_UNDO_EMPTY		15
+
+#define	TRX_UNDO_CMPL_INFO_MULT	16U	/* compilation info is multiplied by
+					this and ORed to the type above */
+#define	TRX_UNDO_UPD_EXTERN	128U	/* This bit can be ORed to type_cmpl
+					to denote that we updated external
+					storage fields: used by purge to
+					free the external storage */
+
+/** The search tuple corresponding to TRX_UNDO_INSERT_METADATA */
+extern const dtuple_t trx_undo_metadata;
+
+/** Read the table id from an undo log record.
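+The id follows the two-byte pointer to the next record, the type byte
+and the much-compressed undo number; compare trx_undo_rec_get_undo_no()
+above, which reads at offset 3.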
+@param[in] rec Undo log record +@return table id stored as a part of undo log record */ +inline table_id_t trx_undo_rec_get_table_id(const trx_undo_rec_t *rec) +{ + rec+= 3; + mach_read_next_much_compressed(&rec); + return mach_read_next_much_compressed(&rec); +} diff --git a/storage/innobase/include/trx0roll.h b/storage/innobase/include/trx0roll.h new file mode 100644 index 00000000..9ef9ebe9 --- /dev/null +++ b/storage/innobase/include/trx0roll.h @@ -0,0 +1,168 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2015, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0roll.h +Transaction rollback + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#ifndef trx0roll_h +#define trx0roll_h + +#include "trx0trx.h" +#include "mtr0mtr.h" +#include "trx0sys.h" + +extern bool trx_rollback_is_active; +extern const trx_t* trx_roll_crash_recv_trx; + +/** Report progress when rolling back a row of a recovered transaction. */ +void trx_roll_report_progress(); +/*******************************************************************//** +Rollback or clean up any incomplete transactions which were +encountered in crash recovery. If the transaction already was +committed, then we clean up a possible insert undo log. If the +transaction was not yet committed, then we roll it back. +@param all true=roll back all recovered active transactions; +false=roll back any incomplete dictionary transaction */ +void +trx_rollback_recovered(bool all); +/*******************************************************************//** +Rollback or clean up any incomplete transactions which were +encountered in crash recovery. If the transaction already was +committed, then we clean up a possible insert undo log. If the +transaction was not yet committed, then we roll it back. +Note: this is done in a background thread. */ +void trx_rollback_all_recovered(void*); +/*********************************************************************//** +Creates a rollback command node struct. +@return own: rollback node struct */ +roll_node_t* +roll_node_create( +/*=============*/ + mem_heap_t* heap); /*!< in: mem heap where created */ +/***********************************************************//** +Performs an execution step for a rollback command node in a query graph. +@return query thread to run next, or NULL */ +que_thr_t* +trx_rollback_step( +/*==============*/ + que_thr_t* thr); /*!< in: query thread */ +/*******************************************************************//** +Rollback a transaction used in MySQL. 
+@return error code or DB_SUCCESS */
+dberr_t
+trx_rollback_for_mysql(
+/*===================*/
+	trx_t*	trx)	/*!< in/out: transaction */
+	MY_ATTRIBUTE((nonnull));
+/*******************************************************************//**
+Rollback the latest SQL statement for MySQL.
+@return error code or DB_SUCCESS */
+dberr_t
+trx_rollback_last_sql_stat_for_mysql(
+/*=================================*/
+	trx_t*	trx)	/*!< in/out: transaction */
+	MY_ATTRIBUTE((nonnull));
+/*******************************************************************//**
+Rolls back a transaction to a named savepoint. Modifications after the
+savepoint are undone but InnoDB does NOT release the corresponding locks
+which are stored in memory. If a lock is 'implicit', that is, a new inserted
+row holds a lock where the lock information is carried by the trx id stored in
+the row, these locks are naturally released in the rollback. Savepoints which
+were set after this savepoint are deleted.
+@return DB_NO_SAVEPOINT if no savepoint with the given name is found,
+otherwise DB_SUCCESS */
+dberr_t
+trx_rollback_to_savepoint_for_mysql(
+/*================================*/
+	trx_t*		trx,			/*!< in: transaction handle */
+	const char*	savepoint_name,		/*!< in: savepoint name */
+	int64_t*	mysql_binlog_cache_pos)	/*!< out: the MySQL binlog cache
+						position corresponding to this
+						savepoint; MySQL needs this
+						information to remove the
+						binlog entries of the queries
+						executed after the savepoint */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+/*******************************************************************//**
+Creates a named savepoint. If the transaction is not yet started, starts it.
+If there is already a savepoint of the same name, this call erases that old
+savepoint and replaces it with a new one. Savepoints are deleted in a
+transaction commit or rollback.
+@return always DB_SUCCESS */
+dberr_t
+trx_savepoint_for_mysql(
+/*====================*/
+	trx_t*		trx,			/*!< in: transaction handle */
+	const char*	savepoint_name,		/*!< in: savepoint name */
+	int64_t		binlog_cache_pos)	/*!< in: MySQL binlog cache
+						position corresponding to this
+						connection at the time of the
+						savepoint */
+	MY_ATTRIBUTE((nonnull));
+/*******************************************************************//**
+Releases a named savepoint. Savepoints which
+were set after this savepoint are deleted.
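+
+A minimal caller sketch (illustration only; the savepoint name "s1" and the
+binlog cache position 0 are assumed values, not taken from a real call site):
+
+	trx_savepoint_for_mysql(trx, "s1", 0);
+	...	/* execute more statements on trx */
+	if (trx_release_savepoint_for_mysql(trx, "s1") == DB_NO_SAVEPOINT)
+	{
+		/* no savepoint named "s1" existed */
+	}
+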
+@return DB_NO_SAVEPOINT if no savepoint with the given name is found,
+otherwise DB_SUCCESS */
+dberr_t
+trx_release_savepoint_for_mysql(
+/*============================*/
+	trx_t*		trx,			/*!< in: transaction handle */
+	const char*	savepoint_name)		/*!< in: savepoint name */
+	MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Rollback node states */
+enum roll_node_state {
+	ROLL_NODE_NONE = 0,		/*!< Unknown state */
+	ROLL_NODE_SEND,			/*!< about to send a rollback signal to
+					the transaction */
+	ROLL_NODE_WAIT			/*!< rollback signal sent to the
+					transaction, waiting for completion */
+};
+
+/** Rollback command node in a query graph */
+struct roll_node_t{
+	que_common_t	common;	/*!< node type: QUE_NODE_ROLLBACK */
+	enum roll_node_state	state;	/*!< node execution state */
+	const trx_savept_t*	savept;	/*!< savepoint to which to
+					roll back, in the case of a
+					partial rollback */
+	que_thr_t*	undo_thr;/*!< undo query graph */
+};
+
+/** A savepoint set with SQL's "SAVEPOINT savepoint_id" command */
+struct trx_named_savept_t{
+	char*		name;		/*!< savepoint name */
+	trx_savept_t	savept;		/*!< the undo number corresponding to
+					the savepoint */
+	int64_t		mysql_binlog_cache_pos;
+					/*!< the MySQL binlog cache position
+					corresponding to this savepoint, not
+					defined if the MySQL binlogging is not
+					enabled */
+	UT_LIST_NODE_T(trx_named_savept_t)
+			trx_savepoints;	/*!< the list of savepoints of a
+					transaction */
+};
+
+#endif
diff --git a/storage/innobase/include/trx0rseg.h b/storage/innobase/include/trx0rseg.h
new file mode 100644
index 00000000..43e0c290
--- /dev/null
+++ b/storage/innobase/include/trx0rseg.h
@@ -0,0 +1,301 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0rseg.h
+Rollback segment
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+#include "trx0types.h"
+#include "fut0lst.h"
+
+/** Create a rollback segment header.
+@param[in,out]  space       system, undo, or temporary tablespace
+@param[in]      rseg_id     rollback segment identifier
+@param[in]      max_trx_id  new value of TRX_RSEG_MAX_TRX_ID
+@param[in,out]  mtr         mini-transaction
+@param[out]     err         error code
+@return the created rollback segment
+@retval nullptr on failure */
+buf_block_t *trx_rseg_header_create(fil_space_t *space, ulint rseg_id,
+                                    trx_id_t max_trx_id, mtr_t *mtr,
+                                    dberr_t *err)
+  MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Initialize or recover the rollback segments at startup. */
+dberr_t trx_rseg_array_init();
+
+/** Create the temporary rollback segments.
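+@param[in,out]  mtr  mini-transaction
+@return error code */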
+dberr_t trx_temp_rseg_create(mtr_t *mtr);
+
+/* Number of undo log slots in a rollback segment file copy */
+#define TRX_RSEG_N_SLOTS	(srv_page_size / 16)
+
+/* Maximum number of transactions supported by a single rollback segment */
+#define TRX_RSEG_MAX_N_TRXS	(TRX_RSEG_N_SLOTS / 2)
+
+/** The rollback segment memory object */
+struct alignas(CPU_LEVEL1_DCACHE_LINESIZE) trx_rseg_t
+{
+  /** tablespace containing the rollback segment; constant after init() */
+  fil_space_t *space;
+  /** latch protecting everything except page_no, space */
+  srw_spin_lock latch;
+  /** rollback segment header page number; constant after init() */
+  uint32_t page_no;
+  /** length of the TRX_RSEG_HISTORY list (number of transactions) */
+  uint32_t history_size;
+
+  /** Last known transaction that has not been purged yet,
+  or 0 if everything has been purged. */
+  trx_id_t needs_purge;
+
+private:
+  /** Reference counter to track is_persistent() transactions,
+  with SKIP flag. */
+  std::atomic<uint32_t> ref;
+
+  /** Whether undo tablespace truncation is pending */
+  static constexpr uint32_t SKIP= 1;
+  /** Transaction reference count multiplier */
+  static constexpr uint32_t REF= 2;
+
+  uint32_t ref_load() const { return ref.load(std::memory_order_relaxed); }
+
+  /** Set the SKIP bit */
+  void ref_set_skip()
+  {
+    static_assert(SKIP == 1U, "compatibility");
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+    __asm__ __volatile__("lock btsl $0, %0" : "+m" (ref));
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+    _interlockedbittestandset(reinterpret_cast<volatile long*>(&ref), 0);
+#else
+    ref.fetch_or(SKIP, std::memory_order_relaxed);
+#endif
+  }
+  /** Clear a bit in ref */
+  void ref_reset_skip()
+  {
+    static_assert(SKIP == 1U, "compatibility");
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+    __asm__ __volatile__("lock btrl $0, %0" : "+m" (ref));
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+    _interlockedbittestandreset(reinterpret_cast<volatile long*>(&ref), 0);
+#else
+    ref.fetch_and(~SKIP, std::memory_order_relaxed);
+#endif
+  }
+
+public:
+
+  /** Initialize the fields that are not zero-initialized. */
+  void init(fil_space_t *space, uint32_t page);
+  /** Reinitialize the fields on undo tablespace truncation. */
+  void reinit(uint32_t page);
+  /** Clean up. */
+  void destroy();
+
+  /** Note that undo tablespace truncation was started. */
+  void set_skip_allocation() { ut_ad(is_persistent()); ref_set_skip(); }
+  /** Note that undo tablespace truncation was completed. */
+  void clear_skip_allocation()
+  {
+    ut_ad(is_persistent());
+#if defined DBUG_OFF
+    ref_reset_skip();
+#else
+    ut_d(auto r=) ref.fetch_and(~SKIP, std::memory_order_relaxed);
+    ut_ad(r == SKIP);
+#endif
+  }
+  /** @return whether the segment is marked for undo truncation */
+  bool skip_allocation() const
+  { return ref.load(std::memory_order_acquire) & SKIP; }
+  /** Increment the reference count */
+  void acquire()
+  { ut_d(auto r=) ref.fetch_add(REF); ut_ad(!(r & SKIP)); }
+  /** Increment the reference count if possible
+  @retval true   if the reference count was incremented
+  @retval false  if skip_allocation() holds */
+  bool acquire_if_available()
+  {
+    uint32_t r= 0;
+    while (!ref.compare_exchange_weak(r, r + REF,
+                                      std::memory_order_relaxed,
+                                      std::memory_order_relaxed))
+      if (r & SKIP)
+        return false;
+    return true;
+  }
+
+  /** Decrement the reference count */
+  void release()
+  {
+    ut_d(const auto r=)
+    ref.fetch_sub(REF, std::memory_order_relaxed);
+    ut_ad(r >= REF);
+  }
+  /** @return whether references exist */
+  bool is_referenced() const { return ref_load() >= REF; }
+
+  /** current size in pages */
+  uint32_t curr_size;
+
+  /** List of undo logs (transactions) */
+  UT_LIST_BASE_NODE_T(trx_undo_t) undo_list;
+  /** List of undo log segments cached for fast reuse */
+  UT_LIST_BASE_NODE_T(trx_undo_t) undo_cached;
+
+  /** Last not yet purged undo log header; FIL_NULL if all purged */
+  uint32_t last_page_no;
+
+  /** trx_t::no | last_offset << 48 */
+  uint64_t last_commit_and_offset;
+
+  /** @return the commit ID of the last committed transaction */
+  trx_id_t last_trx_no() const
+  { return last_commit_and_offset & ((1ULL << 48) - 1); }
+  /** @return header offset of the last committed transaction */
+  uint16_t last_offset() const
+  { return static_cast<uint16_t>(last_commit_and_offset >> 48); }
+
+  void set_last_commit(uint16_t last_offset, trx_id_t trx_no)
+  {
+    last_commit_and_offset= static_cast<uint64_t>(last_offset) << 48 | trx_no;
+  }
+
+  /** @return the page identifier */
+  page_id_t page_id() const { return page_id_t{space->id, page_no}; }
+
+  /** @return the rollback segment header page, exclusively latched */
+  buf_block_t *get(mtr_t *mtr, dberr_t *err) const;
+
+  /** @return whether the rollback segment is persistent */
+  bool is_persistent() const
+  {
+    ut_ad(space == fil_system.temp_space || space == fil_system.sys_space ||
+          (srv_undo_space_id_start > 0 &&
+           space->id >= srv_undo_space_id_start &&
+           space->id <= srv_undo_space_id_start + TRX_SYS_MAX_UNDO_SPACES));
+    ut_ad(space == fil_system.temp_space || space == fil_system.sys_space ||
+          !srv_was_started ||
+          (srv_undo_space_id_start > 0 &&
+           space->id >= srv_undo_space_id_start
+           && space->id <= srv_undo_space_id_start
+           + srv_undo_tablespaces_open));
+    return space->id != SRV_TMP_SPACE_ID;
+  }
+};
+
+/* Undo log segment slot in a rollback segment header */
+/*-------------------------------------------------------------*/
+#define TRX_RSEG_SLOT_PAGE_NO	0	/* Page number of the header page of
+					an undo log segment */
+/*-------------------------------------------------------------*/
+/* Slot size */
+#define TRX_RSEG_SLOT_SIZE	4
+
+/* The offset of the rollback segment header on its page */
+#define TRX_RSEG	FSEG_PAGE_DATA
+
+/* Transaction rollback segment header */
+/*-------------------------------------------------------------*/
+/** 0xfffffffe = pre-MariaDB 10.3.5 format; 0=MariaDB 10.3.5 or later */
+#define TRX_RSEG_FORMAT		0
+/** Number of pages in the TRX_RSEG_HISTORY list */
+#define TRX_RSEG_HISTORY_SIZE	4
+/** Committed
transaction logs that have not been purged yet */ +#define TRX_RSEG_HISTORY 8 +#define TRX_RSEG_FSEG_HEADER (8 + FLST_BASE_NODE_SIZE) + /* Header for the file segment where + this page is placed */ +#define TRX_RSEG_UNDO_SLOTS (8 + FLST_BASE_NODE_SIZE + FSEG_HEADER_SIZE) + /* Undo log segment slots */ +/** Maximum transaction ID (valid only if TRX_RSEG_FORMAT is 0) */ +#define TRX_RSEG_MAX_TRX_ID (TRX_RSEG_UNDO_SLOTS + TRX_RSEG_N_SLOTS \ + * TRX_RSEG_SLOT_SIZE) + +/** 8 bytes offset within the binlog file */ +#define TRX_RSEG_BINLOG_OFFSET TRX_RSEG_MAX_TRX_ID + 8 +/** MySQL log file name, 512 bytes, including terminating NUL +(valid only if TRX_RSEG_FORMAT is 0). +If no binlog information is present, the first byte is NUL. */ +#define TRX_RSEG_BINLOG_NAME TRX_RSEG_MAX_TRX_ID + 16 +/** Maximum length of binlog file name, including terminating NUL, in bytes */ +#define TRX_RSEG_BINLOG_NAME_LEN 512 + +#ifdef WITH_WSREP +# include "trx0xa.h" + +/** Update the WSREP XID information in rollback segment header. +@param[in,out] rseg_header rollback segment header +@param[in] xid WSREP XID +@param[in,out] mtr mini-transaction */ +void +trx_rseg_update_wsrep_checkpoint( + buf_block_t* rseg_header, + const XID* xid, + mtr_t* mtr); + +/** Update WSREP checkpoint XID in first rollback segment header +as part of wsrep_set_SE_checkpoint() when it is guaranteed that there +are no wsrep transactions committing. +If the UUID part of the WSREP XID does not match to the UUIDs of XIDs already +stored into rollback segments, the WSREP XID in all the remaining rollback +segments will be reset. +@param[in] xid WSREP XID */ +void trx_rseg_update_wsrep_checkpoint(const XID* xid); + +/** Recover the latest WSREP checkpoint XID. +@param[out] xid WSREP XID +@return whether the WSREP XID was found */ +bool trx_rseg_read_wsrep_checkpoint(XID& xid); +#endif /* WITH_WSREP */ + +/** Read the page number of an undo log slot. +@param[in] rseg_header rollback segment header +@param[in] n slot number */ +inline uint32_t trx_rsegf_get_nth_undo(const buf_block_t *rseg_header, ulint n) +{ + ut_ad(n < TRX_RSEG_N_SLOTS); + return mach_read_from_4(TRX_RSEG + TRX_RSEG_UNDO_SLOTS + + n * TRX_RSEG_SLOT_SIZE + rseg_header->page.frame); +} + +/** Upgrade a rollback segment header page to MariaDB 10.3 format. +@param[in,out] rseg_header rollback segment header page +@param[in,out] mtr mini-transaction */ +void trx_rseg_format_upgrade(buf_block_t *rseg_header, mtr_t *mtr); + +/** Update the offset information about the end of the binlog entry +which corresponds to the transaction just being committed. +In a replication slave, this updates the master binlog position +up to which replication has proceeded. +@param[in,out] rseg_header rollback segment header +@param[in] log_file_name binlog file name +@param[in] log_offset binlog offset value +@param[in,out] mtr mini-transaction */ +void trx_rseg_update_binlog_offset(buf_block_t *rseg_header, + const char *log_file_name, + ulonglong log_offset, + mtr_t *mtr); diff --git a/storage/innobase/include/trx0sys.h b/storage/innobase/include/trx0sys.h new file mode 100644 index 00000000..5dd0169f --- /dev/null +++ b/storage/innobase/include/trx0sys.h @@ -0,0 +1,1274 @@ +/***************************************************************************** + +Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. 
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0sys.h
+Transaction system
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+#include "buf0buf.h"
+#include "fil0fil.h"
+#include "trx0rseg.h"
+#include "mem0mem.h"
+#include "mtr0mtr.h"
+#include "ut0byte.h"
+#include "ut0lst.h"
+#include "read0types.h"
+#include "page0types.h"
+#include "trx0trx.h"
+#include "ilist.h"
+#include "my_cpu.h"
+
+#ifdef UNIV_PFS_MUTEX
+extern mysql_pfs_key_t trx_sys_mutex_key;
+#endif
+
+/** Checks if a page address is the trx sys header page.
+@param[in]  page_id  page id
+@return true if trx sys header page */
+inline bool trx_sys_hdr_page(const page_id_t page_id)
+{
+  return page_id == page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO);
+}
+
+/*****************************************************************//**
+Creates and initializes the transaction system at the database creation. */
+dberr_t trx_sys_create_sys_pages(mtr_t *mtr);
+
+/** Find an available rollback segment.
+@param[in]  sys_header
+@return an unallocated rollback segment slot in the TRX_SYS header
+@retval ULINT_UNDEFINED if not found */
+ulint
+trx_sys_rseg_find_free(const buf_block_t* sys_header);
+/** Request the TRX_SYS page.
+@param[in]  rw  whether to lock the page for writing
+@return the TRX_SYS page
+@retval NULL if the page cannot be read */
+inline buf_block_t *trx_sysf_get(mtr_t* mtr, bool rw= true)
+{
+  return buf_page_get(page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO),
+                      0, rw ? RW_X_LATCH : RW_S_LATCH, mtr);
+}
+
+#ifdef UNIV_DEBUG
+/* Flag to control TRX_RSEG_N_SLOTS behavior debugging. */
+extern uint	trx_rseg_n_slots_debug;
+#endif
+
+/** Write DB_TRX_ID.
+@param[out]  db_trx_id  the DB_TRX_ID field to be written to
+@param[in]   id         transaction ID */
+UNIV_INLINE
+void
+trx_write_trx_id(byte* db_trx_id, trx_id_t id)
+{
+	compile_time_assert(DATA_TRX_ID_LEN == 6);
+	mach_write_to_6(db_trx_id, id);
+}
+
+/** Read a transaction identifier.
+@return id */
+inline
+trx_id_t
+trx_read_trx_id(const byte* ptr)
+{
+	compile_time_assert(DATA_TRX_ID_LEN == 6);
+	return(mach_read_from_6(ptr));
+}
+
+#ifdef UNIV_DEBUG
+/** Check that the DB_TRX_ID in a record is valid.
+@param[in]  db_trx_id  the DB_TRX_ID column to validate
+@param[in]  trx_id     the id of the ALTER TABLE transaction */
+inline bool trx_id_check(const void* db_trx_id, trx_id_t trx_id)
+{
+	trx_id_t id = trx_read_trx_id(static_cast<const byte*>(db_trx_id));
+	ut_ad(id == 0 || id > trx_id);
+	return true;
+}
+#endif
+
+/*****************************************************************//**
+Updates the offset information about the end of the MySQL binlog entry
+which corresponds to the transaction just being committed. In a MySQL
+replication slave, this updates the latest master binlog position up to which
+replication has proceeded. */
+void
+trx_sys_update_mysql_binlog_offset(
+/*===============================*/
+	const char*	file_name,/*!< in: MySQL log file name */
+	int64_t		offset,	/*!< in: position in that log file */
+	buf_block_t*	sys_header, /*!< in,out: trx sys header */
+	mtr_t*		mtr);	/*!< in,out: mini-transaction */
+/** Display the MySQL binlog offset info if it is present in the trx
+system header. */
+void
+trx_sys_print_mysql_binlog_offset();
+
+/** Create the rollback segments.
+@return whether the creation succeeded */
+bool
+trx_sys_create_rsegs();
+
+/** The offset of the transaction system header on the page */
+#define TRX_SYS		FSEG_PAGE_DATA
+
+/** Transaction system header */
+/*------------------------------------------------------------- @{ */
+/** In old versions of InnoDB, this persisted the value of
+trx_sys.get_max_trx_id(). Starting with MariaDB 10.3.5,
+the field TRX_RSEG_MAX_TRX_ID in rollback segment header pages
+and the fields TRX_UNDO_TRX_ID, TRX_UNDO_TRX_NO in undo log pages
+are used instead. The field only exists for the purpose of upgrading
+from older MySQL or MariaDB versions. */
+#define TRX_SYS_TRX_ID_STORE	0
+#define TRX_SYS_FSEG_HEADER	8	/*!< segment header for the
+					tablespace segment the trx
+					system is created into */
+#define TRX_SYS_RSEGS		(8 + FSEG_HEADER_SIZE)
+					/*!< the start of the array of
+					rollback segment specification
+					slots */
+
+/* Rollback segment specification slot offsets */
+
+/** the tablespace ID of an undo log header; starting with
+MySQL/InnoDB 5.1.7, this is FIL_NULL if the slot is unused */
+#define TRX_SYS_RSEG_SPACE	0
+/** the page number of an undo log header, or FIL_NULL if unused */
+#define TRX_SYS_RSEG_PAGE_NO	4
+/** Size of a rollback segment specification slot */
+#define TRX_SYS_RSEG_SLOT_SIZE	8
+
+/** Read the tablespace ID of a rollback segment slot.
+@param[in]  sys_header  TRX_SYS page
+@param[in]  rseg_id     rollback segment identifier
+@return undo tablespace id */
+inline
+uint32_t
+trx_sysf_rseg_get_space(const buf_block_t* sys_header, ulint rseg_id)
+{
+	ut_ad(rseg_id < TRX_SYS_N_RSEGS);
+	return mach_read_from_4(TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_SPACE
+				+ rseg_id * TRX_SYS_RSEG_SLOT_SIZE
+				+ sys_header->page.frame);
+}
+
+/** Read the page number of a rollback segment slot.
+@param[in]  sys_header  TRX_SYS page
+@param[in]  rseg_id     rollback segment identifier
+@return undo page number */
+inline uint32_t
+trx_sysf_rseg_get_page_no(const buf_block_t *sys_header, ulint rseg_id)
+{
+  ut_ad(rseg_id < TRX_SYS_N_RSEGS);
+  return mach_read_from_4(TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_PAGE_NO +
+                          rseg_id * TRX_SYS_RSEG_SLOT_SIZE +
+                          sys_header->page.frame);
+}
+
+/** Maximum length of MySQL binlog file name, in bytes.
+(Used before MariaDB 10.3.5.)
*/ +#define TRX_SYS_MYSQL_LOG_NAME_LEN 512 +/** Contents of TRX_SYS_MYSQL_LOG_MAGIC_N_FLD */ +#define TRX_SYS_MYSQL_LOG_MAGIC_N 873422344 + +#if UNIV_PAGE_SIZE_MIN < 4096 +# error "UNIV_PAGE_SIZE_MIN < 4096" +#endif +/** The offset of the MySQL binlog offset info in the trx system header */ +#define TRX_SYS_MYSQL_LOG_INFO (srv_page_size - 1000) +#define TRX_SYS_MYSQL_LOG_MAGIC_N_FLD 0 /*!< magic number which is + TRX_SYS_MYSQL_LOG_MAGIC_N + if we have valid data in the + MySQL binlog info */ +#define TRX_SYS_MYSQL_LOG_OFFSET 4 /*!< the 64-bit offset + within that file */ +#define TRX_SYS_MYSQL_LOG_NAME 12 /*!< MySQL log file name */ + +/** Memory map TRX_SYS_PAGE_NO = 5 when srv_page_size = 4096 + +0...37 FIL_HEADER +38...45 TRX_SYS_TRX_ID_STORE +46...55 TRX_SYS_FSEG_HEADER (FSEG_HEADER_SIZE == 10) +56 TRX_SYS_RSEGS + 56...59 TRX_SYS_RSEG_SPACE for slot 0 + 60...63 TRX_SYS_RSEG_PAGE_NO for slot 0 + 64...67 TRX_SYS_RSEG_SPACE for slot 1 + 68...71 TRX_SYS_RSEG_PAGE_NO for slot 1 +.... + 594..597 TRX_SYS_RSEG_SPACE for slot 72 + 598..601 TRX_SYS_RSEG_PAGE_NO for slot 72 +... + ...1063 TRX_SYS_RSEG_PAGE_NO for slot 126 + +(srv_page_size-3500 WSREP ::: FAIL would overwrite undo tablespace +space_id, page_no pairs :::) +596 TRX_SYS_WSREP_XID_INFO TRX_SYS_WSREP_XID_MAGIC_N_FLD +600 TRX_SYS_WSREP_XID_FORMAT +604 TRX_SYS_WSREP_XID_GTRID_LEN +608 TRX_SYS_WSREP_XID_BQUAL_LEN +612 TRX_SYS_WSREP_XID_DATA (len = 128) +739 TRX_SYS_WSREP_XID_DATA_END + +FIXED WSREP XID info offsets for 4k page size 10.0.32-galera +(srv_page_size-2500) +1596 TRX_SYS_WSREP_XID_INFO TRX_SYS_WSREP_XID_MAGIC_N_FLD +1600 TRX_SYS_WSREP_XID_FORMAT +1604 TRX_SYS_WSREP_XID_GTRID_LEN +1608 TRX_SYS_WSREP_XID_BQUAL_LEN +1612 TRX_SYS_WSREP_XID_DATA (len = 128) +1739 TRX_SYS_WSREP_XID_DATA_END + +(srv_page_size - 2000 MYSQL MASTER LOG) +2096 TRX_SYS_MYSQL_MASTER_LOG_INFO TRX_SYS_MYSQL_LOG_MAGIC_N_FLD +2100 TRX_SYS_MYSQL_LOG_OFFSET_HIGH +2104 TRX_SYS_MYSQL_LOG_OFFSET_LOW +2108 TRX_SYS_MYSQL_LOG_NAME + +(srv_page_size - 1000 MYSQL LOG) +3096 TRX_SYS_MYSQL_LOG_INFO TRX_SYS_MYSQL_LOG_MAGIC_N_FLD +3100 TRX_SYS_MYSQL_LOG_OFFSET_HIGH +3104 TRX_SYS_MYSQL_LOG_OFFSET_LOW +3108 TRX_SYS_MYSQL_LOG_NAME + +(srv_page_size - 200 DOUBLEWRITE) +3896 TRX_SYS_DOUBLEWRITE TRX_SYS_DOUBLEWRITE_FSEG +3906 TRX_SYS_DOUBLEWRITE_MAGIC +3910 TRX_SYS_DOUBLEWRITE_BLOCK1 +3914 TRX_SYS_DOUBLEWRITE_BLOCK2 +3918 TRX_SYS_DOUBLEWRITE_REPEAT +3930 TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N + +(srv_page_size - 8, TAILER) +4088..4096 FIL_TAILER + +*/ +#ifdef WITH_WSREP +/** The offset to WSREP XID headers (used before MariaDB 10.3.5) */ +#define TRX_SYS_WSREP_XID_INFO std::max(srv_page_size - 3500, 1596UL) +#define TRX_SYS_WSREP_XID_MAGIC_N_FLD 0 +#define TRX_SYS_WSREP_XID_MAGIC_N 0x77737265 + +/** XID field: formatID, gtrid_len, bqual_len, xid_data */ +#define TRX_SYS_WSREP_XID_LEN (4 + 4 + 4 + XIDDATASIZE) +#define TRX_SYS_WSREP_XID_FORMAT 4 +#define TRX_SYS_WSREP_XID_GTRID_LEN 8 +#define TRX_SYS_WSREP_XID_BQUAL_LEN 12 +#define TRX_SYS_WSREP_XID_DATA 16 +#endif /* WITH_WSREP*/ + +/** Doublewrite buffer */ +/* @{ */ +/** The offset of the doublewrite buffer header on the trx system header page */ +#define TRX_SYS_DOUBLEWRITE (srv_page_size - 200) +/*-------------------------------------------------------------*/ +#define TRX_SYS_DOUBLEWRITE_FSEG 0 /*!< fseg header of the fseg + containing the doublewrite + buffer */ +#define TRX_SYS_DOUBLEWRITE_MAGIC FSEG_HEADER_SIZE + /*!< 4-byte magic number which + shows if we already have + created the doublewrite + buffer */ +#define 
TRX_SYS_DOUBLEWRITE_BLOCK1 (4 + FSEG_HEADER_SIZE)
+					/*!< page number of the
+					first page in the first
+					sequence of 64
+					(= FSP_EXTENT_SIZE) consecutive
+					pages in the doublewrite
+					buffer */
+#define TRX_SYS_DOUBLEWRITE_BLOCK2 (8 + FSEG_HEADER_SIZE)
+					/*!< page number of the
+					first page in the second
+					sequence of 64 consecutive
+					pages in the doublewrite
+					buffer */
+#define TRX_SYS_DOUBLEWRITE_REPEAT	12	/*!< we repeat
+					TRX_SYS_DOUBLEWRITE_MAGIC,
+					TRX_SYS_DOUBLEWRITE_BLOCK1,
+					TRX_SYS_DOUBLEWRITE_BLOCK2
+					so that if the trx sys
+					header is half-written
+					to disk, we still may
+					be able to recover the
+					information */
+/** If this is not yet set to TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
+we must reset the doublewrite buffer, because starting from 4.1.x the
+space id of a data page is stored into
+FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. */
+#define TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED	(24 + FSEG_HEADER_SIZE)
+
+/*-------------------------------------------------------------*/
+/** Contents of TRX_SYS_DOUBLEWRITE_MAGIC */
+constexpr uint32_t TRX_SYS_DOUBLEWRITE_MAGIC_N= 536853855;
+/** Contents of TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED */
+constexpr uint32_t TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N= 1783657386;
+/* @} */
+
+trx_t* current_trx();
+
+struct rw_trx_hash_element_t
+{
+  rw_trx_hash_element_t()
+  {
+    memset(reinterpret_cast<void*>(this), 0, sizeof *this);
+    mutex.init();
+  }
+
+
+  ~rw_trx_hash_element_t() { mutex.destroy(); }
+
+
+  trx_id_t id; /* lf_hash_init() relies on this to be first in the struct */
+
+  /**
+    Transaction serialization number.
+
+    Assigned shortly before the transaction is moved to COMMITTED_IN_MEMORY
+    state. Initially set to TRX_ID_MAX.
+  */
+  Atomic_counter<trx_id_t> no;
+  trx_t *trx;
+  srw_mutex mutex;
+};
+
+
+/**
+  Wrapper around LF_HASH to store set of in memory read-write transactions.
+*/
+
+class rw_trx_hash_t
+{
+  LF_HASH hash;
+
+
+  template <typename T>
+  using walk_action= my_bool(rw_trx_hash_element_t *element, T *action);
+
+
+  /**
+    Constructor callback for lock-free allocator.
+
+    Object is just allocated and is not yet accessible via rw_trx_hash by
+    concurrent threads. Object can be reused multiple times before it is
+    freed. Every time object is being reused initializer() callback is
+    called.
+  */
+
+  static void rw_trx_hash_constructor(uchar *arg)
+  {
+    new(arg + LF_HASH_OVERHEAD) rw_trx_hash_element_t();
+  }
+
+
+  /**
+    Destructor callback for lock-free allocator.
+
+    Object is about to be freed and is not accessible via rw_trx_hash by
+    concurrent threads.
+  */
+
+  static void rw_trx_hash_destructor(uchar *arg)
+  {
+    reinterpret_cast<rw_trx_hash_element_t*>
+      (arg + LF_HASH_OVERHEAD)->~rw_trx_hash_element_t();
+  }
+
+
+  /**
+    Destructor callback for lock-free allocator.
+
+    This destructor is used at shutdown. It frees remaining transaction
+    objects.
+
+    XA PREPARED transactions may remain if they haven't been committed or
+    rolled back. ACTIVE transactions may remain if startup was interrupted or
+    server is running in read-only mode or for certain srv_force_recovery
+    levels.
+  */
+
+  static void rw_trx_hash_shutdown_destructor(uchar *arg)
+  {
+    rw_trx_hash_element_t *element=
+      reinterpret_cast<rw_trx_hash_element_t*>(arg + LF_HASH_OVERHEAD);
+    if (trx_t *trx= element->trx)
+    {
+      ut_ad(trx_state_eq(trx, TRX_STATE_PREPARED) ||
+            trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED) ||
+            (trx_state_eq(trx, TRX_STATE_ACTIVE) &&
+             (!srv_was_started ||
+              srv_read_only_mode ||
+              srv_force_recovery >= SRV_FORCE_NO_TRX_UNDO)));
+      trx_free_at_shutdown(trx);
+    }
+    element->~rw_trx_hash_element_t();
+  }
+
+
+  /**
+    Initializer callback for lock-free hash.
+
+    Object is not yet accessible via rw_trx_hash by concurrent threads, but
+    is about to become such. Object id can be changed only by this callback
+    and remains the same until all pins to this object are released.
+
+    Object trx can be changed to 0 by erase() under object mutex protection,
+    which indicates it is about to be removed from lock-free hash and become
+    not accessible by concurrent threads.
+  */
+
+  static void rw_trx_hash_initializer(LF_HASH *,
+                                      rw_trx_hash_element_t *element,
+                                      trx_t *trx)
+  {
+    ut_ad(element->trx == 0);
+    element->trx= trx;
+    element->id= trx->id;
+    element->no= TRX_ID_MAX;
+    trx->rw_trx_hash_element= element;
+  }
+
+
+  /**
+    Gets LF_HASH pins.
+
+    Pins are used to protect object from being destroyed or reused. They are
+    normally stored in trx object for quick access. If caller doesn't have
+    trx available, we try to get it using current_trx(). If caller doesn't
+    have trx at all, temporary pins are allocated.
+  */
+
+  LF_PINS *get_pins(trx_t *trx)
+  {
+    if (!trx->rw_trx_hash_pins)
+    {
+      trx->rw_trx_hash_pins= lf_hash_get_pins(&hash);
+      ut_a(trx->rw_trx_hash_pins);
+    }
+    return trx->rw_trx_hash_pins;
+  }
+
+
+  template <typename T> struct eliminate_duplicates_arg
+  {
+    trx_ids_t ids;
+    walk_action<T> *action;
+    T *argument;
+    eliminate_duplicates_arg(size_t size, walk_action<T> *act, T *arg):
+      action(act), argument(arg) { ids.reserve(size); }
+  };
+
+
+  template <typename T>
+  static my_bool eliminate_duplicates(rw_trx_hash_element_t *element,
+                                      eliminate_duplicates_arg<T> *arg)
+  {
+    for (trx_ids_t::iterator it= arg->ids.begin(); it != arg->ids.end(); it++)
+    {
+      if (*it == element->id)
+        return 0;
+    }
+    arg->ids.push_back(element->id);
+    return arg->action(element, arg->argument);
+  }
+
+
+#ifdef UNIV_DEBUG
+  static void validate_element(trx_t *trx)
+  {
+    ut_ad(!trx->read_only || !trx->rsegs.m_redo.rseg);
+    ut_ad(!trx->is_autocommit_non_locking());
+    /* trx->state can be anything except TRX_STATE_NOT_STARTED */
+    ut_d(trx->mutex_lock());
+    ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE) ||
+          trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY) ||
+          trx_state_eq(trx, TRX_STATE_PREPARED_RECOVERED) ||
+          trx_state_eq(trx, TRX_STATE_PREPARED));
+    ut_d(trx->mutex_unlock());
+  }
+
+
+  template <typename T> struct debug_iterator_arg
+  {
+    walk_action<T> *action;
+    T *argument;
+  };
+
+
+  template <typename T>
+  static my_bool debug_iterator(rw_trx_hash_element_t *element,
+                                debug_iterator_arg<T> *arg)
+  {
+    element->mutex.wr_lock();
+    if (element->trx)
+      validate_element(element->trx);
+    element->mutex.wr_unlock();
+    ut_ad(element->id < element->no);
+    return arg->action(element, arg->argument);
+  }
+#endif
+
+
+public:
+  void init()
+  {
+    lf_hash_init(&hash, sizeof(rw_trx_hash_element_t), LF_HASH_UNIQUE, 0,
+                 sizeof(trx_id_t), 0, &my_charset_bin);
+    hash.alloc.constructor= rw_trx_hash_constructor;
+    hash.alloc.destructor= rw_trx_hash_destructor;
+    hash.initializer=
+      reinterpret_cast<lf_hash_initializer>(rw_trx_hash_initializer);
+  }
+
+
+  void destroy()
+  {
+    hash.alloc.destructor= rw_trx_hash_shutdown_destructor;
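+    /* The shutdown destructor (documented above) will additionally free
+    any transaction objects still registered when the remaining elements
+    are destroyed below. */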
+    lf_hash_destroy(&hash);
+  }
+
+
+  /**
+    Releases LF_HASH pins.
+
+    Must be called by thread that owns trx_t object when the latter is being
+    "detached" from thread (e.g. released to the pool by trx_t::free()). Can
+    be called earlier if thread is expected not to use rw_trx_hash.
+
+    Since pins are not allowed to be transferred to another thread,
+    initialisation thread calls this for recovered transactions.
+  */
+
+  void put_pins(trx_t *trx)
+  {
+    if (trx->rw_trx_hash_pins)
+    {
+      lf_hash_put_pins(trx->rw_trx_hash_pins);
+      trx->rw_trx_hash_pins= 0;
+    }
+  }
+
+
+  /**
+    Finds trx object in lock-free hash with given id.
+
+    Only ACTIVE or PREPARED trx objects may participate in hash. Nevertheless
+    the transaction may get committed before this method returns.
+
+    With do_ref_count == false the caller may dereference returned trx
+    pointer only if lock_sys.latch was acquired before calling find().
+
+    With do_ref_count == true caller may dereference trx even if it is not
+    holding lock_sys.latch. Caller is responsible for calling
+    trx->release_reference() when it is done playing with trx.
+
+    Ideally this method should get caller rw_trx_hash_pins along with trx
+    object as a parameter, similar to insert() and erase(). However most
+    callers lose trx early in their call chains and it is not that easy to
+    pass them through.
+
+    So we take a more expensive approach: get trx through
+    current_thd()->ha_data. Some threads don't have trx attached to THD, and
+    at least the server initialisation thread, fts_optimize_thread,
+    srv_master_thread, dict_stats_thread, srv_monitor_thread,
+    btr_defragment_thread don't even have THD at all. For such cases we
+    allocate pins only for the duration of the search and free them
+    immediately.
+
+    This has a negative performance impact and should be fixed eventually
+    (by passing caller_trx as a parameter). Still, a stream of DML is more
+    or less OK.
+
+    @return
+      @retval 0 not found
+      @retval pointer to trx
+  */
+
+  trx_t *find(trx_t *caller_trx, trx_id_t trx_id, bool do_ref_count)
+  {
+    /*
+      In MariaDB 10.3, purge will reset DB_TRX_ID to 0
+      when the history is lost. Read/write transactions will
+      always have a nonzero trx_t::id; there the value 0 is
+      reserved for transactions that did not write or lock
+      anything yet.
+
+      The caller should already have handled trx_id==0 specially.
+    */
+    ut_ad(trx_id);
+    ut_ad(!caller_trx || caller_trx->id != trx_id || !do_ref_count);
+
+    trx_t *trx= 0;
+    LF_PINS *pins= caller_trx ? get_pins(caller_trx)
+                              : lf_hash_get_pins(&hash);
+    ut_a(pins);
+
+    rw_trx_hash_element_t *element= reinterpret_cast<rw_trx_hash_element_t*>
+      (lf_hash_search(&hash, pins, reinterpret_cast<const void*>(&trx_id),
+                      sizeof(trx_id_t)));
+    if (element)
+    {
+      /* rw_trx_hash_t::erase() sets element->trx to nullptr under
+      element->mutex protection before removing the element from hash table.
+      If the element was removed before the mutex acquisition, element->trx
+      will be equal to nullptr. */
+      DEBUG_SYNC_C("before_trx_hash_find_element_mutex_enter");
+      element->mutex.wr_lock();
+      /* element_trx can't point to reused object now. If transaction was
+      deregistered before element->mutex acquisition, element->trx is
+      nullptr. It can't be deregistered while element->mutex is held. */
+      trx_t *element_trx = element->trx;
+      lf_hash_search_unpin(pins);
+      /* The *element can be reused now, as element->trx value is stored
+      locally in element_trx. */
+      DEBUG_SYNC_C("after_trx_hash_find_element_mutex_enter");
+      if ((trx= element_trx)) {
+        DBUG_ASSERT(trx_id == trx->id);
+        ut_d(validate_element(trx));
+        if (do_ref_count)
+        {
+          /*
+            We have an early state check here to avoid committer
+            starvation in a wait loop for transaction references,
+            when there's a stream of trx_sys.find() calls from other
+            threads. The trx->state may change to COMMITTED after
+            trx->mutex is released, and it will have to be rechecked
+            by the caller after reacquiring the mutex.
+          */
+          /* trx_t::commit_in_memory() sets the state to
+          TRX_STATE_COMMITTED_IN_MEMORY before deregistering the
+          transaction. It also waits for any implicit-to-explicit lock
+          conversions to cease after deregistering. */
+          if (trx->state == TRX_STATE_COMMITTED_IN_MEMORY)
+            trx= nullptr;
+          else
+            trx->reference();
+        }
+      }
+      /* element's lifetime is equal to the hash lifetime, that's why
+      element->mutex is valid here despite the element is unpinned. In the
+      worst case some thread will wait for element->mutex releasing. */
+      element->mutex.wr_unlock();
+    }
+    if (!caller_trx)
+      lf_hash_put_pins(pins);
+    return trx;
+  }
+
+
+  /**
+    Inserts trx to lock-free hash.
+
+    Object becomes accessible via rw_trx_hash.
+  */
+
+  void insert(trx_t *trx)
+  {
+    ut_d(validate_element(trx));
+    int res= lf_hash_insert(&hash, get_pins(trx),
+                            reinterpret_cast<void*>(trx));
+    ut_a(res == 0);
+  }
+
+
+  /**
+    Removes trx from lock-free hash.
+
+    Object becomes not accessible via rw_trx_hash. But it still can be pinned
+    by concurrent find(), which is supposed to release it immediately after
+    it sees object trx is 0.
+  */
+
+  void erase(trx_t *trx)
+  {
+    ut_d(validate_element(trx));
+    trx->rw_trx_hash_element->mutex.wr_lock();
+    trx->rw_trx_hash_element->trx= nullptr;
+    trx->rw_trx_hash_element->mutex.wr_unlock();
+    int res= lf_hash_delete(&hash, get_pins(trx),
+                            reinterpret_cast<const void*>(&trx->id),
+                            sizeof(trx_id_t));
+    ut_a(res == 0);
+  }
+
+
+  /**
+    Returns the number of elements in the hash.
+
+    The number is exact only if hash is protected against concurrent
+    modifications (e.g. single threaded startup or hash is protected
+    by some mutex). Otherwise the number may be used as a hint only,
+    because it may change even before this method returns.
+  */
+
+  uint32_t size() { return uint32_t(lf_hash_size(&hash)); }
+
+
+  /**
+    Iterates the hash.
+
+    @param caller_trx  used to get/set pins
+    @param action      called for every element in hash
+    @param argument    opaque argument passed to action
+
+    May return the same element multiple times if hash is under contention.
+    If caller doesn't like to see the same transaction multiple times, it has
+    to call iterate_no_dups() instead.
+
+    May return element with committed transaction. If caller doesn't like to
+    see committed transactions, it has to skip those under element mutex:
+
+      element->mutex.wr_lock();
+      if (trx_t *trx= element->trx)
+      {
+        // trx is protected against commit in this branch
+      }
+      element->mutex.wr_unlock();
+
+    May miss concurrently inserted transactions.
+
+    @return
+      @retval 0 iteration completed successfully
+      @retval 1 iteration was interrupted (action returned 1)
+  */
+
+  template <typename T>
+  int iterate(trx_t *caller_trx, walk_action<T> *action, T *argument= nullptr)
+  {
+    LF_PINS *pins= caller_trx ? get_pins(caller_trx)
+                              : lf_hash_get_pins(&hash);
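+    /* Pins keep visited elements from being freed while we iterate; when
+    there is no caller_trx, temporary pins are taken here and released
+    after the walk (see get_pins() above). */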
+    ut_a(pins);
+#ifdef UNIV_DEBUG
+    debug_iterator_arg<T> debug_arg= { action, argument };
+    action= reinterpret_cast<decltype(action)>(debug_iterator<T>);
+    argument= reinterpret_cast<T*>(&debug_arg);
+#endif
+    int res= lf_hash_iterate(&hash, pins,
+                             reinterpret_cast<my_hash_walk_action>(action),
+                             const_cast<void*>(static_cast<const void*>
+                             (argument)));
+    if (!caller_trx)
+      lf_hash_put_pins(pins);
+    return res;
+  }
+
+
+  template <typename T>
+  int iterate(walk_action<T> *action, T *argument= nullptr)
+  {
+    return iterate(current_trx(), action, argument);
+  }
+
+
+  /**
+    Iterates the hash and eliminates duplicate elements.
+
+    @sa iterate()
+  */
+
+  template <typename T>
+  int iterate_no_dups(trx_t *caller_trx, walk_action<T> *action,
+                      T *argument= nullptr)
+  {
+    eliminate_duplicates_arg<T> arg(size() + 32, action, argument);
+    return iterate(caller_trx, eliminate_duplicates<T>, &arg);
+  }
+
+
+  template <typename T>
+  int iterate_no_dups(walk_action<T> *action, T *argument= nullptr)
+  {
+    return iterate_no_dups(current_trx(), action, argument);
+  }
+};
+
+class thread_safe_trx_ilist_t
+{
+public:
+  void create() { mysql_mutex_init(trx_sys_mutex_key, &mutex, nullptr); }
+  void close() { mysql_mutex_destroy(&mutex); }
+
+  bool empty() const
+  {
+    mysql_mutex_lock(&mutex);
+    auto result= trx_list.empty();
+    mysql_mutex_unlock(&mutex);
+    return result;
+  }
+
+  void push_front(trx_t &trx)
+  {
+    mysql_mutex_lock(&mutex);
+    trx_list.push_front(trx);
+    mysql_mutex_unlock(&mutex);
+  }
+
+  void remove(trx_t &trx)
+  {
+    mysql_mutex_lock(&mutex);
+    trx_list.remove(trx);
+    mysql_mutex_unlock(&mutex);
+  }
+
+  template <typename Callable> void for_each(Callable &&callback) const
+  {
+    mysql_mutex_lock(&mutex);
+    for (const auto &trx : trx_list)
+      callback(trx);
+    mysql_mutex_unlock(&mutex);
+  }
+
+  template <typename Callable> void for_each(Callable &&callback)
+  {
+    mysql_mutex_lock(&mutex);
+    for (auto &trx : trx_list)
+      callback(trx);
+    mysql_mutex_unlock(&mutex);
+  }
+
+  void freeze() const { mysql_mutex_lock(&mutex); }
+  void unfreeze() const { mysql_mutex_unlock(&mutex); }
+
+private:
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE) mutable mysql_mutex_t mutex;
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE) ilist<trx_t> trx_list;
+};
+
+/** The transaction system central memory data structure. */
+class trx_sys_t
+{
+  /**
+    The smallest number not yet assigned as a transaction id or transaction
+    number. Accessed and updated with atomic operations.
+  */
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE)
+  Atomic_counter<trx_id_t> m_max_trx_id;
+
+
+  /**
+    Solves race conditions between register_rw() and snapshot_ids() as well
+    as race condition between assign_new_trx_no() and snapshot_ids().
+
+    @sa register_rw()
+    @sa assign_new_trx_no()
+    @sa snapshot_ids()
+  */
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE)
+  std::atomic<trx_id_t> m_rw_trx_hash_version;
+
+
+  bool m_initialised;
+
+  /** False if there is no undo log to purge or rollback */
+  bool undo_log_nonempty;
+public:
+  /** List of all transactions. */
+  thread_safe_trx_ilist_t trx_list;
+
+  /** Temporary rollback segments */
+  trx_rseg_t temp_rsegs[TRX_SYS_N_RSEGS];
+
+  /** Persistent rollback segments; space==nullptr if slot not in use */
+  trx_rseg_t rseg_array[TRX_SYS_N_RSEGS];
+
+  /**
+    Lock-free hash of in memory read-write transactions.
+    Works faster when it is on its own cache line (tested).
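+
+    A minimal lookup sketch (caller_trx and id are assumed to be in scope;
+    this follows the find() contract documented above, it is not a verbatim
+    call site):
+
+      if (trx_t *t= trx_sys.rw_trx_hash.find(caller_trx, id, true))
+      {
+        ...  // t may be dereferenced here without holding lock_sys.latch
+        t->release_reference();
+      }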
+  */
+
+  alignas(CPU_LEVEL1_DCACHE_LINESIZE) rw_trx_hash_t rw_trx_hash;
+
+
+#ifdef WITH_WSREP
+  /** Latest recovered XID during startup */
+  XID recovered_wsrep_xid;
+#endif
+  /** Latest recovered binlog offset */
+  uint64_t recovered_binlog_offset;
+  /** Latest recovered binlog file name */
+  char recovered_binlog_filename[TRX_SYS_MYSQL_LOG_NAME_LEN];
+  /** FIL_PAGE_LSN of the page with the latest recovered binlog metadata */
+  lsn_t recovered_binlog_lsn;
+
+
+  /**
+    Constructor.
+
+    Some members may require late initialisation, thus we just mark object
+    as uninitialised. Real initialisation happens in create().
+  */
+
+  trx_sys_t(): m_initialised(false) {}
+
+
+  /**
+    @return TRX_RSEG_HISTORY length (number of committed transactions to
+            purge)
+  */
+  size_t history_size();
+
+
+  /**
+    Check whether history_size() exceeds a specified number.
+    @param threshold  number of committed transactions
+    @return whether TRX_RSEG_HISTORY length exceeds the threshold
+  */
+  bool history_exceeds(size_t threshold);
+
+
+  /**
+    @return approximate history_size(), without latch protection
+  */
+  TPOOL_SUPPRESS_TSAN size_t history_size_approx() const;
+
+
+  /**
+    @return whether history_size() is nonzero (with some race condition)
+  */
+  TPOOL_SUPPRESS_TSAN bool history_exists();
+
+
+  /**
+    Determine if the specified transaction or any older one might be active.
+
+    @param trx  current transaction
+    @param id   transaction identifier
+    @return whether any transaction not newer than id might be active
+  */
+
+  bool find_same_or_older(trx_t *trx, trx_id_t id)
+  {
+    if (trx->max_inactive_id >= id)
+      return false;
+    bool found= rw_trx_hash.iterate(trx, find_same_or_older_callback, &id);
+    if (!found)
+      trx->max_inactive_id= id;
+    return found;
+  }
+
+
+  /**
+    Determines the maximum transaction id.
+
+    @return maximum currently allocated trx id; will be stale after the
+            next call to trx_sys.get_new_trx_id()
+  */
+
+  trx_id_t get_max_trx_id()
+  {
+    return m_max_trx_id;
+  }
+
+
+  /**
+    Allocates a new transaction id.
+    @return new, allocated trx id
+  */
+
+  trx_id_t get_new_trx_id()
+  {
+    trx_id_t id= get_new_trx_id_no_refresh();
+    refresh_rw_trx_hash_version();
+    return id;
+  }
+
+
+  /**
+    Allocates and assigns new transaction serialisation number.
+
+    There's a gap between m_max_trx_id increment and transaction
+    serialisation number becoming visible through rw_trx_hash. While we're
+    in this gap concurrent thread may come and do MVCC snapshot without
+    seeing allocated but not yet assigned serialisation number. Then at some
+    point purge thread may clone this view. As a result it won't see newly
+    allocated serialisation number and may remove "unnecessary" history data
+    of this transaction from rollback segments.
+
+    m_rw_trx_hash_version is intended to solve this problem. MVCC snapshot
+    has to wait until m_max_trx_id == m_rw_trx_hash_version, which
+    effectively means that all transaction serialisation numbers up to
+    m_max_trx_id are available through rw_trx_hash.
+
+    We rely on refresh_rw_trx_hash_version() to issue RELEASE memory barrier
+    so that m_rw_trx_hash_version increment happens after
+    trx->rw_trx_hash_element->no becomes visible through rw_trx_hash.
+
+    @param trx  transaction
+  */
+  void assign_new_trx_no(trx_t *trx)
+  {
+    trx->rw_trx_hash_element->no= get_new_trx_id_no_refresh();
+    refresh_rw_trx_hash_version();
+  }
+
+
+  /**
+    Takes MVCC snapshot.
+
+    To reduce malloc probability we reserve rw_trx_hash.size() + 32 elements
+    in ids.
+
+    For details about get_rw_trx_hash_version() != get_max_trx_id() spin
+    @sa register_rw() and @sa assign_new_trx_no().
+
+    We rely on get_rw_trx_hash_version() to issue ACQUIRE memory barrier so
+    that loading of m_rw_trx_hash_version happens before accessing
+    rw_trx_hash.
+
+    To optimise snapshot creation rw_trx_hash.iterate() is being used
+    instead of rw_trx_hash.iterate_no_dups(). It means that some transaction
+    identifiers may appear multiple times in ids.
+
+    @param[in,out] caller_trx  used to get access to rw_trx_hash_pins
+    @param[out]    ids         array to store registered transaction
+                               identifiers
+    @param[out]    max_trx_id  variable to store m_max_trx_id value
+    @param[out]    min_trx_no  variable to store min(no) value
+  */
+
+  void snapshot_ids(trx_t *caller_trx, trx_ids_t *ids, trx_id_t *max_trx_id,
+                    trx_id_t *min_trx_no)
+  {
+    snapshot_ids_arg arg(ids);
+
+    while ((arg.m_id= get_rw_trx_hash_version()) != get_max_trx_id())
+      ut_delay(1);
+    arg.m_no= arg.m_id;
+
+    ids->clear();
+    ids->reserve(rw_trx_hash.size() + 32);
+    rw_trx_hash.iterate(caller_trx, copy_one_id, &arg);
+
+    *max_trx_id= arg.m_id;
+    *min_trx_no= arg.m_no;
+  }
+
+
+  /** Initialiser for m_max_trx_id and m_rw_trx_hash_version. */
+  void init_max_trx_id(trx_id_t value)
+  {
+    m_max_trx_id= value;
+    m_rw_trx_hash_version.store(value, std::memory_order_relaxed);
+  }
+
+
+  bool is_initialised() const { return m_initialised; }
+
+
+  /** Initialise the transaction subsystem. */
+  void create();
+
+  /** Close the transaction subsystem on shutdown. */
+  void close();
+
+  /** @return total number of active (non-prepared) transactions */
+  size_t any_active_transactions(size_t *prepared= nullptr);
+
+
+  /**
+    Determine the rollback segment identifier.
+
+    @param rseg        rollback segment
+    @param persistent  whether the rollback segment is persistent
+    @return the rollback segment identifier
+  */
+  unsigned rseg_id(const trx_rseg_t *rseg, bool persistent) const
+  {
+    const trx_rseg_t *array= persistent ? rseg_array : temp_rsegs;
+    ut_ad(rseg >= array);
+    ut_ad(rseg < &array[TRX_SYS_N_RSEGS]);
+    return static_cast<unsigned>(rseg - array);
+  }
+
+
+  /**
+    Registers read-write transaction.
+
+    Transaction becomes visible to MVCC.
+
+    There's a gap between m_max_trx_id increment and transaction becoming
+    visible through rw_trx_hash. While we're in this gap concurrent thread
+    may come and do MVCC snapshot. As a result concurrent read view will be
+    able to observe records owned by this transaction even before it was
+    committed.
+
+    m_rw_trx_hash_version is intended to solve this problem. MVCC snapshot
+    has to wait until m_max_trx_id == m_rw_trx_hash_version, which
+    effectively means that all transactions up to m_max_trx_id are available
+    through rw_trx_hash.
+
+    We rely on refresh_rw_trx_hash_version() to issue RELEASE memory barrier
+    so that m_rw_trx_hash_version increment happens after transaction
+    becomes visible through rw_trx_hash.
+  */
+
+  void register_rw(trx_t *trx)
+  {
+    trx->id= get_new_trx_id_no_refresh();
+    rw_trx_hash.insert(trx);
+    refresh_rw_trx_hash_version();
+  }
+
+
+  /**
+    Deregisters read-write transaction.
+
+    Transaction is removed from rw_trx_hash, which releases all implicit
+    locks. MVCC snapshot won't see this transaction anymore.
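+
+    (In the commit path this is expected to run after the state change to
+    TRX_STATE_COMMITTED_IN_MEMORY; see the trx_t::commit_in_memory() note
+    in find() above.)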
+  */
+
+  void deregister_rw(trx_t *trx)
+  {
+    rw_trx_hash.erase(trx);
+  }
+
+
+  bool is_registered(trx_t *caller_trx, trx_id_t id)
+  {
+    return id && find(caller_trx, id, false);
+  }
+
+
+  trx_t *find(trx_t *caller_trx, trx_id_t id, bool do_ref_count= true)
+  {
+    return rw_trx_hash.find(caller_trx, id, do_ref_count);
+  }
+
+
+  /**
+    Registers transaction in trx_sys.
+
+    @param trx  transaction
+  */
+  void register_trx(trx_t *trx)
+  {
+    trx_list.push_front(*trx);
+  }
+
+
+  /**
+    Deregisters transaction in trx_sys.
+
+    @param trx  transaction
+  */
+  void deregister_trx(trx_t *trx)
+  {
+    trx_list.remove(*trx);
+  }
+
+
+  /**
+    Clones the oldest view and stores it in view.
+
+    No need to call ReadView::close(). The caller owns the view that is
+    passed in. This function is called by purge thread to determine whether
+    it should purge the delete marked record or not.
+  */
+  void clone_oldest_view(ReadViewBase *view) const;
+
+
+  /** @return the number of active views */
+  size_t view_count() const
+  {
+    size_t count= 0;
+
+    trx_list.for_each([&count](const trx_t &trx) {
+      if (trx.read_view.is_open())
+        ++count;
+    });
+
+    return count;
+  }
+
+  /** Set the undo log empty value */
+  void set_undo_non_empty(bool val)
+  {
+    if (!undo_log_nonempty)
+      undo_log_nonempty= val;
+  }
+
+  /** Get the undo log empty value */
+  bool is_undo_empty() const { return !undo_log_nonempty; }
+
+  /* Reset the trx_sys page, retaining the doublewrite buffer information
+  and the system rollback segment header page.
+  @return error code */
+  inline dberr_t reset_page(mtr_t *mtr);
+private:
+  static my_bool find_same_or_older_callback(rw_trx_hash_element_t *element,
+                                             trx_id_t *id)
+  {
+    return element->id <= *id;
+  }
+
+
+  struct snapshot_ids_arg
+  {
+    snapshot_ids_arg(trx_ids_t *ids): m_ids(ids) {}
+    trx_ids_t *m_ids;
+    trx_id_t m_id;
+    trx_id_t m_no;
+  };
+
+
+  static my_bool copy_one_id(rw_trx_hash_element_t *element,
+                             snapshot_ids_arg *arg)
+  {
+    if (element->id < arg->m_id)
+    {
+      trx_id_t no= element->no;
+      arg->m_ids->push_back(element->id);
+      if (no < arg->m_no)
+        arg->m_no= no;
+    }
+    return 0;
+  }
+
+
+  /** Getter for m_rw_trx_hash_version, must issue ACQUIRE memory barrier. */
+  trx_id_t get_rw_trx_hash_version()
+  {
+    return m_rw_trx_hash_version.load(std::memory_order_acquire);
+  }
+
+
+  /** Increments m_rw_trx_hash_version, must issue RELEASE memory barrier. */
+  void refresh_rw_trx_hash_version()
+  {
+    m_rw_trx_hash_version.fetch_add(1, std::memory_order_release);
+  }
+
+
+  /**
+    Allocates new transaction id without refreshing rw_trx_hash version.
+
+    This method is extracted for exclusive use by register_rw() and
+    assign_new_trx_no() where new id must be allocated atomically with
+    payload of these methods from MVCC snapshot point of view.
+
+    @sa get_new_trx_id()
+    @sa assign_new_trx_no()
+
+    @return new transaction id
+  */
+
+  trx_id_t get_new_trx_id_no_refresh()
+  {
+    return m_max_trx_id++;
+  }
+};
+
+
+/** The transaction system */
+extern trx_sys_t trx_sys;
diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h
new file mode 100644
index 00000000..3cfbe331
--- /dev/null
+++ b/storage/innobase/include/trx0trx.h
@@ -0,0 +1,1268 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2015, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0trx.h
+The transaction
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0trx_h
+#define trx0trx_h
+
+#include "trx0types.h"
+#include "lock0types.h"
+#include "que0types.h"
+#include "mem0mem.h"
+#include "trx0xa.h"
+#include "ut0vec.h"
+#include "fts0fts.h"
+#include "read0types.h"
+#include "ilist.h"
+#include "row0merge.h"
+
+#include <vector>
+
+// Forward declaration
+struct mtr_t;
+struct rw_trx_hash_element_t;
+
+/******************************************************************//**
+Set detailed error message for the transaction. */
+void
+trx_set_detailed_error(
+/*===================*/
+	trx_t*		trx,	/*!< in: transaction struct */
+	const char*	msg);	/*!< in: detailed error message */
+/*************************************************************//**
+Set detailed error message for the transaction from a file. Note that the
+file is rewound before reading from it. */
+void
+trx_set_detailed_error_from_file(
+/*=============================*/
+	trx_t*	trx,	/*!< in: transaction struct */
+	FILE*	file);	/*!< in: file to read message from */
+/****************************************************************//**
+Retrieves the error_info field from a trx.
+@return the error info */
+UNIV_INLINE
+const dict_index_t*
+trx_get_error_info(
+/*===============*/
+	const trx_t*	trx);	/*!< in: trx object */
+
+/** @return an allocated transaction */
+trx_t *trx_create();
+
+/** At shutdown, frees a transaction object. */
+void trx_free_at_shutdown(trx_t *trx);
+
+/** Disconnect a prepared transaction from MySQL.
+@param[in,out]	trx	transaction */
+void trx_disconnect_prepared(trx_t *trx);
+
+/** Initialize (resurrect) transactions at startup. */
+dberr_t trx_lists_init_at_db_start();
+
+/*************************************************************//**
+Starts the transaction if it is not yet started. */
+void
+trx_start_if_not_started_xa_low(
+/*============================*/
+	trx_t*	trx,		/*!< in/out: transaction */
+	bool	read_write);	/*!< in: true if read write transaction */
+/*************************************************************//**
+Starts the transaction if it is not yet started. */
+void
+trx_start_if_not_started_low(
+/*=========================*/
+	trx_t*	trx,		/*!< in/out: transaction */
+	bool	read_write);	/*!< in: true if read write transaction */
+
+/**
+Start a transaction for internal processing.
+@param trx transaction +@param read_write whether writes may be performed */ +void trx_start_internal_low(trx_t *trx, bool read_write); + +#ifdef UNIV_DEBUG +#define trx_start_if_not_started_xa(t, rw) \ + do { \ + (t)->start_line = __LINE__; \ + (t)->start_file = __FILE__; \ + trx_start_if_not_started_xa_low((t), rw); \ + } while (false) + +#define trx_start_if_not_started(t, rw) \ + do { \ + (t)->start_line = __LINE__; \ + (t)->start_file = __FILE__; \ + trx_start_if_not_started_low((t), rw); \ + } while (false) + +#define trx_start_internal(t) \ + do { \ + (t)->start_line = __LINE__; \ + (t)->start_file = __FILE__; \ + trx_start_internal_low(t, true); \ + } while (false) +#define trx_start_internal_read_only(t) \ + do { \ + (t)->start_line = __LINE__; \ + (t)->start_file = __FILE__; \ + trx_start_internal_low(t, false); \ + } while (false) +#else +#define trx_start_if_not_started(t, rw) \ + trx_start_if_not_started_low((t), rw) + +#define trx_start_internal(t) trx_start_internal_low(t, true) +#define trx_start_internal_read_only(t) trx_start_internal_low(t, false) + +#define trx_start_if_not_started_xa(t, rw) \ + trx_start_if_not_started_xa_low((t), (rw)) +#endif /* UNIV_DEBUG */ + +/** Start a transaction for a DDL operation. +@param trx transaction */ +void trx_start_for_ddl_low(trx_t *trx); + +#ifdef UNIV_DEBUG +# define trx_start_for_ddl(t) \ + do { \ + ut_ad((t)->start_file == 0); \ + (t)->start_line = __LINE__; \ + (t)->start_file = __FILE__; \ + trx_start_for_ddl_low(t); \ + } while (0) +#else +# define trx_start_for_ddl(t) trx_start_for_ddl_low(t) +#endif /* UNIV_DEBUG */ + +/**********************************************************************//** +Does the transaction commit for MySQL. +@return DB_SUCCESS or error number */ +dberr_t +trx_commit_for_mysql( +/*=================*/ + trx_t* trx); /*!< in/out: transaction */ +/** XA PREPARE a transaction. +@param[in,out] trx transaction to prepare */ +void trx_prepare_for_mysql(trx_t* trx); +/**********************************************************************//** +This function is used to find number of prepared transactions and +their transaction objects for a recovery. +@return number of prepared transactions */ +int +trx_recover_for_mysql( +/*==================*/ + XID* xid_list, /*!< in/out: prepared transactions */ + uint len); /*!< in: number of slots in xid_list */ +/** Look up an X/Open distributed transaction in XA PREPARE state. +@param[in] xid X/Open XA transaction identifier +@return transaction on match (the trx_t::xid will be invalidated); +note that the trx may have been committed before the caller acquires +trx_t::mutex +@retval NULL if no match */ +trx_t* trx_get_trx_by_xid(const XID* xid); +/** Durably write log until trx->commit_lsn +(if trx_t::commit_in_memory() was invoked with flush_log_later=true). */ +void trx_commit_complete_for_mysql(trx_t *trx); +/**********************************************************************//** +Marks the latest SQL statement ended. */ +void +trx_mark_sql_stat_end( +/*==================*/ + trx_t* trx); /*!< in: trx handle */ +/****************************************************************//** +Prepares a transaction for commit/rollback. */ +void +trx_commit_or_rollback_prepare( +/*===========================*/ + trx_t* trx); /*!< in/out: transaction */ +/*********************************************************************//** +Creates a commit command node struct. 
+@return own: commit node struct */ +commit_node_t* +trx_commit_node_create( +/*===================*/ + mem_heap_t* heap); /*!< in: mem heap where created */ +/***********************************************************//** +Performs an execution step for a commit type node in a query graph. +@return query thread to run next, or NULL */ +que_thr_t* +trx_commit_step( +/*============*/ + que_thr_t* thr); /*!< in: query thread */ + +/**********************************************************************//** +Prints info about a transaction. */ +void +trx_print_low( +/*==========*/ + FILE* f, + /*!< in: output stream */ + const trx_t* trx, + /*!< in: transaction */ + ulint max_query_len, + /*!< in: max query length to print, + or 0 to use the default max length */ + ulint n_rec_locks, + /*!< in: trx->lock.n_rec_locks */ + ulint n_trx_locks, + /*!< in: length of trx->lock.trx_locks */ + ulint heap_size); + /*!< in: mem_heap_get_size(trx->lock.lock_heap) */ + +/**********************************************************************//** +Prints info about a transaction. +When possible, use trx_print() instead. */ +void +trx_print_latched( +/*==============*/ + FILE* f, /*!< in: output stream */ + const trx_t* trx, /*!< in: transaction */ + ulint max_query_len); /*!< in: max query length to print, + or 0 to use the default max length */ + +/**********************************************************************//** +Prints info about a transaction. +Acquires and releases lock_sys.latch. */ +void +trx_print( +/*======*/ + FILE* f, /*!< in: output stream */ + const trx_t* trx, /*!< in: transaction */ + ulint max_query_len); /*!< in: max query length to print, + or 0 to use the default max length */ + +/**********************************************************************//** +Determines if a transaction is in the given state. +The caller must hold trx->mutex, or it must be the thread +that is serving a running transaction. +A running RW transaction must be in trx_sys.rw_trx_hash. +@return TRUE if trx->state == state */ +UNIV_INLINE +bool +trx_state_eq( +/*=========*/ + const trx_t* trx, /*!< in: transaction */ + trx_state_t state, /*!< in: state; + if state != TRX_STATE_NOT_STARTED + asserts that + trx->state != TRX_STATE_NOT_STARTED */ + bool relaxed = false) + /*!< in: whether to allow + trx->state == TRX_STATE_NOT_STARTED + after an error has been reported */ + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/**********************************************************************//** +Determines if the currently running transaction has been interrupted. +@return true if interrupted */ +bool +trx_is_interrupted( +/*===============*/ + const trx_t* trx); /*!< in: transaction */ + +/*******************************************************************//** +Calculates the "weight" of a transaction. The weight of one transaction +is estimated as the number of altered rows + the number of locked rows. +@param t transaction +@return transaction weight */ +#define TRX_WEIGHT(t) ((t)->undo_no + UT_LIST_GET_LEN((t)->lock.trx_locks)) + +/** Create the trx_t pool */ +void +trx_pool_init(); + +/** Destroy the trx_t pool */ +void +trx_pool_close(); + +/** +Set the transaction as a read-write transaction if it is not already +tagged as such. +@param[in,out] trx Transaction that needs to be "upgraded" to RW from RO */ +void +trx_set_rw_mode( + trx_t* trx); + +/** +Transactions that aren't started by the MySQL server don't set +the trx_t::mysql_thd field. 
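For context on TRX_WEIGHT above: the weight approximates how expensive a transaction would be to roll back (rows altered, via undo_no, plus locks held), and deadlock resolution prefers the lighter victim. A hedged illustration only; the real victim selection also weighs other criteria (for example Galera thread priority):

    #include <cstdint>

    struct txn_weight_view {
        uint64_t undo_no;  // number of modified/inserted rows so far
        uint64_t n_locks;  // length of the transaction's lock list
    };

    // analogous to TRX_WEIGHT(t)
    static uint64_t weight(const txn_weight_view& t)
    {
        return t.undo_no + t.n_locks;
    }

    // the cheaper transaction to roll back is the preferred victim
    static const txn_weight_view& choose_victim(const txn_weight_view& a,
                                                const txn_weight_view& b)
    {
        return weight(a) <= weight(b) ? a : b;
    }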
For such transactions we set the lock
+wait timeout to 0 instead of the user configured value that comes
+from innodb_lock_wait_timeout via trx_t::mysql_thd.
+@param trx transaction
+@return lock wait timeout in seconds */
+#define trx_lock_wait_timeout_get(t) \
+ ((t)->mysql_thd != NULL \
+ ? thd_lock_wait_timeout((t)->mysql_thd) \
+ : 0)
+
+typedef std::vector<ib_lock_t*, ut_allocator<ib_lock_t*> > lock_list;
+
+/** The locks and state of an active transaction. Protected by
+lock_sys.latch, trx->mutex or both. */
+struct trx_lock_t
+{
+ /** Lock request being waited for.
+ Set to nonnull when holding lock_sys.latch, lock_sys.wait_mutex and
+ trx->mutex, by the thread that is executing the transaction.
+ Set to nullptr when holding lock_sys.wait_mutex. */
+ Atomic_relaxed<lock_t*> wait_lock;
+ /** Transaction being waited for; protected by lock_sys.wait_mutex */
+ trx_t *wait_trx;
+ /** condition variable for !wait_lock; used with lock_sys.wait_mutex */
+ pthread_cond_t cond;
+ /** lock wait start time */
+ Atomic_relaxed<my_hrtime_t> suspend_time;
+
+#if defined(UNIV_DEBUG) || !defined(DBUG_OFF)
+ /** 2=high priority WSREP thread has marked this trx to abort;
+ 1=another transaction chose this as a victim in deadlock resolution.
+
+ Other threads than the one that is executing the transaction may set
+ flags in this while holding lock_sys.wait_mutex. */
+ Atomic_relaxed<byte> was_chosen_as_deadlock_victim;
+
+ /** Flag the lock owner as a victim in Galera conflict resolution. */
+ void set_wsrep_victim()
+ {
+# if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+ /* There is no 8-bit version of the 80386 BTS instruction.
+ Technically, this is the wrong addressing mode (16-bit), but
+ there are other data members stored after the byte. */
+ __asm__ __volatile__("lock btsw $1, %0"
+ : "+m" (was_chosen_as_deadlock_victim));
+# else
+ was_chosen_as_deadlock_victim.fetch_or(2);
+# endif
+ }
+#else /* defined(UNIV_DEBUG) || !defined(DBUG_OFF) */
+
+ /** High priority WSREP thread has marked this trx to abort or
+ another transaction chose this as a victim in deadlock resolution.
+
+ Other threads than the one that is executing the transaction may set
+ this while holding lock_sys.wait_mutex. */
+ Atomic_relaxed<bool> was_chosen_as_deadlock_victim;
+
+ /** Flag the lock owner as a victim in Galera conflict resolution. */
+ void set_wsrep_victim() { was_chosen_as_deadlock_victim= true; }
+#endif /* defined(UNIV_DEBUG) || !defined(DBUG_OFF) */
+
+ /** Next available rec_pool[] entry */
+ byte rec_cached;
+ /** Next available table_pool[] entry */
+ byte table_cached;
+
+ que_thr_t* wait_thr; /*!< query thread belonging to this
+ trx that is in waiting
+ state. For threads suspended in a
+ lock wait, this is protected by
+ lock_sys.latch. Otherwise, this may
+ only be modified by the thread that is
+ serving the running transaction. */
+
+ /** Pre-allocated record locks */
+ struct {
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) ib_lock_t lock;
+ } rec_pool[8];
+
+ /** Pre-allocated table locks */
+ ib_lock_t table_pool[8];
+
+ /** Memory heap for trx_locks. Protected by lock_sys.assert_locked()
+ and lock_sys.is_writer() || trx->mutex_is_owner(). */
+ mem_heap_t *lock_heap;
+
+ /** Locks held by the transaction. Protected by lock_sys.assert_locked()
+ and lock_sys.is_writer() || trx->mutex_is_owner().
+ (If lock_sys.latch is only held in shared mode, then the modification
+ must be protected by trx->mutex.)
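The inline x86 assembly in set_wsrep_victim() above is only a size-optimized atomic OR of bit 1 into the victim flag; the non-x86 branch already spells this out as fetch_or(2). A portable standalone equivalent of what the instruction does:

    #include <atomic>
    #include <cstdint>

    std::atomic<uint8_t> victim_flags{0};

    void mark_wsrep_victim()
    {
        // atomically set bit 1 (value 2): a high-priority thread marked the
        // transaction to abort; bit 0 would be a plain deadlock victim
        victim_flags.fetch_or(2, std::memory_order_relaxed);
    }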
*/
+ trx_lock_list_t trx_locks;
+
+ lock_list table_locks; /*!< All table locks requested by this
+ transaction, including AUTOINC locks */
+
+ /** List of pending trx_t::evict_table() */
+ UT_LIST_BASE_NODE_T(dict_table_t) evicted_tables;
+
+ /** number of record locks; protected by lock_sys.assert_locked(page_id) */
+ ulint n_rec_locks;
+};
+
+/** Logical first modification time of a table in a transaction */
+class trx_mod_table_time_t
+{
+ /** Impossible value for trx_t::undo_no */
+ static constexpr undo_no_t NONE= ~undo_no_t{0};
+ /** Theoretical maximum value for trx_t::undo_no.
+ DB_ROLL_PTR is only 7 bytes, so it cannot point to more than
+ this many undo log records. */
+ static constexpr undo_no_t LIMIT= (undo_no_t{1} << (7 * 8)) - 1;
+
+ /** Flag in 'first' to indicate that subsequent operations are
+ covered by a TRX_UNDO_EMPTY record (for the first statement to
+ insert into an empty table) */
+ static constexpr undo_no_t BULK= 1ULL << 63;
+
+ /** First modification of the table, possibly ORed with BULK */
+ undo_no_t first;
+ /** First modification of a system versioned column
+ (NONE= no versioning, BULK= the table was dropped) */
+ undo_no_t first_versioned= NONE;
+#ifdef UNIV_DEBUG
+ /** Whether the modified table is a FTS auxiliary table */
+ bool fts_aux_table= false;
+#endif /* UNIV_DEBUG */
+
+ /** Buffer to store insert operation */
+ row_merge_bulk_t *bulk_store= nullptr;
+
+ friend struct trx_t;
+public:
+ /** Constructor
+ @param rows number of modified rows so far */
+ trx_mod_table_time_t(undo_no_t rows) : first(rows) { ut_ad(rows < LIMIT); }
+
+#ifdef UNIV_DEBUG
+ /** Validation
+ @param rows number of modified rows so far
+ @return whether the object is valid */
+ bool valid(undo_no_t rows= NONE) const
+ { auto f= first & LIMIT; return f <= first_versioned && f <= rows; }
+#endif /* UNIV_DEBUG */
+ /** @return whether versioned columns were modified */
+ bool is_versioned() const { return (~first_versioned & LIMIT) != 0; }
+ /** @return whether the table was dropped */
+ bool is_dropped() const { return first_versioned == BULK; }
+
+ /** After writing an undo log record, set is_versioned() if needed
+ @param rows number of modified rows so far */
+ void set_versioned(undo_no_t rows)
+ {
+ ut_ad(first_versioned == NONE);
+ first_versioned= rows;
+ ut_ad(valid(rows));
+ }
+
+ /** After writing an undo log record, note that the table will be dropped */
+ void set_dropped()
+ {
+ ut_ad(first_versioned == NONE);
+ first_versioned= BULK;
+ }
+
+ /** Notify the start of a bulk insert operation
+ @param table table to do bulk operation */
+ void start_bulk_insert(dict_table_t *table)
+ {
+ first|= BULK;
+ if (!table->is_temporary())
+ bulk_store= new row_merge_bulk_t(table);
+ }
+
+ /** Notify the end of a bulk insert operation */
+ void end_bulk_insert() { first&= ~BULK; }
+
+ /** @return whether an insert is covered by TRX_UNDO_EMPTY record */
+ bool is_bulk_insert() const { return first & BULK; }
+
+ /** Invoked after partial rollback
+ @param limit number of surviving modified rows (trx_t::undo_no)
+ @return whether this should be erased from trx_t::mod_tables */
+ bool rollback(undo_no_t limit)
+ {
+ ut_ad(valid());
+ if ((LIMIT & first) >= limit)
+ return true;
+ if (first_versioned < limit)
+ first_versioned= NONE;
+ return false;
+ }
+
+#ifdef UNIV_DEBUG
+ void set_aux_table() { fts_aux_table= true; }
+
+ bool is_aux_table() const { return fts_aux_table; }
+#endif /* UNIV_DEBUG */
+
+ /** @return the first undo record that modified the table */
+ undo_no_t get_first()
const
+ {
+ ut_ad(valid());
+ return LIMIT & first;
+ }
+
+ /** Add the tuple to the transaction bulk buffer for the given index.
+ @param entry tuple to be inserted
+ @param index bulk insert for the index
+ @param trx transaction */
+ dberr_t bulk_insert_buffered(const dtuple_t &entry,
+ const dict_index_t &index, trx_t *trx)
+ {
+ return bulk_store->bulk_insert_buffered(entry, index, trx);
+ }
+
+ /** Do bulk insert operation present in the buffered operation
+ @return DB_SUCCESS or error code */
+ dberr_t write_bulk(dict_table_t *table, trx_t *trx);
+
+ /** @return whether the buffer storage exists */
+ bool bulk_buffer_exist() const
+ {
+ return bulk_store && is_bulk_insert();
+ }
+
+ /** Free bulk insert operation */
+ void clear_bulk_buffer()
+ {
+ delete bulk_store;
+ bulk_store= nullptr;
+ }
+};
+
+/** Collection of persistent tables and their first modification
+in a transaction.
+We store pointers to the table objects in memory because
+we know that a table object will not be destroyed while a transaction
+that modified it is running. */
+typedef std::map<
+ dict_table_t*, trx_mod_table_time_t,
+ std::less<dict_table_t*>,
+ ut_allocator<std::pair<dict_table_t* const, trx_mod_table_time_t> > >
+ trx_mod_tables_t;
+
+/** The transaction handle
+
+Normally, there is a 1:1 relationship between a transaction handle
+(trx) and a session (client connection). One session is associated
+with exactly one user transaction. There are some exceptions to this:
+
+* For DDL operations, a subtransaction is allocated that modifies the
+data dictionary tables. Lock waits and deadlocks are prevented by
+acquiring the dict_sys.latch before starting the subtransaction
+and releasing it after committing the subtransaction.
+
+* The purge system uses a special transaction that is not associated
+with any session.
+
+* If the system crashed or it was quickly shut down while there were
+transactions in the ACTIVE or PREPARED state, these transactions would
+no longer be associated with a session when the server is restarted.
+
+A session may be served by at most one thread at a time. The serving
+thread of a session might change in some MySQL implementations.
+Therefore we do not have pthread_self() assertions in the code.
+
+Normally, only the thread that is currently associated with a running
+transaction may access (read and modify) the trx object, and it may do
+so without holding any mutex. The following are exceptions to this:
+
+* trx_rollback_recovered() may access resurrected (connectionless)
+transactions (state == TRX_STATE_ACTIVE && is_recovered)
+while the system is already processing new user transactions (!is_recovered).
+
+* trx_print_low() may access transactions not associated with the current
+thread. The caller must be holding lock_sys.latch.
+
+* When a transaction handle is in the trx_sys.trx_list, some of its fields
+must not be modified without holding trx->mutex.
+
+* The locking code (in particular, lock_deadlock_recursive() and
+lock_rec_convert_impl_to_expl()) will access transactions associated
+to other connections. The locks of transactions are protected by
+lock_sys.latch (insertions also by trx->mutex). */
+
+/** Represents an instance of rollback segment along with its state variables.*/
+struct trx_undo_ptr_t {
+ trx_rseg_t* rseg; /*!< rollback segment assigned to the
+ transaction, or NULL if not assigned
+ yet */
+ trx_undo_t* undo; /*!< pointer to the undo log, or
+ NULL if nothing logged yet */
+};
+
+/** An instance of temporary rollback segment.
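To make the trx_mod_table_time_t encoding above concrete: one 64-bit word stores the first undo number (at most 56 significant bits, since DB_ROLL_PTR is only 7 bytes) and reuses bit 63 as the bulk-insert flag. A standalone check of that packing (illustrative, not patch code):

    #include <cassert>
    #include <cstdint>

    using undo_no_t = uint64_t;
    constexpr undo_no_t LIMIT = (undo_no_t{1} << (7 * 8)) - 1; // 2^56 - 1
    constexpr undo_no_t BULK  = undo_no_t{1} << 63;

    int main()
    {
        undo_no_t first = 42;            // first undo record for the table
        first |= BULK;                   // start_bulk_insert()
        assert(first & BULK);            // is_bulk_insert()
        assert((first & LIMIT) == 42);   // get_first() is still recoverable
        first &= ~BULK;                  // end_bulk_insert()
        assert(first == 42);
    }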
*/
+struct trx_temp_undo_t {
+ /** temporary rollback segment, or NULL if not assigned yet */
+ trx_rseg_t* rseg;
+ /** pointer to the undo log, or NULL if nothing logged yet */
+ trx_undo_t* undo;
+};
+
+/** Rollback segments assigned to a transaction for undo logging. */
+struct trx_rsegs_t {
+ /** undo log ptr holding reference to a rollback segment that resides in
+ system/undo tablespace used for undo logging of tables that need
+ to be recovered on crash. */
+ trx_undo_ptr_t m_redo;
+
+ /** undo log for temporary tables; discarded immediately after
+ transaction commit/rollback */
+ trx_temp_undo_t m_noredo;
+};
+
+struct trx_t : ilist_node<>
+{
+private:
+ /**
+ The least significant 31 bits are the count of references.
+
+ We can't release the locks nor commit the transaction until this reference
+ is 0. We can change the state to TRX_STATE_COMMITTED_IN_MEMORY to signify
+ that it is no longer "active".
+
+ If the most significant bit is set, this transaction should stop inheriting
+ (GAP) locks. Generally set to true during transaction prepare for RC or lower
+ isolation, if requested. Needed for replication replay where
+ we don't want to get blocked on GAP locks taken for protecting
+ concurrent unique insert or replace operation.
+ */
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE)
+ Atomic_relaxed<uint32_t> skip_lock_inheritance_and_n_ref;
+
+
+public:
+ /** Transaction identifier (0 if no locks were acquired).
+ Set by trx_sys_t::register_rw() or trx_resurrect() before
+ the transaction is added to trx_sys.rw_trx_hash.
+ Cleared in commit_in_memory() after commit_state(),
+ trx_sys_t::deregister_rw(), release_locks(). */
+ trx_id_t id;
+ /** The largest encountered transaction identifier for which no
+ transaction was observed to be active. This is a cache to speed up
+ trx_sys_t::find_same_or_older(). */
+ trx_id_t max_inactive_id;
+
+private:
+ /** mutex protecting state and some of lock
+ (some are protected by lock_sys.latch) */
+ srw_spin_mutex mutex;
+#ifdef UNIV_DEBUG
+ /** The owner of mutex (0 if none); protected by mutex */
+ std::atomic<pthread_t> mutex_owner{0};
+#endif /* UNIV_DEBUG */
+public:
+ void mutex_init() { mutex.init(); }
+ void mutex_destroy() { mutex.destroy(); }
+
+ /** Acquire the mutex */
+ void mutex_lock()
+ {
+ ut_ad(!mutex_is_owner());
+ mutex.wr_lock();
+ ut_ad(!mutex_owner.exchange(pthread_self(),
+ std::memory_order_relaxed));
+ }
+ /** Release the mutex */
+ void mutex_unlock()
+ {
+ ut_ad(mutex_owner.exchange(0, std::memory_order_relaxed)
+ == pthread_self());
+ mutex.wr_unlock();
+ }
+#ifndef SUX_LOCK_GENERIC
+ bool mutex_is_locked() const noexcept { return mutex.is_locked(); }
+#endif
+#ifdef UNIV_DEBUG
+ /** @return whether the current thread holds the mutex */
+ bool mutex_is_owner() const
+ {
+ return mutex_owner.load(std::memory_order_relaxed) ==
+ pthread_self();
+ }
+#endif /* UNIV_DEBUG */
+
+ /** State of the trx from the point of view of concurrency control
+ and the valid state transitions.
+
+ Possible states:
+
+ TRX_STATE_NOT_STARTED
+ TRX_STATE_ACTIVE
+ TRX_STATE_PREPARED
+ TRX_STATE_PREPARED_RECOVERED (special case of TRX_STATE_PREPARED)
+ TRX_STATE_COMMITTED_IN_MEMORY (alias below COMMITTED)
+
+ Valid state transitions are:
+
+ Regular transactions:
+ * NOT_STARTED -> ACTIVE -> COMMITTED -> NOT_STARTED
+
+ Auto-commit non-locking read-only:
+ * NOT_STARTED -> ACTIVE -> NOT_STARTED
+
+ XA (2PC):
+ * NOT_STARTED -> ACTIVE -> PREPARED -> COMMITTED -> NOT_STARTED
+
+ Recovered XA:
+ * NOT_STARTED -> PREPARED -> COMMITTED -> (freed)
+
+ Recovered XA followed by XA ROLLBACK:
+ * NOT_STARTED -> PREPARED -> ACTIVE -> COMMITTED -> (freed)
+
+ XA (2PC) (shutdown or disconnect before ROLLBACK or COMMIT):
+ * NOT_STARTED -> PREPARED -> (freed)
+
+ Disconnected XA PREPARE transaction can become recovered:
+ * ... -> ACTIVE -> PREPARED (connected) -> PREPARED (disconnected)
+
+ Latching and various transaction lists membership rules:
+
+ XA (2PC) transactions are always treated as non-autocommit.
+
+ Transitions to ACTIVE or NOT_STARTED occur when transaction
+ is not in rw_trx_hash.
+
+ Autocommit non-locking read-only transactions move between states
+ without holding any mutex. They are not in rw_trx_hash.
+
+ All transactions, unless they are determined to be ac-nl-ro,
+ explicitly tagged as read-only or read-write, will first be put
+ on the read-only transaction list. Only when a !read-only transaction
+ in the read-only list tries to acquire an X or IX lock on a table
+ do we remove it from the read-only list and put it on the read-write
+ list. During this switch we assign it a rollback segment.
+
+ When a transaction is NOT_STARTED, it can be in trx_list. It cannot be
+ in rw_trx_hash.
+
+ ACTIVE->PREPARED->COMMITTED is only possible when trx is in rw_trx_hash.
+ The transition ACTIVE->PREPARED is protected by trx->mutex.
+
+ ACTIVE->COMMITTED is possible when the transaction is in
+ rw_trx_hash.
+
+ Transitions to COMMITTED are protected by trx_t::mutex. */
+ Atomic_relaxed<trx_state_t> state;
+
+ /** The locks of the transaction. Protected by lock_sys.latch
+ (insertions also by trx_t::mutex). */
+ alignas(CPU_LEVEL1_DCACHE_LINESIZE) trx_lock_t lock;
+
+#ifdef WITH_WSREP
+ /** whether wsrep_on(mysql_thd) held at the start of transaction */
+ byte wsrep;
+ bool is_wsrep() const { return UNIV_UNLIKELY(wsrep); }
+ bool is_wsrep_UK_scan() const { return UNIV_UNLIKELY(wsrep & 2); }
+#else /* WITH_WSREP */
+ bool is_wsrep() const { return false; }
+#endif /* WITH_WSREP */
+
+ /** Consistent read view of the transaction */
+ ReadView read_view;
+
+ /* These fields are not protected by any mutex. */
+
+ /** false=normal transaction, true=recovered (must be rolled back)
+ or disconnected transaction in XA PREPARE STATE.
+
+ This field is accessed by the thread that owns the transaction,
+ without holding any mutex.
+ There is only one foreign-thread access in trx_print_low()
+ and a possible race condition with trx_disconnect_prepared(). */
+ bool is_recovered;
+ const char* op_info; /*!< English text describing the
+ current operation, or an empty
+ string */
+ uint isolation_level;/*!< TRX_ISO_REPEATABLE_READ, ...
*/ + bool check_foreigns; /*!< normally TRUE, but if the user + wants to suppress foreign key checks, + (in table imports, for example) we + set this FALSE */ + /** whether an insert into an empty table is active */ + bool bulk_insert; + /*------------------------------*/ + /* MySQL has a transaction coordinator to coordinate two phase + commit between multiple storage engines and the binary log. When + an engine participates in a transaction, it's responsible for + registering itself using the trans_register_ha() API. */ + bool is_registered; /* This flag is set to true after the + transaction has been registered with + the coordinator using the XA API, and + is set to false after commit or + rollback. */ + /** whether this is holding the prepare mutex */ + bool active_commit_ordered; + /*------------------------------*/ + bool check_unique_secondary; + /*!< normally TRUE, but if the user + wants to speed up inserts by + suppressing unique key checks + for secondary indexes when we decide + if we can use the insert buffer for + them, we set this FALSE */ + bool flush_log_later;/* In 2PC, we hold the + prepare_commit mutex across + both phases. In that case, we + defer flush of the logs to disk + until after we release the + mutex. */ + ulint duplicates; /*!< TRX_DUP_IGNORE | TRX_DUP_REPLACE */ + /** whether this modifies InnoDB dictionary tables */ + bool dict_operation; +#ifdef UNIV_DEBUG + /** copy of dict_operation during commit() */ + bool was_dict_operation; +#endif + /** whether dict_sys.latch is held exclusively; protected by + dict_sys.latch */ + bool dict_operation_lock_mode; + + /** wall-clock time of the latest transition to TRX_STATE_ACTIVE; + used for diagnostic purposes only */ + time_t start_time; + /** microsecond_interval_timer() of transaction start */ + ulonglong start_time_micro; + lsn_t commit_lsn; /*!< lsn at the time of the commit */ + /*------------------------------*/ + THD* mysql_thd; /*!< MySQL thread handle corresponding + to this trx, or NULL */ + + const char* mysql_log_file_name; + /*!< if MySQL binlog is used, this field + contains a pointer to the latest file + name; this is NULL if binlog is not + used */ + ulonglong mysql_log_offset; + /*!< if MySQL binlog is used, this + field contains the end offset of the + binlog entry */ + /*------------------------------*/ + ib_uint32_t n_mysql_tables_in_use; /*!< number of Innobase tables + used in the processing of the current + SQL statement in MySQL */ + ib_uint32_t mysql_n_tables_locked; + /*!< how many tables the current SQL + statement uses, except those + in consistent read */ + + /** DB_SUCCESS or error code; usually only the thread that is running + the transaction is allowed to modify this field. The only exception is + when a thread invokes lock_sys_t::cancel() in order to abort a + lock_wait(). That is protected by lock_sys.wait_mutex and lock.wait_lock. 
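The transition diagram documented above can be read as a small table; the following standalone sketch encodes it (illustrative only; the real transitions are additionally guarded by trx->mutex and rw_trx_hash membership, which this ignores):

    enum txn_state { NOT_STARTED, ACTIVE, PREPARED, COMMITTED };

    bool valid_transition(txn_state from, txn_state to)
    {
        switch (from) {
        case NOT_STARTED: // PREPARED is reachable directly only at recovery
            return to == ACTIVE || to == PREPARED;
        case ACTIVE:      // -> NOT_STARTED covers autocommit non-locking reads
            return to == PREPARED || to == COMMITTED || to == NOT_STARTED;
        case PREPARED:    // -> ACTIVE covers XA ROLLBACK of a recovered trx
            return to == COMMITTED || to == ACTIVE;
        case COMMITTED:
            return to == NOT_STARTED;
        }
        return false;
    }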
*/
+ dberr_t error_state;
+
+ const dict_index_t*error_info; /*!< if the error number indicates a
+ duplicate key error, a pointer to
+ the problematic index is stored here */
+ ulint error_key_num; /*!< if index creation fails due to a
+ duplicate key error, the MySQL key
+ number of that index is stored here */
+ que_t* graph; /*!< query currently run in the session,
+ or NULL if none; NOTE that the query
+ belongs to the session, and it can
+ survive over a transaction commit, if
+ it is a stored procedure with a COMMIT
+ WORK statement, for instance */
+ /*------------------------------*/
+ UT_LIST_BASE_NODE_T(trx_named_savept_t)
+ trx_savepoints; /*!< savepoints set with SAVEPOINT ...,
+ oldest first */
+ /*------------------------------*/
+ undo_no_t undo_no; /*!< next undo log record number to
+ assign; since the undo log is
+ private for a transaction, this
+ is a simple ascending sequence
+ with no gaps; thus it represents
+ the number of modified/inserted
+ rows in a transaction */
+ trx_savept_t last_sql_stat_start;
+ /*!< undo_no when the last sql statement
+ was started: in case of an error, trx
+ is rolled back down to this number */
+ trx_rsegs_t rsegs; /* rollback segments for undo logging */
+ undo_no_t roll_limit; /*!< least undo number to undo during
+ a partial rollback; 0 otherwise */
+ bool in_rollback; /*!< true when the transaction is
+ executing a partial or full rollback */
+ ulint pages_undone; /*!< number of undo log pages undone
+ since the last undo log truncation */
+ /*------------------------------*/
+ ulint n_autoinc_rows; /*!< no. of AUTO-INC rows required for
+ an SQL statement. This is useful for
+ multi-row INSERTs */
+ ib_vector_t* autoinc_locks; /* AUTOINC locks held by this
+ transaction. Note that these are
+ also in the lock list trx_locks. This
+ vector needs to be freed explicitly
+ when the trx instance is destroyed.
+ Protected by lock_sys.latch. */
+ /*------------------------------*/
+ bool read_only; /*!< true if transaction is flagged
+ as a READ-ONLY transaction.
+ if auto_commit && !will_lock
+ then it will be handled as a
+ AC-NL-RO-SELECT (Auto Commit Non-Locking
+ Read Only Select). A read only
+ transaction will not be assigned an
+ UNDO log. */
+ bool auto_commit; /*!< true if it is an autocommit
+ transaction */
+ bool will_lock; /*!< set to inform trx_start_low() that
+ the transaction may acquire locks */
+ /* True if transaction has to read the undo log and
+ log the DML changes for online DDL table */
+ bool apply_online_log = false;
+
+ /*------------------------------*/
+ fts_trx_t* fts_trx; /*!< FTS information, or NULL if
+ transaction hasn't modified tables
+ with FTS indexes (yet). */
+ doc_id_t fts_next_doc_id;/* The document id used for updates */
+ /*------------------------------*/
+ ib_uint32_t flush_tables; /*!< if "covering" a FLUSH TABLES
+ statement, count of tables being
+ flushed. */
+
+ /*------------------------------*/
+#ifdef UNIV_DEBUG
+ unsigned start_line; /*!< Track where it was started from */
+ const char* start_file; /*!< Filename where it was started */
+#endif /* UNIV_DEBUG */
+
+ XID xid; /*!< X/Open XA transaction
+ identification to identify a
+ transaction branch */
+ trx_mod_tables_t mod_tables; /*!< List of tables that were modified
+ by this transaction */
+ /*------------------------------*/
+ char* detailed_error; /*!< detailed error message for last
+ error, or empty.
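The interplay of undo_no and last_sql_stat_start above is what makes statement-level rollback possible: undo_no is a gapless ascending sequence, so snapshotting it at statement start yields the rollback target on error. A toy model of that bookkeeping (hypothetical names, not patch code):

    #include <cstdint>

    using undo_no_t = uint64_t;

    struct txn_undo_counters {
        undo_no_t undo_no    = 0; // next undo record number == rows changed
        undo_no_t stat_start = 0; // copy of undo_no taken at statement start

        void start_statement() { stat_start = undo_no; }
        void log_row_change()  { ++undo_no; }
        // on a statement error, undo everything back down to stat_start
        undo_no_t rollback_target() const { return stat_start; }
    };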
*/
+ rw_trx_hash_element_t *rw_trx_hash_element;
+ LF_PINS *rw_trx_hash_pins;
+ ulint magic_n;
+
+ /** @return whether any persistent undo log has been generated */
+ bool has_logged_persistent() const
+ {
+ return(rsegs.m_redo.undo);
+ }
+
+ /** @return whether any undo log has been generated */
+ bool has_logged() const
+ {
+ return(has_logged_persistent() || rsegs.m_noredo.undo);
+ }
+
+ /** @return rollback segment for modifying temporary tables */
+ trx_rseg_t* get_temp_rseg()
+ {
+ if (trx_rseg_t* rseg = rsegs.m_noredo.rseg) {
+ ut_ad(id != 0);
+ return(rseg);
+ }
+
+ return(assign_temp_rseg());
+ }
+
+ /** Transition to committed state, to release implicit locks. */
+ inline void commit_state();
+
+ /** Release any explicit locks of a committing transaction. */
+ inline void release_locks();
+
+ /** Evict a table definition due to the rollback of ALTER TABLE.
+ @param table_id table identifier
+ @param reset_only whether to only reset dict_table_t::def_trx_id */
+ void evict_table(table_id_t table_id, bool reset_only= false);
+
+ /** Initiate rollback.
+ @param savept savepoint to which to roll back
+ @return error code or DB_SUCCESS */
+ dberr_t rollback(trx_savept_t *savept= nullptr);
+ /** Roll back an active transaction.
+ @param savept savepoint to which to roll back */
+ inline void rollback_low(trx_savept_t *savept= nullptr);
+ /** Finish rollback.
+ @return whether the rollback was completed normally
+ @retval false if the rollback was aborted by shutdown */
+ inline bool rollback_finish();
+private:
+ /** Apply any changes to tables for which online DDL is in progress. */
+ ATTRIBUTE_COLD void apply_log();
+ /** Process tables that were modified by the committing transaction. */
+ inline void commit_tables();
+ /** Mark a transaction committed in the main memory data structures.
+ @param mtr mini-transaction (if there are any persistent modifications) */
+ inline void commit_in_memory(const mtr_t *mtr);
+ /** Write log for committing the transaction. */
+ void commit_persist();
+ /** Clean up the transaction after commit_in_memory() */
+ void commit_cleanup();
+ /** Commit the transaction in a mini-transaction.
+ @param mtr mini-transaction (if there are any persistent modifications) */
+ void commit_low(mtr_t *mtr= nullptr);
+ /** Commit an empty transaction.
+ @param mtr mini-transaction */
+ void commit_empty(mtr_t *mtr);
+ /** Assign the transaction its history serialisation number and write the
+ UNDO log to the assigned rollback segment.
+ @param mtr mini-transaction */
+ inline void write_serialisation_history(mtr_t *mtr);
+public:
+ /** Commit the transaction. */
+ void commit();
+
+ /** Try to drop a persistent table.
+ @param table persistent table
+ @return error code */
+ dberr_t drop_table(const dict_table_t &table);
+ /** Try to drop the foreign key constraints for a persistent table.
+ @param name name of persistent table
+ @return error code */
+ dberr_t drop_table_foreign(const table_name_t &name);
+ /** Try to drop the statistics for a persistent table.
+ @param name name of persistent table
+ @return error code */
+ dberr_t drop_table_statistics(const table_name_t &name);
+ /** Commit the transaction, possibly after drop_table().
+ @param deleted handles of data files that were deleted */
+ void commit(std::vector<pfs_os_file_t> &deleted);
+
+
+ /** Discard all savepoints */
+ void savepoints_discard()
+ { savepoints_discard(UT_LIST_GET_FIRST(trx_savepoints)); }
+
+
+ /** Discard all savepoints starting from a particular savepoint.
+ @param savept first savepoint to discard */
+ void savepoints_discard(trx_named_savept_t *savept);
+
+
+ bool is_referenced() const
+ {
+ return (skip_lock_inheritance_and_n_ref & ~(1U << 31)) > 0;
+ }
+
+
+ void reference()
+ {
+ ut_d(auto old_n_ref =)
+ skip_lock_inheritance_and_n_ref.fetch_add(1);
+ ut_ad(int32_t(old_n_ref << 1) >= 0);
+ }
+
+ void release_reference()
+ {
+ ut_d(auto old_n_ref =)
+ skip_lock_inheritance_and_n_ref.fetch_sub(1);
+ ut_ad(int32_t(old_n_ref << 1) > 0);
+ }
+
+ bool is_not_inheriting_locks() const
+ {
+ return skip_lock_inheritance_and_n_ref >> 31;
+ }
+
+ void set_skip_lock_inheritance()
+ {
+ ut_d(auto old_n_ref=) skip_lock_inheritance_and_n_ref.fetch_add(1U << 31);
+ ut_ad(!(old_n_ref >> 31));
+ }
+
+ void reset_skip_lock_inheritance()
+ {
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+ __asm__("lock btrl $31, %0" : : "m"(skip_lock_inheritance_and_n_ref));
+#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
+ _interlockedbittestandreset(
+ reinterpret_cast<volatile long*>(&skip_lock_inheritance_and_n_ref),
+ 31);
+#else
+ skip_lock_inheritance_and_n_ref.fetch_and(~(1U << 31));
+#endif
+ }
+
+ /** @return whether the table has lock on
+ mysql.innodb_table_stats or mysql.innodb_index_stats */
+ bool has_stats_table_lock() const;
+
+ /** Free the memory to trx_pools */
+ void free();
+
+
+ void assert_freed() const
+ {
+ ut_ad(state == TRX_STATE_NOT_STARTED);
+ ut_ad(!id);
+ ut_ad(!mutex_is_owner());
+ ut_ad(!has_logged());
+ ut_ad(!is_referenced());
+ ut_ad(!is_wsrep());
+ ut_ad(!lock.was_chosen_as_deadlock_victim);
+ ut_ad(mod_tables.empty());
+ ut_ad(!read_view.is_open());
+ ut_ad(!lock.wait_thr);
+ ut_ad(!lock.wait_lock);
+ ut_ad(UT_LIST_GET_LEN(lock.trx_locks) == 0);
+ ut_ad(lock.table_locks.empty());
+ ut_ad(!autoinc_locks || ib_vector_is_empty(autoinc_locks));
+ ut_ad(UT_LIST_GET_LEN(lock.evicted_tables) == 0);
+ ut_ad(!dict_operation);
+ ut_ad(!apply_online_log);
+ ut_ad(!is_not_inheriting_locks());
+ ut_ad(check_foreigns);
+ ut_ad(check_unique_secondary);
+ }
+
+ /** This has to be invoked on SAVEPOINT or at the end of a statement.
+ Even if a TRX_UNDO_EMPTY record was written for this table to cover an
+ insert into an empty table, subsequent operations will have to be covered
+ by row-level undo log records, so that ROLLBACK TO SAVEPOINT or a
+ rollback to the start of a statement will work.
+ @param table table on which any preceding bulk insert ended */
+ void end_bulk_insert(const dict_table_t &table)
+ {
+ auto it= mod_tables.find(const_cast<dict_table_t*>(&table));
+ if (it != mod_tables.end())
+ it->second.end_bulk_insert();
+ }
+
+ /** @return whether this is a non-locking autocommit transaction */
+ bool is_autocommit_non_locking() const { return auto_commit && !will_lock; }
+
+ /** This has to be invoked on SAVEPOINT or at the start of a statement.
+ Even if TRX_UNDO_EMPTY records were written for any table to cover an
+ insert into an empty table, subsequent operations will have to be covered
+ by row-level undo log records, so that ROLLBACK TO SAVEPOINT or a
+ rollback to the start of a statement will work.
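The accessors above all manipulate one atomic word: bits 0..30 count references, bit 31 is the stop-inheriting-gap-locks flag. A simplified standalone model follows (it uses fetch_or to set the flag, where the header uses fetch_add under a debug assertion that the bit was clear):

    #include <atomic>
    #include <cassert>
    #include <cstdint>

    std::atomic<uint32_t> word{0};

    bool is_referenced()     { return (word.load() & ~(1U << 31)) > 0; }
    void reference()         { word.fetch_add(1); }
    void release_reference() { word.fetch_sub(1); }
    bool skips_inheritance() { return word.load() >> 31; }
    void set_skip()          { word.fetch_or(1U << 31); }
    void reset_skip()        { word.fetch_and(~(1U << 31)); }

    int main()
    {
        reference();
        set_skip();
        assert(is_referenced() && skips_inheritance());
        release_reference();
        reset_skip();
        assert(!is_referenced() && !skips_inheritance());
    }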
*/
+ void end_bulk_insert()
+ {
+ for (auto& t : mod_tables)
+ t.second.end_bulk_insert();
+ }
+
+ /** @return whether a bulk insert into empty table is in progress */
+ bool is_bulk_insert() const
+ {
+ if (!bulk_insert || check_unique_secondary || check_foreigns)
+ return false;
+ for (const auto& t : mod_tables)
+ if (t.second.is_bulk_insert())
+ return true;
+ return false;
+ }
+
+ /** @return logical modification time of a table, but only if the
+ transaction holds a bulk insert buffer for it */
+ trx_mod_table_time_t *check_bulk_buffer(dict_table_t *table)
+ {
+ if (UNIV_LIKELY(!bulk_insert))
+ return nullptr;
+ ut_ad(!check_unique_secondary);
+ ut_ad(!check_foreigns);
+ auto it= mod_tables.find(table);
+ if (it == mod_tables.end() || !it->second.bulk_buffer_exist())
+ return nullptr;
+ return &it->second;
+ }
+
+ /** Do the bulk insert for the buffered insert operation
+ for the transaction.
+ @return DB_SUCCESS or error code */
+ dberr_t bulk_insert_apply()
+ {
+ return UNIV_UNLIKELY(bulk_insert) ? bulk_insert_apply_low(): DB_SUCCESS;
+ }
+
+private:
+ /** Apply the buffered bulk inserts. */
+ dberr_t bulk_insert_apply_low();
+
+ /** Assign a rollback segment for modifying temporary tables.
+ @return the assigned rollback segment */
+ trx_rseg_t *assign_temp_rseg();
+};
+
+/**
+Check if a transaction is started.
+@param[in] trx transaction whose state we need to check
+@return true if the transaction is in a started state */
+inline bool trx_is_started(const trx_t* trx)
+{
+ return trx->state != TRX_STATE_NOT_STARTED;
+}
+
+/* Transaction isolation levels (trx->isolation_level) */
+#define TRX_ISO_READ_UNCOMMITTED 0 /* dirty read: non-locking
+ SELECTs are performed so that
+ we do not look at a possible
+ earlier version of a record;
+ thus they are not 'consistent'
+ reads under this isolation
+ level; otherwise like level
+ 2 */
+
+#define TRX_ISO_READ_COMMITTED 1 /* somewhat Oracle-like
+ isolation, except that in
+ range UPDATE and DELETE we
+ must block phantom rows
+ with next-key locks;
+ SELECT ... FOR UPDATE and ...
+ LOCK IN SHARE MODE only lock
+ the index records, NOT the
+ gaps before them, and thus
+ allow free inserting;
+ each consistent read reads its
+ own snapshot */
+
+#define TRX_ISO_REPEATABLE_READ 2 /* this is the default;
+ all consistent reads in the
+ same trx read the same
+ snapshot;
+ full next-key locking used
+ in locking reads to block
+ insertions into gaps */
+
+#define TRX_ISO_SERIALIZABLE 3 /* all plain SELECTs are
+ converted to LOCK IN SHARE
+ MODE reads */
+
+/* Treatment of duplicate values (trx->duplicates; for example, in inserts).
+Multiple flags can be combined with bitwise OR.
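As a one-place summary of the four isolation constants above (an illustrative helper, not patch code):

    const char* iso_level_summary(unsigned level)
    {
        switch (level) {
        case 0: return "READ UNCOMMITTED: non-locking reads, no snapshot";
        case 1: return "READ COMMITTED: fresh snapshot per consistent read";
        case 2: return "REPEATABLE READ: one snapshot for the whole trx";
        case 3: return "SERIALIZABLE: plain SELECT behaves as locking read";
        default: return "unknown";
        }
    }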
*/ +#define TRX_DUP_IGNORE 1U /* duplicate rows are to be updated */ +#define TRX_DUP_REPLACE 2U /* duplicate rows are to be replaced */ + + +/** Commit node states */ +enum commit_node_state { + COMMIT_NODE_SEND = 1, /*!< about to send a commit signal to + the transaction */ + COMMIT_NODE_WAIT /*!< commit signal sent to the transaction, + waiting for completion */ +}; + +/** Commit command node in a query graph */ +struct commit_node_t{ + que_common_t common; /*!< node type: QUE_NODE_COMMIT */ + enum commit_node_state + state; /*!< node execution state */ +}; + + +#include "trx0trx.inl" + +#endif diff --git a/storage/innobase/include/trx0trx.inl b/storage/innobase/include/trx0trx.inl new file mode 100644 index 00000000..b063c920 --- /dev/null +++ b/storage/innobase/include/trx0trx.inl @@ -0,0 +1,86 @@ +/***************************************************************************** + +Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2016, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0trx.ic +The transaction + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +/**********************************************************************//** +Determines if a transaction is in the given state. +The caller must hold trx->mutex, or it must be the thread +that is serving a running transaction. +A running RW transaction must be in trx_sys.rw_trx_hash. +@return TRUE if trx->state == state */ +UNIV_INLINE +bool +trx_state_eq( +/*=========*/ + const trx_t* trx, /*!< in: transaction */ + trx_state_t state, /*!< in: state; + if state != TRX_STATE_NOT_STARTED + asserts that + trx->state != TRX_STATE_NOT_STARTED */ + bool relaxed) + /*!< in: whether to allow + trx->state == TRX_STATE_NOT_STARTED + after an error has been reported */ +{ +#ifdef UNIV_DEBUG + switch (trx->state) { + case TRX_STATE_PREPARED: + case TRX_STATE_PREPARED_RECOVERED: + case TRX_STATE_COMMITTED_IN_MEMORY: + ut_ad(!trx->is_autocommit_non_locking()); + return(trx->state == state); + + case TRX_STATE_ACTIVE: + if (trx->is_autocommit_non_locking()) { + ut_ad(!trx->is_recovered); + ut_ad(trx->read_only); + ut_ad(trx->mysql_thd); + } + return(state == trx->state); + + case TRX_STATE_NOT_STARTED: + /* These states are not allowed for running transactions. */ + ut_a(state == TRX_STATE_NOT_STARTED + || (relaxed + && thd_get_error_number(trx->mysql_thd))); + + return(true); + } + ut_error; +#endif /* UNIV_DEBUG */ + return(trx->state == state); +} + +/****************************************************************//** +Retrieves the error_info field from a trx. 
+@return the error info */
+UNIV_INLINE
+const dict_index_t*
+trx_get_error_info(
+/*===============*/
+ const trx_t* trx) /*!< in: trx object */
+{
+ return(trx->error_info);
+}
diff --git a/storage/innobase/include/trx0types.h b/storage/innobase/include/trx0types.h
new file mode 100644
index 00000000..bfa2adc0
--- /dev/null
+++ b/storage/innobase/include/trx0types.h
@@ -0,0 +1,131 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2021, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0types.h
+Transaction system global type definitions
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#pragma once
+#include "univ.i"
+#include "ut0new.h"
+
+#include <vector>
+
+/** printf(3) format used for printing DB_TRX_ID and other system fields */
+#define TRX_ID_FMT IB_ID_FMT
+
+/** maximum length that a formatted trx_t::id could take, not including
+the terminating NUL character.
*/
+static const ulint TRX_ID_MAX_LEN = 17;
+
+/** Space id of the transaction system page (the system tablespace) */
+static constexpr uint32_t TRX_SYS_SPACE= 0;
+
+/** Page number of the transaction system page */
+#define TRX_SYS_PAGE_NO FSP_TRX_SYS_PAGE_NO
+
+/** Random value to check for corruption of trx_t */
+static const ulint TRX_MAGIC_N = 91118598;
+
+constexpr uint innodb_purge_threads_MAX= 32;
+constexpr uint innodb_purge_batch_size_MAX= 5000;
+
+/** Transaction states (trx_t::state) */
+enum trx_state_t {
+ TRX_STATE_NOT_STARTED,
+
+ TRX_STATE_ACTIVE,
+ /** XA PREPARE has been executed; only XA COMMIT or XA ROLLBACK
+ are possible */
+ TRX_STATE_PREPARED,
+ /** XA PREPARE transaction that was returned to ha_recover() */
+ TRX_STATE_PREPARED_RECOVERED,
+ TRX_STATE_COMMITTED_IN_MEMORY
+};
+
+/** Memory objects */
+/* @{ */
+/** Transaction */
+struct trx_t;
+/** The locks and state of an active transaction */
+struct trx_lock_t;
+/** Rollback segment */
+struct trx_rseg_t;
+/** Transaction undo log */
+struct trx_undo_t;
+/** Rollback command node in a query graph */
+struct roll_node_t;
+/** Commit command node in a query graph */
+struct commit_node_t;
+/** SAVEPOINT command node in a query graph */
+struct trx_named_savept_t;
+/* @} */
+
+/** Row identifier (DB_ROW_ID, DATA_ROW_ID) */
+typedef ib_id_t row_id_t;
+/** Transaction identifier (DB_TRX_ID, DATA_TRX_ID) */
+typedef ib_id_t trx_id_t;
+/** Rollback pointer (DB_ROLL_PTR, DATA_ROLL_PTR) */
+typedef ib_id_t roll_ptr_t;
+/** Undo number */
+typedef ib_id_t undo_no_t;
+
+/** Transaction savepoint */
+struct trx_savept_t{
+ undo_no_t least_undo_no; /*!< least undo number to undo */
+};
+
+/** File objects */
+/* @{ */
+/** Undo segment header */
+typedef byte trx_usegf_t;
+/** Undo log header */
+typedef byte trx_ulogf_t;
+/** Undo log page header */
+typedef byte trx_upagef_t;
+
+/** Undo log record */
+typedef byte trx_undo_rec_t;
+
+/* @} */
+
+/** Info required to purge a record */
+struct trx_purge_rec_t
+{
+ /** Undo log record, or nullptr (roll_ptr!=0 if the log can be skipped) */
+ const trx_undo_rec_t *undo_rec;
+ /** File pointer to undo_rec */
+ roll_ptr_t roll_ptr;
+};
+
+typedef std::vector<trx_id_t, ut_allocator<trx_id_t> > trx_ids_t;
+
+/** Number of std::unordered_map hash buckets expected to be needed
+for table IDs in a purge batch. GNU libstdc++ would default to 1 and
+enlarge and rehash on demand. */
+static constexpr size_t TRX_PURGE_TABLE_BUCKETS= 128;
+
+/** The number of rollback segments; rollback segment id must fit in
+the 7 bits reserved for it in DB_ROLL_PTR. */
+static constexpr unsigned TRX_SYS_N_RSEGS= 128;
+/** Maximum number of undo tablespaces (not counting the system tablespace) */
+static constexpr unsigned TRX_SYS_MAX_UNDO_SPACES= TRX_SYS_N_RSEGS - 1;
diff --git a/storage/innobase/include/trx0undo.h b/storage/innobase/include/trx0undo.h
new file mode 100644
index 00000000..3d22a33e
--- /dev/null
+++ b/storage/innobase/include/trx0undo.h
@@ -0,0 +1,514 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2016, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0undo.h +Transaction undo log + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#ifndef trx0undo_h +#define trx0undo_h + +#ifndef UNIV_INNOCHECKSUM +#include "trx0sys.h" + +/** The LSB of the "is insert" flag in DB_ROLL_PTR */ +#define ROLL_PTR_INSERT_FLAG_POS 55 +/** The LSB of the 7-bit trx_rseg_t::id in DB_ROLL_PTR */ +#define ROLL_PTR_RSEG_ID_POS 48 +/** The LSB of the 32-bit undo log page number in DB_ROLL_PTR */ +#define ROLL_PTR_PAGE_POS 16 +/** The LSB of the 16-bit byte offset within an undo log page in DB_ROLL_PTR */ +#define ROLL_PTR_BYTE_POS 0 + +/***********************************************************************//** +Builds a roll pointer. +@return roll pointer */ +UNIV_INLINE +roll_ptr_t +trx_undo_build_roll_ptr( +/*====================*/ + bool is_insert, /*!< in: TRUE if insert undo log */ + ulint rseg_id, /*!< in: rollback segment id */ + uint32_t page_no, /*!< in: page number */ + uint16_t offset); /*!< in: offset of the undo entry within page */ +/***********************************************************************//** +Decodes a roll pointer. */ +UNIV_INLINE +void +trx_undo_decode_roll_ptr( +/*=====================*/ + roll_ptr_t roll_ptr, /*!< in: roll pointer */ + bool* is_insert, /*!< out: TRUE if insert undo log */ + ulint* rseg_id, /*!< out: rollback segment id */ + uint32_t* page_no, /*!< out: page number */ + uint16_t* offset); /*!< out: offset of the undo + entry within page */ +/***********************************************************************//** +Determine if DB_ROLL_PTR is of the insert type. +@return true if insert */ +UNIV_INLINE +bool +trx_undo_roll_ptr_is_insert( +/*========================*/ + roll_ptr_t roll_ptr); /*!< in: roll pointer */ +/***********************************************************************//** +Returns true if the record is of the insert type. +@return true if the record was freshly inserted (not updated). */ +UNIV_INLINE +bool +trx_undo_trx_id_is_insert( +/*======================*/ + const byte* trx_id) /*!< in: DB_TRX_ID, followed by DB_ROLL_PTR */ + MY_ATTRIBUTE((warn_unused_result)); +/** Write DB_ROLL_PTR. +@param[out] ptr buffer +@param[in] roll_ptr DB_ROLL_PTR value */ +inline void trx_write_roll_ptr(byte* ptr, roll_ptr_t roll_ptr) +{ + compile_time_assert(DATA_ROLL_PTR_LEN == 7); + mach_write_to_7(ptr, roll_ptr); +} +/** Read DB_ROLL_PTR. +@param[in] ptr buffer +@return roll ptr */ +inline roll_ptr_t trx_read_roll_ptr(const byte* ptr) +{ + compile_time_assert(DATA_ROLL_PTR_LEN == 7); + return mach_read_from_7(ptr); +} + +/** Get the next record in an undo log. 
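The ROLL_PTR_* positions above define a 56-bit layout inside the 7-byte DB_ROLL_PTR: one insert-flag bit at position 55, 7 rollback-segment-id bits at 48, a 32-bit undo page number at 16, and a 16-bit page offset at 0. A standalone worked example of packing and unpacking (mirroring trx_undo_build_roll_ptr and trx_undo_decode_roll_ptr, which are defined in trx0undo.inl further down):

    #include <cassert>
    #include <cstdint>

    int main()
    {
        const uint64_t roll_ptr = uint64_t{1}  << 55  // is_insert flag
                                | uint64_t{5}  << 48  // rseg id (must be < 128)
                                | uint64_t{97} << 16  // undo log page number
                                | 1234;               // byte offset in the page
        assert(roll_ptr >> 55 == 1);             // insert-type record
        assert((roll_ptr >> 48 & 0x7F) == 5);    // rollback segment id
        assert(uint32_t(roll_ptr >> 16) == 97);  // page number
        assert(uint16_t(roll_ptr) == 1234);      // offset
    }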
+@param[in] undo_page undo log page +@param[in] rec undo record offset in the page +@param[in] page_no undo log header page number +@param[in] offset undo log header offset on page +@return undo log record, the page latched, NULL if none */ +inline trx_undo_rec_t* +trx_undo_page_get_next_rec(const buf_block_t *undo_page, uint16_t rec, + uint32_t page_no, uint16_t offset); +/** Get the previous record in an undo log. +@param[in,out] block undo log page +@param[in] rec undo record offset in the page +@param[in] page_no undo log header page number +@param[in] offset undo log header offset on page +@param[in] shared latching mode: true=RW_S_LATCH, false=RW_X_LATCH +@param[in,out] mtr mini-transaction +@return undo log record, the page latched, NULL if none */ +trx_undo_rec_t* +trx_undo_get_prev_rec(buf_block_t *&block, uint16_t rec, uint32_t page_no, + uint16_t offset, bool shared, mtr_t *mtr); + +/** Get the first undo log record on a page. +@param[in] block undo log page +@param[in] page_no undo log header page number +@param[in] offset undo log header page offset +@return pointer to first record +@retval nullptr if none exists */ +trx_undo_rec_t* +trx_undo_page_get_first_rec(const buf_block_t *block, uint32_t page_no, + uint16_t offset); + +/** Initialize an undo log page. +NOTE: This corresponds to a redo log record and must not be changed! +@see mtr_t::undo_create() +@param[in,out] block undo log page */ +void trx_undo_page_init(const buf_block_t &block); + +/** Allocate an undo log page. +@param[in,out] undo undo log +@param[in,out] mtr mini-transaction that does not hold any page latch +@param[out] err error code +@return X-latched block if success +@retval nullptr on failure */ +buf_block_t *trx_undo_add_page(trx_undo_t *undo, mtr_t *mtr, dberr_t *err) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Free the last undo log page. The caller must hold the rseg mutex. +@param[in,out] undo undo log +@param[in,out] mtr mini-transaction that does not hold any undo log page + or that has allocated the undo log page +@return error code */ +dberr_t trx_undo_free_last_page(trx_undo_t *undo, mtr_t *mtr) + MY_ATTRIBUTE((nonnull, warn_unused_result)); + +/** Try to truncate the undo logs. +@param trx transaction +@return error code */ +dberr_t trx_undo_try_truncate(const trx_t &trx); + +/** Truncate the head of an undo log. +NOTE that only whole pages are freed; the header page is not +freed, but emptied, if all the records there are below the limit. +@param[in,out] rseg rollback segment +@param[in] hdr_page_no header page number +@param[in] hdr_offset header offset on the page +@param[in] limit first undo number to preserve +(everything below the limit will be truncated) +@return error code */ +dberr_t +trx_undo_truncate_start( + trx_rseg_t* rseg, + uint32_t hdr_page_no, + uint16_t hdr_offset, + undo_no_t limit) + MY_ATTRIBUTE((nonnull, warn_unused_result)); +/** Mark that an undo log header belongs to a data dictionary transaction. +@param[in] trx dictionary transaction +@param[in,out] undo undo log +@param[in,out] mtr mini-transaction */ +void trx_undo_mark_as_dict(const trx_t* trx, trx_undo_t* undo, mtr_t* mtr); +/** Assign an undo log for a persistent transaction. +A new undo log is created or a cached undo log reused. 
+@param[in,out] trx transaction
+@param[out] err error code
+@param[in,out] mtr mini-transaction
+@return the undo log block
+@retval NULL on error */
+buf_block_t*
+trx_undo_assign(trx_t* trx, dberr_t* err, mtr_t* mtr)
+ MY_ATTRIBUTE((nonnull));
+/** Assign an undo log for a transaction.
+A new undo log is created or a cached undo log reused.
+@tparam is_temp whether this is temporary undo log
+@param[in,out] trx transaction
+@param[in] rseg rollback segment
+@param[out] undo the undo log
+@param[in,out] mtr mini-transaction
+@param[out] err error code
+@return the undo log block
+@retval nullptr on error */
+template<bool is_temp>
+buf_block_t*
+trx_undo_assign_low(trx_t *trx, trx_rseg_t *rseg, trx_undo_t **undo,
+ mtr_t *mtr, dberr_t *err)
+ MY_ATTRIBUTE((nonnull, warn_unused_result));
+
+/** Set the state of the undo log segment at a XA PREPARE or XA ROLLBACK.
+@param[in,out] trx transaction
+@param[in,out] undo undo log
+@param[in] rollback false=XA PREPARE, true=XA ROLLBACK
+@param[in,out] mtr mini-transaction */
+void trx_undo_set_state_at_prepare(trx_t *trx, trx_undo_t *undo, bool rollback,
+ mtr_t *mtr)
+ MY_ATTRIBUTE((nonnull));
+
+/** At shutdown, frees the undo logs of a transaction. */
+void
+trx_undo_free_at_shutdown(trx_t *trx);
+
+/** Read an undo log when starting up the database.
+@param[in,out] rseg rollback segment
+@param[in] id rollback segment slot
+@param[in] page_no undo log segment page number
+@return the undo log
+@retval nullptr on error */
+trx_undo_t *
+trx_undo_mem_create_at_db_start(trx_rseg_t *rseg, ulint id, uint32_t page_no);
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** the only rollback segment type since MariaDB 10.3.1 */
+constexpr uint16_t TRX_UNDO_UPDATE= 2;
+/* TRX_UNDO_STATE values of an undo log segment */
+/** contains an undo log of an active transaction */
+constexpr uint16_t TRX_UNDO_ACTIVE = 1;
+/** cached for quick reuse */
+constexpr uint16_t TRX_UNDO_CACHED = 2;
+/** can be freed in purge when all undo data in it is removed */
+constexpr uint16_t TRX_UNDO_TO_PURGE = 4;
+/** contains an undo log of a prepared transaction */
+constexpr uint16_t TRX_UNDO_PREPARED = 5;
+
+#ifndef UNIV_INNOCHECKSUM
+
+/** Transaction undo log memory object; modified by the thread associated
+with the transaction.
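A small illustrative decoder for the TRX_UNDO_STATE values just defined (not patch code):

    const char* undo_state_name(unsigned state)
    {
        switch (state) {
        case 1:  return "TRX_UNDO_ACTIVE";    // undo log of an active trx
        case 2:  return "TRX_UNDO_CACHED";    // cached for quick reuse
        case 4:  return "TRX_UNDO_TO_PURGE";  // freeable once purged
        case 5:  return "TRX_UNDO_PREPARED";  // undo log of an XA PREPARE trx
        default: return "invalid";
        }
    }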
*/ + +struct trx_undo_t { + /*-----------------------------*/ + ulint id; /*!< undo log slot number within the + rollback segment */ + ulint state; /*!< state of the corresponding undo log + segment */ + trx_id_t trx_id; /*!< id of the trx assigned to the undo + log */ + XID xid; /*!< X/Open XA transaction + identification */ + bool dict_operation; /*!< TRUE if a dict operation trx */ + trx_rseg_t* rseg; /*!< rseg where the undo log belongs */ + /*-----------------------------*/ + uint32_t hdr_page_no; /*!< page number of the header page in + the undo log */ + uint32_t last_page_no; /*!< page number of the last page in the + undo log; this may differ from + top_page_no during a rollback */ + uint16_t hdr_offset; /*!< header offset of the undo log on + the page */ + uint32_t size; /*!< current size in pages */ + /*-----------------------------*/ + uint32_t top_page_no; /*!< page number where the latest undo + log record was catenated; during + rollback the page from which the latest + undo record was chosen */ + uint16_t top_offset; /*!< offset of the latest undo record, + i.e., the topmost element in the undo + log if we think of it as a stack */ + undo_no_t top_undo_no; /*!< undo number of the latest record + (IB_ID_MAX if the undo log is empty) */ + buf_block_t* guess_block; /*!< guess for the buffer block where + the top page might reside */ + + /** @return whether the undo log is empty */ + bool empty() const { return top_undo_no == IB_ID_MAX; } + + /*-----------------------------*/ + UT_LIST_NODE_T(trx_undo_t) undo_list; + /*!< undo log objects in the rollback + segment are chained into lists */ +}; + +/** Cache a pointer to an undo record in a latched buffer pool page, +parse the undo log record and store the record type, update vector +and compiler information */ +class UndorecApplier +{ + /** Undo log block page id */ + page_id_t page_id; + /** Pointer to within undo log record */ + const trx_undo_rec_t *undo_rec; + /** Undo log record type */ + byte type; + /** compiler information */ + byte cmpl_info; + /** page_offset(undo_rec) of the start of undo_rec */ + uint16_t offset; + /** Transaction id of the undo log */ + const trx_id_t trx_id; + /** Update vector */ + upd_t *update; + /** memory heap which can be used to build previous version of + the index record and its offsets */ + mem_heap_t *heap; + /** mini-transaction for accessing B-tree pages */ + mtr_t mtr; + +public: + UndorecApplier(page_id_t page_id, trx_id_t trx_id) : + page_id(page_id), trx_id(trx_id), heap(mem_heap_create(100)) + { + } + + /** Assign the next page id */ + void assign_next(const page_id_t next_page_id) + { + page_id= next_page_id; + } + + page_id_t get_page_id() const { return page_id; } + + /** Handle the DML undo log and apply it on online indexes */ + inline void apply_undo_rec(const trx_undo_rec_t *rec); + + ~UndorecApplier() + { + mem_heap_free(heap); + } + +private: + /** Handle the insert undo log and apply it on online indexes + @param tuple row reference from undo log record + @param clust_index clustered index */ + void log_insert(const dtuple_t &tuple, dict_index_t *clust_index); + + /** Handle the update, delete undo log and apply it on online + indexes. + @param tuple row reference from undo log record + @param clust_index clustered index */ + void log_update(const dtuple_t &tuple, dict_index_t *clust_index); + + /** Check whether the given roll pointer is generated by + the current undo log record information stored. 
+ @return true if the roll pointer matches the current undo log info */
+ inline bool is_same(roll_ptr_t roll_ptr) const;
+
+ /** Clear the undo log record information */
+ void clear_undo_rec()
+ {
+ undo_rec= nullptr;
+ cmpl_info= 0;
+ type= 0;
+ update= nullptr;
+ mem_heap_empty(heap);
+ }
+
+ /** Get the correct version of the clustered index record that
+ was modified by the current undo log record, because there could
+ be multiple successive updates of the same record within the
+ same transaction.
+ @param tuple tuple contains primary key value
+ @param index clustered index
+ @param[out] clust_rec current clustered index record
+ @param offsets offsets points to the record
+ @return clustered index record which was changed by
+ the undo log record or nullptr when there is no clustered
+ index record changed by undo log record */
+ const rec_t* get_old_rec(const dtuple_t &tuple, dict_index_t *index,
+ const rec_t **clust_rec, rec_offs **offsets);
+};
+
+#endif /* !UNIV_INNOCHECKSUM */
+
+/** The offset of the undo log page header on pages of the undo log */
+#define TRX_UNDO_PAGE_HDR FSEG_PAGE_DATA
+/*-------------------------------------------------------------*/
+/** Transaction undo log page header offsets */
+/* @{ */
+#define TRX_UNDO_PAGE_TYPE 0 /*!< unused; 0 (before MariaDB 10.3.1:
+ 1=TRX_UNDO_INSERT or
+ 2=TRX_UNDO_UPDATE) */
+#define TRX_UNDO_PAGE_START 2 /*!< Byte offset where the undo log
+ records for the LATEST transaction
+ start on this page (remember that
+ in an update undo log, the first page
+ can contain several undo logs) */
+#define TRX_UNDO_PAGE_FREE 4 /*!< On each page of the undo log this
+ field contains the byte offset of the
+ first free byte on the page */
+#define TRX_UNDO_PAGE_NODE 6 /*!< The file list node in the chain
+ of undo log pages */
+/*-------------------------------------------------------------*/
+#define TRX_UNDO_PAGE_HDR_SIZE (6 + FLST_NODE_SIZE)
+ /*!< Size of the transaction undo
+ log page header, in bytes */
+/* @} */
+
+/** An update undo segment with just one page can be reused if it has
+at most this many bytes used; we must leave space at least for one new undo
+log header on the page */
+
+#define TRX_UNDO_PAGE_REUSE_LIMIT (3 << (srv_page_size_shift - 2))
+
+/* An update undo log segment may contain several undo logs on its first page
+if the undo logs took so little space that the segment could be cached and
+reused. All the undo log headers are then on the first page, and the last one
+owns the undo log records on subsequent pages if the segment is bigger than
+one page. If an undo log is stored in a segment, then on the first page it is
+allowed to have zero undo records, but if the segment extends to several
+pages, then all the rest of the pages must contain at least one undo log
+record. */
+
+/** The offset of the undo log segment header on the first page of the undo
+log segment */
+
+#define TRX_UNDO_SEG_HDR (TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE)
+/** Undo log segment header */
+/* @{ */
+/*-------------------------------------------------------------*/
+#define TRX_UNDO_STATE 0 /*!< TRX_UNDO_ACTIVE, ...
*/ + +#ifndef UNIV_INNOCHECKSUM + +#define TRX_UNDO_LAST_LOG 2 /*!< Offset of the last undo log header + on the segment header page, 0 if + none */ +#define TRX_UNDO_FSEG_HEADER 4 /*!< Header for the file segment which + the undo log segment occupies */ +#define TRX_UNDO_PAGE_LIST (4 + FSEG_HEADER_SIZE) + /*!< Base node for the list of pages in + the undo log segment; defined only on + the undo log segment's first page */ +/*-------------------------------------------------------------*/ +/** Size of the undo log segment header */ +#define TRX_UNDO_SEG_HDR_SIZE (4 + FSEG_HEADER_SIZE + FLST_BASE_NODE_SIZE) +/* @} */ + +/** The undo log header. There can be several undo log headers on the first +page of an update undo log segment. */ +/* @{ */ +/*-------------------------------------------------------------*/ +/** Transaction start identifier, or 0 if the undo log segment has been +completely purged and trx_purge_free_segment() has started freeing it */ +#define TRX_UNDO_TRX_ID 0 +/** Transaction end identifier (if the log is in a history list), +or 0 if the transaction has not been committed */ +#define TRX_UNDO_TRX_NO 8 +/** Before MariaDB 10.3.1, when purge did not reset DB_TRX_ID of +surviving user records, this used to be called TRX_UNDO_DEL_MARKS. + +This field is redundant; it is only being read by some debug assertions. + +The value 1 indicates that purge needs to process the undo log segment. +The value 0 indicates that all of it has been processed, and +trx_purge_free_segment() has been invoked, so the log is not safe to access. + +Before MariaDB 10.3.1, a log segment may carry the value 0 even before +trx_purge_free_segment() was called, for those undo log records for +which purge would not result in removing delete-marked records. */ +#define TRX_UNDO_NEEDS_PURGE 16 +#define TRX_UNDO_LOG_START 18 /*!< Offset of the first undo log record + of this log on the header page; purge + may remove undo log record from the + log start, and therefore this is not + necessarily the same as this log + header end offset */ +#define TRX_UNDO_XID_EXISTS 20 /*!< TRUE if undo log header includes + X/Open XA transaction identification + XID */ +#define TRX_UNDO_DICT_TRANS 21 /*!< TRUE if the transaction is a table + create, index create, or drop + transaction: in recovery + the transaction cannot be rolled back + in the usual way: a 'rollback' rather + means dropping the created or dropped + table, if it still exists */ +#define TRX_UNDO_TABLE_ID 22 /*!< Id of the table if the preceding + field is TRUE */ +#define TRX_UNDO_NEXT_LOG 30 /*!< Offset of the next undo log header + on this page, 0 if none */ +#define TRX_UNDO_PREV_LOG 32 /*!< Offset of the previous undo log + header on this page, 0 if none */ +#define TRX_UNDO_HISTORY_NODE 34 /*!< If the log is put to the history + list, the file list node is here */ +/*-------------------------------------------------------------*/ +/** Size of the undo log header without XID information */ +#define TRX_UNDO_LOG_OLD_HDR_SIZE (34 + FLST_NODE_SIZE) + +/** X/Open XA Transaction Identification (XID) */ +/* @{ */ +/** xid_t::formatID */ +#define TRX_UNDO_XA_FORMAT (TRX_UNDO_LOG_OLD_HDR_SIZE) +/** xid_t::gtrid_length */ +#define TRX_UNDO_XA_TRID_LEN (TRX_UNDO_XA_FORMAT + 4) +/** xid_t::bqual_length */ +#define TRX_UNDO_XA_BQUAL_LEN (TRX_UNDO_XA_TRID_LEN + 4) +/** Distributed transaction identifier data */ +#define TRX_UNDO_XA_XID (TRX_UNDO_XA_BQUAL_LEN + 4) +/*--------------------------------------------------------------*/ +#define TRX_UNDO_LOG_XA_HDR_SIZE 
(TRX_UNDO_XA_XID + XIDDATASIZE) + /*!< Total size of the undo log header + with the XA XID */ +/* @} */ + +#include "trx0undo.inl" +#endif /* !UNIV_INNOCHECKSUM */ + +#endif diff --git a/storage/innobase/include/trx0undo.inl b/storage/innobase/include/trx0undo.inl new file mode 100644 index 00000000..9f05989f --- /dev/null +++ b/storage/innobase/include/trx0undo.inl @@ -0,0 +1,129 @@ +/***************************************************************************** + +Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/trx0undo.inl +Transaction undo log + +Created 3/26/1996 Heikki Tuuri +*******************************************************/ + +#include "data0type.h" +#include "page0page.h" + +/***********************************************************************//** +Builds a roll pointer. +@return roll pointer */ +UNIV_INLINE +roll_ptr_t +trx_undo_build_roll_ptr( +/*====================*/ + bool is_insert, /*!< in: TRUE if insert undo log */ + ulint rseg_id, /*!< in: rollback segment id */ + uint32_t page_no, /*!< in: page number */ + uint16_t offset) /*!< in: offset of the undo entry within page */ +{ + compile_time_assert(DATA_ROLL_PTR_LEN == 7); + ut_ad(rseg_id < TRX_SYS_N_RSEGS); + + return roll_ptr_t{is_insert} << ROLL_PTR_INSERT_FLAG_POS | + roll_ptr_t{rseg_id} << ROLL_PTR_RSEG_ID_POS | + roll_ptr_t{page_no} << ROLL_PTR_PAGE_POS | offset; +} + +/***********************************************************************//** +Decodes a roll pointer. */ +UNIV_INLINE +void +trx_undo_decode_roll_ptr( +/*=====================*/ + roll_ptr_t roll_ptr, /*!< in: roll pointer */ + bool* is_insert, /*!< out: TRUE if insert undo log */ + ulint* rseg_id, /*!< out: rollback segment id */ + uint32_t* page_no, /*!< out: page number */ + uint16_t* offset) /*!< out: offset of the undo + entry within page */ +{ + compile_time_assert(DATA_ROLL_PTR_LEN == 7); + ut_ad(roll_ptr < (1ULL << 56)); + *offset= static_cast<uint16_t>(roll_ptr); + *page_no= static_cast<uint32_t>(roll_ptr >> 16); + *rseg_id= static_cast<ulint>(roll_ptr >> 48 & 0x7F); + *is_insert= static_cast<bool>(roll_ptr >> 55); +} + +/***********************************************************************//** +Determine if DB_ROLL_PTR is of the insert type. +@return true if insert */ +UNIV_INLINE +bool +trx_undo_roll_ptr_is_insert( +/*========================*/ + roll_ptr_t roll_ptr) /*!< in: roll pointer */ +{ + compile_time_assert(DATA_ROLL_PTR_LEN == 7); + ut_ad(roll_ptr < (1ULL << (ROLL_PTR_INSERT_FLAG_POS + 1))); + return static_cast<bool>(roll_ptr >> ROLL_PTR_INSERT_FLAG_POS); +} + +/***********************************************************************//** +Returns true if the record is of the insert type.
+@return true if the record was freshly inserted (not updated). */ +UNIV_INLINE +bool +trx_undo_trx_id_is_insert( +/*======================*/ + const byte* trx_id) /*!< in: DB_TRX_ID, followed by DB_ROLL_PTR */ +{ + compile_time_assert(DATA_TRX_ID + 1 == DATA_ROLL_PTR); + return bool(trx_id[DATA_TRX_ID_LEN] >> 7); +} + +/** Determine the end offset of undo log records of an undo log page. +@param[in] undo_page undo log page +@param[in] page_no undo log header page number +@param[in] offset undo log header offset +@return end offset */ +inline +uint16_t trx_undo_page_get_end(const buf_block_t *undo_page, uint32_t page_no, + uint16_t offset) +{ + if (page_no == undo_page->page.id().page_no()) + if (uint16_t end = mach_read_from_2(TRX_UNDO_NEXT_LOG + offset + + undo_page->page.frame)) + return end; + + return mach_read_from_2(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE + + undo_page->page.frame); +} + +/** Get the next record in an undo log. +@param[in] undo_page undo log page +@param[in] rec undo record offset in the page +@param[in] page_no undo log header page number +@param[in] offset undo log header offset on page +@return undo log record within the latched page, or NULL if none */ +inline trx_undo_rec_t* +trx_undo_page_get_next_rec(const buf_block_t *undo_page, uint16_t rec, + uint32_t page_no, uint16_t offset) +{ + uint16_t end= trx_undo_page_get_end(undo_page, page_no, offset); + uint16_t next= mach_read_from_2(undo_page->page.frame + rec); + return next == end ? nullptr : undo_page->page.frame + next; +}
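For illustration only (not part of the patched sources): the 7-byte DB_ROLL_PTR packs is_insert (1 bit), rseg_id (7 bits), page_no (32 bits) and offset (16 bits), so a value built by trx_undo_build_roll_ptr() decodes back into the same fields. A minimal sketch using the functions defined above; the field values are arbitrary:

/* Sketch, not InnoDB source: round-trip a roll pointer. */
static void roll_ptr_round_trip_example()
{
	/* insert-type undo log, rollback segment 5, page 42, byte offset 123 */
	const roll_ptr_t ptr = trx_undo_build_roll_ptr(true, 5, 42, 123);

	bool		is_insert;
	ulint		rseg_id;
	uint32_t	page_no;
	uint16_t	offset;
	trx_undo_decode_roll_ptr(ptr, &is_insert, &rseg_id, &page_no, &offset);

	/* 1 + 7 + 32 + 16 bits = 56 bits = DATA_ROLL_PTR_LEN (7) bytes */
	ut_a(is_insert && rseg_id == 5 && page_no == 42 && offset == 123);
	ut_a(trx_undo_roll_ptr_is_insert(ptr));
}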
diff --git a/storage/innobase/include/trx0xa.h b/storage/innobase/include/trx0xa.h new file mode 100644 index 00000000..cb5d67cf --- /dev/null +++ b/storage/innobase/include/trx0xa.h @@ -0,0 +1,61 @@ +/***************************************************************************** + +Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/* + * Start of xa.h header + * + * Define a symbol to prevent multiple inclusions of this header file + */ +#ifndef XA_H +#define XA_H + +#include "handler.h" + +/* + * Transaction branch identification: XID and NULLXID: + */ +#ifndef XIDDATASIZE + +/** Sizes of transaction identifier */ +#define XIDDATASIZE 128 /*!< maximum size of a transaction + identifier, in bytes */ +#define MAXGTRIDSIZE 64 /*!< maximum size in bytes of gtrid */ +#define MAXBQUALSIZE 64 /*!< maximum size in bytes of bqual */ + +#endif +/** X/Open XA distributed transaction status codes */ +/* @{ */ +#define XA_OK 0 /*!< normal execution */ +#define XAER_ASYNC -2 /*!< asynchronous operation already + outstanding */ +#define XAER_RMERR -3 /*!< a resource manager error + occurred in the transaction + branch */ +#define XAER_NOTA -4 /*!< the XID is not valid */ +#define XAER_INVAL -5 /*!< invalid arguments were given */ +#define XAER_PROTO -6 /*!< routine invoked in an improper + context */ +#define XAER_RMFAIL -7 /*!< resource manager unavailable */ +#define XAER_DUPID -8 /*!< the XID already exists */ +#define XAER_OUTSIDE -9 /*!< resource manager doing + work outside transaction */ +/* @} */ +#endif /* ifndef XA_H */ +/* + * End of xa.h header + */
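For illustration only (not part of the patched sources): an XID, as defined in handler.h, keeps the global transaction identifier and the branch qualifier back to back in one data array, with gtrid_length and bqual_length giving the split point. A sketch of how the limits above relate to that layout:

/* Sketch, not InnoDB source: where gtrid and bqual live inside an XID. */
static void xid_layout_example(const XID &xid)
{
	ut_a(xid.gtrid_length <= MAXGTRIDSIZE);
	ut_a(xid.bqual_length <= MAXBQUALSIZE);
	ut_a(xid.gtrid_length + xid.bqual_length <= XIDDATASIZE);

	const char*	gtrid = xid.data;			/* first gtrid_length bytes */
	const char*	bqual = xid.data + xid.gtrid_length;	/* next bqual_length bytes */
	(void) gtrid;
	(void) bqual;
}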
diff --git a/storage/innobase/include/univ.i b/storage/innobase/include/univ.i new file mode 100644 index 00000000..1b4f70b6 --- /dev/null +++ b/storage/innobase/include/univ.i @@ -0,0 +1,503 @@ +/***************************************************************************** + +Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/***********************************************************************//** +@file include/univ.i +Version control for database, common definitions, and include files + +Created 1/20/1994 Heikki Tuuri +****************************************************************************/ + +#pragma once + +/** How far ahead should we tell the service manager the timeout +(time in seconds) */ +#define INNODB_EXTEND_TIMEOUT_INTERVAL 30 + +#if defined(_WIN32) +# include <windows.h> +#endif /* _WIN32 */ + +/* Include a minimum number of SQL header files so that few changes +made in SQL code cause a complete InnoDB rebuild. These headers are +used throughout InnoDB but do not include too much themselves. They +support cross-platform development and expose commonly used SQL names. */ + +#include <my_global.h> +#include "my_counter.h" +#include "aligned.h" +#include <m_string.h> +#include <mysqld_error.h> + +/* Include <sys/stat.h> to get S_I... macros defined for os0file.cc */ +#include <sys/stat.h> + +#ifndef _WIN32 +# include <sched.h> +# include "my_config.h" +#endif + +#include <stdint.h> +#include <inttypes.h> +#ifdef HAVE_UNISTD_H +#include <unistd.h> +#endif + +#include "my_pthread.h" + +/* Following defines are to enable performance schema +instrumentation in each of five InnoDB modules if +HAVE_PSI_INTERFACE is defined. */ +#ifdef HAVE_PSI_INTERFACE +# define UNIV_PFS_MUTEX +# define UNIV_PFS_RWLOCK +# define UNIV_PFS_IO +# define UNIV_PFS_THREAD + +# include "mysql/psi/psi.h" /* HAVE_PSI_MEMORY_INTERFACE */ +# ifdef HAVE_PSI_MEMORY_INTERFACE +# define UNIV_PFS_MEMORY +# endif /* HAVE_PSI_MEMORY_INTERFACE */ + +#ifdef HAVE_PFS_THREAD_PROVIDER_H +/* For PSI_MUTEX_CALL() and similar. */ +#include "pfs_thread_provider.h" +#endif + +#include "mysql/psi/mysql_thread.h" +/* For PSI_FILE_CALL(). */ +#ifdef HAVE_PFS_FILE_PROVIDER_H +#include "pfs_file_provider.h" +#endif + +#include "mysql/psi/mysql_file.h" + +#endif /* HAVE_PSI_INTERFACE */ + +#ifdef _WIN32 +# define YY_NO_UNISTD_H 1 +/* VC++ tries to optimise for size by default, from V8+. The size of +the pointer to member depends on whether the type is defined before the +compiler sees the type in the translation unit. This default behaviour +can cause the pointer to be a different size in different translation +units, depending on the above rule. We force optimise for size behaviour +for all cases. This is used by ut0lst.h related code. */ +# pragma pointers_to_members(full_generality, multiple_inheritance) +#endif /* _WIN32 */ + +/* DEBUG VERSION CONTROL + ===================== */ + +/* When this macro is defined then additional test functions will be +compiled. These functions live at the end of each relevant source file +and have "test_" prefix. These functions can be called from the end of +innodb_init() or they can be called from gdb after srv_start() has executed +using the call command. */ + +/* +#define UNIV_COMPILE_TEST_FUNCS +#define UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR +#define UNIV_ENABLE_UNIT_TEST_MAKE_FILEPATH +#define UNIV_ENABLE_UNIT_TEST_DICT_STATS +#define UNIV_ENABLE_UNIT_TEST_ROW_RAW_FORMAT_INT +*/ + +#ifdef DBUG_OFF +# undef UNIV_DEBUG +#elif !defined UNIV_DEBUG +# define UNIV_DEBUG +#endif + +#if 0 +#define UNIV_DEBUG_PRINT /* Enable the compilation of + some debug print functions */ +#define UNIV_AHI_DEBUG /* Enable adaptive hash index + debugging without UNIV_DEBUG */ +#define UNIV_BLOB_LIGHT_DEBUG /* Enable off-page column + debugging without UNIV_DEBUG */ +#define UNIV_DEBUG_LOCK_VALIDATE /* Enable + ut_ad(lock_rec_validate_page()) + assertions. */ +#define UNIV_LRU_DEBUG /* debug the buffer pool LRU */ +#define UNIV_HASH_DEBUG /* debug HASH_ macros */ +#define UNIV_IBUF_DEBUG /* debug the insert buffer */ +#define UNIV_PERF_DEBUG /* debug flag that enables + light weight performance + related stuff. */ +#define UNIV_SEARCH_PERF_STAT /* statistics for the + adaptive hash index */ +#define UNIV_BTR_PRINT /* enable functions for + printing B-trees */ +#define UNIV_ZIP_DEBUG /* extensive consistency checks + for compressed pages */ +#define UNIV_ZIP_COPY /* call page_zip_copy_recs() + more often */ +#define UNIV_AIO_DEBUG /* prints info about + submitted and reaped AIO + requests to the log.
*/ +#define UNIV_STATS_DEBUG /* prints various stats + related debug info from + dict0stats.c */ +#define FTS_INTERNAL_DIAG_PRINT /* FTS internal debugging + info output */ +#endif + +// #define UNIV_SQL_DEBUG + +#ifndef MY_ATTRIBUTE +#if defined(__GNUC__) +# define MY_ATTRIBUTE(A) __attribute__(A) +#else +# define MY_ATTRIBUTE(A) +#endif +#endif + +#define UNIV_INLINE static inline + +#define UNIV_WORD_SIZE SIZEOF_SIZE_T + +/** The following alignment is used in memory allocations in memory heap +management to ensure correct alignment for doubles etc. */ +#define UNIV_MEM_ALIGNMENT 8U + +/* + DATABASE VERSION CONTROL + ======================== +*/ + +#if defined (HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE) || defined(_WIN32) +#define IF_PUNCH_HOLE(A,B) A +#else +#define IF_PUNCH_HOLE(A,B) B +#endif + +/** log2 of smallest compressed page size (1<<10 == 1024 bytes) +Note: This must never change! */ +#define UNIV_ZIP_SIZE_SHIFT_MIN 10U + +/** log2 of largest compressed page size (1<<14 == 16384 bytes). +A compressed page directory entry reserves 14 bits for the start offset +and 2 bits for flags. This limits the uncompressed page size to 16k. +*/ +#define UNIV_ZIP_SIZE_SHIFT_MAX 14U + +/* Define the Min, Max, Default page sizes. */ +/** Minimum Page Size Shift (power of 2) */ +#define UNIV_PAGE_SIZE_SHIFT_MIN 12U +/** log2 of largest page size (1<<16 == 65536 bytes). */ +/** Maximum Page Size Shift (power of 2) */ +#define UNIV_PAGE_SIZE_SHIFT_MAX 16U +/** log2 of default page size (1<<14 == 16384 bytes). */ +/** Default Page Size Shift (power of 2) */ +#define UNIV_PAGE_SIZE_SHIFT_DEF 14U +/** Original 16k InnoDB Page Size Shift, in case the default changes */ +#define UNIV_PAGE_SIZE_SHIFT_ORIG 14U +/** Original 16k InnoDB Page Size as an ssize (log2 - 9) */ +#define UNIV_PAGE_SSIZE_ORIG (UNIV_PAGE_SIZE_SHIFT_ORIG - 9U) + +/** Minimum page size InnoDB currently supports. */ +#define UNIV_PAGE_SIZE_MIN (1U << UNIV_PAGE_SIZE_SHIFT_MIN) +/** Maximum page size InnoDB currently supports. */ +#define UNIV_PAGE_SIZE_MAX (1U << UNIV_PAGE_SIZE_SHIFT_MAX) +/** Default page size for InnoDB tablespaces. */ +#define UNIV_PAGE_SIZE_DEF (1U << UNIV_PAGE_SIZE_SHIFT_DEF) +/** Original 16k page size for InnoDB tablespaces. */ +#define UNIV_PAGE_SIZE_ORIG (1U << UNIV_PAGE_SIZE_SHIFT_ORIG) + +/** Smallest compressed page size */ +#define UNIV_ZIP_SIZE_MIN (1U << UNIV_ZIP_SIZE_SHIFT_MIN) + +/** Largest compressed page size */ +#define UNIV_ZIP_SIZE_MAX (1U << UNIV_ZIP_SIZE_SHIFT_MAX) + +/** Largest possible ssize for an uncompressed page. +(The convention 'ssize' is used for 'log2 minus 9' or the number of +shifts starting with 512.) +This max number varies depending on srv_page_size. */ +#define UNIV_PAGE_SSIZE_MAX \ + ulint(srv_page_size_shift - UNIV_ZIP_SIZE_SHIFT_MIN + 1U) + +/** Smallest possible ssize for an uncompressed page. */ +#define UNIV_PAGE_SSIZE_MIN \ + ulint(UNIV_PAGE_SIZE_SHIFT_MIN - UNIV_ZIP_SIZE_SHIFT_MIN + 1U)
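For illustration only (not part of the patched sources): an 'ssize' is log2(size) - 9, i.e. the number of left shifts starting from 512 bytes. A sketch of the arithmetic for the default 16KiB page:

/* Sketch, not InnoDB source: ssize arithmetic for the 16KiB default page. */
static void page_ssize_example()
{
	ut_a(UNIV_PAGE_SIZE_DEF == 16384);	/* 1U << 14 */
	ut_a(UNIV_PAGE_SSIZE_ORIG == 5);	/* 14 - 9 */
	ut_a((512U << UNIV_PAGE_SSIZE_ORIG) == UNIV_PAGE_SIZE_DEF);
}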
+ +/** Maximum number of parallel threads in a parallelized operation */ +#define UNIV_MAX_PARALLELISM 32 + +/** This is the "mbmaxlen" for my_charset_filename (defined in +strings/ctype-utf8.c), which is used to encode File and Database names. */ +#define FILENAME_CHARSET_MAXNAMLEN 5 + +/** The maximum length of an encoded table name in bytes. The max +table and database names are NAME_CHAR_LEN (64) characters. After the +encoding, the max length would be NAME_CHAR_LEN (64) * +FILENAME_CHARSET_MAXNAMLEN (5) = 320 bytes. The number does not include a +terminating '\0'. InnoDB can handle longer names internally */ +#define MAX_TABLE_NAME_LEN 320 + +/** The maximum length of a database name. Like MAX_TABLE_NAME_LEN this is +MySQL's NAME_LEN, see check_and_convert_db_name(). */ +#define MAX_DATABASE_NAME_LEN MAX_TABLE_NAME_LEN + +/** MAX_FULL_NAME_LEN defines the full name path including the +database name and table name. In addition, 14 bytes are added for: + 2 for surrounding quotes around table name + 1 for the separating dot (.) + 9 for the #mysql50# prefix */ +#define MAX_FULL_NAME_LEN \ + (MAX_TABLE_NAME_LEN + MAX_DATABASE_NAME_LEN + 14) + +/** Maximum length of the compression algorithm string. Currently we support +only (NONE | ZLIB | LZ4). */ +#define MAX_COMPRESSION_LEN 4 + +/** The maximum length in bytes that a database name can occupy when stored in +UTF8, including the terminating '\0', see dict_fs2utf8(). You must include +mysql_com.h if you are to use this macro. */ +#define MAX_DB_UTF8_LEN (NAME_LEN + 1) + +/** The maximum length in bytes that a table name can occupy when stored in +UTF8, including the terminating '\0', see dict_fs2utf8(). You must include +mysql_com.h if you are to use this macro. */ +#define MAX_TABLE_UTF8_LEN (NAME_LEN + sizeof(srv_mysql50_table_name_prefix)) + +/* + UNIVERSAL TYPE DEFINITIONS + ========================== +*/ + +/** Unsigned octet of bits */ +typedef unsigned char byte; +/** Machine-word-width unsigned integer */ +typedef size_t ulint; +/** Machine-word-width signed integer */ +typedef ssize_t lint; + +/** ulint format for the printf() family of functions */ +#define ULINTPF "%zu" +/** ulint hexadecimal format for the printf() family of functions */ +#define ULINTPFx "%zx" + +#ifdef _WIN32 +/* Use the integer types and formatting strings defined in Visual Studio. */ +# define UINT32PF "%u" +# define UINT64scan "llu" +# define UINT64PFx "%016llx" +#elif defined __APPLE__ +/* Apple prefers to call the 64-bit types 'long long' +in both 32-bit and 64-bit environments. */ +# define UINT32PF "%" PRIu32 +# define UINT64scan "llu" +# define UINT64PFx "%016llx" +#elif defined _AIX +/* Workaround for macro expansion trouble */ +# define UINT32PF "%u" +# define UINT64scan "lu" +# define UINT64PFx "%016lx" +#else +/* Use the integer types and formatting strings defined in the C99 standard. */ +# define UINT32PF "%" PRIu32 +# define INT64PF "%" PRId64 +# define UINT64scan PRIu64 +# define UINT64PFx "%016" PRIx64 +#endif + +typedef int64_t ib_int64_t; +typedef uint64_t ib_uint64_t; +typedef uint32_t ib_uint32_t; + +#define UINT64PF "%" UINT64scan +#define IB_ID_FMT UINT64PF + +/** Log sequence number (also used for redo log byte arithmetic) */ +typedef ib_uint64_t lsn_t;
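For illustration only (not part of the patched sources): the format macros above expand to the right printf() conversion for each type, so they can be concatenated into format strings portably. A sketch, assuming <stdio.h> is available:

/* Sketch, not InnoDB source: using the printf() format macros. */
static void format_macro_example(ulint n_pages, lsn_t lsn)
{
	printf("flushed " ULINTPF " pages up to LSN " UINT64PF "\n",
	       n_pages, lsn);
}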
+ +/** The 'undefined' value for a ulint */ +#define ULINT_UNDEFINED ((ulint)(-1)) + +/** The 'undefined' value for a ib_uint64_t */ +#define UINT64_UNDEFINED ((ib_uint64_t)(-1)) + +/** The bitmask of 32-bit unsigned integer */ +#define ULINT32_MASK 0xFFFFFFFFU +/** The undefined 32-bit unsigned integer */ +#define ULINT32_UNDEFINED ULINT32_MASK + +/** Maximum value for a ulint */ +#define ULINT_MAX ((ulint)(-2)) + +/** Maximum value for ib_uint64_t */ +#define IB_UINT64_MAX ((ib_uint64_t) (~0ULL)) + +/** The generic InnoDB system object identifier data type */ +typedef ib_uint64_t ib_id_t; +#define IB_ID_MAX (~(ib_id_t) 0) +#define IB_ID_FMT UINT64PF + +#ifndef UINTMAX_MAX +#define UINTMAX_MAX IB_UINT64_MAX +#endif +/** This 'ibool' type is used within Innobase. Remember that different included +headers may define 'bool' differently. Do not assume that 'bool' is a ulint! */ +#define ibool ulint + +#ifndef TRUE + +#define TRUE 1 +#define FALSE 0 + +#endif + +#define UNIV_NOTHROW + +/** The following number as the length of a logical field means that the field +has the SQL NULL as its value. NOTE that because we assume that the length +of a field is a 32-bit integer when we store it, for example, to an undo log +on disk, we must have also this number fit in 32 bits, also in 64-bit +computers! */ + +#define UNIV_SQL_NULL ULINT32_UNDEFINED + +/** Lengths which are not UNIV_SQL_NULL, but bigger than the following +number indicate that a field contains a reference to an externally +stored part of the field in the tablespace. The length field then +contains the sum of the following flag and the locally stored len. */ + +#define UNIV_EXTERN_STORAGE_FIELD (UNIV_SQL_NULL - UNIV_PAGE_SIZE_DEF) + +#if defined(__GNUC__) +/* Tell the compiler that variable/function is unused. */ +# define UNIV_UNUSED MY_ATTRIBUTE ((unused)) +#else +# define UNIV_UNUSED +#endif /* CHECK FOR GCC VER_GT_2 */ + +/* Some macros to improve branch prediction and reduce cache misses */ +#ifdef __GNUC__ +/* Tell the compiler that 'expr' probably evaluates to 'constant'. */ +# define UNIV_EXPECT(expr,constant) __builtin_expect(expr, constant) +/* Tell the compiler that a pointer is likely to be NULL */ +# define UNIV_LIKELY_NULL(ptr) __builtin_expect((ptr) != 0, 0) +/* Minimize cache-miss latency by moving data at addr into a cache before +it is read. */ +# define UNIV_PREFETCH_R(addr) __builtin_prefetch(addr, 0, 3) +/* Minimize cache-miss latency by moving data at addr into a cache before +it is read or written. */ +# define UNIV_PREFETCH_RW(addr) __builtin_prefetch(addr, 1, 3) + +/* Sun Studio includes sun_prefetch.h as of version 5.9 */ +#elif (defined(__SUNPRO_C) || defined(__SUNPRO_CC)) + +# include <sun_prefetch.h> + +# define UNIV_EXPECT(expr,value) (expr) +# define UNIV_LIKELY_NULL(expr) (expr) + +//# define UNIV_PREFETCH_R(addr) sun_prefetch_read_many((void*) addr) +# define UNIV_PREFETCH_R(addr) ((void) 0) +# define UNIV_PREFETCH_RW(addr) sun_prefetch_write_many(addr) + +# elif defined _MSC_VER +# define UNIV_EXPECT(expr,value) (expr) +# define UNIV_LIKELY_NULL(expr) (expr) +# if defined _M_IX86 || defined _M_X64 + // __MM_HINT_T0 - (temporal data) + // prefetch data into all levels of the cache hierarchy. +# define UNIV_PREFETCH_R(addr) _mm_prefetch((char *) addr, _MM_HINT_T0) +# define UNIV_PREFETCH_RW(addr) _mm_prefetch((char *) addr, _MM_HINT_T0) +# elif defined _M_ARM64 +# define UNIV_PREFETCH_R(addr) __prefetch(addr) +# define UNIV_PREFETCH_RW(addr) __prefetch(addr) +# else +# define UNIV_PREFETCH_R(addr) ((void) 0) +# define UNIV_PREFETCH_RW(addr) ((void) 0) +# endif +#else +/* Dummy versions of the macros */ +# define UNIV_EXPECT(expr,value) (expr) +# define UNIV_LIKELY_NULL(expr) (expr) +# define UNIV_PREFETCH_R(addr) ((void) 0) +# define UNIV_PREFETCH_RW(addr) ((void) 0) +#endif + +/* Tell the compiler that cond is likely to hold */ +#define UNIV_LIKELY(cond) UNIV_EXPECT(cond, TRUE) +/* Tell the compiler that cond is unlikely to hold */ +#define UNIV_UNLIKELY(cond) UNIV_EXPECT(cond, FALSE)
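For illustration only (not part of the patched sources): UNIV_LIKELY/UNIV_UNLIKELY annotate the expected branch, and UNIV_PREFETCH_R warms a cache line before it is read. A sketch with a hypothetical lookup result:

/* Sketch, not InnoDB source: typical use of the branch and prefetch macros. */
static const byte* checked_read_example(const byte* p)
{
	if (UNIV_UNLIKELY(p == NULL)) {
		return(NULL);		/* rare path: predicted not taken */
	}
	UNIV_PREFETCH_R(p);		/* *p is about to be read */
	return(p);
}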
+ +/* Compile-time constant of the given array's size. */ +#define UT_ARR_SIZE(a) (sizeof(a) / sizeof((a)[0])) + +#include <stdio.h> +#include "db0err.h" +#include "ut0dbg.h" +#include "ut0lst.h" +#include "ut0ut.h" + +extern uint32_t srv_page_size_shift; +extern ulong srv_page_size; + +/* Dimension of spatial object we support so far. It has its root in +myisam/sp_defs.h. We only support 2-dimensional data */ +#define SPDIMS 2 + +#ifdef HAVE_PSI_INTERFACE +typedef unsigned int mysql_pfs_key_t; + +# ifdef UNIV_PFS_MUTEX +extern mysql_pfs_key_t buf_pool_mutex_key; +extern mysql_pfs_key_t dict_foreign_err_mutex_key; +extern mysql_pfs_key_t fil_system_mutex_key; +extern mysql_pfs_key_t flush_list_mutex_key; +extern mysql_pfs_key_t fts_cache_mutex_key; +extern mysql_pfs_key_t fts_cache_init_mutex_key; +extern mysql_pfs_key_t fts_delete_mutex_key; +extern mysql_pfs_key_t fts_doc_id_mutex_key; +extern mysql_pfs_key_t ibuf_bitmap_mutex_key; +extern mysql_pfs_key_t ibuf_mutex_key; +extern mysql_pfs_key_t ibuf_pessimistic_insert_mutex_key; +extern mysql_pfs_key_t recalc_pool_mutex_key; +extern mysql_pfs_key_t purge_sys_pq_mutex_key; +extern mysql_pfs_key_t recv_sys_mutex_key; +extern mysql_pfs_key_t rtr_active_mutex_key; +extern mysql_pfs_key_t rtr_match_mutex_key; +extern mysql_pfs_key_t rtr_path_mutex_key; +extern mysql_pfs_key_t page_zip_stat_per_index_mutex_key; +extern mysql_pfs_key_t srv_innodb_monitor_mutex_key; +extern mysql_pfs_key_t srv_misc_tmpfile_mutex_key; +extern mysql_pfs_key_t srv_monitor_file_mutex_key; +extern mysql_pfs_key_t buf_dblwr_mutex_key; +extern mysql_pfs_key_t trx_pool_mutex_key; +extern mysql_pfs_key_t trx_pool_manager_mutex_key; +extern mysql_pfs_key_t lock_wait_mutex_key; +extern mysql_pfs_key_t srv_threads_mutex_key; +# endif /* UNIV_PFS_MUTEX */ + +# ifdef UNIV_PFS_RWLOCK +extern mysql_pfs_key_t dict_operation_lock_key; +extern mysql_pfs_key_t fil_space_latch_key; +extern mysql_pfs_key_t trx_i_s_cache_lock_key; +extern mysql_pfs_key_t trx_purge_latch_key; +extern mysql_pfs_key_t index_tree_rw_lock_key; +extern mysql_pfs_key_t index_online_log_key; +extern mysql_pfs_key_t trx_sys_rw_lock_key; +extern mysql_pfs_key_t lock_latch_key; +extern mysql_pfs_key_t log_latch_key; +extern mysql_pfs_key_t trx_rseg_latch_key; +# endif /* UNIV_PFS_RWLOCK */ +#endif /* HAVE_PSI_INTERFACE */ diff --git a/storage/innobase/include/ut0byte.h b/storage/innobase/include/ut0byte.h new file mode 100644 index 00000000..2b70fac3 --- /dev/null +++ b/storage/innobase/include/ut0byte.h @@ -0,0 +1,107 @@ +/***************************************************************************** + +Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/ut0byte.h +Utilities for byte operations + +Created 1/20/1994 Heikki Tuuri +***********************************************************************/ + +#ifndef ut0byte_h +#define ut0byte_h + +#include "univ.i" + +/*******************************************************//** +Creates a 64-bit integer out of two 32-bit integers. +@return created integer */ +UNIV_INLINE +ib_uint64_t +ut_ull_create( +/*==========*/ + ulint high, /*!< in: high-order 32 bits */ + ulint low) /*!< in: low-order 32 bits */ + MY_ATTRIBUTE((const));
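For illustration only (not part of the patched sources): ut_ull_create() concatenates two 32-bit halves into one 64-bit value, high word first. A sketch:

/* Sketch, not InnoDB source: ut_ull_create() result layout. */
static void ull_create_example()
{
	const ib_uint64_t x = ut_ull_create(0x00000001UL, 0x00000002UL);
	ut_a(x == 0x0000000100000002ULL);	/* high << 32 | low */
}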
+ +/********************************************************//** +Rounds a 64-bit integer downward to a multiple of a power of 2. +@return rounded value */ +UNIV_INLINE +ib_uint64_t +ut_uint64_align_down( +/*=================*/ + ib_uint64_t n, /*!< in: number to be rounded */ + ulint align_no); /*!< in: align by this number + which must be a power of 2 */ +/********************************************************//** +Rounds ib_uint64_t upward to a multiple of a power of 2. +@return rounded value */ +UNIV_INLINE +ib_uint64_t +ut_uint64_align_up( +/*===============*/ + ib_uint64_t n, /*!< in: number to be rounded */ + ulint align_no); /*!< in: align by this number + which must be a power of 2 */ +/** Round down a pointer to the nearest aligned address. +@param ptr pointer +@param alignment a power of 2 +@return aligned pointer */ +static inline void *ut_align_down(void *ptr, size_t alignment) +{ + ut_ad(alignment > 0); + ut_ad(ut_is_2pow(alignment)); + ut_ad(ptr); + static_assert(sizeof ptr == sizeof(size_t), "compatibility"); + + return reinterpret_cast<void*>(reinterpret_cast<size_t>(ptr) & + ~(alignment - 1)); +} + +static inline const void *ut_align_down(const void *ptr, size_t alignment) +{ + return ut_align_down(const_cast<void*>(ptr), alignment); +} + +/** Compute the offset of a pointer from the nearest aligned address. +@param ptr pointer +@param alignment a power of 2 +@return distance from aligned pointer */ +inline size_t ut_align_offset(const void *ptr, size_t alignment) +{ + ut_ad(alignment > 0); + ut_ad(ut_is_2pow(alignment)); + ut_ad(ptr); + return reinterpret_cast<size_t>(ptr) & (alignment - 1); +} + +/*****************************************************************//** +Gets the nth bit of a ulint. +@return TRUE if nth bit is 1; 0th bit is defined to be the least significant */ +UNIV_INLINE +ibool +ut_bit_get_nth( +/*===========*/ + ulint a, /*!< in: ulint */ + ulint n); /*!< in: nth bit requested */ + +#include "ut0byte.inl" + +#endif diff --git a/storage/innobase/include/ut0byte.inl b/storage/innobase/include/ut0byte.inl new file mode 100644 index 00000000..dfa069c2 --- /dev/null +++ b/storage/innobase/include/ut0byte.inl @@ -0,0 +1,90 @@ +/***************************************************************************** + +Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************************//** +@file include/ut0byte.ic +Utilities for byte operations + +Created 5/30/1994 Heikki Tuuri +*******************************************************************/ + +/*******************************************************//** +Creates a 64-bit integer out of two 32-bit integers. +@return created integer */ +UNIV_INLINE +ib_uint64_t +ut_ull_create( +/*==========*/ + ulint high, /*!< in: high-order 32 bits */ + ulint low) /*!< in: low-order 32 bits */ +{ + ut_ad(high <= ULINT32_MASK); + ut_ad(low <= ULINT32_MASK); + return(((ib_uint64_t) high) << 32 | low); +} + +/********************************************************//** +Rounds a 64-bit integer downward to a multiple of a power of 2. +@return rounded value */ +UNIV_INLINE +ib_uint64_t +ut_uint64_align_down( +/*=================*/ + ib_uint64_t n, /*!< in: number to be rounded */ + ulint align_no) /*!< in: align by this number + which must be a power of 2 */ +{ + ut_ad(align_no > 0); + ut_ad(ut_is_2pow(align_no)); + + return(n & ~((ib_uint64_t) align_no - 1)); +} + +/********************************************************//** +Rounds ib_uint64_t upward to a multiple of a power of 2. +@return rounded value */ +UNIV_INLINE +ib_uint64_t +ut_uint64_align_up( +/*===============*/ + ib_uint64_t n, /*!< in: number to be rounded */ + ulint align_no) /*!< in: align by this number + which must be a power of 2 */ +{ + ib_uint64_t align_1 = (ib_uint64_t) align_no - 1; + + ut_ad(align_no > 0); + ut_ad(ut_is_2pow(align_no)); + + return((n + align_1) & ~align_1); +} + +/*****************************************************************//** +Gets the nth bit of a ulint. +@return TRUE if nth bit is 1; 0th bit is defined to be the least significant */ +UNIV_INLINE +ibool +ut_bit_get_nth( +/*===========*/ + ulint a, /*!< in: ulint */ + ulint n) /*!< in: nth bit requested */ +{ + ut_ad(n < 8 * sizeof(ulint)); + return(1 & (a >> n)); +} diff --git a/storage/innobase/include/ut0counter.h b/storage/innobase/include/ut0counter.h new file mode 100644 index 00000000..d6589cc4 --- /dev/null +++ b/storage/innobase/include/ut0counter.h @@ -0,0 +1,123 @@ +/***************************************************************************** + +Copyright (c) 2012, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file include/ut0counter.h + +Counter utility class + +Created 2012/04/12 by Sunny Bains +*******************************************************/ + +#ifndef ut0counter_h +#define ut0counter_h + +#include "univ.i" +#include "my_rdtsc.h" + +/** Use the result of my_timer_cycles(), which mainly uses RDTSC for cycles +as a random value. See the comments for my_timer_cycles() */ +/** @return result from RDTSC or similar functions. */ +static inline size_t +get_rnd_value() +{ + size_t c = static_cast<size_t>(my_timer_cycles()); + + if (c != 0) { + return c; + } + + /* We may go here if my_timer_cycles() returns 0, + so we have to have the plan B for the counter. */ +#if !defined(_WIN32) + return (size_t)pthread_self(); +#else + LARGE_INTEGER cnt; + QueryPerformanceCounter(&cnt); + + return static_cast<size_t>(cnt.QuadPart); +#endif /* !_WIN32 */ +} + +/** Atomic which occupies whole CPU cache line. +Note: We rely on the default constructor of std::atomic and +do not explicitly initialize the contents. This works for us, +because ib_counter_t is only intended for usage with global +memory that is allocated from the .bss and thus guaranteed to +be zero-initialized by the run-time environment. +@see srv_stats */ +template <typename Type> +struct ib_atomic_counter_element_t { + alignas(CPU_LEVEL1_DCACHE_LINESIZE) Atomic_relaxed<Type> value; +}; + +template <typename Type> +struct ib_counter_element_t { + alignas(CPU_LEVEL1_DCACHE_LINESIZE) Type value; +}; + + +/** Class for using fuzzy counters. The counter is multi-instance relaxed atomic +so the results are not guaranteed to be 100% accurate but close +enough. */ +template <typename Type, + template <typename T> class Element = ib_atomic_counter_element_t, + int N = 128 > +struct ib_counter_t { + /** Increment the counter by 1. */ + void inc() { add(1); } + ib_counter_t& operator++() { inc(); return *this; } + + /** Increment the counter by 1. + @param[in] index a reasonably thread-unique identifier */ + void inc(size_t index) { add(index, 1); } + + /** Add to the counter. + @param[in] n amount to be added */ + void add(Type n) { add(get_rnd_value(), n); } + + /** Add to the counter. + @param[in] index a reasonably thread-unique identifier + @param[in] n amount to be added */ + TPOOL_SUPPRESS_TSAN void add(size_t index, Type n) { + index = index % N; + + ut_ad(index < UT_ARR_SIZE(m_counter)); + + m_counter[index].value += n; + } + + /* @return total value - not 100% accurate, since it is relaxed atomic*/ + operator Type() const { + Type total = 0; + + for (const auto &counter : m_counter) { + total += counter.value; + } + + return(total); + } + +private: + static_assert(sizeof(Element<Type>) == CPU_LEVEL1_DCACHE_LINESIZE, ""); + /** Array of counter elements */ + alignas(CPU_LEVEL1_DCACHE_LINESIZE) Element<Type> m_counter[N]; +}; + +#endif /* ut0counter_h */
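For illustration only (not part of the patched sources): ib_counter_t spreads increments over N cache-line-sized slots indexed by a thread-dependent value, so concurrent writers rarely share a cache line; reading sums all slots and is only approximately ordered. A usage sketch:

/* Sketch, not InnoDB source: a sharded relaxed counter. */
static ib_counter_t<ulint> n_reads_example;

static void count_read()
{
	n_reads_example.inc();		/* adds 1 to one slot */
}

static ulint total_reads()
{
	return(n_reads_example);	/* operator Type(): sum of all slots */
}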
diff --git a/storage/innobase/include/ut0dbg.h b/storage/innobase/include/ut0dbg.h new file mode 100644 index 00000000..85856660 --- /dev/null +++ b/storage/innobase/include/ut0dbg.h @@ -0,0 +1,179 @@ +/***************************************************************************** + +Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2019, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*****************************************************************//** +@file include/ut0dbg.h +Debug utilities for Innobase + +Created 1/30/1994 Heikki Tuuri +**********************************************************************/ + +#ifndef ut0dbg_h +#define ut0dbg_h + +#ifdef UNIV_INNOCHECKSUM +#define ut_a assert +#define ut_ad assert +#define ut_error assert(0) +#else /* !UNIV_INNOCHECKSUM */ + +/* Do not include univ.i because univ.i includes this. */ + +/*************************************************************//** +Report a failed assertion. */ +ATTRIBUTE_NORETURN ATTRIBUTE_COLD __attribute__((nonnull(2))) +void +ut_dbg_assertion_failed( +/*====================*/ + const char* expr, /*!< in: the failed assertion */ + const char* file, /*!< in: source file containing the assertion */ + unsigned line); /*!< in: line number of the assertion */ + +/** Abort execution if EXPR does not evaluate to nonzero. +@param EXPR assertion expression that should hold */ +#define ut_a(EXPR) do { \ + if (UNIV_UNLIKELY(!(ulint) (EXPR))) { \ + ut_dbg_assertion_failed(#EXPR, \ + __FILE__, __LINE__); \ + } \ +} while (0) + +/** Abort execution. */ +#define ut_error \ + ut_dbg_assertion_failed(0, __FILE__, __LINE__) + +/** Debug assertion */ +#define ut_ad DBUG_SLOW_ASSERT +#if defined(UNIV_DEBUG) || !defined(DBUG_OFF) +/** Debug statement. Does nothing unless UNIV_DEBUG is defined. */ +#define ut_d(EXPR) EXPR +#else +/** Debug statement. Does nothing unless UNIV_DEBUG is defined. */ +#define ut_d(EXPR) +#endif
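For illustration only (not part of the patched sources): ut_a() is checked in all builds, while ut_ad() and ut_d() compile to nothing in non-debug builds, so debug-only bookkeeping costs nothing in release. A sketch:

/* Sketch, not InnoDB source: release vs. debug assertions. */
static ulint checked_div_example(ulint a, ulint b)
{
	ut_a(b != 0);			/* checked in all builds */
	ut_d(const ulint before = a);	/* exists only in debug builds */
	const ulint q = a / b;
	ut_ad(q * b + a % b == before);	/* debug-only consistency check */
	return(q);
}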
+ +#if defined(HAVE_SYS_TIME_H) && defined(HAVE_SYS_RESOURCE_H) + +#define HAVE_UT_CHRONO_T + +#include <sys/types.h> +#include <sys/time.h> +#include <sys/resource.h> + +/** A "chronometer" used to clock snippets of code. +Example usage: + ut_chrono_t ch("this loop"); + for (;;) { ... } + ch.show(); +would print the timings of the for() loop, prefixed with "this loop:" */ +class ut_chrono_t { +public: + /** Constructor. + @param[in] name chrono's name, used when showing the values */ + ut_chrono_t( + const char* name) + : + m_name(name), + m_show_from_destructor(true) + { + reset(); + } + + /** Resets the chrono (records the current time in it). */ + void + reset() + { + gettimeofday(&m_tv, NULL); + + getrusage(RUSAGE_SELF, &m_ru); + } + + /** Shows the time elapsed and usage statistics since the last reset. */ + void + show() + { + struct rusage ru_now; + struct timeval tv_now; + struct timeval tv_diff; + + getrusage(RUSAGE_SELF, &ru_now); + + gettimeofday(&tv_now, NULL); + +#ifndef timersub +#define timersub(a, b, r) \ + do { \ + (r)->tv_sec = (a)->tv_sec - (b)->tv_sec; \ + (r)->tv_usec = (a)->tv_usec - (b)->tv_usec; \ + if ((r)->tv_usec < 0) { \ + (r)->tv_sec--; \ + (r)->tv_usec += 1000000; \ + } \ + } while (0) +#endif /* timersub */ + +#define CHRONO_PRINT(type, tvp) \ + fprintf(stderr, "%s: %s% 5ld.%06ld sec\n", \ + m_name, type, \ + static_cast<long>((tvp)->tv_sec), \ + static_cast<long>((tvp)->tv_usec)) + + timersub(&tv_now, &m_tv, &tv_diff); + CHRONO_PRINT("real", &tv_diff); + + timersub(&ru_now.ru_utime, &m_ru.ru_utime, &tv_diff); + CHRONO_PRINT("user", &tv_diff); + + timersub(&ru_now.ru_stime, &m_ru.ru_stime, &tv_diff); + CHRONO_PRINT("sys ", &tv_diff); + } + + /** Cause the timings not to be printed from the destructor. */ + void end() + { + m_show_from_destructor = false; + } + + /** Destructor. */ + ~ut_chrono_t() + { + if (m_show_from_destructor) { + show(); + } + } + +private: + /** Name of this chronometer. */ + const char* m_name; + + /** True if the current timings should be printed by the destructor. */ + bool m_show_from_destructor; + + /** getrusage() result as of the last reset(). */ + struct rusage m_ru; + + /** gettimeofday() result as of the last reset(). */ + struct timeval m_tv; +}; + +#endif /* HAVE_SYS_TIME_H && HAVE_SYS_RESOURCE_H */ + +#endif /* !UNIV_INNOCHECKSUM */ + +#endif diff --git a/storage/innobase/include/ut0list.h b/storage/innobase/include/ut0list.h new file mode 100644 index 00000000..765f6a2a --- /dev/null +++ b/storage/innobase/include/ut0list.h @@ -0,0 +1,146 @@ +/***************************************************************************** + +Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*******************************************************************//** +@file include/ut0list.h +A double-linked list + +Created 4/26/2006 Osku Salerma +************************************************************************/ + +/*******************************************************************//** +A double-linked list. This differs from the one in ut0lst.h in that in this +one, each list node contains a pointer to the data, whereas the one in +ut0lst.h uses a strategy where the list pointers are embedded in the data +items themselves. + +Use this one when you need to store arbitrary data in the list where you +can't embed the list pointers in the data, if a data item needs to be +stored in multiple lists, etc. + +Note about the memory management: ib_list_t is a fixed-size struct whose +allocation/deallocation is done through ib_list_create/ib_list_free, but the +memory for the list nodes is allocated through a user-given memory heap, +which can either be the same for all nodes or vary per node. Most users will +probably want to create a memory heap to store the item-specific data, and +pass in this same heap to the list node creation functions, thus +automatically freeing the list node when the item's heap is freed. + +************************************************************************/ + +#ifndef IB_LIST_H +#define IB_LIST_H + +#include "mem0mem.h" + +struct ib_list_t; +struct ib_list_node_t;
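For illustration only (not part of the patched sources): nodes of this non-intrusive list are carved from a caller-supplied heap, so freeing that heap releases the nodes, while ib_list_free() releases only the list header. A usage sketch with the functions declared below:

/* Sketch, not InnoDB source: heap-backed ib_list_t usage. */
static void ib_list_example()
{
	ib_list_t*	list = ib_list_create();
	mem_heap_t*	heap = mem_heap_create(100);
	int		item = 42;

	ib_list_add_last(list, &item, heap);	/* node memory from heap */
	ut_a(ib_list_get_first(list)->data == &item);

	mem_heap_free(heap);	/* frees the node */
	ib_list_free(list);	/* frees the list header */
}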
+ +/****************************************************************//** +Create a new list using mem_alloc. Lists created with this function must be +freed with ib_list_free. +@return list */ +ib_list_t* +ib_list_create(void); +/*=================*/ + +/****************************************************************//** +Free a list. */ +void +ib_list_free( +/*=========*/ + ib_list_t* list); /*!< in: list */ + +/****************************************************************//** +Add the data to the end of the list. +@return new list node */ +ib_list_node_t* +ib_list_add_last( +/*=============*/ + ib_list_t* list, /*!< in: list */ + void* data, /*!< in: data */ + mem_heap_t* heap); /*!< in: memory heap to use */ + +/****************************************************************//** +Remove the node from the list. */ +void +ib_list_remove( +/*===========*/ + ib_list_t* list, /*!< in: list */ + ib_list_node_t* node); /*!< in: node to remove */ + +/****************************************************************//** +Get the first node in the list. +@return first node, or NULL */ +UNIV_INLINE +ib_list_node_t* +ib_list_get_first( +/*==============*/ + ib_list_t* list); /*!< in: list */ + +/****************************************************************//** +Get the last node in the list. +@return last node, or NULL */ +UNIV_INLINE +ib_list_node_t* +ib_list_get_last( +/*=============*/ + ib_list_t* list); /*!< in: list */ + +/******************************************************************** +Check if list is empty. */ +UNIV_INLINE +ibool +ib_list_is_empty( +/*=============*/ + /* out: TRUE if empty else FALSE */ + const ib_list_t* list); /* in: list */ + +/******************************************************************** +Get number of items on list. +@return number of items on list */ +UNIV_INLINE +ulint +ib_list_len( +/*========*/ + const ib_list_t* list); /*!< in: list */ + +/* List. */ +struct ib_list_t { + ib_list_node_t* first; /*!< first node */ + ib_list_node_t* last; /*!< last node */ +}; + +/* A list node. */ +struct ib_list_node_t { + ib_list_node_t* prev; /*!< previous node */ + ib_list_node_t* next; /*!< next node */ + void* data; /*!< user data */ +}; + +#include "ut0list.inl" + +#endif diff --git a/storage/innobase/include/ut0list.inl b/storage/innobase/include/ut0list.inl new file mode 100644 --- /dev/null +++ b/storage/innobase/include/ut0list.inl +/****************************************************************//** +Get the first node in the list. +@return first node, or NULL */ +UNIV_INLINE +ib_list_node_t* +ib_list_get_first( +/*==============*/ + ib_list_t* list) /*!< in: list */ +{ + return(list->first); +} + +/****************************************************************//** +Get the last node in the list. +@return last node, or NULL */ +UNIV_INLINE +ib_list_node_t* +ib_list_get_last( +/*=============*/ + ib_list_t* list) /*!< in: list */ +{ + return(list->last); +} + +/******************************************************************** +Check if list is empty. */ +UNIV_INLINE +ibool +ib_list_is_empty( +/*=============*/ + /* out: TRUE if empty else FALSE */ + const ib_list_t* list) /* in: list */ +{ + return(!(list->first || list->last)); +} + +/******************************************************************** +Get number of items on list.
+@return number of items on list */ +UNIV_INLINE +ulint +ib_list_len( +/*========*/ + const ib_list_t* list) /*!< in: list */ +{ + ulint len = 0; + ib_list_node_t* node = list->first; + + while(node) { + len++; + node = node->next; + } + + return (len); +} diff --git a/storage/innobase/include/ut0lst.h b/storage/innobase/include/ut0lst.h new file mode 100644 index 00000000..7b7ed7b8 --- /dev/null +++ b/storage/innobase/include/ut0lst.h @@ -0,0 +1,563 @@ +/***************************************************************************** + +Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/ut0lst.h +List utilities + +Created 9/10/1995 Heikki Tuuri +Rewritten by Sunny Bains Dec 2011. +***********************************************************************/ + +#pragma once + +/* Do not include univ.i because univ.i includes this. */ + +#include "ut0dbg.h" + +/* This module implements the two-way linear list. Note that a single +list node may belong to two or more lists, but is only on one list +at a time. */ + +/*******************************************************************//** +The two way list node. +@param TYPE the list node type name */ +template <typename Type> +struct ut_list_node { + Type* prev; /*!< pointer to the previous + node, NULL if start of list */ + Type* next; /*!< pointer to next node, + NULL if end of list */ + + void reverse() + { + Type* tmp = prev; + prev = next; + next = tmp; + } +}; + +/** Macro used for legacy reasons */ +#define UT_LIST_NODE_T(t) ut_list_node<t> + +/*******************************************************************//** +The two-way list base node. The base node contains pointers to both ends +of the list and a count of nodes in the list (excluding the base node +from the count). We also store a pointer to the member field so that it +doesn't have to be specified when doing list operations. +@param Type the type of the list element +@param NodePtr field member pointer that points to the list node */ +template <typename Type, typename NodePtr> +struct ut_list_base { + typedef Type elem_type; + typedef NodePtr node_ptr; + typedef ut_list_node<Type> node_type; + + ulint count; /*!< count of nodes in list */ + elem_type* start; /*!< pointer to list start, + NULL if empty */ + elem_type* end; /*!< pointer to list end, + NULL if empty */ + node_ptr node; /*!< Pointer to member field + that is used as a link node */ +#ifdef UNIV_DEBUG + ulint init; /*!< UT_LIST_INITIALISED if + the list was initialised with + UT_LIST_INIT() */ +#endif /* UNIV_DEBUG */ + + void reverse() + { + Type* tmp = start; + start = end; + end = tmp; + } +}; + +#define UT_LIST_BASE_NODE_T(t) ut_list_base<t, ut_list_node<t> t::*> + +#ifdef UNIV_DEBUG +# define UT_LIST_INITIALISED 0xCAFE +# define UT_LIST_INITIALISE(b) (b).init = UT_LIST_INITIALISED +# define UT_LIST_IS_INITIALISED(b) ut_a(((b).init == UT_LIST_INITIALISED)) +#else +# define UT_LIST_INITIALISE(b) +# define UT_LIST_IS_INITIALISED(b) +#endif /* UNIV_DEBUG */ + +/*******************************************************************//** +Note: This is really the list constructor. We should be able to use +placement new here. +Initializes the base node of a two-way list. +@param b the list base node +@param pmf point to member field that will be used as the link node */ +#define UT_LIST_INIT(b, pmf) \ +{ \ + (b).count = 0; \ + (b).start = 0; \ + (b).end = 0; \ + (b).node = pmf; \ + UT_LIST_INITIALISE(b); \ +}
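For illustration only (not part of the patched sources): the link node is embedded in the element itself, so adding an element allocates nothing. A sketch with a hypothetical element type, using UT_LIST_ADD_FIRST/UT_LIST_ADD_LAST and the accessor macros defined further below:

/* Sketch, not InnoDB source: an intrusive two-way list. */
struct example_item_t {
	int				value;
	UT_LIST_NODE_T(example_item_t)	list;	/* embedded link node */
};

static void ut_list_example()
{
	UT_LIST_BASE_NODE_T(example_item_t) base;
	UT_LIST_INIT(base, &example_item_t::list);

	example_item_t	a, b;
	UT_LIST_ADD_LAST(base, &a);
	UT_LIST_ADD_FIRST(base, &b);	/* list is now b, a */
	ut_a(UT_LIST_GET_LEN(base) == 2);
}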
+ +/** Functor for accessing the embedded node within a list element. This is +required because some lists can have the node embedded inside a nested +struct/union. See lock0priv.h (table locks) for an example. It provides a +specialised functor to grant access to the list node. */ +template <typename Type> +struct GenericGetNode { + + typedef ut_list_node<Type> node_type; + + GenericGetNode(node_type Type::* node) : m_node(node) {} + + node_type& operator() (Type& elem) + { + return(elem.*m_node); + } + + node_type Type::*m_node; +}; + +/*******************************************************************//** +Adds the node as the first element in a two-way linked list. +@param list the base node (not a pointer to it) +@param elem the element to add */ +template <typename List> +void +ut_list_prepend( + List& list, + typename List::elem_type* elem) +{ + typename List::node_type& elem_node = elem->*list.node; + + UT_LIST_IS_INITIALISED(list); + + elem_node.prev = 0; + elem_node.next = list.start; + + if (list.start != 0) { + typename List::node_type& base_node = + list.start->*list.node; + + ut_ad(list.start != elem); + + base_node.prev = elem; + } + + list.start = elem; + + if (list.end == 0) { + list.end = elem; + } + + ++list.count; +} + +/*******************************************************************//** +Adds the node as the first element in a two-way linked list. +@param LIST the base node (not a pointer to it) +@param ELEM the element to add */ +#define UT_LIST_ADD_FIRST(LIST, ELEM) ut_list_prepend(LIST, ELEM) + +/*******************************************************************//** +Adds the node as the last element in a two-way linked list.
+@param list list +@param elem the element to add +@param get_node to get the list node for that element */ +template <typename List, typename Functor> +void +ut_list_append( + List& list, + typename List::elem_type* elem, + Functor get_node) +{ + typename List::node_type& node = get_node(*elem); + + UT_LIST_IS_INITIALISED(list); + + node.next = 0; + node.prev = list.end; + + if (list.end != 0) { + typename List::node_type& base_node = get_node(*list.end); + + ut_ad(list.end != elem); + + base_node.next = elem; + } + + list.end = elem; + + if (list.start == 0) { + list.start = elem; + } + + ++list.count; +} + +/*******************************************************************//** +Adds the node as the last element in a two-way linked list. +@param list list +@param elem the element to add */ +template <typename List> +void +ut_list_append( + List& list, + typename List::elem_type* elem) +{ + ut_list_append( + list, elem, + GenericGetNode<typename List::elem_type>(list.node)); +} + +/*******************************************************************//** +Adds the node as the last element in a two-way linked list. +@param LIST list base node (not a pointer to it) +@param ELEM the element to add */ +#define UT_LIST_ADD_LAST(LIST, ELEM) ut_list_append(LIST, ELEM) + +/*******************************************************************//** +Inserts ELEM2 after ELEM1 in a list. +@param list the base node +@param elem1 node after which ELEM2 is inserted +@param elem2 node being inserted after ELEM1 */ +template <typename List> +void +ut_list_insert( + List& list, + typename List::elem_type* elem1, + typename List::elem_type* elem2) +{ + ut_ad(elem1 != elem2); + UT_LIST_IS_INITIALISED(list); + + typename List::node_type& elem1_node = elem1->*list.node; + typename List::node_type& elem2_node = elem2->*list.node; + + elem2_node.prev = elem1; + elem2_node.next = elem1_node.next; + + if (elem1_node.next != NULL) { + typename List::node_type& next_node = + elem1_node.next->*list.node; + + next_node.prev = elem2; + } + + elem1_node.next = elem2; + + if (list.end == elem1) { + list.end = elem2; + } + + ++list.count; +} + +/*******************************************************************//** +Inserts ELEM2 after ELEM1 in a list. +@param LIST list base node (not a pointer to it) +@param ELEM1 node after which ELEM2 is inserted +@param ELEM2 node being inserted after ELEM1 */ +#define UT_LIST_INSERT_AFTER(LIST, ELEM1, ELEM2) \ + ut_list_insert(LIST, ELEM1, ELEM2) + +/*******************************************************************//** +Inserts ELEM2 after ELEM1 in a list. +@param list the base node +@param elem1 node after which ELEM2 is inserted +@param elem2 node being inserted after ELEM1 +@param get_node to get the list node for that element */ + +template <typename List, typename Functor> +void +ut_list_insert( + List& list, + typename List::elem_type* elem1, + typename List::elem_type* elem2, + Functor get_node) +{ + ut_ad(elem1 != elem2); + UT_LIST_IS_INITIALISED(list); + + typename List::node_type& elem1_node = get_node(*elem1); + typename List::node_type& elem2_node = get_node(*elem2); + + elem2_node.prev = elem1; + elem2_node.next = elem1_node.next; + + if (elem1_node.next != NULL) { + typename List::node_type& next_node = + get_node(*elem1_node.next); + + next_node.prev = elem2; + } + + elem1_node.next = elem2; + + if (list.end == elem1) { + list.end = elem2; + } + + ++list.count; + +} +/*******************************************************************//** +Removes a node from a two-way linked list.
+@param list the base node (not a pointer to it)
+@param node member node within list element that is to be removed
+@param get_node functor to get the list node from elem */
+template <typename List, typename Functor>
+void
+ut_list_remove(
+	List&				list,
+	typename List::node_type&	node,
+	Functor				get_node)
+{
+	ut_a(list.count > 0);
+	UT_LIST_IS_INITIALISED(list);
+
+	if (node.next != NULL) {
+		typename List::node_type&	next_node =
+			get_node(*node.next);
+
+		next_node.prev = node.prev;
+	} else {
+		list.end = node.prev;
+	}
+
+	if (node.prev != NULL) {
+		typename List::node_type&	prev_node =
+			get_node(*node.prev);
+
+		prev_node.next = node.next;
+	} else {
+		list.start = node.next;
+	}
+
+	node.next = 0;
+	node.prev = 0;
+
+	--list.count;
+}
+
+/*******************************************************************//**
+Removes a node from a two-way linked list.
+@param list the base node (not a pointer to it)
+@param elem element to be removed from the list
+@param get_node functor to get the list node from elem */
+template <typename List, typename Functor>
+void
+ut_list_remove(
+	List&				list,
+	typename List::elem_type*	elem,
+	Functor				get_node)
+{
+	ut_list_remove(list, get_node(*elem), get_node);
+}
+
+/*******************************************************************//**
+Removes a node from a two-way linked list.
+@param list the base node (not a pointer to it)
+@param elem element to be removed from the list */
+template <typename List>
+void
+ut_list_remove(
+	List&				list,
+	typename List::elem_type*	elem)
+{
+	ut_list_remove(
+		list, elem->*list.node,
+		GenericGetNode<typename List::elem_type>(list.node));
+}
+
+/*******************************************************************//**
+Removes a node from a two-way linked list.
+@param LIST the base node (not a pointer to it)
+@param ELEM node to be removed from the list */
+#define UT_LIST_REMOVE(LIST, ELEM)	ut_list_remove(LIST, ELEM)
+
+/********************************************************************//**
+Gets the next node in a two-way list.
+@param NAME list name
+@param N pointer to a node
+@return the successor of N in NAME, or NULL */
+#define UT_LIST_GET_NEXT(NAME, N)	(((N)->NAME).next)
+
+/********************************************************************//**
+Gets the previous node in a two-way list.
+@param NAME list name
+@param N pointer to a node
+@return the predecessor of N in NAME, or NULL */
+#define UT_LIST_GET_PREV(NAME, N)	(((N)->NAME).prev)
+
+/********************************************************************//**
+Alternative macro to get the number of nodes in a two-way list, i.e.,
+its length.
+@param BASE the base node (not a pointer to it).
+@return the number of nodes in the list */
+#define UT_LIST_GET_LEN(BASE)	(BASE).count
+
+/********************************************************************//**
+Gets the first node in a two-way list.
+@param BASE the base node (not a pointer to it)
+@return first node, or NULL if the list is empty */
+#define UT_LIST_GET_FIRST(BASE)	(BASE).start
+
+/********************************************************************//**
+Gets the last node in a two-way list.
+@param BASE the base node (not a pointer to it)
+@return last node, or NULL if the list is empty */
+#define UT_LIST_GET_LAST(BASE)	(BASE).end
+
+struct NullValidate { void operator()(const void*) const {} };
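+
+/* Typical traversal with the accessor macros, following the same
+hypothetical item_t layout sketched above:
+
+	for (item_t* it = UT_LIST_GET_FIRST(list);
+	     it != NULL;
+	     it = UT_LIST_GET_NEXT(node, it)) {
+		// visit *it
+	}
+*/
+
+/** Iterate over all the elements and call the functor for each element.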
+@param[in]	list	base node (not a pointer to it)
+@param[in,out]	functor	Functor that is called for each element in the list */
+template <typename List, typename Functor>
+inline void ut_list_map(const List& list, Functor& functor)
+{
+	ulint	count = 0;
+
+	UT_LIST_IS_INITIALISED(list);
+
+	for (typename List::elem_type* elem = list.start; elem;
+	     elem = (elem->*list.node).next, ++count) {
+
+		functor(elem);
+	}
+
+	ut_a(count == list.count);
+}
+
+/** Iterate over all the elements and call the functor for each element.
+@param[in]	list	base node (not a pointer to it)
+@param[in]	functor	Functor that is called for each element in the list */
+template <typename List, typename Functor>
+inline void ut_list_map(const List& list, const Functor& functor)
+{
+	ulint	count = 0;
+
+	UT_LIST_IS_INITIALISED(list);
+
+	for (typename List::elem_type* elem = list.start; elem;
+	     elem = (elem->*list.node).next, ++count) {
+
+		functor(elem);
+	}
+
+	ut_a(count == list.count);
+}
+
+/** Check the consistency of a doubly linked list.
+@param[in]	list	base node (not a pointer to it)
+@param[in,out]	functor	Functor that is called for each element in the list */
+template <typename List, typename Functor>
+void ut_list_validate(const List& list, Functor& functor)
+{
+	ut_list_map(list, functor);
+#ifdef UNIV_DEBUG
+	/* Validate the list backwards. */
+	ulint	count = list.count;
+
+	for (typename List::elem_type* elem = list.end;
+	     elem != 0;
+	     elem = (elem->*list.node).prev) {
+		--count;
+	}
+	ut_ad(!count);
+#endif
+}
+
+/** Check the consistency of a doubly linked list.
+@param[in]	list	base node (not a pointer to it)
+@param[in]	functor	Functor that is called for each element in the list */
+template <typename List, typename Functor>
+inline void ut_list_validate(const List& list, const Functor& functor)
+{
+	ut_list_map(list, functor);
+#ifdef UNIV_DEBUG
+	/* Validate the list backwards. */
+	ulint	count = list.count;
+
+	for (typename List::elem_type* elem = list.end;
+	     elem != 0;
+	     elem = (elem->*list.node).prev) {
+		--count;
+	}
+
+	ut_ad(!count);
+#endif
+}
+
+template <typename List>
+inline void ut_list_validate(const List& list)
+{
+	ut_d(ut_list_validate(list, NullValidate()));
+}
+
+#ifdef UNIV_DEBUG
+template <typename List>
+inline void ut_list_reverse(List& list)
+{
+	UT_LIST_IS_INITIALISED(list);
+
+	for (typename List::elem_type* elem = list.start;
+	     elem != 0;
+	     elem = (elem->*list.node).prev) {
+		(elem->*list.node).reverse();
+	}
+
+	list.reverse();
+}
+
+/** Check if the given element exists in the list.
+@param[in,out]	list	the list object
+@param[in]	elem	the element of the list which will be checked */
+template <typename List>
+inline bool ut_list_exists(const List& list, typename List::elem_type* elem)
+{
+	for (typename List::elem_type* e1 = UT_LIST_GET_FIRST(list); e1;
+	     e1 = (e1->*list.node).next) {
+		if (elem == e1) {
+			return true;
+		}
+	}
+	return false;
+}
+#endif
+
+/** Move the given element to the beginning of the list.
+@param[in,out]	list	the list object
+@param[in]	elem	the element of the list which will be moved
+			to the beginning of the list. */
+template <typename List>
+void
+ut_list_move_to_front(
+	List&				list,
+	typename List::elem_type*	elem)
+{
+	ut_ad(ut_list_exists(list, elem));
+
+	if (UT_LIST_GET_FIRST(list) != elem) {
+		ut_list_remove(list, elem);
+		ut_list_prepend(list, elem);
+	}
+}
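+
+/* A functor passed to ut_list_map() or ut_list_validate() only needs an
+operator() taking an element pointer; a hypothetical counting example,
+again using the item_t sketched above:
+
+	struct Counter {
+		ulint	n;
+		Counter() : n(0) {}
+		void operator()(item_t* elem) { ++n; }
+	};
+
+	Counter	c;
+	ut_list_map(list, c);	// afterwards c.n == UT_LIST_GET_LEN(list)
+*/
diff --git a/storage/innobase/include/ut0mem.h b/storage/innobase/include/ut0mem.h
new file mode 100644
index 00000000..a5ed72f9
--- /dev/null
+++ b/storage/innobase/include/ut0mem.h
@@ -0,0 +1,76 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.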
+Copyright (c) 2019, 2020, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0mem.h
+Memory primitives
+
+Created 5/30/1994 Heikki Tuuri
+************************************************************************/
+
+#ifndef ut0mem_h
+#define ut0mem_h
+
+#include "univ.i"
+
+/********************************************************************
+Concatenate 3 strings.*/
+char*
+ut_str3cat(
+/*=======*/
+				/* out, own: concatenated string, must be
+				freed with ut_free() */
+	const char*	s1,	/* in: string 1 */
+	const char*	s2,	/* in: string 2 */
+	const char*	s3);	/* in: string 3 */
+
+/**********************************************************************//**
+Converts raw binary data to a NUL-terminated hex string. The output is
+truncated if there is not enough space in "hex"; make sure "hex_size" is at
+least (2 * raw_size + 1) if you do not want this to happen. Returns the
+actual number of characters written to "hex" (including the NUL).
+@return number of chars written */
+UNIV_INLINE
+ulint
+ut_raw_to_hex(
+/*==========*/
+	const void*	raw,		/*!< in: raw data */
+	ulint		raw_size,	/*!< in: "raw" length in bytes */
+	char*		hex,		/*!< out: hex string */
+	ulint		hex_size);	/*!< in: "hex" size in bytes */
+
+/*******************************************************************//**
+Adds single quotes to the start and end of string and escapes any quotes
+by doubling them. Returns the number of bytes that were written to "buf"
+(including the terminating NUL). If buf_size is too small then the
+trailing bytes from "str" are discarded.
+@return number of bytes that were written */
+UNIV_INLINE
+ulint
+ut_str_sql_format(
+/*==============*/
+	const char*	str,		/*!< in: string */
+	ulint		str_len,	/*!< in: string length in bytes */
+	char*		buf,		/*!< out: output buffer */
+	ulint		buf_size);	/*!< in: output buffer size
+					in bytes */
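+
+/* Usage sketches; the buffer sizes are chosen to satisfy the bounds
+documented above:
+
+	const unsigned char	raw[2] = {0xAB, 0xCD};
+	char			hex[2 * sizeof raw + 1];
+	ut_raw_to_hex(raw, sizeof raw, hex, sizeof hex);
+	// hex now holds "ABCD"
+
+	char	quoted[16];
+	ut_str_sql_format("o'hara", 6, quoted, sizeof quoted);
+	// quoted now holds "'o''hara'"
+*/
+
+#include "ut0mem.inl"
+
+#endif
diff --git a/storage/innobase/include/ut0mem.inl b/storage/innobase/include/ut0mem.inl
new file mode 100644
index 00000000..cc95a036
--- /dev/null
+++ b/storage/innobase/include/ut0mem.inl
@@ -0,0 +1,246 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2019, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.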
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*******************************************************************//** +@file include/ut0mem.ic +Memory primitives + +Created 5/30/1994 Heikki Tuuri +************************************************************************/ + +#include "ut0byte.h" +#include "mach0data.h" + +/**********************************************************************//** +Converts a raw binary data to a NUL-terminated hex string. The output is +truncated if there is not enough space in "hex", make sure "hex_size" is at +least (2 * raw_size + 1) if you do not want this to happen. Returns the +actual number of characters written to "hex" (including the NUL). +@return number of chars written */ +UNIV_INLINE +ulint +ut_raw_to_hex( +/*==========*/ + const void* raw, /*!< in: raw data */ + ulint raw_size, /*!< in: "raw" length in bytes */ + char* hex, /*!< out: hex string */ + ulint hex_size) /*!< in: "hex" size in bytes */ +{ + +#ifdef WORDS_BIGENDIAN + +#define MK_UINT16(a, b) (((uint16) (a)) << 8 | (uint16) (b)) + +#define UINT16_GET_A(u) ((char) ((u) >> 8)) +#define UINT16_GET_B(u) ((char) ((u) & 0xFF)) + +#else /* WORDS_BIGENDIAN */ + +#define MK_UINT16(a, b) (((uint16) (b)) << 8 | (uint16) (a)) + +#define UINT16_GET_A(u) ((char) ((u) & 0xFF)) +#define UINT16_GET_B(u) ((char) ((u) >> 8)) + +#endif /* WORDS_BIGENDIAN */ + +#define MK_ALL_UINT16_WITH_A(a) \ + MK_UINT16(a, '0'), \ + MK_UINT16(a, '1'), \ + MK_UINT16(a, '2'), \ + MK_UINT16(a, '3'), \ + MK_UINT16(a, '4'), \ + MK_UINT16(a, '5'), \ + MK_UINT16(a, '6'), \ + MK_UINT16(a, '7'), \ + MK_UINT16(a, '8'), \ + MK_UINT16(a, '9'), \ + MK_UINT16(a, 'A'), \ + MK_UINT16(a, 'B'), \ + MK_UINT16(a, 'C'), \ + MK_UINT16(a, 'D'), \ + MK_UINT16(a, 'E'), \ + MK_UINT16(a, 'F') + + static const uint16 hex_map[256] = { + MK_ALL_UINT16_WITH_A('0'), + MK_ALL_UINT16_WITH_A('1'), + MK_ALL_UINT16_WITH_A('2'), + MK_ALL_UINT16_WITH_A('3'), + MK_ALL_UINT16_WITH_A('4'), + MK_ALL_UINT16_WITH_A('5'), + MK_ALL_UINT16_WITH_A('6'), + MK_ALL_UINT16_WITH_A('7'), + MK_ALL_UINT16_WITH_A('8'), + MK_ALL_UINT16_WITH_A('9'), + MK_ALL_UINT16_WITH_A('A'), + MK_ALL_UINT16_WITH_A('B'), + MK_ALL_UINT16_WITH_A('C'), + MK_ALL_UINT16_WITH_A('D'), + MK_ALL_UINT16_WITH_A('E'), + MK_ALL_UINT16_WITH_A('F') + }; + const unsigned char* rawc; + ulint read_bytes; + ulint write_bytes; + ulint i; + + rawc = (const unsigned char*) raw; + + if (hex_size == 0) { + + return(0); + } + + if (hex_size <= 2 * raw_size) { + + read_bytes = hex_size / 2; + write_bytes = hex_size; + } else { + + read_bytes = raw_size; + write_bytes = 2 * raw_size + 1; + } + +#define LOOP_READ_BYTES(ASSIGN) \ + for (i = 0; i < read_bytes; i++) { \ + ASSIGN; \ + hex += 2; \ + rawc++; \ + } + + if (ut_align_offset(hex, 2) == 0) { + + LOOP_READ_BYTES( + *(uint16*) hex = hex_map[*rawc] + ); + } else { + + LOOP_READ_BYTES( + *hex = UINT16_GET_A(hex_map[*rawc]); + *(hex + 1) = UINT16_GET_B(hex_map[*rawc]) + ); + } + + if (hex_size <= 2 * raw_size && hex_size % 2 == 0) { + + hex--; + } + + *hex = '\0'; + + return(write_bytes); +} + +/*******************************************************************//** +Adds single quotes to the start and end of string and escapes any quotes +by doubling them. 
Returns the number of bytes that were written to "buf" +(including the terminating NUL). If buf_size is too small then the +trailing bytes from "str" are discarded. +@return number of bytes that were written */ +UNIV_INLINE +ulint +ut_str_sql_format( +/*==============*/ + const char* str, /*!< in: string */ + ulint str_len, /*!< in: string length in bytes */ + char* buf, /*!< out: output buffer */ + ulint buf_size) /*!< in: output buffer size + in bytes */ +{ + ulint str_i; + ulint buf_i; + + buf_i = 0; + + switch (buf_size) { + case 3: + + if (str_len == 0) { + + buf[buf_i] = '\''; + buf_i++; + buf[buf_i] = '\''; + buf_i++; + } + /* FALLTHROUGH */ + case 2: + case 1: + + buf[buf_i] = '\0'; + buf_i++; + /* FALLTHROUGH */ + case 0: + + return(buf_i); + } + + /* buf_size >= 4 */ + + buf[0] = '\''; + buf_i = 1; + + for (str_i = 0; str_i < str_len; str_i++) { + + char ch; + + if (buf_size - buf_i == 2) { + + break; + } + + ch = str[str_i]; + + switch (ch) { + case '\0': + + if (buf_size - buf_i < 4) { + + goto func_exit; + } + buf[buf_i] = '\\'; + buf_i++; + buf[buf_i] = '0'; + buf_i++; + break; + case '\'': + case '\\': + + if (buf_size - buf_i < 4) { + + goto func_exit; + } + buf[buf_i] = ch; + buf_i++; + /* FALLTHROUGH */ + default: + + buf[buf_i] = ch; + buf_i++; + } + } + +func_exit: + + buf[buf_i] = '\''; + buf_i++; + buf[buf_i] = '\0'; + buf_i++; + + return(buf_i); +} diff --git a/storage/innobase/include/ut0new.h b/storage/innobase/include/ut0new.h new file mode 100644 index 00000000..f4183e4c --- /dev/null +++ b/storage/innobase/include/ut0new.h @@ -0,0 +1,1099 @@ +/***************************************************************************** + +Copyright (c) 2014, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file ut/ut0new.h +Instrumented memory allocator. + +Created May 26, 2014 Vasil Dimov +*******************************************************/ + +/** Dynamic memory allocation within InnoDB guidelines. +All dynamic (heap) memory allocations (malloc(3), strdup(3), etc, "new", +various std:: containers that allocate memory internally), that are done +within InnoDB are instrumented. This means that InnoDB uses a custom set +of functions for allocating memory, rather than calling e.g. "new" directly. + +Here follows a cheat sheet on what InnoDB functions to use whenever a +standard one would have been used. 
+
+Creating new objects with "new":
+--------------------------------
+Standard:
+	new expression
+	or
+	new(std::nothrow) expression
+InnoDB, default instrumentation:
+	UT_NEW_NOKEY(expression)
+InnoDB, custom instrumentation, preferred:
+	UT_NEW(expression, key)
+
+Destroying objects, created with "new":
+---------------------------------------
+Standard:
+	delete ptr
+InnoDB:
+	UT_DELETE(ptr)
+
+Creating new arrays with "new[]":
+---------------------------------
+Standard:
+	new type[num]
+	or
+	new(std::nothrow) type[num]
+InnoDB, default instrumentation:
+	UT_NEW_ARRAY_NOKEY(type, num)
+InnoDB, custom instrumentation, preferred:
+	UT_NEW_ARRAY(type, num, key)
+
+Destroying arrays, created with "new[]":
+----------------------------------------
+Standard:
+	delete[] ptr
+InnoDB:
+	UT_DELETE_ARRAY(ptr)
+
+Declaring a type with a std:: container, e.g. std::vector:
+----------------------------------------------------------
+Standard:
+	std::vector<t>
+InnoDB:
+	std::vector<t, ut_allocator<t> >
+
+Declaring objects of some std:: type:
+-------------------------------------
+Standard:
+	std::vector<t> v
+InnoDB, default instrumentation:
+	std::vector<t, ut_allocator<t> > v
+InnoDB, custom instrumentation, preferred:
+	std::vector<t, ut_allocator<t> > v(ut_allocator<t>(key))
+
+Raw block allocation (as usual in C++, consider whether using "new" would
+not be more appropriate):
+-------------------------------------------------------------------------
+Standard:
+	malloc(num)
+InnoDB, default instrumentation:
+	ut_malloc_nokey(num)
+InnoDB, custom instrumentation, preferred:
+	ut_malloc(num, key)
+
+Raw block resize:
+-----------------
+Standard:
+	realloc(ptr, new_size)
+InnoDB:
+	ut_realloc(ptr, new_size)
+
+Raw block deallocation:
+-----------------------
+Standard:
+	free(ptr)
+InnoDB:
+	ut_free(ptr)
+
+Note: the expression passed to UT_NEW() or UT_NEW_NOKEY() must always end
+with (), thus:
+Standard:
+	new int
+InnoDB:
+	UT_NEW_NOKEY(int())
+*/
+
+#ifndef ut0new_h
+#define ut0new_h
+
+#include <limits>		/* std::numeric_limits */
+#include <thread>
+
+#include <stdint.h>
+#include <stdlib.h>		/* malloc() */
+#include <string.h>		/* strlen(), strrchr(), strncmp() */
+
+#include <my_sys.h>		/* my_large_free/malloc() */
+
+#include "my_global.h"	/* needed for headers from mysql/psi/ */
+
+#include "mysql/psi/mysql_memory.h" /* PSI_MEMORY_CALL() */
+
+#include "mysql/psi/psi_memory.h" /* PSI_memory_key, PSI_memory_info */
+
+#include "ut0ut.h" /* ut_strcmp_functor */
+
+#define	OUT_OF_MEMORY_MSG \
+	"Check if you should increase the swap file or ulimits of your" \
+	" operating system. Note that on most 32-bit computers the process" \
+	" memory space is limited to 2 GB or 4 GB."
+
+/** The total amount of memory currently allocated from the operating
+system with allocate_large() */
+extern Atomic_counter<ulint>	os_total_large_mem_allocated;
+
+/** Maximum number of retries to allocate memory. */
+extern const size_t	alloc_max_retries;
+
+constexpr uint32_t INVALID_AUTOEVENT_IDX = 0xFFFFFFFFU;
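+
+/* A container declaration following the cheat sheet above; any key
+registered with PFS (for example one of the mem_key_* variables declared
+below) can be handed to the allocator:
+
+	ut_allocator<int>			alloc(mem_key_buf_buf_pool);
+	std::vector<int, ut_allocator<int> >	v(alloc);
+	v.push_back(42);	// accounted under that key when PFS is enabled
+*/
+
+/** Keys for registering allocations with performance schema.
+Pointers to these variables are supplied to PFS code via the pfs_info[]
+array and the PFS code initializes them via PSI_MEMORY_CALL(register_memory)().
+mem_key_other and mem_key_std are special in the following way (see also
+ut_allocator::get_mem_key()):
+* If the caller has not provided a key and the file name of the caller is
+  unknown, then mem_key_std will be used. This happens only when called from
+  within std::* containers.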
+* If the caller has not provided a key and the file name of the caller is + known, but is not amongst the predefined names (see ut_new_boot()) then + mem_key_other will be used. Generally this should not happen and if it + happens then that means that the list of predefined names must be extended. +Keep this list alphabetically sorted. */ +extern PSI_memory_key mem_key_ahi; +extern PSI_memory_key mem_key_buf_buf_pool; +extern PSI_memory_key mem_key_dict_stats_bg_recalc_pool_t; +extern PSI_memory_key mem_key_dict_stats_index_map_t; +extern PSI_memory_key mem_key_dict_stats_n_diff_on_level; +extern PSI_memory_key mem_key_other; +extern PSI_memory_key mem_key_row_log_buf; +extern PSI_memory_key mem_key_row_merge_sort; +extern PSI_memory_key mem_key_std; + +/** Setup the internal objects needed for UT_NEW() to operate. +This must be called before the first call to UT_NEW(). */ +void +ut_new_boot(); + +#ifdef UNIV_PFS_MEMORY + +/** +Retrieve a memory key (registered with PFS), +given AUTOEVENT_IDX of the caller + +@param[in] autoevent_idx - AUTOEVENT_IDX value of the caller +@return registered memory key or PSI_NOT_INSTRUMENTED */ +PSI_memory_key ut_new_get_key_by_file(uint32_t autoevent_idx); + +#endif /* UNIV_PFS_MEMORY */ + +/** A structure that holds the necessary data for performance schema +accounting. An object of this type is put in front of each allocated block +of memory when allocation is done by ut_allocator::allocate(). This is +because the data is needed even when freeing the memory. Users of +ut_allocator::allocate_large() are responsible for maintaining this +themselves. */ +struct ut_new_pfx_t { + +#ifdef UNIV_PFS_MEMORY + + /** Performance schema key. Assigned to a name at startup via + PSI_MEMORY_CALL(register_memory)() and later used for accounting + allocations and deallocations with + PSI_MEMORY_CALL(memory_alloc)(key, size, owner) and + PSI_MEMORY_CALL(memory_free)(key, size, owner). */ + PSI_memory_key m_key; + + /** + Thread owner. + Instrumented thread that owns the allocated memory. + This state is used by the performance schema to maintain + per thread statistics, + when memory is given from thread A to thread B. + */ + struct PSI_thread *m_owner; + +#endif /* UNIV_PFS_MEMORY */ + + /** Size of the allocated block in bytes, including this prepended + aux structure (for ut_allocator::allocate()). For example if InnoDB + code requests to allocate 100 bytes, and sizeof(ut_new_pfx_t) is 16, + then 116 bytes are allocated in total and m_size will be 116. + ut_allocator::allocate_large() does not prepend this struct to the + allocated block and its users are responsible for maintaining it + and passing it later to ut_allocator::deallocate_large(). */ + size_t m_size; +#if SIZEOF_VOIDP == 4 + /** Pad the header size to a multiple of 64 bits on 32-bit systems, + so that the payload will be aligned to 64 bits. 
*/ + size_t pad; +#endif +}; + +#if defined(DBUG_OFF) && defined(HAVE_MADVISE) && defined(MADV_DODUMP) +static inline void ut_dontdump(void *ptr, size_t m_size, bool dontdump) +{ + ut_a(ptr != NULL); + + if (dontdump && madvise(ptr, m_size, MADV_DONTDUMP)) { + ib::warn() << "Failed to set memory to " DONTDUMP_STR ": " + << strerror(errno) + << " ptr " << ptr + << " size " << m_size; + } +} + +static inline void ut_dodump(void* ptr, size_t m_size) +{ + if (ptr && madvise(ptr, m_size, MADV_DODUMP)) { + ib::warn() << "Failed to set memory to " DODUMP_STR ": " + << strerror(errno) + << " ptr " << ptr + << " size " << m_size; + } +} +#else +static inline void ut_dontdump(void *, size_t, bool) {} +static inline void ut_dodump(void*, size_t) {} +#endif + +/** Allocator class for allocating memory from inside std::* containers. +@tparam T type of allocated object +@tparam oom_fatal whether to commit suicide when running out of memory */ +template +class ut_allocator { +public: + typedef T* pointer; + typedef const T* const_pointer; + typedef T& reference; + typedef const T& const_reference; + typedef T value_type; + typedef size_t size_type; + typedef ptrdiff_t difference_type; + +#ifdef UNIV_PFS_MEMORY + /** Default constructor. */ + explicit + ut_allocator(PSI_memory_key key = PSI_NOT_INSTRUMENTED) + : m_key(key) + { + } +#else + ut_allocator() = default; + ut_allocator(PSI_memory_key) {} +#endif /* UNIV_PFS_MEMORY */ + + /** Constructor from allocator of another type. */ + template + ut_allocator(const ut_allocator& +#ifdef UNIV_PFS_MEMORY + other +#endif + ) + { +#ifdef UNIV_PFS_MEMORY + const PSI_memory_key other_key = other.get_mem_key(); + + m_key = (other_key != mem_key_std) + ? other_key + : PSI_NOT_INSTRUMENTED; +#endif /* UNIV_PFS_MEMORY */ + } + + /** Return the maximum number of objects that can be allocated by + this allocator. */ + size_type + max_size() const + { + const size_type s_max = std::numeric_limits::max(); + +#ifdef UNIV_PFS_MEMORY + return((s_max - sizeof(ut_new_pfx_t)) / sizeof(T)); +#else + return(s_max / sizeof(T)); +#endif /* UNIV_PFS_MEMORY */ + } + + pointer allocate(size_type n) { return allocate(n, NULL, INVALID_AUTOEVENT_IDX); } + + /** Allocate a chunk of memory that can hold 'n_elements' objects of + type 'T' and trace the allocation. + If the allocation fails this method may throw an exception. This + is mandated by the standard and if it returns NULL instead, then + STL containers that use it (e.g. std::vector) may get confused. + After successfull allocation the returned pointer must be passed + to ut_allocator::deallocate() when no longer needed. + @param[in] n_elements number of elements + @param[in] set_to_zero if true, then the returned memory is + initialized with 0x0 bytes. + @param[in] throw_on_error if true, raize exception if too big + @return pointer to the allocated memory */ + pointer + allocate( + size_type n_elements, + const_pointer, + uint32_t +#ifdef UNIV_PFS_MEMORY + autoevent_idx /* AUTOEVENT_IDX of the caller */ +#endif + , + bool set_to_zero = false, + bool throw_on_error = true) + { + if (n_elements == 0) { + return(NULL); + } + + if (n_elements > max_size()) { + if (throw_on_error) { + throw(std::bad_alloc()); + } else { + return(NULL); + } + } + + void* ptr; + size_t total_bytes = n_elements * sizeof(T); + +#ifdef UNIV_PFS_MEMORY + /* The header size must not ruin the 64-bit alignment + on 32-bit systems. Some allocated structures use + 64-bit fields. 
*/ + ut_ad((sizeof(ut_new_pfx_t) & 7) == 0); + total_bytes += sizeof(ut_new_pfx_t); +#endif /* UNIV_PFS_MEMORY */ + + for (size_t retries = 1; ; retries++) { + + if (set_to_zero) { + ptr = calloc(1, total_bytes); + } else { + ptr = malloc(total_bytes); + } + + if (ptr != NULL || retries >= alloc_max_retries) { + break; + } + + std::this_thread::sleep_for(std::chrono::seconds(1)); + } + + if (ptr == NULL) { + ib::fatal_or_error(oom_fatal) + << "Cannot allocate " << total_bytes + << " bytes of memory after " + << alloc_max_retries << " retries over " + << alloc_max_retries << " seconds. OS error: " + << strerror(errno) << " (" << errno << "). " + << OUT_OF_MEMORY_MSG; + if (throw_on_error) { + throw(std::bad_alloc()); + } else { + return(NULL); + } + } + +#ifdef UNIV_PFS_MEMORY + ut_new_pfx_t* pfx = static_cast(ptr); + + allocate_trace(total_bytes, autoevent_idx, pfx); + + return(reinterpret_cast(pfx + 1)); +#else + return(reinterpret_cast(ptr)); +#endif /* UNIV_PFS_MEMORY */ + } + + /** Free a memory allocated by allocate() and trace the deallocation. + @param[in,out] ptr pointer to memory to free */ + void deallocate(pointer ptr, size_type n_elements = 0) + { +#ifdef UNIV_PFS_MEMORY + if (ptr == NULL) { + return; + } + + ut_new_pfx_t* pfx = reinterpret_cast(ptr) - 1; + + deallocate_trace(pfx); + + free(pfx); +#else + free(ptr); +#endif /* UNIV_PFS_MEMORY */ + } + + /** Create an object of type 'T' using the value 'val' over the + memory pointed by 'p'. */ + void + construct( + pointer p, + const T& val) + { + new(p) T(val); + } + + /** Destroy an object pointed by 'p'. */ + void + destroy( + pointer p) + { + p->~T(); + } + + /** Return the address of an object. */ + pointer + address( + reference x) const + { + return(&x); + } + + /** Return the address of a const object. */ + const_pointer + address( + const_reference x) const + { + return(&x); + } + + template + struct rebind { + typedef ut_allocator other; + }; + + /* The following are custom methods, not required by the standard. */ + +#ifdef UNIV_PFS_MEMORY + + /** realloc(3)-like method. + The passed in ptr must have been returned by allocate() and the + pointer returned by this method must be passed to deallocate() when + no longer needed. + @param[in,out] ptr old pointer to reallocate + @param[in] n_elements new number of elements to allocate + @param[in] file file name of the caller + @return newly allocated memory */ + pointer + reallocate( + void* ptr, + size_type n_elements, + uint32_t autoevent_idx) + { + if (n_elements == 0) { + deallocate(static_cast(ptr)); + return(NULL); + } + + if (ptr == NULL) { + return(allocate(n_elements, NULL, autoevent_idx, false, false)); + } + + if (n_elements > max_size()) { + return(NULL); + } + + ut_new_pfx_t* pfx_old; + ut_new_pfx_t* pfx_new; + size_t total_bytes; + + pfx_old = reinterpret_cast(ptr) - 1; + + total_bytes = n_elements * sizeof(T) + sizeof(ut_new_pfx_t); + + for (size_t retries = 1; ; retries++) { + + pfx_new = static_cast( + realloc(pfx_old, total_bytes)); + + if (pfx_new != NULL || retries >= alloc_max_retries) { + break; + } + + std::this_thread::sleep_for(std::chrono::seconds(1)); + } + + if (pfx_new == NULL) { + ib::fatal_or_error(oom_fatal) + << "Cannot reallocate " << total_bytes + << " bytes of memory after " + << alloc_max_retries << " retries over " + << alloc_max_retries << " seconds. OS error: " + << strerror(errno) << " (" << errno << "). 
" + << OUT_OF_MEMORY_MSG; + return(NULL); + } + + /* pfx_new still contains the description of the old block + that was presumably freed by realloc(). */ + deallocate_trace(pfx_new); + + /* pfx_new is set here to describe the new block. */ + allocate_trace(total_bytes, autoevent_idx, pfx_new); + + return(reinterpret_cast(pfx_new + 1)); + } + + /** Allocate, trace the allocation and construct 'n_elements' objects + of type 'T'. If the allocation fails or if some of the constructors + throws an exception, then this method will return NULL. It does not + throw exceptions. After successfull completion the returned pointer + must be passed to delete_array() when no longer needed. + @param[in] n_elements number of elements to allocate + @param[in] file file name of the caller + @return pointer to the first allocated object or NULL */ + pointer + new_array( + size_type n_elements, + uint32_t autoevent_idx + ) + { + T* p = allocate(n_elements, NULL, autoevent_idx, false, false); + + if (p == NULL) { + return(NULL); + } + + T* first = p; + size_type i; + + try { + for (i = 0; i < n_elements; i++) { + new(p) T; + ++p; + } + } catch (...) { + for (size_type j = 0; j < i; j++) { + --p; + p->~T(); + } + + deallocate(first); + + throw; + } + + return(first); + } + + /** Destroy, deallocate and trace the deallocation of an array created + by new_array(). + @param[in,out] ptr pointer to the first object in the array */ + void + delete_array( + T* ptr) + { + if (ptr == NULL) { + return; + } + + const size_type n_elements = n_elements_allocated(ptr); + + T* p = ptr + n_elements - 1; + + for (size_type i = 0; i < n_elements; i++) { + p->~T(); + --p; + } + + deallocate(ptr); + } + +#endif /* UNIV_PFS_MEMORY */ + + /** Allocate a large chunk of memory that can hold 'n_elements' + objects of type 'T' and trace the allocation. + @param[in] n_elements number of elements + @param[in] dontdump if true, advise the OS is not to core + dump this memory. + @param[out] pfx storage for the description of the + allocated memory. The caller must provide space for this one and keep + it until the memory is no longer needed and then pass it to + deallocate_large(). + @return pointer to the allocated memory or NULL */ + pointer + allocate_large( + size_type n_elements, + ut_new_pfx_t* pfx, + bool dontdump = false) + { + if (n_elements == 0 || n_elements > max_size()) { + return(NULL); + } + + ulint n_bytes = n_elements * sizeof(T); + + pointer ptr = reinterpret_cast( + my_large_malloc(&n_bytes, MYF(0))); + + if (ptr == NULL) { + return NULL; + } + + ut_dontdump(ptr, n_bytes, dontdump); + + if (pfx != NULL) { +#ifdef UNIV_PFS_MEMORY + allocate_trace(n_bytes, 0, pfx); +#endif /* UNIV_PFS_MEMORY */ + pfx->m_size = n_bytes; + } + + os_total_large_mem_allocated += n_bytes; + + return(ptr); + } + + pointer + allocate_large_dontdump( + size_type n_elements, + ut_new_pfx_t* pfx) + { + return allocate_large(n_elements, pfx, true); + } + /** Free a memory allocated by allocate_large() and trace the + deallocation. + @param[in,out] ptr pointer to memory to free + @param[in] pfx descriptor of the memory, as returned by + allocate_large(). 
*/ + void + deallocate_large( + pointer ptr, + const ut_new_pfx_t* pfx) + { + size_t size = pfx->m_size; +#ifdef UNIV_PFS_MEMORY + if (pfx) { + deallocate_trace(pfx); + } +#endif /* UNIV_PFS_MEMORY */ + os_total_large_mem_allocated -= size; + + my_large_free(ptr, size); + } + + void + deallocate_large_dodump( + pointer ptr, + const ut_new_pfx_t* pfx) + { + ut_dodump(ptr, pfx->m_size); + deallocate_large(ptr, pfx); + } + +#ifdef UNIV_PFS_MEMORY + /** Get the performance schema key to use for tracing allocations. + @param[in] file file name of the caller or NULL if unknown + @return performance schema key */ + PSI_memory_key + get_mem_key( + uint32_t autoevent_idx = INVALID_AUTOEVENT_IDX) const + { + if (m_key != PSI_NOT_INSTRUMENTED) { + return(m_key); + } + + if (autoevent_idx == INVALID_AUTOEVENT_IDX) { + return(mem_key_std); + } + const PSI_memory_key key = ut_new_get_key_by_file(autoevent_idx); + + if (key != PSI_NOT_INSTRUMENTED) { + return(key); + } + + return(mem_key_other); + } + +private: + + /** Retrieve the size of a memory block allocated by new_array(). + @param[in] ptr pointer returned by new_array(). + @return size of memory block */ + size_type + n_elements_allocated( + const_pointer ptr) + { + const ut_new_pfx_t* pfx + = reinterpret_cast(ptr) - 1; + + const size_type user_bytes + = pfx->m_size - sizeof(ut_new_pfx_t); + + ut_ad(user_bytes % sizeof(T) == 0); + + return(user_bytes / sizeof(T)); + } + + /** Trace a memory allocation. + After the accounting, the data needed for tracing the deallocation + later is written into 'pfx'. + The PFS event name is picked on the following criteria: + 1. If key (!= PSI_NOT_INSTRUMENTED) has been specified when constructing + this ut_allocator object, then the name associated with that key will + be used (this is the recommended approach for new code) + 2. Otherwise, if "file" is NULL, then the name associated with + mem_key_std will be used + 3. Otherwise, if an entry is found by ut_new_get_key_by_file(), that + corresponds to "file", that will be used (see ut_new_boot()) + 4. Otherwise, the name associated with mem_key_other will be used. + @param[in] size number of bytes that were allocated + @param[in] autoevent_idx autoevent_idx of the caller + @param[out] pfx placeholder to store the info which will be + needed when freeing the memory */ + void + allocate_trace( + size_t size, + const uint32_t autoevent_idx, + ut_new_pfx_t* pfx) + { + const PSI_memory_key key = get_mem_key(autoevent_idx); + + pfx->m_key = PSI_MEMORY_CALL(memory_alloc)(key, size, & pfx->m_owner); + pfx->m_size = size; + } + + /** Trace a memory deallocation. + @param[in] pfx info for the deallocation */ + void + deallocate_trace( + const ut_new_pfx_t* pfx) + { + PSI_MEMORY_CALL(memory_free)(pfx->m_key, pfx->m_size, pfx->m_owner); + } + + /** Performance schema key. */ + PSI_memory_key m_key; + +#endif /* UNIV_PFS_MEMORY */ + +private: + + /** Assignment operator, not used, thus disabled (private). */ + template + void + operator=( + const ut_allocator&); +}; + +/** Compare two allocators of the same type. +As long as the type of A1 and A2 is the same, a memory allocated by A1 +could be freed by A2 even if the pfs mem key is different. */ +template +inline +bool +operator==(const ut_allocator&, const ut_allocator&) { return(true); } + +/** Compare two allocators of the same type. */ +template +inline +bool +operator!=( + const ut_allocator& lhs, + const ut_allocator& rhs) +{ + return(!(lhs == rhs)); +} + +#ifdef UNIV_PFS_MEMORY + +/* + constexpr trickery ahead. 
+ + Compute AUTOEVENT_IDX at compile time. + (index in the auto_event_names array, corresponding to basename of __FILE__) + + The tricks are necessary to reduce the cost of lookup the + PSI_memory_key for auto event. +*/ + +static constexpr const char* cexpr_basename_helper(const char* s, const char* last_slash) +{ + return + *s == '\0' ? last_slash : + *s == '/' || *s == '\\' ? cexpr_basename_helper(s + 1, s + 1) : + cexpr_basename_helper(s + 1, last_slash); +} + +static constexpr const char* cexpr_basename(const char* filename) +{ + return cexpr_basename_helper(filename, filename); +} + +static constexpr bool cexpr_strequal_ignore_dot(const char* a, const char* b) +{ + return *a == 0 || *a == '.' ? (*b == 0 || *b == '.') + : *a == *b ? cexpr_strequal_ignore_dot(a + 1, b + 1) : false; +} + +constexpr const char* const auto_event_names[] = +{ + "btr0btr", + "btr0buf", + "btr0bulk", + "btr0cur", + "btr0pcur", + "btr0sea", + "buf0buf", + "buf0dblwr", + "buf0dump", + "buf0lru", + "buf0rea", + "dict0dict", + "dict0mem", + "dict0stats", + "eval0eval", + "fil0crypt", + "fil0fil", + "fsp0file", + "fts0ast", + "fts0blex", + "fts0config", + "fts0file", + "fts0fts", + "fts0opt", + "fts0pars", + "fts0que", + "fts0sql", + "fts0tlex", + "gis0sea", + "ha_innodb", + "handler0alter", + "hash0hash", + "i_s", + "lexyy", + "lock0lock", + "mem0mem", + "os0file", + "pars0lex", + "rem0rec", + "row0ftsort", + "row0import", + "row0log", + "row0merge", + "row0mysql", + "row0sel", + "srv0start", + "trx0i_s", + "trx0i_s", + "trx0roll", + "trx0rseg", + "trx0seg", + "trx0trx", + "trx0undo", + "ut0list", + "ut0mem", + "ut0new", + "ut0pool", + "ut0rbt", + "ut0wqueue", + "xtrabackup", + nullptr +}; + +constexpr uint32_t cexpr_lookup_auto_event_name(const char* name, uint32_t idx = 0) +{ + return !auto_event_names[idx] ? INVALID_AUTOEVENT_IDX : + cexpr_strequal_ignore_dot(name, auto_event_names[idx]) ? idx : + cexpr_lookup_auto_event_name(name, idx + 1); +} + +/* + The AUTOEVENT_IDX macro. + + Note, that there is a static_assert that checks whether + basename of the __FILE is not registered in the auto_event_names array. + If you run into this assert, add the basename to the array. + + Weird looking lambda is used to force the evaluation at the compile time. +*/ +#define AUTOEVENT_IDX []()\ +{\ + constexpr auto idx = cexpr_lookup_auto_event_name(cexpr_basename(__FILE__)); \ + static_assert(idx != INVALID_AUTOEVENT_IDX, "auto_event_names contains no entry for " __FILE__);\ + return idx; \ +}() + + +/** Allocate, trace the allocation and construct an object. +Use this macro instead of 'new' within InnoDB. +For example: instead of + Foo* f = new Foo(args); +use: + Foo* f = UT_NEW(Foo(args), mem_key_some); +Upon failure to allocate the memory, this macro may return NULL. It +will not throw exceptions. After successfull allocation the returned +pointer must be passed to UT_DELETE() when no longer needed. +@param[in] expr any expression that could follow "new" +@param[in] key performance schema memory tracing key +@return pointer to the created object or NULL */ +#define UT_NEW(expr, key) \ + /* Placement new will return NULL and not attempt to construct an + object if the passed in pointer is NULL, e.g. if allocate() has + failed to allocate memory and has returned NULL. */ \ + ::new(ut_allocator(key).allocate( \ + sizeof expr, NULL, AUTOEVENT_IDX, false, false)) expr + +/** Allocate, trace the allocation and construct an object. 
+Use this macro instead of 'new' within InnoDB and instead of UT_NEW()
+when creating a dedicated memory key is not feasible.
+For example: instead of
+	Foo*	f = new Foo(args);
+use:
+	Foo*	f = UT_NEW_NOKEY(Foo(args));
+Upon failure to allocate the memory, this macro may return NULL. It
+will not throw exceptions. After successful allocation the returned
+pointer must be passed to UT_DELETE() when no longer needed.
+@param[in]	expr	any expression that could follow "new"
+@return pointer to the created object or NULL */
+#define UT_NEW_NOKEY(expr)	UT_NEW(expr, PSI_NOT_INSTRUMENTED)
+
+/** Destroy, deallocate and trace the deallocation of an object created by
+UT_NEW() or UT_NEW_NOKEY().
+We can't instantiate ut_allocator without having the type of the object, thus
+we redirect this to a templated function. */
+#define UT_DELETE(ptr)	ut_delete(ptr)
+
+/** Destroy and account object created by UT_NEW() or UT_NEW_NOKEY().
+@param[in,out]	ptr	pointer to the object */
+template <typename T>
+inline
+void
+ut_delete(
+	T*	ptr)
+{
+	if (ptr == NULL) {
+		return;
+	}
+
+	ut_allocator<T>	allocator;
+
+	allocator.destroy(ptr);
+	allocator.deallocate(ptr);
+}
+
+/** Allocate and account 'n_elements' objects of type 'type'.
+Use this macro to allocate memory within InnoDB instead of 'new[]'.
+The returned pointer must be passed to UT_DELETE_ARRAY().
+@param[in]	type		type of objects being created
+@param[in]	n_elements	number of objects to create
+@param[in]	key		performance schema memory tracing key
+@return pointer to the first allocated object or NULL */
+#define UT_NEW_ARRAY(type, n_elements, key)				\
+	ut_allocator<type>(key).new_array(n_elements, AUTOEVENT_IDX)
+
+/** Allocate and account 'n_elements' objects of type 'type'.
+Use this macro to allocate memory within InnoDB instead of 'new[]' and
+instead of UT_NEW_ARRAY() when it is not feasible to create a dedicated key.
+@param[in]	type		type of objects being created
+@param[in]	n_elements	number of objects to create
+@return pointer to the first allocated object or NULL */
+#define UT_NEW_ARRAY_NOKEY(type, n_elements)				\
+	UT_NEW_ARRAY(type, n_elements, PSI_NOT_INSTRUMENTED)
+
+/** Destroy, deallocate and trace the deallocation of an array created by
+UT_NEW_ARRAY() or UT_NEW_ARRAY_NOKEY().
+We can't instantiate ut_allocator without having the type of the object, thus
+we redirect this to a templated function. */
+#define UT_DELETE_ARRAY(ptr)	ut_delete_array(ptr)
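+
+/* Array round trip, mirroring the single-object example above (Foo is
+again a hypothetical default-constructible type):
+
+	Foo*	arr = UT_NEW_ARRAY_NOKEY(Foo, 16);
+	if (arr != NULL) {
+		// ... use arr[0] .. arr[15] ...
+		UT_DELETE_ARRAY(arr);
+	}
+*/
+
+/** Destroy and account objects created by UT_NEW_ARRAY() or
+UT_NEW_ARRAY_NOKEY().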
+@param[in,out]	ptr	pointer to the first object in the array */
+template <typename T>
+inline
+void
+ut_delete_array(
+	T*	ptr)
+{
+	ut_allocator<T>().delete_array(ptr);
+}
+
+#define ut_malloc(n_bytes, key)		static_cast<void*>(		\
+	ut_allocator<byte>(key).allocate(				\
+		n_bytes, NULL, AUTOEVENT_IDX, false, false))
+
+#define ut_malloc_dontdump(n_bytes, key) static_cast<void*>(		\
+	ut_allocator<byte>(key).allocate_large(				\
+		n_bytes, NULL, true))
+
+#define ut_zalloc(n_bytes, key)		static_cast<void*>(		\
+	ut_allocator<byte>(key).allocate(				\
+		n_bytes, NULL, AUTOEVENT_IDX, true, false))
+
+#define ut_malloc_nokey(n_bytes)	static_cast<void*>(		\
+	ut_allocator<byte>(PSI_NOT_INSTRUMENTED).allocate(		\
+		n_bytes, NULL, AUTOEVENT_IDX, false, false))
+
+#define ut_zalloc_nokey(n_bytes)	static_cast<void*>(		\
+	ut_allocator<byte>(PSI_NOT_INSTRUMENTED).allocate(		\
+		n_bytes, NULL, AUTOEVENT_IDX, true, false))
+
+#define ut_zalloc_nokey_nofatal(n_bytes) static_cast<void*>(		\
+	ut_allocator<byte, false>(PSI_NOT_INSTRUMENTED).allocate(	\
+		n_bytes, NULL, AUTOEVENT_IDX, true, false))
+
+#define ut_realloc(ptr, n_bytes)	static_cast<void*>(		\
+	ut_allocator<byte>(PSI_NOT_INSTRUMENTED).reallocate(		\
+		ptr, n_bytes, AUTOEVENT_IDX))
+
+#define ut_free(ptr)	ut_allocator<byte>(PSI_NOT_INSTRUMENTED).deallocate( \
+	reinterpret_cast<byte*>(ptr))
+
+#else /* UNIV_PFS_MEMORY */
+
+/* Fallbacks when memory tracing is disabled at compile time. */
+
+#define UT_NEW(expr, key)		::new(std::nothrow) expr
+#define UT_NEW_NOKEY(expr)		::new(std::nothrow) expr
+#define UT_DELETE(ptr)			::delete ptr
+
+#define UT_NEW_ARRAY(type, n_elements, key)				\
+	::new(std::nothrow) type[n_elements]
+
+#define UT_NEW_ARRAY_NOKEY(type, n_elements)				\
+	::new(std::nothrow) type[n_elements]
+
+#define UT_DELETE_ARRAY(ptr)		::delete[] ptr
+
+#define ut_malloc(n_bytes, key)		::malloc(n_bytes)
+
+#define ut_zalloc(n_bytes, key)		::calloc(1, n_bytes)
+
+#define ut_malloc_nokey(n_bytes)	::malloc(n_bytes)
+
+static inline void *ut_malloc_dontdump(size_t n_bytes, ...)
+{
+	void *ptr = my_large_malloc(&n_bytes, MYF(0));
+
+	ut_dontdump(ptr, n_bytes, true);
+
+	if (ptr) {
+		os_total_large_mem_allocated += n_bytes;
+	}
+	return ptr;
+}
+
+#define ut_zalloc_nokey(n_bytes)	::calloc(1, n_bytes)
+
+#define ut_zalloc_nokey_nofatal(n_bytes) ::calloc(1, n_bytes)
+
+#define ut_realloc(ptr, n_bytes)	::realloc(ptr, n_bytes)
+
+#define ut_free(ptr)			::free(ptr)
+
+#endif /* UNIV_PFS_MEMORY */
+
+static inline void ut_free_dodump(void *ptr, size_t size)
+{
+	ut_dodump(ptr, size);
+	os_total_large_mem_allocated -= size;
+	my_large_free(ptr, size);
+}
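+
+/* Raw-block round trip; the pointer rules are the same as with
+malloc()/free(), only the accounting differs between the two build modes
+above:
+
+	void*	p = ut_malloc_nokey(128);
+	if (p != NULL) {
+		ut_free(p);
+	}
+*/
+
+#endif /* ut0new_h */
diff --git a/storage/innobase/include/ut0pool.h b/storage/innobase/include/ut0pool.h
new file mode 100644
index 00000000..aa0cfb9e
--- /dev/null
+++ b/storage/innobase/include/ut0pool.h
@@ -0,0 +1,365 @@
+/*****************************************************************************
+
+Copyright (c) 2013, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2018, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.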
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0pool.h
+Object pool.
+
+Created 2012-Feb-26 Sunny Bains
+***********************************************************************/
+
+#ifndef ut0pool_h
+#define ut0pool_h
+
+#include <vector>
+#include <queue>
+#include <functional>
+
+#include <my_global.h>
+
+/** Allocate the memory for the object in blocks. We keep the objects sorted
+on pointer so that they are closer together in case they have to be iterated
+over in a list. */
+template <typename Type, typename Factory, typename LockStrategy>
+struct Pool {
+
+	typedef Type value_type;
+
+	struct Element {
+		Pool*		m_pool;
+		value_type	m_type;
+	};
+
+	/** Constructor
+	@param size size of the memory block */
+	Pool(size_t size)
+		:
+		m_end(),
+		m_start(),
+		m_size(size),
+		m_last()
+	{
+		ut_ad(ut_is_2pow(size));
+		ut_a(size >= sizeof(Element));
+		static_assert(!(sizeof(Element) % CPU_LEVEL1_DCACHE_LINESIZE),
+			      "alignment");
+
+		m_lock_strategy.create();
+
+		ut_a(m_start == 0);
+
+		m_start = static_cast<Element*>(
+			aligned_malloc(m_size, CPU_LEVEL1_DCACHE_LINESIZE));
+		memset_aligned<CPU_LEVEL1_DCACHE_LINESIZE>(
+			m_start, 0, m_size);
+
+		m_last = m_start;
+
+		m_end = &m_start[m_size / sizeof *m_start];
+
+		/* Note: Initialise only a small subset, even though we have
+		allocated all the memory. This is required only because PFS
+		(MTR) results change if we instantiate too many mutexes up
+		front. */
+
+		init(ut_min(size_t(16), size_t(m_end - m_start)));
+
+		ut_ad(m_pqueue.size() <= size_t(m_last - m_start));
+	}
+
+	/** Destructor */
+	~Pool()
+	{
+		m_lock_strategy.destroy();
+
+		for (Element* elem = m_start; elem != m_last; ++elem) {
+
+			ut_ad(elem->m_pool == this);
+			Factory::destroy(&elem->m_type);
+		}
+
+		IF_WIN(_aligned_free,free)(m_start);
+		m_end = m_last = m_start = 0;
+		m_size = 0;
+	}
+
+	/** Get an object from the pool.
+	@return a free instance or NULL if exhausted. */
+	Type*	get()
+	{
+		Element*	elem;
+
+		m_lock_strategy.enter();
+
+		if (!m_pqueue.empty()) {
+
+			elem = m_pqueue.top();
+			m_pqueue.pop();
+
+		} else if (m_last < m_end) {
+
+			/* Initialise the remaining elements. */
+			init(size_t(m_end - m_last));
+
+			ut_ad(!m_pqueue.empty());
+
+			elem = m_pqueue.top();
+			m_pqueue.pop();
+		} else {
+			elem = NULL;
+		}
+
+		m_lock_strategy.exit();
+		return elem ? &elem->m_type : NULL;
+	}
+
+	/** Add the object to the pool.
+	@param ptr object to free */
+	static void mem_free(value_type* ptr)
+	{
+		Element*	elem;
+		byte*		p = reinterpret_cast<byte*>(ptr + 1);
+
+		elem = reinterpret_cast<Element*>(p - sizeof(*elem));
+
+		elem->m_pool->m_lock_strategy.enter();
+
+		elem->m_pool->putl(elem);
+
+		elem->m_pool->m_lock_strategy.exit();
+	}
+
+protected:
+	// Disable copying
+	Pool(const Pool&);
+	Pool& operator=(const Pool&);
+
+private:
+
+	/* We only need to compare on pointer address. */
+	typedef std::priority_queue<
+		Element*,
+		std::vector<Element*, ut_allocator<Element*> >,
+		std::greater<Element*> >	pqueue_t;
+
+	/** Release the object to the free pool
+	@param elem element to free */
+	void putl(Element* elem)
+	{
+		ut_ad(elem >= m_start && elem < m_last);
+		m_pqueue.push(elem);
+	}
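+
+	/* Typical use goes through PoolManager (below); a direct sketch,
+	assuming hypothetical policy classes ItemFactory/ItemPoolLock and a
+	payload type item_t:
+
+		Pool<item_t, ItemFactory, ItemPoolLock>	pool(65536);
+		item_t*	p = pool.get();		// NULL when exhausted
+		// ... use *p ...
+		Pool<item_t, ItemFactory, ItemPoolLock>::mem_free(p);
+	*/
+
+	/** Initialise the elements.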
+	@param n_elems Number of elements to initialise */
+	void init(size_t n_elems)
+	{
+		ut_ad(size_t(m_end - m_last) >= n_elems);
+
+		for (size_t i = 0; i < n_elems; ++i, ++m_last) {
+
+			m_last->m_pool = this;
+			Factory::init(&m_last->m_type);
+			m_pqueue.push(m_last);
+		}
+
+		ut_ad(m_last <= m_end);
+	}
+
+private:
+	/** Pointer to the last element */
+	Element*	m_end;
+
+	/** Pointer to the first element */
+	Element*	m_start;
+
+	/** Size of the block in bytes */
+	size_t		m_size;
+
+	/** Upper limit of used space */
+	Element*	m_last;
+
+	/** Priority queue ordered on the pointer address. */
+	pqueue_t	m_pqueue;
+
+	/** Lock strategy to use */
+	LockStrategy	m_lock_strategy;
+};
+
+template <typename Pool, typename LockStrategy>
+struct PoolManager {
+
+	typedef Pool PoolType;
+	typedef typename PoolType::value_type value_type;
+
+	PoolManager(size_t size)
+		:
+		m_size(size)
+	{
+		create();
+	}
+
+	~PoolManager()
+	{
+		destroy();
+
+		ut_a(m_pools.empty());
+	}
+
+	/** Get an element from one of the pools.
+	@return instance or NULL if pool is empty. */
+	value_type* get()
+	{
+		size_t		index = 0;
+		size_t		delay = 1;
+		value_type*	ptr = NULL;
+
+		do {
+			m_lock_strategy.enter();
+
+			ut_ad(!m_pools.empty());
+
+			size_t	n_pools = m_pools.size();
+
+			PoolType*	pool = m_pools[index % n_pools];
+
+			m_lock_strategy.exit();
+
+			ptr = pool->get();
+
+			if (ptr == 0 && (index / n_pools) > 2) {
+
+				if (!add_pool(n_pools)) {
+
+					ib::error() << "Failed to allocate"
+						" memory for a pool of size "
+						<< m_size << " bytes. Will"
+						" wait for " << delay
+						<< " seconds for a thread to"
+						" free a resource";
+
+					/* There is nothing much we can do
+					except crash and burn; however, let's
+					be a little optimistic and wait for
+					a resource to be freed. */
+					std::this_thread::sleep_for(
+						std::chrono::seconds(delay));
+
+					if (delay < 32) {
+						delay <<= 1;
+					}
+
+				} else {
+					delay = 1;
+				}
+			}
+
+			++index;
+
+		} while (ptr == NULL);
+
+		return(ptr);
+	}
+
+	static void mem_free(value_type* ptr)
+	{
+		PoolType::mem_free(ptr);
+	}
+
+private:
+	/** Add a new pool
+	@param n_pools Number of pools that existed when the add pool was
+	called.
+	@return true on success */
+	bool add_pool(size_t n_pools)
+	{
+		bool	added = false;
+
+		m_lock_strategy.enter();
+
+		if (n_pools < m_pools.size()) {
+			/* Some other thread already added a pool. */
+			added = true;
+		} else {
+			PoolType*	pool;
+
+			ut_ad(n_pools == m_pools.size());
+
+			pool = UT_NEW_NOKEY(PoolType(m_size));
+
+			if (pool != NULL) {
+				m_pools.push_back(pool);
+
+				ib::info() << "Number of transaction pools: "
+					<< m_pools.size();
+
+				added = true;
+			}
+		}
+
+		ut_ad(n_pools < m_pools.size() || !added);
+
+		m_lock_strategy.exit();
+
+		return(added);
+	}
+
+	/** Create the pool manager. */
+	void create()
+	{
+		ut_a(m_size > sizeof(value_type));
+		m_lock_strategy.create();
+
+		add_pool(0);
+	}
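+
+	/* A sizing sketch (type names and the byte count are illustrative
+	only; real users supply their own factory and lock policies):
+
+		typedef Pool<item_t, ItemFactory, ItemPoolLock> item_pool_t;
+
+		PoolManager<item_pool_t, ItemPoolManagerLock>	mgr(65536);
+		item_t*	p = mgr.get();	// retries, adding pools as needed
+		PoolManager<item_pool_t, ItemPoolManagerLock>::mem_free(p);
+	*/
+
+	/** Release the resources.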
*/
+	void destroy()
+	{
+		typename Pools::iterator it;
+		typename Pools::iterator end = m_pools.end();
+
+		for (it = m_pools.begin(); it != end; ++it) {
+			PoolType*	pool = *it;
+
+			UT_DELETE(pool);
+		}
+
+		m_pools.clear();
+
+		m_lock_strategy.destroy();
+	}
+private:
+	// Disable copying
+	PoolManager(const PoolManager&);
+	PoolManager& operator=(const PoolManager&);
+
+	typedef std::vector<PoolType*, ut_allocator<PoolType*> > Pools;
+
+	/** Size of each block */
+	size_t		m_size;
+
+	/** Pools managed by this manager */
+	Pools		m_pools;
+
+	/** Lock strategy to use */
+	LockStrategy	m_lock_strategy;
+};
+
+#endif /* ut0pool_h */
diff --git a/storage/innobase/include/ut0rbt.h b/storage/innobase/include/ut0rbt.h
new file mode 100644
index 00000000..38071165
--- /dev/null
+++ b/storage/innobase/include/ut0rbt.h
@@ -0,0 +1,254 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2015, Oracle and/or its affiliates. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+/******************************************************************//**
+@file include/ut0rbt.h
+Various utilities
+
+Created 2007-03-20 Sunny Bains
+*******************************************************/
+
+#ifndef INNOBASE_UT0RBT_H
+#define INNOBASE_UT0RBT_H
+
+#if !defined(IB_RBT_TESTING)
+#include "ut0mem.h"
+#else
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#define	ut_malloc	malloc
+#define	ut_free		free
+#define	ulint		unsigned long
+#define	ut_a(c)		assert(c)
+#define	ut_error	assert(0)
+#define	ibool		unsigned int
+#define	TRUE		1
+#define	FALSE		0
+#endif
+
+struct ib_rbt_node_t;
+typedef void (*ib_rbt_print_node)(const ib_rbt_node_t* node);
+typedef int (*ib_rbt_compare)(const void* p1, const void* p2);
+typedef int (*ib_rbt_arg_compare)(const void*, const void* p1, const void* p2);
+
+/** Red black tree color types */
+enum ib_rbt_color_t {
+	IB_RBT_RED,
+	IB_RBT_BLACK
+};
+
+/** Red black tree node */
+struct ib_rbt_node_t {
+	ib_rbt_color_t	color;	/* color of this node */
+
+	ib_rbt_node_t*	left;	/* points to left child */
+	ib_rbt_node_t*	right;	/* points to right child */
+	ib_rbt_node_t*	parent;	/* points to parent node */
+
+	char		value[1];	/* Data value */
+};
+
+/** Red black tree instance.*/
+struct	ib_rbt_t {
+	ib_rbt_node_t*	nil;	/* Black colored node that is
+				used as a sentinel. This is
+				pre-allocated too.*/
+
+	ib_rbt_node_t*	root;	/* Root of the tree, this is
+				pre-allocated and the first
+				data node is the left child.*/
+
+	ulint		n_nodes;	/* Total number of data nodes */
+
+	ib_rbt_compare	compare;	/* Fn. to use for comparison */
+	ib_rbt_arg_compare
+			compare_with_arg;	/* Fn.
to use for comparison + with argument */ + ulint sizeof_value; /* Sizeof the item in bytes */ + void* cmp_arg; /* Compare func argument */ +}; + +/** The result of searching for a key in the tree, this is useful for +a speedy lookup and insert if key doesn't exist.*/ +struct ib_rbt_bound_t { + const ib_rbt_node_t* + last; /* Last node visited */ + + int result; /* Result of comparing with + the last non-nil node that + was visited */ +}; + +/* Size in elements (t is an rb tree instance) */ +#define rbt_size(t) (t->n_nodes) + +/* Check whether the rb tree is empty (t is an rb tree instance) */ +#define rbt_empty(t) (rbt_size(t) == 0) + +/* Get data value (t is the data type, n is an rb tree node instance) */ +#define rbt_value(t, n) ((t*) &n->value[0]) + +/* Compare a key with the node value (t is tree, k is key, n is node)*/ +#define rbt_compare(t, k, n) (t->compare(k, n->value)) + +/**********************************************************************//** +Free an instance of a red black tree */ +void +rbt_free( +/*=====*/ + ib_rbt_t* tree); /*!< in: rb tree to free */ +/**********************************************************************//** +Create an instance of a red black tree +@return rb tree instance */ +ib_rbt_t* +rbt_create( +/*=======*/ + size_t sizeof_value, /*!< in: size in bytes */ + ib_rbt_compare compare); /*!< in: comparator */ +/**********************************************************************//** +Create an instance of a red black tree, whose comparison function takes +an argument +@return rb tree instance */ +ib_rbt_t* +rbt_create_arg_cmp( +/*===============*/ + size_t sizeof_value, /*!< in: size in bytes */ + ib_rbt_arg_compare + compare, /*!< in: comparator */ + void* cmp_arg); /*!< in: compare fn arg */ +/**********************************************************************//** +Delete a node from the red black tree, identified by key */ +ibool +rbt_delete( +/*=======*/ + /* in: TRUE on success */ + ib_rbt_t* tree, /* in: rb tree */ + const void* key); /* in: key to delete */ +/**********************************************************************//** +Remove a node from the red black tree, NOTE: This function will not delete +the node instance, THAT IS THE CALLERS RESPONSIBILITY. +@return the deleted node with the const. */ +ib_rbt_node_t* +rbt_remove_node( +/*============*/ + ib_rbt_t* tree, /*!< in: rb tree */ + const ib_rbt_node_t* + node); /*!< in: node to delete, this + is a fudge and declared const + because the caller has access + only to const nodes.*/ +/**********************************************************************//** +Add data to the red black tree, identified by key (no dups yet!) +@return inserted node */ +const ib_rbt_node_t* +rbt_insert( +/*=======*/ + ib_rbt_t* tree, /*!< in: rb tree */ + const void* key, /*!< in: key for ordering */ + const void* value); /*!< in: data that will be + copied to the node.*/ +/**********************************************************************//** +Add a new node to the tree, useful for data that is pre-sorted. 
+@return appended node */ +const ib_rbt_node_t* +rbt_add_node( +/*=========*/ + ib_rbt_t* tree, /*!< in: rb tree */ + ib_rbt_bound_t* parent, /*!< in: parent */ + const void* value); /*!< in: this value is copied + to the node */ +/**********************************************************************//** +Return the left-most data node in the tree +@return left-most node */ +const ib_rbt_node_t* +rbt_first( +/*======*/ + const ib_rbt_t* tree); /*!< in: rb tree */ +/**********************************************************************//** +Return the right-most data node in the tree +@return right-most node */ +const ib_rbt_node_t* +rbt_last( +/*=====*/ + const ib_rbt_t* tree); /*!< in: rb tree */ +/**********************************************************************//** +Return the next node from current. +@return successor node to current that is passed in. */ +const ib_rbt_node_t* +rbt_next( +/*=====*/ + const ib_rbt_t* tree, /*!< in: rb tree */ + const ib_rbt_node_t* /* in: current node */ + current); +/**********************************************************************//** +Return the previous node from current. +@return predecessor node to current that is passed in */ +const ib_rbt_node_t* +rbt_prev( +/*=====*/ + const ib_rbt_t* tree, /*!< in: rb tree */ + const ib_rbt_node_t* /* in: current node */ + current); +/**********************************************************************//** +Search for the key; a node will be returned in parent.last, whether it +was found or not. If not found, then parent.last will contain the +parent node for the possibly new key, otherwise the matching node. +@return result of last comparison */ +int +rbt_search( +/*=======*/ + const ib_rbt_t* tree, /*!< in: rb tree */ + ib_rbt_bound_t* parent, /*!< in: search bounds */ + const void* key); /*!< in: key to search */ +/**********************************************************************//** +Search for the key; a node will be returned in parent.last, whether it +was found or not. If not found, then parent.last will contain the +parent node for the possibly new key, otherwise the matching node. +@return result of last comparison */ +int +rbt_search_cmp( +/*===========*/ + const ib_rbt_t* tree, /*!< in: rb tree */ + ib_rbt_bound_t* parent, /*!< in: search bounds */ + const void* key, /*!< in: key to search */ + ib_rbt_compare compare, /*!< in: comparator */ + ib_rbt_arg_compare + arg_compare); /*!< in: fn to compare items + with argument */ +/**********************************************************************//** +Merge the records from src into dst; records that already exist in dst +are not copied. Return the number of records merged. +@return no. of recs merged */ +ulint +rbt_merge_uniq( +/*===========*/ + ib_rbt_t* dst, /*!< in: dst rb tree */ + const ib_rbt_t* src); /*!< in: src rb tree */ +#if defined UNIV_DEBUG || defined IB_RBT_TESTING +/**********************************************************************//** +Verify the integrity of the RB tree. For debugging. The check computes +the black height of the tree; 0 means the check failed. +@return TRUE if OK, FALSE if tree invalid.
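A minimal usage sketch of the API declared above; the comparator, the demo function and the int payload are invented for illustration, and the loop assumes rbt_first()/rbt_next() return NULL past the last node, which is the iteration pattern the tree's callers rely on:

	static int int_cmp(const void* p1, const void* p2)
	{
		const int a = *static_cast<const int*>(p1);
		const int b = *static_cast<const int*>(p2);
		return(a < b ? -1 : a > b ? 1 : 0);
	}

	void rbt_demo()
	{
		ib_rbt_t*	tree = rbt_create(sizeof(int), int_cmp);
		ib_rbt_bound_t	parent;

		for (int i = 0; i < 8; i++) {
			rbt_insert(tree, &i, &i); /* key and value coincide */
		}

		int key = 5;
		if (rbt_search(tree, &parent, &key) == 0) {
			int* v = rbt_value(int, parent.last); /* the match */
			(void) v;
		} else {
			/* parent.last is where rbt_add_node() would
			link the new key */
		}

		/* In-order walk, smallest key first. */
		for (const ib_rbt_node_t* n = rbt_first(tree);
		     n != NULL;
		     n = rbt_next(tree, n)) {
			/* rbt_value(int, n) yields each payload */
		}

		rbt_free(tree);
	}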
*/ +ibool +rbt_validate( +/*=========*/ + const ib_rbt_t* tree); /*!< in: tree to validate */ +#endif /* UNIV_DEBUG || IB_RBT_TESTING */ + +#endif /* INNOBASE_UT0RBT_H */ diff --git a/storage/innobase/include/ut0rnd.h b/storage/innobase/include/ut0rnd.h new file mode 100644 index 00000000..511eb21f --- /dev/null +++ b/storage/innobase/include/ut0rnd.h @@ -0,0 +1,128 @@ +/***************************************************************************** + +Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/ut0rnd.h +Random numbers and hashing + +Created 1/20/1994 Heikki Tuuri +***********************************************************************/ + +#ifndef ut0rnd_h +#define ut0rnd_h + +#include "ut0byte.h" +#include + +#ifndef UNIV_INNOCHECKSUM +/** Seed value of ut_rnd_gen() */ +extern std::atomic ut_rnd_current; + +/** @return a pseudo-random 32-bit number */ +inline uint32_t ut_rnd_gen() +{ + /* This is a Galois linear-feedback shift register. + https://en.wikipedia.org/wiki/Linear-feedback_shift_register#Galois_LFSRs + The generating primitive Galois Field polynomial is the Castagnoli + polynomial that was made popular by CRC-32C: + x^32+x^28+x^27+x^26+x^25+x^23+x^22+x^20+ + x^19+x^18+x^14+x^13+x^11+x^10+x^9+x^8+x^6+1 */ + const uint32_t crc32c= 0x1edc6f41; + + uint32_t rnd= ut_rnd_current.load(std::memory_order_relaxed); + + if (UNIV_UNLIKELY(rnd == 0)) + { + rnd= static_cast(my_interval_timer()); + if (!rnd) rnd= 1; + } + else + { + bool lsb= rnd & 1; + rnd>>= 1; + if (lsb) + rnd^= crc32c; + } + + ut_rnd_current.store(rnd, std::memory_order_relaxed); + return rnd; +} + +/** @return a random number between 0 and n-1, inclusive */ +inline ulint ut_rnd_interval(ulint n) +{ + return n > 1 ? static_cast(ut_rnd_gen() % n) : 0; +} + +/*******************************************************//** +The following function generates a hash value for a ulint integer +to a hash table of size table_size, which should be a prime or some +random number to work reliably. +@return hash value */ +UNIV_INLINE +ulint +ut_hash_ulint( +/*==========*/ + ulint key, /*!< in: value to be hashed */ + ulint table_size); /*!< in: hash table size */ +/*************************************************************//** +Folds a 64-bit integer. +@return folded value */ +UNIV_INLINE +ulint +ut_fold_ull( +/*========*/ + ib_uint64_t d) /*!< in: 64-bit integer */ + MY_ATTRIBUTE((const)); +/***********************************************************//** +Looks for a prime number slightly greater than the given argument. +The prime is chosen so that it is not near any power of 2. 
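Because the polynomial is primitive (per the comment in ut_rnd_gen() above), the register cycles through all 2^32 - 1 nonzero states before repeating; zero is a fixed point, hence the reseed from my_interval_timer(). The generator makes no statistical promises beyond that, so it suits heuristics only. A hypothetical consumer, using nothing beyond the two inline functions shown above:

	ulint pick_random_slot(ulint n_slots)
	{
		/* ut_rnd_interval() reduces ut_rnd_gen() modulo n_slots,
		so the result carries a slight modulo bias; that is fine
		for heuristic choices such as picking a victim slot. */
		return(ut_rnd_interval(n_slots));
	}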
+@return prime */ +ulint +ut_find_prime( +/*==========*/ + ulint n) /*!< in: positive number > 100 */ + MY_ATTRIBUTE((const)); + +#endif /* !UNIV_INNOCHECKSUM */ + +/*************************************************************//** +Folds a pair of ulints. +@return folded value */ +UNIV_INLINE +ulint +ut_fold_ulint_pair( +/*===============*/ + ulint n1, /*!< in: ulint */ + ulint n2) /*!< in: ulint */ + MY_ATTRIBUTE((const)); +/*************************************************************//** +Folds a binary string. +@return folded value */ +UNIV_INLINE +ulint +ut_fold_binary( +/*===========*/ + const byte* str, /*!< in: string of bytes */ + ulint len) /*!< in: length */ + MY_ATTRIBUTE((pure)); + +#include "ut0rnd.inl" + +#endif diff --git a/storage/innobase/include/ut0rnd.inl b/storage/innobase/include/ut0rnd.inl new file mode 100644 index 00000000..37da323f --- /dev/null +++ b/storage/innobase/include/ut0rnd.inl @@ -0,0 +1,128 @@ +/***************************************************************************** + +Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************************//** +@file include/ut0rnd.ic +Random numbers and hashing + +Created 5/30/1994 Heikki Tuuri +*******************************************************************/ + +#define UT_HASH_RANDOM_MASK 1463735687 +#define UT_HASH_RANDOM_MASK2 1653893711 + +#ifndef UNIV_INNOCHECKSUM + +/*******************************************************//** +The following function generates a hash value for a ulint integer +to a hash table of size table_size, which should be a prime +or some random number for the hash table to work reliably. +@return hash value */ +UNIV_INLINE +ulint +ut_hash_ulint( +/*==========*/ + ulint key, /*!< in: value to be hashed */ + ulint table_size) /*!< in: hash table size */ +{ + ut_ad(table_size); + key = key ^ UT_HASH_RANDOM_MASK2; + + return(key % table_size); +} + +/*************************************************************//** +Folds a 64-bit integer. +@return folded value */ +UNIV_INLINE +ulint +ut_fold_ull( +/*========*/ + ib_uint64_t d) /*!< in: 64-bit integer */ +{ + return(ut_fold_ulint_pair((ulint) d & ULINT32_MASK, + (ulint) (d >> 32))); +} +#endif /* !UNIV_INNOCHECKSUM */ + +/*************************************************************//** +Folds a pair of ulints. +@return folded value */ +UNIV_INLINE +ulint +ut_fold_ulint_pair( +/*===============*/ + ulint n1, /*!< in: ulint */ + ulint n2) /*!< in: ulint */ +{ + return(((((n1 ^ n2 ^ UT_HASH_RANDOM_MASK2) << 8) + n1) + ^ UT_HASH_RANDOM_MASK) + n2); +} + +/*************************************************************//** +Folds a binary string. 
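To show how the folding helpers combine, a hypothetical placement function for a hash table whose size was chosen with ut_find_prime() (the function name is invented):

	ulint cell_for_id(ib_uint64_t id, ulint n_cells)
	{
		/* n_cells would come from ut_find_prime(), since
		ut_hash_ulint() relies on a prime (or random) table
		size to spread keys reliably. */
		const ulint fold = ut_fold_ull(id);

		return(ut_hash_ulint(fold, n_cells));
	}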
+@return folded value */ +UNIV_INLINE +ulint +ut_fold_binary( +/*===========*/ + const byte* str, /*!< in: string of bytes */ + ulint len) /*!< in: length */ +{ + ulint fold = 0; + const byte* str_end = str + (len & 0xFFFFFFF8); + + ut_ad(str || !len); + + while (str < str_end) { + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + } + + switch (len & 0x7) { + case 7: + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + /* fall through */ + case 6: + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + /* fall through */ + case 5: + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + /* fall through */ + case 4: + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + /* fall through */ + case 3: + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + /* fall through */ + case 2: + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + /* fall through */ + case 1: + fold = ut_fold_ulint_pair(fold, (ulint)(*str++)); + } + + return(fold); +} diff --git a/storage/innobase/include/ut0sort.h b/storage/innobase/include/ut0sort.h new file mode 100644 index 00000000..4f1d4c04 --- /dev/null +++ b/storage/innobase/include/ut0sort.h @@ -0,0 +1,104 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/ut0sort.h +Sort utility + +Created 11/9/1995 Heikki Tuuri +***********************************************************************/ + +#ifndef ut0sort_h +#define ut0sort_h + +/* This module gives a macro definition of the body of +a standard sort function for an array of elements of any +type. The comparison function is given as a parameter to +the macro. The sort algorithm is mergesort which has logarithmic +worst case. +*/ + +/*******************************************************************//** +This macro expands to the body of a standard sort function. +The sort function uses mergesort and must be defined separately +for each type of array. +Also the comparison function has to be defined individually +for each array cell type. SORT_FUN is the sort function name. +The function takes the array to be sorted (ARR), +the array of auxiliary space (AUX_ARR) of same size, +and the low (LOW), inclusive, and high (HIGH), noninclusive, +limits for the sort interval as arguments. +CMP_FUN is the comparison function name. 
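The macro definition follows just below. As a sketch of how it is meant to be instantiated (function and comparator names are invented; each element type needs its own instantiation):

	static int ulint_cmp(ulint a, ulint b)
	{
		return(a < b ? -1 : a > b ? 1 : 0);
	}

	/* The macro expands to the entire function body, braces included.
	Sorts arr[low..high) using aux as scratch space of equal size. */
	static void
	sort_ulints(ulint* arr, ulint* aux, ulint low, ulint high)
	UT_SORT_FUNCTION_BODY(sort_ulints, arr, aux, low, high, ulint_cmp)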
It takes as arguments +two elements from the array and returns 1, if the first is bigger, +0 if equal, and -1 if the second bigger. */ + +#define UT_SORT_FUNCTION_BODY(SORT_FUN, ARR, AUX_ARR, LOW, HIGH, CMP_FUN)\ +{\ + ulint ut_sort_mid77;\ + ulint ut_sort_i77;\ + ulint ut_sort_low77;\ + ulint ut_sort_high77;\ +\ + ut_ad((LOW) < (HIGH));\ + ut_ad(ARR);\ + ut_ad(AUX_ARR);\ +\ + if ((LOW) == (HIGH) - 1) {\ + return;\ + } else if ((LOW) == (HIGH) - 2) {\ + if (CMP_FUN((ARR)[LOW], (ARR)[(HIGH) - 1]) > 0) {\ + (AUX_ARR)[LOW] = (ARR)[LOW];\ + (ARR)[LOW] = (ARR)[(HIGH) - 1];\ + (ARR)[(HIGH) - 1] = (AUX_ARR)[LOW];\ + }\ + return;\ + }\ +\ + ut_sort_mid77 = ((LOW) + (HIGH)) / 2;\ +\ + SORT_FUN((ARR), (AUX_ARR), (LOW), ut_sort_mid77);\ + SORT_FUN((ARR), (AUX_ARR), ut_sort_mid77, (HIGH));\ +\ + ut_sort_low77 = (LOW);\ + ut_sort_high77 = ut_sort_mid77;\ +\ + for (ut_sort_i77 = (LOW); ut_sort_i77 < (HIGH); ut_sort_i77++) {\ +\ + if (ut_sort_low77 >= ut_sort_mid77) {\ + (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_high77];\ + ut_sort_high77++;\ + } else if (ut_sort_high77 >= (HIGH)) {\ + (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_low77];\ + ut_sort_low77++;\ + } else if (CMP_FUN((ARR)[ut_sort_low77],\ + (ARR)[ut_sort_high77]) > 0) {\ + (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_high77];\ + ut_sort_high77++;\ + } else {\ + (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_low77];\ + ut_sort_low77++;\ + }\ + }\ +\ + memcpy((void*) ((ARR) + (LOW)), (AUX_ARR) + (LOW),\ + ((HIGH) - (LOW)) * sizeof *(ARR));\ +}\ + + +#endif + diff --git a/storage/innobase/include/ut0stage.h b/storage/innobase/include/ut0stage.h new file mode 100644 index 00000000..17fbd91b --- /dev/null +++ b/storage/innobase/include/ut0stage.h @@ -0,0 +1,499 @@ +/***************************************************************************** + +Copyright (c) 2014, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2020, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file ut/ut0stage.h +Supplementary code to performance schema stage instrumentation. + +Created Nov 12, 2014 Vasil Dimov +*******************************************************/ + +#ifndef ut0stage_h +#define ut0stage_h + +#include +#include + +#include "my_global.h" /* needed for headers from mysql/psi/ */ + +#include "mysql/psi/mysql_stage.h" /* mysql_stage_inc_work_completed */ +#include "mysql/psi/psi.h" /* HAVE_PSI_STAGE_INTERFACE, PSI_stage_progress */ + +#include "dict0mem.h" /* dict_index_t */ +#include "row0log.h" /* row_log_estimate_work() */ +#include "srv0srv.h" /* ut_stage_alter_t */ + +#ifdef HAVE_PSI_STAGE_INTERFACE + +/** Class used to report ALTER TABLE progress via performance_schema. 
+The only user of this class is the ALTER TABLE code and it calls the methods +in the following order +constructor +begin_phase_read_pk() + multiple times: + n_pk_recs_inc() // once per record read + inc() // once per page read +end_phase_read_pk() +if any new indexes are being added, for each one: + begin_phase_sort() + multiple times: + inc() // once per record sorted + begin_phase_insert() + multiple times: + inc() // once per record inserted + being_phase_log_index() + multiple times: + inc() // once per log-block applied +begin_phase_log_table() + multiple times: + inc() // once per log-block applied +begin_phase_end() +destructor + +This class knows the specifics of each phase and tries to increment the +progress in an even manner across the entire ALTER TABLE lifetime. */ +class ut_stage_alter_t { +public: + /** Constructor. + @param[in] pk primary key of the old table */ + explicit + ut_stage_alter_t( + const dict_index_t* pk) + : + m_progress(NULL), + m_pk(pk), + m_n_pk_recs(0), + m_n_pk_pages(0), + m_n_recs_processed(0), + m_cur_phase(NOT_STARTED) + { + } + + /** Destructor. */ + ~ut_stage_alter_t(); + + /** Flag an ALTER TABLE start (read primary key phase). + @param[in] n_sort_indexes number of indexes that will be sorted + during ALTER TABLE, used for estimating the total work to be done */ + void + begin_phase_read_pk( + ulint n_sort_indexes); + + /** Increment the number of records in PK (table) with 1. + This is used to get more accurate estimate about the number of + records per page which is needed because some phases work on + per-page basis while some work on per-record basis and we want + to get the progress as even as possible. */ + void + n_pk_recs_inc(); + + /** Flag either one record or one page processed, depending on the + current phase. + @param[in] inc_val flag this many units processed at once */ + void + inc( + ulint inc_val = 1); + + /** Flag the end of reading of the primary key. + Here we know the exact number of pages and records and calculate + the number of records per page and refresh the estimate. */ + void + end_phase_read_pk(); + + /** Flag the beginning of the sort phase. + @param[in] sort_multi_factor since merge sort processes + one page more than once we only update the estimate once per this + many pages processed. */ + void + begin_phase_sort( + double sort_multi_factor); + + /** Flag the beginning of the insert phase. */ + void + begin_phase_insert(); + + /** Flag the beginning of the log index phase. */ + void + begin_phase_log_index(); + + /** Flag the beginning of the log table phase. */ + void + begin_phase_log_table(); + + /** Flag the beginning of the end phase. */ + void + begin_phase_end(); + +private: + + /** Update the estimate of total work to be done. */ + void + reestimate(); + + /** Change the current phase. + @param[in] new_stage pointer to the new stage to change to */ + void + change_phase( + const PSI_stage_info* new_stage); + + /** Performance schema accounting object. */ + PSI_stage_progress* m_progress; + + /** Old table PK. Used for calculating the estimate. */ + const dict_index_t* m_pk; + + /** Number of records in the primary key (table), including delete + marked records. */ + ulint m_n_pk_recs; + + /** Number of leaf pages in the primary key. */ + ulint m_n_pk_pages; + + /** Estimated number of records per page in the primary key. */ + double m_n_recs_per_page; + + /** Number of indexes that are being added. 
*/ + ulint m_n_sort_indexes; + + /** During the sort phase, increment the counter once per this + many pages processed. This is because sort processes one page more + than once. */ + ulint m_sort_multi_factor; + + /** Number of records processed during sort & insert phases. We + need to increment the counter only once page, or once per + recs-per-page records. */ + ulint m_n_recs_processed; + + /** Current phase. */ + enum { + NOT_STARTED = 0, + READ_PK = 1, + SORT = 2, + INSERT = 3, + /* JAN: TODO: MySQL 5.7 vrs. MariaDB sql/log.h + LOG_INDEX = 5, + LOG_TABLE = 6, */ + LOG_INNODB_INDEX = 5, + LOG_INNODB_TABLE = 6, + END = 7, + } m_cur_phase; +}; + +/** Destructor. */ +inline +ut_stage_alter_t::~ut_stage_alter_t() +{ + if (m_progress == NULL) { + return; + } + + /* Set completed = estimated before we quit. */ + mysql_stage_set_work_completed( + m_progress, + mysql_stage_get_work_estimated(m_progress)); + + mysql_end_stage(); +} + +/** Flag an ALTER TABLE start (read primary key phase). +@param[in] n_sort_indexes number of indexes that will be sorted +during ALTER TABLE, used for estimating the total work to be done */ +inline +void +ut_stage_alter_t::begin_phase_read_pk( + ulint n_sort_indexes) +{ + m_n_sort_indexes = n_sort_indexes; + + m_cur_phase = READ_PK; + + m_progress = mysql_set_stage( + srv_stage_alter_table_read_pk_internal_sort.m_key); + + mysql_stage_set_work_completed(m_progress, 0); + reestimate(); +} + +/** Increment the number of records in PK (table) with 1. +This is used to get more accurate estimate about the number of +records per page which is needed because some phases work on +per-page basis while some work on per-record basis and we want +to get the progress as even as possible. */ +inline +void +ut_stage_alter_t::n_pk_recs_inc() +{ + m_n_pk_recs++; +} + +/** Flag either one record or one page processed, depending on the +current phase. */ +inline +void +ut_stage_alter_t::inc(ulint inc_val) +{ + if (m_progress == NULL) { + return; + } + + ulint multi_factor = 1; + bool should_proceed = true; + + switch (m_cur_phase) { + case NOT_STARTED: + ut_error; + case READ_PK: + m_n_pk_pages++; + ut_ad(inc_val == 1); + /* Overall the read pk phase will read all the pages from the + PK and will do work, proportional to the number of added + indexes, thus when this is called once per read page we + increment with 1 + m_n_sort_indexes */ + inc_val = 1 + m_n_sort_indexes; + break; + case SORT: + multi_factor = m_sort_multi_factor; + /* fall through */ + case INSERT: { + /* Increment the progress every nth record. During + sort and insert phases, this method is called once per + record processed. We need fractional point numbers here + because "records per page" is such a number naturally and + to avoid rounding skew we want, for example: if there are + (double) N records per page, then the work_completed + should be incremented on the inc() calls round(k*N), + for k=1,2,3... 
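As a concrete illustration (numbers invented): with N = 4.2 records
	per page and multi_factor = 1, every_nth = 4.2, so the counter
	advances when m_n_recs_processed hits round(k * 4.2) = 4, 8, 13,
	17, 21, ... -- on average once per page's worth of records, which
	keeps the per-record phases in step with the per-page read phase.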
*/ + const double every_nth = m_n_recs_per_page * + static_cast(multi_factor); + + const ulint k = static_cast( + round(static_cast(m_n_recs_processed) / + every_nth)); + + const ulint nth = static_cast( + round(static_cast(k) * every_nth)); + + should_proceed = m_n_recs_processed == nth; + + m_n_recs_processed++; + + break; + } + /* JAN: TODO: MySQL 5.7 + case LOG_INDEX: + break; + case LOG_TABLE: + break; */ + case LOG_INNODB_INDEX: + case LOG_INNODB_TABLE: + break; + case END: + break; + } + + if (should_proceed) { + mysql_stage_inc_work_completed(m_progress, inc_val); + reestimate(); + } +} + +/** Flag the end of reading of the primary key. +Here we know the exact number of pages and records and calculate +the number of records per page and refresh the estimate. */ +inline +void +ut_stage_alter_t::end_phase_read_pk() +{ + reestimate(); + + if (m_n_pk_pages == 0) { + /* The number of pages in the PK could be 0 if the tree is + empty. In this case we set m_n_recs_per_page to 1 to avoid + division by zero later. */ + m_n_recs_per_page = 1.0; + } else { + m_n_recs_per_page = std::max( + static_cast(m_n_pk_recs) + / static_cast(m_n_pk_pages), + 1.0); + } +} + +/** Flag the beginning of the sort phase. +@param[in] sort_multi_factor since merge sort processes +one page more than once we only update the estimate once per this +many pages processed. */ +inline +void +ut_stage_alter_t::begin_phase_sort( + double sort_multi_factor) +{ + if (sort_multi_factor <= 1.0) { + m_sort_multi_factor = 1; + } else { + m_sort_multi_factor = static_cast( + round(sort_multi_factor)); + } + + change_phase(&srv_stage_alter_table_merge_sort); +} + +/** Flag the beginning of the insert phase. */ +inline +void +ut_stage_alter_t::begin_phase_insert() +{ + change_phase(&srv_stage_alter_table_insert); +} + +/** Flag the beginning of the log index phase. */ +inline +void +ut_stage_alter_t::begin_phase_log_index() +{ + change_phase(&srv_stage_alter_table_log_index); +} + +/** Flag the beginning of the log table phase. */ +inline +void +ut_stage_alter_t::begin_phase_log_table() +{ + change_phase(&srv_stage_alter_table_log_table); +} + +/** Flag the beginning of the end phase. */ +inline +void +ut_stage_alter_t::begin_phase_end() +{ + change_phase(&srv_stage_alter_table_end); +} + +/** Update the estimate of total work to be done. */ +inline +void +ut_stage_alter_t::reestimate() +{ + if (m_progress == NULL) { + return; + } + + /* During the log table phase we calculate the estimate as + work done so far + log size remaining. */ + if (m_cur_phase == LOG_INNODB_TABLE) { + mysql_stage_set_work_estimated( + m_progress, + mysql_stage_get_work_completed(m_progress) + + row_log_estimate_work(m_pk)); + return; + } + + /* During the other phases we use a formula, regardless of + how much work has been done so far. */ + + /* For number of pages in the PK - if the PK has not been + read yet, use stat_n_leaf_pages (approximate), otherwise + use the exact number we gathered. */ + const ulint n_pk_pages + = m_cur_phase != READ_PK + ? 
m_n_pk_pages + : m_pk->stat_n_leaf_pages; + + ulonglong estimate __attribute__((unused)) + = n_pk_pages + * (1 /* read PK */ + + m_n_sort_indexes /* row_merge_buf_sort() inside the + read PK per created index */ + + m_n_sort_indexes * 2 /* sort & insert per created index */) + + row_log_estimate_work(m_pk); + + /* Prevent estimate < completed */ + estimate = std::max(estimate, + mysql_stage_get_work_completed(m_progress)); + + mysql_stage_set_work_estimated(m_progress, estimate); +} + +/** Change the current phase. +@param[in] new_stage pointer to the new stage to change to */ +inline +void +ut_stage_alter_t::change_phase( + const PSI_stage_info* new_stage) +{ + if (m_progress == NULL) { + return; + } + + if (new_stage == &srv_stage_alter_table_read_pk_internal_sort) { + m_cur_phase = READ_PK; + } else if (new_stage == &srv_stage_alter_table_merge_sort) { + m_cur_phase = SORT; + } else if (new_stage == &srv_stage_alter_table_insert) { + m_cur_phase = INSERT; + /* JAN: TODO: MySQL 5.7 used LOG_INDEX and LOG_TABLE */ + } else if (new_stage == &srv_stage_alter_table_log_index) { + m_cur_phase = LOG_INNODB_INDEX; + } else if (new_stage == &srv_stage_alter_table_log_table) { + m_cur_phase = LOG_INNODB_TABLE; + } else if (new_stage == &srv_stage_alter_table_end) { + m_cur_phase = END; + } else { + ut_error; + } + + const ulonglong c = mysql_stage_get_work_completed(m_progress); + const ulonglong e = mysql_stage_get_work_estimated(m_progress); + + m_progress = mysql_set_stage(new_stage->m_key); + + mysql_stage_set_work_completed(m_progress, c); + mysql_stage_set_work_estimated(m_progress, e); +} +#else /* HAVE_PSI_STAGE_INTERFACE */ + +class ut_stage_alter_t { +public: + explicit ut_stage_alter_t(const dict_index_t*) {} + + void begin_phase_read_pk(ulint) {} + + void n_pk_recs_inc() {} + + void inc() {} + void inc(ulint) {} + + void end_phase_read_pk() {} + + void begin_phase_sort(double) {} + + void begin_phase_insert() {} + + void begin_phase_log_index() {} + + void begin_phase_log_table() {} + + void begin_phase_end() {} +}; + +#endif /* HAVE_PSI_STAGE_INTERFACE */ + +#endif /* ut0stage_h */ diff --git a/storage/innobase/include/ut0ut.h b/storage/innobase/include/ut0ut.h new file mode 100644 index 00000000..fe16ce14 --- /dev/null +++ b/storage/innobase/include/ut0ut.h @@ -0,0 +1,444 @@ +/***************************************************************************** + +Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2019, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
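A compressed sketch of the call sequence prescribed by the ut_stage_alter_t documentation above (caller and variable names are invented; the per-record and per-page inc() calls are shown as comments):

	void alter_progress_demo(const dict_index_t* pk, ulint n_new_indexes)
	{
		ut_stage_alter_t	stage(pk);

		stage.begin_phase_read_pk(n_new_indexes);
		/* per record read: stage.n_pk_recs_inc();
		per page read: stage.inc(); */
		stage.end_phase_read_pk();

		stage.begin_phase_sort(2.0);	/* inc() per record sorted */
		stage.begin_phase_insert();	/* inc() per record inserted */
		stage.begin_phase_log_index();	/* inc() per log block */
		stage.begin_phase_log_table();	/* inc() per log block */
		stage.begin_phase_end();
	}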
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/ut0ut.h +Various utilities + +Created 1/20/1994 Heikki Tuuri +***********************************************************************/ + +#ifndef ut0ut_h +#define ut0ut_h + +/* Do not include univ.i because univ.i includes this. */ + +#include +#include +#include + +#ifndef UNIV_INNOCHECKSUM + +#include "db0err.h" + +#include + +#ifndef MYSQL_SERVER +#include +#endif /* MYSQL_SERVER */ + +#include + +#include + +/** Index name prefix in fast index creation, as a string constant */ +#define TEMP_INDEX_PREFIX_STR "\377" + +#define ut_max std::max +#define ut_min std::min + +/** Calculate the minimum of two pairs. +@param[out] min_hi MSB of the minimum pair +@param[out] min_lo LSB of the minimum pair +@param[in] a_hi MSB of the first pair +@param[in] a_lo LSB of the first pair +@param[in] b_hi MSB of the second pair +@param[in] b_lo LSB of the second pair */ +UNIV_INLINE +void +ut_pair_min( + ulint* min_hi, + ulint* min_lo, + ulint a_hi, + ulint a_lo, + ulint b_hi, + ulint b_lo); +/******************************************************//** +Compares two ulints. +@return 1 if a > b, 0 if a == b, -1 if a < b */ +UNIV_INLINE +int +ut_ulint_cmp( +/*=========*/ + ulint a, /*!< in: ulint */ + ulint b); /*!< in: ulint */ +/** Compare two pairs of integers. +@param[in] a_h more significant part of first pair +@param[in] a_l less significant part of first pair +@param[in] b_h more significant part of second pair +@param[in] b_l less significant part of second pair +@return comparison result of (a_h,a_l) and (b_h,b_l) +@retval -1 if (a_h,a_l) is less than (b_h,b_l) +@retval 0 if (a_h,a_l) is equal to (b_h,b_l) +@retval 1 if (a_h,a_l) is greater than (b_h,b_l) */ +UNIV_INLINE +int +ut_pair_cmp( + ulint a_h, + ulint a_l, + ulint b_h, + ulint b_l) + MY_ATTRIBUTE((warn_unused_result)); + +/*************************************************************//** +Calculates fast the remainder of n/m when m is a power of two. +@param n in: numerator +@param m in: denominator, must be a power of two +@return the remainder of n/m */ +template inline T ut_2pow_remainder(T n, T m){return n & (m - 1);} +/*************************************************************//** +Calculates the biggest multiple of m that is not bigger than n +when m is a power of two. In other words, rounds n down to m * k. +@param n in: number to round down +@param m in: alignment, must be a power of two +@return n rounded down to the biggest possible integer multiple of m */ +template inline T ut_2pow_round(T n, T m) { return n & ~(m - 1); } +/********************************************************//** +Calculates the smallest multiple of m that is not smaller than n +when m is a power of two. In other words, rounds n up to m * k. +@param n in: number to round up +@param m in: alignment, must be a power of two +@return n rounded up to the smallest possible integer multiple of m */ +#define UT_CALC_ALIGN(n, m) ((n + m - 1) & ~(m - 1)) +template inline T ut_calc_align(T n, T m) +{ return static_cast(UT_CALC_ALIGN(n, m)); } + +/*************************************************************//** +Calculates fast the 2-logarithm of a number, rounded upward to an +integer. 
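Concretely, for the power-of-two helpers above: ut_2pow_remainder(37, 16) == 5, ut_2pow_round(37, 16) == 32 and ut_calc_align(37, 16) == 48; all three depend on m being a power of two, so that m - 1 is a contiguous mask of the low bits.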
+@return logarithm in the base 2, rounded upward */ +UNIV_INLINE +ulint +ut_2_log( +/*=====*/ + ulint n); /*!< in: number */ +/*************************************************************//** +Calculates 2 to power n. +@return 2 to power n */ +UNIV_INLINE +ulint +ut_2_exp( +/*=====*/ + ulint n); /*!< in: number */ + +/**********************************************************//** +Returns the number of milliseconds since some epoch. The +value may wrap around. It should only be used for heuristic +purposes. +@return ms since epoch */ +ulint +ut_time_ms(void); +/*============*/ +#endif /* !UNIV_INNOCHECKSUM */ + +/** Determine how many bytes (groups of 8 bits) are needed to +store the given number of bits. +@param b in: bits +@return number of bytes (octets) needed to represent b */ +#define UT_BITS_IN_BYTES(b) (((b) + 7) >> 3) + +/** Determines if a number is zero or a power of two. +@param[in] n number +@return nonzero if n is zero or a power of two; zero otherwise */ +#define ut_is_2pow(n) (!((n) & ((n) - 1))) + +/** Functor that compares two C strings. Can be used as a comparator for +e.g. std::map that uses char* as keys. */ +struct ut_strcmp_functor +{ + bool operator()( + const char* a, + const char* b) const + { + return(strcmp(a, b) < 0); + } +}; + +/**********************************************************//** +Prints a timestamp to a file. */ +void +ut_print_timestamp( +/*===============*/ + FILE* file) /*!< in: file where to print */ + ATTRIBUTE_COLD __attribute__((nonnull)); + +#ifndef UNIV_INNOCHECKSUM + +/**********************************************************//** +Sprintfs a timestamp to a buffer, 13..14 chars plus terminating NUL. */ +void +ut_sprintf_timestamp( +/*=================*/ + char* buf); /*!< in: buffer where to sprintf */ + +/*************************************************************//** +Prints the contents of a memory buffer in hex and ascii. */ +void +ut_print_buf( +/*=========*/ + FILE* file, /*!< in: file where to print */ + const void* buf, /*!< in: memory buffer */ + ulint len); /*!< in: length of the buffer */ + +/*************************************************************//** +Prints the contents of a memory buffer in hex. */ +void +ut_print_buf_hex( +/*=============*/ + std::ostream& o, /*!< in/out: output stream */ + const void* buf, /*!< in: memory buffer */ + ulint len) /*!< in: length of the buffer */ + MY_ATTRIBUTE((nonnull)); +/*************************************************************//** +Prints the contents of a memory buffer in hex and ascii. */ +void +ut_print_buf( +/*=========*/ + std::ostream& o, /*!< in/out: output stream */ + const void* buf, /*!< in: memory buffer */ + ulint len) /*!< in: length of the buffer */ + MY_ATTRIBUTE((nonnull)); + +/* Forward declaration of transaction handle */ +struct trx_t; + +/** Get a fixed-length string, quoted as an SQL identifier. +If the string contains a slash '/', the string will be +output as two identifiers separated by a period (.), +as in SQL database_name.identifier. + @param [in] trx transaction (NULL=no quotes). + @param [in] name table name. + @retval String quoted as an SQL identifier. +*/ +std::string +ut_get_name( + const trx_t* trx, + const char* name); + +/**********************************************************************//** +Outputs a fixed-length string, quoted as an SQL identifier. +If the string contains a slash '/', the string will be +output as two identifiers separated by a period (.), +as in SQL database_name.identifier. 
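A short sketch of ut_strcmp_functor (defined above) in its intended role, keying a std::map by C-string contents rather than by pointer identity (assumes <map> is included; the names are invented):

	std::map<const char*, ulint, ut_strcmp_functor>	table_ids;

	table_ids["db1/t1"] = 1;
	table_ids["db1/t2"] = 2;

	/* find() compares via strcmp(), so an equal string living in a
	different buffer still locates the entry */
	ulint id = table_ids.find("db1/t2")->second;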
*/ +void +ut_print_name( +/*==========*/ + FILE* ef, /*!< in: stream */ + const trx_t* trx, /*!< in: transaction */ + const char* name); /*!< in: table name to print */ +/** Format a table name, quoted as an SQL identifier. +If the name contains a slash '/', the result will contain two +identifiers separated by a period (.), as in SQL +database_name.table_name. +@see table_name_t +@param[in] name table or index name +@param[out] formatted formatted result, will be NUL-terminated +@param[in] formatted_size size of the buffer in bytes +@return pointer to 'formatted' */ +char* +ut_format_name( + const char* name, + char* formatted, + ulint formatted_size); + +/**********************************************************************//** +Catenate files. */ +void +ut_copy_file( +/*=========*/ + FILE* dest, /*!< in: output file */ + FILE* src); /*!< in: input file to be appended to output */ + +/*************************************************************//** +Convert an error number to a human readable text message. The +returned string is static and should not be freed or modified. +@return string, describing the error */ +const char* +ut_strerr( +/*======*/ + dberr_t num); /*!< in: error number */ + +#endif /* !UNIV_INNOCHECKSUM */ + +namespace ib { + +/** This is a wrapper class, used to print any unsigned integer type +in hexadecimal format. The main purpose of this data type is to +overload the global operator<<, so that we can print the given +wrapper value in hex. */ +struct hex { + explicit hex(uintmax_t t): m_val(t) {} + const uintmax_t m_val; +}; + +/** This is an overload of the global operator<< for the user defined type +ib::hex. The unsigned value held in the ib::hex wrapper class will be printed +into the given output stream in hexadecimal format. +@param[in,out] lhs the output stream into which rhs is written. +@param[in] rhs the object to be written into lhs. +@retval reference to the output stream. */ +inline +std::ostream& +operator<<( + std::ostream& lhs, + const hex& rhs) +{ + std::ios_base::fmtflags ff = lhs.flags(); + lhs << std::showbase << std::hex << rhs.m_val; + lhs.setf(ff); + return(lhs); +} + +/** This is a wrapper class, used to print any number in IEC style */ +struct bytes_iec { + explicit bytes_iec(unsigned long long t): m_val(t) {} + double get_double() const { return static_cast(m_val); } + const unsigned long long m_val; +}; + +/** Like hex operator above, except for bytes_iec */ +std::ostream &operator<<(std::ostream &lhs, const bytes_iec &rhs); + +/** The class logger is the base class of all the error log related classes. +It contains a std::ostringstream object. The main purpose of this class is +to forward operator<< to the underlying std::ostringstream object. Do not +use this class directly, instead use one of the derived classes. */ +class logger +{ +protected: + /* This class must not be used directly */ + ATTRIBUTE_COLD ATTRIBUTE_NOINLINE logger() = default; +public: + template ATTRIBUTE_COLD ATTRIBUTE_NOINLINE + logger& operator<<(const T& rhs) + { + m_oss << rhs; + return *this; + } + + /** Handle a fixed character string in the same way as a pointer to + an unknown-length character string, to reduce object code bloat. */ + template logger& operator<<(const char (&rhs)[N]) + { return *this << static_cast(rhs); } + + /** Output an error code name */ + ATTRIBUTE_COLD logger& operator<<(dberr_t err); + + /** Append a string. 
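A one-line illustration of the ib::hex wrapper defined above (assumes <sstream>; purely illustrative):

	std::ostringstream os;
	os << ib::hex(4096);
	/* os.str() == "0x1000": operator<< prints the wrapped value
	with std::showbase | std::hex */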
+ @param buf string buffer + @param size buffer size + @return the output stream */ + ATTRIBUTE_COLD __attribute__((noinline)) + std::ostream &write(const char *buf, std::streamsize size) + { + return m_oss.write(buf, size); + } + + std::ostream &write(const byte *buf, std::streamsize size) + { return write(reinterpret_cast(buf), size); } + + std::ostringstream m_oss; +}; + +/** The class info is used to emit informational log messages. It is to be +used similar to std::cout. But the log messages will be emitted only when +the dtor is called. The preferred usage of this class is to make use of +unnamed temporaries as follows: + +info() << "The server started successfully."; + +In the above usage, the temporary object will be destroyed at the end of the +statement and hence the log message will be emitted at the end of the +statement. If a named object is created, then the log message will be emitted +only when it goes out of scope or destroyed. */ +class info : public logger { +public: + ATTRIBUTE_COLD + ~info(); +}; + +/** The class warn is used to emit warnings. Refer to the documentation of +class info for further details. */ +class warn : public logger { +public: + ATTRIBUTE_COLD + ~warn(); +}; + +/** The class error is used to emit error messages. Refer to the +documentation of class info for further details. */ +class error : public logger { +public: + ATTRIBUTE_COLD + ~error(); + /** Indicates that error::~error() was invoked. Can be used to + determine if error messages were logged during innodb code execution. + @return true if there were error messages, false otherwise. */ + static bool was_logged() { return logged; } + +private: + /** true if error::~error() was invoked, false otherwise */ + static bool logged; +}; + +/** The class fatal is used to emit an error message and stop the server +by crashing it. Use this class when MySQL server needs to be stopped +immediately. Refer to the documentation of class info for usage details. */ +class fatal : public logger { +public: + ATTRIBUTE_NORETURN + ~fatal(); +}; + +/** Emit an error message if the given predicate is true, otherwise emit a +warning message */ +class error_or_warn : public logger { +public: + ATTRIBUTE_COLD + error_or_warn(bool pred) + : m_error(pred) + {} + + ATTRIBUTE_COLD + ~error_or_warn(); +private: + const bool m_error; +}; + +/** Emit a fatal message if the given predicate is true, otherwise emit a +error message. */ +class fatal_or_error : public logger { +public: + ATTRIBUTE_COLD + fatal_or_error(bool pred) + : m_fatal(pred) + {} + + ATTRIBUTE_COLD + ~fatal_or_error(); +private: + const bool m_fatal; +}; + +} // namespace ib + +#include "ut0ut.inl" + +#endif + diff --git a/storage/innobase/include/ut0ut.inl b/storage/innobase/include/ut0ut.inl new file mode 100644 index 00000000..73feaf82 --- /dev/null +++ b/storage/innobase/include/ut0ut.inl @@ -0,0 +1,143 @@ +/***************************************************************************** + +Copyright (c) 1994, 2015, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************************//** +@file include/ut0ut.ic +Various utilities + +Created 5/30/1994 Heikki Tuuri +*******************************************************************/ + +#include + +/** Calculate the minimum of two pairs. +@param[out] min_hi MSB of the minimum pair +@param[out] min_lo LSB of the minimum pair +@param[in] a_hi MSB of the first pair +@param[in] a_lo LSB of the first pair +@param[in] b_hi MSB of the second pair +@param[in] b_lo LSB of the second pair */ +UNIV_INLINE +void +ut_pair_min( + ulint* min_hi, + ulint* min_lo, + ulint a_hi, + ulint a_lo, + ulint b_hi, + ulint b_lo) +{ + if (a_hi == b_hi) { + *min_hi = a_hi; + *min_lo = std::min(a_lo, b_lo); + } else if (a_hi < b_hi) { + *min_hi = a_hi; + *min_lo = a_lo; + } else { + *min_hi = b_hi; + *min_lo = b_lo; + } +} + +/******************************************************//** +Compares two ulints. +@return 1 if a > b, 0 if a == b, -1 if a < b */ +UNIV_INLINE +int +ut_ulint_cmp( +/*=========*/ + ulint a, /*!< in: ulint */ + ulint b) /*!< in: ulint */ +{ + if (a < b) { + return(-1); + } else if (a == b) { + return(0); + } else { + return(1); + } +} + +/** Compare two pairs of integers. +@param[in] a_h more significant part of first pair +@param[in] a_l less significant part of first pair +@param[in] b_h more significant part of second pair +@param[in] b_l less significant part of second pair +@return comparison result of (a_h,a_l) and (b_h,b_l) +@retval -1 if (a_h,a_l) is less than (b_h,b_l) +@retval 0 if (a_h,a_l) is equal to (b_h,b_l) +@retval 1 if (a_h,a_l) is greater than (b_h,b_l) */ +UNIV_INLINE +int +ut_pair_cmp( + ulint a_h, + ulint a_l, + ulint b_h, + ulint b_l) +{ + if (a_h < b_h) { + return(-1); + } + if (a_h > b_h) { + return(1); + } + return(ut_ulint_cmp(a_l, b_l)); +} + +/*************************************************************//** +Calculates fast the 2-logarithm of a number, rounded upward to an +integer. +@return logarithm in the base 2, rounded upward */ +UNIV_INLINE +ulint +ut_2_log( +/*=====*/ + ulint n) /*!< in: number != 0 */ +{ + ulint res; + + res = 0; + + ut_ad(n > 0); + + n = n - 1; + + for (;;) { + n = n / 2; + + if (n == 0) { + break; + } + + res++; + } + + return(res + 1); +} + +/*************************************************************//** +Calculates 2 to power n. +@return 2 to power n */ +UNIV_INLINE +ulint +ut_2_exp( +/*=====*/ + ulint n) /*!< in: number */ +{ + return((ulint) 1 << n); +} diff --git a/storage/innobase/include/ut0vec.h b/storage/innobase/include/ut0vec.h new file mode 100644 index 00000000..f4660f96 --- /dev/null +++ b/storage/innobase/include/ut0vec.h @@ -0,0 +1,285 @@ +/***************************************************************************** + +Copyright (c) 2006, 2016, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*******************************************************************//** +@file include/ut0vec.h +A vector of pointers to data items + +Created 4/6/2006 Osku Salerma +************************************************************************/ + +#ifndef IB_VECTOR_H +#define IB_VECTOR_H + +#include "mem0mem.h" + +struct ib_alloc_t; +struct ib_vector_t; + +typedef void* (*ib_mem_alloc_t)( + /* out: Pointer to allocated memory */ + ib_alloc_t* allocator, /* in: Pointer to allocator instance */ + ulint size); /* in: Number of bytes to allocate */ + +typedef void (*ib_mem_free_t)( + ib_alloc_t* allocator, /* in: Pointer to allocator instance */ + void* ptr); /* in: Memory to free */ + +typedef void* (*ib_mem_resize_t)( + /* out: Pointer to resized memory */ + ib_alloc_t* allocator, /* in: Pointer to allocator */ + void* ptr, /* in: Memory to resize */ + ulint old_size, /* in: Old memory size in bytes */ + ulint new_size); /* in: New size in bytes */ + +typedef int (*ib_compare_t)(const void*, const void*); + +/* An automatically resizing vector datatype with the following properties: + + -All memory allocation is done through an allocator, which is responsible for +freeing it when done with the vector. +*/ + +/* This is useful shorthand for elements of type void* */ +#define ib_vector_getp(v, n) (*(void**) ib_vector_get(v, n)) +#define ib_vector_getp_const(v, n) (*(void**) ib_vector_get_const(v, n)) + +#define ib_vector_allocator(v) (v->allocator) + +/******************************************************************** +Create a new vector with the given initial size. */ +ib_vector_t* +ib_vector_create( +/*=============*/ + /* out: vector */ + ib_alloc_t* alloc, /* in: Allocator */ + /* in: size of the data item */ + ulint sizeof_value, + ulint size); /* in: initial size */ + +/******************************************************************** +Destroy the vector. Make sure the vector owns the allocator, e.g., +the heap in the the heap allocator. */ +UNIV_INLINE +void +ib_vector_free( +/*===========*/ + ib_vector_t* vec); /* in/out: vector */ + +/******************************************************************** +Push a new element to the vector, increasing its size if necessary, +if elem is not NULL then elem is copied to the vector.*/ +UNIV_INLINE +void* +ib_vector_push( +/*===========*/ + /* out: pointer the "new" element */ + ib_vector_t* vec, /* in/out: vector */ + const void* elem); /* in: data element */ + +/******************************************************************** +Pop the last element from the vector.*/ +UNIV_INLINE +void* +ib_vector_pop( +/*==========*/ + /* out: pointer to the "new" element */ + ib_vector_t* vec); /* in/out: vector */ + +/*******************************************************************//** +Remove an element to the vector +@return pointer to the "removed" element */ +UNIV_INLINE +void* +ib_vector_remove( +/*=============*/ + ib_vector_t* vec, /*!< in: vector */ + const void* elem); /*!< in: value to remove */ + +/******************************************************************** +Get the number of elements in the vector. 
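A minimal end-to-end sketch of the mem_heap-backed usage this header is designed around (the demo name is invented; per the ib_vector_free() comment above, the vector must own its allocator's heap):

	void vec_demo()
	{
		mem_heap_t*	heap = mem_heap_create(512);
		ib_alloc_t*	alloc = ib_heap_allocator_create(heap);
		ib_vector_t*	vec = ib_vector_create(alloc,
						       sizeof(ulint), 4);

		ulint	v = 42;
		ib_vector_push(vec, &v);	/* copies the value in */

		ulint*	p = static_cast<ulint*>(ib_vector_get(vec, 0));
		ut_a(*p == 42);

		ib_vector_free(vec);	/* releases the owning heap */
	}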
*/ +UNIV_INLINE +ulint +ib_vector_size( +/*===========*/ + /* out: number of elements in vector */ + const ib_vector_t* vec); /* in: vector */ + +/******************************************************************** +Increase the size of the vector. */ +void +ib_vector_resize( +/*=============*/ + /* out: number of elements in vector */ + ib_vector_t* vec); /* in/out: vector */ + +/******************************************************************** +Test whether a vector is empty or not. +@return TRUE if empty */ +UNIV_INLINE +ibool +ib_vector_is_empty( +/*===============*/ + const ib_vector_t* vec); /*!< in: vector */ + +/****************************************************************//** +Get the n'th element. +@return n'th element */ +UNIV_INLINE +void* +ib_vector_get( +/*==========*/ + ib_vector_t* vec, /*!< in: vector */ + ulint n); /*!< in: element index to get */ + +/******************************************************************** +Const version of the get n'th element. +@return n'th element */ +UNIV_INLINE +const void* +ib_vector_get_const( +/*================*/ + const ib_vector_t* vec, /* in: vector */ + ulint n); /* in: element index to get */ +/****************************************************************//** +Get last element. The vector must not be empty. +@return last element */ +UNIV_INLINE +void* +ib_vector_get_last( +/*===============*/ + ib_vector_t* vec); /*!< in: vector */ +/****************************************************************//** +Set the n'th element. */ +UNIV_INLINE +void +ib_vector_set( +/*==========*/ + ib_vector_t* vec, /*!< in/out: vector */ + ulint n, /*!< in: element index to set */ + void* elem); /*!< in: data element */ + +/******************************************************************** +Reset the vector size to 0 elements. */ +UNIV_INLINE +void +ib_vector_reset( +/*============*/ + ib_vector_t* vec); /* in/out: vector */ + +/******************************************************************** +Get the last element of the vector. */ +UNIV_INLINE +void* +ib_vector_last( +/*===========*/ + /* out: pointer to last element */ + ib_vector_t* vec); /* in/out: vector */ + +/******************************************************************** +Get the last element of the vector. */ +UNIV_INLINE +const void* +ib_vector_last_const( +/*=================*/ + /* out: pointer to last element */ + const ib_vector_t* vec); /* in: vector */ + +/******************************************************************** +Sort the vector elements. */ +UNIV_INLINE +void +ib_vector_sort( +/*===========*/ + ib_vector_t* vec, /* in/out: vector */ + ib_compare_t compare); /* in: the comparator to use for sort */ + +/******************************************************************** +The default ib_vector_t heap free. Does nothing. */ +UNIV_INLINE +void +ib_heap_free( +/*=========*/ + ib_alloc_t* allocator, /* in: allocator */ + void* ptr); /* in: size in bytes */ + +/******************************************************************** +The default ib_vector_t heap malloc. Uses mem_heap_alloc(). */ +UNIV_INLINE +void* +ib_heap_malloc( +/*===========*/ + /* out: pointer to allocated memory */ + ib_alloc_t* allocator, /* in: allocator */ + ulint size); /* in: size in bytes */ + +/******************************************************************** +The default ib_vector_t heap resize. Since we can't resize the heap +we have to copy the elements from the old ptr to the new ptr. +Uses mem_heap_alloc(). 
*/ +UNIV_INLINE +void* +ib_heap_resize( +/*===========*/ + /* out: pointer to reallocated + memory */ + ib_alloc_t* allocator, /* in: allocator */ + void* old_ptr, /* in: pointer to memory */ + ulint old_size, /* in: old size in bytes */ + ulint new_size); /* in: new size in bytes */ + +/******************************************************************** +Create a heap allocator that uses the passed-in heap. */ +UNIV_INLINE +ib_alloc_t* +ib_heap_allocator_create( +/*=====================*/ + /* out: heap allocator instance */ + mem_heap_t* heap); /* in: heap to use */ + +/******************************************************************** +Free a heap allocator. */ +UNIV_INLINE +void +ib_heap_allocator_free( +/*===================*/ + ib_alloc_t* ib_ut_alloc); /* in: alloc instance to free */ + +/* Allocator used by ib_vector_t. */ +struct ib_alloc_t { + ib_mem_alloc_t mem_malloc; /* For allocating memory */ + ib_mem_free_t mem_release; /* For freeing memory */ + ib_mem_resize_t mem_resize; /* For resizing memory */ + void* arg; /* Currently if not NULL then it + points to the heap instance */ +}; + +/* See comment at beginning of file. */ +struct ib_vector_t { + ib_alloc_t* allocator; /* Allocator, because one size + doesn't fit all */ + void* data; /* data elements */ + ulint used; /* number of elements currently used */ + ulint total; /* number of elements allocated */ + /* Size of a data item */ + ulint sizeof_value; +}; + +#include "ut0vec.inl" + +#endif /* IB_VECTOR_H */ diff --git a/storage/innobase/include/ut0vec.inl b/storage/innobase/include/ut0vec.inl new file mode 100644 index 00000000..531f0f22 --- /dev/null +++ b/storage/innobase/include/ut0vec.inl @@ -0,0 +1,348 @@ +/***************************************************************************** + +Copyright (c) 2006, 2014, Oracle and/or its affiliates. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*******************************************************************//** +@file include/ut0vec.ic +A vector of pointers to data items + +Created 4/6/2006 Osku Salerma +************************************************************************/ + +#define IB_VEC_OFFSET(v, i) ((v)->sizeof_value * (i)) + +/******************************************************************** +The default ib_vector_t heap malloc. Uses mem_heap_alloc(). */ +UNIV_INLINE +void* +ib_heap_malloc( +/*===========*/ + ib_alloc_t* allocator, /* in: allocator */ + ulint size) /* in: size in bytes */ +{ + mem_heap_t* heap = (mem_heap_t*) allocator->arg; + + return(mem_heap_alloc(heap, size)); +} + +/******************************************************************** +The default ib_vector_t heap free. Does nothing.
*/ +UNIV_INLINE +void +ib_heap_free( +/*=========*/ + ib_alloc_t* allocator UNIV_UNUSED, /* in: allocator */ + void* ptr UNIV_UNUSED) /* in: pointer to memory (ignored) */ +{ + /* We can't free individual elements. */ +} + +/******************************************************************** +The default ib_vector_t heap resize. Since we can't resize the heap +we have to copy the elements from the old ptr to the new ptr. +We always assume new_size >= old_size, so the buffer won't overflow. +Uses mem_heap_alloc(). */ +UNIV_INLINE +void* +ib_heap_resize( +/*===========*/ + ib_alloc_t* allocator, /* in: allocator */ + void* old_ptr, /* in: pointer to memory */ + ulint old_size, /* in: old size in bytes */ + ulint new_size) /* in: new size in bytes */ +{ + void* new_ptr; + mem_heap_t* heap = (mem_heap_t*) allocator->arg; + + ut_a(new_size >= old_size); + new_ptr = mem_heap_alloc(heap, new_size); + memcpy(new_ptr, old_ptr, old_size); + + return(new_ptr); +} + +/******************************************************************** +Create a heap allocator that uses the passed in heap. */ +UNIV_INLINE +ib_alloc_t* +ib_heap_allocator_create( +/*=====================*/ + mem_heap_t* heap) /* in: heap to use */ +{ + ib_alloc_t* heap_alloc; + + heap_alloc = (ib_alloc_t*) mem_heap_alloc(heap, sizeof(*heap_alloc)); + + heap_alloc->arg = heap; + heap_alloc->mem_release = ib_heap_free; + heap_alloc->mem_malloc = ib_heap_malloc; + heap_alloc->mem_resize = ib_heap_resize; + + return(heap_alloc); +} + +/******************************************************************** +Free a heap allocator. */ +UNIV_INLINE +void +ib_heap_allocator_free( +/*===================*/ + ib_alloc_t* ib_ut_alloc) /* in: alloc instance to free */ +{ + mem_heap_free((mem_heap_t*) ib_ut_alloc->arg); +} + +/******************************************************************** +Get number of elements in vector. */ +UNIV_INLINE +ulint +ib_vector_size( +/*===========*/ + /* out: number of elements in vector */ + const ib_vector_t* vec) /* in: vector */ +{ + return(vec->used); +} + +/****************************************************************//** +Get n'th element. */ +UNIV_INLINE +void* +ib_vector_get( +/*==========*/ + ib_vector_t* vec, /*!< in: vector */ + ulint n) /*!< in: element index to get */ +{ + ut_a(n < vec->used); + + return((byte*) vec->data + IB_VEC_OFFSET(vec, n)); +} + +/******************************************************************** +Const version of the get n'th element. +@return n'th element */ +UNIV_INLINE +const void* +ib_vector_get_const( +/*================*/ + const ib_vector_t* vec, /* in: vector */ + ulint n) /* in: element index to get */ +{ + ut_a(n < vec->used); + + return((byte*) vec->data + IB_VEC_OFFSET(vec, n)); +} +/****************************************************************//** +Get last element. The vector must not be empty. +@return last element */ +UNIV_INLINE +void* +ib_vector_get_last( +/*===============*/ + ib_vector_t* vec) /*!< in: vector */ +{ + ut_a(vec->used > 0); + + return((byte*) ib_vector_get(vec, vec->used - 1)); +} + +/****************************************************************//** +Set the n'th element.
*/ +UNIV_INLINE +void +ib_vector_set( +/*==========*/ + ib_vector_t* vec, /*!< in/out: vector */ + ulint n, /*!< in: element index to set */ + void* elem) /*!< in: data element */ +{ + void* slot; + + ut_a(n < vec->used); + + slot = ((byte*) vec->data + IB_VEC_OFFSET(vec, n)); + memcpy(slot, elem, vec->sizeof_value); +} + +/******************************************************************** +Reset the vector size to 0 elements. */ +UNIV_INLINE +void +ib_vector_reset( +/*============*/ + ib_vector_t* vec) /* in/out: vector */ +{ + vec->used = 0; +} + +/******************************************************************** +Get the last element of the vector. */ +UNIV_INLINE +void* +ib_vector_last( +/*===========*/ + /* out: pointer to last element */ + ib_vector_t* vec) /* in: vector */ +{ + ut_a(ib_vector_size(vec) > 0); + + return(ib_vector_get(vec, ib_vector_size(vec) - 1)); +} + +/******************************************************************** +Get the last element of the vector. */ +UNIV_INLINE +const void* +ib_vector_last_const( +/*=================*/ + /* out: pointer to last element */ + const ib_vector_t* vec) /* in: vector */ +{ + ut_a(ib_vector_size(vec) > 0); + + return(ib_vector_get_const(vec, ib_vector_size(vec) - 1)); +} + +/****************************************************************//** +Remove the last element from the vector. +@return last vector element */ +UNIV_INLINE +void* +ib_vector_pop( +/*==========*/ + /* out: pointer to element */ + ib_vector_t* vec) /* in/out: vector */ +{ + void* elem; + + ut_a(vec->used > 0); + + elem = ib_vector_last(vec); + --vec->used; + + return(elem); +} + +/******************************************************************** +Append an element to the vector, if elem != NULL then copy the data +from elem. */ +UNIV_INLINE +void* +ib_vector_push( +/*===========*/ + /* out: pointer to the "new" element */ + ib_vector_t* vec, /* in/out: vector */ + const void* elem) /* in: element to add (can be NULL) */ +{ + void* last; + + if (vec->used >= vec->total) { + ib_vector_resize(vec); + } + + last = (byte*) vec->data + IB_VEC_OFFSET(vec, vec->used); + +#ifdef UNIV_DEBUG + memset(last, 0, vec->sizeof_value); +#endif + + if (elem) { + memcpy(last, elem, vec->sizeof_value); + } + + ++vec->used; + + return(last); +} + +/*******************************************************************//** +Remove an element from the vector +@return pointer to the "removed" element */ +UNIV_INLINE +void* +ib_vector_remove( +/*=============*/ + ib_vector_t* vec, /*!< in/out: vector */ + const void* elem) /*!< in: value to remove */ +{ + void* current = NULL; + void* next; + ulint i; + ulint old_used_count = vec->used; + + for (i = 0; i < vec->used; i++) { + current = ib_vector_get(vec, i); + + if (*(void**) current == elem) { + if (i == vec->used - 1) { + return(ib_vector_pop(vec)); + } + + next = ib_vector_get(vec, i + 1); + memmove(current, next, vec->sizeof_value + * (vec->used - i - 1)); + --vec->used; + break; + } + } + + return((old_used_count != vec->used) ? current : NULL); +} + +/******************************************************************** +Sort the vector elements. */ +UNIV_INLINE +void +ib_vector_sort( +/*===========*/ + ib_vector_t* vec, /* in/out: vector */ + ib_compare_t compare)/* in: the comparator to use for sort */ +{ + qsort(vec->data, vec->used, vec->sizeof_value, compare); +} + +/******************************************************************** +Destroy the vector. Make sure the vector owns the allocator, e.g., +the heap in the heap allocator.
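+A minimal usage sketch (illustrative only; ib_vector_create() is declared +earlier in this header, and the sizes shown are arbitrary): + + mem_heap_t* heap = mem_heap_create(1024); + ib_alloc_t* alloc = ib_heap_allocator_create(heap); + ib_vector_t* vec = ib_vector_create(alloc, sizeof(ulint), 4); + ulint val = 42; + ib_vector_push(vec, &val); <- copies the bytes of val into the vector + ib_vector_free(vec); <- releases the whole heap at once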
*/ +UNIV_INLINE +void +ib_vector_free( +/*===========*/ + ib_vector_t* vec) /* in, own: vector */ +{ + /* Currently we only support one type of allocator - heap, + when the heap is freed all the elements are freed too. */ + + /* Only the heap allocator uses the arg field. */ + ut_ad(vec->allocator->arg != NULL); + + mem_heap_free((mem_heap_t*) vec->allocator->arg); +} + +/******************************************************************** +Test whether a vector is empty or not. +@return TRUE if empty */ +UNIV_INLINE +ibool +ib_vector_is_empty( +/*===============*/ + const ib_vector_t* vec) /*!< in: vector */ +{ + return(ib_vector_size(vec) == 0); +} diff --git a/storage/innobase/include/ut0wqueue.h b/storage/innobase/include/ut0wqueue.h new file mode 100644 index 00000000..95c7a248 --- /dev/null +++ b/storage/innobase/include/ut0wqueue.h @@ -0,0 +1,86 @@ +/***************************************************************************** + +Copyright (c) 2006, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/*******************************************************************//** +@file include/ut0wqueue.h +A work queue + +Created 4/26/2006 Osku Salerma +************************************************************************/ + +/*******************************************************************//** +A Work queue. Threads can add work items to the queue and other threads can +wait for work items to be available and take them off the queue for +processing. +************************************************************************/ + +#pragma once + +#include "ut0list.h" +#include "mem0mem.h" + +// Forward declaration +struct ib_list_t; + +/** Work queue */ +struct ib_wqueue_t +{ + /** Mutex protecting everything */ + mysql_mutex_t mutex; + /** Work item list */ + ib_list_t *items; + /** ib_list_len(*items) */ + size_t length; +}; + +/****************************************************************//** +Create a new work queue. +@return work queue */ +ib_wqueue_t* +ib_wqueue_create(); +/*===============*/ + +/****************************************************************//** +Free a work queue. */ +void +ib_wqueue_free( +/*===========*/ + ib_wqueue_t* wq); /*!< in: work queue */ + +/** Add a work item to the queue. +@param[in,out] wq work queue +@param[in] item work item +@param[in,out] heap memory heap to use for allocating list node +@param[in] wq_locked work queue mutex locked */ +void +ib_wqueue_add(ib_wqueue_t* wq, void* item, mem_heap_t* heap, + bool wq_locked = false); + +/** Check if queue is empty. 
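+(Typical producer/consumer usage of this queue, as an illustrative sketch +only; item and heap come from the caller: + ib_wqueue_t* wq = ib_wqueue_create(); + ib_wqueue_add(wq, item, heap); <- producer side + void* work = ib_wqueue_nowait(wq); <- consumer side, non-blocking + ib_wqueue_free(wq); <- once the queue is empty +)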
+@param wq wait queue +@return whether the queue is empty */ +bool ib_wqueue_is_empty(ib_wqueue_t* wq); + +/******************************************************************** +Return first item on work queue or NULL if queue is empty +@return work item or NULL */ +void* +ib_wqueue_nowait( +/*=============*/ + ib_wqueue_t* wq); /*current_lock = lock; + + if (bit_no != ULINT_UNDEFINED); + else if (lock->is_table()) + bit_no= ULINT_UNDEFINED; + else + { + bit_no= lock_rec_find_set_bit(lock); + ut_ad(bit_no != ULINT_UNDEFINED); + } + + iter->bit_no= bit_no; +} + +/*******************************************************************//** +Gets the previous lock in the lock queue, returns NULL if there are no +more locks (i.e. the current lock is the first one). The iterator is +receded (if not-NULL is returned). +@return previous lock or NULL */ +const lock_t* +lock_queue_iterator_get_prev( +/*=========================*/ + lock_queue_iterator_t* iter) /*!< in/out: iterator */ +{ + lock_sys.assert_locked(*iter->current_lock); + + const lock_t *prev_lock= !iter->current_lock->is_table() + ? lock_rec_get_prev(iter->current_lock, iter->bit_no) + : UT_LIST_GET_PREV(un_member.tab_lock.locks, iter->current_lock); + + if (prev_lock) + iter->current_lock= prev_lock; + + return prev_lock; +} diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc new file mode 100644 index 00000000..df51ceb1 --- /dev/null +++ b/storage/innobase/lock/lock0lock.cc @@ -0,0 +1,6812 @@ +/***************************************************************************** + +Copyright (c) 1996, 2022, Oracle and/or its affiliates. +Copyright (c) 2014, 2023, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file lock/lock0lock.cc +The transaction lock system + +Created 5/7/1996 Heikki Tuuri +*******************************************************/ + +#define LOCK_MODULE_IMPLEMENTATION + +#include "univ.i" + +#include +#include +#include + +#include "lock0lock.h" +#include "lock0priv.h" +#include "dict0mem.h" +#include "trx0purge.h" +#include "trx0sys.h" +#include "ut0vec.h" +#include "btr0cur.h" +#include "row0sel.h" +#include "row0mysql.h" +#include "row0vers.h" +#include "pars0pars.h" +#include "srv0mon.h" +#include "que0que.h" +#include "scope.h" +#include + +#include + +#ifdef WITH_WSREP +#include +#endif /* WITH_WSREP */ + +/** The value of innodb_deadlock_detect */ +my_bool innodb_deadlock_detect; +/** The value of innodb_deadlock_report */ +ulong innodb_deadlock_report; + +#ifdef HAVE_REPLICATION +extern "C" void thd_rpl_deadlock_check(MYSQL_THD thd, MYSQL_THD other_thd); +extern "C" int thd_need_wait_reports(const MYSQL_THD thd); +extern "C" int thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd); +extern "C" int thd_deadlock_victim_preference(const MYSQL_THD thd1, const MYSQL_THD thd2); +#endif + +/** Functor for accessing the embedded node within a table lock. */ +struct TableLockGetNode +{ + ut_list_node<lock_t> &operator()(lock_t &elem) + { return(elem.un_member.tab_lock.locks); } +}; + +/** Create the hash table. +@param n the lower bound of n_cells */ +void lock_sys_t::hash_table::create(ulint n) +{ + n_cells= ut_find_prime(n); + const size_t size= MY_ALIGN(pad(n_cells) * sizeof *array, + CPU_LEVEL1_DCACHE_LINESIZE); + void *v= aligned_malloc(size, CPU_LEVEL1_DCACHE_LINESIZE); + memset_aligned<CPU_LEVEL1_DCACHE_LINESIZE>(v, 0, size); + array= static_cast<hash_cell_t*>(v); +} + +/** Resize the hash table.
+@param n the lower bound of n_cells */ +void lock_sys_t::hash_table::resize(ulint n) +{ + ut_ad(lock_sys.is_writer()); + ulint new_n_cells= ut_find_prime(n); + const size_t size= MY_ALIGN(pad(new_n_cells) * sizeof *array, + CPU_LEVEL1_DCACHE_LINESIZE); + void *v= aligned_malloc(size, CPU_LEVEL1_DCACHE_LINESIZE); + memset_aligned<CPU_LEVEL1_DCACHE_LINESIZE>(v, 0, size); + hash_cell_t *new_array= static_cast<hash_cell_t*>(v); + + for (auto i= pad(n_cells); i--; ) + { + if (lock_t *lock= static_cast<lock_t*>(array[i].node)) + { + /* all hash_latch must be vacated */ + ut_ad(i % (ELEMENTS_PER_LATCH + LATCH) >= LATCH); + do + { + ut_ad(!lock->is_table()); + hash_cell_t *c= calc_hash(lock->un_member.rec_lock.page_id.fold(), + new_n_cells) + new_array; + lock_t *next= lock->hash; + lock->hash= nullptr; + if (!c->node) + c->node= lock; + else if (!lock->is_waiting()) + { + lock->hash= static_cast<lock_t*>(c->node); + c->node= lock; + } + else + { + lock_t *next= static_cast<lock_t*>(c->node); + while (next->hash) + next= next->hash; + next->hash= lock; + } + lock= next; + } + while (lock); + } + } + + aligned_free(array); + array= new_array; + n_cells= new_n_cells; +} + +#ifdef SUX_LOCK_GENERIC +void lock_sys_t::hash_latch::wait() +{ + pthread_mutex_lock(&lock_sys.hash_mutex); + while (!write_trylock()) + pthread_cond_wait(&lock_sys.hash_cond, &lock_sys.hash_mutex); + pthread_mutex_unlock(&lock_sys.hash_mutex); +} + +void lock_sys_t::hash_latch::release() +{ + pthread_mutex_lock(&lock_sys.hash_mutex); + write_unlock(); + pthread_cond_signal(&lock_sys.hash_cond); + pthread_mutex_unlock(&lock_sys.hash_mutex); +} +#endif + +#ifdef UNIV_DEBUG +/** Assert that a lock shard is exclusively latched by this thread */ +void lock_sys_t::assert_locked(const lock_t &lock) const +{ + ut_ad(this == &lock_sys); + if (is_writer()) + return; + if (lock.is_table()) + assert_locked(*lock.un_member.tab_lock.table); + else + lock_sys.hash_get(lock.type_mode).
+ assert_locked(lock.un_member.rec_lock.page_id); +} + +/** Assert that a table lock shard is exclusively latched by this thread */ +void lock_sys_t::assert_locked(const dict_table_t &table) const +{ + ut_ad(!table.is_temporary()); + if (is_writer()) + return; + ut_ad(readers); + ut_ad(table.lock_mutex_is_owner()); +} + +/** Assert that hash cell for page is exclusively latched by this thread */ +void lock_sys_t::hash_table::assert_locked(const page_id_t id) const +{ + if (lock_sys.is_writer()) + return; + ut_ad(lock_sys.readers); + ut_ad(latch(cell_get(id.fold()))->is_locked()); +} + +/** Assert that a hash table cell is exclusively latched (by some thread) */ +void lock_sys_t::assert_locked(const hash_cell_t &cell) const +{ + if (is_writer()) + return; + ut_ad(lock_sys.readers); + ut_ad(hash_table::latch(const_cast<hash_cell_t*>(&cell))->is_locked()); +} +#endif + +LockGuard::LockGuard(lock_sys_t::hash_table &hash, page_id_t id) +{ + const auto id_fold= id.fold(); + lock_sys.rd_lock(SRW_LOCK_CALL); + cell_= hash.cell_get(id_fold); + hash.latch(cell_)->acquire(); +} + +LockMultiGuard::LockMultiGuard(lock_sys_t::hash_table &hash, + const page_id_t id1, const page_id_t id2) +{ + ut_ad(id1.space() == id2.space()); + const auto id1_fold= id1.fold(), id2_fold= id2.fold(); + lock_sys.rd_lock(SRW_LOCK_CALL); + cell1_= hash.cell_get(id1_fold); + cell2_= hash.cell_get(id2_fold); + + auto latch1= hash.latch(cell1_), latch2= hash.latch(cell2_); + if (latch1 > latch2) + std::swap(latch1, latch2); + latch1->acquire(); + if (latch1 != latch2) + latch2->acquire(); +} + +LockMultiGuard::~LockMultiGuard() +{ + auto latch1= lock_sys_t::hash_table::latch(cell1_), + latch2= lock_sys_t::hash_table::latch(cell2_); + latch1->release(); + if (latch1 != latch2) + latch2->release(); + /* Must be last, to avoid a race with lock_sys_t::hash_table::resize() */ + lock_sys.rd_unlock(); +} + +TRANSACTIONAL_TARGET +TMLockGuard::TMLockGuard(lock_sys_t::hash_table &hash, page_id_t id) +{ + const auto id_fold= id.fold(); +#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC + if (xbegin()) + { + if (lock_sys.latch.is_write_locked()) + xabort(); + cell_= hash.cell_get(id_fold); + if (hash.latch(cell_)->is_locked()) + xabort(); + elided= true; + return; + } + elided= false; +#endif + lock_sys.rd_lock(SRW_LOCK_CALL); + cell_= hash.cell_get(id_fold); + hash.latch(cell_)->acquire(); +} + +/** Pretty-print a table lock. +@param[in,out] file output stream +@param[in] lock table lock */ +static void lock_table_print(FILE* file, const lock_t* lock); + +/** Pretty-print a record lock. +@param[in,out] file output stream +@param[in] lock record lock +@param[in,out] mtr mini-transaction for accessing the record */ +static void lock_rec_print(FILE* file, const lock_t* lock, mtr_t& mtr); + +namespace Deadlock +{ + /** Whether to_check may be nonempty */ + static Atomic_relaxed<bool> to_be_checked; + /** Transactions to check for deadlock. Protected by lock_sys.wait_mutex. */ + static std::set<trx_t*> to_check; + + MY_ATTRIBUTE((nonnull, warn_unused_result)) + /** Check if a lock request results in a deadlock. + Resolve a deadlock by choosing a transaction that will be rolled back. + @param trx transaction requesting a lock + @param wait_lock the lock being requested + @return the lock that trx is or was waiting for + @retval nullptr if the lock wait was resolved + @retval -1 if trx must report DB_DEADLOCK */ + static lock_t *check_and_resolve(trx_t *trx, lock_t *wait_lock); + + /** Quickly detect a deadlock using Brent's cycle detection algorithm.
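+ (Sketch of the idea, illustrative only: the hare follows one + trx_t::lock.wait_trx edge per step, and whenever the step count reaches + a power of two the tortoise teleports to the hare's position; in a + generic form: + + Node *tortoise = start, *hare = start; + for (unsigned power = 1, l = 1; (hare = hare->next); l++) { + if (tortoise == hare) return hare; meeting point lies inside a cycle + if (l == power) { power <<= 1; l = 0; tortoise = hare; } + } + return nullptr; the walk terminated, so there is no cycle + + Any cycle in the waits-for graph eventually makes the hare lap the + tortoise, which is exactly what the function below implements.)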
+ @param trx transaction that is waiting for another transaction + @return a transaction that is part of a cycle + @retval nullptr if no cycle was found */ + inline trx_t *find_cycle(trx_t *trx) + { + mysql_mutex_assert_owner(&lock_sys.wait_mutex); + trx_t *tortoise= trx, *hare= trx; + for (unsigned power= 1, l= 1; (hare= hare->lock.wait_trx) != nullptr; l++) + { + if (tortoise == hare) + { + ut_ad(l > 1); + lock_sys.deadlocks++; + /* Note: Normally, trx should be part of any deadlock cycle + that is found. However, if innodb_deadlock_detect=OFF had been + in effect in the past, it is possible that trx will be waiting + for a transaction that participates in a pre-existing deadlock + cycle. In that case, our victim will not be trx. */ + return hare; + } + if (l == power) + { + /* The maximum concurrent number of TRX_STATE_ACTIVE transactions + is TRX_RSEG_N_SLOTS * 128, or innodb_page_size / 16 * 128 + (default: 131,072, maximum: 524,288). + Our maximum possible number of iterations should be twice that. */ + power<<= 1; + l= 0; + tortoise= hare; + } + } + return nullptr; + } +}; + +#ifdef UNIV_DEBUG +/** Validate the transactional locks. */ +static void lock_validate(); + +/** Validate the record lock queues on a page. +@param block buffer pool block +@param latched whether the tablespace latch may be held +@return true if ok */ +static bool lock_rec_validate_page(const buf_block_t *block, bool latched) + MY_ATTRIBUTE((nonnull, warn_unused_result)); +#endif /* UNIV_DEBUG */ + +/* The lock system */ +lock_sys_t lock_sys; + +/** Only created if !srv_read_only_mode. Protected by lock_sys.latch. */ +static FILE *lock_latest_err_file; + +/*********************************************************************//** +Reports that a transaction id is insensible, i.e., in the future. */ +ATTRIBUTE_COLD +void +lock_report_trx_id_insanity( +/*========================*/ + trx_id_t trx_id, /*!< in: trx id */ + const rec_t* rec, /*!< in: user record */ + dict_index_t* index, /*!< in: index */ + const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index) */ + trx_id_t max_trx_id) /*!< in: trx_sys.get_max_trx_id() */ +{ + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(!rec_is_metadata(rec, *index)); + + ib::error() + << "Transaction id " << ib::hex(trx_id) + << " associated with record" << rec_offsets_print(rec, offsets) + << " in index " << index->name + << " of table " << index->table->name + << " is greater than the global counter " << max_trx_id + << "! The table is corrupted."; +} + +/*********************************************************************//** +Checks that a transaction id is sensible, i.e., not in the future. +@return true if ok */ +bool +lock_check_trx_id_sanity( +/*=====================*/ + trx_id_t trx_id, /*!< in: trx id */ + const rec_t* rec, /*!< in: user record */ + dict_index_t* index, /*!< in: index */ + const rec_offs* offsets) /*!< in: rec_get_offsets(rec, index) */ +{ + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(!rec_is_metadata(rec, *index)); + + trx_id_t max_trx_id= trx_sys.get_max_trx_id(); + ut_ad(max_trx_id || srv_force_recovery >= SRV_FORCE_NO_UNDO_LOG_SCAN); + + if (UNIV_LIKELY(max_trx_id != 0) && UNIV_UNLIKELY(trx_id >= max_trx_id)) + { + lock_report_trx_id_insanity(trx_id, rec, index, offsets, max_trx_id); + return false; + } + return true; +} + + +/** + Creates the lock system at database start. 
+ + @param[in] n_cells number of slots in lock hash table +*/ +void lock_sys_t::create(ulint n_cells) +{ + ut_ad(this == &lock_sys); + ut_ad(!is_initialised()); + + m_initialised= true; + + latch.SRW_LOCK_INIT(lock_latch_key); +#ifdef __aarch64__ + mysql_mutex_init(lock_wait_mutex_key, &wait_mutex, MY_MUTEX_INIT_FAST); +#else + mysql_mutex_init(lock_wait_mutex_key, &wait_mutex, nullptr); +#endif +#ifdef SUX_LOCK_GENERIC + pthread_mutex_init(&hash_mutex, nullptr); + pthread_cond_init(&hash_cond, nullptr); +#endif + + rec_hash.create(n_cells); + prdt_hash.create(n_cells); + prdt_page_hash.create(n_cells); + + if (!srv_read_only_mode) + { + lock_latest_err_file= os_file_create_tmpfile(); + ut_a(lock_latest_err_file); + } +} + +#ifdef UNIV_PFS_RWLOCK +/** Acquire exclusive lock_sys.latch */ +void lock_sys_t::wr_lock(const char *file, unsigned line) +{ + mysql_mutex_assert_not_owner(&wait_mutex); + latch.wr_lock(file, line); + ut_ad(!writer.exchange(pthread_self(), std::memory_order_relaxed)); +} +/** Release exclusive lock_sys.latch */ +void lock_sys_t::wr_unlock() +{ + ut_ad(writer.exchange(0, std::memory_order_relaxed) == + pthread_self()); + latch.wr_unlock(); +} + +/** Acquire shared lock_sys.latch */ +void lock_sys_t::rd_lock(const char *file, unsigned line) +{ + mysql_mutex_assert_not_owner(&wait_mutex); + latch.rd_lock(file, line); + ut_ad(!writer.load(std::memory_order_relaxed)); + ut_d(readers.fetch_add(1, std::memory_order_relaxed)); +} + +/** Release shared lock_sys.latch */ +void lock_sys_t::rd_unlock() +{ + ut_ad(!writer.load(std::memory_order_relaxed)); + ut_ad(readers.fetch_sub(1, std::memory_order_relaxed)); + latch.rd_unlock(); +} +#endif + +/** + Resize the lock hash table. + + @param[in] n_cells number of slots in lock hash table +*/ +void lock_sys_t::resize(ulint n_cells) +{ + ut_ad(this == &lock_sys); + /* Buffer pool resizing is rarely initiated by the user, and this + would exceed the maximum size of a memory transaction. */ + LockMutexGuard g{SRW_LOCK_CALL}; + rec_hash.resize(n_cells); + prdt_hash.resize(n_cells); + prdt_page_hash.resize(n_cells); +} + +/** Closes the lock system at database shutdown. */ +void lock_sys_t::close() +{ + ut_ad(this == &lock_sys); + + if (!m_initialised) + return; + + if (lock_latest_err_file) + { + my_fclose(lock_latest_err_file, MYF(MY_WME)); + lock_latest_err_file= nullptr; + } + + rec_hash.free(); + prdt_hash.free(); + prdt_page_hash.free(); +#ifdef SUX_LOCK_GENERIC + pthread_mutex_destroy(&hash_mutex); + pthread_cond_destroy(&hash_cond); +#endif + + latch.destroy(); + mysql_mutex_destroy(&wait_mutex); + + Deadlock::to_check.clear(); + Deadlock::to_be_checked= false; + + m_initialised= false; +} + +#ifdef WITH_WSREP +# ifdef UNIV_DEBUG +/** Check if both the transaction holding the conflicting lock and the other +transaction requesting the record lock are brute force (BF). If they are, +check whether this BF-BF wait is correct and, if not, report the BF wait and +assert. + +@param[in] lock_rec other waiting record lock +@param[in] trx trx requesting conflicting record lock +*/ +static void wsrep_assert_no_bf_bf_wait(const lock_t *lock, const trx_t *trx) +{ + ut_ad(!lock->is_table()); + lock_sys.assert_locked(*lock); + trx_t* lock_trx= lock->trx; + + /* Note that we are holding lock_sys.latch, thus we should + not acquire THD::LOCK_thd_data mutex below to avoid latching + order violation.
*/ + + if (!trx->is_wsrep() || !lock_trx->is_wsrep()) + return; + if (UNIV_LIKELY(!wsrep_thd_is_BF(trx->mysql_thd, FALSE)) + || UNIV_LIKELY(!wsrep_thd_is_BF(lock_trx->mysql_thd, FALSE))) + return; + + ut_ad(trx->state == TRX_STATE_ACTIVE); + + switch (lock_trx->state) { + case TRX_STATE_COMMITTED_IN_MEMORY: + /* The state change is only protected by trx_t::mutex, + which we are not even holding here. */ + case TRX_STATE_PREPARED: + /* Wait for lock->trx to complete the commit + (or XA ROLLBACK) and to release the lock. */ + return; + case TRX_STATE_ACTIVE: + break; + default: + ut_ad("invalid state" == 0); + } + + /* If BF-BF order is honored, i.e. the trx already holding + the record lock is ordered before this new lock request, + we can keep trx waiting for the lock. If the conflicting + transaction is already aborting or rolling back for replaying, + we can also let the new transaction wait. */ + if (wsrep_thd_order_before(lock_trx->mysql_thd, trx->mysql_thd) + || wsrep_thd_is_aborting(lock_trx->mysql_thd)) { + return; + } + + mtr_t mtr; + + ib::error() << "Conflicting lock on table: " + << lock->index->table->name + << " index: " + << lock->index->name() + << " that has lock "; + lock_rec_print(stderr, lock, mtr); + + ib::error() << "WSREP state: "; + + wsrep_report_bf_lock_wait(trx->mysql_thd, + trx->id); + wsrep_report_bf_lock_wait(lock_trx->mysql_thd, + lock_trx->id); + /* BF-BF wait is a bug */ + ut_error; +} +# endif /* UNIV_DEBUG */ + +/** Check if the lock timeout was for a high-priority (BF) thread; +as a side effect, trigger the lock monitor +@param trx transaction owning the lock +@return false for regular lock timeout */ +ATTRIBUTE_NOINLINE static bool wsrep_is_BF_lock_timeout(const trx_t &trx) +{ + ut_ad(trx.is_wsrep()); + + if (trx.error_state == DB_DEADLOCK || !srv_monitor_timer || + !wsrep_thd_is_BF(trx.mysql_thd, false)) + return false; + + ib::info() << "WSREP: BF lock wait long for trx:" << ib::hex(trx.id) + << " query: " << wsrep_thd_query(trx.mysql_thd); + return true; +} +#endif /* WITH_WSREP */ + +/*********************************************************************//** +Checks if a lock request for a new lock has to wait for request lock2. +@return TRUE if new lock has to wait for lock2 to be removed */ +UNIV_INLINE +bool +lock_rec_has_to_wait( +/*=================*/ + const trx_t* trx, /*!< in: trx of new lock */ + unsigned type_mode,/*!< in: precise mode of the new lock + to set: LOCK_S or LOCK_X, possibly + ORed to LOCK_GAP or LOCK_REC_NOT_GAP, + LOCK_INSERT_INTENTION */ + const lock_t* lock2, /*!< in: another record lock; NOTE that + it is assumed that this has a lock bit + set on the same record as in the new + lock we are setting */ + bool lock_is_on_supremum) + /*!< in: TRUE if we are setting the + lock on the 'supremum' record of an + index page: we know then that the lock + request is really for a 'gap' type lock */ +{ + ut_ad(trx); + ut_ad(!lock2->is_table()); + ut_d(lock_sys.hash_get(type_mode).assert_locked( + lock2->un_member.rec_lock.page_id)); + + if (trx == lock2->trx + || lock_mode_compatible( + static_cast<lock_mode>(LOCK_MODE_MASK & type_mode), + lock2->mode())) { + return false; + } + + /* We have somewhat complex rules when gap type record locks + cause waits */ + + if ((lock_is_on_supremum || (type_mode & LOCK_GAP)) + && !(type_mode & LOCK_INSERT_INTENTION)) { + + /* Gap type locks without LOCK_INSERT_INTENTION flag + do not need to wait for anything. This is because + different users can have conflicting lock types + on gaps.
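+ + For example (illustrative SQL, assuming a secondary index on t.a): + + trx 1: SELECT * FROM t WHERE a BETWEEN 10 AND 20 FOR UPDATE; + trx 2: SELECT * FROM t WHERE a BETWEEN 10 AND 20 FOR UPDATE; + + Both transactions may end up holding X gap locks on the same gap and + neither waits; only an insert intention request (say, trx 2 inserting + a = 15 into that gap) has to wait, which the checks below implement.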
*/ + + return false; + } + + if (!(type_mode & LOCK_INSERT_INTENTION) && lock2->is_gap()) { + + /* Record lock (LOCK_ORDINARY or LOCK_REC_NOT_GAP) + does not need to wait for a gap type lock */ + + return false; + } + + if ((type_mode & LOCK_GAP) && lock2->is_record_not_gap()) { + + /* Lock on gap does not need to wait for + a LOCK_REC_NOT_GAP type lock */ + + return false; + } + + if (lock2->is_insert_intention()) { + /* No lock request needs to wait for an insert + intention lock to be removed. This is ok since our + rules allow conflicting locks on gaps. This eliminates + a spurious deadlock caused by a next-key lock waiting + for an insert intention lock; when the insert + intention lock was granted, the insert deadlocked on + the waiting next-key lock. + + Also, insert intention locks do not disturb each + other. */ + + return false; + } + +#ifdef HAVE_REPLICATION + if ((type_mode & LOCK_GAP || lock2->is_gap()) + && !thd_need_ordering_with(trx->mysql_thd, lock2->trx->mysql_thd)) { + /* If the upper server layer has already decided on the + commit order between the transaction requesting the + lock and the transaction owning the lock, we do not + need to wait for gap locks. Such ordering by the upper + server layer happens in parallel replication, where the + commit order is fixed to match the original order on the + master. + + Such gap locks are mainly needed to get serialisability + between transactions, so that they will be binlogged in + the correct order and statement-based replication will + give the correct results. Since the right order + was already determined on the master, we do not need + to enforce it again here. + + Skipping the locks is not essential for correctness, + since in case of deadlock we will just kill the later + transaction and retry it. But it can save some + unnecessary rollbacks and retries. */ + + return false; + } +#endif /* HAVE_REPLICATION */ + +#ifdef WITH_WSREP + /* The new lock request comes from a transaction using a unique key + scan, and that transaction is a wsrep high-priority (brute force) + transaction. If the conflicting transaction is also a wsrep high + priority transaction, we should avoid the lock conflict because + the ordering of these transactions is already decided and the + conflicting transaction will be replayed later. */ + if (trx->is_wsrep_UK_scan() + && wsrep_thd_is_BF(lock2->trx->mysql_thd, false)) { + return false; + } + + /* if BF-BF conflict, we have to look at write set order */ + if (trx->is_wsrep() && + (type_mode & LOCK_MODE_MASK) == LOCK_X && + (lock2->type_mode & LOCK_MODE_MASK) == LOCK_X && + wsrep_thd_order_before(trx->mysql_thd, + lock2->trx->mysql_thd)) { + return false; + } + + /* We can very well let BF wait normally here, as the other + BF will be replayed in case of conflict. For debug + builds we do additional sanity checks to catch any + unsupported BF wait. */ + ut_d(wsrep_assert_no_bf_bf_wait(lock2, trx)); +#endif /* WITH_WSREP */ + + return true; +} + +/*********************************************************************//** +Checks if a lock request lock1 has to wait for request lock2.
+@return TRUE if lock1 has to wait for lock2 to be removed */ +bool +lock_has_to_wait( +/*=============*/ + const lock_t* lock1, /*!< in: waiting lock */ + const lock_t* lock2) /*!< in: another lock; NOTE that it is + assumed that this has a lock bit set + on the same record as in lock1 if the + locks are record locks */ +{ + ut_ad(lock1 && lock2); + + if (lock1->trx == lock2->trx + || lock_mode_compatible(lock1->mode(), lock2->mode())) { + return false; + } + + if (lock1->is_table()) { + return true; + } + + ut_ad(!lock2->is_table()); + + if (lock1->type_mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE)) { + return lock_prdt_has_to_wait(lock1->trx, lock1->type_mode, + lock_get_prdt_from_lock(lock1), + lock2); + } + + return lock_rec_has_to_wait( + lock1->trx, lock1->type_mode, lock2, + lock_rec_get_nth_bit(lock1, PAGE_HEAP_NO_SUPREMUM)); +} + +/*============== RECORD LOCK BASIC FUNCTIONS ============================*/ + +/**********************************************************************//** +Looks for a set bit in a record lock bitmap. Returns ULINT_UNDEFINED +if none is found. +@return bit index == heap number of the record, or ULINT_UNDEFINED if +none found */ +ulint +lock_rec_find_set_bit( +/*==================*/ + const lock_t* lock) /*!< in: record lock with at least one bit set */ +{ + for (ulint i = 0; i < lock_rec_get_n_bits(lock); ++i) { + + if (lock_rec_get_nth_bit(lock, i)) { + + return(i); + } + } + + return(ULINT_UNDEFINED); +} + +/*********************************************************************//** +Resets the record lock bitmap to zero. NOTE: does not touch the wait_lock +pointer in the transaction! This function is used in lock object creation +and resetting. */ +static +void +lock_rec_bitmap_reset( +/*==================*/ + lock_t* lock) /*!< in: record lock */ +{ + ulint n_bytes; + + ut_ad(!lock->is_table()); + + /* Reset to zero the bitmap which resides immediately after the lock + struct */ + + n_bytes = lock_rec_get_n_bits(lock) / 8; + + ut_ad((lock_rec_get_n_bits(lock) % 8) == 0); + + memset(reinterpret_cast<void*>(&lock[1]), 0, n_bytes); +} + +/*********************************************************************//** +Copies a record lock to heap. +@return copy of lock */ +static +lock_t* +lock_rec_copy( +/*==========*/ + const lock_t* lock, /*!< in: record lock */ + mem_heap_t* heap) /*!< in: memory heap */ +{ + ulint size; + + ut_ad(!lock->is_table()); + + size = sizeof(lock_t) + lock_rec_get_n_bits(lock) / 8; + + return(static_cast<lock_t*>(mem_heap_dup(heap, lock, size))); +} + +/*********************************************************************//** +Gets the previous record lock set on a record. +@return previous lock on the same record, NULL if none exists */ +const lock_t* +lock_rec_get_prev( +/*==============*/ + const lock_t* in_lock,/*!< in: record lock */ + ulint heap_no)/*!< in: heap number of the record */ +{ + ut_ad(!in_lock->is_table()); + const page_id_t id{in_lock->un_member.rec_lock.page_id}; + hash_cell_t *cell= lock_sys.hash_get(in_lock->type_mode).cell_get(id.fold()); + + for (lock_t *lock= lock_sys_t::get_first(*cell, id); lock != in_lock; + lock= lock_rec_get_next_on_page(lock)) + if (lock_rec_get_nth_bit(lock, heap_no)) + return lock; + + return nullptr; +} + +/*============= FUNCTIONS FOR ANALYZING RECORD LOCK QUEUE ================*/ + +/*********************************************************************//** +Checks if a transaction has a GRANTED explicit lock on rec stronger than or +equal to precise_mode.
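+(For example, illustrative only: if trx holds a granted next-key LOCK_X on +the record, then a query such as + lock_rec_has_expl(LOCK_S | LOCK_REC_NOT_GAP, cell, id, heap_no, trx) +returns that lock, since LOCK_X is stronger than LOCK_S and a next-key lock +covers the record itself.)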
+@return lock or NULL */ +UNIV_INLINE +lock_t* +lock_rec_has_expl( +/*==============*/ + ulint precise_mode,/*!< in: LOCK_S or LOCK_X + possibly ORed to LOCK_GAP or + LOCK_REC_NOT_GAP, for a + supremum record we regard this + always as a gap type request */ + const hash_cell_t& cell, /*!< in: lock hash table cell */ + const page_id_t id, /*!< in: page identifier */ + ulint heap_no,/*!< in: heap number of the record */ + const trx_t* trx) /*!< in: transaction */ +{ + ut_ad((precise_mode & LOCK_MODE_MASK) == LOCK_S + || (precise_mode & LOCK_MODE_MASK) == LOCK_X); + ut_ad(!(precise_mode & LOCK_INSERT_INTENTION)); + + for (lock_t *lock= lock_sys_t::get_first(cell, id, heap_no); lock; + lock= lock_rec_get_next(heap_no, lock)) + if (lock->trx == trx && + !(lock->type_mode & (LOCK_WAIT | LOCK_INSERT_INTENTION)) && + (!((LOCK_REC_NOT_GAP | LOCK_GAP) & lock->type_mode) || + heap_no == PAGE_HEAP_NO_SUPREMUM || + ((LOCK_REC_NOT_GAP | LOCK_GAP) & precise_mode & lock->type_mode)) && + lock_mode_stronger_or_eq(lock->mode(), static_cast<lock_mode> + (precise_mode & LOCK_MODE_MASK))) + return lock; + + return nullptr; +} + +#ifdef UNIV_DEBUG +/*********************************************************************//** +Checks if some other transaction has a lock request in the queue. +@return lock or NULL */ +static +lock_t* +lock_rec_other_has_expl_req( +/*========================*/ + lock_mode mode, /*!< in: LOCK_S or LOCK_X */ + const hash_cell_t& cell, /*!< in: lock hash table cell */ + const page_id_t id, /*!< in: page identifier */ + bool wait, /*!< in: whether also waiting locks + are taken into account */ + ulint heap_no,/*!< in: heap number of the record */ + const trx_t* trx) /*!< in: transaction, or NULL if + requests by all transactions + are taken into account */ +{ + ut_ad(mode == LOCK_X || mode == LOCK_S); + + /* Only GAP lock can be on SUPREMUM, and we are not looking for + GAP lock */ + if (heap_no == PAGE_HEAP_NO_SUPREMUM) { + return(NULL); + } + + for (lock_t* lock = lock_sys_t::get_first(cell, id, heap_no); + lock; lock = lock_rec_get_next(heap_no, lock)) { + if (lock->trx != trx + && !lock->is_gap() + && (!lock->is_waiting() || wait) + && lock_mode_stronger_or_eq(lock->mode(), mode)) { + + return(lock); + } + } + + return(NULL); +} +#endif /* UNIV_DEBUG */ + +#ifdef WITH_WSREP +void lock_wait_wsrep_kill(trx_t *bf_trx, ulong thd_id, trx_id_t trx_id); + +#ifdef UNIV_DEBUG +void wsrep_report_error(const lock_t* victim_lock, const trx_t *bf_trx) +{ + // We have a conflicting BF-BF case; these threads + // should not execute concurrently + mtr_t mtr; + WSREP_ERROR("BF request is not compatible with victim"); + WSREP_ERROR("BF requesting lock: "); + lock_rec_print(stderr, bf_trx->lock.wait_lock, mtr); + WSREP_ERROR("victim holding lock: "); + lock_rec_print(stderr, victim_lock, mtr); + wsrep_assert_no_bf_bf_wait(victim_lock, bf_trx); +} +#endif /* UNIV_DEBUG */ + +/** Kill the holders of conflicting locks.
+@param trx brute-force applier transaction running in the current thread */ +ATTRIBUTE_COLD ATTRIBUTE_NOINLINE +static void lock_wait_wsrep(trx_t *trx) +{ + DBUG_ASSERT(wsrep_on(trx->mysql_thd)); + if (!wsrep_thd_is_BF(trx->mysql_thd, false)) + return; + + std::set<trx_t*> victims; + + lock_sys.wr_lock(SRW_LOCK_CALL); + mysql_mutex_lock(&lock_sys.wait_mutex); + + const lock_t *wait_lock= trx->lock.wait_lock; + if (!wait_lock) + { +func_exit: + lock_sys.wr_unlock(); + mysql_mutex_unlock(&lock_sys.wait_mutex); + return; + } + + if (wait_lock->is_table()) + { + dict_table_t *table= wait_lock->un_member.tab_lock.table; + for (lock_t *lock= UT_LIST_GET_FIRST(table->locks); lock; + lock= UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock)) + { + /* if victim has also BF status, but has earlier seqno, we have to wait */ + if (lock->trx != trx && + !(wsrep_thd_is_BF(lock->trx->mysql_thd, false) && + wsrep_thd_order_before(lock->trx->mysql_thd, trx->mysql_thd))) + { + if (wsrep_thd_is_BF(lock->trx->mysql_thd, false)) + { + // There is no need to kill victim with compatible lock + if (!lock_has_to_wait(trx->lock.wait_lock, lock)) + continue; + +#ifdef UNIV_DEBUG + wsrep_report_error(lock, trx); +#endif + } + + victims.emplace(lock->trx); + } + } + } + else + { + const page_id_t id{wait_lock->un_member.rec_lock.page_id}; + hash_cell_t &cell= *(wait_lock->type_mode & LOCK_PREDICATE + ? lock_sys.prdt_hash : lock_sys.rec_hash).cell_get + (id.fold()); + if (lock_t *lock= lock_sys_t::get_first(cell, id)) + { + const ulint heap_no= lock_rec_find_set_bit(wait_lock); + if (!lock_rec_get_nth_bit(lock, heap_no)) + lock= lock_rec_get_next(heap_no, lock); + do + { + /* if victim has also BF status, but has earlier seqno, we have to wait */ + if (lock->trx != trx && + !(wsrep_thd_is_BF(lock->trx->mysql_thd, false) && + wsrep_thd_order_before(lock->trx->mysql_thd, trx->mysql_thd))) + { + if (wsrep_thd_is_BF(lock->trx->mysql_thd, false)) + { + // There is no need to kill victim with compatible lock + if (!lock_has_to_wait(trx->lock.wait_lock, lock)) + continue; + +#ifdef UNIV_DEBUG + wsrep_report_error(lock, trx); +#endif + } + + victims.emplace(lock->trx); + } + } while ((lock= lock_rec_get_next(heap_no, lock))); + } + } + + if (victims.empty()) + goto func_exit; + + std::vector<std::pair<ulong, trx_id_t>> victim_id; + for (trx_t *v : victims) + victim_id.emplace_back(std::pair<ulong, trx_id_t> + {thd_get_thread_id(v->mysql_thd), v->id}); + + DBUG_EXECUTE_IF("sync.before_wsrep_thd_abort", + { + const char act[]= + "now SIGNAL sync.before_wsrep_thd_abort_reached " + "WAIT_FOR signal.before_wsrep_thd_abort"; + DBUG_ASSERT(!debug_sync_set_action(trx->mysql_thd, + STRING_WITH_LEN(act))); + };); + + lock_sys.wr_unlock(); + mysql_mutex_unlock(&lock_sys.wait_mutex); + + for (const auto &v : victim_id) + lock_wait_wsrep_kill(trx, v.first, v.second); +} +#endif /* WITH_WSREP */ + +/*********************************************************************//** +Checks if some other transaction has a conflicting explicit lock request +in the queue, so that we have to wait.
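+(For example, illustrative only: if another transaction holds a granted +LOCK_X | LOCK_REC_NOT_GAP on heap_no and we pass mode == LOCK_S | +LOCK_REC_NOT_GAP, this function returns that lock, and the caller then +enqueues a waiting request via lock_rec_enqueue_waiting().)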
+@param[in] mode LOCK_S or LOCK_X, possibly ORed to LOCK_GAP or LOCK_REC_NOT_GAP, +LOCK_INSERT_INTENTION +@param[in] cell lock hash table cell +@param[in] id page identifier +@param[in] heap_no heap number of the record +@param[in] trx our transaction +@return the first conflicting lock found, or NULL if there is none */ +static lock_t *lock_rec_other_has_conflicting(unsigned mode, + const hash_cell_t &cell, + const page_id_t id, + ulint heap_no, const trx_t *trx) +{ + bool is_supremum = (heap_no == PAGE_HEAP_NO_SUPREMUM); + + for (lock_t* lock = lock_sys_t::get_first(cell, id, heap_no); + lock; lock = lock_rec_get_next(heap_no, lock)) { + if (lock_rec_has_to_wait(trx, mode, lock, is_supremum)) { + return(lock); + } + } + + return(NULL); +} + +/*********************************************************************//** +Checks if some transaction has an implicit x-lock on a record in a secondary +index. +@return the transaction which has the x-lock, or NULL; +NOTE that this function can return false positives but never false +negatives. The caller must confirm all positive results by calling +trx_is_active(). */ +static +trx_t* +lock_sec_rec_some_has_impl( +/*=======================*/ + trx_t* caller_trx,/*!id == 0 in a locking read + if caller_trx has not modified any persistent tables. */ + if (!trx_sys.find_same_or_older(caller_trx, max_trx_id) || + !lock_check_trx_id_sanity(max_trx_id, rec, index, offsets)) + return nullptr; + + /* We checked above that some active (or XA PREPARE) transaction exists + that is older than PAGE_MAX_TRX_ID. That is, some transaction may be + holding an implicit lock on the record. We have to look up the + clustered index record to find if it is (or was) the case. */ + return row_vers_impl_x_locked(caller_trx, rec, index, offsets); +} + +/*********************************************************************//** +Return the number of table locks for a transaction. +The caller must be holding lock_sys.latch. */ +ulint +lock_number_of_tables_locked( +/*=========================*/ + const trx_lock_t* trx_lock) /*!< in: transaction locks */ +{ + const lock_t* lock; + ulint n_tables = 0; + + lock_sys.assert_locked(); + + for (lock = UT_LIST_GET_FIRST(trx_lock->trx_locks); + lock != NULL; + lock = UT_LIST_GET_NEXT(trx_locks, lock)) { + + if (lock->is_table()) { + n_tables++; + } + } + + return(n_tables); +} + +/*============== RECORD LOCK CREATION AND QUEUE MANAGEMENT =============*/ + +/** Reset the wait status of a lock. +@param[in,out] lock lock that was possibly being waited for */ +static void lock_reset_lock_and_trx_wait(lock_t *lock) +{ + lock_sys.assert_locked(*lock); + mysql_mutex_assert_owner(&lock_sys.wait_mutex); + trx_t *trx= lock->trx; + ut_ad(lock->is_waiting()); + ut_ad(!trx->lock.wait_lock || trx->lock.wait_lock == lock); + if (trx_t *wait_trx= trx->lock.wait_trx) + Deadlock::to_check.erase(wait_trx); + trx->lock.wait_lock= nullptr; + trx->lock.wait_trx= nullptr; + lock->type_mode&= ~LOCK_WAIT; +} + +#ifdef UNIV_DEBUG +/** Check transaction state */ +static void check_trx_state(const trx_t *trx) +{ + ut_ad(!trx->auto_commit || trx->will_lock); + const auto state= trx->state; + ut_ad(state == TRX_STATE_ACTIVE || + state == TRX_STATE_PREPARED_RECOVERED || + state == TRX_STATE_PREPARED || + state == TRX_STATE_COMMITTED_IN_MEMORY); +} +#endif + +/** Create a new record lock and insert it into the lock queue, +without checking for deadlocks or conflicts.
+@param[in] c_lock conflicting lock +@param[in] type_mode lock mode and wait flag +@param[in] page_id index page number +@param[in] page R-tree index page, or NULL +@param[in] heap_no record heap number in the index page +@param[in] index the index tree +@param[in,out] trx transaction +@param[in] holds_trx_mutex whether the caller holds trx->mutex +@return created lock */ +lock_t* +lock_rec_create_low( + lock_t* c_lock, + unsigned type_mode, + const page_id_t page_id, + const page_t* page, + ulint heap_no, + dict_index_t* index, + trx_t* trx, + bool holds_trx_mutex) +{ + lock_t* lock; + ulint n_bytes; + + ut_d(lock_sys.hash_get(type_mode).assert_locked(page_id)); + ut_ad(xtest() || holds_trx_mutex == trx->mutex_is_owner()); + ut_ad(dict_index_is_clust(index) || !dict_index_is_online_ddl(index)); + ut_ad(!(type_mode & LOCK_TABLE)); + ut_ad(trx->state != TRX_STATE_NOT_STARTED); + ut_ad(!trx->is_autocommit_non_locking()); + + /* If rec is the supremum record, then we reset the gap and + LOCK_REC_NOT_GAP bits, as all locks on the supremum are + automatically of the gap type */ + + if (UNIV_UNLIKELY(heap_no == PAGE_HEAP_NO_SUPREMUM)) { + ut_ad(!(type_mode & LOCK_REC_NOT_GAP)); + type_mode = type_mode & ~(LOCK_GAP | LOCK_REC_NOT_GAP); + } + + if (UNIV_LIKELY(!(type_mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE)))) { + n_bytes = (page_dir_get_n_heap(page) + 7) / 8; + } else { + ut_ad(heap_no == PRDT_HEAPNO); + + /* The lock is always on PAGE_HEAP_NO_INFIMUM (0), so + we only need 1 bit (which rounds up to 1 byte) for + lock bit setting */ + n_bytes = 1; + + if (type_mode & LOCK_PREDICATE) { + ulint tmp = UNIV_WORD_SIZE - 1; + + /* We will attach the predicate structure after the + lock. Make sure the memory is aligned on 8 bytes, + the mem_heap_alloc will align it with + MEM_SPACE_NEEDED anyway.
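+ Worked example (assuming a 64-bit build, so UNIV_WORD_SIZE == 8 + and tmp == 7): with n_bytes == 1 the expression below computes + (1 + sizeof(lock_prdt_t) + 7) & ~7, which equals + sizeof(lock_prdt_t) + 8 whenever sizeof(lock_prdt_t) is a + multiple of 8; the assertion that follows checks exactly this.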
*/ + n_bytes = (n_bytes + sizeof(lock_prdt_t) + tmp) & ~tmp; + ut_ad(n_bytes == sizeof(lock_prdt_t) + UNIV_WORD_SIZE); + } + } + + if (!holds_trx_mutex) { + trx->mutex_lock(); + } + ut_ad(trx->mutex_is_owner()); + ut_ad(trx->state != TRX_STATE_NOT_STARTED); + + if (trx->lock.rec_cached >= UT_ARR_SIZE(trx->lock.rec_pool) + || sizeof *lock + n_bytes > sizeof *trx->lock.rec_pool) { + lock = static_cast<lock_t*>( + mem_heap_alloc(trx->lock.lock_heap, + sizeof *lock + n_bytes)); + } else { + lock = &trx->lock.rec_pool[trx->lock.rec_cached++].lock; + } + + lock->trx = trx; + lock->type_mode = type_mode; + lock->index = index; + lock->un_member.rec_lock.page_id = page_id; + + if (UNIV_LIKELY(!(type_mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE)))) { + lock->un_member.rec_lock.n_bits = uint32_t(n_bytes * 8); + } else { + /* Predicate lock always on INFIMUM (0) */ + lock->un_member.rec_lock.n_bits = 8; + } + lock_rec_bitmap_reset(lock); + lock_rec_set_nth_bit(lock, heap_no); + index->table->n_rec_locks++; + ut_ad(index->table->get_ref_count() || !index->table->can_be_evicted); + + const auto lock_hash = &lock_sys.hash_get(type_mode); + lock_hash->cell_get(page_id.fold())->append(*lock, &lock_t::hash); + + if (type_mode & LOCK_WAIT) { + if (trx->lock.wait_trx) { + ut_ad(!c_lock || trx->lock.wait_trx == c_lock->trx); + ut_ad(trx->lock.wait_lock); + ut_ad((*trx->lock.wait_lock).trx == trx); + } else { + ut_ad(c_lock); + trx->lock.wait_trx = c_lock->trx; + ut_ad(!trx->lock.wait_lock); + } + trx->lock.wait_lock = lock; + } + UT_LIST_ADD_LAST(trx->lock.trx_locks, lock); + if (!holds_trx_mutex) { + trx->mutex_unlock(); + } + MONITOR_INC(MONITOR_RECLOCK_CREATED); + MONITOR_INC(MONITOR_NUM_RECLOCK); + + return lock; +} + +/** Enqueue a waiting request for a lock which cannot be granted immediately. +Check for deadlocks. +@param[in] type_mode the requested lock mode (LOCK_S or LOCK_X) + possibly ORed with LOCK_GAP or + LOCK_REC_NOT_GAP, ORed with + LOCK_INSERT_INTENTION if this + waiting lock request is set + when performing an insert of + an index record +@param[in] id page identifier +@param[in] page leaf page in the index +@param[in] heap_no record heap number in the block +@param[in] index index tree +@param[in,out] thr query thread +@param[in] prdt minimum bounding box (spatial index) +@retval DB_LOCK_WAIT if the waiting lock was enqueued +@retval DB_DEADLOCK if this transaction was chosen as the victim */ +dberr_t +lock_rec_enqueue_waiting( + lock_t* c_lock, + unsigned type_mode, + const page_id_t id, + const page_t* page, + ulint heap_no, + dict_index_t* index, + que_thr_t* thr, + lock_prdt_t* prdt) +{ + ut_d(lock_sys.hash_get(type_mode).assert_locked(id)); + ut_ad(!srv_read_only_mode); + ut_ad(dict_index_is_clust(index) || !dict_index_is_online_ddl(index)); + + trx_t* trx = thr_get_trx(thr); + ut_ad(xtest() || trx->mutex_is_owner()); + ut_ad(!trx->dict_operation_lock_mode); + /* Apart from Galera, only transactions that have a waiting lock can be + chosen as a deadlock victim. Only one lock can be waited for at a time, + and a transaction is associated with a single thread. That is why there + must not be waiting lock requests if the transaction is a deadlock victim + and it is not WSREP. Galera transaction abort can be invoked from MDL + acquisition code when the transaction does not have a waiting record + lock; that's why we check only the deadlock victim bit here.
*/ + ut_ad(!(trx->lock.was_chosen_as_deadlock_victim & 1)); + + if (trx->mysql_thd && thd_lock_wait_timeout(trx->mysql_thd) == 0) { + trx->error_state = DB_LOCK_WAIT_TIMEOUT; + return DB_LOCK_WAIT_TIMEOUT; + } + + /* Enqueue the lock request that will wait to be granted, note that + we already own the trx mutex. */ + lock_t* lock = lock_rec_create_low( + c_lock, + type_mode | LOCK_WAIT, id, page, heap_no, index, trx, true); + + if (prdt && type_mode & LOCK_PREDICATE) { + lock_prdt_set_prdt(lock, prdt); + } + + trx->lock.wait_thr = thr; + + DBUG_LOG("ib_lock", "trx " << ib::hex(trx->id) + << " waits for lock in index " << index->name + << " of table " << index->table->name); + + MONITOR_INC(MONITOR_LOCKREC_WAIT); + + return DB_LOCK_WAIT; +} + +/*********************************************************************//** +Looks for a suitable record lock struct of the same type by the same trx on +the same page. This can be used to save space when a new record lock should +be set on a page: no new struct is needed if a suitable old one is found. +@return lock or NULL */ +static inline +lock_t* +lock_rec_find_similar_on_page( + ulint type_mode, /*!< in: lock type_mode field */ + ulint heap_no, /*!< in: heap number of the record */ + lock_t* lock, /*!< in: lock_sys.get_first() */ + const trx_t* trx) /*!< in: transaction */ +{ + lock_sys.rec_hash.assert_locked(lock->un_member.rec_lock.page_id); + + for (/* No op */; + lock != NULL; + lock = lock_rec_get_next_on_page(lock)) { + + if (lock->trx == trx + && lock->type_mode == type_mode + && lock_rec_get_n_bits(lock) > heap_no) { + + return(lock); + } + } + + return(NULL); +} + +/*********************************************************************//** +Adds a record lock request to the record queue. The request is normally +added as the last in the queue, but if there are no waiting lock requests +on the record, and the request to be added is not a waiting request, we +can reuse a suitable record lock object already existing on the same page, +just setting the appropriate bit in its bitmap. This is a low-level function +which does NOT check for deadlocks or lock compatibility! +@param[in] type_mode lock mode, wait, gap etc. flags +@param[in,out] cell first hash table cell +@param[in] id page identifier +@param[in] page buffer block containing the record +@param[in] heap_no heap number of the record +@param[in] index index of record +@param[in,out] trx transaction +@param[in] caller_owns_trx_mutex TRUE if caller owns the transaction mutex */ +TRANSACTIONAL_TARGET +static void lock_rec_add_to_queue(unsigned type_mode, const hash_cell_t &cell, + const page_id_t id, const page_t *page, + ulint heap_no, dict_index_t *index, + trx_t *trx, bool caller_owns_trx_mutex) +{ + ut_d(lock_sys.hash_get(type_mode).assert_locked(id)); + ut_ad(xtest() || caller_owns_trx_mutex == trx->mutex_is_owner()); + ut_ad(index->is_primary() + || dict_index_get_online_status(index) != ONLINE_INDEX_CREATION); + ut_ad(!(type_mode & LOCK_TABLE)); +#ifdef UNIV_DEBUG + switch (type_mode & LOCK_MODE_MASK) { + case LOCK_X: + case LOCK_S: + break; + default: + ut_error; + } + + if (!(type_mode & (LOCK_WAIT | LOCK_GAP))) { + lock_mode mode = (type_mode & LOCK_MODE_MASK) == LOCK_S + ? LOCK_X + : LOCK_S; + const lock_t* other_lock + = lock_rec_other_has_expl_req( + mode, cell, id, false, heap_no, trx); +#ifdef WITH_WSREP + if (UNIV_LIKELY_NULL(other_lock) && trx->is_wsrep()) { + /* Only a BF transaction may be granted a lock + ahead of other conflicting lock requests.
*/ + if (!wsrep_thd_is_BF(trx->mysql_thd, FALSE) + && !wsrep_thd_is_BF(other_lock->trx->mysql_thd, FALSE)) { + /* If it is not BF, this case is a bug. */ + wsrep_report_bf_lock_wait(trx->mysql_thd, trx->id); + wsrep_report_bf_lock_wait(other_lock->trx->mysql_thd, other_lock->trx->id); + ut_error; + } + } else +#endif /* WITH_WSREP */ + ut_ad(!other_lock); + } +#endif /* UNIV_DEBUG */ + + /* If rec is the supremum record, then we can reset the gap bit, as + all locks on the supremum are automatically of the gap type, and we + try to avoid unnecessary memory consumption of a new record lock + struct for a gap type lock */ + + if (heap_no == PAGE_HEAP_NO_SUPREMUM) { + ut_ad(!(type_mode & LOCK_REC_NOT_GAP)); + + /* There should never be LOCK_REC_NOT_GAP on a supremum + record, but let us play safe */ + + type_mode &= ~(LOCK_GAP | LOCK_REC_NOT_GAP); + } + + if (type_mode & LOCK_WAIT) { + goto create; + } else if (lock_t *first_lock = lock_sys_t::get_first(cell, id)) { + for (lock_t* lock = first_lock;;) { + if (lock->is_waiting() + && lock_rec_get_nth_bit(lock, heap_no)) { + goto create; + } + if (!(lock = lock_rec_get_next_on_page(lock))) { + break; + } + } + + /* Look for a similar record lock on the same page: + if one is found and there are no waiting lock requests, + we can just set the bit */ + if (lock_t* lock = lock_rec_find_similar_on_page( + type_mode, heap_no, first_lock, trx)) { + trx_t* lock_trx = lock->trx; + if (caller_owns_trx_mutex) { + trx->mutex_unlock(); + } + { + TMTrxGuard tg{*lock_trx}; + lock_rec_set_nth_bit(lock, heap_no); + } + + if (caller_owns_trx_mutex) { + trx->mutex_lock(); + } + return; + } + } + +create: + /* Note: We will not pass any conflicting lock to lock_rec_create(), + because we should be moving an existing waiting lock request. */ + ut_ad(!(type_mode & LOCK_WAIT) || trx->lock.wait_trx); + + lock_rec_create_low(nullptr, + type_mode, id, page, heap_no, index, trx, + caller_owns_trx_mutex); +} + +/** A helper function for lock_rec_lock(), which grants a Next Key Lock +(either LOCK_X or LOCK_S as specified by `mode`) on <`block`,`heap_no`> in the +`index` to the `trx`, assuming that it already has a granted `held_lock`, which +is at least as strong as mode|LOCK_REC_NOT_GAP. It does so by either reusing the +lock if it already covers the gap, or by ensuring a separate GAP Lock, which in +combination with the Record Lock satisfies the request. +@param[in] held_lock a lock granted to `trx` which is at least as strong + as mode|LOCK_REC_NOT_GAP +@param[in] mode requested lock mode: LOCK_X or LOCK_S +@param[in] cell lock hash table cell +@param[in] id page identifier +@param[in] page buffer block containing the record +@param[in] heap_no heap number of the record to be locked +@param[in] index index of record to be locked +@param[in] trx the transaction requesting the Next Key Lock */ +static void lock_reuse_for_next_key_lock(const lock_t *held_lock, + unsigned mode, + const hash_cell_t &cell, + const page_id_t id, + const page_t *page, ulint heap_no, + dict_index_t *index, trx_t *trx) +{ + ut_ad(trx->mutex_is_owner()); + ut_ad(mode == LOCK_S || mode == LOCK_X); + ut_ad(lock_mode_is_next_key_lock(mode)); + + if (!held_lock->is_record_not_gap()) + { + ut_ad(held_lock->is_next_key_lock()); + return; + } + + /* We have a Record Lock granted, so we only need a GAP Lock. We assume + that GAP Locks do not conflict with anything.
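+ (In lock_rec_has_to_wait() terms, illustratively: any request with + LOCK_GAP set and LOCK_INSERT_INTENTION clear is granted without waiting, + regardless of what else is queued on the gap.)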
Therefore a GAP Lock + could be granted to us right now had we requested it: */ + mode|= LOCK_GAP; + ut_ad(nullptr == + lock_rec_other_has_conflicting(mode, cell, id, heap_no, trx)); + + /* It might be the case we already have one, so we first check that. */ + if (lock_rec_has_expl(mode, cell, id, heap_no, trx) == nullptr) + lock_rec_add_to_queue(mode, cell, id, page, heap_no, index, trx, true); +} + + +/*********************************************************************//** +Tries to lock the specified record in the mode requested. If not immediately +possible, enqueues a waiting lock request. This is a low-level function +which does NOT look at implicit locks! Checks lock compatibility within +explicit locks. This function sets a normal next-key lock, or in the case +of a page supremum record, a gap type lock. +@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, or DB_DEADLOCK */ +static +dberr_t +lock_rec_lock( +/*==========*/ + bool impl, /*!< in: if true, no lock is set + if no wait is necessary: we + assume that the caller will + set an implicit lock */ + unsigned mode, /*!< in: lock mode: LOCK_X or + LOCK_S possibly ORed to either + LOCK_GAP or LOCK_REC_NOT_GAP */ + const buf_block_t* block, /*!< in: buffer block containing + the record */ + ulint heap_no,/*!< in: heap number of record */ + dict_index_t* index, /*!< in: index of record */ + que_thr_t* thr) /*!< in: query thread */ +{ + trx_t *trx= thr_get_trx(thr); + /* There must not be lock requests for reads or updates if the transaction + was chosen as a deadlock victim. Apart from Galera, only transactions that + have a waiting lock may be chosen as deadlock victims. Only one lock can be + waited for at a time, and a transaction is associated with a single thread. + A Galera transaction abort can be invoked from MDL acquisition code when the + transaction does not have a waiting lock; that is why we check only the + deadlock victim bit here. */ + ut_ad(!(trx->lock.was_chosen_as_deadlock_victim & 1)); + ut_ad(!srv_read_only_mode); + ut_ad(((LOCK_MODE_MASK | LOCK_TABLE) & mode) == LOCK_S || + ((LOCK_MODE_MASK | LOCK_TABLE) & mode) == LOCK_X); + ut_ad(~mode & (LOCK_GAP | LOCK_REC_NOT_GAP)); + ut_ad(dict_index_is_clust(index) || !dict_index_is_online_ddl(index)); + DBUG_EXECUTE_IF("innodb_report_deadlock", return DB_DEADLOCK;); +#ifdef ENABLED_DEBUG_SYNC + if (trx->mysql_thd) + DEBUG_SYNC_C("lock_rec"); +#endif + + ut_ad((LOCK_MODE_MASK & mode) != LOCK_S || + lock_table_has(trx, index->table, LOCK_IS)); + ut_ad((LOCK_MODE_MASK & mode) != LOCK_X || + lock_table_has(trx, index->table, LOCK_IX)); + + if (lock_table_has(trx, index->table, + static_cast<lock_mode>(LOCK_MODE_MASK & mode))) + return DB_SUCCESS; + + /* During CREATE TABLE, we will write to newly created FTS_*_CONFIG + on which no lock has been created yet. */ + ut_ad(!trx->dict_operation_lock_mode || + (strstr(index->table->name.m_name, "/FTS_") && + strstr(index->table->name.m_name, "_CONFIG") + sizeof("_CONFIG") == + index->table->name.m_name + strlen(index->table->name.m_name) + 1)); + MONITOR_ATOMIC_INC(MONITOR_NUM_RECLOCK_REQ); + const page_id_t id{block->page.id()}; + LockGuard g{lock_sys.rec_hash, id}; + + if (lock_t *lock= lock_sys_t::get_first(g.cell(), id)) + { + dberr_t err= DB_SUCCESS; + trx->mutex_lock(); + if (lock_rec_get_next_on_page(lock) || + lock->trx != trx || + lock->type_mode != mode || + lock_rec_get_n_bits(lock) <= heap_no) + { + + unsigned checked_mode= (heap_no != PAGE_HEAP_NO_SUPREMUM && + lock_mode_is_next_key_lock(mode)) + ?
mode | LOCK_REC_NOT_GAP + : mode; + + const lock_t *held_lock= + lock_rec_has_expl(checked_mode, g.cell(), id, heap_no, trx); + + /* Do nothing if the trx already has a strong enough lock on rec */ + if (!held_lock) + { + if (lock_t *c_lock= lock_rec_other_has_conflicting(mode, g.cell(), id, + heap_no, trx)) + /* + If another transaction has a non-gap conflicting + request in the queue, as this transaction does not + have a lock strong enough already granted on the + record, we have to wait. + */ + err= lock_rec_enqueue_waiting(c_lock, mode, id, block->page.frame, + heap_no, index, thr, nullptr); + else if (!impl) + { + /* Set the requested lock on the record. */ + lock_rec_add_to_queue(mode, g.cell(), id, block->page.frame, heap_no, + index, trx, true); + err= DB_SUCCESS_LOCKED_REC; + } + } + /* If checked_mode == mode, trx already has a strong enough lock on rec */ + else if (checked_mode != mode) + { + /* As checked_mode != mode, the mode is a Next Key Lock, which cannot be + emulated by an implicit lock (implicit locks are LOCK_REC_NOT_GAP only). */ + ut_ad(!impl); + + lock_reuse_for_next_key_lock(held_lock, mode, g.cell(), id, + block->page.frame, heap_no, index, trx); + } + } + else if (!impl) + { + /* + If the nth bit of the record lock is already set then we do not set + a new lock bit, otherwise we set it. + */ + if (!lock_rec_get_nth_bit(lock, heap_no)) + { + lock_rec_set_nth_bit(lock, heap_no); + err= DB_SUCCESS_LOCKED_REC; + } + } + trx->mutex_unlock(); + return err; + } + + /* Simplified and faster path for the most common cases */ + if (!impl) + lock_rec_create_low(nullptr, mode, id, block->page.frame, heap_no, index, + trx, false); + + return DB_SUCCESS_LOCKED_REC; +} + +/*********************************************************************//** +Checks if a waiting record lock request still has to wait in a queue. +@return lock that is causing the wait */ +static +const lock_t* +lock_rec_has_to_wait_in_queue(const hash_cell_t &cell, const lock_t *wait_lock) +{ + const lock_t* lock; + ulint heap_no; + ulint bit_mask; + ulint bit_offset; + + ut_ad(wait_lock->is_waiting()); + ut_ad(!wait_lock->is_table()); + + heap_no = lock_rec_find_set_bit(wait_lock); + + bit_offset = heap_no / 8; + bit_mask = static_cast<ulint>(1) << (heap_no % 8); + + for (lock = lock_sys_t::get_first( + cell, wait_lock->un_member.rec_lock.page_id); + lock != wait_lock; + lock = lock_rec_get_next_on_page_const(lock)) { + const byte* p = (const byte*) &lock[1]; + + if (heap_no < lock_rec_get_n_bits(lock) + && (p[bit_offset] & bit_mask) + && lock_has_to_wait(wait_lock, lock)) { +#ifdef WITH_WSREP + if (lock->trx->is_wsrep() && + wsrep_thd_order_before(wait_lock->trx->mysql_thd, + lock->trx->mysql_thd)) { + /* don't wait for another BF lock */ + continue; + } +#endif + return(lock); + } + } + + return(NULL); +} + +/** Note that a record lock wait started */ +inline void lock_sys_t::wait_start() +{ + mysql_mutex_assert_owner(&wait_mutex); + wait_count+= WAIT_COUNT_STEP + 1; + /* The maximum number of concurrently waiting transactions is one less + than the maximum number of concurrent transactions.
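+ The single wait_count word packs two counters: the number of pending waits, incremented by 1 here and decremented in wait_resume(), and the cumulative number of waits, incremented in multiples of WAIT_COUNT_STEP; see get_wait_pending() and get_wait_cumulative().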
*/ + static_assert(WAIT_COUNT_STEP == UNIV_PAGE_SIZE_MAX / 16 * TRX_SYS_N_RSEGS, + "compatibility"); +} + +/** Note that a record lock wait resumed */ +inline +void lock_sys_t::wait_resume(THD *thd, my_hrtime_t start, my_hrtime_t now) +{ + mysql_mutex_assert_owner(&wait_mutex); + ut_ad(get_wait_pending()); + ut_ad(get_wait_cumulative()); + wait_count--; + if (now.val >= start.val) + { + const uint64_t diff_time= + static_cast<uint64_t>((now.val - start.val) / 1000); + wait_time+= diff_time; + + if (diff_time > wait_time_max) + wait_time_max= diff_time; + + thd_storage_lock_wait(thd, diff_time); + } +} + +#ifdef HAVE_REPLICATION +ATTRIBUTE_NOINLINE MY_ATTRIBUTE((nonnull, warn_unused_result)) +/** Report lock waits to parallel replication. Sets +trx->error_state= DB_DEADLOCK if trx->lock.was_chosen_as_deadlock_victim was +set when lock_sys.wait_mutex was unlocked. +@param trx transaction that may be waiting for a lock +@return lock being waited for (may have been replaced by an equivalent one) +@retval nullptr if no lock is being waited for */ +static lock_t *lock_wait_rpl_report(trx_t *trx) +{ + mysql_mutex_assert_owner(&lock_sys.wait_mutex); + ut_ad(trx->state == TRX_STATE_ACTIVE); + THD *const thd= trx->mysql_thd; + ut_ad(thd); + lock_t *wait_lock= trx->lock.wait_lock; + if (!wait_lock) + return nullptr; + /* This would likely be too large to attempt to use a memory transaction, + even for wait_lock->is_table(). */ + const bool nowait= lock_sys.wr_lock_try(); + if (!nowait) + { + mysql_mutex_unlock(&lock_sys.wait_mutex); + lock_sys.wr_lock(SRW_LOCK_CALL); + mysql_mutex_lock(&lock_sys.wait_mutex); + wait_lock= trx->lock.wait_lock; + if (!wait_lock) + { +func_exit: + lock_sys.wr_unlock(); + /* trx->lock.was_chosen_as_deadlock_victim can be set when + lock_sys.wait_mutex was unlocked, let's check it. */ + if (!nowait && trx->lock.was_chosen_as_deadlock_victim) + trx->error_state= DB_DEADLOCK; + return wait_lock; + } + ut_ad(wait_lock->is_waiting()); + } + else if (!wait_lock->is_waiting()) + { + wait_lock= trx->lock.wait_lock; + if (!wait_lock) + goto func_exit; + if (!wait_lock->is_waiting()) + { + wait_lock= nullptr; + goto func_exit; + } + } + + if (wait_lock->is_table()) + { + dict_table_t *table= wait_lock->un_member.tab_lock.table; + for (lock_t *lock= UT_LIST_GET_FIRST(table->locks); lock; + lock= UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock)) + if (lock->trx != trx) + thd_rpl_deadlock_check(thd, lock->trx->mysql_thd); + } + else + { + const page_id_t id{wait_lock->un_member.rec_lock.page_id}; + hash_cell_t &cell= *(wait_lock->type_mode & LOCK_PREDICATE + ? lock_sys.prdt_hash : lock_sys.rec_hash).cell_get + (id.fold()); + if (lock_t *lock= lock_sys_t::get_first(cell, id)) + { + const ulint heap_no= lock_rec_find_set_bit(wait_lock); + if (!lock_rec_get_nth_bit(lock, heap_no)) + lock= lock_rec_get_next(heap_no, lock); + do + if (lock->trx->mysql_thd != thd) + thd_rpl_deadlock_check(thd, lock->trx->mysql_thd); + while ((lock= lock_rec_get_next(heap_no, lock))); + } + } + + goto func_exit; +} +#endif /* HAVE_REPLICATION */ + +/** Wait for a lock to be released.
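+The calling thread blocks on trx->lock.cond until the lock is granted, the wait is cancelled, or a timeout or deadlock is detected.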
+@retval DB_DEADLOCK if this transaction was chosen as the deadlock victim +@retval DB_INTERRUPTED if the execution was interrupted by the user +@retval DB_LOCK_WAIT_TIMEOUT if the lock wait timed out +@retval DB_SUCCESS if the lock was granted */ +dberr_t lock_wait(que_thr_t *thr) +{ + trx_t *trx= thr_get_trx(thr); + +#ifdef ENABLED_DEBUG_SYNC + if (trx->mysql_thd) + DEBUG_SYNC_C("lock_wait_start"); + + /* Create the sync point for any quit from the function. */ + SCOPE_EXIT([trx]() { + if (trx->mysql_thd) + DEBUG_SYNC_C("lock_wait_end"); + }); +#endif + + /* InnoDB system transactions may use the global value of + innodb_lock_wait_timeout, because trx->mysql_thd == NULL. */ + const ulong innodb_lock_wait_timeout= trx_lock_wait_timeout_get(trx); + const my_hrtime_t suspend_time= my_hrtime_coarse(); + ut_ad(!trx->dict_operation_lock_mode); + + /* The wait_lock can be cleared by another thread in lock_grant(), + lock_rec_cancel(), lock_cancel_waiting_and_release(), which could be + invoked from the high-level function lock_sys_t::cancel(). + But, a wait can only be initiated by the current thread which owns + the transaction. + + Even if trx->lock.wait_lock were changed, the object that it used to + point to will remain valid memory (it remains allocated from + trx->lock.lock_heap). If trx->lock.wait_lock was set to nullptr, the + original object could be transformed to a granted lock. On a page + split or merge, we would change trx->lock.wait_lock to point to + another waiting lock request object, and the old object would be + logically discarded. + + In any case, it is safe to read the memory that wait_lock points to, + even though we are not holding any mutex. We are only reading + wait_lock->type_mode & (LOCK_TABLE | LOCK_AUTO_INC), which will be + unaffected by any page split or merge operation. (Furthermore, + table lock objects will never be cloned or moved.) */ + lock_t *wait_lock= trx->lock.wait_lock; + + if (!wait_lock) + { + /* The lock has already been released or this transaction + was chosen as a deadlock victim: no need to wait */ + if (trx->lock.was_chosen_as_deadlock_victim) + trx->error_state= DB_DEADLOCK; + else if (trx->error_state == DB_LOCK_WAIT) + trx->error_state= DB_SUCCESS; + return trx->error_state; + } + + /* Because we are not holding exclusive lock_sys.latch, the + wait_lock may be changed by other threads during a page split or + merge in case it is a record lock. + + Because at this point we are not holding lock_sys.wait_mutex either, + another thread may set trx->lock.wait_lock == nullptr at any time. */ + + trx->lock.suspend_time= suspend_time; + + ut_ad(!trx->dict_operation_lock_mode); + + IF_WSREP(if (trx->is_wsrep()) lock_wait_wsrep(trx),); + + const auto type_mode= wait_lock->type_mode; +#ifdef HAVE_REPLICATION + /* Even though lock_wait_rpl_report() has nothing to do with + deadlock detection, it was always disabled by innodb_deadlock_detect=OFF. + We will keep it that way, because unfortunately + thd_need_wait_reports() will hold even if parallel (or any) replication + is not being used. We want to allow the user to skip + lock_wait_rpl_report().
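+ (Hence the innodb_deadlock_detect term in the condition below.)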
*/ + const bool rpl= trx->mysql_thd && innodb_deadlock_detect && + thd_need_wait_reports(trx->mysql_thd); +#endif + const bool row_lock_wait= thr->lock_state == QUE_THR_LOCK_ROW; + timespec abstime; + set_timespec_time_nsec(abstime, suspend_time.val * 1000); + abstime.MY_tv_sec+= innodb_lock_wait_timeout; + /* Dictionary transactions must be immune to lock wait timeouts + for locks on data dictionary tables. Here we check only for + SYS_TABLES, SYS_COLUMNS, SYS_INDEXES, SYS_FIELDS. Locks on further + tables SYS_FOREIGN, SYS_FOREIGN_COLS, SYS_VIRTUAL will only be + acquired while holding an exclusive lock on one of the 4 tables. */ + const bool no_timeout= innodb_lock_wait_timeout >= 100000000 || + ((type_mode & LOCK_TABLE) && + wait_lock->un_member.tab_lock.table->id <= DICT_FIELDS_ID); + thd_wait_begin(trx->mysql_thd, (type_mode & LOCK_TABLE) + ? THD_WAIT_TABLE_LOCK : THD_WAIT_ROW_LOCK); + + mysql_mutex_lock(&lock_sys.wait_mutex); + /* Now that we are holding lock_sys.wait_mutex, we must reload + trx->lock.wait_lock. It cannot be cleared as long as we are holding + lock_sys.wait_mutex, but as long as we do not hold exclusive + lock_sys.latch, a waiting record lock can be replaced with an + equivalent waiting record lock during a page split or merge by + another thread. See lock_sys_t::cancel(). */ + wait_lock= trx->lock.wait_lock; + + if (wait_lock) + { + /* Dictionary transactions must ignore KILL, because they could + be executed as part of a multi-transaction DDL operation, + such as rollback_inplace_alter_table() or ha_innobase::delete_table(). */ + if (!trx->dict_operation && trx_is_interrupted(trx)) + { + /* innobase_kill_query() can only set trx->error_state=DB_INTERRUPTED + for any transaction that is attached to a connection. + + Furthermore, innobase_kill_query() could have been invoked before + this thread entered a lock wait. The thd_kill_level() or thd::killed + is only being checked every now and then. */ + trx->error_state= DB_INTERRUPTED; + goto abort_wait; + } + + wait_lock= Deadlock::check_and_resolve(trx, wait_lock); + + if (wait_lock == reinterpret_cast<lock_t*>(-1)) + { + trx->error_state= DB_DEADLOCK; + goto end_wait; + } + } + else + { + /* trx->lock.was_chosen_as_deadlock_victim can be changed before + lock_sys.wait_mutex is acquired, so let's check it once more.
*/ + if (trx->lock.was_chosen_as_deadlock_victim) + trx->error_state= DB_DEADLOCK; + else if (trx->error_state == DB_LOCK_WAIT) + trx->error_state= DB_SUCCESS; + goto end_wait; + } + if (row_lock_wait) + lock_sys.wait_start(); + +#ifdef HAVE_REPLICATION + if (rpl) + wait_lock= lock_wait_rpl_report(trx); +#endif + + switch (trx->error_state) { + case DB_SUCCESS: + break; + case DB_LOCK_WAIT: + trx->error_state= DB_SUCCESS; + break; + default: +#ifdef UNIV_DEBUG + ut_ad("invalid state" == 0); + break; + case DB_DEADLOCK: + case DB_INTERRUPTED: +#endif + goto end_loop; + } + + while (wait_lock) + { + int err; + ut_ad(trx->lock.wait_lock); + + DEBUG_SYNC_C("lock_wait_before_suspend"); + + if (no_timeout) + { + my_cond_wait(&trx->lock.cond, &lock_sys.wait_mutex.m_mutex); + err= 0; + } + else + err= my_cond_timedwait(&trx->lock.cond, &lock_sys.wait_mutex.m_mutex, + &abstime); + + wait_lock= trx->lock.wait_lock; + + switch (trx->error_state) { + case DB_DEADLOCK: + case DB_INTERRUPTED: + break; +#ifdef UNIV_DEBUG + case DB_LOCK_WAIT_TIMEOUT: + case DB_LOCK_WAIT: + ut_ad("invalid state" == 0); + break; +#endif + default: + /* Dictionary transactions must ignore KILL, because they could + be executed as part of a multi-transaction DDL operation, + such as rollback_inplace_alter_table() or ha_innobase::delete_table(). */ + if (!trx->dict_operation && trx_is_interrupted(trx)) + /* innobase_kill_query() can only set trx->error_state=DB_INTERRUPTED + for any transaction that is attached to a connection. */ + trx->error_state= DB_INTERRUPTED; + else if (!err) + continue; +#ifdef WITH_WSREP + else if (trx->is_wsrep() && wsrep_is_BF_lock_timeout(*trx)); +#endif + else + { + trx->error_state= DB_LOCK_WAIT_TIMEOUT; + lock_sys.timeouts++; + } + } + break; + } + +end_loop: + if (row_lock_wait) + lock_sys.wait_resume(trx->mysql_thd, suspend_time, my_hrtime_coarse()); + + ut_ad(!wait_lock == !trx->lock.wait_lock); + + if (wait_lock) + { + abort_wait: + lock_sys_t::cancel(trx, wait_lock); + lock_sys.deadlock_check(); + } + +end_wait: + mysql_mutex_unlock(&lock_sys.wait_mutex); + DBUG_EXECUTE_IF("small_sleep_after_lock_wait", + { + if (!(type_mode & LOCK_TABLE) && + (type_mode & LOCK_MODE_MASK) == LOCK_X && + trx->error_state != DB_DEADLOCK && !trx_is_interrupted(trx)) { + my_sleep(20000); + } + }); + thd_wait_end(trx->mysql_thd); + +#ifdef UNIV_DEBUG + switch (trx->error_state) { + case DB_SUCCESS: + case DB_DEADLOCK: + case DB_INTERRUPTED: + case DB_LOCK_WAIT_TIMEOUT: + break; + default: + ut_ad("invalid state" == 0); + } +#endif + + return trx->error_state; +} + + +/** Resume a lock wait */ +template <bool from_deadlock= false> +void lock_wait_end(trx_t *trx) +{ + mysql_mutex_assert_owner(&lock_sys.wait_mutex); + ut_ad(trx->mutex_is_owner()); + ut_d(const auto state= trx->state); + ut_ad(state == TRX_STATE_COMMITTED_IN_MEMORY || state == TRX_STATE_ACTIVE || + state == TRX_STATE_PREPARED); + /* lock_wait() checks trx->lock.was_chosen_as_deadlock_victim flag before + requesting lock_sys.wait_mutex, and if the flag is set, it returns an error, + which causes a transaction rollback that can reset trx->lock.wait_thr before + deadlock resolution starts cancelling the victim's waiting lock. That is why + we do not check trx->lock.wait_thr here when the function is called during + deadlock resolution.
*/ + ut_ad(from_deadlock || trx->lock.wait_thr); + + if (trx->lock.was_chosen_as_deadlock_victim) + { + ut_ad(from_deadlock || state == TRX_STATE_ACTIVE); + trx->error_state= DB_DEADLOCK; + } + + trx->lock.wait_thr= nullptr; + pthread_cond_signal(&trx->lock.cond); +} + +/** Grant a waiting lock request and release the waiting transaction. */ +static void lock_grant(lock_t *lock) +{ + lock_reset_lock_and_trx_wait(lock); + trx_t *trx= lock->trx; + trx->mutex_lock(); + if (lock->mode() == LOCK_AUTO_INC) + { + dict_table_t *table= lock->un_member.tab_lock.table; + ut_ad(!table->autoinc_trx); + table->autoinc_trx= trx; + ib_vector_push(trx->autoinc_locks, &lock); + } + + DBUG_PRINT("ib_lock", ("wait for trx " TRX_ID_FMT " ends", trx->id)); + + /* If we are resolving a deadlock by choosing another transaction as + a victim, then our original transaction may not be waiting anymore */ + + if (trx->lock.wait_thr) + lock_wait_end(trx); + + trx->mutex_unlock(); +} + +/*************************************************************//** +Cancels a waiting record lock request and releases the waiting transaction +that requested it. NOTE: does NOT check if waiting lock requests behind this +one can now be granted! */ +static void lock_rec_cancel(lock_t *lock) +{ + trx_t *trx= lock->trx; + mysql_mutex_lock(&lock_sys.wait_mutex); + trx->mutex_lock(); + + ut_d(lock_sys.hash_get(lock->type_mode). + assert_locked(lock->un_member.rec_lock.page_id)); + /* Reset the bit (there can be only one set bit) in the lock bitmap */ + lock_rec_reset_nth_bit(lock, lock_rec_find_set_bit(lock)); + + /* Reset the wait flag and the back pointer to lock in trx */ + lock_reset_lock_and_trx_wait(lock); + + /* The following releases the trx from lock wait */ + lock_wait_end(trx); + mysql_mutex_unlock(&lock_sys.wait_mutex); + trx->mutex_unlock(); +} + +/** Remove a record lock request, waiting or granted, from the queue and +grant locks to other transactions in the queue if they now are entitled +to a lock. NOTE: all record locks contained in in_lock are removed. +@param[in,out] in_lock record lock +@param[in] owns_wait_mutex whether lock_sys.wait_mutex is held */ +static void lock_rec_dequeue_from_page(lock_t *in_lock, bool owns_wait_mutex) +{ +#ifdef SAFE_MUTEX + ut_ad(owns_wait_mutex == mysql_mutex_is_owner(&lock_sys.wait_mutex)); +#endif /* SAFE_MUTEX */ + ut_ad(!in_lock->is_table()); + + const page_id_t page_id{in_lock->un_member.rec_lock.page_id}; + auto& lock_hash = lock_sys.hash_get(in_lock->type_mode); + ut_ad(lock_sys.is_writer() || in_lock->trx->mutex_is_owner()); + + ut_d(auto old_n_locks=) + in_lock->index->table->n_rec_locks--; + ut_ad(old_n_locks); + + const ulint rec_fold = page_id.fold(); + hash_cell_t &cell = *lock_hash.cell_get(rec_fold); + lock_sys.assert_locked(cell); + + HASH_DELETE(lock_t, hash, &lock_hash, rec_fold, in_lock); + ut_ad(lock_sys.is_writer() || in_lock->trx->mutex_is_owner()); + UT_LIST_REMOVE(in_lock->trx->lock.trx_locks, in_lock); + + MONITOR_INC(MONITOR_RECLOCK_REMOVED); + MONITOR_DEC(MONITOR_NUM_RECLOCK); + + bool acquired = false; + + /* Check if waiting locks in the queue can now be granted: + grant locks if there are no conflicting locks ahead. Stop at + the first X lock that is waiting or has been granted. 
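+ For a request that still has to wait, only its wait_trx edge is updated, so that the deadlock checker can see the new dependency.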
*/ + + for (lock_t* lock = lock_sys_t::get_first(cell, page_id); + lock != NULL; + lock = lock_rec_get_next_on_page(lock)) { + + if (!lock->is_waiting()) { + continue; + } + + if (!owns_wait_mutex) { + mysql_mutex_lock(&lock_sys.wait_mutex); + acquired = owns_wait_mutex = true; + } + + ut_ad(lock->trx->lock.wait_trx); + ut_ad(lock->trx->lock.wait_lock); + + if (const lock_t* c = lock_rec_has_to_wait_in_queue( + cell, lock)) { + trx_t* c_trx = c->trx; + lock->trx->lock.wait_trx = c_trx; + if (c_trx->lock.wait_trx + && innodb_deadlock_detect + && Deadlock::to_check.emplace(c_trx).second) { + Deadlock::to_be_checked = true; + } + } else { + /* Grant the lock */ + ut_ad(lock->trx != in_lock->trx); + lock_grant(lock); + } + } + + if (acquired) { + mysql_mutex_unlock(&lock_sys.wait_mutex); + } +} + +/** Remove a record lock request, waiting or granted, on a discarded page +@param lock_hash hash table +@param in_lock lock object */ +TRANSACTIONAL_TARGET +void lock_rec_discard(lock_sys_t::hash_table &lock_hash, lock_t *in_lock) +{ + ut_ad(!in_lock->is_table()); + lock_hash.assert_locked(in_lock->un_member.rec_lock.page_id); + + HASH_DELETE(lock_t, hash, &lock_hash, + in_lock->un_member.rec_lock.page_id.fold(), in_lock); + ut_d(uint32_t old_locks); + { + trx_t *trx= in_lock->trx; + TMTrxGuard tg{*trx}; + ut_d(old_locks=) + in_lock->index->table->n_rec_locks--; + UT_LIST_REMOVE(trx->lock.trx_locks, in_lock); + } + ut_ad(old_locks); + MONITOR_INC(MONITOR_RECLOCK_REMOVED); + MONITOR_DEC(MONITOR_NUM_RECLOCK); +} + +/*************************************************************//** +Removes record lock objects set on an index page which is discarded. This +function neither moves locks nor checks for waiting locks; therefore the +lock bitmaps must already be reset when this function is called. */ +static void +lock_rec_free_all_from_discard_page(page_id_t id, const hash_cell_t &cell, + lock_sys_t::hash_table &lock_hash) +{ + for (lock_t *lock= lock_sys_t::get_first(cell, id); lock; ) + { + ut_ad(&lock_hash != &lock_sys.rec_hash || + lock_rec_find_set_bit(lock) == ULINT_UNDEFINED); + ut_ad(!lock->is_waiting()); + lock_t *next_lock= lock_rec_get_next_on_page(lock); + lock_rec_discard(lock_hash, lock); + lock= next_lock; + } +} + +/** Discard locks for an index when purging DELETE FROM SYS_INDEXES +after an aborted CREATE INDEX operation. +@param index a stale index on which ADD INDEX operation was aborted */ +ATTRIBUTE_COLD void lock_discard_for_index(const dict_index_t &index) +{ + ut_ad(!index.is_committed()); + /* This is very rarely executed code, and the size of the hash array + would exceed the maximum size of a memory transaction. */ + LockMutexGuard g{SRW_LOCK_CALL}; + const ulint n= lock_sys.rec_hash.pad(lock_sys.rec_hash.n_cells); + for (ulint i= 0; i < n; i++) + { + for (lock_t *lock= static_cast<lock_t*>(lock_sys.rec_hash.array[i].node); + lock; ) + { + ut_ad(!lock->is_table()); + if (lock->index == &index) + { + ut_ad(!lock->is_waiting()); + lock_rec_discard(lock_sys.rec_hash, lock); + lock= static_cast<lock_t*>(lock_sys.rec_hash.array[i].node); + } + else + lock= lock->hash; + } + } +} + +/*============= RECORD LOCK MOVING AND INHERITING ===================*/ + +/*************************************************************//** +Resets the lock bits for a single record. Releases transactions waiting for +lock requests here.
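+Waiting requests are cancelled via lock_rec_cancel(); for granted locks only the bit for heap_no is cleared.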
*/ +TRANSACTIONAL_TARGET +static +void +lock_rec_reset_and_release_wait(const hash_cell_t &cell, const page_id_t id, + ulint heap_no) +{ + for (lock_t *lock= lock_sys.get_first(cell, id, heap_no); lock; + lock= lock_rec_get_next(heap_no, lock)) + { + if (lock->is_waiting()) + lock_rec_cancel(lock); + else + { + TMTrxGuard tg{*lock->trx}; + lock_rec_reset_nth_bit(lock, heap_no); + } + } +} + +/** Makes a record inherit the locks (except LOCK_INSERT_INTENTION type) +of another record as gap type locks, but does not reset the lock bits of +the other record. Also waiting lock requests on rec are inherited as +GRANTED gap locks. +@param heir_cell heir hash table cell +@param heir page containing the record which inherits +@param donor_cell donor hash table cell +@param donor page containing the record from which inherited; does NOT + reset the locks on this record +@param heir_page heir page frame +@param heir_heap_no heap_no of the inheriting record +@param heap_no heap_no of the donating record +@tparam from_split true if the function is invoked from + lock_update_split_(left|right)(); in this case not-gap + locks are not inherited to the supremum if the transaction + isolation level is less than or equal to READ COMMITTED */ +template <bool from_split= false> +static void +lock_rec_inherit_to_gap(hash_cell_t &heir_cell, const page_id_t heir, + const hash_cell_t &donor_cell, const page_id_t donor, + const page_t *heir_page, ulint heir_heap_no, + ulint heap_no) +{ + ut_ad(!from_split || heir_heap_no == PAGE_HEAP_NO_SUPREMUM); + + /* At READ UNCOMMITTED or READ COMMITTED isolation level, + we do not want locks set + by an UPDATE or a DELETE to be inherited as gap type locks. But we + DO want S-locks/X-locks (taken for replace) set by a consistency + constraint to be inherited also then. */ + + for (lock_t *lock= lock_sys_t::get_first(donor_cell, donor, heap_no); lock; + lock= lock_rec_get_next(heap_no, lock)) + { + trx_t *lock_trx= lock->trx; + if (!lock->trx->is_not_inheriting_locks() && + !lock->is_insert_intention() && + (lock_trx->isolation_level > TRX_ISO_READ_COMMITTED || + /* When we are in a page split (not purge), then we don't set a lock + on supremum if the donor lock type is LOCK_REC_NOT_GAP. That is, do + not create bogus gap locks for non-gap locks for READ UNCOMMITTED and + READ COMMITTED isolation levels. LOCK_ORDINARY and + LOCK_GAP require a gap before the record to be locked, that is why + setting a lock on the supremum is necessary. */ + ((!from_split || !lock->is_record_not_gap()) && + lock->mode() != (lock_trx->duplicates ? LOCK_S : LOCK_X)))) + { + lock_rec_add_to_queue(LOCK_GAP | lock->mode(), heir_cell, heir, + heir_page, heir_heap_no, lock->index, lock_trx, + false); + } + } +} + +/*************************************************************//** +Makes a record inherit the gap locks (except LOCK_INSERT_INTENTION type) +of another record as gap type locks, but does not reset the lock bits of the +other record. Also waiting lock requests are inherited as GRANTED gap locks.
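+Locks on the supremum are inherited unconditionally, because a lock on the supremum is by definition a gap lock.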
*/ +static +void +lock_rec_inherit_to_gap_if_gap_lock( +/*================================*/ + const buf_block_t* block, /*!< in: buffer block */ + ulint heir_heap_no, /*!< in: heap_no of + record which inherits */ + ulint heap_no) /*!< in: heap_no of record + from which inherited; + does NOT reset the locks + on this record */ +{ + const page_id_t id{block->page.id()}; + LockGuard g{lock_sys.rec_hash, id}; + + for (lock_t *lock= lock_sys_t::get_first(g.cell(), id, heap_no); lock; + lock= lock_rec_get_next(heap_no, lock)) + if (!lock->trx->is_not_inheriting_locks() && + !lock->is_insert_intention() && (heap_no == PAGE_HEAP_NO_SUPREMUM || + !lock->is_record_not_gap()) && + !lock_table_has(lock->trx, lock->index->table, LOCK_X)) + lock_rec_add_to_queue(LOCK_GAP | lock->mode(), + g.cell(), id, block->page.frame, + heir_heap_no, lock->index, lock->trx, false); +} + +/*************************************************************//** +Moves the locks of a record to another record and resets the lock bits of +the donating record. */ +TRANSACTIONAL_TARGET +static +void +lock_rec_move( + hash_cell_t& receiver_cell, /*!< in: hash table cell */ + const buf_block_t& receiver, /*!< in: buffer block containing + the receiving record */ + const page_id_t receiver_id, /*!< in: page identifier */ + const hash_cell_t& donator_cell, /*!< in: hash table cell */ + const page_id_t donator_id, /*!< in: page identifier of + the donating record */ + ulint receiver_heap_no,/*!< in: heap_no of the record + which gets the locks; there + must be no lock requests + on it! */ + ulint donator_heap_no)/*!< in: heap_no of the record + which gives the locks */ +{ + ut_ad(!lock_sys_t::get_first(receiver_cell, + receiver_id, receiver_heap_no)); + + for (lock_t *lock = lock_sys_t::get_first(donator_cell, donator_id, + donator_heap_no); + lock != NULL; + lock = lock_rec_get_next(donator_heap_no, lock)) { + const auto type_mode = lock->type_mode; + if (type_mode & LOCK_WAIT) { + ut_ad(lock->trx->lock.wait_lock == lock); + lock->type_mode &= ~LOCK_WAIT; + } + + trx_t* lock_trx = lock->trx; + lock_trx->mutex_lock(); + lock_rec_reset_nth_bit(lock, donator_heap_no); + + /* Note that we FIRST reset the bit, and then set the lock: + the function works also if donator_id == receiver_id */ + + lock_rec_add_to_queue(type_mode, receiver_cell, + receiver_id, receiver.page.frame, + receiver_heap_no, + lock->index, lock_trx, true); + lock_trx->mutex_unlock(); + } + + ut_ad(!lock_sys_t::get_first(donator_cell, donator_id, + donator_heap_no)); +} + +/** Move all the granted locks to the front of the given lock list. +All the waiting locks will be at the end of the list. +@param[in,out] lock_list the given lock list. */ +static +void +lock_move_granted_locks_to_front( + UT_LIST_BASE_NODE_T(lock_t)& lock_list) +{ + lock_t* lock; + + bool seen_waiting_lock = false; + + for (lock = UT_LIST_GET_FIRST(lock_list); lock; + lock = UT_LIST_GET_NEXT(trx_locks, lock)) { + + if (!seen_waiting_lock) { + if (lock->is_waiting()) { + seen_waiting_lock = true; + } + continue; + } + + ut_ad(seen_waiting_lock); + + if (!lock->is_waiting()) { + lock_t* prev = UT_LIST_GET_PREV(trx_locks, lock); + ut_a(prev); + ut_list_move_to_front(lock_list, lock); + lock = prev; + } + } +} + +/*************************************************************//** +Updates the lock table when we have reorganized a page. 
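+Because record heap numbers may change during a reorganization, the lock bits are rebuilt by walking the old and the new page in parallel.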
NOTE: we copy +also the locks set on the infimum of the page; the infimum may carry +locks if an update of a record is occurring on the page, and its locks +were temporarily stored on the infimum. */ +TRANSACTIONAL_TARGET +void +lock_move_reorganize_page( +/*======================*/ + const buf_block_t* block, /*!< in: old index page, now + reorganized */ + const buf_block_t* oblock) /*!< in: copy of the old, not + reorganized page */ +{ + mem_heap_t *heap; + + { + UT_LIST_BASE_NODE_T(lock_t) old_locks; + UT_LIST_INIT(old_locks, &lock_t::trx_locks); + + const page_id_t id{block->page.id()}; + const auto id_fold= id.fold(); + { + TMLockGuard g{lock_sys.rec_hash, id}; + if (!lock_sys_t::get_first(g.cell(), id)) + return; + } + + /* We will modify arbitrary trx->lock.trx_locks. + Do not bother with a memory transaction; we are going + to allocate memory and copy a lot of data. */ + LockMutexGuard g{SRW_LOCK_CALL}; + hash_cell_t &cell= *lock_sys.rec_hash.cell_get(id_fold); + + /* Note: Predicate locks for SPATIAL INDEX are not affected by + page reorganize, because they do not refer to individual record + heap numbers. */ + lock_t *lock= lock_sys_t::get_first(cell, id); + + if (!lock) + return; + + heap= mem_heap_create(256); + + /* Copy first all the locks on the page to heap and reset the + bitmaps in the original locks; chain the copies of the locks + using the trx_locks field in them. */ + + do + { + /* Make a copy of the lock */ + lock_t *old_lock= lock_rec_copy(lock, heap); + + UT_LIST_ADD_LAST(old_locks, old_lock); + + /* Reset bitmap of lock */ + lock_rec_bitmap_reset(lock); + + if (lock->is_waiting()) + { + ut_ad(lock->trx->lock.wait_lock == lock); + lock->type_mode&= ~LOCK_WAIT; + } + + lock= lock_rec_get_next_on_page(lock); + } + while (lock); + + const ulint comp= page_is_comp(block->page.frame); + ut_ad(comp == page_is_comp(oblock->page.frame)); + + lock_move_granted_locks_to_front(old_locks); + + DBUG_EXECUTE_IF("do_lock_reverse_page_reorganize", + ut_list_reverse(old_locks);); + + for (lock= UT_LIST_GET_FIRST(old_locks); lock; + lock= UT_LIST_GET_NEXT(trx_locks, lock)) + { + /* NOTE: we copy also the locks set on the infimum and + supremum of the page; the infimum may carry locks if an + update of a record is occurring on the page, and its locks + were temporarily stored on the infimum */ + const rec_t *rec1= page_get_infimum_rec(block->page.frame); + const rec_t *rec2= page_get_infimum_rec(oblock->page.frame); + + /* Set locks according to old locks */ + for (;;) + { + ulint old_heap_no; + ulint new_heap_no; + ut_d(const rec_t* const orec= rec1); + ut_ad(page_rec_is_metadata(rec1) == page_rec_is_metadata(rec2)); + + if (comp) + { + old_heap_no= rec_get_heap_no_new(rec2); + new_heap_no= rec_get_heap_no_new(rec1); + + rec1= page_rec_get_next_low(rec1, TRUE); + rec2= page_rec_get_next_low(rec2, TRUE); + } + else + { + old_heap_no= rec_get_heap_no_old(rec2); + new_heap_no= rec_get_heap_no_old(rec1); + ut_ad(!memcmp(rec1, rec2, rec_get_data_size_old(rec2))); + + rec1= page_rec_get_next_low(rec1, FALSE); + rec2= page_rec_get_next_low(rec2, FALSE); + } + + trx_t *lock_trx= lock->trx; + lock_trx->mutex_lock(); + + /* Clear the bit in old_lock. */ + if (old_heap_no < lock->un_member.rec_lock.n_bits && + lock_rec_reset_nth_bit(lock, old_heap_no)) + { + ut_ad(!page_rec_is_metadata(orec)); + + /* NOTE that the old lock bitmap could be too + small for the new heap number! 
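+ (In that case lock_rec_add_to_queue() creates a new lock struct with a sufficiently large bitmap instead of reusing this one.)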
*/ + lock_rec_add_to_queue(lock->type_mode, cell, id, block->page.frame, + new_heap_no, lock->index, lock_trx, true); + } + + lock_trx->mutex_unlock(); + + if (!rec1 || !rec2) + { + ut_ad(!rec1 == !rec2); + ut_ad(new_heap_no == PAGE_HEAP_NO_SUPREMUM); + ut_ad(old_heap_no == PAGE_HEAP_NO_SUPREMUM); + break; + } + } + + ut_ad(lock_rec_find_set_bit(lock) == ULINT_UNDEFINED); + } + } + + mem_heap_free(heap); + +#ifdef UNIV_DEBUG_LOCK_VALIDATE + if (fil_space_t *space= fil_space_t::get(id.space())) + { + ut_ad(lock_rec_validate_page(block, space->is_latched())); + space->release(); + } +#endif +} + +/*************************************************************//** +Moves the explicit locks on user records to another page if a record +list end is moved to another page. */ +TRANSACTIONAL_TARGET +void +lock_move_rec_list_end( +/*===================*/ + const buf_block_t* new_block, /*!< in: index page to move to */ + const buf_block_t* block, /*!< in: index page */ + const rec_t* rec) /*!< in: record on page: this + is the first record moved */ +{ + const ulint comp= page_rec_is_comp(rec); + + ut_ad(block->page.frame == page_align(rec)); + ut_ad(comp == page_is_comp(new_block->page.frame)); + + const page_id_t id{block->page.id()}; + const page_id_t new_id{new_block->page.id()}; + { + /* This would likely be too large for a memory transaction. */ + LockMultiGuard g{lock_sys.rec_hash, id, new_id}; + + /* Note: when we move locks from record to record, waiting locks + and possible granted gap type locks behind them are enqueued in + the original order, because new elements are inserted to a hash + table to the end of the hash chain, and lock_rec_add_to_queue + does not reuse locks if there are waiters in the queue. */ + for (lock_t *lock= lock_sys_t::get_first(g.cell1(), id); lock; + lock= lock_rec_get_next_on_page(lock)) + { + const rec_t *rec1= rec; + const rec_t *rec2; + const auto type_mode= lock->type_mode; + + if (comp) + { + if (page_offset(rec1) == PAGE_NEW_INFIMUM) + rec1= page_rec_get_next_low(rec1, TRUE); + rec2= page_rec_get_next_low(new_block->page.frame + PAGE_NEW_INFIMUM, + TRUE); + } + else + { + if (page_offset(rec1) == PAGE_OLD_INFIMUM) + rec1= page_rec_get_next_low(rec1, FALSE); + rec2= page_rec_get_next_low(new_block->page.frame + PAGE_OLD_INFIMUM, + FALSE); + } + + if (UNIV_UNLIKELY(!rec1 || !rec2)) + { + ut_ad("corrupted page" == 0); + return; + } + + /* Copy lock requests on user records to new page and + reset the lock bits on the old */ + for (;;) + { + ut_ad(page_rec_is_metadata(rec1) == page_rec_is_metadata(rec2)); + ut_d(const rec_t* const orec= rec1); + + ulint rec1_heap_no; + ulint rec2_heap_no; + + if (comp) + { + rec1_heap_no= rec_get_heap_no_new(rec1); + if (!(rec1= page_rec_get_next_low(rec1, TRUE))) + { + ut_ad(rec1_heap_no == PAGE_HEAP_NO_SUPREMUM); + break; + } + rec2_heap_no= rec_get_heap_no_new(rec2); + rec2= page_rec_get_next_low(rec2, TRUE); + } + else + { + ut_d(const rec_t *old1= rec1); + rec1_heap_no= rec_get_heap_no_old(rec1); + if (!(rec1= page_rec_get_next_low(rec1, FALSE))) + { + ut_ad(rec1_heap_no == PAGE_HEAP_NO_SUPREMUM); + break; + } + + ut_ad(rec_get_data_size_old(old1) == rec_get_data_size_old(rec2)); + ut_ad(!memcmp(old1, rec2, rec_get_data_size_old(old1))); + + rec2_heap_no= rec_get_heap_no_old(rec2); + rec2= page_rec_get_next_low(rec2, FALSE); + } + + if (UNIV_UNLIKELY(!rec2)) + { + ut_ad("corrupted page" == 0); + return; + } + + trx_t *lock_trx= lock->trx; + lock_trx->mutex_lock(); + + if (rec1_heap_no < lock->un_member.rec_lock.n_bits && + 
lock_rec_reset_nth_bit(lock, rec1_heap_no)) + { + ut_ad(!page_rec_is_metadata(orec)); + + if (type_mode & LOCK_WAIT) + { + ut_ad(lock_trx->lock.wait_lock == lock); + lock->type_mode&= ~LOCK_WAIT; + } + + lock_rec_add_to_queue(type_mode, g.cell2(), new_id, + new_block->page.frame, + rec2_heap_no, lock->index, lock_trx, true); + } + + lock_trx->mutex_unlock(); + } + } + } + +#ifdef UNIV_DEBUG_LOCK_VALIDATE + if (fil_space_t *space= fil_space_t::get(id.space())) + { + const bool is_latched{space->is_latched()}; + ut_ad(lock_rec_validate_page(block, is_latched)); + ut_ad(lock_rec_validate_page(new_block, is_latched)); + space->release(); + } +#endif +} + +/*************************************************************//** +Moves the explicit locks on user records to another page if a record +list start is moved to another page. */ +TRANSACTIONAL_TARGET +void +lock_move_rec_list_start( +/*=====================*/ + const buf_block_t* new_block, /*!< in: index page to + move to */ + const buf_block_t* block, /*!< in: index page */ + const rec_t* rec, /*!< in: record on page: + this is the first + record NOT copied */ + const rec_t* old_end) /*!< in: old + previous-to-last + record on new_page + before the records + were copied */ +{ + const ulint comp= page_rec_is_comp(rec); + + ut_ad(block->page.frame == page_align(rec)); + ut_ad(comp == page_is_comp(new_block->page.frame)); + ut_ad(new_block->page.frame == page_align(old_end)); + ut_ad(!page_rec_is_metadata(rec)); + const page_id_t id{block->page.id()}; + const page_id_t new_id{new_block->page.id()}; + + { + /* This would likely be too large for a memory transaction. */ + LockMultiGuard g{lock_sys.rec_hash, id, new_id}; + + for (lock_t *lock= lock_sys_t::get_first(g.cell1(), id); lock; + lock= lock_rec_get_next_on_page(lock)) + { + const rec_t *rec1; + const rec_t *rec2; + const auto type_mode= lock->type_mode; + + if (comp) + { + rec1= page_rec_get_next_low(block->page.frame + PAGE_NEW_INFIMUM, + TRUE); + rec2= page_rec_get_next_low(old_end, TRUE); + } + else + { + rec1= page_rec_get_next_low(block->page.frame + PAGE_OLD_INFIMUM, + FALSE); + rec2= page_rec_get_next_low(old_end, FALSE); + } + + /* Copy lock requests on user records to new page and + reset the lock bits on the old */ + + while (rec1 != rec) + { + if (UNIV_UNLIKELY(!rec1 || !rec2)) + { + ut_ad("corrupted page" == 0); + return; + } + + ut_ad(page_rec_is_metadata(rec1) == page_rec_is_metadata(rec2)); + ut_d(const rec_t* const prev= rec1); + + ulint rec1_heap_no; + ulint rec2_heap_no; + + if (comp) + { + rec1_heap_no= rec_get_heap_no_new(rec1); + rec2_heap_no= rec_get_heap_no_new(rec2); + + rec1= page_rec_get_next_low(rec1, TRUE); + rec2= page_rec_get_next_low(rec2, TRUE); + } + else + { + rec1_heap_no= rec_get_heap_no_old(rec1); + rec2_heap_no= rec_get_heap_no_old(rec2); + + ut_ad(!memcmp(rec1, rec2, rec_get_data_size_old(rec2))); + + rec1= page_rec_get_next_low(rec1, FALSE); + rec2= page_rec_get_next_low(rec2, FALSE); + } + + trx_t *lock_trx= lock->trx; + lock_trx->mutex_lock(); + + if (rec1_heap_no < lock->un_member.rec_lock.n_bits && + lock_rec_reset_nth_bit(lock, rec1_heap_no)) + { + ut_ad(!page_rec_is_metadata(prev)); + + if (type_mode & LOCK_WAIT) + { + ut_ad(lock_trx->lock.wait_lock == lock); + lock->type_mode&= ~LOCK_WAIT; + } + + lock_rec_add_to_queue(type_mode, g.cell2(), new_id, + new_block->page.frame, + rec2_heap_no, lock->index, lock_trx, true); + } + + lock_trx->mutex_unlock(); + } + +#ifdef UNIV_DEBUG + if (page_rec_is_supremum(rec)) + for (auto i= 
lock_rec_get_n_bits(lock); --i > PAGE_HEAP_NO_USER_LOW; ) + ut_ad(!lock_rec_get_nth_bit(lock, i)); +#endif /* UNIV_DEBUG */ + } + } + +#ifdef UNIV_DEBUG_LOCK_VALIDATE + ut_ad(lock_rec_validate_page(block)); +#endif +} + +/*************************************************************//** +Moves the explicit locks on user records to another page if a record +list start is moved to another page. */ +TRANSACTIONAL_TARGET +void +lock_rtr_move_rec_list( +/*===================*/ + const buf_block_t* new_block, /*!< in: index page to + move to */ + const buf_block_t* block, /*!< in: index page */ + rtr_rec_move_t* rec_move, /*!< in: recording records + moved */ + ulint num_move) /*!< in: num of rec to move */ +{ + if (!num_move) + return; + + const ulint comp= page_rec_is_comp(rec_move[0].old_rec); + + ut_ad(block->page.frame == page_align(rec_move[0].old_rec)); + ut_ad(new_block->page.frame == page_align(rec_move[0].new_rec)); + ut_ad(comp == page_rec_is_comp(rec_move[0].new_rec)); + const page_id_t id{block->page.id()}; + const page_id_t new_id{new_block->page.id()}; + + { + /* This would likely be too large for a memory transaction. */ + LockMultiGuard g{lock_sys.rec_hash, id, new_id}; + + for (lock_t *lock= lock_sys_t::get_first(g.cell1(), id); lock; + lock= lock_rec_get_next_on_page(lock)) + { + const rec_t *rec1; + const rec_t *rec2; + const auto type_mode= lock->type_mode; + + /* Copy lock requests on user records to new page and + reset the lock bits on the old */ + + for (ulint moved= 0; moved < num_move; moved++) + { + ulint rec1_heap_no; + ulint rec2_heap_no; + + rec1= rec_move[moved].old_rec; + rec2= rec_move[moved].new_rec; + ut_ad(!page_rec_is_metadata(rec1)); + ut_ad(!page_rec_is_metadata(rec2)); + + if (comp) + { + rec1_heap_no= rec_get_heap_no_new(rec1); + rec2_heap_no= rec_get_heap_no_new(rec2); + } + else + { + rec1_heap_no= rec_get_heap_no_old(rec1); + rec2_heap_no= rec_get_heap_no_old(rec2); + + ut_ad(!memcmp(rec1, rec2, rec_get_data_size_old(rec2))); + } + + trx_t *lock_trx= lock->trx; + lock_trx->mutex_lock(); + + if (rec1_heap_no < lock->un_member.rec_lock.n_bits && + lock_rec_reset_nth_bit(lock, rec1_heap_no)) + { + if (type_mode & LOCK_WAIT) + { + ut_ad(lock_trx->lock.wait_lock == lock); + lock->type_mode&= ~LOCK_WAIT; + } + + lock_rec_add_to_queue(type_mode, g.cell2(), new_id, + new_block->page.frame, + rec2_heap_no, lock->index, lock_trx, true); + + rec_move[moved].moved= true; + } + + lock_trx->mutex_unlock(); + } + } + } + +#ifdef UNIV_DEBUG_LOCK_VALIDATE + ut_ad(lock_rec_validate_page(block)); +#endif +} +/*************************************************************//** +Updates the lock table when a page is split to the right. */ +void +lock_update_split_right( +/*====================*/ + const buf_block_t* right_block, /*!< in: right page */ + const buf_block_t* left_block) /*!< in: left page */ +{ + const ulint h= lock_get_min_heap_no(right_block); + const page_id_t l{left_block->page.id()}; + const page_id_t r{right_block->page.id()}; + + /* This would likely be too large for a memory transaction. 
*/ + LockMultiGuard g{lock_sys.rec_hash, l, r}; + + /* Move the locks on the supremum of the left page to the supremum + of the right page */ + + lock_rec_move(g.cell2(), *right_block, r, g.cell1(), l, + PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM); + + /* Inherit the locks to the supremum of left page from the successor + of the infimum on right page */ + lock_rec_inherit_to_gap<true>(g.cell1(), l, g.cell2(), r, + left_block->page.frame, PAGE_HEAP_NO_SUPREMUM, + h); +} + +void lock_update_node_pointer(const buf_block_t *left_block, + const buf_block_t *right_block) +{ + const ulint h= lock_get_min_heap_no(right_block); + const page_id_t l{left_block->page.id()}; + const page_id_t r{right_block->page.id()}; + LockMultiGuard g{lock_sys.rec_hash, l, r}; + + lock_rec_inherit_to_gap(g.cell2(), r, g.cell1(), l, right_block->page.frame, + h, PAGE_HEAP_NO_SUPREMUM); +} + +#ifdef UNIV_DEBUG +static void lock_assert_no_spatial(const page_id_t id) +{ + const auto id_fold= id.fold(); + auto cell= lock_sys.prdt_page_hash.cell_get(id_fold); + auto latch= lock_sys_t::hash_table::latch(cell); + latch->acquire(); + /* there should exist no page lock on the left page, + otherwise, it will be blocked from merge */ + ut_ad(!lock_sys_t::get_first(*cell, id)); + latch->release(); + cell= lock_sys.prdt_hash.cell_get(id_fold); + latch= lock_sys_t::hash_table::latch(cell); + latch->acquire(); + ut_ad(!lock_sys_t::get_first(*cell, id)); + latch->release(); +} +#endif + +/*************************************************************//** +Updates the lock table when a page is merged to the right. */ +void +lock_update_merge_right( +/*====================*/ + const buf_block_t* right_block, /*!< in: right page to + which merged */ + const rec_t* orig_succ, /*!< in: original + successor of infimum + on the right page + before merge */ + const buf_block_t* left_block) /*!< in: merged index + page which will be + discarded */ +{ + ut_ad(!page_rec_is_metadata(orig_succ)); + + const page_id_t l{left_block->page.id()}; + const page_id_t r{right_block->page.id()}; + /* This would likely be too large for a memory transaction. */ + LockMultiGuard g{lock_sys.rec_hash, l, r}; + + /* Inherit the locks from the supremum of the left page to the + original successor of infimum on the right page, to which the left + page was merged */ + lock_rec_inherit_to_gap(g.cell2(), r, g.cell1(), l, right_block->page.frame, + page_rec_get_heap_no(orig_succ), + PAGE_HEAP_NO_SUPREMUM); + + /* Reset the locks on the supremum of the left page, releasing + waiting transactions */ + lock_rec_reset_and_release_wait(g.cell1(), l, PAGE_HEAP_NO_SUPREMUM); + lock_rec_free_all_from_discard_page(l, g.cell1(), lock_sys.rec_hash); + + ut_d(lock_assert_no_spatial(l)); +} + +/** Update locks when the root page is copied to another in +btr_root_raise_and_insert(). Note that we leave lock structs on the +root page, even though they do not make sense on other than leaf +pages: the reason is that in a pessimistic update the infimum record +of the root page will act as a dummy carrier of the locks of the record +to be updated. */ +void lock_update_root_raise(const buf_block_t &block, const page_id_t root) +{ + const page_id_t id{block.page.id()}; + /* This would likely be too large for a memory transaction.
*/ + LockMultiGuard g{lock_sys.rec_hash, id, root}; + /* Move the locks on the supremum of the root to the supremum of block */ + lock_rec_move(g.cell1(), block, id, g.cell2(), root, + PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM); +} + +/** Update the lock table when a page is copied to another. +@param new_block the target page +@param old old page (not index root page) */ +void lock_update_copy_and_discard(const buf_block_t &new_block, page_id_t old) +{ + const page_id_t id{new_block.page.id()}; + /* This would likely be too large for a memory transaction. */ + LockMultiGuard g{lock_sys.rec_hash, id, old}; + /* Move the locks on the supremum of the old page to the supremum of new */ + lock_rec_move(g.cell1(), new_block, id, g.cell2(), old, + PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM); + lock_rec_free_all_from_discard_page(old, g.cell2(), lock_sys.rec_hash); +} + +/*************************************************************//** +Updates the lock table when a page is split to the left. */ +void +lock_update_split_left( +/*===================*/ + const buf_block_t* right_block, /*!< in: right page */ + const buf_block_t* left_block) /*!< in: left page */ +{ + ulint h= lock_get_min_heap_no(right_block); + const page_id_t l{left_block->page.id()}; + const page_id_t r{right_block->page.id()}; + LockMultiGuard g{lock_sys.rec_hash, l, r}; + /* Inherit the locks to the supremum of the left page from the + successor of the infimum on the right page */ + lock_rec_inherit_to_gap<true>(g.cell1(), l, g.cell2(), r, + left_block->page.frame, PAGE_HEAP_NO_SUPREMUM, + h); +} + +/** Update the lock table when a page is merged to the left. +@param left left page +@param orig_pred original predecessor of supremum on the left page before merge +@param right merged, to-be-discarded right page */ +void lock_update_merge_left(const buf_block_t& left, const rec_t *orig_pred, + const page_id_t right) +{ + ut_ad(left.page.frame == page_align(orig_pred)); + + const page_id_t l{left.page.id()}; + const rec_t *left_next_rec= page_rec_get_next_const(orig_pred); + if (UNIV_UNLIKELY(!left_next_rec)) + { + ut_ad("corrupted page" == 0); + return; + } + + /* This would likely be too large for a memory transaction. */ + LockMultiGuard g{lock_sys.rec_hash, l, right}; + if (!page_rec_is_supremum(left_next_rec)) + { + /* Inherit the locks on the supremum of the left page to the + first record which was moved from the right page */ + lock_rec_inherit_to_gap(g.cell1(), l, g.cell1(), l, left.page.frame, + page_rec_get_heap_no(left_next_rec), + PAGE_HEAP_NO_SUPREMUM); + + /* Reset the locks on the supremum of the left page, + releasing waiting transactions */ + lock_rec_reset_and_release_wait(g.cell1(), l, PAGE_HEAP_NO_SUPREMUM); + } + + /* Move the locks from the supremum of right page to the supremum + of the left page */ + lock_rec_move(g.cell1(), left, l, g.cell2(), right, + PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM); + lock_rec_free_all_from_discard_page(right, g.cell2(), lock_sys.rec_hash); + + /* there should exist no page lock on the right page, + otherwise, it will be blocked from merge */ + ut_d(lock_assert_no_spatial(right)); +} + +/*************************************************************//** +Resets the original locks on heir and replaces them with gap type locks +inherited from rec.
*/ +void +lock_rec_reset_and_inherit_gap_locks( +/*=================================*/ + const buf_block_t& heir_block, /*!< in: block containing the + record which inherits */ + const page_id_t donor, /*!< in: page containing the + record from which inherited; + does NOT reset the locks on + this record */ + ulint heir_heap_no, /*!< in: heap_no of the + inheriting record */ + ulint heap_no) /*!< in: heap_no of the + donating record */ +{ + const page_id_t heir{heir_block.page.id()}; + /* This is a rare operation and likely too large for a memory transaction. */ + LockMultiGuard g{lock_sys.rec_hash, heir, donor}; + lock_rec_reset_and_release_wait(g.cell1(), heir, heir_heap_no); + lock_rec_inherit_to_gap(g.cell1(), heir, g.cell2(), donor, + heir_block.page.frame, heir_heap_no, heap_no); +} + +/*************************************************************//** +Updates the lock table when a page is discarded. */ +void +lock_update_discard( +/*================*/ + const buf_block_t* heir_block, /*!< in: index page + which will inherit the locks */ + ulint heir_heap_no, /*!< in: heap_no of the record + which will inherit the locks */ + const buf_block_t* block) /*!< in: index page + which will be discarded */ +{ + const page_t* page = block->page.frame; + const rec_t* rec; + ulint heap_no; + const page_id_t heir(heir_block->page.id()); + const page_id_t page_id(block->page.id()); + /* This would likely be too large for a memory transaction. */ + LockMultiGuard g{lock_sys.rec_hash, heir, page_id}; + + if (lock_sys_t::get_first(g.cell2(), page_id)) { + ut_d(lock_assert_no_spatial(page_id)); + /* Inherit all the locks on the page to the record and + reset all the locks on the page */ + + if (page_is_comp(page)) { + rec = page + PAGE_NEW_INFIMUM; + + do { + heap_no = rec_get_heap_no_new(rec); + + lock_rec_inherit_to_gap(g.cell1(), heir, + g.cell2(), page_id, + heir_block->page.frame, + heir_heap_no, heap_no); + + lock_rec_reset_and_release_wait( + g.cell2(), page_id, heap_no); + + rec = page + rec_get_next_offs(rec, TRUE); + } while (heap_no != PAGE_HEAP_NO_SUPREMUM); + } else { + rec = page + PAGE_OLD_INFIMUM; + + do { + heap_no = rec_get_heap_no_old(rec); + + lock_rec_inherit_to_gap(g.cell1(), heir, + g.cell2(), page_id, + heir_block->page.frame, + heir_heap_no, heap_no); + + lock_rec_reset_and_release_wait( + g.cell2(), page_id, heap_no); + + rec = page + rec_get_next_offs(rec, FALSE); + } while (heap_no != PAGE_HEAP_NO_SUPREMUM); + } + + lock_rec_free_all_from_discard_page(page_id, g.cell2(), + lock_sys.rec_hash); + } else { + const auto fold = page_id.fold(); + auto cell = lock_sys.prdt_hash.cell_get(fold); + auto latch = lock_sys_t::hash_table::latch(cell); + latch->acquire(); + lock_rec_free_all_from_discard_page(page_id, *cell, + lock_sys.prdt_hash); + latch->release(); + cell = lock_sys.prdt_page_hash.cell_get(fold); + latch = lock_sys_t::hash_table::latch(cell); + latch->acquire(); + lock_rec_free_all_from_discard_page(page_id, *cell, + lock_sys.prdt_page_hash); + latch->release(); + } +} + +/*************************************************************//** +Updates the lock table when a new user record is inserted. 
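+The inserted record inherits, in gap mode, the gap-type locks that the next record carried on the gap into which the insert was made.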
*/ +void +lock_update_insert( +/*===============*/ + const buf_block_t* block, /*!< in: buffer block containing rec */ + const rec_t* rec) /*!< in: the inserted record */ +{ + ulint receiver_heap_no; + ulint donator_heap_no; + + ut_ad(block->page.frame == page_align(rec)); + ut_ad(!page_rec_is_metadata(rec)); + + /* Inherit the gap-locking locks for rec, in gap mode, from the next + record */ + + if (page_rec_is_comp(rec)) { + receiver_heap_no = rec_get_heap_no_new(rec); + rec = page_rec_get_next_low(rec, TRUE); + if (UNIV_UNLIKELY(!rec)) { + return; + } + donator_heap_no = rec_get_heap_no_new(rec); + } else { + receiver_heap_no = rec_get_heap_no_old(rec); + rec = page_rec_get_next_low(rec, FALSE); + if (UNIV_UNLIKELY(!rec)) { + return; + } + donator_heap_no = rec_get_heap_no_old(rec); + } + + lock_rec_inherit_to_gap_if_gap_lock( + block, receiver_heap_no, donator_heap_no); +} + +/*************************************************************//** +Updates the lock table when a record is removed. */ +void +lock_update_delete( +/*===============*/ + const buf_block_t* block, /*!< in: buffer block containing rec */ + const rec_t* rec) /*!< in: the record to be removed */ +{ + const page_t* page = block->page.frame; + ulint heap_no; + ulint next_heap_no; + + ut_ad(page == page_align(rec)); + ut_ad(!page_rec_is_metadata(rec)); + + if (page_is_comp(page)) { + heap_no = rec_get_heap_no_new(rec); + next_heap_no = rec_get_heap_no_new(page + + rec_get_next_offs(rec, + TRUE)); + } else { + heap_no = rec_get_heap_no_old(rec); + next_heap_no = rec_get_heap_no_old(page + + rec_get_next_offs(rec, + FALSE)); + } + + const page_id_t id{block->page.id()}; + LockGuard g{lock_sys.rec_hash, id}; + + /* Let the next record inherit the locks from rec, in gap mode */ + + lock_rec_inherit_to_gap(g.cell(), id, g.cell(), id, block->page.frame, + next_heap_no, heap_no); + + /* Reset the lock bits on rec and release waiting transactions */ + lock_rec_reset_and_release_wait(g.cell(), id, heap_no); +} + +/*********************************************************************//** +Stores on the page infimum record the explicit locks of another record. +This function is used to store the lock state of a record when it is +updated and the size of the record changes in the update. The record +is moved in such an update, perhaps to another page. The infimum record +acts as a dummy carrier record, taking care of lock releases while the +actual record is being moved. */ +void +lock_rec_store_on_page_infimum( +/*===========================*/ + const buf_block_t* block, /*!< in: buffer block containing rec */ + const rec_t* rec) /*!< in: record whose lock state + is stored on the infimum + record of the same page; lock + bits are reset on the + record */ +{ + const ulint heap_no= page_rec_get_heap_no(rec); + + ut_ad(block->page.frame == page_align(rec)); + const page_id_t id{block->page.id()}; +#ifdef ENABLED_DEBUG_SYNC + SCOPE_EXIT([]() { DEBUG_SYNC_C("lock_rec_store_on_page_infimum_end"); }); +#endif + + LockGuard g{lock_sys.rec_hash, id}; + lock_rec_move(g.cell(), *block, id, g.cell(), id, + PAGE_HEAP_NO_INFIMUM, heap_no); +} + +/** Restore the explicit lock requests on a single record, where the +state was stored on the infimum of a page. 
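+This is the inverse of lock_rec_store_on_page_infimum().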
+@param block buffer block containing rec
+@param rec record whose lock state is restored
+@param donator page (rec is not necessarily on this page)
+whose infimum stored the lock state; lock bits are reset on the infimum */
+void lock_rec_restore_from_page_infimum(const buf_block_t &block,
+                                        const rec_t *rec, page_id_t donator)
+{
+  const ulint heap_no= page_rec_get_heap_no(rec);
+  const page_id_t id{block.page.id()};
+  LockMultiGuard g{lock_sys.rec_hash, id, donator};
+  lock_rec_move(g.cell1(), block, id, g.cell2(), donator, heap_no,
+                PAGE_HEAP_NO_INFIMUM);
+}
+
+/*========================= TABLE LOCKS ==============================*/
+
+/**
+Create a table lock, without checking for deadlocks or lock compatibility.
+@param table table on which the lock is created
+@param type_mode lock type and mode
+@param trx transaction
+@param c_lock conflicting lock
+@return the created lock object */
+lock_t *lock_table_create(dict_table_t *table, unsigned type_mode, trx_t *trx,
+                          lock_t *c_lock)
+{
+	lock_t*	lock;
+
+	lock_sys.assert_locked(*table);
+	ut_ad(trx->mutex_is_owner());
+	ut_ad(!trx->is_wsrep() || lock_sys.is_writer());
+	ut_ad(trx->state == TRX_STATE_ACTIVE || trx->is_recovered);
+	ut_ad(!trx->is_autocommit_non_locking());
+	/* During CREATE TABLE, we will write to newly created FTS_*_CONFIG
+	on which no lock has been created yet. */
+	ut_ad(!trx->dict_operation_lock_mode
+	      || (strstr(table->name.m_name, "/FTS_")
+		  && strstr(table->name.m_name, "_CONFIG") + sizeof("_CONFIG")
+		  == table->name.m_name + strlen(table->name.m_name) + 1));
+
+	switch (LOCK_MODE_MASK & type_mode) {
+	case LOCK_AUTO_INC:
+		++table->n_waiting_or_granted_auto_inc_locks;
+		/* For AUTOINC locking we reuse the lock instance only if
+		there is no wait involved else we allocate the waiting lock
+		from the transaction lock heap. */
+		if (type_mode == LOCK_AUTO_INC) {
+			lock = table->autoinc_lock;
+
+			ut_ad(!table->autoinc_trx);
+			table->autoinc_trx = trx;
+
+			ib_vector_push(trx->autoinc_locks, &lock);
+			goto allocated;
+		}
+
+		break;
+	case LOCK_X:
+	case LOCK_S:
+		++table->n_lock_x_or_s;
+		break;
+	}
+
+	lock = trx->lock.table_cached < array_elements(trx->lock.table_pool)
+		? &trx->lock.table_pool[trx->lock.table_cached++]
+		: static_cast<lock_t*>(
+			mem_heap_alloc(trx->lock.lock_heap, sizeof *lock));
+
+allocated:
+	lock->type_mode = ib_uint32_t(type_mode | LOCK_TABLE);
+	lock->trx = trx;
+
+	lock->un_member.tab_lock.table = table;
+
+	ut_ad(table->get_ref_count() > 0 || !table->can_be_evicted);
+
+	UT_LIST_ADD_LAST(trx->lock.trx_locks, lock);
+
+	ut_list_append(table->locks, lock, TableLockGetNode());
+
+	if (type_mode & LOCK_WAIT) {
+		if (trx->lock.wait_trx) {
+			ut_ad(!c_lock || trx->lock.wait_trx == c_lock->trx);
+			ut_ad(trx->lock.wait_lock);
+			ut_ad((*trx->lock.wait_lock).trx == trx);
+		} else {
+			ut_ad(c_lock);
+			trx->lock.wait_trx = c_lock->trx;
+			ut_ad(!trx->lock.wait_lock);
+		}
+		trx->lock.wait_lock = lock;
+	}
+
+	lock->trx->lock.table_locks.push_back(lock);
+
+	MONITOR_INC(MONITOR_TABLELOCK_CREATED);
+	MONITOR_INC(MONITOR_NUM_TABLELOCK);
+
+	return(lock);
+}
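[Editor's note: illustrative sketch.] lock_table_create() above prefers a small
per-transaction pool of preallocated lock objects and falls back to the
transaction's lock heap only when the pool is exhausted. The standalone C++
sketch below models that allocation pattern under stated assumptions; the
names Trx, TableLock and kPoolSize are hypothetical and not InnoDB API.

#include <array>
#include <deque>
#include <cstdio>

struct TableLock { unsigned type_mode= 0; };

struct Trx
{
  static constexpr size_t kPoolSize= 8;     // plays the role of trx_t::lock.table_pool
  std::array<TableLock, kPoolSize> pool;    // cached, reusable lock objects
  size_t cached= 0;                         // plays the role of trx_t::lock.table_cached
  std::deque<TableLock> heap;               // stands in for the transaction lock heap

  TableLock *alloc_table_lock()
  {
    if (cached < kPoolSize)
      return &pool[cached++];               // fast path: no allocation
    heap.emplace_back();                    // slow path: allocate from the heap
    return &heap.back();                    // deque growth keeps pointers stable
  }
};

int main()
{
  Trx trx;
  for (int i= 0; i < 10; i++)
    std::printf("lock %d at %p\n", i, (void*) trx.alloc_table_lock());
}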
+
+/*************************************************************//**
+Pops autoinc lock requests from the transaction's autoinc_locks. We
+handle the case where there are gaps in the array and they need to
+be popped off the stack. */
+UNIV_INLINE
+void
+lock_table_pop_autoinc_locks(
+/*=========================*/
+	trx_t*	trx)	/*!< in/out: transaction that owns the AUTOINC locks */
+{
+	ut_ad(!ib_vector_is_empty(trx->autoinc_locks));
+
+	/* Skip any gaps, gaps are NULL lock entries in the
+	trx->autoinc_locks vector. */
+
+	do {
+		ib_vector_pop(trx->autoinc_locks);
+
+		if (ib_vector_is_empty(trx->autoinc_locks)) {
+			return;
+		}
+
+	} while (*(lock_t**) ib_vector_get_last(trx->autoinc_locks) == NULL);
+}
+
+/*************************************************************//**
+Removes an autoinc lock request from the transaction's autoinc_locks. */
+UNIV_INLINE
+void
+lock_table_remove_autoinc_lock(
+/*===========================*/
+	lock_t*	lock,	/*!< in: table lock */
+	trx_t*	trx)	/*!< in/out: transaction that owns the lock */
+{
+	ut_ad(lock->type_mode == (LOCK_AUTO_INC | LOCK_TABLE));
+	lock_sys.assert_locked(*lock->un_member.tab_lock.table);
+	ut_ad(trx->mutex_is_owner());
+
+	auto s = ib_vector_size(trx->autoinc_locks);
+	ut_ad(s);
+
+	/* With stored functions and procedures the user may drop
+	a table within the same "statement". This special case has
+	to be handled by deleting only those AUTOINC locks that were
+	held by the table being dropped. */
+
+	lock_t*	autoinc_lock = *static_cast<lock_t**>(
+		ib_vector_get(trx->autoinc_locks, --s));
+
+	/* This is the default fast case. */
+
+	if (autoinc_lock == lock) {
+		lock_table_pop_autoinc_locks(trx);
+	} else {
+		/* The last element should never be NULL */
+		ut_a(autoinc_lock != NULL);
+
+		/* Handle freeing the locks from within the stack. */
+
+		while (s) {
+			autoinc_lock = *static_cast<lock_t**>(
+				ib_vector_get(trx->autoinc_locks, --s));
+
+			if (autoinc_lock == lock) {
+				void*	null_var = NULL;
+				ib_vector_set(trx->autoinc_locks, s, &null_var);
+				return;
+			}
+		}
+
+		/* Must find the autoinc lock. */
+		ut_error;
+	}
+}
+
+/*************************************************************//**
+Removes a table lock request from the queue and the trx list of locks;
+this is a low-level function which does NOT check if waiting requests
+can now be granted. */
+UNIV_INLINE
+const dict_table_t*
+lock_table_remove_low(
+/*==================*/
+	lock_t*	lock)	/*!< in/out: table lock */
+{
+	ut_ad(lock->is_table());
+
+	trx_t*		trx;
+	dict_table_t*	table;
+
+	ut_ad(lock->is_table());
+	trx = lock->trx;
+	table = lock->un_member.tab_lock.table;
+	lock_sys.assert_locked(*table);
+	ut_ad(trx->mutex_is_owner());
+
+	/* Remove the table from the transaction's AUTOINC vector, if
+	the lock that is being released is an AUTOINC lock. */
+	switch (lock->mode()) {
+	case LOCK_AUTO_INC:
+		ut_ad((table->autoinc_trx == trx) == !lock->is_waiting());
+
+		if (table->autoinc_trx == trx) {
+			table->autoinc_trx = NULL;
+			/* The locks must be freed in the reverse order from
+			the one in which they were acquired. This is to avoid
+			traversing the AUTOINC lock vector unnecessarily.
+
+			We only store locks that were granted in the
+			trx->autoinc_locks vector (see lock_table_create()
+			and lock_grant()).
*/ + lock_table_remove_autoinc_lock(lock, trx); + } + + ut_ad(table->n_waiting_or_granted_auto_inc_locks); + --table->n_waiting_or_granted_auto_inc_locks; + break; + case LOCK_X: + case LOCK_S: + ut_ad(table->n_lock_x_or_s); + --table->n_lock_x_or_s; + break; + default: + break; + } + + UT_LIST_REMOVE(trx->lock.trx_locks, lock); + ut_list_remove(table->locks, lock, TableLockGetNode()); + + MONITOR_INC(MONITOR_TABLELOCK_REMOVED); + MONITOR_DEC(MONITOR_NUM_TABLELOCK); + return table; +} + +/*********************************************************************//** +Enqueues a waiting request for a table lock which cannot be granted +immediately. Checks for deadlocks. +@retval DB_LOCK_WAIT if the waiting lock was enqueued +@retval DB_DEADLOCK if this transaction was chosen as the victim */ +static +dberr_t +lock_table_enqueue_waiting( +/*=======================*/ + unsigned mode, /*!< in: lock mode this transaction is + requesting */ + dict_table_t* table, /*!< in/out: table */ + que_thr_t* thr, /*!< in: query thread */ + lock_t* c_lock) /*!< in: conflicting lock or NULL */ +{ + lock_sys.assert_locked(*table); + ut_ad(!srv_read_only_mode); + + trx_t* trx = thr_get_trx(thr); + ut_ad(trx->mutex_is_owner()); + ut_ad(!trx->dict_operation_lock_mode); + + /* Enqueue the lock request that will wait to be granted */ + lock_table_create(table, mode | LOCK_WAIT, trx, c_lock); + + trx->lock.wait_thr = thr; + /* Apart from Galera, only transactions that have waiting lock + may be chosen as deadlock victims. Only one lock can be waited for at a + time, and a transaction is associated with a single thread. That is why + there must not be waiting lock requests if the transaction is deadlock + victim and it is not WSREP. Galera transaction abort can be invoked + from MDL acquisition code when the transaction does not have waiting + lock, that's why we check only deadlock victim bit here. */ + ut_ad(!(trx->lock.was_chosen_as_deadlock_victim & 1)); + + MONITOR_INC(MONITOR_TABLELOCK_WAIT); + return(DB_LOCK_WAIT); +} + +/*********************************************************************//** +Checks if other transactions have an incompatible mode lock request in +the lock queue. 
+@return lock or NULL */
+UNIV_INLINE
+lock_t*
+lock_table_other_has_incompatible(
+/*==============================*/
+	const trx_t*		trx,	/*!< in: transaction, or NULL if all
+					transactions should be included */
+	ulint			wait,	/*!< in: LOCK_WAIT if also
+					waiting locks are taken into
+					account, or 0 if not */
+	const dict_table_t*	table,	/*!< in: table */
+	lock_mode		mode)	/*!< in: lock mode */
+{
+	lock_sys.assert_locked(*table);
+
+	static_assert(LOCK_IS == 0, "compatibility");
+	static_assert(LOCK_IX == 1, "compatibility");
+
+	if (UNIV_LIKELY(mode <= LOCK_IX && !table->n_lock_x_or_s)) {
+		return(NULL);
+	}
+
+	for (lock_t* lock = UT_LIST_GET_LAST(table->locks);
+	     lock;
+	     lock = UT_LIST_GET_PREV(un_member.tab_lock.locks, lock)) {
+
+		trx_t* lock_trx = lock->trx;
+
+		if (lock_trx != trx
+		    && !lock_mode_compatible(lock->mode(), mode)
+		    && (wait || !lock->is_waiting())) {
+			return(lock);
+		}
+	}
+
+	return(NULL);
+}
+
+/** Acquire or enqueue a table lock */
+static dberr_t lock_table_low(dict_table_t *table, lock_mode mode,
+                              que_thr_t *thr, trx_t *trx)
+{
+  DBUG_EXECUTE_IF("innodb_table_deadlock", return DB_DEADLOCK;);
+  lock_t *wait_for=
+    lock_table_other_has_incompatible(trx, LOCK_WAIT, table, mode);
+  dberr_t err= DB_SUCCESS;
+
+  trx->mutex_lock();
+
+  if (wait_for)
+    err= lock_table_enqueue_waiting(mode, table, thr, wait_for);
+  else
+    lock_table_create(table, mode, trx, nullptr);
+
+  trx->mutex_unlock();
+
+  return err;
+}
+
+#ifdef WITH_WSREP
+/** Acquire or enqueue a table lock in Galera replication mode. */
+ATTRIBUTE_NOINLINE
+static dberr_t lock_table_wsrep(dict_table_t *table, lock_mode mode,
+                                que_thr_t *thr, trx_t *trx)
+{
+  LockMutexGuard g{SRW_LOCK_CALL};
+  return lock_table_low(table, mode, thr, trx);
+}
+#endif
+
+/** Acquire a table lock.
+@param table table to be locked
+@param fktable pointer to table, in case of a FOREIGN key check
+@param mode lock mode
+@param thr SQL execution thread
+@retval DB_SUCCESS if the lock was acquired
+@retval DB_DEADLOCK if a deadlock occurred, or fktable && *fktable != table
+@retval DB_LOCK_WAIT if lock_wait() must be invoked */
+dberr_t lock_table(dict_table_t *table, dict_table_t *const*fktable,
+                   lock_mode mode, que_thr_t *thr)
+{
+  ut_ad(table);
+
+  if (!fktable && table->is_temporary())
+    return DB_SUCCESS;
+
+  ut_ad(fktable || table->get_ref_count() || !table->can_be_evicted);
+
+  trx_t *trx= thr_get_trx(thr);
+
+  /* Look for equal or stronger locks the same trx already has on the
+  table. No need to acquire LockMutexGuard here because only the
+  thread that is executing a transaction can access trx_t::table_locks. */
+  if (lock_table_has(trx, table, mode) || srv_read_only_mode)
+    return DB_SUCCESS;
+
+  if ((mode == LOCK_IX || mode == LOCK_X) &&
+      !trx->read_only && !trx->rsegs.m_redo.rseg)
+    trx_set_rw_mode(trx);
+
+#ifdef WITH_WSREP
+  if (trx->is_wsrep())
+    return lock_table_wsrep(table, mode, thr, trx);
+#endif
+  lock_sys.rd_lock(SRW_LOCK_CALL);
+  dberr_t err;
+  if (fktable != nullptr && *fktable != table)
+    err= DB_DEADLOCK;
+  else
+  {
+    table->lock_mutex_lock();
+    err= lock_table_low(table, mode, thr, trx);
+    table->lock_mutex_unlock();
+  }
+  lock_sys.rd_unlock();
+
+  return err;
+}
+
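[Editor's note: illustrative sketch.] lock_table() above takes the global
lock_sys latch in shared (read) mode and serializes only on the per-table
lock_mutex of the queue it modifies, so unrelated tables can be locked
concurrently. A minimal standalone C++ model of that latching order follows;
LockSys and Table are hypothetical stand-ins, not InnoDB types.

#include <mutex>
#include <shared_mutex>

struct LockSys { std::shared_mutex latch; };    // global lock_sys latch
struct Table   { std::mutex lock_mutex; };      // per-table queue mutex

static LockSys lock_sys_model;

void lock_table_sketch(Table &table)
{
  // Latching order: global latch (shared) first, then the table mutex.
  std::shared_lock<std::shared_mutex> g(lock_sys_model.latch);
  std::lock_guard<std::mutex> t(table.lock_mutex);
  // ... inspect and modify this table's lock queue here ...
}

int main() { Table t; lock_table_sketch(t); }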
+/** Create a table lock object for a resurrected transaction.
+@param table table to be X-locked
+@param trx transaction
+@param mode LOCK_X or LOCK_IX */
+void lock_table_resurrect(dict_table_t *table, trx_t *trx, lock_mode mode)
+{
+  ut_ad(trx->is_recovered);
+  ut_ad(mode == LOCK_X || mode == LOCK_IX);
+
+  if (lock_table_has(trx, table, mode))
+    return;
+
+  {
+    /* This is executed at server startup while no connections
+    are allowed. Do not bother with lock elision. */
+    LockMutexGuard g{SRW_LOCK_CALL};
+    ut_ad(!lock_table_other_has_incompatible(trx, LOCK_WAIT, table, mode));
+
+    trx->mutex_lock();
+    lock_table_create(table, mode, trx);
+  }
+  trx->mutex_unlock();
+}
+
+/** Find a lock that a waiting table lock request still has to wait for. */
+static const lock_t *lock_table_has_to_wait_in_queue(const lock_t *wait_lock)
+{
+  ut_ad(wait_lock->is_waiting());
+  ut_ad(wait_lock->is_table());
+
+  dict_table_t *table= wait_lock->un_member.tab_lock.table;
+  lock_sys.assert_locked(*table);
+
+  static_assert(LOCK_IS == 0, "compatibility");
+  static_assert(LOCK_IX == 1, "compatibility");
+
+  if (UNIV_LIKELY(wait_lock->mode() <= LOCK_IX && !table->n_lock_x_or_s))
+    return nullptr;
+
+  for (const lock_t *lock= UT_LIST_GET_FIRST(table->locks); lock != wait_lock;
+       lock= UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock))
+    if (lock_has_to_wait(wait_lock, lock))
+      return lock;
+
+  return nullptr;
+}
+
+/*************************************************************//**
+Removes a table lock request, waiting or granted, from the queue and grants
+locks to other transactions in the queue, if they now are entitled to a
+lock.
+@param[in,out]	in_lock		table lock
+@param[in]	owns_wait_mutex	whether lock_sys.wait_mutex is held */
+static void lock_table_dequeue(lock_t *in_lock, bool owns_wait_mutex)
+{
+#ifdef SAFE_MUTEX
+	ut_ad(owns_wait_mutex == mysql_mutex_is_owner(&lock_sys.wait_mutex));
+#endif
+	ut_ad(in_lock->trx->mutex_is_owner());
+	lock_t*	lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, in_lock);
+
+	const dict_table_t* table = lock_table_remove_low(in_lock);
+
+	static_assert(LOCK_IS == 0, "compatibility");
+	static_assert(LOCK_IX == 1, "compatibility");
+
+	if (UNIV_LIKELY(in_lock->mode() <= LOCK_IX && !table->n_lock_x_or_s)) {
+		return;
+	}
+
+	bool acquired = false;
+
+	/* Check if waiting locks in the queue can now be granted: grant
+	locks if there are no conflicting locks ahead. */
+
+	for (/* No op */;
+	     lock != NULL;
+	     lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock)) {
+		if (!lock->is_waiting()) {
+			continue;
+		}
+
+		if (!owns_wait_mutex) {
+			mysql_mutex_lock(&lock_sys.wait_mutex);
+			acquired = owns_wait_mutex = true;
+		}
+
+		ut_ad(lock->trx->lock.wait_trx);
+		ut_ad(lock->trx->lock.wait_lock);
+
+		if (const lock_t* c = lock_table_has_to_wait_in_queue(lock)) {
+			trx_t* c_trx = c->trx;
+			lock->trx->lock.wait_trx = c_trx;
+			if (c_trx->lock.wait_trx
+			    && innodb_deadlock_detect
+			    && Deadlock::to_check.emplace(c_trx).second) {
+				Deadlock::to_be_checked = true;
+			}
+		} else {
+			/* Grant the lock */
+			ut_ad(in_lock->trx != lock->trx);
+			in_lock->trx->mutex_unlock();
+			lock_grant(lock);
+			in_lock->trx->mutex_lock();
+		}
+	}
+
+	if (acquired) {
+		mysql_mutex_unlock(&lock_sys.wait_mutex);
+	}
+}
+
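[Editor's note: illustrative sketch.] After removing a table lock,
lock_table_dequeue() above walks the remaining queue and grants each waiting
request that no longer conflicts with any lock ahead of it. The toy below
models that re-grant scan with a simplified S/X compatibility check; all
names are hypothetical, not InnoDB API.

#include <vector>
#include <cstdio>

enum Mode { S, X };
struct Lk { Mode mode; bool waiting; };

static bool compatible(Mode a, Mode b) { return a == S && b == S; }

// Grant every waiter that has no conflicting lock earlier in the queue.
static void regrant(std::vector<Lk> &queue)
{
  for (size_t i= 0; i < queue.size(); i++)
  {
    if (!queue[i].waiting) continue;
    bool conflict= false;
    for (size_t j= 0; j < i && !conflict; j++)
      conflict= !compatible(queue[j].mode, queue[i].mode);
    if (!conflict)
      queue[i].waiting= false;              // grant the lock
  }
}

int main()
{
  std::vector<Lk> q{{S, false}, {S, true}, {X, true}};
  regrant(q);   // the S waiter is granted; the X waiter keeps waiting
  for (const Lk &l : q)
    std::printf("%s%s\n", l.mode == S ? "S" : "X",
                l.waiting ? " (waiting)" : "");
}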
+/** Sets a lock on a table based on the given mode.
+@param table table to lock
+@param trx transaction
+@param mode LOCK_X or LOCK_S
+@param no_wait whether to skip handling DB_LOCK_WAIT
+@return error code */
+dberr_t lock_table_for_trx(dict_table_t *table, trx_t *trx, lock_mode mode,
+                           bool no_wait)
+{
+  mem_heap_t *heap= mem_heap_create(512);
+  sel_node_t *node= sel_node_create(heap);
+  que_thr_t *thr= pars_complete_graph_for_exec(node, trx, heap, nullptr);
+  thr->graph->state= QUE_FORK_ACTIVE;
+
+  thr= static_cast<que_thr_t*>
+    (que_fork_get_first_thr(static_cast<que_fork_t*>
+                            (que_node_get_parent(thr))));
+
+run_again:
+  thr->run_node= thr;
+  thr->prev_node= thr->common.parent;
+  dberr_t err= lock_table(table, nullptr, mode, thr);
+
+  switch (err) {
+  case DB_SUCCESS:
+    break;
+  case DB_LOCK_WAIT:
+    if (no_wait)
+    {
+      lock_sys.cancel_lock_wait_for_trx(trx);
+      break;
+    }
+    /* fall through */
+  default:
+    trx->error_state= err;
+    if (row_mysql_handle_errors(&err, trx, thr, nullptr))
+      goto run_again;
+  }
+
+  que_graph_free(thr->graph);
+  trx->op_info= "";
+
+  return err;
+}
+
+/** Exclusively lock the data dictionary tables.
+@param trx dictionary transaction
+@return error code
+@retval DB_SUCCESS on success */
+dberr_t lock_sys_tables(trx_t *trx)
+{
+  dberr_t err;
+  if (!(err= lock_table_for_trx(dict_sys.sys_tables, trx, LOCK_X)) &&
+      !(err= lock_table_for_trx(dict_sys.sys_columns, trx, LOCK_X)) &&
+      !(err= lock_table_for_trx(dict_sys.sys_indexes, trx, LOCK_X)) &&
+      !(err= lock_table_for_trx(dict_sys.sys_fields, trx, LOCK_X)))
+  {
+    if (dict_sys.sys_foreign)
+      err= lock_table_for_trx(dict_sys.sys_foreign, trx, LOCK_X);
+    if (!err && dict_sys.sys_foreign_cols)
+      err= lock_table_for_trx(dict_sys.sys_foreign_cols, trx, LOCK_X);
+    if (!err && dict_sys.sys_virtual)
+      err= lock_table_for_trx(dict_sys.sys_virtual, trx, LOCK_X);
+  }
+  return err;
+}
+
+/** Rebuild waiting queue after first_lock for heap_no. The queue is rebuilt
+close to the way lock_rec_dequeue_from_page() does it.
+@param trx transaction that has set a lock, which caused the queue
+           rebuild
+@param cell rec hash cell of first_lock
+@param first_lock the lock after which waiting queue will be rebuilt
+@param heap_no heap no of the record for which waiting queue to rebuild */
+static void lock_rec_rebuild_waiting_queue(
+#if defined(UNIV_DEBUG) || !defined(DBUG_OFF)
+  trx_t *trx,
+#endif /* defined(UNIV_DEBUG) || !defined(DBUG_OFF) */
+  hash_cell_t &cell, lock_t *first_lock, ulint heap_no)
+{
+  lock_sys.assert_locked(cell);
+
+  for (lock_t *lock= first_lock; lock != NULL;
+       lock= lock_rec_get_next(heap_no, lock))
+  {
+    if (!lock->is_waiting())
+      continue;
+    mysql_mutex_lock(&lock_sys.wait_mutex);
+    ut_ad(lock->trx->lock.wait_trx);
+    ut_ad(lock->trx->lock.wait_lock);
+
+    if (const lock_t *c= lock_rec_has_to_wait_in_queue(cell, lock))
+      lock->trx->lock.wait_trx= c->trx;
+    else
+    {
+      /* Grant the lock */
+      ut_ad(trx != lock->trx);
+      lock_grant(lock);
+    }
+    mysql_mutex_unlock(&lock_sys.wait_mutex);
+  }
+}
+
+/*=========================== LOCK RELEASE ==============================*/
+
+/*************************************************************//**
+Removes a granted record lock of a transaction from the queue and grants
+locks to other transactions waiting in the queue if they now are entitled
+to a lock.
*/ +TRANSACTIONAL_TARGET +void +lock_rec_unlock( +/*============*/ + trx_t* trx, /*!< in/out: transaction that has + set a record lock */ + const page_id_t id, /*!< in: page containing rec */ + const rec_t* rec, /*!< in: record */ + lock_mode lock_mode)/*!< in: LOCK_S or LOCK_X */ +{ + lock_t* first_lock; + lock_t* lock; + ulint heap_no; + + ut_ad(trx); + ut_ad(rec); + ut_ad(!trx->lock.wait_lock); + ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE)); + ut_ad(!page_rec_is_metadata(rec)); + + heap_no = page_rec_get_heap_no(rec); + + LockGuard g{lock_sys.rec_hash, id}; + + first_lock = lock_sys_t::get_first(g.cell(), id, heap_no); + + /* Find the last lock with the same lock_mode and transaction + on the record. */ + + for (lock = first_lock; lock != NULL; + lock = lock_rec_get_next(heap_no, lock)) { + if (lock->trx == trx && lock->mode() == lock_mode) { + goto released; + } + } + + { + ib::error err; + err << "Unlock row could not find a " << lock_mode + << " mode lock on the record. Current statement: "; + size_t stmt_len; + if (const char* stmt = innobase_get_stmt_unsafe( + trx->mysql_thd, &stmt_len)) { + err.write(stmt, stmt_len); + } + } + + return; + +released: + ut_a(!lock->is_waiting()); + { + TMTrxGuard tg{*trx}; + lock_rec_reset_nth_bit(lock, heap_no); + } + + /* Check if we can now grant waiting lock requests */ + lock_rec_rebuild_waiting_queue( +#if defined(UNIV_DEBUG) || !defined(DBUG_OFF) + trx, +#endif /* defined(UNIV_DEBUG) || !defined(DBUG_OFF) */ + g.cell(), first_lock, heap_no); +} + +/** Release the explicit locks of a committing transaction, +and release possible other transactions waiting because of these locks. +@return whether the operation succeeded */ +TRANSACTIONAL_TARGET static bool lock_release_try(trx_t *trx) +{ + /* At this point, trx->lock.trx_locks cannot be modified by other + threads, because our transaction has been committed. + See the checks and assertions in lock_rec_create_low() and + lock_rec_add_to_queue(). + + The function lock_table_create() should never be invoked on behalf + of a transaction running in another thread. Also there, we will + assert that the current transaction be active. */ + DBUG_ASSERT(trx->state == TRX_STATE_COMMITTED_IN_MEMORY); + DBUG_ASSERT(!trx->is_referenced()); + + bool all_released= true; +restart: + ulint count= 1000; + /* We will not attempt hardware lock elision (memory transaction) + here. Both lock_rec_dequeue_from_page() and lock_table_dequeue() + would likely lead to a memory transaction due to a system call, to + wake up a waiting transaction. */ + lock_sys.rd_lock(SRW_LOCK_CALL); + trx->mutex_lock(); + + /* Note: Anywhere else, trx->mutex is not held while acquiring + a lock table latch, but here we are following the opposite order. + To avoid deadlocks, we only try to acquire the lock table latches + but not keep waiting for them. 
*/
+
+  for (lock_t *lock= UT_LIST_GET_LAST(trx->lock.trx_locks); lock; )
+  {
+    ut_ad(lock->trx == trx);
+    lock_t *prev= UT_LIST_GET_PREV(trx_locks, lock);
+    if (!lock->is_table())
+    {
+      ut_ad(!lock->index->table->is_temporary());
+      ut_ad(lock->mode() != LOCK_X ||
+            lock->index->table->id >= DICT_HDR_FIRST_ID ||
+            trx->dict_operation || trx->was_dict_operation);
+      auto &lock_hash= lock_sys.hash_get(lock->type_mode);
+      auto cell= lock_hash.cell_get(lock->un_member.rec_lock.page_id.fold());
+      auto latch= lock_sys_t::hash_table::latch(cell);
+      if (!latch->try_acquire())
+        all_released= false;
+      else
+      {
+        lock_rec_dequeue_from_page(lock, false);
+        latch->release();
+      }
+    }
+    else
+    {
+      dict_table_t *table= lock->un_member.tab_lock.table;
+      ut_ad(!table->is_temporary());
+      ut_ad(table->id >= DICT_HDR_FIRST_ID ||
+            (lock->mode() != LOCK_IX && lock->mode() != LOCK_X) ||
+            trx->dict_operation || trx->was_dict_operation);
+      if (!table->lock_mutex_trylock())
+        all_released= false;
+      else
+      {
+        lock_table_dequeue(lock, false);
+        table->lock_mutex_unlock();
+      }
+    }
+
+    lock= all_released ? UT_LIST_GET_LAST(trx->lock.trx_locks) : prev;
+    if (!--count)
+      break;
+  }
+
+  lock_sys.rd_unlock();
+  trx->mutex_unlock();
+  if (all_released && !count)
+    goto restart;
+  return all_released;
+}
+
+/** Release the explicit locks of a committing transaction,
+and release possible other transactions waiting because of these locks. */
+void lock_release(trx_t *trx)
+{
+#ifdef UNIV_DEBUG
+  std::set<table_id_t> to_evict;
+  if (innodb_evict_tables_on_commit_debug &&
+      !trx->is_recovered && !dict_sys.locked())
+    for (const auto& p : trx->mod_tables)
+      if (!p.first->is_temporary())
+        to_evict.emplace(p.first->id);
+#endif
+  ulint count;
+
+  for (count= 5; count--; )
+    if (lock_release_try(trx))
+      goto released;
+
+  /* Fall back to acquiring lock_sys.latch in exclusive mode */
+restart:
+  count= 1000;
+  /* There is probably no point to try lock elision here;
+  in lock_release_try() it is different. */
+  lock_sys.wr_lock(SRW_LOCK_CALL);
+  trx->mutex_lock();
+
+  while (lock_t *lock= UT_LIST_GET_LAST(trx->lock.trx_locks))
+  {
+    ut_ad(lock->trx == trx);
+    if (!lock->is_table())
+    {
+      ut_ad(!lock->index->table->is_temporary());
+      ut_ad(lock->mode() != LOCK_X ||
+            lock->index->table->id >= DICT_HDR_FIRST_ID ||
+            trx->dict_operation || trx->was_dict_operation);
+      lock_rec_dequeue_from_page(lock, false);
+    }
+    else
+    {
+      ut_d(dict_table_t *table= lock->un_member.tab_lock.table);
+      ut_ad(!table->is_temporary());
+      ut_ad(table->id >= DICT_HDR_FIRST_ID ||
+            (lock->mode() != LOCK_IX && lock->mode() != LOCK_X) ||
+            trx->dict_operation || trx->was_dict_operation);
+      lock_table_dequeue(lock, false);
+    }
+
+    if (!--count)
+      break;
+  }
+
+  lock_sys.wr_unlock();
+  trx->mutex_unlock();
+  if (!count)
+    goto restart;
+
+released:
+  if (UNIV_UNLIKELY(Deadlock::to_be_checked))
+  {
+    mysql_mutex_lock(&lock_sys.wait_mutex);
+    lock_sys.deadlock_check();
+    mysql_mutex_unlock(&lock_sys.wait_mutex);
+  }
+
+  trx->lock.n_rec_locks= 0;
+
+#ifdef UNIV_DEBUG
+  if (to_evict.empty())
+    return;
+  dict_sys.lock(SRW_LOCK_CALL);
+  LockMutexGuard g{SRW_LOCK_CALL};
+  for (const table_id_t id : to_evict)
+    if (dict_table_t *table= dict_sys.find_table(id))
+      if (!table->get_ref_count() && !UT_LIST_GET_LEN(table->locks))
+        dict_sys.remove(table, true);
+  dict_sys.unlock();
+#endif
+}
+
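[Editor's note: illustrative sketch.] lock_release() above first makes up to
five optimistic passes that only try-acquire the per-queue latches and bound
each pass at 1000 locks; only if that fails does it retry under the exclusive
lock_sys latch, where nothing can be skipped. A compact standalone C++ model
of this try-then-fallback policy follows, with hypothetical names throughout.

#include <mutex>
#include <vector>
#include <cstdio>

// One optimistic pass: release whatever can be latched without waiting.
static bool release_try(std::vector<std::mutex*> &latches)
{
  bool all_released= true;
  for (size_t i= 0; i < latches.size(); )
  {
    if (latches[i]->try_lock())
    {
      latches[i]->unlock();                  // "release the lock"
      latches.erase(latches.begin() + i);
    }
    else
    {
      all_released= false;                   // skip; revisit in a later pass
      ++i;
    }
  }
  return all_released;
}

static void release(std::vector<std::mutex*> &latches)
{
  for (int pass= 0; pass < 5; pass++)
    if (release_try(latches))
      return;
  // Fallback: block on each latch (models the exclusive lock_sys latch).
  for (std::mutex *m : latches) { m->lock(); m->unlock(); }
  latches.clear();
}

int main()
{
  std::mutex a, b;
  std::vector<std::mutex*> v{&a, &b};
  release(v);
  std::printf("remaining: %zu\n", v.size());
}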
+/** Release the explicit locks of a committing transaction while
+dict_sys.latch is exclusively locked,
+and release possible other transactions waiting because of these locks. */
+void lock_release_on_drop(trx_t *trx)
+{
+  ut_ad(lock_sys.is_writer());
+  ut_ad(trx->mutex_is_owner());
+  ut_ad(trx->dict_operation);
+
+  while (lock_t *lock= UT_LIST_GET_LAST(trx->lock.trx_locks))
+  {
+    ut_ad(lock->trx == trx);
+    if (!lock->is_table())
+    {
+      ut_ad(!lock->index->table->is_temporary());
+      ut_ad(lock->mode() != LOCK_X ||
+            lock->index->table->id >= DICT_HDR_FIRST_ID ||
+            trx->dict_operation);
+      lock_rec_dequeue_from_page(lock, false);
+    }
+    else
+    {
+      ut_d(dict_table_t *table= lock->un_member.tab_lock.table);
+      ut_ad(!table->is_temporary());
+      ut_ad(table->id >= DICT_HDR_FIRST_ID ||
+            (lock->mode() != LOCK_IX && lock->mode() != LOCK_X) ||
+            trx->dict_operation);
+      lock_table_dequeue(lock, false);
+    }
+  }
+}
+
+/** Reset lock bit for supremum and rebuild waiting queue.
+@param cell rec hash cell of in_lock
+@param lock the lock with supremum bit set */
+static void lock_rec_unlock_supremum(hash_cell_t &cell, lock_t *lock)
+{
+  ut_ad(lock_rec_get_nth_bit(lock, PAGE_HEAP_NO_SUPREMUM));
+#ifdef SAFE_MUTEX
+  ut_ad(!mysql_mutex_is_owner(&lock_sys.wait_mutex));
+#endif /* SAFE_MUTEX */
+  ut_ad(!lock->is_table());
+  ut_ad(lock_sys.is_writer() || lock->trx->mutex_is_owner());
+
+  lock_rec_reset_nth_bit(lock, PAGE_HEAP_NO_SUPREMUM);
+
+  lock_t *first_lock= lock_sys_t::get_first(
+    cell, lock->un_member.rec_lock.page_id, PAGE_HEAP_NO_SUPREMUM);
+
+  lock_rec_rebuild_waiting_queue(
+#if defined(UNIV_DEBUG) || !defined(DBUG_OFF)
+    lock->trx,
+#endif /* defined(UNIV_DEBUG) || !defined(DBUG_OFF) */
+    cell, first_lock, PAGE_HEAP_NO_SUPREMUM);
+}
+
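[Editor's note: illustrative sketch.] A record lock structure covers a whole
page and marks individual records by heap number in a bitmap; releasing the
supremum lock above amounts to clearing bit PAGE_HEAP_NO_SUPREMUM (1) and
re-scanning the queue. A minimal standalone bitmap model, with hypothetical
names:

#include <cstdint>
#include <vector>
#include <cstdio>

constexpr unsigned PAGE_HEAP_NO_INFIMUM= 0;
constexpr unsigned PAGE_HEAP_NO_SUPREMUM= 1;

struct RecLockBitmap
{
  std::vector<uint8_t> bits;
  explicit RecLockBitmap(unsigned n_bits) : bits((n_bits + 7) / 8) {}
  bool get(unsigned i) const { return bits[i / 8] >> (i % 8) & 1; }
  void set(unsigned i) { bits[i / 8] |= uint8_t(1 << (i % 8)); }
  void reset(unsigned i) { bits[i / 8] &= uint8_t(~(1 << (i % 8))); }
};

int main()
{
  RecLockBitmap lock(64);
  lock.set(PAGE_HEAP_NO_SUPREMUM);          // lock the supremum
  lock.set(2);                              // and one user record
  lock.reset(PAGE_HEAP_NO_SUPREMUM);        // models lock_rec_unlock_supremum()
  std::printf("supremum=%d rec2=%d\n",
              lock.get(PAGE_HEAP_NO_SUPREMUM), lock.get(2));
}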
+/** Release non-exclusive locks on XA PREPARE,
+and wake up possible other transactions waiting because of these locks.
+@param trx transaction in XA PREPARE state
+@return whether all locks were released */
+static bool lock_release_on_prepare_try(trx_t *trx)
+{
+  /* At this point, trx->lock.trx_locks can still be modified by other
+  threads to convert implicit exclusive locks into explicit ones.
+
+  The function lock_table_create() should never be invoked on behalf
+  of a transaction that is running in another thread. Also there, we
+  will assert that the current transaction be active. */
+  DBUG_ASSERT(trx->state == TRX_STATE_PREPARED);
+
+  bool all_released= true;
+  lock_sys.rd_lock(SRW_LOCK_CALL);
+  trx->mutex_lock();
+
+  /* Note: Normally, trx->mutex is not held while acquiring
+  a lock table latch, but here we are following the opposite order.
+  To avoid deadlocks, we only try to acquire the lock table latches
+  but not keep waiting for them. */
+
+  for (lock_t *prev, *lock= UT_LIST_GET_LAST(trx->lock.trx_locks); lock;
+       lock= prev)
+  {
+    ut_ad(lock->trx == trx);
+    prev= UT_LIST_GET_PREV(trx_locks, lock);
+    if (!lock->is_table())
+    {
+      ut_ad(!lock->index->table->is_temporary());
+      bool supremum_bit = lock_rec_get_nth_bit(lock, PAGE_HEAP_NO_SUPREMUM);
+      bool rec_granted_exclusive_not_gap =
+        lock->is_rec_granted_exclusive_not_gap();
+      if (!supremum_bit && rec_granted_exclusive_not_gap)
+        continue;
+      auto &lock_hash= lock_sys.hash_get(lock->type_mode);
+      auto cell= lock_hash.cell_get(lock->un_member.rec_lock.page_id.fold());
+      auto latch= lock_sys_t::hash_table::latch(cell);
+      if (latch->try_acquire())
+      {
+        if (!rec_granted_exclusive_not_gap)
+          lock_rec_dequeue_from_page(lock, false);
+        else if (supremum_bit)
+          lock_rec_unlock_supremum(*cell, lock);
+        latch->release();
+      }
+      else
+        all_released= false;
+    }
+    else
+    {
+      dict_table_t *table= lock->un_member.tab_lock.table;
+      ut_ad(!table->is_temporary());
+      switch (lock->mode()) {
+      case LOCK_IS:
+      case LOCK_S:
+        if (table->lock_mutex_trylock())
+        {
+          lock_table_dequeue(lock, false);
+          table->lock_mutex_unlock();
+        }
+        else
+          all_released= false;
+        break;
+      case LOCK_IX:
+      case LOCK_X:
+        ut_ad(table->id >= DICT_HDR_FIRST_ID || trx->dict_operation);
+        /* fall through */
+      default:
+        break;
+      }
+    }
+  }
+
+  lock_sys.rd_unlock();
+  trx->mutex_unlock();
+  return all_released;
+}
+
+/** Release non-exclusive locks on XA PREPARE,
+and release possible other transactions waiting because of these locks. */
+void lock_release_on_prepare(trx_t *trx)
+{
+  trx->set_skip_lock_inheritance();
+
+  for (ulint count= 5; count--; )
+    if (lock_release_on_prepare_try(trx))
+      return;
+
+  LockMutexGuard g{SRW_LOCK_CALL};
+  trx->mutex_lock();
+
+  for (lock_t *prev, *lock= UT_LIST_GET_LAST(trx->lock.trx_locks); lock;
+       lock= prev)
+  {
+    ut_ad(lock->trx == trx);
+    prev= UT_LIST_GET_PREV(trx_locks, lock);
+    if (!lock->is_table())
+    {
+      ut_ad(!lock->index->table->is_temporary());
+      if (!lock->is_rec_granted_exclusive_not_gap())
+        lock_rec_dequeue_from_page(lock, false);
+      else if (lock_rec_get_nth_bit(lock, PAGE_HEAP_NO_SUPREMUM))
+      {
+        auto &lock_hash= lock_sys.hash_get(lock->type_mode);
+        auto cell= lock_hash.cell_get(lock->un_member.rec_lock.page_id.fold());
+        lock_rec_unlock_supremum(*cell, lock);
+      }
+      else
+        ut_ad(lock->trx->isolation_level > TRX_ISO_READ_COMMITTED ||
+              /* Insert-intention lock is valid for supremum for isolation
+              level > TRX_ISO_READ_COMMITTED */
+              lock->mode() == LOCK_X ||
+              !lock_rec_get_nth_bit(lock, PAGE_HEAP_NO_SUPREMUM));
+    }
+    else
+    {
+      ut_d(dict_table_t *table= lock->un_member.tab_lock.table);
+      ut_ad(!table->is_temporary());
+      switch (lock->mode()) {
+      case LOCK_IS:
+      case LOCK_S:
+        lock_table_dequeue(lock, false);
+        break;
+      case LOCK_IX:
+      case LOCK_X:
+        ut_ad(table->id >= DICT_HDR_FIRST_ID || trx->dict_operation);
+        /* fall through */
+      default:
+        break;
+      }
+    }
+  }
+
+  trx->mutex_unlock();
+}
+
+/** Release locks on a table whose creation is being rolled back */
+ATTRIBUTE_COLD
+void lock_release_on_rollback(trx_t *trx, dict_table_t *table)
+{
+  trx->mod_tables.erase(table);
+
+  /* This is very rarely executed code, in the rare case that a
+  CREATE TABLE operation is being rolled back. Theoretically,
+  we might try to remove the locks in multiple memory transactions.
*/ + lock_sys.wr_lock(SRW_LOCK_CALL); + trx->mutex_lock(); + + for (lock_t *next, *lock= UT_LIST_GET_FIRST(table->locks); lock; lock= next) + { + next= UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock); + ut_ad(lock->trx == trx); + UT_LIST_REMOVE(trx->lock.trx_locks, lock); + ut_list_remove(table->locks, lock, TableLockGetNode()); + } + + for (lock_t *p, *lock= UT_LIST_GET_LAST(trx->lock.trx_locks); lock; lock= p) + { + p= UT_LIST_GET_PREV(trx_locks, lock); + ut_ad(lock->trx == trx); + if (lock->is_table()) + ut_ad(lock->un_member.tab_lock.table != table); + else if (lock->index->table == table) + lock_rec_dequeue_from_page(lock, false); + } + + lock_sys.wr_unlock(); + trx->mutex_unlock(); +} + +/*********************************************************************//** +Removes table locks of the transaction on a table to be dropped. */ +static +void +lock_trx_table_locks_remove( +/*========================*/ + const lock_t* lock_to_remove) /*!< in: lock to remove */ +{ + trx_t* trx = lock_to_remove->trx; + + ut_ad(lock_to_remove->is_table()); + lock_sys.assert_locked(*lock_to_remove->un_member.tab_lock.table); + ut_ad(trx->mutex_is_owner()); + + for (lock_list::iterator it = trx->lock.table_locks.begin(), + end = trx->lock.table_locks.end(); it != end; ++it) { + const lock_t* lock = *it; + + ut_ad(!lock || trx == lock->trx); + ut_ad(!lock || lock->is_table()); + ut_ad(!lock || lock->un_member.tab_lock.table); + + if (lock == lock_to_remove) { + *it = NULL; + return; + } + } + + /* Lock must exist in the vector. */ + ut_error; +} + +/*===================== VALIDATION AND DEBUGGING ====================*/ + +/** Print info of a table lock. +@param[in,out] file output stream +@param[in] lock table lock */ +static +void +lock_table_print(FILE* file, const lock_t* lock) +{ + lock_sys.assert_locked(); + ut_a(lock->is_table()); + + fputs("TABLE LOCK table ", file); + ut_print_name(file, lock->trx, + lock->un_member.tab_lock.table->name.m_name); + fprintf(file, " trx id " TRX_ID_FMT, lock->trx->id); + + switch (auto mode = lock->mode()) { + case LOCK_S: + fputs(" lock mode S", file); + break; + case LOCK_X: + ut_ad(lock->trx->id != 0); + fputs(" lock mode X", file); + break; + case LOCK_IS: + fputs(" lock mode IS", file); + break; + case LOCK_IX: + ut_ad(lock->trx->id != 0); + fputs(" lock mode IX", file); + break; + case LOCK_AUTO_INC: + fputs(" lock mode AUTO-INC", file); + break; + default: + fprintf(file, " unknown lock mode %u", mode); + } + + if (lock->is_waiting()) { + fputs(" waiting", file); + } + + putc('\n', file); +} + +/** Pretty-print a record lock. 
+@param[in,out]	file	output stream
+@param[in]	lock	record lock
+@param[in,out]	mtr	mini-transaction for accessing the record */
+static void lock_rec_print(FILE* file, const lock_t* lock, mtr_t& mtr)
+{
+	ut_ad(!lock->is_table());
+
+	const page_id_t page_id{lock->un_member.rec_lock.page_id};
+	ut_d(lock_sys.hash_get(lock->type_mode).assert_locked(page_id));
+
+	fprintf(file, "RECORD LOCKS space id %u page no %u n bits " ULINTPF
+		" index %s of table ",
+		page_id.space(), page_id.page_no(),
+		lock_rec_get_n_bits(lock),
+		lock->index->name());
+	ut_print_name(file, lock->trx, lock->index->table->name.m_name);
+	fprintf(file, " trx id " TRX_ID_FMT, lock->trx->id);
+
+	switch (lock->mode()) {
+	case LOCK_S:
+		fputs(" lock mode S", file);
+		break;
+	case LOCK_X:
+		fputs(" lock_mode X", file);
+		break;
+	default:
+		ut_error;
+	}
+
+	if (lock->is_gap()) {
+		fputs(" locks gap before rec", file);
+	}
+
+	if (lock->is_record_not_gap()) {
+		fputs(" locks rec but not gap", file);
+	}
+
+	if (lock->is_insert_intention()) {
+		fputs(" insert intention", file);
+	}
+
+	if (lock->is_waiting()) {
+		fputs(" waiting", file);
+	}
+
+	putc('\n', file);
+
+	mem_heap_t*	heap = NULL;
+	rec_offs	offsets_[REC_OFFS_NORMAL_SIZE];
+	rec_offs*	offsets = offsets_;
+	rec_offs_init(offsets_);
+
+	mtr.start();
+	const buf_block_t* block = buf_page_try_get(page_id, &mtr);
+
+	for (ulint i = 0; i < lock_rec_get_n_bits(lock); ++i) {
+
+		if (!lock_rec_get_nth_bit(lock, i)) {
+			continue;
+		}
+
+		fprintf(file, "Record lock, heap no %lu", (ulong) i);
+
+		if (block) {
+			ut_ad(page_is_leaf(block->page.frame));
+			const rec_t*	rec;
+
+			rec = page_find_rec_with_heap_no(
+				buf_block_get_frame(block), i);
+			ut_ad(!page_rec_is_metadata(rec));
+
+			offsets = rec_get_offsets(
+				rec, lock->index, offsets,
+				lock->index->n_core_fields,
+				ULINT_UNDEFINED, &heap);
+
+			putc(' ', file);
+			rec_print_new(file, rec, offsets);
+		}
+
+		putc('\n', file);
+	}
+
+	mtr.commit();
+
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+}
+
+#ifdef UNIV_DEBUG
+/* Print the number of lock structs from lock_print_info_summary() only
+in non-production builds for performance reasons, see
+http://bugs.mysql.com/36942 */
+#define PRINT_NUM_OF_LOCK_STRUCTS
+#endif /* UNIV_DEBUG */
+
+#ifdef PRINT_NUM_OF_LOCK_STRUCTS
+/*********************************************************************//**
+Calculates the number of record lock structs in the record lock hash table.
+@return number of record locks */
+TRANSACTIONAL_TARGET
+static ulint lock_get_n_rec_locks()
+{
+	ulint	n_locks	= 0;
+	ulint	i;
+
+	lock_sys.assert_locked();
+
+	for (i = 0; i < lock_sys.rec_hash.n_cells; i++) {
+		const lock_t*	lock;
+
+		for (lock = static_cast<const lock_t*>(
+			     HASH_GET_FIRST(&lock_sys.rec_hash, i));
+		     lock != 0;
+		     lock = static_cast<const lock_t*>(
+			     HASH_GET_NEXT(hash, lock))) {
+
+			n_locks++;
+		}
+	}
+
+	return(n_locks);
+}
+#endif /* PRINT_NUM_OF_LOCK_STRUCTS */
+
+/*********************************************************************//**
+Prints info of locks for all transactions.
+@return FALSE if not able to acquire lock_sys.latch (and display info) */
+ibool
+lock_print_info_summary(
+/*====================*/
+	FILE*	file,	/*!< in: file where to print */
+	ibool	nowait)	/*!< in: whether to wait for lock_sys.latch */
+{
+	/* Here, lock elision does not make sense, because
+	for the output we are going to invoke system calls,
+	which would interrupt a memory transaction.
*/ + if (!nowait) { + lock_sys.wr_lock(SRW_LOCK_CALL); + } else if (!lock_sys.wr_lock_try()) { + fputs("FAIL TO OBTAIN LOCK MUTEX," + " SKIP LOCK INFO PRINTING\n", file); + return(FALSE); + } + + if (lock_sys.deadlocks) { + fputs("------------------------\n" + "LATEST DETECTED DEADLOCK\n" + "------------------------\n", file); + + if (!srv_read_only_mode) { + ut_copy_file(file, lock_latest_err_file); + } + } + + fputs("------------\n" + "TRANSACTIONS\n" + "------------\n", file); + + fprintf(file, "Trx id counter " TRX_ID_FMT "\n", + trx_sys.get_max_trx_id()); + + fprintf(file, + "Purge done for trx's n:o < " TRX_ID_FMT + " undo n:o < " TRX_ID_FMT " state: %s\n" + "History list length %zu\n", + purge_sys.tail.trx_no, + purge_sys.tail.undo_no, + purge_sys.enabled() + ? (purge_sys.running() ? "running" + : purge_sys.paused() ? "stopped" : "running but idle") + : "disabled", + trx_sys.history_size_approx()); + +#ifdef PRINT_NUM_OF_LOCK_STRUCTS + fprintf(file, + "Total number of lock structs in row lock hash table %lu\n", + (ulong) lock_get_n_rec_locks()); +#endif /* PRINT_NUM_OF_LOCK_STRUCTS */ + return(TRUE); +} + +/** Prints transaction lock wait and MVCC state. +@param[in,out] file file where to print +@param[in] trx transaction +@param[in] now current my_hrtime_coarse() */ +void lock_trx_print_wait_and_mvcc_state(FILE *file, const trx_t *trx, + my_hrtime_t now) +{ + fprintf(file, "---"); + + trx_print_latched(file, trx, 600); + trx->read_view.print_limits(file); + + if (const lock_t* wait_lock = trx->lock.wait_lock) { + const my_hrtime_t suspend_time= trx->lock.suspend_time; + fprintf(file, + "------- TRX HAS BEEN WAITING %llu ns" + " FOR THIS LOCK TO BE GRANTED:\n", + now.val - suspend_time.val); + + if (!wait_lock->is_table()) { + mtr_t mtr; + lock_rec_print(file, wait_lock, mtr); + } else { + lock_table_print(file, wait_lock); + } + + fprintf(file, "------------------\n"); + } +} + +/*********************************************************************//** +Prints info of locks for a transaction. */ +static +void +lock_trx_print_locks( +/*=================*/ + FILE* file, /*!< in/out: File to write */ + const trx_t* trx) /*!< in: current transaction */ +{ + mtr_t mtr; + uint32_t i= 0; + /* Iterate over the transaction's locks. */ + lock_sys.assert_locked(); + for (lock_t *lock = UT_LIST_GET_FIRST(trx->lock.trx_locks); + lock != NULL; + lock = UT_LIST_GET_NEXT(trx_locks, lock)) { + if (!lock->is_table()) { + lock_rec_print(file, lock, mtr); + } else { + lock_table_print(file, lock); + } + + if (++i == 10) { + + fprintf(file, + "10 LOCKS PRINTED FOR THIS TRX:" + " SUPPRESSING FURTHER PRINTS\n"); + + break; + } + } +} + +/** Functor to display all transactions */ +struct lock_print_info +{ + lock_print_info(FILE* file, my_hrtime_t now) : + file(file), now(now), + purge_trx(purge_sys.query ? purge_sys.query->trx : nullptr) + {} + + void operator()(const trx_t &trx) const + { + if (UNIV_UNLIKELY(&trx == purge_trx)) + return; + lock_trx_print_wait_and_mvcc_state(file, &trx, now); + + if (trx.will_lock && srv_print_innodb_lock_monitor) + lock_trx_print_locks(file, &trx); + } + + FILE* const file; + const my_hrtime_t now; + const trx_t* const purge_trx; +}; + +/*********************************************************************//** +Prints info of locks for each transaction. This function will release +lock_sys.latch, which the caller must be holding in exclusive mode. 
*/
+void
+lock_print_info_all_transactions(
+/*=============================*/
+	FILE*	file)	/*!< in/out: file where to print */
+{
+	fprintf(file, "LIST OF TRANSACTIONS FOR EACH SESSION:\n");
+
+	trx_sys.trx_list.for_each(lock_print_info(file, my_hrtime_coarse()));
+	lock_sys.wr_unlock();
+
+	ut_d(lock_validate());
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Find the lock in the trx_t::trx_lock_t::table_locks vector.
+@return true if found */
+static
+bool
+lock_trx_table_locks_find(
+/*======================*/
+	trx_t*		trx,		/*!< in: trx to validate */
+	const lock_t*	find_lock)	/*!< in: lock to find */
+{
+	bool	found = false;
+
+	ut_ad(trx->mutex_is_owner());
+
+	for (lock_list::const_iterator it = trx->lock.table_locks.begin(),
+	     end = trx->lock.table_locks.end(); it != end; ++it) {
+
+		const lock_t*	lock = *it;
+
+		if (lock == NULL) {
+
+			continue;
+
+		} else if (lock == find_lock) {
+
+			/* Can't be duplicates. */
+			ut_a(!found);
+			found = true;
+		}
+
+		ut_a(trx == lock->trx);
+		ut_a(lock->is_table());
+		ut_a(lock->un_member.tab_lock.table != NULL);
+	}
+
+	return(found);
+}
+
+/*********************************************************************//**
+Validates the lock queue on a table.
+@return TRUE if ok */
+static
+ibool
+lock_table_queue_validate(
+/*======================*/
+	const dict_table_t*	table)	/*!< in: table */
+{
+	const lock_t*	lock;
+
+	lock_sys.assert_locked(*table);
+
+	for (lock = UT_LIST_GET_FIRST(table->locks);
+	     lock != NULL;
+	     lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock)) {
+
+		/* lock->trx->state cannot change from or to NOT_STARTED
+		while we are holding the lock_sys.latch. It may change
+		from ACTIVE or PREPARED to PREPARED or COMMITTED. */
+		lock->trx->mutex_lock();
+		check_trx_state(lock->trx);
+
+		if (lock->trx->state == TRX_STATE_COMMITTED_IN_MEMORY) {
+		} else if (!lock->is_waiting()) {
+			ut_a(!lock_table_other_has_incompatible(
+				     lock->trx, 0, table,
+				     lock->mode()));
+		} else {
+			ut_a(lock_table_has_to_wait_in_queue(lock));
+		}
+
+		ut_a(lock_trx_table_locks_find(lock->trx, lock));
+		lock->trx->mutex_unlock();
+	}
+
+	return(TRUE);
+}
+
+/*********************************************************************//**
+Validates the lock queue on a single record.
+@return TRUE if ok */
+static
+bool
+lock_rec_queue_validate(
+/*====================*/
+	bool			locked_lock_trx_sys,
+					/*!< in: if the caller holds
+					both the lock_sys.latch and
+					trx_sys_t->lock.
*/ + const page_id_t id, /*!< in: page identifier */ + const rec_t* rec, /*!< in: record to look at */ + const dict_index_t* index, /*!< in: index, or NULL if not known */ + const rec_offs* offsets)/*!< in: rec_get_offsets(rec, index) */ +{ + const lock_t* lock; + ulint heap_no; + + ut_a(rec); + ut_ad(rec_offs_validate(rec, index, offsets)); + ut_ad(!page_rec_is_comp(rec) == !rec_offs_comp(offsets)); + ut_ad(page_rec_is_leaf(rec)); + ut_ad(!index || dict_index_is_clust(index) + || !dict_index_is_online_ddl(index)); + + heap_no = page_rec_get_heap_no(rec); + + if (!locked_lock_trx_sys) { + lock_sys.wr_lock(SRW_LOCK_CALL); + } + + hash_cell_t &cell= *lock_sys.rec_hash.cell_get(id.fold()); + lock_sys.assert_locked(cell); + + if (!page_rec_is_user_rec(rec)) { + + for (lock = lock_sys_t::get_first(cell, id, heap_no); + lock != NULL; + lock = lock_rec_get_next_const(heap_no, lock)) { + + ut_ad(!index || lock->index == index); + + lock->trx->mutex_lock(); + ut_ad(!lock->trx->read_only + || !lock->trx->is_autocommit_non_locking()); + ut_ad(trx_state_eq(lock->trx, + TRX_STATE_COMMITTED_IN_MEMORY) + || !lock->is_waiting() + || lock_rec_has_to_wait_in_queue(cell, lock)); + lock->trx->mutex_unlock(); + } + +func_exit: + if (!locked_lock_trx_sys) { + lock_sys.wr_unlock(); + } + + return true; + } + + ut_ad(page_rec_is_leaf(rec)); + + const trx_id_t impl_trx_id = index && index->is_primary() + ? lock_clust_rec_some_has_impl(rec, index, offsets) + : 0; + + if (trx_t *impl_trx = impl_trx_id + ? trx_sys.find(current_trx(), impl_trx_id, false) + : 0) { + /* impl_trx could have been committed before we + acquire its mutex, but not thereafter. */ + + impl_trx->mutex_lock(); + ut_ad(impl_trx->state != TRX_STATE_NOT_STARTED); + if (impl_trx->state == TRX_STATE_COMMITTED_IN_MEMORY) { + } else if (const lock_t* other_lock + = lock_rec_other_has_expl_req( + LOCK_S, cell, id, true, heap_no, + impl_trx)) { + /* The impl_trx is holding an implicit lock on the + given record 'rec'. So there cannot be another + explicit granted lock. Also, there can be another + explicit waiting lock only if the impl_trx has an + explicit granted lock. */ + +#ifdef WITH_WSREP + /** Galera record locking rules: + * If there is no other record lock to the same record, we may grant + the lock request. + * If there is other record lock but this requested record lock is + compatible, we may grant the lock request. + * If there is other record lock and it is not compatible with + requested lock, all normal transactions must wait. + * BF (brute force) additional exceptions : + ** If BF already holds record lock for requested record, we may + grant new record lock even if there is conflicting record lock(s) + waiting on a queue. + ** If conflicting transaction holds requested record lock, + we will cancel this record lock and select conflicting transaction + for BF abort or kill victim. + ** If conflicting transaction is waiting for requested record lock + we will cancel this wait and select conflicting transaction + for BF abort or kill victim. 
+ ** There should not be two BF transactions waiting for same record lock + */ + if (other_lock->trx->is_wsrep() && !other_lock->is_waiting()) { + wsrep_report_bf_lock_wait(impl_trx->mysql_thd, impl_trx->id); + wsrep_report_bf_lock_wait(other_lock->trx->mysql_thd, other_lock->trx->id); + + if (!lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, + cell, id, heap_no, + impl_trx)) { + ib::info() << "WSREP impl BF lock conflict"; + } + } else +#endif /* WITH_WSREP */ + { + ut_ad(other_lock->is_waiting()); + ut_ad(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, + cell, id, heap_no, + impl_trx)); + } + } + + impl_trx->mutex_unlock(); + } + + for (lock = lock_sys_t::get_first(cell, id, heap_no); + lock != NULL; + lock = lock_rec_get_next_const(heap_no, lock)) { + ut_ad(!lock->trx->read_only + || !lock->trx->is_autocommit_non_locking()); + ut_ad(!page_rec_is_metadata(rec)); + + if (index) { + ut_a(lock->index == index); + } + + if (lock->is_waiting()) { + ut_a(lock->is_gap() + || lock_rec_has_to_wait_in_queue(cell, lock)); + } else if (!lock->is_gap()) { + const lock_mode mode = lock->mode() == LOCK_S + ? LOCK_X : LOCK_S; + + const lock_t* other_lock + = lock_rec_other_has_expl_req( + mode, cell, id, false, heap_no, + lock->trx); +#ifdef WITH_WSREP + if (UNIV_UNLIKELY(other_lock && lock->trx->is_wsrep())) { + /* Only BF transaction may be granted + lock before other conflicting lock + request. */ + if (!wsrep_thd_is_BF(lock->trx->mysql_thd, FALSE) + && !wsrep_thd_is_BF(other_lock->trx->mysql_thd, FALSE)) { + /* If no BF, this case is a bug. */ + wsrep_report_bf_lock_wait(lock->trx->mysql_thd, lock->trx->id); + wsrep_report_bf_lock_wait(other_lock->trx->mysql_thd, other_lock->trx->id); + ut_error; + } + } else +#endif /* WITH_WSREP */ + ut_ad(!other_lock); + } + } + + goto func_exit; +} + +/** Validate the record lock queues on a page. +@param block buffer pool block +@param latched whether the tablespace latch may be held +@return true if ok */ +static bool lock_rec_validate_page(const buf_block_t *block, bool latched) +{ + const lock_t* lock; + const rec_t* rec; + ulint nth_lock = 0; + ulint nth_bit = 0; + ulint i; + mem_heap_t* heap = NULL; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + rec_offs* offsets = offsets_; + rec_offs_init(offsets_); + + const page_id_t id{block->page.id()}; + + LockGuard g{lock_sys.rec_hash, id}; +loop: + lock = lock_sys_t::get_first(g.cell(), id); + + if (!lock) { + goto function_exit; + } + + DBUG_ASSERT(!block->page.is_freed()); + + for (i = 0; i < nth_lock; i++) { + + lock = lock_rec_get_next_on_page_const(lock); + + if (!lock) { + goto function_exit; + } + } + + ut_ad(!lock->trx->read_only + || !lock->trx->is_autocommit_non_locking()); + + /* Only validate the record queues when this thread is not + holding a tablespace latch. */ + if (!latched) + for (i = nth_bit; i < lock_rec_get_n_bits(lock); i++) { + bool locked = lock_rec_get_nth_bit(lock, i); + if (locked || i == PAGE_HEAP_NO_SUPREMUM) { + + rec = page_find_rec_with_heap_no(block->page.frame, i); + ut_a(rec); + ut_ad(!locked || page_rec_is_leaf(rec)); + + /* If this thread is holding the file space + latch (fil_space_t::latch), the following + check WILL break the latching order and may + cause a deadlock of threads. 
*/
+
+			if (locked) {
+				offsets = rec_get_offsets(rec, lock->index,
+					offsets, lock->index->n_core_fields,
+					ULINT_UNDEFINED, &heap);
+				lock_rec_queue_validate(true, id, rec,
+					lock->index, offsets);
+			}
+
+			nth_bit = i + 1;
+
+			goto loop;
+		}
+	}
+
+	nth_bit = 0;
+	nth_lock++;
+
+	goto loop;
+
+function_exit:
+	if (heap != NULL) {
+		mem_heap_free(heap);
+	}
+	return(TRUE);
+}
+
+/*********************************************************************//**
+Validate record locks up to a limit.
+@return lock at limit or NULL if no more locks in the hash bucket */
+static MY_ATTRIBUTE((warn_unused_result))
+const lock_t*
+lock_rec_validate(
+/*==============*/
+	ulint		start,	/*!< in: lock_sys.rec_hash
+				bucket */
+	page_id_t*	limit)	/*!< in/out: upper limit of
+				(space, page_no) */
+{
+	lock_sys.assert_locked();
+
+	for (const lock_t* lock = static_cast<const lock_t*>(
+		     HASH_GET_FIRST(&lock_sys.rec_hash, start));
+	     lock != NULL;
+	     lock = static_cast<const lock_t*>(HASH_GET_NEXT(hash, lock))) {
+
+		ut_ad(!lock->trx->read_only
+		      || !lock->trx->is_autocommit_non_locking());
+		ut_ad(!lock->is_table());
+
+		page_id_t current(lock->un_member.rec_lock.page_id);
+
+		if (current > *limit) {
+			*limit = current + 1;
+			return(lock);
+		}
+	}
+
+	return(0);
+}
+
+/*********************************************************************//**
+Validate a record lock's block */
+static void lock_rec_block_validate(const page_id_t page_id)
+{
+	/* The lock and the block that it is referring to may be freed at
+	this point. */
+
+	buf_block_t*	block;
+	mtr_t		mtr;
+
+	/* Transactional locks should never refer to dropped
+	tablespaces, because all DDL operations that would drop or
+	discard or rebuild a tablespace do hold an exclusive table
+	lock, which would conflict with any locks referring to the
+	tablespace from other transactions. */
+	if (fil_space_t* space = fil_space_t::get(page_id.space())) {
+		dberr_t err = DB_SUCCESS;
+		mtr_start(&mtr);
+
+		block = buf_page_get_gen(
+			page_id,
+			space->zip_size(),
+			RW_S_LATCH, NULL,
+			BUF_GET_POSSIBLY_FREED,
+			&mtr, &err);
+
+		ut_ad(!block
+		      || lock_rec_validate_page(block, space->is_latched()));
+
+		mtr_commit(&mtr);
+
+		space->release();
+	}
+}
+
+static my_bool lock_validate_table_locks(rw_trx_hash_element_t *element, void*)
+{
+  lock_sys.assert_locked();
+  element->mutex.wr_lock();
+  if (element->trx)
+  {
+    check_trx_state(element->trx);
+    for (const lock_t *lock= UT_LIST_GET_FIRST(element->trx->lock.trx_locks);
+         lock != NULL;
+         lock= UT_LIST_GET_NEXT(trx_locks, lock))
+      if (lock->is_table())
+        lock_table_queue_validate(lock->un_member.tab_lock.table);
+  }
+  element->mutex.wr_unlock();
+  return 0;
+}
+
+
+/** Validate the transactional locks. */
+static void lock_validate()
+{
+  std::set<page_id_t> pages;
+  {
+    LockMutexGuard g{SRW_LOCK_CALL};
+    /* Validate table locks */
+    trx_sys.rw_trx_hash.iterate(lock_validate_table_locks);
+
+    for (ulint i= 0; i < lock_sys.rec_hash.n_cells; i++)
+    {
+      page_id_t limit{0, 0};
+      while (const lock_t *lock= lock_rec_validate(i, &limit))
+      {
+        if (lock_rec_find_set_bit(lock) == ULINT_UNDEFINED)
+          /* The lock bitmap is empty; ignore it. */
+          continue;
+        pages.insert(lock->un_member.rec_lock.page_id);
+      }
+    }
+  }
+
+  for (page_id_t page_id : pages)
+    lock_rec_block_validate(page_id);
+}
+#endif /* UNIV_DEBUG */
+/*============ RECORD LOCK CHECKS FOR ROW OPERATIONS ====================*/
+
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate insert of
+a record.
If they do, first tests if the query thread should anyway +be suspended for some reason; if not, then puts the transaction and +the query thread to the lock wait state and inserts a waiting request +for a gap x-lock to the lock queue. +@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */ +TRANSACTIONAL_TARGET +dberr_t +lock_rec_insert_check_and_lock( +/*===========================*/ + const rec_t* rec, /*!< in: record after which to insert */ + buf_block_t* block, /*!< in/out: buffer block of rec */ + dict_index_t* index, /*!< in: index */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + bool* inherit)/*!< out: set to true if the new + inserted record maybe should inherit + LOCK_GAP type locks from the successor + record */ +{ + ut_ad(block->page.frame == page_align(rec)); + ut_ad(mtr->is_named_space(index->table->space)); + ut_ad(page_is_leaf(block->page.frame)); + ut_ad(!index->table->is_temporary()); + + const rec_t *next_rec= page_rec_get_next_const(rec); + if (UNIV_UNLIKELY(!next_rec || rec_is_metadata(next_rec, *index))) + return DB_CORRUPTION; + + dberr_t err= DB_SUCCESS; + bool inherit_in= *inherit; + trx_t *trx= thr_get_trx(thr); + ulint heap_no= page_rec_get_heap_no(next_rec); + const page_id_t id{block->page.id()}; + + { + LockGuard g{lock_sys.rec_hash, id}; + /* Because this code is invoked for a running transaction by + the thread that is serving the transaction, it is not necessary + to hold trx->mutex here. */ + + /* When inserting a record into an index, the table must be at + least IX-locked. When we are building an index, we would pass + BTR_NO_LOCKING_FLAG and skip the locking altogether. */ + ut_ad(lock_table_has(trx, index->table, LOCK_IX)); + + *inherit= lock_sys_t::get_first(g.cell(), id, heap_no); + + if (*inherit) + { + /* Spatial index does not use GAP lock protection. It uses + "predicate lock" to protect the "range" */ + if (index->is_spatial()) + return DB_SUCCESS; + + /* If another transaction has an explicit lock request which locks + the gap, waiting or granted, on the successor, the insert has to wait. + + An exception is the case where the lock by the another transaction + is a gap type lock which it placed to wait for its turn to insert. We + do not consider that kind of a lock conflicting with our insert. This + eliminates an unnecessary deadlock which resulted when 2 transactions + had to wait for their insert. Both had waiting gap type lock requests + on the successor, which produced an unnecessary deadlock. */ + const unsigned type_mode= LOCK_X | LOCK_GAP | LOCK_INSERT_INTENTION; + + if (lock_t *c_lock= lock_rec_other_has_conflicting(type_mode, + g.cell(), id, + heap_no, trx)) + { + trx->mutex_lock(); + err= lock_rec_enqueue_waiting(c_lock, type_mode, id, block->page.frame, + heap_no, index, thr, nullptr); + trx->mutex_unlock(); + } + } + } + + switch (err) { + case DB_SUCCESS_LOCKED_REC: + err = DB_SUCCESS; + /* fall through */ + case DB_SUCCESS: + if (!inherit_in || index->is_clust()) + break; + /* Update the page max trx id field */ + page_update_max_trx_id(block, buf_block_get_page_zip(block), trx->id, mtr); + default: + /* We only care about the two return values. 
*/ + break; + } + +#ifdef UNIV_DEBUG + { + mem_heap_t *heap= nullptr; + rec_offs offsets_[REC_OFFS_NORMAL_SIZE]; + const rec_offs *offsets; + rec_offs_init(offsets_); + + offsets= rec_get_offsets(next_rec, index, offsets_, index->n_core_fields, + ULINT_UNDEFINED, &heap); + + ut_ad(lock_rec_queue_validate(false, id, next_rec, index, offsets)); + + if (UNIV_LIKELY_NULL(heap)) + mem_heap_free(heap); + } +#endif /* UNIV_DEBUG */ + + return err; +} + +/*********************************************************************//** +Creates an explicit record lock for a running transaction that currently only +has an implicit lock on the record. The transaction instance must have a +reference count > 0 so that it can't be committed and freed before this +function has completed. */ +static +bool +lock_rec_convert_impl_to_expl_for_trx( +/*==================================*/ + trx_t* trx, /*!< in/out: active transaction */ + const page_id_t id, /*!< in: page identifier */ + const rec_t* rec, /*!< in: user record on page */ + dict_index_t* index) /*!< in: index of record */ +{ + if (!trx) + return false; + + ut_ad(trx->is_referenced()); + ut_ad(page_rec_is_leaf(rec)); + ut_ad(!rec_is_metadata(rec, *index)); + + DEBUG_SYNC_C("before_lock_rec_convert_impl_to_expl_for_trx"); + ulint heap_no= page_rec_get_heap_no(rec); + + { + LockGuard g{lock_sys.rec_hash, id}; + trx->mutex_lock(); + ut_ad(!trx_state_eq(trx, TRX_STATE_NOT_STARTED)); + + if (!trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY) && + !lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, g.cell(), id, heap_no, + trx)) + lock_rec_add_to_queue(LOCK_X | LOCK_REC_NOT_GAP, g.cell(), id, + page_align(rec), heap_no, index, trx, true); + } + + trx->mutex_unlock(); + trx->release_reference(); + + DEBUG_SYNC_C("after_lock_rec_convert_impl_to_expl_for_trx"); + return false; +} + + +#ifdef UNIV_DEBUG +struct lock_rec_other_trx_holds_expl_arg +{ + const ulint heap_no; + const hash_cell_t &cell; + const page_id_t id; + const trx_t &impl_trx; +}; + + +static my_bool lock_rec_other_trx_holds_expl_callback( + rw_trx_hash_element_t *element, + lock_rec_other_trx_holds_expl_arg *arg) +{ + element->mutex.wr_lock(); + if (element->trx) + { + element->trx->mutex_lock(); + ut_ad(element->trx->state != TRX_STATE_NOT_STARTED); + lock_t *expl_lock= element->trx->state == TRX_STATE_COMMITTED_IN_MEMORY + ? nullptr + : lock_rec_has_expl(LOCK_S | LOCK_REC_NOT_GAP, + arg->cell, arg->id, arg->heap_no, element->trx); + /* + An explicit lock is held by trx other than the trx holding the implicit + lock. + */ + ut_ad(!expl_lock || expl_lock->trx == &arg->impl_trx); + element->trx->mutex_unlock(); + } + element->mutex.wr_unlock(); + return 0; +} + + +/** + Checks if some transaction, other than given trx_id, has an explicit + lock on the given rec. + + FIXME: if the current transaction holds implicit lock from INSERT, a + subsequent locking read should not convert it to explicit. See also + MDEV-11215. + + @param caller_trx trx of current thread + @param[in] trx trx holding implicit lock on rec + @param[in] rec user record + @param[in] id page identifier +*/ +static void lock_rec_other_trx_holds_expl(trx_t *caller_trx, trx_t *trx, + const rec_t *rec, + const page_id_t id) +{ + if (trx) + { + ut_ad(!page_rec_is_metadata(rec)); + LockGuard g{lock_sys.rec_hash, id}; + ut_ad(trx->is_referenced()); + const trx_state_t state{trx->state}; + ut_ad(state != TRX_STATE_NOT_STARTED); + if (state == TRX_STATE_COMMITTED_IN_MEMORY) + /* The transaction was committed before we acquired LockGuard. 
*/
+ return;
+ lock_rec_other_trx_holds_expl_arg arg=
+ { page_rec_get_heap_no(rec), g.cell(), id, *trx };
+ trx_sys.rw_trx_hash.iterate(caller_trx,
+ lock_rec_other_trx_holds_expl_callback, &arg);
+ }
+}
+#endif /* UNIV_DEBUG */
+
+/** If an implicit x-lock exists on a record, convert it to an explicit one.
+
+Often, this is called by a transaction that is about to enter a lock wait
+due to a lock conflict. Two explicit locks would be created: first the
+exclusive lock on behalf of the lock-holder transaction in this function,
+and then a wait request on behalf of caller_trx, in the calling function.
+
+This may also be called by the same transaction that is already holding
+an implicit exclusive lock on the record. In this case, no explicit lock
+should be created.
+
+@tparam is_primary whether the index is the primary key
+@param[in,out] caller_trx current transaction
+@param[in] id index tree leaf page identifier
+@param[in] rec record on the leaf page
+@param[in] index the index of the record
+@param[in] offsets rec_get_offsets(rec,index)
+@return whether caller_trx already holds an exclusive lock on rec */
+template<bool is_primary>
+static
+bool
+lock_rec_convert_impl_to_expl(
+ trx_t* caller_trx,
+ page_id_t id,
+ const rec_t* rec,
+ dict_index_t* index,
+ const rec_offs* offsets)
+{
+ trx_t* trx;
+
+ lock_sys.assert_unlocked();
+ ut_ad(page_rec_is_user_rec(rec));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(!page_rec_is_comp(rec) == !rec_offs_comp(offsets));
+ ut_ad(page_rec_is_leaf(rec));
+ ut_ad(!rec_is_metadata(rec, *index));
+ ut_ad(index->is_primary() == is_primary);
+
+ if (is_primary) {
+ trx_id_t trx_id;
+
+ trx_id = lock_clust_rec_some_has_impl(rec, index, offsets);
+
+ if (trx_id == 0) {
+ return false;
+ }
+ if (UNIV_UNLIKELY(trx_id == caller_trx->id)) {
+ return true;
+ }
+
+ trx = trx_sys.find(caller_trx, trx_id);
+ } else {
+ ut_ad(!dict_index_is_online_ddl(index));
+
+ trx = lock_sec_rec_some_has_impl(caller_trx, rec, index,
+ offsets);
+ if (trx == caller_trx) {
+ trx->release_reference();
+ return true;
+ }
+
+ ut_d(lock_rec_other_trx_holds_expl(caller_trx, trx, rec, id));
+ }
+
+ return lock_rec_convert_impl_to_expl_for_trx(trx, id, rec, index);
+}
+
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate modify (update,
+delete mark, or delete unmark) of a clustered index record. If they do,
+first tests if the query thread should anyway be suspended for some
+reason; if not, then puts the transaction and the query thread to the
+lock wait state and inserts a waiting request for a record x-lock to the
+lock queue.
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_clust_rec_modify_check_and_lock(
+/*=================================*/
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: record which should be
+ modified */
+ dict_index_t* index, /*!< in: clustered index */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dberr_t err;
+ ulint heap_no;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(page_rec_is_leaf(rec));
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(block->page.frame == page_align(rec));
+
+ ut_ad(!rec_is_metadata(rec, *index));
+ ut_ad(!index->table->is_temporary());
+
+ heap_no = rec_offs_comp(offsets)
+ ?
rec_get_heap_no_new(rec)
+ : rec_get_heap_no_old(rec);
+
+ /* If a transaction has no explicit x-lock set on the record, set one
+ for it */
+
+ if (lock_rec_convert_impl_to_expl<true>(thr_get_trx(thr),
+ block->page.id(),
+ rec, index, offsets)) {
+ /* We already hold an implicit exclusive lock. */
+ return DB_SUCCESS;
+ }
+
+ err = lock_rec_lock(true, LOCK_X | LOCK_REC_NOT_GAP,
+ block, heap_no, index, thr);
+
+ ut_ad(lock_rec_queue_validate(false, block->page.id(),
+ rec, index, offsets));
+
+ if (err == DB_SUCCESS_LOCKED_REC) {
+ err = DB_SUCCESS;
+ }
+
+ return(err);
+}
+
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate modify (delete
+mark or delete unmark) of a secondary index record.
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_sec_rec_modify_check_and_lock(
+/*===============================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ buf_block_t* block, /*!< in/out: buffer block of rec */
+ const rec_t* rec, /*!< in: record which should be
+ modified; NOTE: as this is a secondary
+ index, we always have to modify the
+ clustered index record first: see the
+ comment below */
+ dict_index_t* index, /*!< in: secondary index */
+ que_thr_t* thr, /*!< in: query thread
+ (can be NULL if BTR_NO_LOCKING_FLAG) */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ dberr_t err;
+ ulint heap_no;
+
+ ut_ad(!dict_index_is_clust(index));
+ ut_ad(!dict_index_is_online_ddl(index) || (flags & BTR_CREATE_FLAG));
+ ut_ad(block->page.frame == page_align(rec));
+ ut_ad(mtr->is_named_space(index->table->space));
+ ut_ad(page_rec_is_leaf(rec));
+ ut_ad(!rec_is_metadata(rec, *index));
+
+ if (flags & BTR_NO_LOCKING_FLAG) {
+
+ return(DB_SUCCESS);
+ }
+ ut_ad(!index->table->is_temporary());
+
+ heap_no = page_rec_get_heap_no(rec);
+
+#ifdef WITH_WSREP
+ trx_t *trx= thr_get_trx(thr);
+ /* If a transaction scanning a unique secondary key is a wsrep
+ high priority (brute force) thread, the scan may involve
+ GAP-locking in the index. As this locking also happens when
+ applying replication events in high priority applier threads,
+ there is a chance of lock conflicts between two wsrep
+ high priority threads. To avoid this GAP-locking, we mark here
+ that this transaction is using a unique key scan. */
+ if (trx->is_wsrep() && wsrep_thd_is_BF(trx->mysql_thd, false))
+ trx->wsrep = 3;
+#endif /* WITH_WSREP */
+
+ /* Another transaction cannot have an implicit lock on the record,
+ because when we come here, we already have modified the clustered
+ index record, and this would not have been possible if another active
+ transaction had modified this secondary index record.
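+ Hence there is no implicit lock to convert here; we can request the
+ explicit lock directly.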
*/
+
+ err = lock_rec_lock(true, LOCK_X | LOCK_REC_NOT_GAP,
+ block, heap_no, index, thr);
+
+#ifdef WITH_WSREP
+ if (trx->wsrep == 3) trx->wsrep = 1;
+#endif /* WITH_WSREP */
+
+#ifdef UNIV_DEBUG
+ {
+ mem_heap_t* heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ const rec_offs* offsets;
+ rec_offs_init(offsets_);
+
+ offsets = rec_get_offsets(rec, index, offsets_,
+ index->n_core_fields,
+ ULINT_UNDEFINED, &heap);
+
+ ut_ad(lock_rec_queue_validate(
+ false, block->page.id(), rec, index, offsets));
+
+ if (heap != NULL) {
+ mem_heap_free(heap);
+ }
+ }
+#endif /* UNIV_DEBUG */
+
+ if (err == DB_SUCCESS || err == DB_SUCCESS_LOCKED_REC) {
+ /* Update the page max trx id field */
+ /* It might not be necessary to do this if
+ err == DB_SUCCESS (no new lock created),
+ but it should not cost too much performance. */
+ page_update_max_trx_id(block,
+ buf_block_get_page_zip(block),
+ thr_get_trx(thr)->id, mtr);
+ err = DB_SUCCESS;
+ }
+
+ return(err);
+}
+
+/*********************************************************************//**
+Like lock_clust_rec_read_check_and_lock(), but reads a
+secondary index record.
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_sec_rec_read_check_and_lock(
+/*=============================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: user record or page
+ supremum record which should
+ be read or passed over by a
+ read cursor */
+ dict_index_t* index, /*!< in: secondary index */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ lock_mode mode, /*!< in: mode of the lock which
+ the read cursor should set on
+ records: LOCK_S or LOCK_X; the
+ latter is possible in
+ SELECT FOR UPDATE */
+ unsigned gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dberr_t err;
+
+ ut_ad(!dict_index_is_clust(index));
+ ut_ad(!dict_index_is_online_ddl(index));
+ ut_ad(block->page.frame == page_align(rec));
+ ut_ad(page_rec_is_user_rec(rec) || page_rec_is_supremum(rec));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(page_rec_is_leaf(rec));
+ ut_ad(mode == LOCK_X || mode == LOCK_S);
+
+ if ((flags & BTR_NO_LOCKING_FLAG)
+ || srv_read_only_mode
+ || index->table->is_temporary()) {
+
+ return(DB_SUCCESS);
+ }
+
+ ut_ad(!rec_is_metadata(rec, *index));
+
+ trx_t *trx = thr_get_trx(thr);
+
+ if (lock_table_has(trx, index->table, mode)) {
+ return DB_SUCCESS;
+ }
+
+ if (!page_rec_is_supremum(rec)
+ && lock_rec_convert_impl_to_expl<false>(
+ trx, block->page.id(), rec, index, offsets)
+ && gap_mode == LOCK_REC_NOT_GAP) {
+ /* We already hold an implicit exclusive lock. */
+ return DB_SUCCESS;
+ }
+
+#ifdef WITH_WSREP
+ /* If a transaction scanning a unique secondary key is a wsrep
+ high priority (brute force) thread, the scan may involve
+ GAP-locking in the index. As this locking also happens when
+ applying replication events in high priority applier threads,
+ there is a chance of lock conflicts between two wsrep
+ high priority threads. To avoid this GAP-locking, we mark here
+ that this transaction is using a unique key scan.
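+ The flag is restored from 3 back to 1 right after the lock request below.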
*/
+ if (trx->is_wsrep() && wsrep_thd_is_BF(trx->mysql_thd, false))
+ trx->wsrep = 3;
+#endif /* WITH_WSREP */
+
+ err = lock_rec_lock(false, gap_mode | mode,
+ block, page_rec_get_heap_no(rec), index, thr);
+
+#ifdef WITH_WSREP
+ if (trx->wsrep == 3) trx->wsrep = 1;
+#endif /* WITH_WSREP */
+
+ ut_ad(lock_rec_queue_validate(false, block->page.id(),
+ rec, index, offsets));
+
+ DEBUG_SYNC_C("lock_sec_rec_read_check_and_lock_has_locked");
+
+ return(err);
+}
+
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate read, or passing
+over by a read cursor, of a clustered index record. If they do, first tests
+if the query thread should anyway be suspended for some reason; if not, then
+puts the transaction and the query thread to the lock wait state and inserts a
+waiting request for a record lock to the lock queue. Sets the requested mode
+lock on the record.
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_clust_rec_read_check_and_lock(
+/*===============================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: user record or page
+ supremum record which should
+ be read or passed over by a
+ read cursor */
+ dict_index_t* index, /*!< in: clustered index */
+ const rec_offs* offsets,/*!< in: rec_get_offsets(rec, index) */
+ lock_mode mode, /*!< in: mode of the lock which
+ the read cursor should set on
+ records: LOCK_S or LOCK_X; the
+ latter is possible in
+ SELECT FOR UPDATE */
+ unsigned gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(block->page.frame == page_align(rec));
+ ut_ad(page_rec_is_user_rec(rec) || page_rec_is_supremum(rec));
+ ut_ad(gap_mode == LOCK_ORDINARY || gap_mode == LOCK_GAP
+ || gap_mode == LOCK_REC_NOT_GAP);
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(page_rec_is_leaf(rec));
+ ut_ad(!rec_is_metadata(rec, *index));
+
+ if ((flags & BTR_NO_LOCKING_FLAG)
+ || srv_read_only_mode
+ || index->table->is_temporary()) {
+
+ return(DB_SUCCESS);
+ }
+
+ const page_id_t id{block->page.id()};
+
+ ulint heap_no = page_rec_get_heap_no(rec);
+
+ trx_t *trx = thr_get_trx(thr);
+ if (!lock_table_has(trx, index->table, LOCK_X)
+ && heap_no != PAGE_HEAP_NO_SUPREMUM
+ && lock_rec_convert_impl_to_expl<true>(trx, id,
+ rec, index, offsets)
+ && gap_mode == LOCK_REC_NOT_GAP) {
+ /* We already hold an implicit exclusive lock. */
+ return DB_SUCCESS;
+ }
+
+ dberr_t err = lock_rec_lock(false, gap_mode | mode,
+ block, heap_no, index, thr);
+
+ ut_ad(lock_rec_queue_validate(false, id, rec, index, offsets));
+
+ DEBUG_SYNC_C("after_lock_clust_rec_read_check_and_lock");
+
+ return(err);
+}
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate read, or passing
+over by a read cursor, of a clustered index record. If they do, first tests
+if the query thread should anyway be suspended for some reason; if not, then
+puts the transaction and the query thread to the lock wait state and inserts a
+waiting request for a record lock to the lock queue. Sets the requested mode
+lock on the record. This is an alternative version of
+lock_clust_rec_read_check_and_lock() that does not require the parameter
+"offsets".
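+The offsets are computed within this function, at some extra cost.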
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_clust_rec_read_check_and_lock_alt(
+/*===================================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: user record or page
+ supremum record which should
+ be read or passed over by a
+ read cursor */
+ dict_index_t* index, /*!< in: clustered index */
+ lock_mode mode, /*!< in: mode of the lock which
+ the read cursor should set on
+ records: LOCK_S or LOCK_X; the
+ latter is possible in
+ SELECT FOR UPDATE */
+ unsigned gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ mem_heap_t* tmp_heap = NULL;
+ rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs* offsets = offsets_;
+ dberr_t err;
+ rec_offs_init(offsets_);
+
+ ut_ad(page_rec_is_leaf(rec));
+ offsets = rec_get_offsets(rec, index, offsets, index->n_core_fields,
+ ULINT_UNDEFINED, &tmp_heap);
+ err = lock_clust_rec_read_check_and_lock(flags, block, rec, index,
+ offsets, mode, gap_mode, thr);
+ if (tmp_heap) {
+ mem_heap_free(tmp_heap);
+ }
+
+ if (err == DB_SUCCESS_LOCKED_REC) {
+ err = DB_SUCCESS;
+ }
+
+ return(err);
+}
+
+/*******************************************************************//**
+Check if a transaction holds any autoinc locks.
+@return TRUE if the transaction holds any AUTOINC locks. */
+static
+ibool
+lock_trx_holds_autoinc_locks(
+/*=========================*/
+ const trx_t* trx) /*!< in: transaction */
+{
+ ut_a(trx->autoinc_locks != NULL);
+
+ return(!ib_vector_is_empty(trx->autoinc_locks));
+}
+
+/** Release all AUTO_INCREMENT locks of the transaction. */
+static void lock_release_autoinc_locks(trx_t *trx)
+{
+ {
+ LockMutexGuard g{SRW_LOCK_CALL};
+ mysql_mutex_lock(&lock_sys.wait_mutex);
+ trx->mutex_lock();
+ auto autoinc_locks= trx->autoinc_locks;
+ ut_a(autoinc_locks);
+
+ /* We release the locks in the reverse order. This is to avoid
+ searching the vector for the element to delete at the lower level.
+ See (lock_table_remove_low()) for details. */
+ while (ulint size= ib_vector_size(autoinc_locks))
+ {
+ lock_t *lock= *static_cast<lock_t**>
+ (ib_vector_get(autoinc_locks, size - 1));
+ ut_ad(lock->type_mode == (LOCK_AUTO_INC | LOCK_TABLE));
+ lock_table_dequeue(lock, true);
+ lock_trx_table_locks_remove(lock);
+ }
+ }
+ mysql_mutex_unlock(&lock_sys.wait_mutex);
+ trx->mutex_unlock();
+}
+
+/** Cancel a waiting lock request and release possibly waiting transactions */
+template<bool inner_trx_lock= true>
+void lock_cancel_waiting_and_release(lock_t *lock)
+{
+ lock_sys.assert_locked(*lock);
+ mysql_mutex_assert_owner(&lock_sys.wait_mutex);
+ trx_t *trx= lock->trx;
+ if (inner_trx_lock)
+ trx->mutex_lock();
+ ut_d(const auto trx_state= trx->state);
+ ut_ad(trx_state == TRX_STATE_COMMITTED_IN_MEMORY ||
+ trx_state == TRX_STATE_ACTIVE);
+
+ if (!lock->is_table())
+ lock_rec_dequeue_from_page(lock, true);
+ else
+ {
+ if (lock->type_mode == (LOCK_AUTO_INC | LOCK_TABLE))
+ {
+ ut_ad(trx->autoinc_locks);
+ ib_vector_remove(trx->autoinc_locks, lock);
+ }
+ lock_table_dequeue(lock, true);
+ /* Remove the lock from table lock vector too. */
+ lock_trx_table_locks_remove(lock);
+ }
+
+ /* Reset the wait flag and the back pointer to lock in trx.
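+ After this, the lock is no longer marked as waiting, and
+ trx->lock.wait_lock no longer points to it.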
*/
+ lock_reset_lock_and_trx_wait(lock);
+
+ lock_wait_end(trx);
+
+ if (inner_trx_lock)
+ trx->mutex_unlock();
+}
+
+void lock_sys_t::cancel_lock_wait_for_trx(trx_t *trx)
+{
+ lock_sys.wr_lock(SRW_LOCK_CALL);
+ mysql_mutex_lock(&lock_sys.wait_mutex);
+ if (lock_t *lock= trx->lock.wait_lock)
+ {
+ /* check if victim is still waiting */
+ if (lock->is_waiting())
+ lock_cancel_waiting_and_release(lock);
+ }
+ lock_sys.wr_unlock();
+ mysql_mutex_unlock(&lock_sys.wait_mutex);
+}
+
+#ifdef WITH_WSREP
+void lock_sys_t::cancel_lock_wait_for_wsrep_bf_abort(trx_t *trx)
+{
+ lock_sys.assert_locked();
+ mysql_mutex_assert_owner(&lock_sys.wait_mutex);
+ ut_ad(trx->mutex_is_owner());
+ ut_ad(trx->state == TRX_STATE_ACTIVE || trx->state == TRX_STATE_PREPARED);
+ trx->lock.set_wsrep_victim();
+ if (lock_t *lock= trx->lock.wait_lock)
+ lock_cancel_waiting_and_release<false>(lock);
+}
+#endif /* WITH_WSREP */
+
+/** Cancel a waiting lock request.
+@tparam check_victim whether to check for DB_DEADLOCK
+@param trx active transaction
+@param lock waiting lock request
+@retval DB_SUCCESS if no lock existed
+@retval DB_DEADLOCK if trx->lock.was_chosen_as_deadlock_victim was set
+@retval DB_LOCK_WAIT if the lock was canceled */
+template<bool check_victim>
+dberr_t lock_sys_t::cancel(trx_t *trx, lock_t *lock)
+{
+ DEBUG_SYNC_C("lock_sys_t_cancel_enter");
+ mysql_mutex_assert_owner(&lock_sys.wait_mutex);
+ ut_ad(trx->state == TRX_STATE_ACTIVE);
+ /* trx->lock.wait_lock may be changed by other threads as long as
+ we are not holding lock_sys.latch.
+
+ So, trx->lock.wait_lock==lock does not necessarily hold, but both
+ pointers should be valid, because other threads cannot assign
+ trx->lock.wait_lock=nullptr (or invalidate *lock) while we are
+ holding lock_sys.wait_mutex. Also, the type of trx->lock.wait_lock
+ (record or table lock) cannot be changed by other threads. So, it is
+ safe to call lock->is_table() while not holding lock_sys.latch. If
+ we have to release and reacquire lock_sys.wait_mutex, we must reread
+ trx->lock.wait_lock. We must also reread trx->lock.wait_lock after
+ acquiring lock_sys.latch, as it can be changed to non-null by lock
+ moving functions even while we hold lock_sys.wait_mutex. */
+ dberr_t err= DB_SUCCESS;
+ /* This would be too large for a memory transaction, except in the
+ DB_DEADLOCK case, which was already tested in lock_trx_handle_wait(). */
+ if (lock->is_table())
+ {
+ if (!lock_sys.rd_lock_try())
+ {
+ mysql_mutex_unlock(&lock_sys.wait_mutex);
+ lock_sys.rd_lock(SRW_LOCK_CALL);
+ mysql_mutex_lock(&lock_sys.wait_mutex);
+ lock= trx->lock.wait_lock;
+ /* Even if the waiting lock was cancelled while lock_sys.wait_mutex
+ was unlocked, we must return a deadlock error if the transaction was
+ chosen as a deadlock victim, so that it will be rolled back. */
+ if (check_victim && trx->lock.was_chosen_as_deadlock_victim)
+ err= DB_DEADLOCK;
+ else if (lock)
+ goto resolve_table_lock;
+ }
+ else
+ {
+ /* This function is invoked from the thread which executes the
+ transaction. Table locks are requested before record locks. Some other
+ transaction can't change trx->lock.wait_lock from table to record for the
+ current transaction at this point, because the current transaction has not
+ requested record locks yet. There is no need to move any table locks by
+ other threads. And trx->lock.wait_lock can't be set to null while we are
+ holding lock_sys.wait_mutex. That's why there is no need to reload
+ trx->lock.wait_lock here.
*/
+ ut_ad(lock == trx->lock.wait_lock);
+resolve_table_lock:
+ dict_table_t *table= lock->un_member.tab_lock.table;
+ if (!table->lock_mutex_trylock())
+ {
+ /* The correct latching order is:
+ lock_sys.latch, table->lock_mutex_lock(), lock_sys.wait_mutex.
+ Thus, we must release lock_sys.wait_mutex for a blocking wait. */
+ mysql_mutex_unlock(&lock_sys.wait_mutex);
+ table->lock_mutex_lock();
+ mysql_mutex_lock(&lock_sys.wait_mutex);
+ /* Cache trx->lock.wait_lock under the corresponding latches. */
+ lock= trx->lock.wait_lock;
+ if (!lock)
+ goto retreat;
+ else if (check_victim && trx->lock.was_chosen_as_deadlock_victim)
+ {
+ err= DB_DEADLOCK;
+ goto retreat;
+ }
+ }
+ else
+ /* Cache trx->lock.wait_lock under the corresponding latches if
+ it was not cached yet */
+ lock= trx->lock.wait_lock;
+ if (lock->is_waiting())
+ lock_cancel_waiting_and_release(lock);
+ /* Even if lock->is_waiting() did not hold above, we must return
+ DB_LOCK_WAIT, or otherwise optimistic parallel replication could
+ occasionally hang. Potentially affected tests:
+ rpl.rpl_parallel_optimistic
+ rpl.rpl_parallel_optimistic_nobinlog
+ rpl.rpl_parallel_optimistic_xa_lsu_off */
+ err= DB_LOCK_WAIT;
+retreat:
+ table->lock_mutex_unlock();
+ }
+ lock_sys.rd_unlock();
+ }
+ else
+ {
+ /* To prevent the record lock from being moved between pages
+ during a page split or merge, we must hold exclusive lock_sys.latch. */
+ if (!lock_sys.wr_lock_try())
+ {
+ mysql_mutex_unlock(&lock_sys.wait_mutex);
+ lock_sys.wr_lock(SRW_LOCK_CALL);
+ mysql_mutex_lock(&lock_sys.wait_mutex);
+ /* Cache trx->lock.wait_lock under the corresponding latches. */
+ lock= trx->lock.wait_lock;
+ /* Even if the waiting lock was cancelled while lock_sys.wait_mutex
+ was unlocked, we must return a deadlock error if the transaction was
+ chosen as a deadlock victim, so that it will be rolled back. */
+ if (check_victim && trx->lock.was_chosen_as_deadlock_victim)
+ err= DB_DEADLOCK;
+ else if (lock)
+ goto resolve_record_lock;
+ }
+ else
+ {
+ /* Cache trx->lock.wait_lock under the corresponding latches if
+ it was not cached yet */
+ lock= trx->lock.wait_lock;
+resolve_record_lock:
+ if (lock->is_waiting())
+ lock_cancel_waiting_and_release(lock);
+ /* Even if lock->is_waiting() did not hold above, we must return
+ DB_LOCK_WAIT, or otherwise optimistic parallel replication could
+ occasionally hang. Potentially affected tests:
+ rpl.rpl_parallel_optimistic
+ rpl.rpl_parallel_optimistic_nobinlog
+ rpl.rpl_parallel_optimistic_xa_lsu_off */
+ err= DB_LOCK_WAIT;
+ }
+ lock_sys.wr_unlock();
+ }
+
+ return err;
+}
+
+template dberr_t lock_sys_t::cancel<false>(trx_t *, lock_t *);
+
+/*********************************************************************//**
+Unlocks AUTO_INC type locks that were possibly reserved by a trx. This
+function should be called at the end of an SQL statement, by the
+connection thread that owns the transaction (trx->mysql_thd). */
+void
+lock_unlock_table_autoinc(
+/*======================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ lock_sys.assert_unlocked();
+ ut_ad(!trx->mutex_is_owner());
+ ut_ad(!trx->lock.wait_lock);
+
+ /* This can be invoked on NOT_STARTED, ACTIVE, PREPARED,
+ but not COMMITTED transactions. */
+
+ ut_ad(trx_state_eq(trx, TRX_STATE_NOT_STARTED)
+ || !trx_state_eq(trx, TRX_STATE_COMMITTED_IN_MEMORY));
+
+ /* This function is invoked for a running transaction by the
+ thread that is serving the transaction. Therefore it is not
+ necessary to hold trx->mutex here.
*/
+
+ if (lock_trx_holds_autoinc_locks(trx)) {
+ lock_release_autoinc_locks(trx);
+ }
+}
+
+/** Handle a pending lock wait (DB_LOCK_WAIT) in a semi-consistent read
+while holding a clustered index leaf page latch.
+
+@param trx transaction that is or was waiting for a lock
+@retval DB_SUCCESS if the lock was granted
+@retval DB_DEADLOCK if the transaction must be aborted due to a deadlock
+@retval DB_LOCK_WAIT if a lock wait would be necessary; the pending
+ lock request was released */
+dberr_t lock_trx_handle_wait(trx_t *trx)
+{
+ DEBUG_SYNC_C("lock_trx_handle_wait_enter");
+ if (trx->lock.was_chosen_as_deadlock_victim)
+ return DB_DEADLOCK;
+ DEBUG_SYNC_C("lock_trx_handle_wait_before_unlocked_wait_lock_check");
+ /* trx->lock.was_chosen_as_deadlock_victim must always be set before
+ trx->lock.wait_lock is reset if the transaction was chosen as a deadlock
+ victim; the function must not return DB_SUCCESS if
+ trx->lock.was_chosen_as_deadlock_victim is set. */
+ if (!trx->lock.wait_lock)
+ return trx->lock.was_chosen_as_deadlock_victim ? DB_DEADLOCK : DB_SUCCESS;
+ dberr_t err= DB_SUCCESS;
+ mysql_mutex_lock(&lock_sys.wait_mutex);
+ if (trx->lock.was_chosen_as_deadlock_victim)
+ err= DB_DEADLOCK;
+ /* Cache trx->lock.wait_lock to avoid unnecessary atomic variable load */
+ else if (lock_t *wait_lock= trx->lock.wait_lock)
+ err= lock_sys_t::cancel<true>(trx, wait_lock);
+ lock_sys.deadlock_check();
+ mysql_mutex_unlock(&lock_sys.wait_mutex);
+ return err;
+}
+
+#ifdef UNIV_DEBUG
+/**
+ Do an exhaustive check for any locks (table or rec) against the table.
+
+ @param[in] table check if there are any locks held on records in this table
+ or on the table itself
+*/
+
+static my_bool lock_table_locks_lookup(rw_trx_hash_element_t *element,
+ const dict_table_t *table)
+{
+ lock_sys.assert_locked();
+ element->mutex.wr_lock();
+ if (element->trx)
+ {
+ element->trx->mutex_lock();
+ check_trx_state(element->trx);
+ if (element->trx->state != TRX_STATE_COMMITTED_IN_MEMORY)
+ {
+ for (const lock_t *lock= UT_LIST_GET_FIRST(element->trx->lock.trx_locks);
+ lock != NULL;
+ lock= UT_LIST_GET_NEXT(trx_locks, lock))
+ {
+ ut_ad(lock->trx == element->trx);
+ if (!lock->is_table())
+ {
+ ut_ad(lock->index->online_status != ONLINE_INDEX_CREATION ||
+ lock->index->is_primary());
+ ut_ad(lock->index->table != table);
+ }
+ else
+ ut_ad(lock->un_member.tab_lock.table != table);
+ }
+ }
+ element->trx->mutex_unlock();
+ }
+ element->mutex.wr_unlock();
+ return 0;
+}
+#endif /* UNIV_DEBUG */
+
+/** Check if there are any locks on a table.
+@return true if table has either table or record locks. */
+TRANSACTIONAL_TARGET
+bool lock_table_has_locks(dict_table_t *table)
+{
+ if (table->n_rec_locks)
+ return true;
+ ulint len;
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (xbegin())
+ {
+ if (table->lock_mutex_is_locked())
+ xabort();
+ len= UT_LIST_GET_LEN(table->locks);
+ xend();
+ }
+ else
+#endif
+ {
+ table->lock_mutex_lock();
+ len= UT_LIST_GET_LEN(table->locks);
+ table->lock_mutex_unlock();
+ }
+ if (len)
+ return true;
+#ifdef UNIV_DEBUG
+ {
+ LockMutexGuard g{SRW_LOCK_CALL};
+ trx_sys.rw_trx_hash.iterate(lock_table_locks_lookup,
+ const_cast<dict_table_t*>(table));
+ }
+#endif /* UNIV_DEBUG */
+ return false;
+}
+
+/*******************************************************************//**
+Initialise the table lock list.
*/ +void +lock_table_lock_list_init( +/*======================*/ + table_lock_list_t* lock_list) /*!< List to initialise */ +{ + UT_LIST_INIT(*lock_list, &lock_table_t::locks); +} + +#ifdef UNIV_DEBUG +/*******************************************************************//** +Check if the transaction holds any locks on the sys tables +or its records. +@return the strongest lock found on any sys table or 0 for none */ +const lock_t* +lock_trx_has_sys_table_locks( +/*=========================*/ + const trx_t* trx) /*!< in: transaction to check */ +{ + const lock_t* strongest_lock = 0; + lock_mode strongest = LOCK_NONE; + + LockMutexGuard g{SRW_LOCK_CALL}; + + const lock_list::const_iterator end = trx->lock.table_locks.end(); + lock_list::const_iterator it = trx->lock.table_locks.begin(); + + /* Find a valid mode. Note: ib_vector_size() can be 0. */ + + for (/* No op */; it != end; ++it) { + const lock_t* lock = *it; + + if (lock != NULL + && dict_is_sys_table(lock->un_member.tab_lock.table->id)) { + + strongest = lock->mode(); + ut_ad(strongest != LOCK_NONE); + strongest_lock = lock; + break; + } + } + + if (strongest == LOCK_NONE) { + return(NULL); + } + + for (/* No op */; it != end; ++it) { + const lock_t* lock = *it; + + if (lock == NULL) { + continue; + } + + ut_ad(trx == lock->trx); + ut_ad(lock->is_table()); + ut_ad(lock->un_member.tab_lock.table); + + lock_mode mode = lock->mode(); + + if (dict_is_sys_table(lock->un_member.tab_lock.table->id) + && lock_mode_stronger_or_eq(mode, strongest)) { + + strongest = mode; + strongest_lock = lock; + } + } + + return(strongest_lock); +} + +/** Check if the transaction holds an explicit exclusive lock on a record. +@param[in] trx transaction +@param[in] table table +@param[in] id leaf page identifier +@param[in] heap_no heap number identifying the record +@return whether an explicit X-lock is held */ +bool lock_trx_has_expl_x_lock(const trx_t &trx, const dict_table_t &table, + page_id_t id, ulint heap_no) +{ + ut_ad(heap_no > PAGE_HEAP_NO_SUPREMUM); + ut_ad(lock_table_has(&trx, &table, LOCK_IX)); + if (!lock_table_has(&trx, &table, LOCK_X)) + { + LockGuard g{lock_sys.rec_hash, id}; + ut_ad(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, + g.cell(), id, heap_no, &trx)); + } + return true; +} +#endif /* UNIV_DEBUG */ + +namespace Deadlock +{ + /** rewind(3) the file used for storing the latest detected deadlock and + print a heading message to stderr if printing of all deadlocks to stderr + is enabled. */ + static void start_print() + { + lock_sys.assert_locked(); + + rewind(lock_latest_err_file); + ut_print_timestamp(lock_latest_err_file); + + if (srv_print_all_deadlocks) + ib::info() << "Transactions deadlock detected," + " dumping detailed information."; + } + + /** Print a message to the deadlock file and possibly to stderr. + @param msg message to print */ + static void print(const char *msg) + { + fputs(msg, lock_latest_err_file); + if (srv_print_all_deadlocks) + ib::info() << msg; + } + + /** Print transaction data to the deadlock file and possibly to stderr. 
+ @param trx transaction */ + static void print(const trx_t &trx) + { + lock_sys.assert_locked(); + + ulint n_rec_locks= trx.lock.n_rec_locks; + ulint n_trx_locks= UT_LIST_GET_LEN(trx.lock.trx_locks); + ulint heap_size= mem_heap_get_size(trx.lock.lock_heap); + + trx_print_low(lock_latest_err_file, &trx, 3000, + n_rec_locks, n_trx_locks, heap_size); + + if (srv_print_all_deadlocks) + trx_print_low(stderr, &trx, 3000, n_rec_locks, n_trx_locks, heap_size); + } + + /** Print lock data to the deadlock file and possibly to stderr. + @param lock record or table type lock */ + static void print(const lock_t &lock) + { + lock_sys.assert_locked(); + + if (!lock.is_table()) + { + mtr_t mtr; + lock_rec_print(lock_latest_err_file, &lock, mtr); + + if (srv_print_all_deadlocks) + lock_rec_print(stderr, &lock, mtr); + } + else + { + lock_table_print(lock_latest_err_file, &lock); + + if (srv_print_all_deadlocks) + lock_table_print(stderr, &lock); + } + } + + ATTRIBUTE_COLD + /** Calculate a number used to compare deadlock victim candidates. +Bit 62 is used to prefer transaction that did not modified non-transactional +tables. Bits 1-61 are set to TRX_WEIGHT to prefer transactions with less locks +and less modified rows. Bit 0 is used to prefer orig_trx in case of a tie. + @param trx Transaction + @return a 64-bit unsigned, the lower the more preferred TRX is as a deadlock + victim */ + static undo_no_t calc_victim_weight(trx_t *trx, const trx_t *orig_trx) + { + const undo_no_t trx_weight= (trx != orig_trx) | (TRX_WEIGHT(trx) << 1) | + (trx->mysql_thd && +#ifdef WITH_WSREP + (thd_has_edited_nontrans_tables(trx->mysql_thd) || + (trx->is_wsrep() && wsrep_thd_is_BF(trx->mysql_thd, false))) +#else + thd_has_edited_nontrans_tables(trx->mysql_thd) +#endif /* WITH_WSREP */ + ? 1ULL << 62 : 0); + return trx_weight; + } + + ATTRIBUTE_COLD + /** Report a deadlock (cycle in the waits-for graph). + @param trx transaction waiting for a lock in this thread + @param current_trx whether trx belongs to the current thread + @return the transaction to be rolled back (unless one was committed already) + @return nullptr if no deadlock */ + static trx_t *report(trx_t *const trx, bool current_trx) + { + mysql_mutex_assert_owner(&lock_sys.wait_mutex); + ut_ad(xtest() || lock_sys.is_writer() == !current_trx); + + /* Normally, trx should be a direct part of the deadlock + cycle. However, if innodb_deadlock_detect had been OFF in the + past, or if current_trx=false, trx may be waiting for a lock that + is held by a participant of a pre-existing deadlock, without being + part of the deadlock itself. That is, the path to the deadlock may be + P-shaped instead of O-shaped, with trx being at the foot of the P. + + We will process the entire path leading to a cycle, and we will + choose the victim (to be aborted) among the cycle. */ + + static const char rollback_msg[]= "*** WE ROLL BACK TRANSACTION (%u)\n"; + char buf[9 + sizeof rollback_msg]; + trx_t *victim= nullptr; + + /* Here, lock elision does not make sense, because + for the output we are going to invoke system calls, + which would interrupt a memory transaction. */ + if (current_trx && !lock_sys.wr_lock_try()) + { + mysql_mutex_unlock(&lock_sys.wait_mutex); + lock_sys.wr_lock(SRW_LOCK_CALL); + mysql_mutex_lock(&lock_sys.wait_mutex); + } + + { + unsigned l= 1; + /* Now that we are holding lock_sys.wait_mutex again, check + whether a cycle still exists. */ + trx_t *cycle= find_cycle(trx); + if (!cycle) + goto func_exit; /* One of the transactions was already aborted. 
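+ The waits-for cycle was broken by that abort; there is no victim
+ to choose.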
*/
+
+ victim= cycle;
+ undo_no_t victim_weight= calc_victim_weight(victim, trx);
+ unsigned victim_pos= l;
+ for (trx_t *next= cycle;;)
+ {
+ next= next->lock.wait_trx;
+ l++;
+ const undo_no_t next_weight= calc_victim_weight(next, trx);
+#ifdef HAVE_REPLICATION
+ const int pref=
+ thd_deadlock_victim_preference(victim->mysql_thd, next->mysql_thd);
+ /* Set bit 63 for any non-preferred victim to make such preference take
+ priority in the weight comparison.
+ -1 means victim is preferred. 1 means next is preferred. */
+ undo_no_t victim_not_pref= (1ULL << 63) & (undo_no_t)(int64_t)(-pref);
+ undo_no_t next_not_pref= (1ULL << 63) & (undo_no_t)(int64_t)pref;
+#else
+ undo_no_t victim_not_pref= 0;
+ undo_no_t next_not_pref= 0;
+#endif
+ /* Single comparison to decide which of two transactions is preferred
+ as a deadlock victim.
+ - If thd_deadlock_victim_preference() returned non-zero, bit 63
+ comparison will decide the preferred one.
+ - Else if exactly one of them modified non-transactional tables,
+ bit 62 will decide.
+ - Else the TRX_WEIGHT in bits 1-61 will decide, if not equal.
+ - Else, if one of them is the original trx, bit 0 will decide.
+ - If all is equal, previous victim will arbitrarily be chosen. */
+ if ((next_weight|next_not_pref) < (victim_weight|victim_not_pref))
+ {
+ victim_weight= next_weight;
+ victim= next;
+ victim_pos= l;
+ }
+ if (next == cycle)
+ break;
+ }
+
+ /* Finally, display the deadlock */
+ switch (const auto r= static_cast<enum report>(innodb_deadlock_report)) {
+ case REPORT_OFF:
+ break;
+ case REPORT_BASIC:
+ case REPORT_FULL:
+ start_print();
+ l= 0;
+
+ for (trx_t *next= cycle;;)
+ {
+ next= next->lock.wait_trx;
+ ut_ad(next);
+ ut_ad(next->state == TRX_STATE_ACTIVE);
+ const lock_t *wait_lock= next->lock.wait_lock;
+ ut_ad(wait_lock);
+ snprintf(buf, sizeof buf, "\n*** (%u) TRANSACTION:\n", ++l);
+ print(buf);
+ print(*next);
+ print("*** WAITING FOR THIS LOCK TO BE GRANTED:\n");
+ print(*wait_lock);
+ if (r == REPORT_BASIC);
+ else if (wait_lock->is_table())
+ {
+ if (const lock_t *lock=
+ UT_LIST_GET_FIRST(wait_lock->un_member.tab_lock.table->locks))
+ {
+ ut_ad(!lock->is_waiting());
+ print("*** CONFLICTING WITH:\n");
+ do
+ print(*lock);
+ while ((lock= UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock)) &&
+ !lock->is_waiting());
+ }
+ else
+ ut_ad("no conflicting table lock found" == 0);
+ }
+ else
+ {
+ const page_id_t id{wait_lock->un_member.rec_lock.page_id};
+ hash_cell_t &cell= *(wait_lock->type_mode & LOCK_PREDICATE
+ ? lock_sys.prdt_hash : lock_sys.rec_hash).
+ cell_get(id.fold());
+ if (const lock_t *lock= lock_sys_t::get_first(cell, id))
+ {
+ const ulint heap_no= lock_rec_find_set_bit(wait_lock);
+ if (!lock_rec_get_nth_bit(lock, heap_no))
+ lock= lock_rec_get_next_const(heap_no, lock);
+ ut_ad(!lock->is_waiting());
+ print("*** CONFLICTING WITH:\n");
+ do
+ print(*lock);
+ while ((lock= lock_rec_get_next_const(heap_no, lock)) &&
+ !lock->is_waiting());
+ }
+ else
+ ut_ad("no conflicting record lock found" == 0);
+ }
+ if (next == cycle)
+ break;
+ }
+ snprintf(buf, sizeof buf, rollback_msg, victim_pos);
+ print(buf);
+ }
+
+ ut_ad(victim->state == TRX_STATE_ACTIVE);
+
+ /* victim->lock.was_chosen_as_deadlock_victim must always be set before
+ releasing waiting locks and resetting trx->lock.wait_lock */
+ victim->lock.was_chosen_as_deadlock_victim= true;
+ DEBUG_SYNC_C("deadlock_report_before_lock_releasing");
+ lock_cancel_waiting_and_release(victim->lock.wait_lock);
+#ifdef WITH_WSREP
+ if (victim->is_wsrep() && wsrep_thd_is_SR(victim->mysql_thd))
+ wsrep_handle_SR_rollback(trx->mysql_thd, victim->mysql_thd);
+#endif
+ }
+
+func_exit:
+ if (current_trx)
+ lock_sys.wr_unlock();
+ return victim;
+ }
+}
+
+/** Check if a lock request results in a deadlock.
+Resolve a deadlock by choosing a transaction that will be rolled back.
+@param trx transaction requesting a lock
+@param wait_lock the lock being requested
+@return the lock that trx is or was waiting for
+@retval nullptr if the lock wait was resolved
+@retval -1 if trx must report DB_DEADLOCK */
+static lock_t *Deadlock::check_and_resolve(trx_t *trx, lock_t *wait_lock)
+{
+ mysql_mutex_assert_owner(&lock_sys.wait_mutex);
+
+ ut_ad(!trx->mutex_is_owner());
+ ut_ad(trx->state == TRX_STATE_ACTIVE);
+ ut_ad(!srv_read_only_mode);
+ ut_ad(wait_lock);
+
+ if (!innodb_deadlock_detect)
+ return wait_lock;
+
+ if (UNIV_LIKELY_NULL(find_cycle(trx)))
+ {
+ if (report(trx, true) == trx)
+ return reinterpret_cast<lock_t*>(-1);
+ /* Because report() released and reacquired lock_sys.wait_mutex,
+ another thread may have cleared trx->lock.wait_lock meanwhile. */
+ wait_lock= trx->lock.wait_lock;
+ }
+
+ if (UNIV_LIKELY(!trx->lock.was_chosen_as_deadlock_victim))
+ return wait_lock;
+
+ if (wait_lock)
+ lock_sys_t::cancel<false>(trx, wait_lock);
+
+ lock_sys.deadlock_check();
+ return reinterpret_cast<lock_t*>(-1);
+}
+
+/** Check for deadlocks while holding only lock_sys.wait_mutex. */
+TRANSACTIONAL_TARGET
+void lock_sys_t::deadlock_check()
+{
+ ut_ad(!is_writer());
+ mysql_mutex_assert_owner(&wait_mutex);
+ bool acquired= false;
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ bool elided= false;
+#endif
+
+ if (Deadlock::to_be_checked)
+ {
+ for (;;)
+ {
+ auto i= Deadlock::to_check.begin();
+ if (i == Deadlock::to_check.end())
+ break;
+ if (acquired);
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ else if (xbegin())
+ {
+ if (latch.is_locked_or_waiting())
+ xabort();
+ acquired= elided= true;
+ }
+#endif
+ else
+ {
+ acquired= wr_lock_try();
+ if (!acquired)
+ {
+ acquired= true;
+ mysql_mutex_unlock(&wait_mutex);
+ lock_sys.wr_lock(SRW_LOCK_CALL);
+ mysql_mutex_lock(&wait_mutex);
+ continue;
+ }
+ }
+ trx_t *trx= *i;
+ Deadlock::to_check.erase(i);
+ if (Deadlock::find_cycle(trx))
+ Deadlock::report(trx, false);
+ }
+ Deadlock::to_be_checked= false;
+ }
+ ut_ad(Deadlock::to_check.empty());
+#if !defined NO_ELISION && !defined SUX_LOCK_GENERIC
+ if (elided)
+ return;
+#endif
+ if (acquired)
+ wr_unlock();
+}
+
+/** Update the locks when a page is split and merged to two pages,
+in defragmentation.
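+First the record that was moved from the right page inherits the gap locks
+of the left page's supremum; the supremum locks are then released and
+re-inherited from the first user record of the right page.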
*/ +void lock_update_split_and_merge( + const buf_block_t* left_block, /*!< in: left page to which merged */ + const rec_t* orig_pred, /*!< in: original predecessor of + supremum on the left page before merge*/ + const buf_block_t* right_block) /*!< in: right page from which merged */ +{ + ut_ad(page_is_leaf(left_block->page.frame)); + ut_ad(page_is_leaf(right_block->page.frame)); + ut_ad(page_align(orig_pred) == left_block->page.frame); + + const page_id_t l{left_block->page.id()}; + const page_id_t r{right_block->page.id()}; + const rec_t *left_next_rec= page_rec_get_next_const(orig_pred); + if (UNIV_UNLIKELY(!left_next_rec)) + { + ut_ad("corrupted page" == 0); + return; + } + ut_ad(!page_rec_is_metadata(left_next_rec)); + + /* This would likely be too large for a memory transaction. */ + LockMultiGuard g{lock_sys.rec_hash, l, r}; + + /* Inherit the locks on the supremum of the left page to the + first record which was moved from the right page */ + lock_rec_inherit_to_gap(g.cell1(), l, g.cell1(), l, left_block->page.frame, + page_rec_get_heap_no(left_next_rec), + PAGE_HEAP_NO_SUPREMUM); + + /* Reset the locks on the supremum of the left page, + releasing waiting transactions */ + lock_rec_reset_and_release_wait(g.cell1(), l, PAGE_HEAP_NO_SUPREMUM); + + /* Inherit the locks to the supremum of the left page from the + successor of the infimum on the right page */ + lock_rec_inherit_to_gap(g.cell1(), l, g.cell2(), r, left_block->page.frame, + PAGE_HEAP_NO_SUPREMUM, + lock_get_min_heap_no(right_block)); +} diff --git a/storage/innobase/lock/lock0prdt.cc b/storage/innobase/lock/lock0prdt.cc new file mode 100644 index 00000000..29756591 --- /dev/null +++ b/storage/innobase/lock/lock0prdt.cc @@ -0,0 +1,928 @@ +/***************************************************************************** + +Copyright (c) 2014, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2018, 2022, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file lock/lock0prdt.cc
+The transaction lock system
+
+Created 9/7/2013 Jimmy Yang
+*******************************************************/
+
+#define LOCK_MODULE_IMPLEMENTATION
+
+#include "lock0lock.h"
+#include "lock0priv.h"
+#include "lock0prdt.h"
+#include "dict0mem.h"
+#include "que0que.h"
+
+/*********************************************************************//**
+Get a minimum bounding box from a Predicate
+@return the minimum bounding box */
+UNIV_INLINE
+rtr_mbr_t*
+prdt_get_mbr_from_prdt(
+/*===================*/
+ const lock_prdt_t* prdt) /*!< in: the lock predicate */
+{
+ rtr_mbr_t* mbr_loc = reinterpret_cast<rtr_mbr_t*>(prdt->data);
+
+ return(mbr_loc);
+}
+
+/*********************************************************************//**
+Get a predicate from a lock
+@return the predicate */
+lock_prdt_t*
+lock_get_prdt_from_lock(
+/*====================*/
+ const lock_t* lock) /*!< in: the lock */
+{
+ lock_prdt_t* prdt = reinterpret_cast<lock_prdt_t*>(
+ &((reinterpret_cast<byte*>(
+ const_cast<lock_t*>(&lock[1])))[
+ UNIV_WORD_SIZE]));
+
+ return(prdt);
+}
+
+/*********************************************************************//**
+Get a minimum bounding box directly from a lock
+@return the minimum bounding box*/
+UNIV_INLINE
+rtr_mbr_t*
+lock_prdt_get_mbr_from_lock(
+/*========================*/
+ const lock_t* lock) /*!< in: the lock */
+{
+ ut_ad(lock->type_mode & LOCK_PREDICATE);
+
+ lock_prdt_t* prdt = lock_get_prdt_from_lock(lock);
+
+ rtr_mbr_t* mbr_loc = prdt_get_mbr_from_prdt(prdt);
+
+ return(mbr_loc);
+}
+
+/*********************************************************************//**
+Append a predicate to the lock */
+void
+lock_prdt_set_prdt(
+/*===============*/
+ lock_t* lock, /*!< in: lock */
+ const lock_prdt_t* prdt) /*!< in: Predicate */
+{
+ ut_ad(lock->type_mode & LOCK_PREDICATE);
+
+ memcpy(&(((byte*) &lock[1])[UNIV_WORD_SIZE]), prdt, sizeof *prdt);
+}
+
+
+/** Check whether two predicate locks are compatible with each other
+@param[in] prdt1 first predicate lock
+@param[in] prdt2 second predicate lock
+@param[in] op predicate comparison operator
+@return true if consistent */
+static
+bool
+lock_prdt_consistent(
+ lock_prdt_t* prdt1,
+ lock_prdt_t* prdt2,
+ ulint op)
+{
+ bool ret = false;
+ rtr_mbr_t* mbr1 = prdt_get_mbr_from_prdt(prdt1);
+ rtr_mbr_t* mbr2 = prdt_get_mbr_from_prdt(prdt2);
+ ulint action;
+
+ if (op) {
+ action = op;
+ } else {
+ if (prdt2->op != 0 && (prdt1->op != prdt2->op)) {
+ return(false);
+ }
+
+ action = prdt1->op;
+ }
+
+ switch (action) {
+ case PAGE_CUR_CONTAIN:
+ ret = MBR_CONTAIN_CMP(mbr1, mbr2);
+ break;
+ case PAGE_CUR_DISJOINT:
+ ret = MBR_DISJOINT_CMP(mbr1, mbr2);
+ break;
+ case PAGE_CUR_MBR_EQUAL:
+ ret = MBR_EQUAL_CMP(mbr1, mbr2);
+ break;
+ case PAGE_CUR_INTERSECT:
+ ret = MBR_INTERSECT_CMP(mbr1, mbr2);
+ break;
+ case PAGE_CUR_WITHIN:
+ ret = MBR_WITHIN_CMP(mbr1, mbr2);
+ break;
+ default:
+ ib::error() << "invalid operator " << action;
+ ut_error;
+ }
+
+ return(ret);
+}
+
+/*********************************************************************//**
+Checks if a predicate lock request for a new lock has to wait for
+another lock.
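+Unlike ordinary next-key locks, a conflict is decided by comparing the
+minimum bounding rectangles (MBRs) of the two predicates.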
+@return true if new lock has to wait for lock2 to be released */
+bool
+lock_prdt_has_to_wait(
+/*==================*/
+ const trx_t* trx, /*!< in: trx of new lock */
+ unsigned type_mode,/*!< in: precise mode of the new lock
+ to set: LOCK_S or LOCK_X, possibly
+ ORed to LOCK_PREDICATE or LOCK_PRDT_PAGE,
+ LOCK_INSERT_INTENTION */
+ lock_prdt_t* prdt, /*!< in: lock predicate to check */
+ const lock_t* lock2) /*!< in: another record lock; NOTE that
+ it is assumed that this has a lock bit
+ set on the same record as in the new
+ lock we are setting */
+{
+ lock_prdt_t* cur_prdt = lock_get_prdt_from_lock(lock2);
+
+ ut_ad(trx && lock2);
+ ut_ad((lock2->type_mode & LOCK_PREDICATE && type_mode & LOCK_PREDICATE)
+ || (lock2->type_mode & LOCK_PRDT_PAGE
+ && type_mode & LOCK_PRDT_PAGE));
+
+ ut_ad(type_mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE));
+
+ if (trx != lock2->trx
+ && !lock_mode_compatible(static_cast<lock_mode>(
+ LOCK_MODE_MASK & type_mode),
+ lock2->mode())) {
+
+ /* If it is a page lock, then return true (conflict) */
+ if (type_mode & LOCK_PRDT_PAGE) {
+ ut_ad(lock2->type_mode & LOCK_PRDT_PAGE);
+
+ return(true);
+ }
+
+ /* A predicate lock does not conflict with a
+ non-predicate lock */
+ if (!(lock2->type_mode & LOCK_PREDICATE)) {
+ return(FALSE);
+ }
+
+ ut_ad(lock2->type_mode & LOCK_PREDICATE);
+
+ if (!(type_mode & LOCK_INSERT_INTENTION)) {
+ /* PREDICATE locks without LOCK_INSERT_INTENTION flag
+ do not need to wait for anything. This is because
+ different users can have conflicting lock types
+ on predicates. */
+
+ return(FALSE);
+ }
+
+ if (lock2->type_mode & LOCK_INSERT_INTENTION) {
+
+ /* No lock request needs to wait for an insert
+ intention lock to be removed. This makes it similar
+ to GAP lock, that allows conflicting insert intention
+ locks */
+ return(FALSE);
+ }
+
+ if (!lock_prdt_consistent(cur_prdt, prdt, 0)) {
+ return(false);
+ }
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*********************************************************************//**
+Checks if a transaction has a GRANTED stronger or equal predicate lock
+on the page
+@return lock or NULL */
+UNIV_INLINE
+lock_t*
+lock_prdt_has_lock(
+/*===============*/
+ ulint precise_mode, /*!< in: LOCK_S or LOCK_X */
+ hash_cell_t& cell, /*!< hash table cell of id */
+ const page_id_t id, /*!< in: page identifier */
+ lock_prdt_t* prdt, /*!< in: The predicate to be
+ attached to the new lock */
+ const trx_t* trx) /*!< in: transaction */
+{
+ ut_ad((precise_mode & LOCK_MODE_MASK) == LOCK_S
+ || (precise_mode & LOCK_MODE_MASK) == LOCK_X);
+ ut_ad(!(precise_mode & LOCK_INSERT_INTENTION));
+
+ for (lock_t* lock = lock_sys_t::get_first(cell, id, PRDT_HEAPNO);
+ lock;
+ lock = lock_rec_get_next(PRDT_HEAPNO, lock)) {
+ ut_ad(lock->type_mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE));
+
+ if (lock->trx == trx
+ && !(lock->type_mode & (LOCK_INSERT_INTENTION | LOCK_WAIT))
+ && lock_mode_stronger_or_eq(
+ lock->mode(),
+ static_cast<lock_mode>(
+ precise_mode & LOCK_MODE_MASK))) {
+ if (lock->type_mode & LOCK_PRDT_PAGE) {
+ return(lock);
+ }
+
+ lock_prdt_t* cur_prdt = lock_get_prdt_from_lock(
+ lock);
+
+ /* if the lock predicate operator is the same
+ as the one to look for, and the predicate test is
+ successful, then we have found a lock */
+ if (cur_prdt->op == prdt->op
+ && lock_prdt_consistent(cur_prdt, prdt, 0)) {
+
+ return(lock);
+ }
+ }
+ }
+
+ return(NULL);
+}
+
+/*********************************************************************//**
+Checks if some other transaction has a conflicting predicate
+lock request in the queue, so that we have to wait.
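+All predicate locks on a page are attached to the pseudo heap number
+PRDT_HEAPNO, so only that lock queue needs to be scanned.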
+@return lock or NULL */ +static +lock_t* +lock_prdt_other_has_conflicting( +/*============================*/ + unsigned mode, /*!< in: LOCK_S or LOCK_X, + possibly ORed to LOCK_PREDICATE or + LOCK_PRDT_PAGE, LOCK_INSERT_INTENTION */ + const hash_cell_t& cell, /*!< in: hash table cell */ + const page_id_t id, /*!< in: page identifier */ + lock_prdt_t* prdt, /*!< in: Predicates (currently) + the Minimum Bounding Rectangle) + the new lock will be on */ + const trx_t* trx) /*!< in: our transaction */ +{ + for (lock_t* lock = lock_sys_t::get_first(cell, id, PRDT_HEAPNO); + lock != NULL; + lock = lock_rec_get_next(PRDT_HEAPNO, lock)) { + + if (lock->trx == trx) { + continue; + } + + if (lock_prdt_has_to_wait(trx, mode, prdt, lock)) { + return(lock); + } + } + + return(NULL); +} + +/*********************************************************************//** +Reset the Minimum Bounding Rectangle (to a large area) */ +static +void +lock_prdt_enlarge_mbr( +/*==================*/ + const lock_t* lock, /*!< in/out: lock to modify */ + rtr_mbr_t* mbr) /*!< in: Minimum Bounding Rectangle */ +{ + rtr_mbr_t* cur_mbr = lock_prdt_get_mbr_from_lock(lock); + + if (cur_mbr->xmin > mbr->xmin) { + cur_mbr->xmin = mbr->xmin; + } + + if (cur_mbr->ymin > mbr->ymin) { + cur_mbr->ymin = mbr->ymin; + } + + if (cur_mbr->xmax < mbr->xmax) { + cur_mbr->xmax = mbr->xmax; + } + + if (cur_mbr->ymax < mbr->ymax) { + cur_mbr->ymax = mbr->ymax; + } +} + +/*********************************************************************//** +Reset the predicates to a "covering" (larger) predicates */ +static +void +lock_prdt_enlarge_prdt( +/*===================*/ + lock_t* lock, /*!< in/out: lock to modify */ + lock_prdt_t* prdt) /*!< in: predicate */ +{ + rtr_mbr_t* mbr = prdt_get_mbr_from_prdt(prdt); + + lock_prdt_enlarge_mbr(lock, mbr); +} + +/*********************************************************************//** +Check two predicates' MBRs are the same +@return true if they are the same */ +static +bool +lock_prdt_is_same( +/*==============*/ + lock_prdt_t* prdt1, /*!< in: MBR with the lock */ + lock_prdt_t* prdt2) /*!< in: MBR with the lock */ +{ + rtr_mbr_t* mbr1 = prdt_get_mbr_from_prdt(prdt1); + rtr_mbr_t* mbr2 = prdt_get_mbr_from_prdt(prdt2); + + if (prdt1->op == prdt2->op && MBR_EQUAL_CMP(mbr1, mbr2)) { + return(true); + } + + return(false); +} + +/*********************************************************************//** +Looks for a similar predicate lock struct by the same trx on the same page. +This can be used to save space when a new record lock should be set on a page: +no new struct is needed, if a suitable old one is found. 
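+For a LOCK_PREDICATE lock the stored predicate must match as well;
+for a LOCK_PRDT_PAGE lock, a matching type_mode is sufficient.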
+@return lock or NULL */ +static +lock_t* +lock_prdt_find_on_page( +/*===================*/ + unsigned type_mode, /*!< in: lock type_mode field */ + const buf_block_t* block, /*!< in: buffer block */ + lock_prdt_t* prdt, /*!< in: MBR with the lock */ + const trx_t* trx) /*!< in: transaction */ +{ + const page_id_t id{block->page.id()}; + hash_cell_t& cell = *lock_sys.hash_get(type_mode).cell_get(id.fold()); + + for (lock_t *lock = lock_sys_t::get_first(cell, id); + lock != NULL; + lock = lock_rec_get_next_on_page(lock)) { + + if (lock->trx == trx + && lock->type_mode == type_mode) { + if (lock->type_mode & LOCK_PRDT_PAGE) { + return(lock); + } + + ut_ad(lock->type_mode & LOCK_PREDICATE); + + if (lock_prdt_is_same(lock_get_prdt_from_lock(lock), + prdt)) { + return(lock); + } + } + } + + return(NULL); +} + +/*********************************************************************//** +Adds a predicate lock request in the predicate lock queue. +@return lock where the bit was set */ +static +lock_t* +lock_prdt_add_to_queue( +/*===================*/ + unsigned type_mode,/*!< in: lock mode, wait, predicate + etc. flags */ + const buf_block_t* block, /*!< in: buffer block containing + the record */ + dict_index_t* index, /*!< in: index of record */ + trx_t* trx, /*!< in/out: transaction */ + lock_prdt_t* prdt, /*!< in: Minimum Bounding Rectangle + the new lock will be on */ + bool caller_owns_trx_mutex) + /*!< in: TRUE if caller owns the + transaction mutex */ +{ + ut_ad(caller_owns_trx_mutex == trx->mutex_is_owner()); + ut_ad(index->is_spatial()); + ut_ad(!dict_index_is_online_ddl(index)); + ut_ad(type_mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE)); + +#ifdef UNIV_DEBUG + switch (type_mode & LOCK_MODE_MASK) { + case LOCK_X: + case LOCK_S: + break; + default: + ut_error; + } +#endif /* UNIV_DEBUG */ + + /* Try to extend a similar non-waiting lock on the same page */ + if (!(type_mode & LOCK_WAIT)) { + const page_id_t id{block->page.id()}; + hash_cell_t& cell = *lock_sys.hash_get(type_mode). + cell_get(id.fold()); + + for (lock_t* lock = lock_sys_t::get_first(cell, id); + lock; lock = lock_rec_get_next_on_page(lock)) { + if (lock->is_waiting() + && lock->type_mode + & (LOCK_PREDICATE | LOCK_PRDT_PAGE) + && lock_rec_get_nth_bit(lock, PRDT_HEAPNO)) { + goto create; + } + } + + if (lock_t* lock = lock_prdt_find_on_page(type_mode, block, + prdt, trx)) { + if (lock->type_mode & LOCK_PREDICATE) { + lock_prdt_enlarge_prdt(lock, prdt); + } + + return lock; + } + } + +create: + /* Note: We will not pass any conflicting lock to lock_rec_create(), + because we should be moving an existing waiting lock request. */ + ut_ad(!(type_mode & LOCK_WAIT) || trx->lock.wait_trx); + + lock_t* lock = lock_rec_create(nullptr, + type_mode, block, PRDT_HEAPNO, index, + trx, caller_owns_trx_mutex); + + if (lock->type_mode & LOCK_PREDICATE) { + lock_prdt_set_prdt(lock, prdt); + } + + return lock; +} + +/*********************************************************************//** +Checks if locks of other transactions prevent an immediate insert of +a predicate record. 
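+Only lock_sys.prdt_hash needs to be checked, because spatial indexes
+use predicate locks instead of next-key (gap) locks.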
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */ +dberr_t +lock_prdt_insert_check_and_lock( +/*============================*/ + const rec_t* rec, /*!< in: record after which to insert */ + buf_block_t* block, /*!< in/out: buffer block of rec */ + dict_index_t* index, /*!< in: index */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr, /*!< in/out: mini-transaction */ + lock_prdt_t* prdt) /*!< in: Predicates with Minimum Bound + Rectangle */ +{ + ut_ad(block->page.frame == page_align(rec)); + ut_ad(!index->table->is_temporary()); + ut_ad(index->is_spatial()); + + trx_t *trx= thr_get_trx(thr); + const page_id_t id{block->page.id()}; + dberr_t err= DB_SUCCESS; + + { + LockGuard g{lock_sys.prdt_hash, id}; + /* Because this code is invoked for a running transaction by + the thread that is serving the transaction, it is not necessary + to hold trx->mutex here. */ + ut_ad(lock_table_has(trx, index->table, LOCK_IX)); + + /* Only need to check locks on prdt_hash */ + if (ut_d(lock_t *lock=) lock_sys_t::get_first(g.cell(), id, PRDT_HEAPNO)) + { + ut_ad(lock->type_mode & LOCK_PREDICATE); + + /* If another transaction has an explicit lock request which locks + the predicate, waiting or granted, on the successor, the insert + has to wait. + + Similar to GAP lock, we do not consider lock from inserts conflicts + with each other */ + + const ulint mode= LOCK_X | LOCK_PREDICATE | LOCK_INSERT_INTENTION; + lock_t *c_lock= lock_prdt_other_has_conflicting(mode, g.cell(), id, + prdt, trx); + + if (c_lock) + { + rtr_mbr_t *mbr= prdt_get_mbr_from_prdt(prdt); + trx->mutex_lock(); + /* Allocate MBR on the lock heap */ + lock_init_prdt_from_mbr(prdt, mbr, 0, trx->lock.lock_heap); + err= lock_rec_enqueue_waiting(c_lock, mode, id, block->page.frame, + PRDT_HEAPNO, index, thr, prdt); + trx->mutex_unlock(); + } + } + } + + if (err == DB_SUCCESS) + /* Update the page max trx id field */ + page_update_max_trx_id(block, buf_block_get_page_zip(block), trx->id, mtr); + + return err; +} + +/**************************************************************//** +Check whether any predicate lock in parent needs to propagate to +child page after split. 
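+A parent lock is copied to a child page when its MBR is not disjoint
+from that child's MBR and no matching lock exists on the child yet.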
*/ +void +lock_prdt_update_parent( +/*====================*/ + buf_block_t* left_block, /*!< in/out: page to be split */ + buf_block_t* right_block, /*!< in/out: the new half page */ + lock_prdt_t* left_prdt, /*!< in: MBR on the old page */ + lock_prdt_t* right_prdt, /*!< in: MBR on the new page */ + const page_id_t page_id) /*!< in: parent page */ +{ + auto fold= page_id.fold(); + LockMutexGuard g{SRW_LOCK_CALL}; + hash_cell_t& cell = *lock_sys.prdt_hash.cell_get(fold); + + /* Get all locks in parent */ + for (lock_t *lock = lock_sys_t::get_first(cell, page_id); + lock; + lock = lock_rec_get_next_on_page(lock)) { + lock_prdt_t* lock_prdt; + ulint op = PAGE_CUR_DISJOINT; + + ut_ad(lock); + + if (!(lock->type_mode & LOCK_PREDICATE) + || (lock->type_mode & LOCK_MODE_MASK) == LOCK_X) { + continue; + } + + lock_prdt = lock_get_prdt_from_lock(lock); + + /* Check each lock in parent to see if it intersects with + left or right child */ + if (!lock_prdt_consistent(lock_prdt, left_prdt, op) + && !lock_prdt_find_on_page(lock->type_mode, left_block, + lock_prdt, lock->trx)) { + lock_prdt_add_to_queue(lock->type_mode, + left_block, lock->index, + lock->trx, lock_prdt, + false); + } + + if (!lock_prdt_consistent(lock_prdt, right_prdt, op) + && !lock_prdt_find_on_page(lock->type_mode, right_block, + lock_prdt, lock->trx)) { + lock_prdt_add_to_queue(lock->type_mode, right_block, + lock->index, lock->trx, + lock_prdt, false); + } + } +} + +/**************************************************************//** +Update predicate lock when page splits */ +static +void +lock_prdt_update_split_low( +/*=======================*/ + buf_block_t* new_block, /*!< in/out: the new half page */ + lock_prdt_t* prdt, /*!< in: MBR on the old page */ + lock_prdt_t* new_prdt, /*!< in: MBR on the new page */ + const page_id_t id, /*!< in: page number */ + unsigned type_mode) /*!< in: LOCK_PREDICATE or + LOCK_PRDT_PAGE */ +{ + hash_cell_t& cell = *lock_sys.hash_get(type_mode).cell_get(id.fold()); + + for (lock_t* lock = lock_sys_t::get_first(cell, id); + lock; + lock = lock_rec_get_next_on_page(lock)) { + /* First dealing with Page Lock */ + if (lock->type_mode & LOCK_PRDT_PAGE) { + /* Duplicate the lock to new page */ + lock_prdt_add_to_queue(lock->type_mode, + new_block, + lock->index, + lock->trx, nullptr, false); + continue; + } + + /* Now dealing with Predicate Lock */ + lock_prdt_t* lock_prdt; + ulint op = PAGE_CUR_DISJOINT; + + ut_ad(lock->type_mode & LOCK_PREDICATE); + + /* No need to duplicate waiting X locks */ + if ((lock->type_mode & LOCK_MODE_MASK) == LOCK_X) { + continue; + } + + lock_prdt = lock_get_prdt_from_lock(lock); + + if (!lock_prdt_consistent(lock_prdt, new_prdt, op)) { + /* Move the lock to new page */ + lock_prdt_add_to_queue(lock->type_mode, new_block, + lock->index, lock->trx, + lock_prdt, false); + } + } +} + +/**************************************************************//** +Update predicate lock when page splits */ +void +lock_prdt_update_split( +/*===================*/ + buf_block_t* new_block, /*!< in/out: the new half page */ + lock_prdt_t* prdt, /*!< in: MBR on the old page */ + lock_prdt_t* new_prdt, /*!< in: MBR on the new page */ + const page_id_t page_id) /*!< in: page number */ +{ + LockMutexGuard g{SRW_LOCK_CALL}; + lock_prdt_update_split_low(new_block, prdt, new_prdt, + page_id, LOCK_PREDICATE); + + lock_prdt_update_split_low(new_block, NULL, NULL, + page_id, LOCK_PRDT_PAGE); +} + +/*********************************************************************//** +Initiate a Predicate Lock 
from a MBR */
+void
+lock_init_prdt_from_mbr(
+/*====================*/
+	lock_prdt_t*	prdt,	/*!< in/out: predicate to be initialized */
+	rtr_mbr_t*	mbr,	/*!< in: Minimum Bounding Rectangle */
+	ulint		mode,	/*!< in: Search mode */
+	mem_heap_t*	heap)	/*!< in: heap for allocating memory */
+{
+	memset(prdt, 0, sizeof(*prdt));
+
+	if (heap != NULL) {
+		prdt->data = mem_heap_dup(heap, mbr, sizeof *mbr);
+	} else {
+		prdt->data = static_cast<void*>(mbr);
+	}
+
+	prdt->op = static_cast<uint16>(mode);
+}
+
+/*********************************************************************//**
+Acquire a predicate lock on a block
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_prdt_lock(
+/*===========*/
+	buf_block_t*	block,	/*!< in/out: buffer block of rec */
+	lock_prdt_t*	prdt,	/*!< in: Predicate for the lock */
+	dict_index_t*	index,	/*!< in: secondary index */
+	lock_mode	mode,	/*!< in: mode of the lock which
+				the read cursor should set on
+				records: LOCK_S or LOCK_X; the
+				latter is possible in
+				SELECT FOR UPDATE */
+	unsigned	type_mode,
+				/*!< in: LOCK_PREDICATE or LOCK_PRDT_PAGE */
+	que_thr_t*	thr)	/*!< in: query thread
+				(can be NULL if BTR_NO_LOCKING_FLAG) */
+{
+	trx_t*		trx = thr_get_trx(thr);
+	dberr_t		err = DB_SUCCESS;
+	lock_rec_req_status	status = LOCK_REC_SUCCESS;
+
+	if (trx->read_only || index->table->is_temporary()) {
+		return(DB_SUCCESS);
+	}
+
+	ut_ad(!dict_index_is_clust(index));
+	ut_ad(!dict_index_is_online_ddl(index));
+	ut_ad(type_mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE));
+
+	auto& hash = lock_sys.prdt_hash_get(type_mode != LOCK_PREDICATE);
+	const page_id_t id{block->page.id()};
+
+	/* Another transaction cannot have an implicit lock on the record,
+	because when we come here, we already have modified the clustered
+	index record, and this would not have been possible if another active
+	transaction had modified this secondary index record.
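+	Thus, only the explicit predicate locks in lock_sys need to be
+	checked here.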
*/
+
+	LockGuard g{hash, id};
+
+	const unsigned	prdt_mode = type_mode | mode;
+	lock_t*		lock = lock_sys_t::get_first(g.cell(), id);
+
+	if (lock == NULL) {
+		lock = lock_rec_create(
+			NULL,
+			prdt_mode, block, PRDT_HEAPNO,
+			index, trx, FALSE);
+
+		status = LOCK_REC_SUCCESS_CREATED;
+	} else {
+		if (lock_rec_get_next_on_page(lock)
+		    || lock->trx != trx
+		    || lock->type_mode != prdt_mode
+		    || lock_rec_get_n_bits(lock) == 0
+		    || ((type_mode & LOCK_PREDICATE)
+			&& (!lock_prdt_consistent(
+				lock_get_prdt_from_lock(lock), prdt, 0)))) {
+			trx->mutex_lock();
+
+			lock = lock_prdt_has_lock(
+				mode, g.cell(), id, prdt, trx);
+
+			if (lock) {
+			} else if (lock_t* wait_for
+				   = lock_prdt_other_has_conflicting(
+					   prdt_mode, g.cell(), id, prdt,
+					   trx)) {
+				err = lock_rec_enqueue_waiting(
+					wait_for, prdt_mode, id,
+					block->page.frame, PRDT_HEAPNO,
+					index, thr, prdt);
+			} else {
+				lock_prdt_add_to_queue(
+					prdt_mode, block, index, trx,
+					prdt, true);
+			}
+
+			trx->mutex_unlock();
+		} else {
+			if (!lock_rec_get_nth_bit(lock, PRDT_HEAPNO)) {
+				lock_rec_set_nth_bit(lock, PRDT_HEAPNO);
+				status = LOCK_REC_SUCCESS_CREATED;
+			}
+		}
+	}
+
+	if (status == LOCK_REC_SUCCESS_CREATED && type_mode == LOCK_PREDICATE) {
+		/* Append the predicate in the lock record */
+		lock_prdt_set_prdt(lock, prdt);
+	}
+
+	return(err);
+}
+
+/*********************************************************************//**
+Acquire a "Page" lock on a block
+@return DB_SUCCESS, DB_LOCK_WAIT, or DB_DEADLOCK */
+dberr_t
+lock_place_prdt_page_lock(
+	const page_id_t	page_id,	/*!< in: page identifier */
+	dict_index_t*	index,		/*!< in: secondary index */
+	que_thr_t*	thr)		/*!< in: query thread */
+{
+	ut_ad(thr != NULL);
+	ut_ad(!high_level_read_only);
+
+	ut_ad(index->is_spatial());
+	ut_ad(!dict_index_is_online_ddl(index));
+	if (index->table->is_temporary()) {
+		return DB_SUCCESS;
+	}
+
+	/* Another transaction cannot have an implicit lock on the record,
+	because when we come here, we already have modified the clustered
+	index record, and this would not have been possible if another active
+	transaction had modified this secondary index record. */
+
+	LockGuard g{lock_sys.prdt_page_hash, page_id};
+
+	const lock_t*	lock = lock_sys_t::get_first(g.cell(), page_id);
+	const ulint	mode = LOCK_S | LOCK_PRDT_PAGE;
+	trx_t*		trx = thr_get_trx(thr);
+
+	if (lock != NULL) {
+		/* Find a matching record lock owned by this transaction. */
+
+		while (lock != NULL && lock->trx != trx) {
+			lock = lock_rec_get_next_on_page_const(lock);
+		}
+
+		ut_ad(lock == NULL || lock->type_mode == mode);
+		ut_ad(lock == NULL || lock_rec_get_n_bits(lock) != 0);
+	}
+
+	if (lock == NULL) {
+		lock = lock_rec_create_low(
+			NULL,
+			mode, page_id, NULL, PRDT_HEAPNO,
+			index, trx, FALSE);
+
+#ifdef PRDT_DIAG
+		printf("GIS_DIAGNOSTIC: page lock %d\n",
+		       (int) page_id.page_no());
+#endif /* PRDT_DIAG */
+	}
+
+	return(DB_SUCCESS);
+}
+
+/** Check whether there are R-tree page locks on a page
+@param[in]	trx	trx to test the lock
+@param[in]	page_id	page identifier
+@return true if there is none */
+bool lock_test_prdt_page_lock(const trx_t *trx, const page_id_t page_id)
+{
+  LockGuard g{lock_sys.prdt_page_hash, page_id};
+  lock_t *lock= lock_sys_t::get_first(g.cell(), page_id);
+  return !lock || trx == lock->trx;
+}
+
+/*************************************************************//**
+Moves the locks of a page to another page and resets the lock bits of
+the donating records.
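+The locks are re-enqueued on the receiving page with their original
+mode preserved.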
*/
+void
+lock_prdt_rec_move(
+/*===============*/
+	const buf_block_t*	receiver,	/*!< in: buffer block containing
+						the receiving record */
+	const page_id_t		donator)	/*!< in: page from which the
+						locks are moved */
+{
+	LockMultiGuard	g{lock_sys.prdt_hash, receiver->page.id(), donator};
+
+	for (lock_t *lock = lock_sys_t::get_first(g.cell2(), donator,
+						  PRDT_HEAPNO);
+	     lock;
+	     lock = lock_rec_get_next(PRDT_HEAPNO, lock)) {
+
+		const auto type_mode = lock->type_mode;
+		lock_prdt_t*	lock_prdt = lock_get_prdt_from_lock(lock);
+
+		lock_rec_reset_nth_bit(lock, PRDT_HEAPNO);
+		if (type_mode & LOCK_WAIT) {
+			ut_ad(lock->trx->lock.wait_lock == lock);
+			lock->type_mode &= ~LOCK_WAIT;
+		}
+		lock_prdt_add_to_queue(
+			type_mode, receiver, lock->index, lock->trx,
+			lock_prdt, false);
+	}
+}
+
+/** Remove locks on a discarded SPATIAL INDEX page.
+@param id  page to be discarded
+@param all whether to discard also from lock_sys.prdt_hash */
+void lock_sys_t::prdt_page_free_from_discard(const page_id_t id, bool all)
+{
+  const auto id_fold= id.fold();
+  rd_lock(SRW_LOCK_CALL);
+  auto cell= prdt_page_hash.cell_get(id_fold);
+  auto latch= hash_table::latch(cell);
+  latch->acquire();
+
+  for (lock_t *lock= get_first(*cell, id), *next; lock; lock= next)
+  {
+    next= lock_rec_get_next_on_page(lock);
+    lock_rec_discard(prdt_page_hash, lock);
+  }
+
+  if (all)
+  {
+    latch->release();
+    cell= prdt_hash.cell_get(id_fold);
+    latch= hash_table::latch(cell);
+    latch->acquire();
+    for (lock_t *lock= get_first(*cell, id), *next; lock; lock= next)
+    {
+      next= lock_rec_get_next_on_page(lock);
+      lock_rec_discard(prdt_hash, lock);
+    }
+  }
+
+  latch->release();
+  cell= rec_hash.cell_get(id_fold);
+  latch= hash_table::latch(cell);
+  latch->acquire();
+
+  for (lock_t *lock= get_first(*cell, id), *next; lock; lock= next)
+  {
+    next= lock_rec_get_next_on_page(lock);
+    lock_rec_discard(rec_hash, lock);
+  }
+
+  latch->release();
+  /* Must be last, to avoid a race with lock_sys_t::hash_table::resize() */
+  rd_unlock();
+}
diff --git a/storage/innobase/log/log0crypt.cc b/storage/innobase/log/log0crypt.cc
new file mode 100644
index 00000000..8a771410
--- /dev/null
+++ b/storage/innobase/log/log0crypt.cc
@@ -0,0 +1,641 @@
+/*****************************************************************************
+
+Copyright (C) 2013, 2015, Google Inc. All Rights Reserved.
+Copyright (C) 2014, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+/**************************************************//**
+@file log0crypt.cc
+Innodb log encrypt/decrypt
+
+Created 11/25/2013 Minli Zhu Google
+Modified Jan Lindström jan.lindstrom@mariadb.com
+MDEV-11782: Rewritten for MariaDB 10.2 by Marko Mäkelä, MariaDB Corporation.
+*******************************************************/
+#include <my_global.h>
+#include "log0crypt.h"
+#include <mysql/service_my_crypt.h>
+#include "assume_aligned.h"
+
+#include "log0crypt.h"
+#include "log0recv.h"	// for recv_sys
+#include "mach0data.h"
+
+/** Redo log encryption key ID */
+#define LOG_DEFAULT_ENCRYPTION_KEY 1
+
+struct crypt_info_t {
+	uint32_t	checkpoint_no; /*!< checkpoint no; 32 bits */
+	uint32_t	key_version;   /*!< key version */
+	/** random string for encrypting the key */
+	alignas(8) byte	crypt_msg[MY_AES_BLOCK_SIZE];
+	/** the secret key */
+	alignas(8) byte	crypt_key[MY_AES_BLOCK_SIZE];
+	/** a random string for the per-block initialization vector */
+	alignas(4) byte	crypt_nonce[4];
+};
+
+/** The crypt info */
+static crypt_info_t info;
+
+/** Initialization vector used for temporary files/tablespace */
+static byte tmp_iv[MY_AES_BLOCK_SIZE];
+
+/** Crypt info when upgrading from 10.1 */
+static crypt_info_t infos[5 * 2];
+/** First unused slot in infos[] */
+static size_t infos_used;
+
+/* Offsets of a log block header */
+#define LOG_BLOCK_HDR_NO	0	/* block number which must be > 0 and
+					is allowed to wrap around at 2G; the
+					highest bit is set to 1 if this is the
+					first log block in a log flush write
+					segment */
+#define LOG_BLOCK_FLUSH_BIT_MASK 0x80000000UL
+					/* mask used to get the highest bit in
+					the preceding field */
+#define LOG_BLOCK_HDR_DATA_LEN	4	/* number of bytes of log written to
+					this block */
+#define LOG_BLOCK_FIRST_REC_GROUP 6	/* offset of the first start of an
+					mtr log record group in this log block,
+					0 if none; if the value is the same
+					as LOG_BLOCK_HDR_DATA_LEN, it means
+					that the first rec group has not yet
+					been catenated to this log block, but
+					if it will, it will start at this
+					offset; an archive recovery can
+					start parsing the log records starting
+					from this offset in this log block,
+					if value not 0 */
+#define LOG_BLOCK_HDR_SIZE	12	/* size of the log block header in
+					bytes */
+
+#define LOG_BLOCK_KEY		4	/* encryption key version
+					before LOG_BLOCK_CHECKSUM;
+					after log_t::FORMAT_ENC_10_4 only */
+#define LOG_BLOCK_CHECKSUM	4	/* 4 byte checksum of the log block
+					contents; in InnoDB versions
+					< 3.23.52 this did not contain the
+					checksum but the same value as
+					LOG_BLOCK_HDR_NO */
+
+/*********************************************************************//**
+Get a log block's start lsn.
+@return a log block's start lsn */
+static inline
+lsn_t
+log_block_get_start_lsn(
+/*====================*/
+	lsn_t		lsn,		/*!< in: checkpoint lsn */
+	ulint		log_block_no)	/*!< in: log block number */
+{
+	lsn_t	start_lsn =
+		(lsn & (lsn_t)0xffffffff00000000ULL) |
+		(((log_block_no - 1) & (lsn_t)0x3fffffff) << 9);
+	return start_lsn;
+}
+
+/** Generate crypt key from crypt msg.
+@param[in,out]	info	encryption key
+@param[in]	upgrade	whether to use the key in MariaDB 10.1 format
+@return whether the operation was successful */
+static bool init_crypt_key(crypt_info_t* info, bool upgrade = false)
+{
+	byte	mysqld_key[MY_AES_MAX_KEY_LENGTH];
+	uint	keylen = sizeof mysqld_key;
+
+	compile_time_assert(16 == sizeof info->crypt_key);
+	compile_time_assert(16 == MY_AES_BLOCK_SIZE);
+
+	if (uint rc = encryption_key_get(LOG_DEFAULT_ENCRYPTION_KEY,
+					 info->key_version, mysqld_key,
+					 &keylen)) {
+		ib::error()
+			<< "Obtaining redo log encryption key version "
+			<< info->key_version << " failed (" << rc
+			<< ").
Maybe the key or the required encryption "
+			"key management plugin was not found.";
+		info->key_version = ENCRYPTION_KEY_VERSION_INVALID;
+		return false;
+	}
+
+	if (upgrade) {
+		while (keylen < sizeof mysqld_key) {
+			mysqld_key[keylen++] = 0;
+		}
+	}
+
+	uint dst_len;
+	int err= my_aes_crypt(MY_AES_ECB,
+			      ENCRYPTION_FLAG_NOPAD | ENCRYPTION_FLAG_ENCRYPT,
+			      info->crypt_msg, MY_AES_BLOCK_SIZE,
+			      info->crypt_key, &dst_len,
+			      mysqld_key, keylen, NULL, 0);
+
+	if (err != MY_AES_OK || dst_len != MY_AES_BLOCK_SIZE) {
+		ib::error() << "Getting redo log crypto key failed: err = "
+			    << err << ", len = " << dst_len;
+		info->key_version = ENCRYPTION_KEY_VERSION_INVALID;
+		return false;
+	}
+
+	return true;
+}
+
+static ulint log_block_get_hdr_no(const byte *log_block)
+{
+  static_assert(LOG_BLOCK_HDR_NO == 0, "compatibility");
+  return mach_read_from_4(my_assume_aligned<4>(log_block)) &
+    ~LOG_BLOCK_FLUSH_BIT_MASK;
+}
+
+/** Decrypt log blocks.
+@param[in,out]	buf	log blocks to decrypt
+@param[in]	lsn	log sequence number of the start of the buffer
+@param[in]	size	size of the buffer, in bytes
+@return whether the operation succeeded */
+ATTRIBUTE_COLD bool log_decrypt(byte* buf, lsn_t lsn, ulint size)
+{
+	ut_ad(!(size & 511));
+	ut_ad(!(ulint(buf) & 511));
+	ut_a(info.key_version);
+
+	alignas(8) byte aes_ctr_iv[MY_AES_BLOCK_SIZE];
+
+#define LOG_CRYPT_HDR_SIZE 4
+	lsn &= ~lsn_t{511};
+
+	const bool has_encryption_key_rotation
+		= log_sys.format == log_t::FORMAT_ENC_10_4
+		|| log_sys.format == log_t::FORMAT_ENC_10_5;
+
+	for (const byte* const end = buf + size; buf != end;
+	     buf += 512, lsn += 512) {
+		alignas(4) byte dst[512 - LOG_CRYPT_HDR_SIZE
+				    - LOG_BLOCK_CHECKSUM];
+
+		/* The log block number is not encrypted. */
+		memcpy_aligned<4>(dst, buf + LOG_BLOCK_HDR_NO, 4);
+		memcpy_aligned<4>(aes_ctr_iv, buf + LOG_BLOCK_HDR_NO, 4);
+		*aes_ctr_iv &= byte(~(LOG_BLOCK_FLUSH_BIT_MASK >> 24));
+		static_assert(LOG_BLOCK_HDR_NO + 4 == LOG_CRYPT_HDR_SIZE,
+			      "compatibility");
+		memcpy_aligned<4>(aes_ctr_iv + 4, info.crypt_nonce, 4);
+		mach_write_to_8(my_assume_aligned<8>(aes_ctr_iv + 8), lsn);
+		ut_ad(log_block_get_start_lsn(lsn,
+					      log_block_get_hdr_no(buf))
+		      == lsn);
+		byte* key_ver = &buf[512 - LOG_BLOCK_KEY - LOG_BLOCK_CHECKSUM];
+
+		const size_t dst_size = has_encryption_key_rotation
+			? sizeof dst - LOG_BLOCK_KEY
+			: sizeof dst;
+		if (has_encryption_key_rotation) {
+			const auto key_version = info.key_version;
+			info.key_version = mach_read_from_4(key_ver);
+			if (key_version == info.key_version) {
+			} else if (!init_crypt_key(&info)) {
+				return false;
+#ifndef DBUG_OFF
+			} else {
+				DBUG_PRINT("ib_log", ("key_version: %x -> %x",
+						      key_version,
+						      info.key_version));
+#endif /* !DBUG_OFF */
+			}
+		}
+
+		ut_ad(LOG_CRYPT_HDR_SIZE + dst_size
+		      == 512 - LOG_BLOCK_CHECKSUM - LOG_BLOCK_KEY);
+
+		uint dst_len;
+		int rc = encryption_crypt(
+			buf + LOG_CRYPT_HDR_SIZE, static_cast<uint>(dst_size),
+			reinterpret_cast<byte*>(dst), &dst_len,
+			const_cast<byte*>(info.crypt_key),
+			MY_AES_BLOCK_SIZE,
+			aes_ctr_iv, sizeof aes_ctr_iv,
+			ENCRYPTION_FLAG_DECRYPT | ENCRYPTION_FLAG_NOPAD,
+			LOG_DEFAULT_ENCRYPTION_KEY,
+			info.key_version);
+		ut_a(rc == MY_AES_OK);
+		ut_a(dst_len == dst_size);
+		memcpy(buf + LOG_CRYPT_HDR_SIZE, dst, dst_size);
+	}
+
+	return true;
+}
+
+/** Initialize the redo log encryption key and random parameters
+when creating a new redo log.
+The random parameters will be persisted in the log checkpoint pages.
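+On failure, info.key_version will be 0 and redo log encryption will be
+unavailable.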
+@see log_crypt_write_header() +@see log_crypt_read_header() +@return whether the operation succeeded */ +bool log_crypt_init() +{ + info.key_version= + encryption_key_get_latest_version(LOG_DEFAULT_ENCRYPTION_KEY); + + if (info.key_version == ENCRYPTION_KEY_VERSION_INVALID) + ib::error() << "log_crypt_init(): cannot get key version"; + else if (my_random_bytes(tmp_iv, MY_AES_BLOCK_SIZE) != MY_AES_OK || + my_random_bytes(info.crypt_msg, sizeof info.crypt_msg) != + MY_AES_OK || + my_random_bytes(info.crypt_nonce, sizeof info.crypt_nonce) != + MY_AES_OK) + ib::error() << "log_crypt_init(): my_random_bytes() failed"; + else if (init_crypt_key(&info)) + goto func_exit; + + info.key_version= 0; +func_exit: + return info.key_version != 0; +} + +/** Read the MariaDB 10.1 checkpoint crypto (version, msg and iv) info. +@param[in] buf checkpoint buffer +@return whether the operation was successful */ +ATTRIBUTE_COLD bool log_crypt_101_read_checkpoint(const byte* buf) +{ + buf += 20 + 32 * 9; + + const size_t n = *buf++ == 2 ? std::min(unsigned(*buf++), 5U) : 0; + + for (size_t i = 0; i < n; i++) { + struct crypt_info_t& info = infos[infos_used]; + unsigned checkpoint_no = mach_read_from_4(buf); + for (size_t j = 0; j < infos_used; j++) { + if (infos[j].checkpoint_no == checkpoint_no) { + /* Do not overwrite an existing slot. */ + goto next_slot; + } + } + if (infos_used >= UT_ARR_SIZE(infos)) { + ut_ad("too many checkpoint pages" == 0); + goto next_slot; + } + infos_used++; + info.checkpoint_no = checkpoint_no; + info.key_version = mach_read_from_4(buf + 4); + memcpy(info.crypt_msg, buf + 8, MY_AES_BLOCK_SIZE); + memcpy(info.crypt_nonce, buf + 24, sizeof info.crypt_nonce); + + if (!init_crypt_key(&info, true)) { + return false; + } +next_slot: + buf += 4 + 4 + 2 * MY_AES_BLOCK_SIZE; + } + + return true; +} + +/** Decrypt a MariaDB 10.1 redo log block. +@param[in,out] buf log block +@param[in] start_lsn server start LSN +@return whether the decryption was successful */ +ATTRIBUTE_COLD bool log_crypt_101_read_block(byte* buf, lsn_t start_lsn) +{ + const uint32_t checkpoint_no = mach_read_from_4(buf + 8); + const crypt_info_t* info = infos; + for (const crypt_info_t* const end = info + infos_used; info < end; + info++) { + if (info->key_version + && info->key_version != ENCRYPTION_KEY_VERSION_INVALID + && info->checkpoint_no == checkpoint_no) { + goto found; + } + } + + if (infos_used == 0) { + return false; + } + /* MariaDB Server 10.1 would use the first key if it fails to + find a key for the current checkpoint. */ + info = infos; + if (info->key_version == ENCRYPTION_KEY_VERSION_INVALID) { + return false; + } +found: + byte dst[512]; + uint dst_len; + byte aes_ctr_iv[MY_AES_BLOCK_SIZE]; + + const uint src_len = 512 - LOG_BLOCK_HDR_SIZE; + + ulint log_block_no = log_block_get_hdr_no(buf); + + /* The log block header is not encrypted. 
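+	(The whole 512-byte block is first copied to dst, so that the
+	unencrypted header will be preserved when dst is copied back.)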
*/
+	memcpy(dst, buf, 512);
+
+	memcpy(aes_ctr_iv, info->crypt_nonce, 3);
+	mach_write_to_8(aes_ctr_iv + 3,
+			log_block_get_start_lsn(start_lsn, log_block_no));
+	memcpy(aes_ctr_iv + 11, buf, 4);
+	aes_ctr_iv[11] &= byte(~(LOG_BLOCK_FLUSH_BIT_MASK >> 24));
+	aes_ctr_iv[15] = 0;
+
+	int rc = encryption_crypt(buf + LOG_BLOCK_HDR_SIZE, src_len,
+				  dst + LOG_BLOCK_HDR_SIZE, &dst_len,
+				  const_cast<byte*>(info->crypt_key),
+				  MY_AES_BLOCK_SIZE,
+				  aes_ctr_iv, MY_AES_BLOCK_SIZE,
+				  ENCRYPTION_FLAG_DECRYPT
+				  | ENCRYPTION_FLAG_NOPAD,
+				  LOG_DEFAULT_ENCRYPTION_KEY,
+				  info->key_version);
+
+	if (rc != MY_AES_OK || dst_len != src_len) {
+		return false;
+	}
+
+	memcpy(buf, dst, sizeof dst);
+	return true;
+}
+
+/** MariaDB 10.2.5 encrypted redo log encryption key version (32 bits)*/
+constexpr size_t LOG_CHECKPOINT_CRYPT_KEY= 32;
+/** MariaDB 10.2.5 encrypted redo log random nonce (32 bits) */
+constexpr size_t LOG_CHECKPOINT_CRYPT_NONCE= 36;
+/** MariaDB 10.2.5 encrypted redo log random message (MY_AES_BLOCK_SIZE) */
+constexpr size_t LOG_CHECKPOINT_CRYPT_MESSAGE= 40;
+
+/** Add the encryption information to the log header buffer.
+@param buf part of log header buffer */
+void log_crypt_write_header(byte *buf)
+{
+  ut_ad(info.key_version);
+  mach_write_to_4(my_assume_aligned<4>(buf), LOG_DEFAULT_ENCRYPTION_KEY);
+  mach_write_to_4(my_assume_aligned<4>(buf + 4), info.key_version);
+  memcpy_aligned<8>(buf + 8, info.crypt_msg, MY_AES_BLOCK_SIZE);
+  static_assert(MY_AES_BLOCK_SIZE == 16, "compatibility");
+  memcpy_aligned<4>(buf + 24, info.crypt_nonce, sizeof info.crypt_nonce);
+}
+
+/** Read the encryption information from a log header buffer.
+@param buf part of log header buffer
+@return whether the operation was successful */
+bool log_crypt_read_header(const byte *buf)
+{
+  MEM_UNDEFINED(&info.checkpoint_no, sizeof info.checkpoint_no);
+  MEM_NOACCESS(&info.checkpoint_no, sizeof info.checkpoint_no);
+  if (mach_read_from_4(my_assume_aligned<4>(buf)) !=
+      LOG_DEFAULT_ENCRYPTION_KEY)
+    return false;
+  info.key_version= mach_read_from_4(my_assume_aligned<4>(buf + 4));
+  memcpy_aligned<8>(info.crypt_msg, buf + 8, MY_AES_BLOCK_SIZE);
+  memcpy_aligned<4>(info.crypt_nonce, buf + 24, sizeof info.crypt_nonce);
+  return init_crypt_key(&info);
+}
+
+/** Read the checkpoint crypto (version, msg and iv) info.
+@param[in]	buf	checkpoint buffer
+@return whether the operation was successful */
+ATTRIBUTE_COLD bool log_crypt_read_checkpoint_buf(const byte* buf)
+{
+	info.checkpoint_no = mach_read_from_4(buf + 4);
+	info.key_version = mach_read_from_4(buf + LOG_CHECKPOINT_CRYPT_KEY);
+
+#if MY_AES_BLOCK_SIZE != 16
+# error "MY_AES_BLOCK_SIZE != 16; redo log checkpoint format affected"
+#endif
+	compile_time_assert(16 == sizeof info.crypt_msg);
+	compile_time_assert(16 == MY_AES_BLOCK_SIZE);
+	compile_time_assert(LOG_CHECKPOINT_CRYPT_MESSAGE
+			    - LOG_CHECKPOINT_CRYPT_NONCE
+			    == sizeof info.crypt_nonce);
+
+	memcpy(info.crypt_msg, buf + LOG_CHECKPOINT_CRYPT_MESSAGE,
+	       MY_AES_BLOCK_SIZE);
+	memcpy(info.crypt_nonce, buf + LOG_CHECKPOINT_CRYPT_NONCE,
+	       sizeof info.crypt_nonce);
+
+	return init_crypt_key(&info);
+}
+
+/** Encrypt or decrypt a temporary file block.
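+The initialization vector consists of the block offset followed by the
+random tmp_iv that was generated by log_crypt_init().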
+@param[in]	src		block to encrypt or decrypt
+@param[in]	size		size of the block
+@param[out]	dst		destination block
+@param[in]	offs		offset to block
+@param[in]	encrypt		true=encrypt; false=decrypt
+@return whether the operation succeeded */
+bool log_tmp_block_encrypt(
+	const byte*	src,
+	ulint		size,
+	byte*		dst,
+	uint64_t	offs,
+	bool		encrypt)
+{
+	uint dst_len;
+	uint64_t iv[MY_AES_BLOCK_SIZE / sizeof(uint64_t)];
+	iv[0] = offs;
+	memcpy(iv + 1, tmp_iv, sizeof iv - sizeof *iv);
+
+	int rc = encryption_crypt(
+		src, uint(size), dst, &dst_len,
+		const_cast<byte*>(info.crypt_key), MY_AES_BLOCK_SIZE,
+		reinterpret_cast<byte*>(iv), uint(sizeof iv),
+		encrypt
+		? ENCRYPTION_FLAG_ENCRYPT|ENCRYPTION_FLAG_NOPAD
+		: ENCRYPTION_FLAG_DECRYPT|ENCRYPTION_FLAG_NOPAD,
+		LOG_DEFAULT_ENCRYPTION_KEY, info.key_version);
+
+	if (rc != MY_AES_OK) {
+		ib::error() << (encrypt ? "Encryption" : "Decryption")
+			    << " failed for temporary file: " << rc;
+	}
+
+	return rc == MY_AES_OK;
+}
+
+/** Decrypt part of a log record.
+@param iv   initialization vector
+@param buf  buffer for the decrypted data
+@param data the encrypted data
+@param len  length of the data, in bytes
+@return buf */
+byte *log_decrypt_buf(const byte *iv, byte *buf, const byte *data, uint len)
+{
+  ut_a(MY_AES_OK == encryption_crypt(data, len, buf, &len,
+                                     info.crypt_key, MY_AES_BLOCK_SIZE,
+                                     iv, MY_AES_BLOCK_SIZE,
+                                     ENCRYPTION_FLAG_DECRYPT |
+                                     ENCRYPTION_FLAG_NOPAD,
+                                     LOG_DEFAULT_ENCRYPTION_KEY,
+                                     info.key_version));
+  return buf;
+}
+
+#include "mtr0log.h"
+
+/** Encrypt a log snippet
+@param iv   initialization vector
+@param tmp  temporary buffer
+@param buf  buffer to be replaced with encrypted contents
+@param end  pointer past the end of buf
+@return encrypted data bytes that follow */
+static size_t log_encrypt_buf(byte iv[MY_AES_BLOCK_SIZE],
+                              byte *&tmp, byte *buf, const byte *const end)
+{
+  for (byte *l= buf; l != end; )
+  {
+    const byte b= *l++;
+    size_t rlen= b & 0xf;
+    if (!rlen)
+    {
+      const size_t lenlen= mlog_decode_varint_length(*l);
+      const uint32_t addlen= mlog_decode_varint(l);
+      ut_ad(addlen != MLOG_DECODE_ERROR);
+      rlen= addlen + 15 - lenlen;
+      l+= lenlen;
+    }
+
+    if (b < 0x80)
+    {
+      /* Add the page identifier to the initialization vector. */
+      size_t idlen= mlog_decode_varint_length(*l);
+      ut_ad(idlen <= 5);
+      ut_ad(idlen < rlen);
+      mach_write_to_4(my_assume_aligned<4>(iv + 8), mlog_decode_varint(l));
+      l+= idlen;
+      rlen-= idlen;
+      idlen= mlog_decode_varint_length(*l);
+      ut_ad(idlen <= 5);
+      ut_ad(idlen <= rlen);
+      mach_write_to_4(my_assume_aligned<4>(iv + 12), mlog_decode_varint(l));
+      l+= idlen;
+      rlen-= idlen;
+    }
+
+    uint len;
+
+    if (l + rlen > end)
+    {
+      if (size_t len= end - l)
+      {
+        /* Only WRITE or EXTENDED records may comprise multiple segments. */
+        static_assert((EXTENDED | 0x10) == WRITE, "compatibility");
+        ut_ad((b & 0x60) == EXTENDED);
+        ut_ad(l < end);
+        memcpy(tmp, l, len);
+        tmp+= len;
+        rlen-= len;
+      }
+      return rlen;
+    }
+
+    if (!rlen)
+      continue; /* FREE_PAGE and INIT_PAGE have no payload.
*/
+
+    len= static_cast<uint>(rlen);
+    ut_a(MY_AES_OK == encryption_crypt(l, len, tmp, &len,
+                                       info.crypt_key, MY_AES_BLOCK_SIZE,
+                                       iv, MY_AES_BLOCK_SIZE,
+                                       ENCRYPTION_FLAG_ENCRYPT |
+                                       ENCRYPTION_FLAG_NOPAD,
+                                       LOG_DEFAULT_ENCRYPTION_KEY,
+                                       info.key_version));
+    ut_ad(len == rlen);
+    memcpy(l, tmp, rlen);
+    l+= rlen;
+  }
+
+  return 0;
+}
+
+/** Encrypt the log */
+ATTRIBUTE_NOINLINE void mtr_t::encrypt()
+{
+  ut_ad(log_sys.format == log_t::FORMAT_ENC_10_8);
+  ut_ad(m_log.size());
+
+  alignas(8) byte iv[MY_AES_BLOCK_SIZE];
+
+  m_commit_lsn= log_sys.get_lsn();
+  ut_ad(m_commit_lsn);
+  byte *tmp= static_cast<byte*>(alloca(srv_page_size)), *t= tmp;
+  byte *dst= static_cast<byte*>(alloca(srv_page_size));
+  mach_write_to_8(iv, m_commit_lsn);
+  mtr_buf_t::block_t *start= nullptr;
+  size_t size= 0, start_size= 0;
+  m_crc= 0;
+
+  m_log.for_each_block([&](mtr_buf_t::block_t *b)
+  {
+    ut_ad(t - tmp + size <= srv_page_size);
+    byte *buf= b->begin();
+    if (!start)
+    {
+    parse:
+      ut_ad(t == tmp);
+      size= log_encrypt_buf(iv, t, buf, b->end());
+      if (!size)
+      {
+        ut_ad(t == tmp);
+        start_size= 0;
+      }
+      else
+      {
+        start= b;
+        start_size= t - tmp;
+      }
+      m_crc= my_crc32c(m_crc, buf, b->end() - buf - start_size);
+    }
+    else if (size > b->used())
+    {
+      ::memcpy(t, buf, b->used());
+      t+= b->used();
+      size-= b->used();
+    }
+    else
+    {
+      ::memcpy(t, buf, size);
+      t+= size;
+      buf+= size;
+      uint len= static_cast<uint>(t - tmp);
+      ut_a(MY_AES_OK == encryption_crypt(tmp, len, dst, &len,
+                                         info.crypt_key, MY_AES_BLOCK_SIZE,
+                                         iv, MY_AES_BLOCK_SIZE,
+                                         ENCRYPTION_FLAG_ENCRYPT |
+                                         ENCRYPTION_FLAG_NOPAD,
+                                         LOG_DEFAULT_ENCRYPTION_KEY,
+                                         info.key_version));
+      ut_ad(tmp + len == t);
+      m_crc= my_crc32c(m_crc, dst, len);
+      /* Copy the encrypted data back to the log snippets. */
+      ::memcpy(start->end() - start_size, dst, start_size);
+      t= dst + start_size;
+      for (ilist<mtr_buf_t::block_t>::iterator i(start); &*++i != b;)
+      {
+        const size_t l{i->used()};
+        ::memcpy(i->begin(), t, l);
+        t+= l;
+      }
+      ::memcpy(b->begin(), t, size);
+      ut_ad(t + size == dst + len);
+      t= tmp;
+      start= nullptr;
+      goto parse;
+    }
+    return true;
+  });
+
+  ut_ad(t == tmp);
+  ut_ad(!start);
+  ut_ad(!size);
+}
diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc
new file mode 100644
index 00000000..91999c81
--- /dev/null
+++ b/storage/innobase/log/log0log.cc
@@ -0,0 +1,1358 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2014, 2022, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file log/log0log.cc
+Database log
+
+Created 12/9/1995 Heikki Tuuri
+*******************************************************/
+
+#include "univ.i"
+#include <debug_sync.h>
+#include <my_service_manager.h>
+
+#include "log0log.h"
+#include "log0crypt.h"
+#include "buf0buf.h"
+#include "buf0flu.h"
+#include "lock0lock.h"
+#include "log0recv.h"
+#include "fil0fil.h"
+#include "dict0stats_bg.h"
+#include "btr0defragment.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "trx0sys.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "srv0mon.h"
+#include "buf0dump.h"
+#include "log0sync.h"
+#include "log.h"
+
+/*
+General philosophy of InnoDB redo-logs:
+
+Every change to the contents of a data page must be done
+through mtr_t, and mtr_t::commit() will write log records
+to the InnoDB redo log. */
+
+alignas(CPU_LEVEL1_DCACHE_LINESIZE)
+static group_commit_lock flush_lock;
+alignas(CPU_LEVEL1_DCACHE_LINESIZE)
+static group_commit_lock write_lock;
+
+/** Redo log system */
+log_t	log_sys;
+
+/* Margins for free space in the log buffer after a log entry is catenated */
+#define LOG_BUF_FLUSH_RATIO	2
+#define LOG_BUF_FLUSH_MARGIN	((4 * 4096) /* cf. log_t::append_prepare() */ \
+				 + (4U << srv_page_size_shift))
+
+void log_t::set_capacity()
+{
+#ifndef SUX_LOCK_GENERIC
+	ut_ad(log_sys.latch.is_write_locked());
+#endif
+	/* Margin for the free space in the smallest log, before a new query
+	step which modifies the database, is started */
+
+	lsn_t smallest_capacity = srv_log_file_size - log_t::START_OFFSET;
+	/* Add extra safety */
+	smallest_capacity -= smallest_capacity / 10;
+
+	lsn_t margin = smallest_capacity - (48 << srv_page_size_shift);
+	margin -= margin / 10;	/* Add still some extra safety */
+
+	log_sys.log_capacity = smallest_capacity;
+
+	log_sys.max_modified_age_async = margin - margin / 8;
+	log_sys.max_checkpoint_age = margin;
+}
+
+#ifdef HAVE_PMEM
+void log_t::create_low()
+#else
+bool log_t::create()
+#endif
+{
+  ut_ad(this == &log_sys);
+  ut_ad(!is_initialised());
+
+  /* LSN 0 and 1 are reserved; @see buf_page_t::oldest_modification_ */
+  lsn.store(FIRST_LSN, std::memory_order_relaxed);
+  flushed_to_disk_lsn.store(FIRST_LSN, std::memory_order_relaxed);
+  write_lsn= FIRST_LSN;
+
+#ifndef HAVE_PMEM
+  buf= static_cast<byte*>(ut_malloc_dontdump(buf_size, PSI_INSTRUMENT_ME));
+  if (!buf)
+  {
+  alloc_fail:
+    sql_print_error("InnoDB: Cannot allocate memory;"
+                    " too large innodb_log_buffer_size?");
+    return false;
+  }
+  flush_buf= static_cast<byte*>(ut_malloc_dontdump(buf_size,
+                                                   PSI_INSTRUMENT_ME));
+  if (!flush_buf)
+  {
+    ut_free_dodump(buf, buf_size);
+    buf= nullptr;
+    goto alloc_fail;
+  }
+
+  TRASH_ALLOC(buf, buf_size);
+  TRASH_ALLOC(flush_buf, buf_size);
+  checkpoint_buf= static_cast<byte*>(aligned_malloc(4096, 4096));
+  memset_aligned<4096>(checkpoint_buf, 0, 4096);
+#else
+  ut_ad(!checkpoint_buf);
+  ut_ad(!buf);
+  ut_ad(!flush_buf);
+#endif
+
+  latch.SRW_LOCK_INIT(log_latch_key);
+  init_lsn_lock();
+
+  max_buf_free= buf_size / LOG_BUF_FLUSH_RATIO - LOG_BUF_FLUSH_MARGIN;
+  set_check_flush_or_checkpoint();
+
+  last_checkpoint_lsn= FIRST_LSN;
+  log_capacity= 0;
+  max_modified_age_async= 0;
+  max_checkpoint_age= 0;
+  next_checkpoint_lsn= 0;
+  checkpoint_pending= false;
+
+  buf_free= 0;
+
+ ut_ad(is_initialised());
+#ifndef HAVE_PMEM
+  return true;
+#endif
+}
+
+dberr_t log_file_t::close() noexcept
+{
+  ut_a(is_opened());
+
+  if (!os_file_close_func(m_file))
+    return DB_ERROR;
+
+  m_file= OS_FILE_CLOSED;
+  return DB_SUCCESS;
+}
+
+__attribute__((warn_unused_result))
+dberr_t log_file_t::read(os_offset_t offset, span<byte> buf) noexcept
+{
+  ut_ad(is_opened());
+  return os_file_read(IORequestRead, m_file, buf.data(), offset, buf.size(),
+                      nullptr);
+}
+
+void log_file_t::write(os_offset_t offset, span<const byte> buf) noexcept
+{
+  ut_ad(is_opened());
+  if (dberr_t err= os_file_write_func(IORequestWrite, "ib_logfile0", m_file,
+                                      buf.data(), offset, buf.size()))
+    ib::fatal() << "write(\"ib_logfile0\") returned " << err;
+}
+
+#ifdef HAVE_PMEM
+# include <libpmem.h>
+
+/** Attempt to memory map a file.
+@param file log file handle
+@param size file size
+@return pointer to memory mapping
+@retval MAP_FAILED if the memory cannot be mapped */
+static void *log_mmap(os_file_t file, os_offset_t size)
+{
+  void *ptr=
+    my_mmap(0, size_t(size),
+            srv_read_only_mode ? PROT_READ : PROT_READ | PROT_WRITE,
+            MAP_SHARED_VALIDATE | MAP_SYNC, file, 0);
+#ifdef __linux__
+  if (ptr == MAP_FAILED)
+  {
+    struct stat st;
+    if (!fstat(file, &st))
+    {
+      MSAN_STAT_WORKAROUND(&st);
+      const auto st_dev= st.st_dev;
+      if (!stat("/dev/shm", &st))
+      {
+        MSAN_STAT_WORKAROUND(&st);
+        if (st.st_dev == st_dev)
+          ptr= my_mmap(0, size_t(size),
+                       srv_read_only_mode
+                       ? PROT_READ : PROT_READ | PROT_WRITE,
+                       MAP_SHARED, file, 0);
+      }
+    }
+  }
+#endif /* __linux__ */
+  return ptr;
+}
+#endif
+
+#ifdef HAVE_PMEM
+bool log_t::attach(log_file_t file, os_offset_t size)
+#else
+void log_t::attach_low(log_file_t file, os_offset_t size)
+#endif
+{
+  log= file;
+  ut_ad(!size || size >= START_OFFSET + SIZE_OF_FILE_CHECKPOINT);
+  file_size= size;
+
+#ifdef HAVE_PMEM
+  ut_ad(!buf);
+  ut_ad(!flush_buf);
+  if (size && !(size_t(size) & 4095) && srv_operation != SRV_OPERATION_BACKUP)
+  {
+    void *ptr= log_mmap(log.m_file, size);
+    if (ptr != MAP_FAILED)
+    {
+      log.close();
+      mprotect(ptr, size_t(size), PROT_READ);
+      buf= static_cast<byte*>(ptr);
+# if defined __linux__ || defined _WIN32
+      set_block_size(CPU_LEVEL1_DCACHE_LINESIZE);
+# endif
+      log_maybe_unbuffered= true;
+      log_buffered= false;
+      return true;
+    }
+  }
+  buf= static_cast<byte*>(ut_malloc_dontdump(buf_size, PSI_INSTRUMENT_ME));
+  if (!buf)
+  {
+  alloc_fail:
+    max_buf_free= 0;
+    sql_print_error("InnoDB: Cannot allocate memory;"
+                    " too large innodb_log_buffer_size?");
+    return false;
+  }
+  flush_buf= static_cast<byte*>(ut_malloc_dontdump(buf_size,
+                                                   PSI_INSTRUMENT_ME));
+  if (!flush_buf)
+  {
+    ut_free_dodump(buf, buf_size);
+    buf= nullptr;
+    goto alloc_fail;
+  }
+
+  TRASH_ALLOC(buf, buf_size);
+  TRASH_ALLOC(flush_buf, buf_size);
+#endif
+
+#if defined __linux__ || defined _WIN32
+  sql_print_information("InnoDB: %s (block size=%u bytes)",
+                        log_buffered
+                        ? "Buffered log writes"
+                        : "File system buffers for log disabled",
+                        block_size);
+#endif
+
+#ifdef HAVE_PMEM
+  checkpoint_buf= static_cast<byte*>(aligned_malloc(block_size, block_size));
+  memset_aligned<64>(checkpoint_buf, 0, block_size);
+  return true;
+#endif
+}
+
+/** Write a log file header.
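+The first 508 bytes of the buffer will be protected by a CRC-32C
+checksum that is stored in the following 4 bytes.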
+@param buf       log header buffer
+@param lsn       log sequence number corresponding to log_sys.START_OFFSET
+@param encrypted whether the log is encrypted */
+void log_t::header_write(byte *buf, lsn_t lsn, bool encrypted)
+{
+  mach_write_to_4(my_assume_aligned<4>(buf) + LOG_HEADER_FORMAT,
+                  log_sys.FORMAT_10_8);
+  mach_write_to_8(my_assume_aligned<8>(buf + LOG_HEADER_START_LSN), lsn);
+
+#if defined __GNUC__ && __GNUC__ > 7
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wstringop-truncation"
+#endif
+  strncpy(reinterpret_cast<char*>(buf) + LOG_HEADER_CREATOR,
+          "MariaDB " PACKAGE_VERSION,
+          LOG_HEADER_CREATOR_END - LOG_HEADER_CREATOR);
+#if defined __GNUC__ && __GNUC__ > 7
+# pragma GCC diagnostic pop
+#endif
+
+  if (encrypted)
+    log_crypt_write_header(buf + LOG_HEADER_CREATOR_END);
+  mach_write_to_4(my_assume_aligned<4>(508 + buf), my_crc32c(0, buf, 508));
+}
+
+void log_t::create(lsn_t lsn) noexcept
+{
+#ifndef SUX_LOCK_GENERIC
+  ut_ad(latch.is_write_locked());
+#endif
+  ut_ad(!recv_no_log_write);
+  ut_ad(is_latest());
+  ut_ad(this == &log_sys);
+
+  this->lsn.store(lsn, std::memory_order_relaxed);
+  this->flushed_to_disk_lsn.store(lsn, std::memory_order_relaxed);
+  first_lsn= lsn;
+  write_lsn= lsn;
+
+  last_checkpoint_lsn= 0;
+
+#ifdef HAVE_PMEM
+  if (is_pmem())
+  {
+    mprotect(buf, size_t(file_size), PROT_READ | PROT_WRITE);
+    memset_aligned<4096>(buf, 0, 4096);
+    buf_free= START_OFFSET;
+  }
+  else
+#endif
+  {
+    buf_free= 0;
+    memset_aligned<4096>(flush_buf, 0, buf_size);
+    memset_aligned<4096>(buf, 0, buf_size);
+  }
+
+  log_sys.header_write(buf, lsn, is_encrypted());
+  DBUG_PRINT("ib_log", ("write header " LSN_PF, lsn));
+
+#ifdef HAVE_PMEM
+  if (is_pmem())
+    pmem_persist(buf, 512);
+  else
+#endif
+  {
+    log.write(0, {buf, 4096});
+    memset_aligned<512>(buf, 0, 512);
+  }
+}
+
+void log_t::close_file()
+{
+#ifdef HAVE_PMEM
+  if (is_pmem())
+  {
+    ut_ad(!is_opened());
+    ut_ad(!checkpoint_buf);
+    if (buf)
+    {
+      my_munmap(buf, file_size);
+      buf= nullptr;
+    }
+    return;
+  }
+
+  ut_free_dodump(buf, buf_size);
+  buf= nullptr;
+  ut_free_dodump(flush_buf, buf_size);
+  flush_buf= nullptr;
+  aligned_free(checkpoint_buf);
+  checkpoint_buf= nullptr;
+#endif
+  if (is_opened())
+    if (const dberr_t err= log.close())
+      ib::fatal() << "closing ib_logfile0 failed: " << err;
+}
+
+/** Acquire all latches that protect the log. */
+static void log_resize_acquire()
+{
+  if (!log_sys.is_pmem())
+  {
+    while (flush_lock.acquire(log_sys.get_lsn() + 1, nullptr) !=
+           group_commit_lock::ACQUIRED);
+    while (write_lock.acquire(log_sys.get_lsn() + 1, nullptr) !=
+           group_commit_lock::ACQUIRED);
+  }
+
+  log_sys.latch.wr_lock(SRW_LOCK_CALL);
+}
+
+/** Release the latches that protect the log.
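+Any log writes that were requested while the latches were held will be
+completed before this function returns.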
*/
+void log_resize_release()
+{
+  log_sys.latch.wr_unlock();
+
+  if (!log_sys.is_pmem())
+  {
+    lsn_t lsn1= write_lock.release(write_lock.value());
+    lsn_t lsn2= flush_lock.release(flush_lock.value());
+    if (lsn1 || lsn2)
+      log_write_up_to(std::max(lsn1, lsn2), true, nullptr);
+  }
+}
+
+#if defined __linux__ || defined _WIN32
+/** Try to enable or disable file system caching (update log_buffered) */
+void log_t::set_buffered(bool buffered)
+{
+  if (!log_maybe_unbuffered || is_pmem() || high_level_read_only)
+    return;
+  log_resize_acquire();
+  if (!resize_in_progress() && is_opened() && bool(log_buffered) != buffered)
+  {
+    os_file_close_func(log.m_file);
+    log.m_file= OS_FILE_CLOSED;
+    std::string path{get_log_file_path()};
+    log_buffered= buffered;
+    bool success;
+    log.m_file= os_file_create_func(path.c_str(),
+                                    OS_FILE_OPEN, OS_FILE_NORMAL, OS_LOG_FILE,
+                                    false, &success);
+    ut_a(log.m_file != OS_FILE_CLOSED);
+    sql_print_information("InnoDB: %s (block size=%u bytes)",
+                          log_buffered
+                          ? "Buffered log writes"
+                          : "File system buffers for log disabled",
+                          block_size);
+  }
+  log_resize_release();
+}
+#endif
+
+/** Start resizing the log and release the exclusive latch.
+@param size requested new file_size
+@return whether the resizing was started successfully */
+log_t::resize_start_status log_t::resize_start(os_offset_t size) noexcept
+{
+  ut_ad(size >= 4U << 20);
+  ut_ad(!(size & 4095));
+  ut_ad(!srv_read_only_mode);
+
+  log_resize_acquire();
+
+  resize_start_status status= RESIZE_NO_CHANGE;
+  lsn_t start_lsn{0};
+
+  if (resize_in_progress())
+    status= RESIZE_IN_PROGRESS;
+  else if (size != file_size)
+  {
+    ut_ad(!resize_in_progress());
+    ut_ad(!resize_log.is_opened());
+    ut_ad(!resize_buf);
+    ut_ad(!resize_flush_buf);
+    std::string path{get_log_file_path("ib_logfile101")};
+    bool success;
+    resize_lsn.store(1, std::memory_order_relaxed);
+    resize_target= 0;
+    resize_log.m_file=
+      os_file_create_func(path.c_str(),
+                          OS_FILE_CREATE | OS_FILE_ON_ERROR_NO_EXIT,
+                          OS_FILE_NORMAL, OS_LOG_FILE, false, &success);
+    if (success)
+    {
+      log_resize_release();
+
+      void *ptr= nullptr, *ptr2= nullptr;
+      success= os_file_set_size(path.c_str(), resize_log.m_file, size);
+      if (!success);
+#ifdef HAVE_PMEM
+      else if (is_pmem())
+      {
+        ptr= log_mmap(resize_log.m_file, size);
+        if (ptr == MAP_FAILED)
+          goto alloc_fail;
+      }
+#endif
+      else
+      {
+        ptr= ut_malloc_dontdump(buf_size, PSI_INSTRUMENT_ME);
+        if (ptr)
+        {
+          TRASH_ALLOC(ptr, buf_size);
+          ptr2= ut_malloc_dontdump(buf_size, PSI_INSTRUMENT_ME);
+          if (ptr2)
+            TRASH_ALLOC(ptr2, buf_size);
+          else
+          {
+            ut_free_dodump(ptr, buf_size);
+            ptr= nullptr;
+            goto alloc_fail;
+          }
+        }
+        else
+        alloc_fail:
+          success= false;
+      }
+
+      log_resize_acquire();
+
+      if (!success)
+      {
+        resize_log.close();
+        IF_WIN(DeleteFile(path.c_str()), unlink(path.c_str()));
+      }
+      else
+      {
+        resize_target= size;
+        resize_buf= static_cast<byte*>(ptr);
+        resize_flush_buf= static_cast<byte*>(ptr2);
+        if (is_pmem())
+        {
+          resize_log.close();
+          start_lsn= get_lsn();
+        }
+        else
+        {
+          memcpy_aligned<16>(resize_buf, buf, (buf_free + 15) & ~15);
+          start_lsn= first_lsn +
+            (~lsn_t{get_block_size() - 1} & (write_lsn - first_lsn));
+        }
+      }
+      resize_lsn.store(start_lsn, std::memory_order_relaxed);
+      status= success ?
RESIZE_STARTED : RESIZE_FAILED; + } + } + + log_resize_release(); + + if (start_lsn) + { + mysql_mutex_lock(&buf_pool.flush_list_mutex); + lsn_t target_lsn= buf_pool.get_oldest_modification(0); + if (start_lsn < target_lsn) + start_lsn= target_lsn + 1; + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + buf_flush_ahead(start_lsn, false); + } + + return status; +} + +/** Abort log resizing. */ +void log_t::resize_abort() noexcept +{ + log_resize_acquire(); + + if (resize_in_progress() > 1) + { + if (!is_pmem()) + { + resize_log.close(); + ut_free_dodump(resize_buf, buf_size); + ut_free_dodump(resize_flush_buf, buf_size); + resize_flush_buf= nullptr; + } +#ifdef HAVE_PMEM + else + { + ut_ad(!resize_log.is_opened()); + ut_ad(!resize_flush_buf); + if (resize_buf) + my_munmap(resize_buf, resize_target); + } +#endif + resize_buf= nullptr; + resize_target= 0; + resize_lsn.store(0, std::memory_order_relaxed); + } + + log_resize_release(); +} + +/** Write an aligned buffer to ib_logfile0. +@param buf buffer to be written +@param len length of data to be written +@param offset log file offset */ +static void log_write_buf(const byte *buf, size_t len, lsn_t offset) +{ + ut_ad(write_lock.is_owner()); + ut_ad(!recv_no_log_write); + ut_d(const size_t block_size_1= log_sys.get_block_size() - 1); + ut_ad(!(offset & block_size_1)); + ut_ad(!(len & block_size_1)); + ut_ad(!(size_t(buf) & block_size_1)); + ut_ad(len); + + if (UNIV_LIKELY(offset + len <= log_sys.file_size)) + { +write: + log_sys.log.write(offset, {buf, len}); + return; + } + + const size_t write_len= size_t(log_sys.file_size - offset); + log_sys.log.write(offset, {buf, write_len}); + len-= write_len; + buf+= write_len; + ut_ad(log_sys.START_OFFSET + len < offset); + offset= log_sys.START_OFFSET; + goto write; +} + +/** Invoke commit_checkpoint_notify_ha() to notify that outstanding +log writes have been completed. */ +void log_flush_notify(lsn_t flush_lsn); + +#if 0 // Currently we overwrite the last log block until it is complete. +/** CRC-32C of pad messages using between 1 and 15 bytes of NUL bytes +in the payload */ +static const unsigned char pad_crc[15][4]= { + {0xA6,0x59,0xC1,0xDB}, {0xF2,0xAF,0x80,0x73}, {0xED,0x02,0xF1,0x90}, + {0x68,0x4E,0xA3,0xF3}, {0x5D,0x1B,0xEA,0x6A}, {0xE0,0x01,0x86,0xB9}, + {0xD1,0x06,0x86,0xF5}, {0xEB,0x20,0x12,0x33}, {0xBA,0x73,0xB2,0xA3}, + {0x5F,0xA2,0x08,0x03}, {0x70,0x03,0xD6,0x9D}, {0xED,0xB3,0x49,0x78}, + {0xFD,0xD6,0xB9,0x9C}, {0x25,0xF8,0xB1,0x2C}, {0xCD,0xAA,0xE7,0x10} +}; + +/** Pad the log with some dummy bytes +@param lsn desired log sequence number +@param pad number of bytes to append to the log +@param begin buffer to write 'pad' bytes to +@param extra buffer for additional pad bytes (up to 15 bytes) +@return additional bytes used in extra[] */ +ATTRIBUTE_NOINLINE +static size_t log_pad(lsn_t lsn, size_t pad, byte *begin, byte *extra) +{ + ut_ad(!(size_t(begin + pad) & (log_sys.get_block_size() - 1))); + byte *b= begin; + const byte seq{log_sys.get_sequence_bit(lsn)}; + /* The caller should never request padding such that the + file would wrap around to the beginning. That is, the sequence + bit must be the same for all records. */ + ut_ad(seq == log_sys.get_sequence_bit(lsn + pad)); + + if (log_sys.is_encrypted()) + { + /* The lengths of our pad messages vary between 15 and 29 bytes + (FILE_CHECKPOINT byte, 1 to 15 NUL bytes, sequence byte, + 4 bytes checksum, 8 NUL bytes nonce). 
*/ + if (pad < 15) + { + extra[0]= FILE_CHECKPOINT | 1; + extra[1]= 0; + extra[2]= seq; + memcpy(extra + 3, pad_crc[0], 4); + memset(extra + 7, 0, 8); + memcpy(b, extra, pad); + memmove(extra, extra + pad, 15 - pad); + return 15 - pad; + } + + /* Pad first with 29-byte messages until the remaining size is + less than 29+15 bytes, and then write 1 or 2 shorter messages. */ + const byte *const end= begin + pad; + for (; b + (29 + 15) < end; b+= 29) + { + b[0]= FILE_CHECKPOINT | 15; + memset(b + 1, 0, 15); + b[16]= seq; + memcpy(b + 17, pad_crc[14], 4); + memset(b + 21, 0, 8); + } + if (b + 29 < end) + { + b[0]= FILE_CHECKPOINT | 1; + b[1]= 0; + b[2]= seq; + memcpy(b + 3, pad_crc[0], 4); + memset(b + 7, 0, 8); + b+= 15; + } + const size_t last_pad(end - b); + ut_ad(last_pad >= 15); + ut_ad(last_pad <= 29); + b[0]= FILE_CHECKPOINT | byte(last_pad - 14); + memset(b + 1, 0, last_pad - 14); + b[last_pad - 13]= seq; + memcpy(b + last_pad - 12, pad_crc[last_pad - 15], 4); + memset(b + last_pad - 8, 0, 8); + } + else + { + /* The lengths of our pad messages vary between 7 and 21 bytes + (FILE_CHECKPOINT byte, 1 to 15 NUL bytes, sequence byte, + 4 bytes checksum). */ + if (pad < 7) + { + extra[0]= FILE_CHECKPOINT | 1; + extra[1]= 0; + extra[2]= seq; + memcpy(extra + 3, pad_crc[0], 4); + memcpy(b, extra, pad); + memmove(extra, extra + pad, 7 - pad); + return 7 - pad; + } + + /* Pad first with 21-byte messages until the remaining size is + less than 21+7 bytes, and then write 1 or 2 shorter messages. */ + const byte *const end= begin + pad; + for (; b + (21 + 7) < end; b+= 21) + { + b[0]= FILE_CHECKPOINT | 15; + memset(b + 1, 0, 15); + b[16]= seq; + memcpy(b + 17, pad_crc[14], 4); + } + if (b + 21 < end) + { + b[0]= FILE_CHECKPOINT | 1; + b[1]= 0; + b[2]= seq; + memcpy(b + 3, pad_crc[0], 4); + b+= 7; + } + const size_t last_pad(end - b); + ut_ad(last_pad >= 7); + ut_ad(last_pad <= 21); + b[0]= FILE_CHECKPOINT | byte(last_pad - 6); + memset(b + 1, 0, last_pad - 6); + b[last_pad - 5]= seq; + memcpy(b + last_pad - 4, pad_crc[last_pad - 7], 4); + } + + return 0; +} +#endif + +#ifdef HAVE_PMEM +/** Persist the log. +@param lsn desired new value of flushed_to_disk_lsn */ +inline void log_t::persist(lsn_t lsn) noexcept +{ + ut_ad(is_pmem()); + ut_ad(!write_lock.is_owner()); + ut_ad(!flush_lock.is_owner()); + + lsn_t old= flushed_to_disk_lsn.load(std::memory_order_relaxed); + + if (old >= lsn) + return; + + const lsn_t resizing{resize_in_progress()}; + if (UNIV_UNLIKELY(resizing)) + latch.rd_lock(SRW_LOCK_CALL); + const size_t start(calc_lsn_offset(old)); + const size_t end(calc_lsn_offset(lsn)); + + if (UNIV_UNLIKELY(end < start)) + { + pmem_persist(log_sys.buf + start, log_sys.file_size - start); + pmem_persist(log_sys.buf + log_sys.START_OFFSET, + end - log_sys.START_OFFSET); + } + else + pmem_persist(log_sys.buf + start, end - start); + + old= flushed_to_disk_lsn.load(std::memory_order_relaxed); + + if (old < lsn) + { + while (!flushed_to_disk_lsn.compare_exchange_weak + (old, lsn, std::memory_order_release, std::memory_order_relaxed)) + if (old >= lsn) + break; + + log_flush_notify(lsn); + DBUG_EXECUTE_IF("crash_after_log_write_upto", DBUG_SUICIDE();); + } + + if (UNIV_UNLIKELY(resizing)) + latch.rd_unlock(); +} +#endif + +/** Write resize_buf to resize_log. 
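+If the write position would exceed resize_target, the copying wraps
+around to START_OFFSET and resize_lsn is reset to the current write
+position, effectively restarting the copy.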
+@param length the used length of resize_buf */
+ATTRIBUTE_COLD void log_t::resize_write_buf(size_t length) noexcept
+{
+  const size_t block_size_1= get_block_size() - 1;
+  ut_ad(!(resize_target & block_size_1));
+  ut_ad(!(length & block_size_1));
+  ut_ad(length > block_size_1);
+  ut_ad(length <= resize_target);
+  const lsn_t resizing{resize_in_progress()};
+  ut_ad(resizing <= write_lsn);
+  lsn_t offset= START_OFFSET +
+    ((write_lsn - resizing) & ~lsn_t{block_size_1}) %
+    (resize_target - START_OFFSET);
+
+  if (UNIV_UNLIKELY(offset + length > resize_target))
+  {
+    offset= START_OFFSET;
+    resize_lsn.store(first_lsn +
+                     (~lsn_t{block_size_1} & (write_lsn - first_lsn)),
+                     std::memory_order_relaxed);
+  }
+
+  ut_a(os_file_write_func(IORequestWrite, "ib_logfile101", resize_log.m_file,
+                          resize_flush_buf, offset, length) == DB_SUCCESS);
+}
+
+/** Write buf to ib_logfile0.
+@tparam release_latch whether to invoke latch.wr_unlock()
+@return the current log sequence number */
+template<bool release_latch> inline lsn_t log_t::write_buf() noexcept
+{
+#ifndef SUX_LOCK_GENERIC
+  ut_ad(latch.is_write_locked());
+#endif
+  ut_ad(!srv_read_only_mode);
+  ut_ad(!is_pmem());
+
+  const lsn_t lsn{get_lsn(std::memory_order_relaxed)};
+
+  if (write_lsn >= lsn)
+  {
+    if (release_latch)
+      latch.wr_unlock();
+    ut_ad(write_lsn == lsn);
+  }
+  else
+  {
+    ut_ad(!recv_no_log_write);
+    write_lock.set_pending(lsn);
+    ut_ad(write_lsn >= get_flushed_lsn());
+    const size_t block_size_1{get_block_size() - 1};
+    lsn_t offset{calc_lsn_offset(write_lsn) & ~lsn_t{block_size_1}};
+
+    DBUG_PRINT("ib_log", ("write " LSN_PF " to " LSN_PF " at " LSN_PF,
+                          write_lsn, lsn, offset));
+    const byte *write_buf{buf};
+    size_t length{buf_free};
+    ut_ad(length >= (calc_lsn_offset(write_lsn) & block_size_1));
+    const size_t new_buf_free{length & block_size_1};
+    buf_free= new_buf_free;
+    ut_ad(new_buf_free == ((lsn - first_lsn) & block_size_1));
+
+    if (new_buf_free)
+    {
+#if 0 /* TODO: Pad the last log block with dummy records. */
+      buf_free= log_pad(lsn, get_block_size() - new_buf_free,
+                        buf + new_buf_free, flush_buf);
+      ... /* TODO: Update the LSN and adjust other code. */
+#else
+      /* The rest of the block will be written as garbage.
+      (We want to avoid memset() while holding mutex.)
+      This block will be overwritten later, once records beyond
+      the current LSN are generated.
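+      The incomplete last block is also copied to flush_buf, which will
+      become the active buffer in the std::swap() below, so that it can
+      be completed and rewritten once more records arrive.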
*/
+# ifdef HAVE_valgrind
+      MEM_MAKE_DEFINED(buf + length, get_block_size() - new_buf_free);
+      if (UNIV_LIKELY_NULL(resize_flush_buf))
+        MEM_MAKE_DEFINED(resize_buf + length, get_block_size() - new_buf_free);
+# endif
+      buf[length]= 0; /* allow recovery to catch EOF faster */
+      length&= ~block_size_1;
+      memcpy_aligned<16>(flush_buf, buf + length, (new_buf_free + 15) & ~15);
+      if (UNIV_LIKELY_NULL(resize_flush_buf))
+        memcpy_aligned<16>(resize_flush_buf, resize_buf + length,
+                           (new_buf_free + 15) & ~15);
+      length+= get_block_size();
+#endif
+    }
+
+    std::swap(buf, flush_buf);
+    std::swap(resize_buf, resize_flush_buf);
+    write_to_log++;
+    if (release_latch)
+      latch.wr_unlock();
+
+    if (UNIV_UNLIKELY(srv_shutdown_state > SRV_SHUTDOWN_INITIATED))
+    {
+      service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
+                                     "InnoDB log write: " LSN_PF, write_lsn);
+    }
+
+    /* Do the write to the log file */
+    log_write_buf(write_buf, length, offset);
+    if (UNIV_LIKELY_NULL(resize_buf))
+      resize_write_buf(length);
+    write_lsn= lsn;
+  }
+
+  return lsn;
+}
+
+bool log_t::flush(lsn_t lsn) noexcept
+{
+  ut_ad(lsn >= get_flushed_lsn());
+  flush_lock.set_pending(lsn);
+  const bool success{srv_file_flush_method == SRV_O_DSYNC || log.flush()};
+  if (UNIV_LIKELY(success))
+  {
+    flushed_to_disk_lsn.store(lsn, std::memory_order_release);
+    log_flush_notify(lsn);
+  }
+  return success;
+}
+
+/** Ensure that previous log writes are durable.
+@param lsn previously written LSN
+@return new durable lsn target
+@retval 0 if there are no pending callbacks on flush_lock
+ or there is another group commit lead.
+*/
+static lsn_t log_flush(lsn_t lsn)
+{
+  ut_ad(!log_sys.is_pmem());
+  ut_a(log_sys.flush(lsn));
+  DBUG_EXECUTE_IF("crash_after_log_write_upto", DBUG_SUICIDE(););
+  return flush_lock.release(lsn);
+}
+
+static const completion_callback dummy_callback{[](void *) {}, nullptr};
+
+/** Ensure that the log has been written to the log file up to a given
+log entry (such as that of a transaction commit). Start a new write, or
+wait and check if an already running write is covering the request.
+@param lsn      log sequence number that should be included in the file write
+@param durable  whether the write needs to be durable
+@param callback log write completion callback */
+void log_write_up_to(lsn_t lsn, bool durable,
+                     const completion_callback *callback)
+{
+  ut_ad(!srv_read_only_mode);
+  ut_ad(lsn != LSN_MAX);
+
+  if (UNIV_UNLIKELY(recv_no_ibuf_operations))
+  {
+    /* A non-final batch of recovery is active; no writes to the log
+    are allowed yet. */
+    ut_a(!callback);
+    return;
+  }
+
+  ut_ad(lsn <= log_sys.get_lsn());
+
+#ifdef HAVE_PMEM
+  if (log_sys.is_pmem())
+  {
+    ut_ad(!callback);
+    if (durable)
+      log_sys.persist(lsn);
+    return;
+  }
+#endif
+
+repeat:
+  if (durable)
+  {
+    if (flush_lock.acquire(lsn, callback) != group_commit_lock::ACQUIRED)
+      return;
+    flush_lock.set_pending(log_sys.get_lsn());
+  }
+
+  lsn_t pending_write_lsn= 0, pending_flush_lsn= 0;
+
+  if (write_lock.acquire(lsn, durable ? nullptr : callback) ==
+      group_commit_lock::ACQUIRED)
+  {
+    log_sys.latch.wr_lock(SRW_LOCK_CALL);
+    pending_write_lsn= write_lock.release(log_sys.write_buf<true>());
+  }
+
+  if (durable)
+  {
+    pending_flush_lsn= log_flush(write_lock.value());
+  }
+
+  if (pending_write_lsn || pending_flush_lsn)
+  {
+    /* There is no new group commit lead; some async waiters could stall. */
+    callback= &dummy_callback;
+    lsn= std::max(pending_write_lsn, pending_flush_lsn);
+    goto repeat;
+  }
+}
+
+/** Write to the log file up to the last log entry.
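+This is a convenience wrapper that invokes log_write_up_to() with the
+current log sequence number.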
+@param durable whether to wait for a durable write to complete */
+void log_buffer_flush_to_disk(bool durable)
+{
+  ut_ad(!srv_read_only_mode);
+  log_write_up_to(log_sys.get_lsn(std::memory_order_acquire), durable);
+}
+
+/** Prepare to invoke log_write_and_flush(), before acquiring log_sys.latch. */
+ATTRIBUTE_COLD void log_write_and_flush_prepare()
+{
+  if (log_sys.is_pmem())
+    return;
+
+  while (flush_lock.acquire(log_sys.get_lsn() + 1, nullptr) !=
+         group_commit_lock::ACQUIRED);
+  while (write_lock.acquire(log_sys.get_lsn() + 1, nullptr) !=
+         group_commit_lock::ACQUIRED);
+}
+
+/** Durably write the log up to log_sys.get_lsn(). */
+ATTRIBUTE_COLD void log_write_and_flush()
+{
+  ut_ad(!srv_read_only_mode);
+  if (!log_sys.is_pmem())
+  {
+    const lsn_t lsn{log_sys.write_buf<false>()};
+    write_lock.release(lsn);
+    log_flush(lsn);
+  }
+#ifdef HAVE_PMEM
+  else
+    log_sys.persist(log_sys.get_lsn());
+#endif
+}
+
+/********************************************************************
+
+Tries to establish a big enough margin of free space in the log buffer, such
+that a new log entry can be catenated without an immediate need for a flush. */
+ATTRIBUTE_COLD static void log_flush_margin()
+{
+  if (log_sys.buf_free > log_sys.max_buf_free)
+    log_buffer_flush_to_disk(false);
+}
+
+/****************************************************************//**
+Tries to establish a big enough margin of free space in the log, such
+that a new log entry can be catenated without an immediate need for a
+checkpoint. NOTE: this function may only be called if the calling thread
+owns no synchronization objects! */
+ATTRIBUTE_COLD static void log_checkpoint_margin()
+{
+  while (log_sys.check_flush_or_checkpoint())
+  {
+    log_sys.latch.rd_lock(SRW_LOCK_CALL);
+    ut_ad(!recv_no_log_write);
+
+    if (!log_sys.check_flush_or_checkpoint())
+    {
+func_exit:
+      log_sys.latch.rd_unlock();
+      return;
+    }
+
+    const lsn_t lsn= log_sys.get_lsn();
+    const lsn_t checkpoint= log_sys.last_checkpoint_lsn;
+    const lsn_t sync_lsn= checkpoint + log_sys.max_checkpoint_age;
+
+    if (lsn <= sync_lsn)
+    {
+#ifndef DBUG_OFF
+    skip_checkpoint:
+#endif
+      log_sys.set_check_flush_or_checkpoint(false);
+      goto func_exit;
+    }
+
+    DBUG_EXECUTE_IF("ib_log_checkpoint_avoid_hard", goto skip_checkpoint;);
+    log_sys.latch.rd_unlock();
+
+    /* We must wait to prevent the tail of the log overwriting the head. */
+    buf_flush_wait_flushed(std::min(sync_lsn, checkpoint + (1U << 20)));
+    /* Sleep to avoid a thundering herd */
+    std::this_thread::sleep_for(std::chrono::milliseconds(10));
+  }
+}
+
+/**
+Checks that there is enough free space in the log to start a new query step.
+Flushes the log buffer or makes a new checkpoint if necessary. NOTE: this
+function may only be called if the calling thread owns no synchronization
+objects! */
+ATTRIBUTE_COLD void log_check_margins()
+{
+  do
+  {
+    log_flush_margin();
+    log_checkpoint_margin();
+    ut_ad(!recv_no_log_write);
+  }
+  while (log_sys.check_flush_or_checkpoint());
+}
+
+/** Wait for a log checkpoint if needed.
+NOTE that this function may only be called while not holding
+any synchronization objects except dict_sys.latch. */
+void log_free_check()
+{
+  ut_ad(!lock_sys.is_writer());
+  if (log_sys.check_flush_or_checkpoint())
+    log_check_margins();
+}
+
+extern void buf_resize_shutdown();
+
+/** Make a checkpoint at the latest lsn on shutdown.
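+This waits for all background tasks and active transactions to finish,
+because the algorithm requires an idle server.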
*/ +ATTRIBUTE_COLD void logs_empty_and_mark_files_at_shutdown() +{ + lsn_t lsn; + ulint count = 0; + + ib::info() << "Starting shutdown..."; + + /* Wait until the master thread and all other operations are idle: our + algorithm only works if the server is idle at shutdown */ + bool do_srv_shutdown = false; + if (srv_master_timer) { + do_srv_shutdown = srv_fast_shutdown < 2; + srv_master_timer.reset(); + } + + /* Wait for the end of the buffer resize task.*/ + buf_resize_shutdown(); + dict_stats_shutdown(); + btr_defragment_shutdown(); + + srv_shutdown_state = SRV_SHUTDOWN_CLEANUP; + + if (srv_buffer_pool_dump_at_shutdown && + !srv_read_only_mode && srv_fast_shutdown < 2) { + buf_dump_start(); + } + srv_monitor_timer.reset(); + + if (do_srv_shutdown) { + srv_shutdown(srv_fast_shutdown == 0); + } + + +loop: + ut_ad(lock_sys.is_initialised() || !srv_was_started); + ut_ad(log_sys.is_initialised() || !srv_was_started); + ut_ad(fil_system.is_initialised() || !srv_was_started); + +#define COUNT_INTERVAL 600U +#define CHECK_INTERVAL 100000U + std::this_thread::sleep_for(std::chrono::microseconds(CHECK_INTERVAL)); + + count++; + + /* Check that there are no longer transactions, except for + PREPARED ones. We need this wait even for the 'very fast' + shutdown, because the InnoDB layer may have committed or + prepared transactions and we don't want to lose them. */ + + if (ulint total_trx = srv_was_started && !srv_read_only_mode + && srv_force_recovery < SRV_FORCE_NO_TRX_UNDO + ? trx_sys.any_active_transactions() : 0) { + + if (srv_print_verbose_log && count > COUNT_INTERVAL) { + service_manager_extend_timeout( + COUNT_INTERVAL * CHECK_INTERVAL/1000000 * 2, + "Waiting for %lu active transactions to finish", + (ulong) total_trx); + ib::info() << "Waiting for " << total_trx << " active" + << " transactions to finish"; + + count = 0; + } + + goto loop; + } + + /* We need these threads to stop early in shutdown. */ + const char* thread_name = srv_fast_shutdown != 2 + && trx_rollback_is_active + ? "rollback of recovered transactions" : nullptr; + + if (thread_name) { + ut_ad(!srv_read_only_mode); +wait_suspend_loop: + service_manager_extend_timeout( + COUNT_INTERVAL * CHECK_INTERVAL/1000000 * 2, + "Waiting for %s to exit", thread_name); + if (srv_print_verbose_log && count > COUNT_INTERVAL) { + ib::info() << "Waiting for " << thread_name + << " to exit"; + count = 0; + } + goto loop; + } + + /* Check that the background threads are suspended */ + + ut_ad(!srv_any_background_activity()); + if (srv_n_fil_crypt_threads_started) { + fil_crypt_threads_signal(true); + thread_name = "fil_crypt_thread"; + goto wait_suspend_loop; + } + + if (buf_page_cleaner_is_active) { + thread_name = "page cleaner thread"; + pthread_cond_signal(&buf_pool.do_flush_list); + goto wait_suspend_loop; + } + + buf_load_dump_end(); + + if (!buf_pool.is_initialised()) { + ut_ad(!srv_was_started); + } else { + buf_flush_buffer_pool(); + } + + if (srv_fast_shutdown == 2 || !srv_was_started) { + if (!srv_read_only_mode && srv_was_started) { + sql_print_information( + "InnoDB: Executing innodb_fast_shutdown=2." + " Next startup will execute crash recovery!"); + + /* In this fastest shutdown we do not flush the + buffer pool: + + it is essentially a 'crash' of the InnoDB server. + Make sure that the log is all flushed to disk, so + that we can recover all committed transactions in + a crash recovery. 
*/ + log_buffer_flush_to_disk(); + } + + srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE; + return; + } + + if (!srv_read_only_mode) { + service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, + "ensuring dirty buffer pool are written to log"); + log_make_checkpoint(); + + const auto sizeof_cp = log_sys.is_encrypted() + ? SIZE_OF_FILE_CHECKPOINT + 8 + : SIZE_OF_FILE_CHECKPOINT; + + log_sys.latch.rd_lock(SRW_LOCK_CALL); + + lsn = log_sys.get_lsn(); + + const bool lsn_changed = lsn != log_sys.last_checkpoint_lsn + && lsn != log_sys.last_checkpoint_lsn + sizeof_cp; + ut_ad(lsn >= log_sys.last_checkpoint_lsn); + + log_sys.latch.rd_unlock(); + + if (lsn_changed) { + goto loop; + } + } else { + lsn = recv_sys.lsn; + } + + srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE; + + /* Make some checks that the server really is quiet */ + ut_ad(!srv_any_background_activity()); + + service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, + "Free innodb buffer pool"); + ut_d(buf_pool.assert_all_freed()); + + ut_a(lsn == log_sys.get_lsn() + || srv_force_recovery == SRV_FORCE_NO_LOG_REDO); + + if (UNIV_UNLIKELY(lsn < recv_sys.lsn)) { + sql_print_error("InnoDB: Shutdown LSN=" LSN_PF + " is less than start LSN=" LSN_PF, + lsn, recv_sys.lsn); + } + + srv_shutdown_lsn = lsn; + + /* Make some checks that the server really is quiet */ + ut_ad(!srv_any_background_activity()); + + ut_a(lsn == log_sys.get_lsn() + || srv_force_recovery == SRV_FORCE_NO_LOG_REDO); +} + +/******************************************************//** +Prints info of the log. */ +void +log_print( +/*======*/ + FILE* file) /*!< in: file where to print */ +{ + log_sys.latch.rd_lock(SRW_LOCK_CALL); + + const lsn_t lsn= log_sys.get_lsn(); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + const lsn_t pages_flushed = buf_pool.get_oldest_modification(lsn); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + fprintf(file, + "Log sequence number " LSN_PF "\n" + "Log flushed up to " LSN_PF "\n" + "Pages flushed up to " LSN_PF "\n" + "Last checkpoint at " LSN_PF "\n", + lsn, + log_sys.get_flushed_lsn(), + pages_flushed, + lsn_t{log_sys.last_checkpoint_lsn}); + + log_sys.latch.rd_unlock(); +} + +/** Shut down the redo log subsystem. */ +void log_t::close() +{ + ut_ad(this == &log_sys); + if (!is_initialised()) return; + close_file(); + +#ifndef HAVE_PMEM + ut_free_dodump(buf, buf_size); + buf= nullptr; + ut_free_dodump(flush_buf, buf_size); + flush_buf= nullptr; + aligned_free(checkpoint_buf); + checkpoint_buf= nullptr; +#else + ut_ad(!checkpoint_buf); + ut_ad(!buf); + ut_ad(!flush_buf); +#endif + + latch.destroy(); + destroy_lsn_lock(); + + recv_sys.close(); + + max_buf_free= 0; +} + +std::string get_log_file_path(const char *filename) +{ + const size_t size= strlen(srv_log_group_home_dir) + /* path separator */ 1 + + strlen(filename) + /* longest suffix */ 3; + std::string path; + path.reserve(size); + path.assign(srv_log_group_home_dir); + + switch (path.back()) { +#ifdef _WIN32 + case '\\': +#endif + case '/': + break; + default: + path.push_back('/'); + } + path.append(filename); + + return path; +} diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc new file mode 100644 index 00000000..3c3fe41e --- /dev/null +++ b/storage/innobase/log/log0recv.cc @@ -0,0 +1,4870 @@ +/***************************************************************************** + +Copyright (c) 1997, 2017, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2022, MariaDB Corporation. 
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file log/log0recv.cc
+Recovery
+
+Created 9/20/1997 Heikki Tuuri
+*******************************************************/
+
+#include "univ.i"
+
+#include <map>
+#include <string>
+#include <my_service_manager.h>
+
+#include "log0recv.h"
+
+#ifdef HAVE_MY_AES_H
+#include <my_aes.h>
+#endif
+
+#include "log0crypt.h"
+#include "mem0mem.h"
+#include "buf0buf.h"
+#include "buf0dblwr.h"
+#include "buf0flu.h"
+#include "mtr0mtr.h"
+#include "mtr0log.h"
+#include "page0page.h"
+#include "page0cur.h"
+#include "trx0undo.h"
+#include "ibuf0ibuf.h"
+#include "trx0undo.h"
+#include "trx0rec.h"
+#include "fil0fil.h"
+#include "buf0rea.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "fil0pagecompress.h"
+#include "log.h"
+
+/** The recovery system */
+recv_sys_t recv_sys;
+/** TRUE when recv_init_crash_recovery() has been called. */
+bool recv_needed_recovery;
+#ifdef UNIV_DEBUG
+/** TRUE if writing to the redo log (mtr_commit) is forbidden.
+Protected by log_sys.latch. */
+bool recv_no_log_write = false;
+#endif /* UNIV_DEBUG */
+
+/** TRUE if buf_page_is_corrupted() should check if the log sequence
+number (FIL_PAGE_LSN) is in the future. Initially FALSE, and set by
+recv_recovery_from_checkpoint_start(). */
+bool recv_lsn_checks_on;
+
+/** If the following is TRUE, the buffer pool file pages must be invalidated
+after recovery and no ibuf operations are allowed; this becomes TRUE if
+the log record hash table becomes too full, and log records must be merged
+to file pages already before the recovery is finished: in this case no
+ibuf operations are allowed, as they could modify the pages read in the
+buffer pool before the pages have been recovered to the up-to-date state.
+
+true means that recovery is running and no operations on the log file
+are allowed yet: the variable name is misleading. */
+bool recv_no_ibuf_operations;
+
+/** The maximum lsn we see for a page during the recovery process. If this
+is bigger than the lsn we are able to scan up to, that is an indication that
+the recovery failed and the database may be corrupt.
*/
+static lsn_t recv_max_page_lsn;
+
+/** Stored physical log record */
+struct log_phys_t : public log_rec_t
+{
+  /** start LSN of the mini-transaction (not necessarily of this record) */
+  const lsn_t start_lsn;
+private:
+  /** @return the start of length and data */
+  const byte *start() const
+  {
+    return my_assume_aligned<sizeof(size_t)>
+      (reinterpret_cast<const byte*>(&start_lsn + 1));
+  }
+  /** @return the start of length and data */
+  byte *start()
+  { return const_cast<byte*>(const_cast<const log_phys_t*>(this)->start()); }
+  /** @return the length of the following record */
+  uint16_t len() const { uint16_t i; memcpy(&i, start(), 2); return i; }
+
+  /** @return start of the log records */
+  byte *begin() { return start() + 2; }
+  /** @return end of the log records */
+  byte *end() { byte *e= begin() + len(); ut_ad(!*e); return e; }
+public:
+  /** @return start of the log records */
+  const byte *begin() const { return const_cast<log_phys_t*>(this)->begin(); }
+  /** @return end of the log records */
+  const byte *end() const { return const_cast<log_phys_t*>(this)->end(); }
+
+  /** Determine the allocated size of the object.
+  @param len length of recs, excluding terminating NUL byte
+  @return the total allocation size */
+  static inline size_t alloc_size(size_t len);
+
+  /** Constructor.
+  @param start_lsn start LSN of the mini-transaction
+  @param lsn mtr_t::commit_lsn() of the mini-transaction
+  @param recs the first log record for the page in the mini-transaction
+  @param size length of recs, in bytes, excluding terminating NUL byte */
+  log_phys_t(lsn_t start_lsn, lsn_t lsn, const byte *recs, size_t size) :
+    log_rec_t(lsn), start_lsn(start_lsn)
+  {
+    ut_ad(start_lsn);
+    ut_ad(start_lsn < lsn);
+    const uint16_t len= static_cast<uint16_t>(size);
+    ut_ad(len == size);
+    memcpy(start(), &len, 2);
+    reinterpret_cast<byte*>(memcpy(begin(), recs, size))[size]= 0;
+  }
+
+  /** Append a record to the log.
+  @param recs log to append
+  @param size size of the log, in bytes */
+  void append(const byte *recs, size_t size)
+  {
+    ut_ad(start_lsn < lsn);
+    uint16_t l= len();
+    reinterpret_cast<byte*>(memcpy(end(), recs, size))[size]= 0;
+    l= static_cast<uint16_t>(l + size);
+    memcpy(start(), &l, 2);
+  }
+
+  /** Apply an UNDO_APPEND record.
+  @see mtr_t::undo_append()
+  @param block undo log page
+  @param data undo log record
+  @param len length of the undo log record
+  @return whether the operation failed (inconsistency was noticed) */
+  static bool undo_append(const buf_block_t &block, const byte *data,
+                          size_t len)
+  {
+    ut_ad(len > 2);
+    byte *free_p= my_assume_aligned<2>
+      (TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE + block.page.frame);
+    const uint16_t free= mach_read_from_2(free_p);
+    if (UNIV_UNLIKELY(free < TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE ||
+                      free + len + 6 >= srv_page_size - FIL_PAGE_DATA_END))
+    {
+      ib::error() << "Not applying UNDO_APPEND due to corruption on "
+                  << block.page.id();
+      return true;
+    }
+
+    byte *p= block.page.frame + free;
+    mach_write_to_2(free_p, free + 4 + len);
+    memcpy(p, free_p, 2);
+    p+= 2;
+    memcpy(p, data, len);
+    p+= len;
+    mach_write_to_2(p, free);
+    return false;
+  }
+
+  /** Check an OPT_PAGE_CHECKSUM record.
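// How a log_phys_t is laid out in memory, reduced to a stand-alone model:
// a fixed header is followed by a 2-byte native-endian length, the record
// bytes, and a terminating NUL, which is exactly what alloc_size() budgets
// for (len + 1 + 2 + sizeof header). All names here are invented stand-ins.
#include <cstdint>
#include <cstdlib>
#include <cstring>

struct phys_rec_model
{
  uint64_t lsn;        // ~ log_rec_t::lsn, the mtr commit LSN
  uint64_t start_lsn;  // ~ log_phys_t::start_lsn

  static size_t alloc_size(size_t len)
  { return len + (1 + 2 + sizeof(phys_rec_model)); }

  unsigned char *payload() { return reinterpret_cast<unsigned char*>(this + 1); }
};

// The caller owns the returned buffer and must free() it.
static phys_rec_model *make(uint64_t start_lsn, uint64_t lsn,
                            const unsigned char *recs, uint16_t size)
{
  phys_rec_model *r=
    static_cast<phys_rec_model*>(malloc(phys_rec_model::alloc_size(size)));
  r->lsn= lsn;
  r->start_lsn= start_lsn;
  memcpy(r->payload(), &size, 2);        // 2-byte length prefix
  memcpy(r->payload() + 2, recs, size);  // the record bytes themselves
  r->payload()[2 + size]= 0;             // terminating NUL, checked by end()
  return r;
}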
+ @see mtr_t::page_checksum() + @param block buffer page + @param l pointer to checksum + @return whether an unrecoverable mismatch was found */ + static bool page_checksum(const buf_block_t &block, const byte *l) + { + size_t size; + const byte *page= block.page.zip.data; + if (UNIV_LIKELY_NULL(page)) + size= (UNIV_ZIP_SIZE_MIN >> 1) << block.page.zip.ssize; + else + { + page= block.page.frame; + size= srv_page_size; + } + if (UNIV_LIKELY(my_crc32c(my_crc32c(my_crc32c(0, page + FIL_PAGE_OFFSET, + FIL_PAGE_LSN - + FIL_PAGE_OFFSET), + page + FIL_PAGE_TYPE, 2), + page + FIL_PAGE_SPACE_ID, + size - (FIL_PAGE_SPACE_ID + 8)) == + mach_read_from_4(l))) + return false; + + ib::error() << "OPT_PAGE_CHECKSUM mismatch on " << block.page.id(); + return !srv_force_recovery; + } + + /** The status of apply() */ + enum apply_status { + /** The page was not affected */ + APPLIED_NO= 0, + /** The page was modified */ + APPLIED_YES, + /** The page was modified, affecting the encryption parameters */ + APPLIED_TO_ENCRYPTION, + /** The page was modified, affecting the tablespace header */ + APPLIED_TO_FSP_HEADER, + /** The page was found to be corrupted */ + APPLIED_CORRUPTED, + }; + + /** Apply log to a page frame. + @param[in,out] block buffer block + @param[in,out] last_offset last byte offset, for same_page records + @return whether any log was applied to the page */ + apply_status apply(const buf_block_t &block, uint16_t &last_offset) const + { + const byte * const recs= begin(); + byte *const frame= block.page.zip.data + ? block.page.zip.data : block.page.frame; + const size_t size= block.physical_size(); + apply_status applied= APPLIED_NO; + + for (const byte *l= recs;;) + { + const byte b= *l++; + if (!b) + return applied; + ut_ad((b & 0x70) != RESERVED); + size_t rlen= b & 0xf; + if (!rlen) + { + const size_t lenlen= mlog_decode_varint_length(*l); + const uint32_t addlen= mlog_decode_varint(l); + ut_ad(addlen != MLOG_DECODE_ERROR); + rlen= addlen + 15 - lenlen; + l+= lenlen; + } + if (!(b & 0x80)) + { + /* Skip the page identifier. It has already been validated. 
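// The checksum verified above chains CRC-32C over three byte ranges so that
// the FIL_PAGE_LSN field (offset 16) and the trailing 8 checksum/LSN bytes
// are excluded. A stand-alone model using the standard InnoDB page offsets
// (FIL_PAGE_OFFSET=4, FIL_PAGE_TYPE=24, FIL_PAGE_SPACE_ID=34) and a plain
// bitwise CRC-32C in place of my_crc32c():
#include <cstddef>
#include <cstdint>

static uint32_t crc32c(uint32_t crc, const unsigned char *p, size_t n)
{
  crc= ~crc;
  while (n--)
  {
    crc^= *p++;
    for (int i= 0; i < 8; i++)
      crc= (crc >> 1) ^ (0x82f63b78U & (0U - (crc & 1U)));
  }
  return ~crc;
}

static uint32_t opt_page_checksum(const unsigned char *page, size_t size)
{
  uint32_t c= crc32c(0, page + 4, 16 - 4);      // page number, prev, next
  c= crc32c(c, page + 24, 2);                   // FIL_PAGE_TYPE
  return crc32c(c, page + 34, size - (34 + 8)); // body, minus the last 8 bytes
}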
*/ + size_t idlen= mlog_decode_varint_length(*l); + ut_ad(idlen <= 5); + ut_ad(idlen < rlen); + ut_ad(mlog_decode_varint(l) == block.page.id().space()); + l+= idlen; + rlen-= idlen; + idlen= mlog_decode_varint_length(*l); + ut_ad(idlen <= 5); + ut_ad(idlen <= rlen); + ut_ad(mlog_decode_varint(l) == block.page.id().page_no()); + l+= idlen; + rlen-= idlen; + last_offset= 0; + } + + switch (b & 0x70) { + case FREE_PAGE: + ut_ad(last_offset == 0); + goto next_not_same_page; + case INIT_PAGE: + if (UNIV_LIKELY(rlen == 0)) + { + memset_aligned(frame, 0, size); + mach_write_to_4(frame + FIL_PAGE_OFFSET, block.page.id().page_no()); + memset_aligned<8>(FIL_PAGE_PREV + frame, 0xff, 8); + mach_write_to_4(frame + FIL_PAGE_SPACE_ID, block.page.id().space()); + last_offset= FIL_PAGE_TYPE; + next_after_applying: + if (applied == APPLIED_NO) + applied= APPLIED_YES; + } + else + { + record_corrupted: + if (!srv_force_recovery) + { + recv_sys.set_corrupt_log(); + return applied; + } + next_not_same_page: + last_offset= 1; /* the next record must not be same_page */ + } + l+= rlen; + continue; + case OPTION: + ut_ad(rlen == 5); + ut_ad(*l == OPT_PAGE_CHECKSUM); + if (page_checksum(block, l + 1)) + { +page_corrupted: + sql_print_error("InnoDB: Set innodb_force_recovery=1" + " to ignore corruption."); + return APPLIED_CORRUPTED; + } + goto next_after_applying; + } + + ut_ad(mach_read_from_4(frame + FIL_PAGE_OFFSET) == + block.page.id().page_no()); + ut_ad(mach_read_from_4(frame + FIL_PAGE_SPACE_ID) == + block.page.id().space()); + ut_ad(last_offset <= 1 || last_offset > 8); + ut_ad(last_offset <= size); + + switch (b & 0x70) { + case EXTENDED: + if (UNIV_UNLIKELY(block.page.id().page_no() < 3 || + block.page.zip.ssize)) + goto record_corrupted; + static_assert(INIT_ROW_FORMAT_REDUNDANT == 0, "compatiblity"); + static_assert(INIT_ROW_FORMAT_DYNAMIC == 1, "compatibility"); + if (UNIV_UNLIKELY(!rlen)) + goto record_corrupted; + switch (const byte subtype= *l) { + uint8_t ll; + size_t prev_rec, hdr_size; + default: + goto record_corrupted; + case INIT_ROW_FORMAT_REDUNDANT: + case INIT_ROW_FORMAT_DYNAMIC: + if (UNIV_UNLIKELY(rlen != 1)) + goto record_corrupted; + page_create_low(&block, *l != INIT_ROW_FORMAT_REDUNDANT); + break; + case UNDO_INIT: + if (UNIV_UNLIKELY(rlen != 1)) + goto record_corrupted; + trx_undo_page_init(block); + break; + case UNDO_APPEND: + if (UNIV_UNLIKELY(rlen <= 3)) + goto record_corrupted; + if (undo_append(block, ++l, --rlen) && !srv_force_recovery) + goto page_corrupted; + break; + case INSERT_HEAP_REDUNDANT: + case INSERT_REUSE_REDUNDANT: + case INSERT_HEAP_DYNAMIC: + case INSERT_REUSE_DYNAMIC: + if (UNIV_UNLIKELY(rlen < 2)) + goto record_corrupted; + rlen--; + ll= mlog_decode_varint_length(*++l); + if (UNIV_UNLIKELY(ll > 3 || ll >= rlen)) + goto record_corrupted; + prev_rec= mlog_decode_varint(l); + ut_ad(prev_rec != MLOG_DECODE_ERROR); + rlen-= ll; + l+= ll; + ll= mlog_decode_varint_length(*l); + static_assert(INSERT_HEAP_REDUNDANT == 4, "compatibility"); + static_assert(INSERT_REUSE_REDUNDANT == 5, "compatibility"); + static_assert(INSERT_HEAP_DYNAMIC == 6, "compatibility"); + static_assert(INSERT_REUSE_DYNAMIC == 7, "compatibility"); + if (subtype & 2) + { + size_t shift= 0; + if (subtype & 1) + { + if (UNIV_UNLIKELY(ll > 3 || ll >= rlen)) + goto record_corrupted; + shift= mlog_decode_varint(l); + ut_ad(shift != MLOG_DECODE_ERROR); + rlen-= ll; + l+= ll; + ll= mlog_decode_varint_length(*l); + } + if (UNIV_UNLIKELY(ll > 3 || ll >= rlen)) + goto record_corrupted; + size_t enc_hdr_l= 
mlog_decode_varint(l); + ut_ad(enc_hdr_l != MLOG_DECODE_ERROR); + rlen-= ll; + l+= ll; + ll= mlog_decode_varint_length(*l); + if (UNIV_UNLIKELY(ll > 2 || ll >= rlen)) + goto record_corrupted; + size_t hdr_c= mlog_decode_varint(l); + ut_ad(hdr_c != MLOG_DECODE_ERROR); + rlen-= ll; + l+= ll; + ll= mlog_decode_varint_length(*l); + if (UNIV_UNLIKELY(ll > 3 || ll > rlen)) + goto record_corrupted; + size_t data_c= mlog_decode_varint(l); + ut_ad(data_c != MLOG_DECODE_ERROR); + rlen-= ll; + l+= ll; + if (page_apply_insert_dynamic(block, subtype & 1, prev_rec, + shift, enc_hdr_l, hdr_c, data_c, + l, rlen) && !srv_force_recovery) + goto page_corrupted; + } + else + { + if (UNIV_UNLIKELY(ll > 2 || ll >= rlen)) + goto record_corrupted; + size_t header= mlog_decode_varint(l); + ut_ad(header != MLOG_DECODE_ERROR); + rlen-= ll; + l+= ll; + ll= mlog_decode_varint_length(*l); + if (UNIV_UNLIKELY(ll > 2 || ll >= rlen)) + goto record_corrupted; + size_t hdr_c= mlog_decode_varint(l); + ut_ad(hdr_c != MLOG_DECODE_ERROR); + rlen-= ll; + l+= ll; + ll= mlog_decode_varint_length(*l); + if (UNIV_UNLIKELY(ll > 2 || ll > rlen)) + goto record_corrupted; + size_t data_c= mlog_decode_varint(l); + rlen-= ll; + l+= ll; + if (page_apply_insert_redundant(block, subtype & 1, prev_rec, + header, hdr_c, data_c, + l, rlen) && !srv_force_recovery) + goto page_corrupted; + } + break; + case DELETE_ROW_FORMAT_REDUNDANT: + if (UNIV_UNLIKELY(rlen < 2 || rlen > 4)) + goto record_corrupted; + rlen--; + ll= mlog_decode_varint_length(*++l); + if (UNIV_UNLIKELY(ll != rlen)) + goto record_corrupted; + if (page_apply_delete_redundant(block, mlog_decode_varint(l)) && + !srv_force_recovery) + goto page_corrupted; + break; + case DELETE_ROW_FORMAT_DYNAMIC: + if (UNIV_UNLIKELY(rlen < 2)) + goto record_corrupted; + rlen--; + ll= mlog_decode_varint_length(*++l); + if (UNIV_UNLIKELY(ll > 3 || ll >= rlen)) + goto record_corrupted; + prev_rec= mlog_decode_varint(l); + ut_ad(prev_rec != MLOG_DECODE_ERROR); + rlen-= ll; + l+= ll; + ll= mlog_decode_varint_length(*l); + if (UNIV_UNLIKELY(ll > 2 || ll >= rlen)) + goto record_corrupted; + hdr_size= mlog_decode_varint(l); + ut_ad(hdr_size != MLOG_DECODE_ERROR); + rlen-= ll; + l+= ll; + ll= mlog_decode_varint_length(*l); + if (UNIV_UNLIKELY(ll > 3 || ll != rlen)) + goto record_corrupted; + if (page_apply_delete_dynamic(block, prev_rec, hdr_size, + mlog_decode_varint(l)) && + !srv_force_recovery) + goto page_corrupted; + break; + } + last_offset= FIL_PAGE_TYPE; + goto next_after_applying; + case WRITE: + case MEMSET: + case MEMMOVE: + if (UNIV_UNLIKELY(last_offset == 1)) + goto record_corrupted; + const size_t olen= mlog_decode_varint_length(*l); + if (UNIV_UNLIKELY(olen >= rlen) || UNIV_UNLIKELY(olen > 3)) + goto record_corrupted; + const uint32_t offset= mlog_decode_varint(l); + ut_ad(offset != MLOG_DECODE_ERROR); + static_assert(FIL_PAGE_OFFSET == 4, "compatibility"); + if (UNIV_UNLIKELY(offset >= size)) + goto record_corrupted; + if (UNIV_UNLIKELY(offset + last_offset < 8 || + offset + last_offset >= size)) + goto record_corrupted; + last_offset= static_cast(last_offset + offset); + l+= olen; + rlen-= olen; + size_t llen= rlen; + if ((b & 0x70) == WRITE) + { + if (UNIV_UNLIKELY(rlen + last_offset > size)) + goto record_corrupted; + memcpy(frame + last_offset, l, llen); + if (UNIV_LIKELY(block.page.id().page_no())); + else if (llen == 11 + MY_AES_BLOCK_SIZE && + last_offset == FSP_HEADER_OFFSET + MAGIC_SZ + + fsp_header_get_encryption_offset(block.zip_size())) + applied= APPLIED_TO_ENCRYPTION; + else if 
(last_offset < FSP_HEADER_OFFSET + FSP_FREE + FLST_LEN + 4 && + last_offset + llen >= FSP_HEADER_OFFSET + FSP_SIZE) + applied= APPLIED_TO_FSP_HEADER; + next_after_applying_write: + ut_ad(llen + last_offset <= size); + last_offset= static_cast(last_offset + llen); + goto next_after_applying; + } + llen= mlog_decode_varint_length(*l); + if (UNIV_UNLIKELY(llen > rlen || llen > 3)) + goto record_corrupted; + const uint32_t len= mlog_decode_varint(l); + ut_ad(len != MLOG_DECODE_ERROR); + if (UNIV_UNLIKELY(len + last_offset > size)) + goto record_corrupted; + l+= llen; + rlen-= llen; + llen= len; + if ((b & 0x70) == MEMSET) + { + ut_ad(rlen <= llen); + if (UNIV_UNLIKELY(rlen != 1)) + { + size_t s; + for (s= 0; s < llen; s+= rlen) + memcpy(frame + last_offset + s, l, rlen); + memcpy(frame + last_offset + s, l, llen - s); + } + else + memset(frame + last_offset, *l, llen); + goto next_after_applying_write; + } + const size_t slen= mlog_decode_varint_length(*l); + if (UNIV_UNLIKELY(slen != rlen || slen > 3)) + goto record_corrupted; + uint32_t s= mlog_decode_varint(l); + ut_ad(slen != MLOG_DECODE_ERROR); + if (s & 1) + s= last_offset - (s >> 1) - 1; + else + s= last_offset + (s >> 1) + 1; + if (UNIV_LIKELY(s >= 8 && s + llen <= size)) + { + memmove(frame + last_offset, frame + s, llen); + goto next_after_applying_write; + } + } + goto record_corrupted; + } + } +}; + + +inline size_t log_phys_t::alloc_size(size_t len) +{ + return len + (1 + 2 + sizeof(log_phys_t)); +} + + +/** Tablespace item during recovery */ +struct file_name_t { + /** Tablespace file name (FILE_MODIFY) */ + std::string name; + /** Tablespace object (NULL if not valid or not found) */ + fil_space_t* space = nullptr; + + /** Tablespace status. */ + enum fil_status { + /** Normal tablespace */ + NORMAL, + /** Deleted tablespace */ + DELETED, + /** Missing tablespace */ + MISSING + }; + + /** Status of the tablespace */ + fil_status status; + + /** FSP_SIZE of tablespace */ + uint32_t size = 0; + + /** Freed pages of tablespace */ + range_set freed_ranges; + + /** Dummy flags before they have been read from the .ibd file */ + static constexpr uint32_t initial_flags = FSP_FLAGS_FCRC32_MASK_MARKER; + /** FSP_SPACE_FLAGS of tablespace */ + uint32_t flags = initial_flags; + + /** Constructor */ + file_name_t(std::string name_, bool deleted) + : name(std::move(name_)), status(deleted ? 
DELETED: NORMAL) {}
+
+  /** Add the freed pages */
+  void add_freed_page(uint32_t page_no) { freed_ranges.add_value(page_no); }
+
+  /** Remove the freed pages */
+  void remove_freed_page(uint32_t page_no)
+  {
+    if (freed_ranges.empty()) return;
+    freed_ranges.remove_value(page_no);
+  }
+};
+
+/** Map of dirty tablespaces during recovery */
+typedef std::map<
+	uint32_t,
+	file_name_t,
+	std::less<uint32_t>,
+	ut_allocator<std::pair<const uint32_t, file_name_t> > > recv_spaces_t;
+
+static recv_spaces_t recv_spaces;
+
+/** The last parsed FILE_RENAME records */
+static std::map<uint32_t, std::string> renamed_spaces;
+
+/** Files for which fil_ibd_load() returned FIL_LOAD_DEFER */
+static struct
+{
+  /** Maintains the last opened defer file name along with lsn */
+  struct item
+  {
+    /** Log sequence number of latest add() called by fil_name_process() */
+    lsn_t lsn;
+    /** File name from the FILE_ record */
+    std::string file_name;
+    /** whether a FILE_DELETE record was encountered */
+    mutable bool deleted;
+  };
+
+  using map= std::map<uint32_t, item, std::less<uint32_t>,
+                      ut_allocator<std::pair<const uint32_t, item> > >;
+
+  /** Map of defer tablespaces */
+  map defers;
+
+  /** Add the deferred space only if it is the latest one
+  @param space space identifier
+  @param f_name file name
+  @param lsn log sequence number of the FILE_ record */
+  void add(uint32_t space, const std::string &f_name, lsn_t lsn)
+  {
+    mysql_mutex_assert_owner(&recv_sys.mutex);
+    const char *filename= f_name.c_str();
+
+    if (srv_operation == SRV_OPERATION_RESTORE)
+    {
+      /* Replace absolute DATA DIRECTORY file paths with
+      short names relative to the backup directory. */
+      if (const char *name= strrchr(filename, '/'))
+      {
+        while (--name > filename && *name != '/');
+        if (name > filename)
+          filename= name + 1;
+      }
+    }
+
+    char *fil_path= fil_make_filepath(nullptr, {filename, strlen(filename)},
+                                      IBD, false);
+    const item defer{lsn, fil_path, false};
+    ut_free(fil_path);
+
+    /* The file name must be unique. Keep the one with the latest LSN. */
+    auto d= defers.begin();
+
+    while (d != defers.end())
+    {
+      if (d->second.file_name != defer.file_name)
+        ++d;
+      else if (d->first == space)
+      {
+        /* Neither the file name nor the tablespace ID changed.
+        Update the LSN if needed. */
+        if (d->second.lsn < lsn)
+          d->second.lsn= lsn;
+        return;
+      }
+      else if (d->second.lsn < lsn)
+      {
+        /* Reset the old tablespace name in recovered spaces list */
+        recv_spaces_t::iterator it{recv_spaces.find(d->first)};
+        if (it != recv_spaces.end() &&
+            it->second.name == d->second.file_name)
+          it->second.name = "";
+        defers.erase(d++);
+      }
+      else
+      {
+        ut_ad(d->second.lsn != lsn);
+        return; /* A later tablespace already has this name. */
+      }
+    }
+
+    auto p= defers.emplace(space, defer);
+    if (!p.second && p.first->second.lsn <= lsn)
+    {
+      p.first->second.lsn= lsn;
+      p.first->second.file_name= defer.file_name;
+    }
+    /* Add the newly added deferred space and change the file name */
+    recv_spaces_t::iterator it{recv_spaces.find(space)};
+    if (it != recv_spaces.end())
+      it->second.name = defer.file_name;
+  }
+
+  void remove(uint32_t space)
+  {
+    mysql_mutex_assert_owner(&recv_sys.mutex);
+    defers.erase(space);
+  }
+
+  /** Look up a tablespace that was found corrupted during recovery.
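// The conflict handling in add() above boils down to "one entry per file
// name, newest LSN wins". A stand-alone sketch of that rule (invented
// names; the real code also renames entries in recv_spaces):
#include <cstdint>
#include <map>
#include <string>

struct defer_entry { uint64_t lsn; std::string file_name; };
static std::map<uint32_t, defer_entry> deferred;

static void add_deferred(uint32_t space_id, const std::string &name,
                         uint64_t lsn)
{
  auto p= deferred.try_emplace(space_id, defer_entry{lsn, name});
  if (!p.second && p.first->second.lsn <= lsn)
    p.first->second= defer_entry{lsn, name}; // a newer FILE_ record wins
}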
+ @param id tablespace id + @return tablespace whose creation was deferred + @retval nullptr if no such tablespace was found */ + item *find(uint32_t id) + { + mysql_mutex_assert_owner(&recv_sys.mutex); + auto it= defers.find(id); + if (it != defers.end()) + return &it->second; + return nullptr; + } + + void clear() + { + mysql_mutex_assert_owner(&recv_sys.mutex); + defers.clear(); + } + + /** Initialize all deferred tablespaces. + @return whether any deferred initialization failed */ + bool reinit_all() + { +retry: + log_sys.latch.wr_unlock(); + fil_space_t *space= fil_system.sys_space; + buf_block_t *free_block= buf_LRU_get_free_block(false); + log_sys.latch.wr_lock(SRW_LOCK_CALL); + mysql_mutex_lock(&recv_sys.mutex); + + for (auto d= defers.begin(); d != defers.end(); ) + { + const uint32_t space_id{d->first}; + recv_sys_t::map::iterator p{recv_sys.pages.lower_bound({space_id,0})}; + + if (d->second.deleted || + p == recv_sys.pages.end() || p->first.space() != space_id) + { + /* We found a FILE_DELETE record for the tablespace, or + there were no buffered records. Either way, we must create a + dummy tablespace with the latest known name, + for dict_drop_index_tree(). */ + recv_sys.pages_it_invalidate(space_id); + while (p != recv_sys.pages.end() && p->first.space() == space_id) + { + ut_ad(!p->second.being_processed); + recv_sys_t::map::iterator r= p++; + recv_sys.erase(r); + } + recv_spaces_t::iterator it{recv_spaces.find(space_id)}; + if (it != recv_spaces.end()) + { + const std::string *name= &d->second.file_name; + if (d->second.deleted) + { + const auto r= renamed_spaces.find(space_id); + if (r != renamed_spaces.end()) + name= &r->second; + bool exists; + os_file_type_t ftype; + if (!os_file_status(name->c_str(), &exists, &ftype) || !exists) + goto processed; + } + if (create(it, *name, static_cast + (1U << FSP_FLAGS_FCRC32_POS_MARKER | + FSP_FLAGS_FCRC32_PAGE_SSIZE()), nullptr, 0)) + mysql_mutex_unlock(&fil_system.mutex); + } + } + else + space= recv_sys.recover_deferred(p, d->second.file_name, free_block); +processed: + auto e= d++; + defers.erase(e); + if (!space) + break; + if (space != fil_system.sys_space) + space->release(); + if (free_block) + continue; + mysql_mutex_unlock(&recv_sys.mutex); + goto retry; + } + + clear(); + mysql_mutex_unlock(&recv_sys.mutex); + if (free_block) + buf_pool.free_block(free_block); + return !space; + } + + /** Create tablespace metadata for a data file that was initially + found corrupted during recovery. 
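// reinit_all() above acquires its spare buffer block while no latch is
// held, re-acquires the latches, and restarts the whole pass whenever the
// spare was consumed. That allocate-outside-the-latch retry shape, as a
// stand-alone toy (all names invented):
#include <mutex>
#include <vector>

static std::mutex latch;                 // ~ log_sys.latch / recv_sys.mutex
static std::vector<int> work{1, 2, 3};   // ~ deferred tablespaces

static int *get_spare() { return new int(0); } // may sleep; called latch-free

int main()
{
  for (;;)
  {
    int *spare= get_spare();             // allocate before locking
    std::lock_guard<std::mutex> g(latch);
    while (!work.empty() && spare)
    {
      work.pop_back();                   // process one deferred item
      delete spare;                      // pretend the item consumed it
      spare= nullptr;
    }
    if (work.empty())
    {
      delete spare;
      return 0;
    }
    // spare ran out with work remaining: drop the latch and retry
  }
}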
+ @param it tablespace iterator + @param name latest file name + @param flags FSP_SPACE_FLAGS + @param crypt_data encryption metadata + @param size tablespace size in pages + @return tablespace; the caller must release fil_system.mutex + @retval nullptr if crypt_data is invalid */ + static fil_space_t *create(const recv_spaces_t::const_iterator &it, + const std::string &name, uint32_t flags, + fil_space_crypt_t *crypt_data, uint32_t size) + { + if (crypt_data && !fil_crypt_check(crypt_data, name.c_str())) + return nullptr; + mysql_mutex_lock(&fil_system.mutex); + fil_space_t *space= fil_space_t::create(it->first, flags, + FIL_TYPE_TABLESPACE, crypt_data); + ut_ad(space); + const char *filename= name.c_str(); + if (srv_operation == SRV_OPERATION_RESTORE) + { + if (const char *tbl_name= strrchr(filename, '/')) + { + while (--tbl_name > filename && *tbl_name != '/'); + if (tbl_name > filename) + filename= tbl_name + 1; + } + } + space->add(filename, OS_FILE_CLOSED, size, false, false); + space->recv_size= it->second.size; + space->size_in_header= size; + return space; + } + + /** Attempt to recover pages from the doublewrite buffer. + This is invoked if we found neither a valid first page in the + data file nor redo log records that would initialize the first + page. */ + void deferred_dblwr() + { + for (auto d= defers.begin(); d != defers.end(); ) + { + if (d->second.deleted) + { + next_item: + d++; + continue; + } + const page_id_t page_id{d->first, 0}; + const byte *page= recv_sys.dblwr.find_page(page_id); + if (!page) + goto next_item; + const uint32_t space_id= mach_read_from_4(page + FIL_PAGE_SPACE_ID); + const uint32_t flags= fsp_header_get_flags(page); + const uint32_t page_no= mach_read_from_4(page + FIL_PAGE_OFFSET); + const uint32_t size= fsp_header_get_field(page, FSP_SIZE); + + if (page_no == 0 && space_id == d->first && size >= 4 && + fil_space_t::is_valid_flags(flags, space_id) && + fil_space_t::logical_size(flags) == srv_page_size) + { + recv_spaces_t::iterator it {recv_spaces.find(d->first)}; + ut_ad(it != recv_spaces.end()); + + fil_space_t *space= create( + it, d->second.file_name.c_str(), flags, + fil_space_read_crypt_data(fil_space_t::zip_size(flags), page), + size); + + if (!space) + goto next_item; + + space->free_limit= fsp_header_get_field(page, FSP_FREE_LIMIT); + space->free_len= flst_get_len(FSP_HEADER_OFFSET + FSP_FREE + page); + fil_node_t *node= UT_LIST_GET_FIRST(space->chain); + mysql_mutex_unlock(&fil_system.mutex); + if (!space->acquire()) + { +free_space: + fil_space_free(it->first, false); + goto next_item; + } + if (os_file_write(IORequestWrite, node->name, node->handle, + page, 0, fil_space_t::physical_size(flags)) != + DB_SUCCESS) + { + space->release(); + goto free_space; + } + space->release(); + it->second.space= space; + defers.erase(d++); + continue; + } + goto next_item; + } + } +} +deferred_spaces; + +/** Report an operation to create, delete, or rename a file during backup. +@param[in] space_id tablespace identifier +@param[in] type redo log type +@param[in] name file name (not NUL-terminated) +@param[in] len length of name, in bytes +@param[in] new_name new file name (NULL if not rename) +@param[in] new_len length of new_name, in bytes (0 if NULL) */ +void (*log_file_op)(uint32_t space_id, int type, + const byte* name, ulint len, + const byte* new_name, ulint new_len); + +void (*undo_space_trunc)(uint32_t space_id); + +void (*first_page_init)(uint32_t space_id); + +/** Information about initializing page contents during redo log processing. 
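// deferred_dblwr() above trusts a doublewrite copy of page 0 only after a
// set of sanity checks. A stand-alone sketch of the cheap field checks,
// using the standard page offsets (FIL_PAGE_OFFSET=4, FIL_PAGE_SPACE_ID=34,
// FSP_HEADER_OFFSET+FSP_SIZE=38+8); read_u32() mimics mach_read_from_4():
#include <cstdint>

static uint32_t read_u32(const unsigned char *p)
{
  return uint32_t{p[0]} << 24 | uint32_t{p[1]} << 16 |
         uint32_t{p[2]} << 8 | p[3];
}

static bool first_page_plausible(const unsigned char *page,
                                 uint32_t expected_space)
{
  return read_u32(page + 4) == 0 &&                // must be page number 0
         read_u32(page + 34) == expected_space &&  // FIL_PAGE_SPACE_ID matches
         read_u32(page + 46) >= 4;                 // FSP_SIZE: at least 4 pages
}
// (The real code additionally validates FSP_SPACE_FLAGS and the page size.)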
+FIXME: Rely on recv_sys.pages! */
+class mlog_init_t
+{
+  using map= std::map<const page_id_t, recv_init,
+                      std::less<const page_id_t>,
+                      ut_allocator<std::pair<const page_id_t, recv_init>>>;
+  /** Map of page initialization operations.
+  FIXME: Merge this to recv_sys.pages! */
+  map inits;
+
+  /** Iterator to the last add() or will_avoid_read(), for speeding up
+  will_avoid_read(). */
+  map::iterator i;
+public:
+  /** Constructor */
+  mlog_init_t() : i(inits.end()) {}
+
+  /** Record that a page will be initialized by the redo log.
+  @param page_id page identifier
+  @param lsn log sequence number
+  @return whether the state was changed */
+  bool add(const page_id_t page_id, lsn_t lsn)
+  {
+    mysql_mutex_assert_owner(&recv_sys.mutex);
+    const recv_init init = { lsn, false };
+    std::pair<map::iterator, bool> p=
+      inits.insert(map::value_type(page_id, init));
+    ut_ad(!p.first->second.created);
+    if (p.second) return true;
+    if (p.first->second.lsn >= lsn) return false;
+    p.first->second = init;
+    i = p.first;
+    return true;
+  }
+
+  /** Get the last stored lsn of the page id and its respective
+  init/load operation.
+  @param page_id page identifier
+  @return the latest page initialization;
+  not valid after releasing recv_sys.mutex. */
+  recv_init &last(page_id_t page_id)
+  {
+    mysql_mutex_assert_owner(&recv_sys.mutex);
+    return inits.find(page_id)->second;
+  }
+
+  /** Determine if a page will be initialized or freed after a time.
+  @param page_id page identifier
+  @param lsn log sequence number
+  @return whether page_id will be freed or initialized after lsn */
+  bool will_avoid_read(page_id_t page_id, lsn_t lsn)
+  {
+    mysql_mutex_assert_owner(&recv_sys.mutex);
+    if (i != inits.end() && i->first == page_id)
+      return i->second.lsn > lsn;
+    i = inits.lower_bound(page_id);
+    return i != inits.end() && i->first == page_id && i->second.lsn > lsn;
+  }
+
+  /** At the end of each recovery batch, reset the 'created' flags. */
+  void reset()
+  {
+    mysql_mutex_assert_owner(&recv_sys.mutex);
+    ut_ad(recv_no_ibuf_operations);
+    for (map::value_type &i : inits)
+      i.second.created= false;
+  }
+
+  /** During the last recovery batch, mark whether there exist
+  buffered changes for the pages that were initialized
+  by buf_page_create() and still reside in the buffer pool.
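// will_avoid_read() above caches the iterator of the previous hit so that
// repeated queries for the same page skip the O(log n) lower_bound(). The
// same memoization, as a stand-alone sketch (invented names):
#include <cstdint>
#include <map>

struct init_map_model
{
  std::map<uint32_t, uint64_t> inits;              // page -> init LSN
  std::map<uint32_t, uint64_t>::iterator i= inits.end();

  bool will_avoid_read(uint32_t page, uint64_t lsn)
  {
    if (i != inits.end() && i->first == page)
      return i->second > lsn;                      // fast path: cached hit
    i= inits.lower_bound(page);
    return i != inits.end() && i->first == page && i->second > lsn;
  }
};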
*/ + void mark_ibuf_exist() + { + mysql_mutex_assert_owner(&recv_sys.mutex); + + for (const map::value_type &i : inits) + if (i.second.created) + { + auto &chain= buf_pool.page_hash.cell_get(i.first.fold()); + page_hash_latch &hash_lock= buf_pool.page_hash.lock_get(chain); + + hash_lock.lock_shared(); + buf_block_t *block= reinterpret_cast + (buf_pool.page_hash.get(i.first, chain)); + bool got_latch= block && block->page.lock.x_lock_try(); + hash_lock.unlock_shared(); + + if (!block) + continue; + + uint32_t state; + + if (!got_latch) + { + mysql_mutex_lock(&buf_pool.mutex); + block= reinterpret_cast + (buf_pool.page_hash.get(i.first, chain)); + if (!block) + { + mysql_mutex_unlock(&buf_pool.mutex); + continue; + } + + state= block->page.fix(); + mysql_mutex_unlock(&buf_pool.mutex); + if (state < buf_page_t::UNFIXED) + { + block->page.unfix(); + continue; + } + block->page.lock.x_lock(); + state= block->page.unfix(); + ut_ad(state < buf_page_t::READ_FIX); + if (state >= buf_page_t::UNFIXED && block->page.id() == i.first) + goto check_ibuf; + } + else + { + state= block->page.state(); + ut_ad(state >= buf_page_t::FREED); + ut_ad(state < buf_page_t::READ_FIX); + + if (state >= buf_page_t::UNFIXED) + { + check_ibuf: + mysql_mutex_unlock(&recv_sys.mutex); + if (ibuf_page_exists(block->page.id(), block->zip_size())) + block->page.set_ibuf_exist(); + mysql_mutex_lock(&recv_sys.mutex); + } + } + + block->page.lock.x_unlock(); + } + } + + /** Clear the data structure */ + void clear() { inits.clear(); i = inits.end(); } +}; + +static mlog_init_t mlog_init; + +/** Try to recover a tablespace that was not readable earlier +@param p iterator to the page +@param name tablespace file name +@param free_block spare buffer block +@return recovered tablespace +@retval nullptr if recovery failed */ +fil_space_t *recv_sys_t::recover_deferred(const recv_sys_t::map::iterator &p, + const std::string &name, + buf_block_t *&free_block) +{ + mysql_mutex_assert_owner(&mutex); + + ut_ad(p->first.space()); + + recv_spaces_t::iterator it{recv_spaces.find(p->first.space())}; + ut_ad(it != recv_spaces.end()); + + if (!p->first.page_no() && p->second.skip_read) + { + mtr_t mtr; + ut_ad(!p->second.being_processed); + p->second.being_processed= 1; + init &init= mlog_init.last(p->first); + mysql_mutex_unlock(&mutex); + buf_block_t *block= recover_low(p, mtr, free_block, init); + mysql_mutex_lock(&mutex); + p->second.being_processed= -1; + ut_ad(block == free_block || block == reinterpret_cast(-1)); + free_block= nullptr; + if (UNIV_UNLIKELY(!block || block == reinterpret_cast(-1))) + goto fail; + const byte *page= UNIV_LIKELY_NULL(block->page.zip.data) + ? 
block->page.zip.data + : block->page.frame; + const uint32_t space_id= mach_read_from_4(page + FIL_PAGE_SPACE_ID); + const uint32_t flags= fsp_header_get_flags(page); + const uint32_t page_no= mach_read_from_4(page + FIL_PAGE_OFFSET); + const uint32_t size= fsp_header_get_field(page, FSP_SIZE); + + if (page_id_t{space_id, page_no} == p->first && size >= 4 && + fil_space_t::is_valid_flags(flags, space_id) && + fil_space_t::logical_size(flags) == srv_page_size) + { + fil_space_t *space= deferred_spaces.create(it, name, flags, + fil_space_read_crypt_data + (fil_space_t::zip_size(flags), + page), size); + if (!space) + goto release_and_fail; + space->free_limit= fsp_header_get_field(page, FSP_FREE_LIMIT); + space->free_len= flst_get_len(FSP_HEADER_OFFSET + FSP_FREE + page); + fil_node_t *node= UT_LIST_GET_FIRST(space->chain); + node->deferred= true; + mysql_mutex_unlock(&fil_system.mutex); + if (!space->acquire()) + goto release_and_fail; + fil_names_dirty(space); + const bool is_compressed= fil_space_t::is_compressed(flags); +#ifdef _WIN32 + const bool is_sparse= is_compressed; + if (is_compressed) + os_file_set_sparse_win32(node->handle); +#else + const bool is_sparse= is_compressed && + DB_SUCCESS == os_file_punch_hole(node->handle, 0, 4096) && + !my_test_if_thinly_provisioned(node->handle); +#endif + /* Mimic fil_node_t::read_page0() in case the file exists and + has already been extended to a larger size. */ + ut_ad(node->size == size); + const os_offset_t file_size= os_file_get_size(node->handle); + if (file_size != os_offset_t(-1)) + { + const uint32_t n_pages= + uint32_t(file_size / fil_space_t::physical_size(flags)); + if (n_pages > size) + { + mysql_mutex_lock(&fil_system.mutex); + space->size= node->size= n_pages; + space->set_committed_size(); + mysql_mutex_unlock(&fil_system.mutex); + goto size_set; + } + } + if (!os_file_set_size(node->name, node->handle, + (size * fil_space_t::physical_size(flags)) & + ~4095ULL, is_sparse)) + { + space->release(); + goto release_and_fail; + } + size_set: + node->deferred= false; + it->second.space= space; + block->page.lock.x_unlock(); + p->second.being_processed= -1; + return space; + } + + release_and_fail: + block->page.lock.x_unlock(); + } + +fail: + ib::error() << "Cannot apply log to " << p->first + << " of corrupted file '" << name << "'"; + return nullptr; +} + +/** Process a record that indicates that a tablespace is +being shrunk in size. +@param page_id first page identifier that is not in the file +@param lsn log sequence number of the shrink operation */ +inline void recv_sys_t::trim(const page_id_t page_id, lsn_t lsn) +{ + DBUG_ENTER("recv_sys_t::trim"); + DBUG_LOG("ib_log", "discarding log beyond end of tablespace " + << page_id << " before LSN " << lsn); + mysql_mutex_assert_owner(&mutex); + if (pages_it != pages.end() && pages_it->first.space() == page_id.space()) + pages_it= pages.end(); + for (recv_sys_t::map::iterator p = pages.lower_bound(page_id); + p != pages.end() && p->first.space() == page_id.space();) + { + recv_sys_t::map::iterator r = p++; + if (r->second.trim(lsn)) + { + ut_ad(!r->second.being_processed); + pages.erase(r); + } + } + DBUG_VOID_RETURN; +} + +inline dberr_t recv_sys_t::read(os_offset_t total_offset, span buf) +{ + size_t file_idx= static_cast(total_offset / log_sys.file_size); + os_offset_t offset= total_offset % log_sys.file_size; + return file_idx + ? 
recv_sys.files[file_idx].read(offset, buf)
+    : log_sys.log.read(offset, buf);
+}
+
+inline size_t recv_sys_t::files_size()
+{
+  ut_ad(!files.empty());
+  return files.size();
+}
+
+/** Process a file name from a FILE_* record.
+@param[in]	name		file name
+@param[in]	len		length of the file name
+@param[in]	space_id	the tablespace ID
+@param[in]	ftype		FILE_MODIFY, FILE_DELETE, or FILE_RENAME
+@param[in]	lsn		lsn of the redo log
+@param[in]	if_exists	whether to check if the tablespace exists */
+static void fil_name_process(const char *name, ulint len, uint32_t space_id,
+			     mfile_type_t ftype, lsn_t lsn, bool if_exists)
+{
+	ut_ad(srv_operation <= SRV_OPERATION_EXPORT_RESTORED
+	      || srv_operation == SRV_OPERATION_RESTORE
+	      || srv_operation == SRV_OPERATION_RESTORE_EXPORT);
+
+	/* We will also insert space=NULL into the map, so that
+	further checks can ensure that a FILE_MODIFY record was
+	scanned before applying any page records for the space_id. */
+
+	const bool deleted{ftype == FILE_DELETE};
+	const file_name_t fname(std::string(name, len), deleted);
+	std::pair<recv_spaces_t::iterator, bool> p = recv_spaces.emplace(
+		space_id, fname);
+	ut_ad(p.first->first == space_id);
+
+	file_name_t&	f = p.first->second;
+
+	if (auto d = deferred_spaces.find(space_id)) {
+		if (deleted) {
+			d->deleted = true;
+			goto got_deleted;
+		}
+		goto reload;
+	}
+
+	if (deleted) {
+got_deleted:
+		/* Got FILE_DELETE */
+		if (!p.second && f.status != file_name_t::DELETED) {
+			f.status = file_name_t::DELETED;
+			if (f.space != NULL) {
+				fil_space_free(space_id, false);
+				f.space = NULL;
+			}
+		}
+
+		ut_ad(f.space == NULL);
+	} else if (p.second // the first FILE_MODIFY or FILE_RENAME
+		   || f.name != fname.name) {
+reload:
+		fil_space_t*	space;
+
+		/* Check if the tablespace file exists and contains
+		the space_id. If not, ignore the file after displaying
+		a note. Abort if there are multiple files with the
+		same space_id. */
+		switch (fil_ibd_load(space_id, fname.name.c_str(), space)) {
+		case FIL_LOAD_OK:
+			ut_ad(space != NULL);
+
+			deferred_spaces.remove(space_id);
+			if (!f.space) {
+				if (f.size
+				    || f.flags != f.initial_flags) {
+					fil_space_set_recv_size_and_flags(
+						space->id, f.size, f.flags);
+				}
+
+				f.space = space;
+				goto same_space;
+			} else if (f.space == space) {
+same_space:
+				f.name = fname.name;
+				f.status = file_name_t::NORMAL;
+			} else {
+				sql_print_error("InnoDB: Tablespace " UINT32PF
+						" has been found"
+						" in two places:"
+						" '%.*s' and '%.*s'."
+						" You must delete"
+						" one of them.",
+						space_id,
+						int(f.name.size()),
+						f.name.data(),
+						int(fname.name.size()),
+						fname.name.data());
+				recv_sys.set_corrupt_fs();
+			}
+			break;
+
+		case FIL_LOAD_ID_CHANGED:
+			ut_ad(space == NULL);
+			break;
+
+		case FIL_LOAD_NOT_FOUND:
+			/* No matching tablespace was found; maybe it
+			was renamed, and we will find a subsequent
+			FILE_* record. */
+			ut_ad(space == NULL);
+
+			if (srv_force_recovery) {
+				/* Without innodb_force_recovery,
+				missing tablespaces will only be
+				reported in
+				recv_init_crash_recovery_spaces().
+				Enable some more diagnostics when
+				forcing recovery.
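// The arithmetic in recv_sys_t::read() above: a logical offset into a set
// of equally sized log files is split into (file index, offset in file);
// index 0 is ib_logfile0, which is read via log_sys.log. Stand-alone sketch:
#include <cstddef>
#include <cstdint>

struct log_pos { size_t file_idx; uint64_t offset; };

static log_pos locate(uint64_t total_offset, uint64_t file_size)
{
  return {static_cast<size_t>(total_offset / file_size),
          total_offset % file_size};
}
// e.g. with file_size=1048576, total_offset=2621440 -> {2, 524288}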
*/ + + sql_print_information( + "InnoDB: At LSN: " LSN_PF + ": unable to open file %.*s" + " for tablespace " UINT32PF, + recv_sys.lsn, + int(fname.name.size()), + fname.name.data(), space_id); + } + break; + + case FIL_LOAD_DEFER: + /** Skip the deferred spaces + when lsn is already processed */ + if (!if_exists) { + deferred_spaces.add( + space_id, fname.name.c_str(), lsn); + } + break; + case FIL_LOAD_INVALID: + ut_ad(space == NULL); + if (srv_force_recovery == 0) { + sql_print_error("InnoDB: Recovery cannot access" + " file %.*s (tablespace " + UINT32PF ")", int(len), name, + space_id); + sql_print_information("InnoDB: You may set " + "innodb_force_recovery=1" + " to ignore this and" + " possibly get a" + " corrupted database."); + recv_sys.set_corrupt_fs(); + break; + } + + sql_print_warning("InnoDB: Ignoring changes to" + " file %.*s (tablespace " + UINT32PF ")" + " due to innodb_force_recovery", + int(len), name, space_id); + } + } +} + +void recv_sys_t::close_files() +{ + for (auto &file : files) + if (file.is_opened()) + file.close(); + files.clear(); + files.shrink_to_fit(); +} + +/** Clean up after recv_sys_t::create() */ +void recv_sys_t::close() +{ + ut_ad(this == &recv_sys); + + if (is_initialised()) + { + dblwr.pages.clear(); + ut_d(mysql_mutex_lock(&mutex)); + clear(); + deferred_spaces.clear(); + ut_d(mysql_mutex_unlock(&mutex)); + + scanned_lsn= 0; + mysql_mutex_destroy(&mutex); + } + + recv_spaces.clear(); + renamed_spaces.clear(); + mlog_init.clear(); + close_files(); +} + +/** Initialize the redo log recovery subsystem. */ +void recv_sys_t::create() +{ + ut_ad(this == &recv_sys); + ut_ad(!is_initialised()); + mysql_mutex_init(recv_sys_mutex_key, &mutex, nullptr); + + apply_log_recs = false; + + len = 0; + offset = 0; + lsn = 0; + scanned_lsn = 1; + found_corrupt_log = false; + found_corrupt_fs = false; + file_checkpoint = 0; + + progress_time = time(NULL); + ut_ad(pages.empty()); + pages_it = pages.end(); + recv_max_page_lsn = 0; + + memset(truncated_undo_spaces, 0, sizeof truncated_undo_spaces); + UT_LIST_INIT(blocks, &buf_block_t::unzip_LRU); +} + +/** Clear a fully processed set of stored redo log records. */ +void recv_sys_t::clear() +{ + mysql_mutex_assert_owner(&mutex); + apply_log_recs= false; + ut_ad(!after_apply || found_corrupt_fs || !UT_LIST_GET_LAST(blocks)); + pages.clear(); + pages_it= pages.end(); + + for (buf_block_t *block= UT_LIST_GET_LAST(blocks); block; ) + { + buf_block_t *prev_block= UT_LIST_GET_PREV(unzip_LRU, block); + ut_ad(block->page.state() == buf_page_t::MEMORY); + UT_LIST_REMOVE(blocks, block); + MEM_MAKE_ADDRESSABLE(block->page.frame, srv_page_size); + buf_block_free(block); + block= prev_block; + } +} + +/** Free most recovery data structures. */ +void recv_sys_t::debug_free() +{ + ut_ad(this == &recv_sys); + ut_ad(is_initialised()); + mysql_mutex_lock(&mutex); + + recovery_on= false; + pages.clear(); + pages_it= pages.end(); + + mysql_mutex_unlock(&mutex); +} + + +/** Free a redo log snippet. +@param data buffer allocated in add() */ +inline void recv_sys_t::free(const void *data) +{ + ut_ad(!ut_align_offset(data, ALIGNMENT)); + data= page_align(data); + mysql_mutex_assert_owner(&mutex); + + /* MDEV-14481 FIXME: To prevent race condition with buf_pool.resize(), + we must acquire and hold the buffer pool mutex here. 
*/ + ut_ad(!buf_pool.resize_in_progress()); + + auto *chunk= buf_pool.chunks; + for (auto i= buf_pool.n_chunks; i--; chunk++) + { + if (data < chunk->blocks->page.frame) + continue; + const size_t offs= (reinterpret_cast(data) - + chunk->blocks->page.frame) >> srv_page_size_shift; + if (offs >= chunk->size) + continue; + buf_block_t *block= &chunk->blocks[offs]; + ut_ad(block->page.frame == data); + ut_ad(block->page.state() == buf_page_t::MEMORY); + ut_ad(static_cast(block->page.access_time - 1) < + srv_page_size); + unsigned a= block->page.access_time; + ut_ad(a >= 1U << 16); + a-= 1U << 16; + block->page.access_time= a; + if (!(a >> 16)) + { + UT_LIST_REMOVE(blocks, block); + MEM_MAKE_ADDRESSABLE(block->page.frame, srv_page_size); + buf_block_free(block); + } + return; + } + ut_ad(0); +} + + +/** @return whether a log_t::FORMAT_10_5 log block checksum matches */ +static bool recv_check_log_block(const byte *buf) +{ + return mach_read_from_4(my_assume_aligned<4>(508 + buf)) == + my_crc32c(0, buf, 508); +} + +/** Calculate the checksum for a log block using the pre-10.2.2 algorithm. */ +inline uint32_t log_block_calc_checksum_format_0(const byte *b) +{ + uint32_t sum= 1; + const byte *const end= &b[512 - 4]; + + for (uint32_t sh= 0; b < end; ) + { + sum&= 0x7FFFFFFFUL; + sum+= uint32_t{*b} << sh++; + sum+= *b++; + if (sh > 24) + sh= 0; + } + + return sum; +} + +/** Determine if a redo log from before MariaDB 10.2.2 is clean. +@return error code +@retval DB_SUCCESS if the redo log is clean +@retval DB_CORRUPTION if the redo log is corrupted +@retval DB_ERROR if the redo log is not empty */ +ATTRIBUTE_COLD static dberr_t recv_log_recover_pre_10_2() +{ + uint64_t max_no= 0; + + ut_ad(log_sys.format == 0); + + /** Offset of the first checkpoint checksum */ + constexpr uint CHECKSUM_1= 288; + /** Offset of the second checkpoint checksum */ + constexpr uint CHECKSUM_2= CHECKSUM_1 + 4; + /** the checkpoint LSN field */ + constexpr uint CHECKPOINT_LSN= 8; + /** Most significant bits of the checkpoint offset */ + constexpr uint OFFS_HI= CHECKSUM_2 + 12; + /** Least significant bits of the checkpoint offset */ + constexpr uint OFFS_LO= 16; + + lsn_t source_offset= 0; + const lsn_t log_size{(log_sys.file_size - 2048) * recv_sys.files_size()}; + for (size_t field= 512; field < 2048; field+= 1024) + { + const byte *buf= log_sys.buf + field; + + if (static_cast(ut_fold_binary(buf, CHECKSUM_1)) != + mach_read_from_4(buf + CHECKSUM_1) || + static_cast(ut_fold_binary(buf + CHECKPOINT_LSN, + CHECKSUM_2 - CHECKPOINT_LSN)) != + mach_read_from_4(buf + CHECKSUM_2)) + { + DBUG_PRINT("ib_log", ("invalid pre-10.2.2 checkpoint %zu", field)); + continue; + } + + if (!log_crypt_101_read_checkpoint(buf)) + { + sql_print_error("InnoDB: Decrypting checkpoint failed"); + continue; + } + + const uint64_t checkpoint_no= mach_read_from_8(buf); + + DBUG_PRINT("ib_log", ("checkpoint " UINT64PF " at " LSN_PF " found", + checkpoint_no, + mach_read_from_8(buf + CHECKPOINT_LSN))); + + if (checkpoint_no < max_no) + continue; + + const lsn_t o= lsn_t{mach_read_from_4(buf + OFFS_HI)} << 32 | + mach_read_from_4(buf + OFFS_LO); + if (o >= 0x80c && (o & ~511) + 512 < log_size) + { + max_no= checkpoint_no; + log_sys.next_checkpoint_lsn= mach_read_from_8(buf + CHECKPOINT_LSN); + source_offset= o; + } + } + + const char *uag= srv_operation == SRV_OPERATION_NORMAL + ? "InnoDB: Upgrade after a crash is not supported." 
+ : "mariadb-backup --prepare is not possible."; + + if (!log_sys.next_checkpoint_lsn) + { + sql_print_error("%s" + " This redo log was created before MariaDB 10.2.2," + " and we did not find a valid checkpoint." + " Please follow the instructions at" + " https://mariadb.com/kb/en/library/upgrading/", uag); + return DB_ERROR; + } + + static const char pre_10_2[]= + " This redo log was created before MariaDB 10.2.2"; + + byte *buf= const_cast(field_ref_zero); + + if (source_offset < (log_sys.is_pmem() ? log_sys.file_size : 4096)) + memcpy_aligned<512>(buf, &log_sys.buf[source_offset & ~511], 512); + else + if (dberr_t err= recv_sys.read(source_offset & ~511, {buf, 512})) + return err; + + if (log_block_calc_checksum_format_0(buf) != + mach_read_from_4(my_assume_aligned<4>(buf + 508)) && + !log_crypt_101_read_block(buf, log_sys.next_checkpoint_lsn)) + { + sql_print_error("%s%s, and it appears corrupted.", uag, pre_10_2); + return DB_CORRUPTION; + } + + if (mach_read_from_2(buf + 4) == (source_offset & 511)) + return DB_SUCCESS; + + if (buf[20 + 32 * 9] == 2) + sql_print_error("InnoDB: Cannot decrypt log for upgrading." + " The encrypted log was created before MariaDB 10.2.2."); + else + sql_print_error("%s%s. You must start up and shut down" + " MariaDB 10.1 or MySQL 5.6 or earlier" + " on the data directory.", + uag, pre_10_2); + + return DB_ERROR; +} + +/** Determine if a redo log from MariaDB 10.2.2, 10.3, 10.4, or 10.5 is clean. +@param lsn_offset checkpoint LSN offset +@return error code +@retval DB_SUCCESS if the redo log is clean +@retval DB_CORRUPTION if the redo log is corrupted +@retval DB_ERROR if the redo log is not empty */ +static dberr_t recv_log_recover_10_5(lsn_t lsn_offset) +{ + byte *buf= const_cast(field_ref_zero); + + if (lsn_offset < (log_sys.is_pmem() ? log_sys.file_size : 4096)) + memcpy_aligned<512>(buf, &log_sys.buf[lsn_offset & ~511], 512); + else + { + if (dberr_t err= recv_sys.read(lsn_offset & ~lsn_t{4095}, {buf, 4096})) + return err; + buf+= lsn_offset & 0xe00; + } + + if (!recv_check_log_block(buf)) + { + sql_print_error("InnoDB: Invalid log header checksum"); + return DB_CORRUPTION; + } + + if (log_sys.is_encrypted() && + !log_decrypt(buf, log_sys.next_checkpoint_lsn & ~511, 512)) + return DB_ERROR; + + /* On a clean shutdown, the redo log will be logically empty + after the checkpoint lsn. 
*/ + + if (mach_read_from_2(my_assume_aligned<2>(buf + 4)) != (lsn_offset & 511)) + return DB_ERROR; + + return DB_SUCCESS; +} + +dberr_t recv_sys_t::find_checkpoint() +{ + bool wrong_size= false; + byte *buf; + + ut_ad(pages.empty()); + pages_it= pages.end(); + + if (files.empty()) + { + file_checkpoint= 0; + std::string path{get_log_file_path()}; + bool success; + os_file_t file{os_file_create_func(path.c_str(), + OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT, + OS_FILE_NORMAL, OS_LOG_FILE, + srv_read_only_mode, &success)}; + if (file == OS_FILE_CLOSED) + return DB_ERROR; + const os_offset_t size{os_file_get_size(file)}; + if (!size) + { + if (srv_operation != SRV_OPERATION_NORMAL) + goto too_small; + } + else if (size < log_t::START_OFFSET + SIZE_OF_FILE_CHECKPOINT) + { + too_small: + sql_print_error("InnoDB: File %.*s is too small", + int(path.size()), path.data()); + err_exit: + os_file_close(file); + return DB_ERROR; + } + else if (!log_sys.attach(file, size)) + goto err_exit; + else + file= OS_FILE_CLOSED; + + recv_sys.files.emplace_back(file); + for (int i= 1; i < 101; i++) + { + path= get_log_file_path(LOG_FILE_NAME_PREFIX).append(std::to_string(i)); + file= os_file_create_func(path.c_str(), + OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT | + OS_FILE_ON_ERROR_SILENT, + OS_FILE_NORMAL, OS_LOG_FILE, true, &success); + if (file == OS_FILE_CLOSED) + break; + const os_offset_t sz{os_file_get_size(file)}; + if (size != sz) + { + sql_print_error("InnoDB: Log file %.*s is of different size " UINT64PF + " bytes than other log files " UINT64PF " bytes!", + int(path.size()), path.data(), sz, size); + wrong_size= true; + } + recv_sys.files.emplace_back(file); + } + + if (!size) + { + if (wrong_size) + return DB_CORRUPTION; + if (log_sys.next_checkpoint_lsn < 8204) + { + /* Before MDEV-14425, InnoDB had a minimum LSN of 8192+12=8204. + Likewise, mariadb-backup --prepare would create an empty + ib_logfile0 after applying the log. We will allow an upgrade + from such an empty log. + + If a user replaces the redo log with an empty file and the + FIL_PAGE_FILE_FLUSH_LSN field was zero in the system + tablespace (see SysTablespace::read_lsn_and_check_flags()) we + must refuse to start up. */ + sql_print_error("InnoDB: ib_logfile0 is empty, and LSN is unknown."); + return DB_CORRUPTION; + } + lsn= log_sys.next_checkpoint_lsn; + log_sys.format= log_t::FORMAT_3_23; + goto upgrade; + } + } + else + ut_ad(srv_operation == SRV_OPERATION_BACKUP); + log_sys.next_checkpoint_lsn= 0; + lsn= 0; + buf= my_assume_aligned<4096>(log_sys.buf); + if (!log_sys.is_pmem()) + if (dberr_t err= log_sys.log.read(0, {buf, 4096})) + return err; + /* Check the header page checksum. There was no + checksum in the first redo log format (version 0). */ + log_sys.format= mach_read_from_4(buf + LOG_HEADER_FORMAT); + if (log_sys.format == log_t::FORMAT_3_23) + { + if (wrong_size) + return DB_CORRUPTION; + if (dberr_t err= recv_log_recover_pre_10_2()) + return err; + upgrade: + memset_aligned<4096>(const_cast(field_ref_zero), 0, 4096); + /* Mark the redo log for upgrading. 
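// find_checkpoint() above accepts a multi-file redo log only for the old
// formats: ib_logfile0 plus ib_logfile1..ib_logfile100, stopping at the
// first file that does not open, and requiring every file to match the
// size of ib_logfile0. A sketch of just the name enumeration (invented
// helper, not InnoDB code):
#include <string>
#include <vector>

static std::vector<std::string> log_file_names(const std::string &dir,
                                               int upto= 100)
{
  std::vector<std::string> names{dir + "/ib_logfile0"};
  for (int i= 1; i <= upto; i++)
    names.emplace_back(dir + "/ib_logfile" + std::to_string(i));
  return names;
}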
*/ + log_sys.last_checkpoint_lsn= log_sys.next_checkpoint_lsn; + log_sys.set_recovered_lsn(log_sys.next_checkpoint_lsn); + lsn= file_checkpoint= log_sys.next_checkpoint_lsn; + log_sys.next_checkpoint_no= 0; + return DB_SUCCESS; + } + + if (!recv_check_log_block(buf)) + { + sql_print_error("InnoDB: Invalid log header checksum"); + return DB_CORRUPTION; + } + + const lsn_t first_lsn{mach_read_from_8(buf + LOG_HEADER_START_LSN)}; + log_sys.set_first_lsn(first_lsn); + char creator[LOG_HEADER_CREATOR_END - LOG_HEADER_CREATOR + 1]; + memcpy(creator, buf + LOG_HEADER_CREATOR, sizeof creator); + /* Ensure that the string is NUL-terminated. */ + creator[LOG_HEADER_CREATOR_END - LOG_HEADER_CREATOR]= 0; + + lsn_t lsn_offset= 0; + + switch (log_sys.format) { + default: + sql_print_error("InnoDB: Unsupported redo log format." + " The redo log was created with %s.", creator); + return DB_ERROR; + case log_t::FORMAT_10_8: + if (files.size() != 1) + { + sql_print_error("InnoDB: Expecting only ib_logfile0"); + return DB_CORRUPTION; + } + + if (*reinterpret_cast(buf + LOG_HEADER_FORMAT + 4) || + first_lsn < log_t::FIRST_LSN) + { + sql_print_error("InnoDB: Invalid ib_logfile0 header block;" + " the log was created with %s.", creator); + return DB_CORRUPTION; + } + + if (!mach_read_from_4(buf + LOG_HEADER_CREATOR_END)); + else if (!log_crypt_read_header(buf + LOG_HEADER_CREATOR_END)) + { + sql_print_error("InnoDB: Reading log encryption info failed;" + " the log was created with %s.", creator); + return DB_ERROR; + } + else + log_sys.format= log_t::FORMAT_ENC_10_8; + + for (size_t field= log_t::CHECKPOINT_1; field <= log_t::CHECKPOINT_2; + field+= log_t::CHECKPOINT_2 - log_t::CHECKPOINT_1) + { + if (log_sys.is_pmem()) + buf= log_sys.buf + field; + else + if (dberr_t err= log_sys.log.read(field, + {buf, log_sys.get_block_size()})) + return err; + const lsn_t checkpoint_lsn{mach_read_from_8(buf)}; + const lsn_t end_lsn{mach_read_from_8(buf + 8)}; + if (checkpoint_lsn < first_lsn || end_lsn < checkpoint_lsn || + memcmp(buf + 16, field_ref_zero, 60 - 16) || + my_crc32c(0, buf, 60) != mach_read_from_4(buf + 60)) + { + DBUG_PRINT("ib_log", ("invalid checkpoint at %zu", field)); + continue; + } + + if (checkpoint_lsn >= log_sys.next_checkpoint_lsn) + { + log_sys.next_checkpoint_lsn= checkpoint_lsn; + log_sys.next_checkpoint_no= field == log_t::CHECKPOINT_1; + lsn= end_lsn; + } + } + if (!log_sys.next_checkpoint_lsn) + goto got_no_checkpoint; + if (!memcmp(creator, "Backup ", 7)) + srv_start_after_restore= true; + return DB_SUCCESS; + case log_t::FORMAT_10_5: + case log_t::FORMAT_10_5 | log_t::FORMAT_ENCRYPTED: + if (files.size() != 1) + { + sql_print_error("InnoDB: Expecting only ib_logfile0"); + return DB_CORRUPTION; + } + /* fall through */ + case log_t::FORMAT_10_2: + case log_t::FORMAT_10_2 | log_t::FORMAT_ENCRYPTED: + case log_t::FORMAT_10_3: + case log_t::FORMAT_10_3 | log_t::FORMAT_ENCRYPTED: + case log_t::FORMAT_10_4: + case log_t::FORMAT_10_4 | log_t::FORMAT_ENCRYPTED: + uint64_t max_no= 0; + const lsn_t log_size{(log_sys.file_size - 2048) * files.size()}; + for (size_t field= 512; field < 2048; field += 1024) + { + const byte *b = buf + field; + + if (!recv_check_log_block(b)) + { + DBUG_PRINT("ib_log", ("invalid checkpoint checksum at %zu", field)); + continue; + } + + if (log_sys.is_encrypted() && !log_crypt_read_checkpoint_buf(b)) + { + sql_print_error("InnoDB: Reading checkpoint encryption info failed."); + continue; + } + + const uint64_t checkpoint_no= mach_read_from_8(b); + const lsn_t 
checkpoint_lsn= mach_read_from_8(b + 8);
+      DBUG_PRINT("ib_log", ("checkpoint " UINT64PF " at " LSN_PF " found",
+                            checkpoint_no, checkpoint_lsn));
+      const lsn_t o{mach_read_from_8(b + 16)};
+      if (checkpoint_no >= max_no && o >= 0x80c && (o & ~511) + 512 < log_size)
+      {
+        max_no= checkpoint_no;
+        log_sys.next_checkpoint_lsn= checkpoint_lsn;
+        log_sys.next_checkpoint_no= field == 512;
+        lsn_offset= mach_read_from_8(b + 16);
+      }
+    }
+  }
+
+  if (!log_sys.next_checkpoint_lsn)
+  {
+  got_no_checkpoint:
+    sql_print_error("InnoDB: No valid checkpoint was found;"
+                    " the log was created with %s.", creator);
+    return DB_ERROR;
+  }
+
+  if (wrong_size)
+    return DB_CORRUPTION;
+
+  if (dberr_t err= recv_log_recover_10_5(lsn_offset))
+  {
+    const char *msg1, *msg2, *msg3;
+    msg1= srv_operation == SRV_OPERATION_NORMAL
+      ? "InnoDB: Upgrade after a crash is not supported."
+      : "mariadb-backup --prepare is not possible.";
+
+    if (err == DB_ERROR)
+    {
+      msg2= srv_operation == SRV_OPERATION_NORMAL
+        ? ". You must start up and shut down MariaDB "
+        : ". You must use mariadb-backup ";
+      msg3= (log_sys.format & ~log_t::FORMAT_ENCRYPTED) == log_t::FORMAT_10_5
+        ? "10.7 or earlier." : "10.4 or earlier.";
+    }
+    else
+      msg2= ", and it appears corrupted.", msg3= "";
+
+    sql_print_error("%s The redo log was created with %s%s%s",
+                    msg1, creator, msg2, msg3);
+    return err;
+  }
+
+  goto upgrade;
+}
+
+/** Trim old log records for a page.
+@param start_lsn oldest log sequence number to preserve
+@return whether all the log for the page was trimmed */
+inline bool page_recv_t::trim(lsn_t start_lsn)
+{
+  while (log.head)
+  {
+    if (log.head->lsn > start_lsn) return false;
+    last_offset= 1; /* the next record must not be same_page */
+    log_rec_t *next= log.head->next;
+    recv_sys.free(log.head);
+    log.head= next;
+  }
+  log.tail= nullptr;
+  return true;
+}
+
+
+void page_recv_t::recs_t::rewind(lsn_t start_lsn)
+{
+  mysql_mutex_assert_owner(&recv_sys.mutex);
+  log_phys_t *trim= static_cast<log_phys_t*>(head);
+  ut_ad(trim);
+  while (log_phys_t *next= static_cast<log_phys_t*>(trim->next))
+  {
+    ut_ad(trim->start_lsn < start_lsn);
+    if (next->start_lsn == start_lsn)
+      break;
+    trim= next;
+  }
+  tail= trim;
+  log_rec_t *l= tail->next;
+  tail->next= nullptr;
+  while (l)
+  {
+    log_rec_t *next= l->next;
+    recv_sys.free(l);
+    l= next;
+  }
+}
+
+
+void page_recv_t::recs_t::clear()
+{
+  mysql_mutex_assert_owner(&recv_sys.mutex);
+  for (const log_rec_t *l= head; l; )
+  {
+    const log_rec_t *next= l->next;
+    recv_sys.free(l);
+    l= next;
+  }
+  head= tail= nullptr;
+}
+
+/** Ignore any earlier redo log records for this page. */
+inline void page_recv_t::will_not_read()
+{
+  ut_ad(!being_processed);
+  skip_read= true;
+  log.clear();
+}
+
+void recv_sys_t::erase(map::iterator p)
+{
+  ut_ad(p->second.being_processed <= 0);
+  p->second.log.clear();
+  pages.erase(p);
+}
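+
+/* Clarifying note (added here; inferred from the surrounding code, not an
+upstream comment): page_recv_t::being_processed acts as a small state
+machine that erase() above and garbage_collect() below rely on. An entry
+starts at 0 (records merely buffered), is set to 1 while a recovery batch
+is applying the records, and finally to -1 once they have been applied or
+discarded, at which point the map entry may be garbage-collected. */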
+
+/** Free log for processed pages. */
+void recv_sys_t::garbage_collect()
+{
+  mysql_mutex_assert_owner(&mutex);
+
+  if (pages_it != pages.end() && pages_it->second.being_processed < 0)
+    pages_it= pages.end();
+
+  for (map::iterator p= pages.begin(); p != pages.end(); )
+  {
+    if (p->second.being_processed < 0)
+    {
+      map::iterator r= p++;
+      erase(r);
+    }
+    else
+      p++;
+  }
+}
+
+/** Allocate a block from the buffer pool for recv_sys.pages */
+ATTRIBUTE_COLD buf_block_t *recv_sys_t::add_block()
+{
+  for (bool freed= false;;)
+  {
+    const auto rs= UT_LIST_GET_LEN(blocks) * 2;
+    mysql_mutex_lock(&buf_pool.mutex);
+    const auto bs=
+      UT_LIST_GET_LEN(buf_pool.free) + UT_LIST_GET_LEN(buf_pool.LRU);
+    if (UNIV_LIKELY(bs > BUF_LRU_MIN_LEN || rs < bs))
+    {
+      buf_block_t *block= buf_LRU_get_free_block(true);
+      mysql_mutex_unlock(&buf_pool.mutex);
+      return block;
+    }
+    /* out of memory: redo log occupies more than 1/3 of buf_pool
+    and there are fewer than BUF_LRU_MIN_LEN pages left */
+    mysql_mutex_unlock(&buf_pool.mutex);
+    if (freed)
+      return nullptr;
+    freed= true;
+    garbage_collect();
+  }
+}
+
+/** Wait for buffer pool to become available. */
+ATTRIBUTE_COLD void recv_sys_t::wait_for_pool(size_t pages)
+{
+  mysql_mutex_unlock(&mutex);
+  os_aio_wait_until_no_pending_reads(false);
+  mysql_mutex_lock(&mutex);
+  garbage_collect();
+  mysql_mutex_lock(&buf_pool.mutex);
+  bool need_more= UT_LIST_GET_LEN(buf_pool.free) < pages;
+  mysql_mutex_unlock(&buf_pool.mutex);
+  if (need_more)
+    buf_flush_sync_batch(lsn);
+}
+
+/** Register a redo log snippet for a page.
+@param it        page iterator
+@param start_lsn start LSN of the mini-transaction
+@param lsn       @see mtr_t::commit_lsn()
+@param l         redo log snippet
+@param len       length of l, in bytes
+@return whether we ran out of memory */
+ATTRIBUTE_NOINLINE
+bool recv_sys_t::add(map::iterator it, lsn_t start_lsn, lsn_t lsn,
+                     const byte *l, size_t len)
+{
+  mysql_mutex_assert_owner(&mutex);
+  page_recv_t &recs= it->second;
+  buf_block_t *block;
+
+  switch (*l & 0x70) {
+  case FREE_PAGE: case INIT_PAGE:
+    recs.will_not_read();
+    mlog_init.add(it->first, start_lsn); /* FIXME: remove this!
*/
+    /* fall through */
+  default:
+    log_phys_t *tail= static_cast<log_phys_t*>(recs.log.last());
+    if (!tail)
+      break;
+    if (tail->start_lsn != start_lsn)
+      break;
+    ut_ad(tail->lsn == lsn);
+    block= UT_LIST_GET_LAST(blocks);
+    ut_ad(block);
+    const size_t used= static_cast<uint16_t>(block->page.access_time - 1) + 1;
+    ut_ad(used >= ALIGNMENT);
+    const byte *end= const_cast<const log_phys_t*>(tail)->end();
+    if (!((reinterpret_cast<size_t>(end + len) ^
+           reinterpret_cast<size_t>(end)) & ~(ALIGNMENT - 1)))
+    {
+      /* Use already allocated 'padding' bytes */
+append:
+      MEM_MAKE_ADDRESSABLE(end + 1, len);
+      /* Append to the preceding record for the page */
+      tail->append(l, len);
+      return false;
+    }
+    if (end <= &block->page.frame[used - ALIGNMENT] ||
+        &block->page.frame[used] >= end)
+      break; /* Not the last allocated record in the page */
+    const size_t new_used= static_cast<size_t>
+      (end - block->page.frame + len + 1);
+    ut_ad(new_used > used);
+    if (new_used > srv_page_size)
+      break;
+    block->page.access_time= (block->page.access_time & ~0U << 16) |
+      ut_calc_align(static_cast<uint16_t>(new_used), ALIGNMENT);
+    goto append;
+  }
+
+  const size_t size{log_phys_t::alloc_size(len)};
+  ut_ad(size <= srv_page_size);
+  void *buf;
+  block= UT_LIST_GET_FIRST(blocks);
+  if (UNIV_UNLIKELY(!block))
+  {
+  create_block:
+    block= add_block();
+    if (UNIV_UNLIKELY(!block))
+      return true;
+    block->page.access_time= 1U << 16 |
+      ut_calc_align(static_cast<uint16_t>(size), ALIGNMENT);
+    static_assert(ut_is_2pow(ALIGNMENT), "ALIGNMENT must be a power of 2");
+    UT_LIST_ADD_FIRST(blocks, block);
+    MEM_MAKE_ADDRESSABLE(block->page.frame, size);
+    MEM_NOACCESS(block->page.frame + size, srv_page_size - size);
+    buf= block->page.frame;
+  }
+  else
+  {
+    size_t free_offset= static_cast<uint16_t>(block->page.access_time);
+    ut_ad(!ut_2pow_remainder(free_offset, ALIGNMENT));
+    if (UNIV_UNLIKELY(!free_offset))
+    {
+      ut_ad(srv_page_size == 65536);
+      goto create_block;
+    }
+    ut_ad(free_offset <= srv_page_size);
+    free_offset+= size;
+
+    if (free_offset > srv_page_size)
+      goto create_block;
+
+    block->page.access_time= ((block->page.access_time >> 16) + 1) << 16 |
+      ut_calc_align(static_cast<uint16_t>(free_offset), ALIGNMENT);
+    MEM_MAKE_ADDRESSABLE(block->page.frame + free_offset - size, size);
+    buf= block->page.frame + free_offset - size;
+  }
+
+  recs.log.append(new (my_assume_aligned<ALIGNMENT>(buf))
+                  log_phys_t{start_lsn, lsn, l, len});
+  return false;
+}
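+
+/* Illustrative sketch (added; not upstream code): recv_sys_t::add() above
+repurposes buf_page_t::access_time of the recovery-owned blocks as two
+16-bit fields: the high half counts the records allocated from the block,
+and the low half holds the used size in bytes, rounded up to ALIGNMENT.
+A minimal model of that packing, under those assumptions: */
+#if 0
+struct recv_block_usage
+{
+  uint32_t packed= 0;                /* plays the role of page.access_time */
+  uint16_t records() const { return uint16_t(packed >> 16); }
+  uint16_t used_bytes() const { return uint16_t(packed); }
+  /* account for one more record, ending at new_used (ALIGNMENT-rounded) */
+  void allocate(uint16_t new_used)
+  { packed= (uint32_t(records() + 1) << 16) | new_used; }
+};
+#endif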
+
+/** Store/remove the freed pages in fil_name_t of recv_spaces.
+@param[in]	page_id		freed or init page_id
+@param[in]	freed		TRUE if page is freed */
+static void store_freed_or_init_rec(page_id_t page_id, bool freed)
+{
+  uint32_t space_id= page_id.space();
+  uint32_t page_no= page_id.page_no();
+  if (is_predefined_tablespace(space_id))
+  {
+    if (!srv_immediate_scrub_data_uncompressed)
+      return;
+    fil_space_t *space;
+    if (space_id == TRX_SYS_SPACE)
+      space= fil_system.sys_space;
+    else
+      space= fil_space_get(space_id);
+
+    space->free_page(page_no, freed);
+    return;
+  }
+
+  recv_spaces_t::iterator i= recv_spaces.lower_bound(space_id);
+  if (i != recv_spaces.end() && i->first == space_id)
+  {
+    if (freed)
+      i->second.add_freed_page(page_no);
+    else
+      i->second.remove_freed_page(page_no);
+  }
+}
+
+/** Wrapper for log_sys.buf[] between recv_sys.offset and recv_sys.len */
+struct recv_buf
+{
+  bool is_pmem() const noexcept { return log_sys.is_pmem(); }
+
+  const byte *ptr;
+
+  constexpr recv_buf(const byte *ptr) : ptr(ptr) {}
+  constexpr bool operator==(const recv_buf other) const
+  { return ptr == other.ptr; }
+
+  static const byte *end() { return &log_sys.buf[recv_sys.len]; }
+
+  const char *get_filename(byte*, size_t) const noexcept
+  { return reinterpret_cast<const char*>(ptr); }
+
+  bool is_eof(size_t len= 0) const noexcept { return ptr + len >= end(); }
+
+  byte operator*() const noexcept
+  {
+    ut_ad(ptr >= log_sys.buf);
+    ut_ad(ptr < end());
+    return *ptr;
+  }
+  byte operator[](size_t size) const noexcept { return *(*this + size); }
+  recv_buf operator+(size_t len) const noexcept
+  { recv_buf r{*this}; return r+= len; }
+  recv_buf &operator++() noexcept { return *this+= 1; }
+  recv_buf &operator+=(size_t len) noexcept { ptr+= len; return *this; }
+
+  size_t operator-(const recv_buf start) const noexcept
+  {
+    ut_ad(ptr >= start.ptr);
+    return size_t(ptr - start.ptr);
+  }
+
+  uint32_t crc32c(const recv_buf start) const noexcept
+  {
+    return my_crc32c(0, start.ptr, ptr - start.ptr);
+  }
+
+  void *memcpy(void *buf, size_t size) const noexcept
+  {
+    ut_ad(size);
+    ut_ad(!is_eof(size - 1));
+    return ::memcpy(buf, ptr, size);
+  }
+
+  bool is_zero(size_t size) const noexcept
+  {
+    ut_ad(!is_eof(size));
+    return !memcmp(ptr, field_ref_zero, size);
+  }
+
+  uint64_t read8() const noexcept
+  { ut_ad(!is_eof(7)); return mach_read_from_8(ptr); }
+  uint32_t read4() const noexcept
+  { ut_ad(!is_eof(3)); return mach_read_from_4(ptr); }
+
+  /** Update the pointer if the new pointer is within the buffer. */
+  bool set_if_contains(const byte *pos) noexcept
+  {
+    if (pos > end() || pos < ptr)
+      return false;
+    ptr= pos;
+    return true;
+  }
+
+  /** Get the contiguous, unencrypted buffer.
+  @param buf          return value of copy_if_needed()
+  @param start        start of the mini-transaction
+  @param decrypt_buf  possibly, a copy of the mini-transaction
+  @return contiguous, non-encrypted buffer */
+  const byte *get_buf(const byte *buf, const recv_buf start,
+                      const byte *decrypt_buf) const noexcept
+  { return ptr == buf ? start.ptr : decrypt_buf; }
+
+  /** Copy and decrypt a log record if needed.
+  @param iv    initialization vector
+  @param tmp   buffer for the decrypted log record
+  @param start un-encrypted start of the log record
+  @param len   length of the possibly encrypted part, in bytes */
+  const byte *copy_if_needed(const byte *iv, byte *tmp, recv_buf start,
+                             size_t len)
+  {
+    ut_ad(*this - start + len <= srv_page_size);
+    if (!len || !log_sys.is_encrypted())
+      return ptr;
+    const size_t s(*this - start);
+    start.memcpy(tmp, s);
+    return log_decrypt_buf(iv, tmp + s, ptr, static_cast<uint>(len));
+  }
+};
+
+#ifdef HAVE_PMEM
+/** Ring buffer wrapper for log_sys.buf[]; recv_sys.len == log_sys.file_size */
+struct recv_ring : public recv_buf
+{
+  static constexpr bool is_pmem() { return true; }
+
+  constexpr recv_ring(const byte *ptr) : recv_buf(ptr) {}
+
+  constexpr static bool is_eof() { return false; }
+  constexpr static bool is_eof(size_t) { return false; }
+
+  byte operator*() const noexcept
+  {
+    ut_ad(ptr >= &log_sys.buf[log_sys.START_OFFSET]);
+    ut_ad(ptr < end());
+    return *ptr;
+  }
+  byte operator[](size_t size) const noexcept { return *(*this + size); }
+  recv_ring operator+(size_t len) const noexcept
+  { recv_ring r{*this}; return r+= len; }
+  recv_ring &operator++() noexcept { return *this+= 1; }
+  recv_ring &operator+=(size_t len) noexcept
+  {
+    ut_ad(ptr < end());
+    ut_ad(ptr >= &log_sys.buf[log_sys.START_OFFSET]);
+    ut_ad(len < recv_sys.MTR_SIZE_MAX * 2);
+    ptr+= len;
+    if (ptr >= end())
+    {
+      ptr-= recv_sys.len - log_sys.START_OFFSET;
+      ut_ad(ptr >= &log_sys.buf[log_sys.START_OFFSET]);
+      ut_ad(ptr < end());
+    }
+    return *this;
+  }
+  size_t operator-(const recv_ring start) const noexcept
+  {
+    auto s= ptr - start.ptr;
+    return s >= 0
+      ? size_t(s)
+      : size_t(s + recv_sys.len - log_sys.START_OFFSET);
+  }
+
+  uint32_t crc32c(const recv_ring start) const noexcept
+  {
+    return ptr >= start.ptr
+      ? my_crc32c(0, start.ptr, ptr - start.ptr)
+      : my_crc32c(my_crc32c(0, start.ptr, end() - start.ptr),
+                  &log_sys.buf[log_sys.START_OFFSET],
+                  ptr - &log_sys.buf[log_sys.START_OFFSET]);
+  }
+
+  void *memcpy(void *buf, size_t size) const noexcept
+  {
+    ut_ad(size);
+    ut_ad(size < srv_page_size);
+
+    auto s= ptr + size - end();
+    if (s <= 0)
+      return ::memcpy(buf, ptr, size);
+    ::memcpy(buf, ptr, size - s);
+    ::memcpy(static_cast<byte*>(buf) + size - s,
+             &log_sys.buf[log_sys.START_OFFSET], s);
+    return buf;
+  }
+
+  bool is_zero(size_t size) const noexcept
+  {
+    auto s= ptr + size - end();
+    if (s <= 0)
+      return !memcmp(ptr, field_ref_zero, size);
+    return !memcmp(ptr, field_ref_zero, size - s) &&
+      !memcmp(&log_sys.buf[log_sys.START_OFFSET], field_ref_zero, s);
+  }
+
+  uint64_t read8() const noexcept
+  {
+    if (UNIV_LIKELY(ptr + 8 <= end()))
+      return mach_read_from_8(ptr);
+    byte b[8];
+    return mach_read_from_8(static_cast<const byte*>(memcpy(b, 8)));
+  }
+  uint32_t read4() const noexcept
+  {
+    if (UNIV_LIKELY(ptr + 4 <= end()))
+      return mach_read_from_4(ptr);
+    byte b[4];
+    return mach_read_from_4(static_cast<const byte*>(memcpy(b, 4)));
+  }
+
+  /** Get the contiguous, unencrypted buffer.
+  @param buf          return value of copy_if_needed()
+  @param start        start of the mini-transaction
+  @param decrypt_buf  possibly, a copy of the mini-transaction
+  @return contiguous, non-encrypted buffer */
+  const byte *get_buf(const byte *buf, const recv_ring start,
+                      const byte *decrypt_buf) const noexcept
+  { return ptr == buf && start.ptr < ptr ? start.ptr : decrypt_buf; }
+
+  const char *get_filename(byte* buf, size_t rlen) const noexcept
+  {
+    return UNIV_LIKELY(ptr + rlen <= end())
+      ? reinterpret_cast<const char*>(ptr)
+      : static_cast<const char*>(memcpy(buf, rlen));
+  }
+
+  /** Copy and decrypt a log record if needed.
+  @param iv    initialization vector
+  @param tmp   buffer for the decrypted log record
+  @param start un-encrypted start of the log record
+  @param len   length of the possibly encrypted part, in bytes */
+  const byte *copy_if_needed(const byte *iv, byte *tmp, recv_ring start,
+                             size_t len)
+  {
+    const size_t s(*this - start);
+    ut_ad(s + len <= srv_page_size);
+    if (!log_sys.is_encrypted())
+    {
+      if (start.ptr + s == ptr && ptr + len <= end())
+        return ptr;
+      start.memcpy(tmp, s + len);
+      return tmp + s;
+    }
+
+    start.memcpy(tmp, s);
+
+    const byte *b= ptr;
+    if (ptr + len > end())
+      b= static_cast<byte*>(memcpy(alloca(len), len));
+    return log_decrypt_buf(iv, tmp + s, b, static_cast<uint>(len));
+  }
+};
+#endif
+
+template<typename source>
+void recv_sys_t::rewind(source &l, source &begin) noexcept
+{
+  ut_ad(srv_operation != SRV_OPERATION_BACKUP);
+  mysql_mutex_assert_owner(&mutex);
+
+  const source end= l;
+  uint32_t rlen;
+  for (l= begin; !(l == end); l+= rlen)
+  {
+    const source recs{l};
+    ++l;
+    const byte b= *recs;
+
+    ut_ad(b > 1);
+    ut_ad(UNIV_LIKELY((b & 0x70) != RESERVED) || srv_force_recovery);
+
+    rlen= b & 0xf;
+    if (!rlen)
+    {
+      const uint32_t lenlen= mlog_decode_varint_length(*l);
+      const uint32_t addlen= mlog_decode_varint(l);
+      ut_ad(addlen != MLOG_DECODE_ERROR);
+      rlen= addlen + 15 - lenlen;
+      l+= lenlen;
+    }
+    ut_ad(!l.is_eof(rlen));
+    if (b & 0x80)
+      continue;
+
+    uint32_t idlen= mlog_decode_varint_length(*l);
+    if (UNIV_UNLIKELY(idlen > 5 || idlen >= rlen))
+      continue;
+    const uint32_t space_id= mlog_decode_varint(l);
+    if (UNIV_UNLIKELY(space_id == MLOG_DECODE_ERROR))
+      continue;
+    l+= idlen;
+    rlen-= idlen;
+    idlen= mlog_decode_varint_length(*l);
+    if (UNIV_UNLIKELY(idlen > 5 || idlen > rlen))
+      continue;
+    const uint32_t page_no= mlog_decode_varint(l);
+    if (UNIV_UNLIKELY(page_no == MLOG_DECODE_ERROR))
+      continue;
+    const page_id_t id{space_id, page_no};
+    if (pages_it == pages.end() || pages_it->first != id)
+    {
+      pages_it= pages.find(id);
+      if (pages_it == pages.end())
+        continue;
+    }
+
+    ut_ad(!pages_it->second.being_processed);
+    const log_phys_t *head=
+      static_cast<const log_phys_t*>(*pages_it->second.log.begin());
+    if (!head || head->start_lsn == lsn)
+    {
+      erase(pages_it);
+      pages_it= pages.end();
+    }
+    else
+      pages_it->second.log.rewind(lsn);
+  }
+
+  l= begin;
+  pages_it= pages.end();
+}
+
+/** Parse and register one log_t::FORMAT_10_8 mini-transaction.
+@tparam store     whether to store the records
+@param l          log data source
+@param if_exists  if store: whether to check if the tablespace exists */
+template<bool store,typename source>
+inline
+recv_sys_t::parse_mtr_result recv_sys_t::parse(source &l, bool if_exists)
+  noexcept
+{
+restart:
+#ifndef SUX_LOCK_GENERIC
+  ut_ad(log_sys.latch.is_write_locked() ||
+        srv_operation == SRV_OPERATION_BACKUP ||
+        srv_operation == SRV_OPERATION_BACKUP_NO_DEFER);
+#endif
+  mysql_mutex_assert_owner(&mutex);
+  ut_ad(log_sys.next_checkpoint_lsn);
+  ut_ad(log_sys.is_latest());
+  ut_ad(store || !if_exists);
+  ut_ad(store ||
+        srv_operation != SRV_OPERATION_BACKUP ||
+        srv_operation != SRV_OPERATION_BACKUP_NO_DEFER);
+
+  alignas(8) byte iv[MY_AES_BLOCK_SIZE];
+  byte *decrypt_buf= static_cast<byte*>(alloca(srv_page_size));
+
+  const lsn_t start_lsn{lsn};
+
+  /* Check that the entire mini-transaction is included within the buffer */
+  if (l.is_eof(0))
+    return PREMATURE_EOF;
+
+  if (*l <= 1)
+    return GOT_EOF; /* We should never write an empty mini-transaction. */
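+
+  /* Added summary (inferred from the loops below, not an upstream
+  comment): every record begins with a flag byte b. (b & 0x70) selects
+  the type (FREE_PAGE, INIT_PAGE, EXTENDED, WRITE, MEMSET, MEMMOVE,
+  RESERVED or OPTION), b & 0x80 is the same_page flag, and b & 0xf is
+  the payload length; a zero length nibble means that a variable-length
+  integer holding (length - 15) follows. Unless same_page is set, the
+  payload then starts with the tablespace identifier and the page
+  number, both encoded by variable-length integers. */
+#if 0
+  /* minimal flag-byte decoder, under the assumptions above */
+  const byte flag= *l;
+  const byte type= flag & 0x70;        /* record type */
+  const bool same_page= flag & 0x80;   /* reuses the previous page id */
+  size_t payload_len= flag & 0xf;      /* 0: varint of (len - 15) follows */
+#endif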
+
+  source begin{l};
+  uint32_t rlen;
+  for (uint32_t total_len= 0; !l.is_eof(); l+= rlen, total_len+= rlen)
+  {
+    if (total_len >= MTR_SIZE_MAX)
+      return GOT_EOF;
+    if (*l <= 1)
+      goto eom_found;
+    rlen= *l & 0xf;
+    ++l;
+    if (!rlen)
+    {
+      if (l.is_eof(0))
+        break;
+      rlen= mlog_decode_varint_length(*l);
+      if (l.is_eof(rlen))
+        break;
+      const uint32_t addlen= mlog_decode_varint(l);
+      if (UNIV_UNLIKELY(addlen >= MTR_SIZE_MAX))
+        return GOT_EOF;
+      rlen= addlen + 15;
+    }
+  }
+
+  /* Not the entire mini-transaction was present. */
+  return PREMATURE_EOF;
+
+eom_found:
+  if (*l != log_sys.get_sequence_bit((l - begin) + lsn))
+    return GOT_EOF;
+
+  if (l.is_eof(4))
+    return PREMATURE_EOF;
+
+  uint32_t crc{l.crc32c(begin)};
+
+  if (log_sys.is_encrypted())
+  {
+    if (l.is_eof(8 + 4))
+      return PREMATURE_EOF;
+    (l + 1).memcpy(iv, 8);
+    l+= 8;
+    crc= my_crc32c(crc, iv, 8);
+  }
+
+  DBUG_EXECUTE_IF("log_intermittent_checksum_mismatch",
+                  {
+                    static int c;
+                    if (!c++)
+                    {
+                      sql_print_information("Invalid log block checksum");
+                      return GOT_EOF;
+                    }
+                  });
+
+  if (crc != (l + 1).read4())
+    return GOT_EOF;
+
+  l+= 5;
+  ut_d(const source el{l});
+  lsn+= l - begin;
+  offset= l.ptr - log_sys.buf;
+  if (!l.is_pmem());
+  else if (offset == log_sys.file_size)
+    offset= log_sys.START_OFFSET;
+  else
+    ut_ad(offset < log_sys.file_size);
+
+  ut_d(std::set<page_id_t> freed);
+#if 0 && defined UNIV_DEBUG /* MDEV-21727 FIXME: enable this */
+  /* Pages that have been modified in this mini-transaction.
+  If a mini-transaction writes INIT_PAGE for a page, it should not have
+  written any log records for the page. Unfortunately, this does not
+  hold for ROW_FORMAT=COMPRESSED pages, because page_zip_compress()
+  can be invoked in a pessimistic operation, even after log has
+  been written for other pages. */
+  ut_d(std::set<page_id_t> modified);
+#endif
+
+  uint32_t space_id= 0, page_no= 0, last_offset= 0;
+  bool got_page_op= false;
+
+  for (l= begin;; l+= rlen)
+  {
+    const source recs{l};
+    ++l;
+    const byte b= *recs;
+
+    if (b <= 1)
+      break;
+
+    if (UNIV_LIKELY((b & 0x70) != RESERVED));
+    else if (srv_force_recovery)
+      sql_print_warning("InnoDB: Ignoring unknown log record at LSN " LSN_PF,
+                        lsn);
+    else
+    {
+      sql_print_error("InnoDB: Unknown log record at LSN " LSN_PF, lsn);
+    corrupted:
+      found_corrupt_log= true;
+      return GOT_EOF;
+    }
+
+    rlen= b & 0xf;
+    if (!rlen)
+    {
+      const uint32_t lenlen= mlog_decode_varint_length(*l);
+      const uint32_t addlen= mlog_decode_varint(l);
+      ut_ad(addlen != MLOG_DECODE_ERROR);
+      rlen= addlen + 15 - lenlen;
+      l+= lenlen;
+    }
+    ut_ad(!l.is_eof(rlen));
+
+    uint32_t idlen;
+    if ((b & 0x80) && got_page_op)
+    {
+      /* This record is for the same page as the previous one.
*/ + if (UNIV_UNLIKELY((b & 0x70) <= INIT_PAGE)) + { + record_corrupted: + /* FREE_PAGE,INIT_PAGE cannot be with same_page flag */ + if (!srv_force_recovery) + { + malformed: + sql_print_error("InnoDB: Malformed log record at LSN " LSN_PF + "; set innodb_force_recovery=1 to ignore.", lsn); + goto corrupted; + } + sql_print_warning("InnoDB: Ignoring malformed log record at LSN " + LSN_PF, lsn); + last_offset= 1; /* the next record must not be same_page */ + continue; + } + if (srv_operation == SRV_OPERATION_BACKUP) + continue; + DBUG_PRINT("ib_log", + ("scan " LSN_PF ": rec %x len %zu page %u:%u", + lsn, b, l - recs + rlen, space_id, page_no)); + goto same_page; + } + last_offset= 0; + idlen= mlog_decode_varint_length(*l); + if (UNIV_UNLIKELY(idlen > 5 || idlen >= rlen)) + { + if (!*l && b == FILE_CHECKPOINT + 1) + continue; + page_id_corrupted: + if (!srv_force_recovery) + { + sql_print_error("InnoDB: Corrupted page identifier at " LSN_PF + "; set innodb_force_recovery=1 to ignore the record.", + lsn); + goto corrupted; + } + sql_print_warning("InnoDB: Ignoring corrupted page identifier at LSN " + LSN_PF, lsn); + continue; + } + space_id= mlog_decode_varint(l); + if (UNIV_UNLIKELY(space_id == MLOG_DECODE_ERROR)) + goto page_id_corrupted; + l+= idlen; + rlen-= idlen; + idlen= mlog_decode_varint_length(*l); + if (UNIV_UNLIKELY(idlen > 5 || idlen > rlen)) + goto page_id_corrupted; + page_no= mlog_decode_varint(l); + if (UNIV_UNLIKELY(page_no == MLOG_DECODE_ERROR)) + goto page_id_corrupted; + l+= idlen; + rlen-= idlen; + mach_write_to_4(iv + 8, space_id); + mach_write_to_4(iv + 12, page_no); + got_page_op= !(b & 0x80); + if (!got_page_op); + else if (!store && srv_operation == SRV_OPERATION_BACKUP) + { + if (page_no == 0 && first_page_init && (b & 0x10)) + first_page_init(space_id); + continue; + } + else if (store && file_checkpoint && !is_predefined_tablespace(space_id)) + { + recv_spaces_t::iterator i= recv_spaces.lower_bound(space_id); + if (i != recv_spaces.end() && i->first == space_id); + else if (lsn < file_checkpoint) + /* We have not seen all records between the checkpoint and + FILE_CHECKPOINT. There should be a FILE_DELETE for this + tablespace later. 
*/ + recv_spaces.emplace_hint(i, space_id, file_name_t("", false)); + else + { + const page_id_t id(space_id, page_no); + if (!srv_force_recovery) + { + ib::error() << "Missing FILE_DELETE or FILE_MODIFY for " << id + << " at " << lsn + << "; set innodb_force_recovery=1 to ignore the record."; + goto corrupted; + } + ib::warn() << "Ignoring record for " << id << " at " << lsn; + continue; + } + } + DBUG_PRINT("ib_log", + ("scan " LSN_PF ": rec %x len %zu page %u:%u", + lsn, b, l - recs + rlen, space_id, page_no)); + if (got_page_op) + { + same_page: + const byte *cl= l.ptr; + if (!rlen); + else if (UNIV_UNLIKELY(l - recs + rlen > srv_page_size)) + goto record_corrupted; + const page_id_t id{space_id, page_no}; + ut_d(if ((b & 0x70) == INIT_PAGE || (b & 0x70) == OPTION) + freed.erase(id)); + ut_ad(freed.find(id) == freed.end()); + switch (b & 0x70) { + case FREE_PAGE: + ut_ad(freed.emplace(id).second); + last_offset= 1; /* the next record must not be same_page */ + goto free_or_init_page; + case INIT_PAGE: + last_offset= FIL_PAGE_TYPE; + free_or_init_page: + if (store) + store_freed_or_init_rec(id, (b & 0x70) == FREE_PAGE); + if (UNIV_UNLIKELY(rlen != 0)) + goto record_corrupted; + copy_if_needed: + cl= l.copy_if_needed(iv, decrypt_buf, recs, rlen); + break; + case EXTENDED: + if (UNIV_UNLIKELY(!rlen)) + goto record_corrupted; + cl= l.copy_if_needed(iv, decrypt_buf, recs, rlen); + if (rlen == 1 && *cl == TRIM_PAGES) + { +#if 0 /* For now, we can only truncate an undo log tablespace */ + if (UNIV_UNLIKELY(!space_id || !page_no)) + goto record_corrupted; +#else + if (!srv_is_undo_tablespace(space_id) || + page_no != SRV_UNDO_TABLESPACE_SIZE_IN_PAGES) + goto record_corrupted; + static_assert(UT_ARR_SIZE(truncated_undo_spaces) == + TRX_SYS_MAX_UNDO_SPACES, "compatibility"); + /* The entire undo tablespace will be reinitialized by + innodb_undo_log_truncate=ON. Discard old log for all pages. 
*/ + trim({space_id, 0}, start_lsn); + truncated_undo_spaces[space_id - srv_undo_space_id_start]= + { start_lsn, page_no }; + if (!store && undo_space_trunc) + undo_space_trunc(space_id); +#endif + last_offset= 1; /* the next record must not be same_page */ + continue; + } + last_offset= FIL_PAGE_TYPE; + break; + case OPTION: + if (rlen == 5 && *l == OPT_PAGE_CHECKSUM) + goto copy_if_needed; + /* fall through */ + case RESERVED: + continue; + case WRITE: + case MEMMOVE: + case MEMSET: + if (UNIV_UNLIKELY(rlen == 0 || last_offset == 1)) + goto record_corrupted; + ut_d(const source payload{l}); + cl= l.copy_if_needed(iv, decrypt_buf, recs, rlen); + const uint32_t olen= mlog_decode_varint_length(*cl); + if (UNIV_UNLIKELY(olen >= rlen) || UNIV_UNLIKELY(olen > 3)) + goto record_corrupted; + const uint32_t offset= mlog_decode_varint(cl); + ut_ad(offset != MLOG_DECODE_ERROR); + static_assert(FIL_PAGE_OFFSET == 4, "compatibility"); + if (UNIV_UNLIKELY(offset >= srv_page_size)) + goto record_corrupted; + last_offset+= offset; + if (UNIV_UNLIKELY(last_offset < 8 || last_offset >= srv_page_size)) + goto record_corrupted; + cl+= olen; + rlen-= olen; + if ((b & 0x70) == WRITE) + { + if (UNIV_UNLIKELY(rlen + last_offset > srv_page_size)) + goto record_corrupted; + if (store && UNIV_UNLIKELY(!page_no) && file_checkpoint) + { + const bool has_size= last_offset <= FSP_HEADER_OFFSET + FSP_SIZE && + last_offset + rlen >= FSP_HEADER_OFFSET + FSP_SIZE + 4; + const bool has_flags= last_offset <= + FSP_HEADER_OFFSET + FSP_SPACE_FLAGS && + last_offset + rlen >= FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + 4; + if (has_size || has_flags) + { + recv_spaces_t::iterator it= recv_spaces.find(space_id); + const uint32_t size= has_size + ? mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE + cl - + last_offset) + : 0; + const uint32_t flags= has_flags + ? 
mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + cl - + last_offset) + : file_name_t::initial_flags; + if (it == recv_spaces.end()) + ut_ad(!file_checkpoint || space_id == TRX_SYS_SPACE || + srv_is_undo_tablespace(space_id)); + else if (!it->second.space) + { + if (has_size) + it->second.size= size; + if (has_flags) + it->second.flags= flags; + } + fil_space_set_recv_size_and_flags(space_id, size, flags); + } + } + parsed_ok: + last_offset+= rlen; + ut_ad(l == payload); + if (!l.set_if_contains(cl)) + (l= recs)+= cl - decrypt_buf; + break; + } + uint32_t llen= mlog_decode_varint_length(*cl); + if (UNIV_UNLIKELY(llen > rlen || llen > 3)) + goto record_corrupted; + const uint32_t len= mlog_decode_varint(cl); + ut_ad(len != MLOG_DECODE_ERROR); + if (UNIV_UNLIKELY(last_offset + len > srv_page_size)) + goto record_corrupted; + cl+= llen; + rlen-= llen; + llen= len; + if ((b & 0x70) == MEMSET) + { + if (UNIV_UNLIKELY(rlen > llen)) + goto record_corrupted; + goto parsed_ok; + } + const uint32_t slen= mlog_decode_varint_length(*cl); + if (UNIV_UNLIKELY(slen != rlen || slen > 3)) + goto record_corrupted; + uint32_t s= mlog_decode_varint(cl); + ut_ad(slen != MLOG_DECODE_ERROR); + if (s & 1) + s= last_offset - (s >> 1) - 1; + else + s= last_offset + (s >> 1) + 1; + if (UNIV_UNLIKELY(s < 8 || s + llen > srv_page_size)) + goto record_corrupted; + goto parsed_ok; + } +#if 0 && defined UNIV_DEBUG + switch (b & 0x70) { + case RESERVED: + ut_ad(0); /* we did "continue" earlier */ + break; + case OPTION: + case FREE_PAGE: + break; + default: + ut_ad(modified.emplace(id).second || (b & 0x70) != INIT_PAGE); + } +#endif + if (store) + { + if (if_exists) + { + if (fil_space_t *space= fil_space_t::get(space_id)) + { + const auto size= space->get_size(); + space->release(); + if (!size) + continue; + } + else if (!deferred_spaces.find(space_id)) + continue; + } + if (!mlog_init.will_avoid_read(id, start_lsn)) + { + if (pages_it == pages.end() || pages_it->first != id) + pages_it= pages.emplace(id, page_recv_t{}).first; + if (UNIV_UNLIKELY(add(pages_it, start_lsn, lsn, + l.get_buf(cl, recs, decrypt_buf), + l - recs + rlen))) + { + lsn= start_lsn; + log_sys.set_recovered_lsn(start_lsn); + l+= rlen; + offset= begin.ptr - log_sys.buf; + rewind(l, begin); + if (if_exists) + { + apply(false); + if (is_corrupt_fs()) + return GOT_EOF; + goto restart; + } + sql_print_information("InnoDB: Multi-batch recovery needed at LSN " + LSN_PF, lsn); + return GOT_OOM; + } + } + } + else if ((b & 0x70) <= INIT_PAGE) + { + mlog_init.add(id, start_lsn); + if (pages_it == pages.end() || pages_it->first != id) + { + pages_it= pages.find(id); + if (pages_it == pages.end()) + continue; + } + map::iterator r= pages_it++; + erase(r); + } + } + else if (rlen) + { + switch (b & 0xf0) { + case FILE_CHECKPOINT: + if (space_id || page_no || l[rlen] > 1); + else if (rlen != 8) + { + if (rlen < UNIV_PAGE_SIZE_MAX && !l.is_zero(rlen)) + continue; + } + else if (store) + { + ut_ad(file_checkpoint); + continue; + } + else if (const lsn_t c= l.read8()) + { + if (UNIV_UNLIKELY(srv_print_verbose_log == 2)) + fprintf(stderr, "FILE_CHECKPOINT(" LSN_PF ") %s at " LSN_PF "\n", + c, c != log_sys.next_checkpoint_lsn + ? "ignored" : file_checkpoint ? "reread" : "read", lsn); + + DBUG_PRINT("ib_log", + ("FILE_CHECKPOINT(" LSN_PF ") %s at " LSN_PF, + c, c != log_sys.next_checkpoint_lsn + ? "ignored" : file_checkpoint ? "reread" : "read", lsn)); + + if (c == log_sys.next_checkpoint_lsn) + { + /* There can be multiple FILE_CHECKPOINT for the same LSN. 
*/
+          if (file_checkpoint)
+            continue;
+          file_checkpoint= lsn;
+          return GOT_EOF;
+        }
+        continue;
+      }
+      else
+        continue;
+      /* fall through */
+    default:
+      if (!srv_force_recovery)
+        goto malformed;
+      sql_print_warning("InnoDB: Ignoring malformed log record at LSN "
+                        LSN_PF, lsn);
+      continue;
+    case FILE_DELETE:
+    case FILE_MODIFY:
+    case FILE_RENAME:
+      if (UNIV_UNLIKELY(page_no != 0))
+      {
+      file_rec_error:
+        if (!srv_force_recovery)
+        {
+          sql_print_error("InnoDB: Corrupted file-level record;"
+                          " set innodb_force_recovery=1 to ignore.");
+          goto corrupted;
+        }
+
+        sql_print_warning("InnoDB: Ignoring corrupted file-level record"
+                          " at LSN " LSN_PF, lsn);
+        continue;
+      }
+      /* fall through */
+    case FILE_CREATE:
+      if (UNIV_UNLIKELY(!space_id || page_no))
+        goto file_rec_error;
+      /* There is no terminating NUL character. Names must end in .ibd.
+      For FILE_RENAME, there is a NUL between the two file names. */
+
+      const char * const fn= l.get_filename(decrypt_buf, rlen);
+      const char *fn2= static_cast<const char*>(memchr(fn, 0, rlen));
+
+      if (UNIV_UNLIKELY((fn2 == nullptr) == ((b & 0xf0) == FILE_RENAME)))
+        goto file_rec_error;
+
+      const char * const fnend= fn2 ? fn2 : fn + rlen;
+      const char * const fn2end= fn2 ? fn + rlen : nullptr;
+
+      if (fn2)
+      {
+        fn2++;
+        if (memchr(fn2, 0, fn2end - fn2))
+          goto file_rec_error;
+        if (fn2end - fn2 < 4 || memcmp(fn2end - 4, DOT_IBD, 4))
+          goto file_rec_error;
+      }
+
+      if (is_predefined_tablespace(space_id))
+        goto file_rec_error;
+      if (fnend - fn < 4 || memcmp(fnend - 4, DOT_IBD, 4))
+        goto file_rec_error;
+
+      if (UNIV_UNLIKELY(!recv_needed_recovery && srv_read_only_mode))
+        continue;
+
+      if (!store &&
+          (srv_operation == SRV_OPERATION_BACKUP ||
+           srv_operation == SRV_OPERATION_BACKUP_NO_DEFER))
+      {
+        if ((b & 0xf0) < FILE_CHECKPOINT && log_file_op)
+          log_file_op(space_id, b & 0xf0,
+                      reinterpret_cast<const byte*>(fn),
+                      static_cast<ulint>(fnend - fn),
+                      reinterpret_cast<const byte*>(fn2),
+                      fn2 ? static_cast<ulint>(fn2end - fn2) : 0);
+        continue;
+      }
+
+      fil_name_process(fn, fnend - fn, space_id,
+                       (b & 0xf0) == FILE_DELETE ? FILE_DELETE : FILE_MODIFY,
+                       start_lsn, if_exists);
+
+      if (fn2)
+      {
+        fil_name_process(fn2, fn2end - fn2, space_id,
+                         FILE_RENAME, start_lsn, if_exists);
+        if (file_checkpoint)
+        {
+          const size_t len= fn2end - fn2;
+          auto r= renamed_spaces.emplace(space_id, std::string{fn2, len});
+          if (!r.second)
+            r.first->second= std::string{fn2, len};
+        }
+      }
+
+      if (is_corrupt_fs())
+        return GOT_EOF;
+    }
+    }
+    else if (b == FILE_CHECKPOINT + 2 && !space_id && !page_no);
+    else
+      goto malformed;
+  }
+
+  l+= log_sys.is_encrypted() ? 4U + 8U : 4U;
+  ut_ad(l == el);
+  return OK;
+}
+
+template<bool store>
+recv_sys_t::parse_mtr_result recv_sys_t::parse_mtr(bool if_exists) noexcept
+{
+  recv_buf s{&log_sys.buf[recv_sys.offset]};
+  return recv_sys.parse<store>(s, if_exists);
+}
+
+/** for mariadb-backup; @see xtrabackup_copy_logfile() */
+template
+recv_sys_t::parse_mtr_result recv_sys_t::parse_mtr<false>(bool) noexcept;
+
+#ifdef HAVE_PMEM
+template<bool store>
+recv_sys_t::parse_mtr_result recv_sys_t::parse_pmem(bool if_exists) noexcept
+{
+  recv_sys_t::parse_mtr_result r{parse_mtr<store>(if_exists)};
+  if (UNIV_LIKELY(r != PREMATURE_EOF) || !log_sys.is_pmem())
+    return r;
+  ut_ad(recv_sys.len == log_sys.file_size);
+  ut_ad(recv_sys.offset >= log_sys.START_OFFSET);
+  ut_ad(recv_sys.offset <= recv_sys.len);
+  recv_ring s
+    {recv_sys.offset == recv_sys.len
+     ? &log_sys.buf[log_sys.START_OFFSET]
+     : &log_sys.buf[recv_sys.offset]};
+  return recv_sys.parse<store>(s, if_exists);
+}
+#endif
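+
+/* Added usage note (inferred; not an upstream comment): the parser is
+instantiated in two flavours. parse_mtr<false>() above only scans and
+validates the log, which is what mariadb-backup needs while copying
+ib_logfile0; the server's crash recovery additionally uses the store=true
+flavour, in which parse() buffers each record in recv_sys.pages so that a
+later apply() batch can replay it against the data pages. */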
+
+/** Apply the hashed log records to the page, if the page lsn is less than the
+lsn of a log record.
+@param[in,out]	block	buffer pool page
+@param[in,out]	mtr	mini-transaction
+@param[in,out]	recs	log records to apply
+@param[in,out]	space	tablespace, or NULL if not looked up yet
+@param[in,out]	init	page initialization operation, or NULL
+@return the recovered page
+@retval nullptr on failure */
+static buf_block_t *recv_recover_page(buf_block_t *block, mtr_t &mtr,
+				      page_recv_t &recs,
+				      fil_space_t *space,
+				      recv_init *init)
+{
+	mysql_mutex_assert_not_owner(&recv_sys.mutex);
+	ut_ad(recv_sys.apply_log_recs);
+	ut_ad(recv_needed_recovery);
+	ut_ad(!init || init->created);
+	ut_ad(!init || init->lsn);
+	ut_ad(recs.being_processed == 1);
+	ut_ad(!space || space->id == block->page.id().space());
+	ut_ad(log_sys.is_latest());
+
+	if (UNIV_UNLIKELY(srv_print_verbose_log == 2)) {
+		ib::info() << "Applying log to page " << block->page.id();
+	}
+
+	DBUG_PRINT("ib_log", ("Applying log to page %u:%u",
+			      block->page.id().space(),
+			      block->page.id().page_no()));
+
+	byte *frame = UNIV_LIKELY_NULL(block->page.zip.data)
+		? block->page.zip.data
+		: block->page.frame;
+	const lsn_t page_lsn = init
+		? 0
+		: mach_read_from_8(frame + FIL_PAGE_LSN);
+	bool free_page = false;
+	lsn_t start_lsn = 0, end_lsn = 0;
+	ut_d(lsn_t recv_start_lsn = 0);
+	const lsn_t init_lsn = init ? init->lsn : 0;
+
+	bool skipped_after_init = false;
+
+	for (const log_rec_t* recv : recs.log) {
+		const log_phys_t* l = static_cast<const log_phys_t*>(recv);
+		ut_ad(l->lsn);
+		ut_ad(end_lsn <= l->lsn);
+		ut_ad(l->lsn <= recv_sys.lsn);
+
+		ut_ad(l->start_lsn);
+		ut_ad(recv_start_lsn <= l->start_lsn);
+		ut_d(recv_start_lsn = l->start_lsn);
+
+		if (l->start_lsn < page_lsn) {
+			/* This record has already been applied. */
+			DBUG_PRINT("ib_log", ("apply skip %u:%u LSN " LSN_PF
+					      " < " LSN_PF,
+					      block->page.id().space(),
+					      block->page.id().page_no(),
+					      l->start_lsn, page_lsn));
+			skipped_after_init = true;
+			end_lsn = l->lsn;
+			continue;
+		}
+
+		if (l->start_lsn < init_lsn) {
+			DBUG_PRINT("ib_log", ("init skip %u:%u LSN " LSN_PF
+					      " < " LSN_PF,
+					      block->page.id().space(),
+					      block->page.id().page_no(),
+					      l->start_lsn, init_lsn));
+			skipped_after_init = false;
+			end_lsn = l->lsn;
+			continue;
+		}
+
+		/* There is no need to check LSN for just initialized pages. */
+		if (skipped_after_init) {
+			skipped_after_init = false;
+			ut_ad(end_lsn == page_lsn);
+			if (end_lsn != page_lsn) {
+				sql_print_warning(
+					"InnoDB: The last skipped log record"
+					" LSN " LSN_PF
+					" is not equal to page LSN " LSN_PF,
+					end_lsn, page_lsn);
+			}
+		}
+
+		end_lsn = l->lsn;
+
+		if (UNIV_UNLIKELY(srv_print_verbose_log == 2)) {
+			ib::info() << "apply " << l->start_lsn
+				   << ": " << block->page.id();
+		}
+
+		DBUG_PRINT("ib_log", ("apply " LSN_PF ": %u:%u",
+				      l->start_lsn,
+				      block->page.id().space(),
+				      block->page.id().page_no()));
+
+		log_phys_t::apply_status a= l->apply(*block, recs.last_offset);
+
+		switch (a) {
+		case log_phys_t::APPLIED_NO:
+			ut_ad(!mtr.has_modifications());
+			free_page = true;
+			start_lsn = 0;
+			continue;
+		case log_phys_t::APPLIED_YES:
+		case log_phys_t::APPLIED_CORRUPTED:
+			goto set_start_lsn;
+		case log_phys_t::APPLIED_TO_FSP_HEADER:
+		case log_phys_t::APPLIED_TO_ENCRYPTION:
+			break;
+		}
+
+		if (fil_space_t* s = space
+		    ?
space + : fil_space_t::get(block->page.id().space())) { + switch (a) { + case log_phys_t::APPLIED_TO_FSP_HEADER: + s->flags = mach_read_from_4( + FSP_HEADER_OFFSET + + FSP_SPACE_FLAGS + frame); + s->size_in_header = mach_read_from_4( + FSP_HEADER_OFFSET + FSP_SIZE + + frame); + s->free_limit = mach_read_from_4( + FSP_HEADER_OFFSET + + FSP_FREE_LIMIT + frame); + s->free_len = mach_read_from_4( + FSP_HEADER_OFFSET + FSP_FREE + + FLST_LEN + frame); + break; + default: + byte* b= frame + + fsp_header_get_encryption_offset( + block->zip_size()) + + FSP_HEADER_OFFSET; + if (memcmp(b, CRYPT_MAGIC, MAGIC_SZ)) { + break; + } + b += MAGIC_SZ; + if (*b != CRYPT_SCHEME_UNENCRYPTED + && *b != CRYPT_SCHEME_1) { + break; + } + if (b[1] != MY_AES_BLOCK_SIZE) { + break; + } + if (b[2 + MY_AES_BLOCK_SIZE + 4 + 4] + > FIL_ENCRYPTION_OFF) { + break; + } + fil_crypt_parse(s, b); + } + + if (!space) { + s->release(); + } + } + +set_start_lsn: + if ((a == log_phys_t::APPLIED_CORRUPTED + || recv_sys.is_corrupt_log()) && !srv_force_recovery) { + if (init) { + init->created = false; + } + + mtr.discard_modifications(); + mtr.commit(); + + buf_pool.corrupted_evict(&block->page, + block->page.state() & + buf_page_t::LRU_MASK); + block = nullptr; + goto done; + } + + if (!start_lsn) { + start_lsn = l->start_lsn; + } + } + + if (start_lsn) { + ut_ad(end_lsn >= start_lsn); + ut_ad(!block->page.oldest_modification()); + mach_write_to_8(FIL_PAGE_LSN + frame, end_lsn); + if (UNIV_LIKELY(!block->page.zip.data)) { + mach_write_to_8(srv_page_size + - FIL_PAGE_END_LSN_OLD_CHKSUM + + frame, end_lsn); + } else { + buf_zip_decompress(block, false); + } + /* The following is adapted from + buf_pool_t::insert_into_flush_list() */ + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_pool.flush_list_bytes+= block->physical_size(); + block->page.set_oldest_modification(start_lsn); + UT_LIST_ADD_FIRST(buf_pool.flush_list, &block->page); + buf_pool.page_cleaner_wakeup(); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + } else if (free_page && init) { + /* There have been no operations that modify the page. + Any buffered changes must not be merged. A subsequent + buf_page_create() from a user thread should discard + any buffered changes. */ + init->created = false; + ut_ad(!mtr.has_modifications()); + block->page.set_freed(block->page.state()); + } + + /* Make sure that committing mtr does not change the modification + lsn values of page */ + + mtr.discard_modifications(); + mtr.commit(); + +done: + /* FIXME: do this in page read, protected with recv_sys.mutex! */ + if (recv_max_page_lsn < page_lsn) { + recv_max_page_lsn = page_lsn; + } + + return block; +} + +/** Remove records for a corrupted page. +This function should only be called when innodb_force_recovery is set. 
+@param page_id corrupted page identifier */ +ATTRIBUTE_COLD void recv_sys_t::free_corrupted_page(page_id_t page_id) +{ + if (!recovery_on) + return; + + mysql_mutex_lock(&mutex); + map::iterator p= pages.find(page_id); + if (p == pages.end()) + { + mysql_mutex_unlock(&mutex); + return; + } + + p->second.being_processed= -1; + if (!srv_force_recovery) + set_corrupt_fs(); + mysql_mutex_unlock(&mutex); + + ib::error_or_warn(!srv_force_recovery) + << "Unable to apply log to corrupted page " << page_id; +} + +ATTRIBUTE_COLD void recv_sys_t::set_corrupt_log() +{ + mysql_mutex_lock(&mutex); + found_corrupt_log= true; + mysql_mutex_unlock(&mutex); +} + +ATTRIBUTE_COLD void recv_sys_t::set_corrupt_fs() +{ + mysql_mutex_assert_owner(&mutex); + if (!srv_force_recovery) + sql_print_information("InnoDB: Set innodb_force_recovery=1" + " to ignore corrupted pages."); + found_corrupt_fs= true; +} + +/** Apply any buffered redo log to a page. +@param space tablespace +@param bpage buffer pool page +@return whether the page was recovered correctly */ +bool recv_recover_page(fil_space_t* space, buf_page_t* bpage) +{ + mtr_t mtr; + mtr.start(); + mtr.set_log_mode(MTR_LOG_NO_REDO); + + ut_ad(bpage->frame); + /* Move the ownership of the x-latch on the page to this OS thread, + so that we can acquire a second x-latch on it. This is needed for + the operations to the page to pass the debug checks. */ + bpage->lock.claim_ownership(); + bpage->lock.x_lock_recursive(); + bpage->fix_on_recovery(); + mtr.memo_push(reinterpret_cast(bpage), MTR_MEMO_PAGE_X_FIX); + + buf_block_t *success= reinterpret_cast(bpage); + + mysql_mutex_lock(&recv_sys.mutex); + if (recv_sys.apply_log_recs) + { + const page_id_t id{bpage->id()}; + recv_sys_t::map::iterator p= recv_sys.pages.find(id); + if (p == recv_sys.pages.end()); + else if (p->second.being_processed < 0) + { + recv_sys.pages_it_invalidate(p); + recv_sys.erase(p); + } + else + { + p->second.being_processed= 1; + recv_sys_t::init *init= nullptr; + if (p->second.skip_read) + (init= &mlog_init.last(id))->created= true; + mysql_mutex_unlock(&recv_sys.mutex); + success= recv_recover_page(success, mtr, p->second, space, init); + p->second.being_processed= -1; + goto func_exit; + } + } + + mysql_mutex_unlock(&recv_sys.mutex); + mtr.commit(); +func_exit: + ut_ad(mtr.has_committed()); + return success; +} + +void IORequest::fake_read_complete(os_offset_t offset) const +{ + ut_ad(node); + ut_ad(is_read()); + ut_ad(bpage); + ut_ad(bpage->frame); + ut_ad(recv_recovery_is_on()); + ut_ad(offset); + + mtr_t mtr; + mtr.start(); + mtr.set_log_mode(MTR_LOG_NO_REDO); + + ut_ad(bpage->frame); + /* Move the ownership of the x-latch on the page to this OS thread, + so that we can acquire a second x-latch on it. This is needed for + the operations to the page to pass the debug checks. 
*/
+  bpage->lock.claim_ownership();
+  bpage->lock.x_lock_recursive();
+  bpage->fix_on_recovery();
+  mtr.memo_push(reinterpret_cast<buf_block_t*>(bpage), MTR_MEMO_PAGE_X_FIX);
+
+  page_recv_t &recs= *reinterpret_cast<page_recv_t*>(slot);
+  ut_ad(recs.being_processed == 1);
+  recv_init &init= *reinterpret_cast<recv_init*>(offset);
+  ut_ad(init.lsn > 1);
+  init.created= true;
+
+  if (recv_recover_page(reinterpret_cast<buf_block_t*>(bpage),
+                        mtr, recs, node->space, &init))
+  {
+    ut_ad(bpage->oldest_modification() || bpage->is_freed());
+    bpage->lock.x_unlock(true);
+  }
+  recs.being_processed= -1;
+  ut_ad(mtr.has_committed());
+
+  node->space->release();
+}
+
+/** @return whether a page has been freed */
+inline bool fil_space_t::is_freed(uint32_t page)
+{
+  std::lock_guard<std::mutex> freed_lock(freed_range_mutex);
+  return freed_ranges.contains(page);
+}
+
+bool recv_sys_t::report(time_t time)
+{
+  if (time - progress_time < 15)
+    return false;
+  progress_time= time;
+  return true;
+}
+
+ATTRIBUTE_COLD
+void recv_sys_t::report_progress() const
+{
+  mysql_mutex_assert_owner(&mutex);
+  const size_t n{pages.size()};
+  if (recv_sys.scanned_lsn == recv_sys.lsn)
+  {
+    sql_print_information("InnoDB: To recover: %zu pages", n);
+    service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
+                                   "To recover: %zu pages", n);
+  }
+  else
+  {
+    sql_print_information("InnoDB: To recover: LSN " LSN_PF
+                          "/" LSN_PF "; %zu pages",
+                          recv_sys.lsn, recv_sys.scanned_lsn, n);
+    service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
+                                   "To recover: LSN " LSN_PF
+                                   "/" LSN_PF "; %zu pages",
+                                   recv_sys.lsn, recv_sys.scanned_lsn, n);
+  }
+}
+
+/** Apply a recovery batch.
+@param space_id   current tablespace identifier
+@param space      current tablespace
+@param free_block spare buffer block
+@param last_batch whether it is possible to write more redo log
+@return whether the caller must provide a new free_block */
+bool recv_sys_t::apply_batch(uint32_t space_id, fil_space_t *&space,
+                             buf_block_t *&free_block, bool last_batch)
+{
+  mysql_mutex_assert_owner(&mutex);
+  ut_ad(pages_it != pages.end());
+  ut_ad(!pages_it->second.log.empty());
+
+  mysql_mutex_lock(&buf_pool.mutex);
+  size_t n= 0, max_n= std::min<size_t>(BUF_LRU_MIN_LEN,
+                                       UT_LIST_GET_LEN(buf_pool.LRU) +
+                                       UT_LIST_GET_LEN(buf_pool.free));
+  mysql_mutex_unlock(&buf_pool.mutex);
+
+  map::iterator begin= pages.end();
+  page_id_t begin_id{~0ULL};
+
+  while (pages_it != pages.end() && n < max_n)
+  {
+    ut_ad(!buf_dblwr.is_inside(pages_it->first));
+    if (!pages_it->second.being_processed)
+    {
+      if (space_id != pages_it->first.space())
+      {
+        space_id= pages_it->first.space();
+        if (space)
+          space->release();
+        space= fil_space_t::get(space_id);
+        if (!space)
+        {
+          auto d= deferred_spaces.defers.find(space_id);
+          if (d == deferred_spaces.defers.end() || d->second.deleted)
+            /* For deleted files we preserve the deferred_spaces entry */;
+          else if (!free_block)
+            return true;
+          else
+          {
+            space= recover_deferred(pages_it, d->second.file_name, free_block);
+            deferred_spaces.defers.erase(d);
+            if (!space && !srv_force_recovery)
+            {
+              set_corrupt_fs();
+              return false;
+            }
+          }
+        }
+      }
+      if (!space || space->is_freed(pages_it->first.page_no()))
+        pages_it->second.being_processed= -1;
+      else if (!n++)
+      {
+        begin= pages_it;
+        begin_id= pages_it->first;
+      }
+    }
+    pages_it++;
+  }
+
+  if (!last_batch)
+    log_sys.latch.wr_unlock();
+
+  pages_it= begin;
+
+  if (report(time(nullptr)))
+    report_progress();
+
+  if (!n)
+    goto wait;
+
+  mysql_mutex_lock(&buf_pool.mutex);
+
+  if (UNIV_UNLIKELY(UT_LIST_GET_LEN(buf_pool.free) < n))
+  {
mysql_mutex_unlock(&buf_pool.mutex); + wait: + wait_for_pool(n); + if (n); + else if (!last_batch) + goto unlock_relock; + else + goto get_last; + pages_it= pages.lower_bound(begin_id); + ut_ad(pages_it != pages.end()); + } + else + mysql_mutex_unlock(&buf_pool.mutex); + + while (pages_it != pages.end()) + { + ut_ad(!buf_dblwr.is_inside(pages_it->first)); + if (!pages_it->second.being_processed) + { + const page_id_t id{pages_it->first}; + + if (space_id != id.space()) + { + space_id= id.space(); + if (space) + space->release(); + space= fil_space_t::get(space_id); + } + if (!space) + { + const auto it= deferred_spaces.defers.find(space_id); + if (it != deferred_spaces.defers.end() && !it->second.deleted) + /* The records must be processed after recover_deferred(). */ + goto next; + goto space_not_found; + } + else if (space->is_freed(id.page_no())) + { + space_not_found: + pages_it->second.being_processed= -1; + goto next; + } + else + { + page_recv_t &recs= pages_it->second; + ut_ad(!recs.log.empty()); + recs.being_processed= 1; + init *init= recs.skip_read ? &mlog_init.last(id) : nullptr; + mysql_mutex_unlock(&mutex); + buf_read_recover(space, id, recs, init); + } + + if (!--n) + { + if (last_batch) + goto relock_last; + goto relock; + } + mysql_mutex_lock(&mutex); + pages_it= pages.lower_bound(id); + } + else + next: + pages_it++; + } + + if (!last_batch) + { + unlock_relock: + mysql_mutex_unlock(&mutex); + relock: + log_sys.latch.wr_lock(SRW_LOCK_CALL); + relock_last: + mysql_mutex_lock(&mutex); + get_last: + pages_it= pages.lower_bound(begin_id); + } + + return false; +} + +/** Attempt to initialize a page based on redo log records. +@param p iterator +@param mtr mini-transaction +@param b pre-allocated buffer pool block +@param init page initialization +@return the recovered block +@retval nullptr if the page cannot be initialized based on log records +@retval -1 if the page cannot be recovered due to corruption */ +inline buf_block_t *recv_sys_t::recover_low(const map::iterator &p, mtr_t &mtr, + buf_block_t *b, init &init) +{ + mysql_mutex_assert_not_owner(&mutex); + page_recv_t &recs= p->second; + ut_ad(recs.skip_read); + ut_ad(recs.being_processed == 1); + buf_block_t* block= nullptr; + const lsn_t end_lsn= recs.log.last()->lsn; + if (end_lsn < init.lsn) + DBUG_LOG("ib_log", "skip log for page " << p->first + << " LSN " << end_lsn << " < " << init.lsn); + fil_space_t *space= fil_space_t::get(p->first.space()); + + mtr.start(); + mtr.set_log_mode(MTR_LOG_NO_REDO); + + ulint zip_size= space ? space->zip_size() : 0; + + if (!space) + { + if (p->first.page_no() != 0) + { + nothing_recoverable: + mtr.commit(); + return nullptr; + } + auto it= recv_spaces.find(p->first.space()); + ut_ad(it != recv_spaces.end()); + uint32_t flags= it->second.flags; + zip_size= fil_space_t::zip_size(flags); + block= buf_page_create_deferred(p->first.space(), zip_size, &mtr, b); + ut_ad(block == b); + block->page.lock.x_lock_recursive(); + } + else + { + block= buf_page_create(space, p->first.page_no(), zip_size, &mtr, b); + + if (UNIV_UNLIKELY(block != b)) + { + /* The page happened to exist in the buffer pool, or it + was just being read in. Before the exclusive page latch was acquired by + buf_page_create(), all changes to the page must have been applied. 
*/
+      ut_d(mysql_mutex_lock(&mutex));
+      ut_ad(pages.find(p->first) == pages.end());
+      ut_d(mysql_mutex_unlock(&mutex));
+      space->release();
+      goto nothing_recoverable;
+    }
+  }
+
+  ut_d(mysql_mutex_lock(&mutex));
+  ut_ad(&recs == &pages.find(p->first)->second);
+  ut_d(mysql_mutex_unlock(&mutex));
+  init.created= true;
+  block= recv_recover_page(block, mtr, recs, space, &init);
+  ut_ad(mtr.has_committed());
+
+  if (space)
+    space->release();
+
+  return block ? block : reinterpret_cast<buf_block_t*>(-1);
+}
+
+/** Attempt to initialize a page based on redo log records.
+@param page_id  page identifier
+@return recovered block
+@retval nullptr if the page cannot be initialized based on log records */
+ATTRIBUTE_COLD buf_block_t *recv_sys_t::recover_low(const page_id_t page_id)
+{
+  mysql_mutex_lock(&mutex);
+  map::iterator p= pages.find(page_id);
+
+  if (p != pages.end() && !p->second.being_processed && p->second.skip_read)
+  {
+    p->second.being_processed= 1;
+    init &init= mlog_init.last(page_id);
+    mysql_mutex_unlock(&mutex);
+    buf_block_t *free_block= buf_LRU_get_free_block(false);
+    mtr_t mtr;
+    buf_block_t *block= recover_low(p, mtr, free_block, init);
+    p->second.being_processed= -1;
+    ut_ad(!block || block == reinterpret_cast<buf_block_t*>(-1) ||
+          block == free_block);
+    if (UNIV_UNLIKELY(!block))
+      buf_pool.free_block(free_block);
+    return block;
+  }
+
+  mysql_mutex_unlock(&mutex);
+  return nullptr;
+}
+
+inline fil_space_t *fil_system_t::find(const char *path) const
+{
+  mysql_mutex_assert_owner(&mutex);
+  for (fil_space_t &space : fil_system.space_list)
+    if (space.chain.start && !strcmp(space.chain.start->name, path))
+      return &space;
+  return nullptr;
+}
+
+/** Thread-safe function which sorts flush_list by oldest_modification */
+static void log_sort_flush_list()
+{
+  /* Ensure that oldest_modification() cannot change during std::sort() */
+  {
+    const double pct_lwm= srv_max_dirty_pages_pct_lwm;
+    /* Disable "idle" flushing in order to minimize the wait time below. */
+    srv_max_dirty_pages_pct_lwm= 0.0;
+
+    for (;;)
+    {
+      os_aio_wait_until_no_pending_writes(false);
+      mysql_mutex_lock(&buf_pool.flush_list_mutex);
+      if (buf_pool.page_cleaner_active())
+        my_cond_wait(&buf_pool.done_flush_list,
+                     &buf_pool.flush_list_mutex.m_mutex);
+      else if (!os_aio_pending_writes())
+        break;
+      mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+    }
+
+    srv_max_dirty_pages_pct_lwm= pct_lwm;
+  }
+
+  const size_t size= UT_LIST_GET_LEN(buf_pool.flush_list);
+  std::unique_ptr<buf_page_t *[]> list(new buf_page_t *[size]);
+
+  /* Copy the dirty blocks from buf_pool.flush_list to an array for sorting. */
+  size_t idx= 0;
+  for (buf_page_t *p= UT_LIST_GET_FIRST(buf_pool.flush_list); p; )
+  {
+    const lsn_t lsn{p->oldest_modification()};
+    ut_ad(lsn > 2 || lsn == 1);
+    buf_page_t *n= UT_LIST_GET_NEXT(list, p);
+    if (lsn > 1)
+      list.get()[idx++]= p;
+    else
+      buf_pool.delete_from_flush_list(p);
+    p= n;
+  }
+
+  std::sort(list.get(), list.get() + idx,
+            [](const buf_page_t *lhs, const buf_page_t *rhs) {
+              const lsn_t l{lhs->oldest_modification()};
+              const lsn_t r{rhs->oldest_modification()};
+              DBUG_ASSERT(l > 2); DBUG_ASSERT(r > 2);
+              return r < l;
+            });
+
+  UT_LIST_INIT(buf_pool.flush_list, &buf_page_t::list);
+
+  for (size_t i= 0; i < idx; i++)
+  {
+    UT_LIST_ADD_LAST(buf_pool.flush_list, list[i]);
+    DBUG_ASSERT(list[i]->oldest_modification() > 2);
+  }
+
+  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+}
+
+/** Apply buffered log to persistent data pages.
+@param last_batch whether it is possible to write more redo log */ +void recv_sys_t::apply(bool last_batch) +{ + ut_ad(srv_operation <= SRV_OPERATION_EXPORT_RESTORED || + srv_operation == SRV_OPERATION_RESTORE || + srv_operation == SRV_OPERATION_RESTORE_EXPORT); + + mysql_mutex_assert_owner(&mutex); + + garbage_collect(); + + if (!pages.empty()) + { + recv_no_ibuf_operations = !last_batch || + srv_operation == SRV_OPERATION_RESTORE || + srv_operation == SRV_OPERATION_RESTORE_EXPORT; + ut_ad(!last_batch || lsn == scanned_lsn); + progress_time= time(nullptr); + report_progress(); + + apply_log_recs= true; + + for (auto id= srv_undo_tablespaces_open; id--;) + { + const trunc& t= truncated_undo_spaces[id]; + if (t.lsn) + { + /* The entire undo tablespace will be reinitialized by + innodb_undo_log_truncate=ON. Discard old log for all pages. + Even though we recv_sys_t::parse() already invoked trim(), + this will be needed in case recovery consists of multiple batches + (there was an invocation with !last_batch). */ + trim({id + srv_undo_space_id_start, 0}, t.lsn); + if (fil_space_t *space = fil_space_get(id + srv_undo_space_id_start)) + { + ut_ad(UT_LIST_GET_LEN(space->chain) == 1); + ut_ad(space->recv_size >= t.pages); + fil_node_t *file= UT_LIST_GET_FIRST(space->chain); + ut_ad(file->is_open()); + os_file_truncate(file->name, file->handle, + os_offset_t{space->recv_size} << + srv_page_size_shift, true); + } + } + } + + fil_system.extend_to_recv_size(); + + fil_space_t *space= nullptr; + uint32_t space_id= ~0; + buf_block_t *free_block= nullptr; + + for (pages_it= pages.begin(); pages_it != pages.end(); + pages_it= pages.begin()) + { + if (!free_block) + { + if (!last_batch) + log_sys.latch.wr_unlock(); + wait_for_pool(1); + pages_it= pages.begin(); + mysql_mutex_unlock(&mutex); + /* We must release log_sys.latch and recv_sys.mutex before + invoking buf_LRU_get_free_block(). Allocating a block may initiate + a redo log write and therefore acquire log_sys.latch. To avoid + deadlocks, log_sys.latch must not be acquired while holding + recv_sys.mutex. */ + free_block= buf_LRU_get_free_block(false); + if (!last_batch) + log_sys.latch.wr_lock(SRW_LOCK_CALL); + mysql_mutex_lock(&mutex); + pages_it= pages.begin(); + } + + while (pages_it != pages.end()) + { + if (is_corrupt_fs() || is_corrupt_log()) + { + if (space) + space->release(); + if (free_block) + { + mysql_mutex_unlock(&mutex); + mysql_mutex_lock(&buf_pool.mutex); + buf_LRU_block_free_non_file_page(free_block); + mysql_mutex_unlock(&buf_pool.mutex); + mysql_mutex_lock(&mutex); + } + return; + } + if (apply_batch(space_id, space, free_block, last_batch)) + break; + } + } + + if (space) + space->release(); + + if (free_block) + { + mysql_mutex_lock(&buf_pool.mutex); + buf_LRU_block_free_non_file_page(free_block); + mysql_mutex_unlock(&buf_pool.mutex); + } + } + + if (last_batch) + { + if (!recv_no_ibuf_operations) + /* We skipped this in buf_page_create(). */ + mlog_init.mark_ibuf_exist(); + mlog_init.clear(); + } + else + { + mlog_init.reset(); + log_sys.latch.wr_unlock(); + } + + mysql_mutex_unlock(&mutex); + + if (!last_batch) + { + buf_flush_sync_batch(lsn); + buf_pool_invalidate(); + log_sys.latch.wr_lock(SRW_LOCK_CALL); + } + else if (srv_operation == SRV_OPERATION_RESTORE || + srv_operation == SRV_OPERATION_RESTORE_EXPORT) + buf_flush_sync_batch(lsn); + else + /* Instead of flushing, last_batch sorts the buf_pool.flush_list + in ascending order of buf_page_t::oldest_modification. 
*/ + log_sort_flush_list(); + +#ifdef HAVE_PMEM + if (last_batch && log_sys.is_pmem()) + mprotect(log_sys.buf, len, PROT_READ | PROT_WRITE); +#endif + + mysql_mutex_lock(&mutex); + + ut_d(after_apply= true); + clear(); +} + +/** Scan log_t::FORMAT_10_8 log store records to the parsing buffer. +@param last_phase whether changes can be applied to the tablespaces +@return whether rescan is needed (not everything was stored) */ +static bool recv_scan_log(bool last_phase) +{ + DBUG_ENTER("recv_scan_log"); + + ut_ad(log_sys.is_latest()); + const size_t block_size_1{log_sys.get_block_size() - 1}; + + mysql_mutex_lock(&recv_sys.mutex); + ut_d(recv_sys.after_apply= last_phase); + if (!last_phase) + recv_sys.clear(); + else + ut_ad(recv_sys.file_checkpoint); + + bool store{recv_sys.file_checkpoint != 0}; + size_t buf_size= log_sys.buf_size; +#ifdef HAVE_PMEM + if (log_sys.is_pmem()) + { + recv_sys.offset= size_t(log_sys.calc_lsn_offset(recv_sys.lsn)); + buf_size= size_t(log_sys.file_size); + recv_sys.len= size_t(log_sys.file_size); + } + else +#endif + { + recv_sys.offset= size_t(recv_sys.lsn - log_sys.get_first_lsn()) & + block_size_1; + recv_sys.len= 0; + } + + lsn_t rewound_lsn= 0; + for (ut_d(lsn_t source_offset= 0);;) + { +#ifndef SUX_LOCK_GENERIC + ut_ad(log_sys.latch.is_write_locked()); +#endif +#ifdef UNIV_DEBUG + const bool wrap{source_offset + recv_sys.len == log_sys.file_size}; +#endif + if (size_t size= buf_size - recv_sys.len) + { +#ifndef UNIV_DEBUG + lsn_t +#endif + source_offset= + log_sys.calc_lsn_offset(recv_sys.lsn + recv_sys.len - recv_sys.offset); + ut_ad(!wrap || source_offset == log_t::START_OFFSET); + source_offset&= ~block_size_1; + + if (source_offset + size > log_sys.file_size) + size= static_cast(log_sys.file_size - source_offset); + + if (dberr_t err= log_sys.log.read(source_offset, + {log_sys.buf + recv_sys.len, size})) + { + mysql_mutex_unlock(&recv_sys.mutex); + ib::error() << "Failed to read log at " << source_offset + << ": " << err; + recv_sys.set_corrupt_log(); + mysql_mutex_lock(&recv_sys.mutex); + } + else + recv_sys.len+= size; + } + + if (recv_sys.report(time(nullptr))) + { + sql_print_information("InnoDB: Read redo log up to LSN=" LSN_PF, + recv_sys.lsn); + service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, + "Read redo log up to LSN=" LSN_PF, + recv_sys.lsn); + } + + recv_sys_t::parse_mtr_result r; + + if (UNIV_UNLIKELY(!recv_needed_recovery)) + { + ut_ad(!last_phase); + ut_ad(recv_sys.lsn >= log_sys.next_checkpoint_lsn); + + if (!store) + { + ut_ad(!recv_sys.file_checkpoint); + for (;;) + { + const byte& b{log_sys.buf[recv_sys.offset]}; + r= recv_sys.parse_pmem(false); + switch (r) { + case recv_sys_t::PREMATURE_EOF: + goto read_more; + default: + ut_ad(r == recv_sys_t::GOT_EOF); + break; + case recv_sys_t::OK: + if (b == FILE_CHECKPOINT + 2 + 8 || (b & 0xf0) == FILE_MODIFY) + continue; + } + + const lsn_t end{recv_sys.file_checkpoint}; + ut_ad(!end || end == recv_sys.lsn); + mysql_mutex_unlock(&recv_sys.mutex); + + if (!end) + { + recv_sys.set_corrupt_log(); + sql_print_error("InnoDB: Missing FILE_CHECKPOINT(" LSN_PF + ") at " LSN_PF, log_sys.next_checkpoint_lsn, + recv_sys.lsn); + } + DBUG_RETURN(true); + } + } + else + { + ut_ad(recv_sys.file_checkpoint != 0); + switch ((r= recv_sys.parse_pmem(false))) { + case recv_sys_t::PREMATURE_EOF: + goto read_more; + case recv_sys_t::GOT_EOF: + break; + default: + ut_ad(r == recv_sys_t::OK); + recv_needed_recovery= true; + if (srv_read_only_mode) + { + mysql_mutex_unlock(&recv_sys.mutex); + 
DBUG_RETURN(false); + } + sql_print_information("InnoDB: Starting crash recovery from" + " checkpoint LSN=" LSN_PF, + log_sys.next_checkpoint_lsn); + } + } + } + + if (!store) + skip_the_rest: + while ((r= recv_sys.parse_pmem(false)) == recv_sys_t::OK); + else + { + uint16_t count= 0; + while ((r= recv_sys.parse_pmem(last_phase)) == recv_sys_t::OK) + if (!++count && recv_sys.report(time(nullptr))) + { + const size_t n= recv_sys.pages.size(); + sql_print_information("InnoDB: Parsed redo log up to LSN=" LSN_PF + "; to recover: %zu pages", recv_sys.lsn, n); + service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, + "Parsed redo log up to LSN=" LSN_PF + "; to recover: %zu pages", + recv_sys.lsn, n); + } + if (r == recv_sys_t::GOT_OOM) + { + ut_ad(!last_phase); + rewound_lsn= recv_sys.lsn; + store= false; + if (recv_sys.scanned_lsn <= 1) + goto skip_the_rest; + ut_ad(recv_sys.file_checkpoint); + goto func_exit; + } + } + + if (r != recv_sys_t::PREMATURE_EOF) + { + ut_ad(r == recv_sys_t::GOT_EOF); + got_eof: + ut_ad(recv_sys.is_initialised()); + if (recv_sys.scanned_lsn > 1) + { + ut_ad(recv_sys.scanned_lsn == recv_sys.lsn); + break; + } + recv_sys.scanned_lsn= recv_sys.lsn; + sql_print_information("InnoDB: End of log at LSN=" LSN_PF, recv_sys.lsn); + break; + } + + read_more: +#ifdef HAVE_PMEM + if (log_sys.is_pmem()) + break; +#endif + if (recv_sys.is_corrupt_log()) + break; + + if (recv_sys.offset < log_sys.get_block_size() && + recv_sys.lsn == recv_sys.scanned_lsn) + goto got_eof; + + if (recv_sys.offset > buf_size / 4 || + (recv_sys.offset > block_size_1 && + recv_sys.len >= buf_size - recv_sys.MTR_SIZE_MAX)) + { + const size_t ofs{recv_sys.offset & ~block_size_1}; + memmove_aligned<64>(log_sys.buf, log_sys.buf + ofs, recv_sys.len - ofs); + recv_sys.len-= ofs; + recv_sys.offset&= block_size_1; + } + } + + if (last_phase) + { + ut_ad(!rewound_lsn); + ut_ad(recv_sys.lsn >= recv_sys.file_checkpoint); + log_sys.set_recovered_lsn(recv_sys.lsn); + } + else if (rewound_lsn) + { + ut_ad(!store); + ut_ad(recv_sys.file_checkpoint); + recv_sys.lsn= rewound_lsn; + } +func_exit: + mysql_mutex_unlock(&recv_sys.mutex); + DBUG_RETURN(!store); +} + +/** Report a missing tablespace for which page-redo log exists. +@param[in] err previous error code +@param[in] i tablespace descriptor +@return new error code */ +static +dberr_t +recv_init_missing_space(dberr_t err, const recv_spaces_t::const_iterator& i) +{ + switch (srv_operation) { + default: + break; + case SRV_OPERATION_RESTORE: + case SRV_OPERATION_RESTORE_EXPORT: + if (i->second.name.find("/#sql") != std::string::npos) { + sql_print_warning("InnoDB: Tablespace " UINT32PF + " was not found at %.*s when" + " restoring a (partial?) backup." + " All redo log" + " for this file will be ignored!", + i->first, int(i->second.name.size()), + i->second.name.data()); + } + return(err); + } + + if (srv_force_recovery == 0) { + sql_print_error("InnoDB: Tablespace " UINT32PF " was not" + " found at %.*s.", i->first, + int(i->second.name.size()), + i->second.name.data()); + + if (err == DB_SUCCESS) { + sql_print_information( + "InnoDB: Set innodb_force_recovery=1 to" + " ignore this and to permanently lose" + " all changes to the tablespace."); + err = DB_TABLESPACE_NOT_FOUND; + } + } else { + sql_print_warning("InnoDB: Tablespace " UINT32PF + " was not found at %.*s" + ", and innodb_force_recovery was set." 
+ " All redo log for this tablespace" + " will be ignored!", + i->first, int(i->second.name.size()), + i->second.name.data()); + } + + return(err); +} + +/** Report the missing tablespace and discard the redo logs for the deleted +tablespace. +@param[in] rescan rescan of redo logs is needed + if hash table ran out of memory +@param[out] missing_tablespace missing tablespace exists or not +@return error code or DB_SUCCESS. */ +static MY_ATTRIBUTE((warn_unused_result)) +dberr_t +recv_validate_tablespace(bool rescan, bool& missing_tablespace) +{ + dberr_t err = DB_SUCCESS; + + mysql_mutex_lock(&recv_sys.mutex); + + for (recv_sys_t::map::iterator p = recv_sys.pages.begin(); + p != recv_sys.pages.end();) { + ut_ad(!p->second.log.empty()); + const uint32_t space = p->first.space(); + if (is_predefined_tablespace(space)) { +next: + p++; + continue; + } + + recv_spaces_t::iterator i = recv_spaces.find(space); + ut_ad(i != recv_spaces.end()); + + if (deferred_spaces.find(static_cast(space))) { + /* Skip redo logs belonging to + incomplete tablespaces */ + goto next; + } + + switch (i->second.status) { + case file_name_t::NORMAL: + goto next; + case file_name_t::MISSING: + err = recv_init_missing_space(err, i); + i->second.status = file_name_t::DELETED; + /* fall through */ + case file_name_t::DELETED: + recv_sys_t::map::iterator r = p++; + recv_sys.pages_it_invalidate(r); + recv_sys.erase(r); + continue; + } + ut_ad(0); + } + + if (err != DB_SUCCESS) { +func_exit: + mysql_mutex_unlock(&recv_sys.mutex); + return(err); + } + + /* When rescan is not needed, recv_sys.pages will contain the + entire redo log. If rescan is needed or innodb_force_recovery + is set, we can ignore missing tablespaces. */ + for (const recv_spaces_t::value_type& rs : recv_spaces) { + if (UNIV_LIKELY(rs.second.status != file_name_t::MISSING)) { + continue; + } + + if (deferred_spaces.find(static_cast(rs.first))) { + continue; + } + + if (srv_force_recovery) { + sql_print_warning("InnoDB: Tablespace " UINT32PF + " was not found at %.*s," + " and innodb_force_recovery was set." + " All redo log for this tablespace" + " will be ignored!", + rs.first, int(rs.second.name.size()), + rs.second.name.data()); + continue; + } + + if (!rescan) { + sql_print_information("InnoDB: Tablespace " UINT32PF + " was not found at '%.*s'," + " but there were" + " no modifications either.", + rs.first, + int(rs.second.name.size()), + rs.second.name.data()); + } else { + missing_tablespace = true; + } + } + + goto func_exit; +} + +/** Check if all tablespaces were found for crash recovery. +@param[in] rescan rescan of redo logs is needed +@param[out] missing_tablespace missing table exists +@return error code or DB_SUCCESS */ +static MY_ATTRIBUTE((warn_unused_result)) +dberr_t +recv_init_crash_recovery_spaces(bool rescan, bool& missing_tablespace) +{ + bool flag_deleted = false; + + ut_ad(!srv_read_only_mode); + ut_ad(recv_needed_recovery); + + for (recv_spaces_t::value_type& rs : recv_spaces) { + ut_ad(!is_predefined_tablespace(rs.first)); + ut_ad(rs.second.status != file_name_t::DELETED + || !rs.second.space); + + if (rs.second.status == file_name_t::DELETED) { + /* The tablespace was deleted, + so we can ignore any redo log for it. */ + flag_deleted = true; + } else if (rs.second.space != NULL) { + /* The tablespace was found, and there + are some redo log records for it. 
*/
+			fil_names_dirty(rs.second.space);
+
+			/* Add the freed page ranges in the respective
+			tablespace */
+			if (!rs.second.freed_ranges.empty()
+			    && (srv_immediate_scrub_data_uncompressed
+				|| rs.second.space->is_compressed())) {
+
+				rs.second.space->add_free_ranges(
+					std::move(rs.second.freed_ranges));
+			}
+		} else if (rs.second.name == "") {
+			sql_print_error("InnoDB: Missing FILE_CREATE,"
+					" FILE_DELETE or FILE_MODIFY"
+					" before FILE_CHECKPOINT"
+					" for tablespace " UINT32PF, rs.first);
+			recv_sys.set_corrupt_log();
+			return(DB_CORRUPTION);
+		} else {
+			rs.second.status = file_name_t::MISSING;
+			flag_deleted = true;
+		}
+
+		ut_ad(rs.second.status == file_name_t::DELETED
+		      || rs.second.name != "");
+	}
+
+	if (flag_deleted) {
+		return recv_validate_tablespace(rescan, missing_tablespace);
+	}
+
+	return DB_SUCCESS;
+}
+
+/** Apply any FILE_RENAME records */
+static dberr_t recv_rename_files()
+{
+  mysql_mutex_assert_owner(&recv_sys.mutex);
+#ifndef SUX_LOCK_GENERIC
+  ut_ad(log_sys.latch.is_write_locked());
+#endif
+
+  dberr_t err= DB_SUCCESS;
+
+  for (auto i= renamed_spaces.begin(); i != renamed_spaces.end(); )
+  {
+    const auto &r= *i;
+    const uint32_t id= r.first;
+    fil_space_t *space= fil_space_t::get(id);
+    if (!space)
+    {
+      i++;
+      continue;
+    }
+    ut_ad(UT_LIST_GET_LEN(space->chain) == 1);
+    char *old= space->chain.start->name;
+    if (r.second != old)
+    {
+      bool exists;
+      os_file_type_t ftype;
+      const char *new_name= r.second.c_str();
+      mysql_mutex_lock(&fil_system.mutex);
+      const fil_space_t *other= nullptr;
+      if (!space->chain.start->is_open() && space->chain.start->deferred &&
+          (other= fil_system.find(new_name)) &&
+          (other->chain.start->is_open() || !other->chain.start->deferred))
+        other= nullptr;
+
+      if (other)
+      {
+        /* Multiple tablespaces use the same file name. This should
+        only be possible if the recovery of both files was deferred
+        (no valid page 0 is contained in either file). We shall not
+        rename the file, just rename the metadata. */
+        sql_print_information("InnoDB: Renaming tablespace metadata " UINT32PF
+                              " from '%s' to '%s' that is also associated"
+                              " with tablespace " UINT32PF,
+                              id, old, new_name, other->id);
+        space->chain.start->name= mem_strdup(new_name);
+        ut_free(old);
+      }
+      else if (!os_file_status(new_name, &exists, &ftype) || exists)
+      {
+        sql_print_error("InnoDB: Cannot replay rename of tablespace " UINT32PF
+                        " from '%s' to '%s'%s",
+                        id, old, new_name, exists ?
+                        " because the target file exists" : "");
+        err= DB_TABLESPACE_EXISTS;
+      }
+      else
+      {
+        mysql_mutex_unlock(&fil_system.mutex);
+        err= space->rename(new_name, false);
+        if (err != DB_SUCCESS)
+          sql_print_error("InnoDB: Cannot replay rename of tablespace "
+                          UINT32PF " to '%s': %s",
+                          id, new_name, ut_strerr(err));
+        goto done;
+      }
+      mysql_mutex_unlock(&fil_system.mutex);
+    }
+done:
+    space->release();
+    if (err != DB_SUCCESS)
+    {
+      recv_sys.set_corrupt_fs();
+      break;
+    }
+    renamed_spaces.erase(i++);
+  }
+  return err;
+}
+
+/** Start recovering from a redo log checkpoint.
+of first system tablespace page +@return error code or DB_SUCCESS */ +dberr_t recv_recovery_from_checkpoint_start() +{ + bool rescan = false; + + ut_ad(srv_operation <= SRV_OPERATION_EXPORT_RESTORED + || srv_operation == SRV_OPERATION_RESTORE + || srv_operation == SRV_OPERATION_RESTORE_EXPORT); + ut_d(mysql_mutex_lock(&buf_pool.flush_list_mutex)); + ut_ad(UT_LIST_GET_LEN(buf_pool.LRU) == 0); + ut_ad(UT_LIST_GET_LEN(buf_pool.unzip_LRU) == 0); + ut_d(mysql_mutex_unlock(&buf_pool.flush_list_mutex)); + + if (srv_force_recovery >= SRV_FORCE_NO_LOG_REDO) { + sql_print_information("InnoDB: innodb_force_recovery=6" + " skips redo log apply"); + return(DB_SUCCESS); + } + + recv_sys.recovery_on = true; + + log_sys.latch.wr_lock(SRW_LOCK_CALL); + + dberr_t err = recv_sys.find_checkpoint(); + if (err != DB_SUCCESS) { +early_exit: + log_sys.latch.wr_unlock(); + return err; + } + + log_sys.set_capacity(); + + /* Start reading the log from the checkpoint lsn. The variable + contiguous_lsn contains an lsn up to which the log is known to + be contiguously written. */ + + ut_ad(recv_sys.pages.empty()); + + if (log_sys.format == log_t::FORMAT_3_23) { + goto early_exit; + } + + if (log_sys.is_latest()) { + const bool rewind = recv_sys.lsn + != log_sys.next_checkpoint_lsn; + log_sys.last_checkpoint_lsn = log_sys.next_checkpoint_lsn; + + recv_scan_log(false); + if (recv_needed_recovery) { +read_only_recovery: + sql_print_warning("InnoDB: innodb_read_only" + " prevents crash recovery"); + err = DB_READ_ONLY; + goto early_exit; + } + if (recv_sys.is_corrupt_log()) { + sql_print_error("InnoDB: Log scan aborted at LSN " + LSN_PF, recv_sys.lsn); + goto err_exit; + } + ut_ad(recv_sys.file_checkpoint); + if (rewind) { + recv_sys.lsn = log_sys.next_checkpoint_lsn; + recv_sys.offset = 0; + recv_sys.len = 0; + } + ut_ad(!recv_max_page_lsn); + rescan = recv_scan_log(false); + + if (srv_read_only_mode && recv_needed_recovery) { + goto read_only_recovery; + } + + if ((recv_sys.is_corrupt_log() && !srv_force_recovery) + || recv_sys.is_corrupt_fs()) { + goto err_exit; + } + } + + log_sys.set_recovered_lsn(recv_sys.lsn); + + if (recv_needed_recovery) { + bool missing_tablespace = false; + + err = recv_init_crash_recovery_spaces( + rescan, missing_tablespace); + + if (err != DB_SUCCESS) { + goto early_exit; + } + + if (missing_tablespace) { + ut_ad(rescan); + /* If any tablespaces seem to be missing, + validate the remaining log records. */ + + do { + rescan = recv_scan_log(false); + ut_ad(!recv_sys.is_corrupt_fs()); + + if (recv_sys.is_corrupt_log()) { + goto err_exit; + } + + missing_tablespace = false; + + err = recv_validate_tablespace( + rescan, missing_tablespace); + + if (err != DB_SUCCESS) { + goto early_exit; + } + } while (missing_tablespace); + + rescan = true; + /* Because in the loop above we overwrote the + initially stored recv_sys.pages, we must + restart parsing the log from the very beginning. */ + + /* FIXME: Use a separate loop for checking for + tablespaces (not individual pages), while retaining + the initial recv_sys.pages. 
*/ + mysql_mutex_lock(&recv_sys.mutex); + recv_sys.clear(); + recv_sys.lsn = log_sys.next_checkpoint_lsn; + mysql_mutex_unlock(&recv_sys.mutex); + } + + if (srv_operation <= SRV_OPERATION_EXPORT_RESTORED) { + deferred_spaces.deferred_dblwr(); + buf_dblwr.recover(); + } + + ut_ad(srv_force_recovery <= SRV_FORCE_NO_UNDO_LOG_SCAN); + + if (rescan) { + recv_scan_log(true); + if ((recv_sys.is_corrupt_log() + && !srv_force_recovery) + || recv_sys.is_corrupt_fs()) { + goto err_exit; + } + + /* In case of multi-batch recovery, + redo log for the last batch is not + applied yet. */ + ut_d(recv_sys.after_apply = false); + } + } else { + ut_ad(recv_sys.pages.empty()); + } + + if (log_sys.is_latest() + && (recv_sys.lsn < log_sys.next_checkpoint_lsn + || recv_sys.lsn < recv_max_page_lsn)) { + + sql_print_error("InnoDB: We scanned the log up to " LSN_PF "." + " A checkpoint was at " LSN_PF + " and the maximum LSN on a database page was " + LSN_PF ". It is possible that the" + " database is now corrupt!", + recv_sys.lsn, + log_sys.next_checkpoint_lsn, + recv_max_page_lsn); + } + + if (recv_sys.lsn < log_sys.next_checkpoint_lsn) { +err_exit: + err = DB_ERROR; + goto early_exit; + } + + if (!srv_read_only_mode && log_sys.is_latest()) { + ut_ad(log_sys.get_flushed_lsn() == log_sys.get_lsn()); + ut_ad(recv_sys.lsn == log_sys.get_lsn()); + if (!log_sys.is_pmem()) { + const size_t bs_1{log_sys.get_block_size() - 1}; + const size_t ro{recv_sys.offset}; + recv_sys.offset &= bs_1; + memmove_aligned<64>(log_sys.buf, + log_sys.buf + (ro & ~bs_1), + log_sys.get_block_size()); +#ifdef HAVE_PMEM + } else { + mprotect(log_sys.buf, size_t(log_sys.file_size), + PROT_READ | PROT_WRITE); +#endif + } + log_sys.buf_free = recv_sys.offset; + if (recv_needed_recovery + && srv_operation <= SRV_OPERATION_EXPORT_RESTORED) { + /* Write a FILE_CHECKPOINT marker as the first thing, + before generating any other redo log. This ensures + that subsequent crash recovery will be possible even + if the server were killed soon after this. */ + fil_names_clear(log_sys.next_checkpoint_lsn); + } + } + + mysql_mutex_lock(&recv_sys.mutex); + if (UNIV_UNLIKELY(recv_sys.scanned_lsn != recv_sys.lsn) + && log_sys.is_latest()) { + ut_ad("log parsing error" == 0); + mysql_mutex_unlock(&recv_sys.mutex); + err = DB_CORRUPTION; + goto early_exit; + } + recv_sys.apply_log_recs = true; + recv_no_ibuf_operations = false; + ut_d(recv_no_log_write = srv_operation == SRV_OPERATION_RESTORE + || srv_operation == SRV_OPERATION_RESTORE_EXPORT); + if (srv_operation == SRV_OPERATION_NORMAL) { + err = recv_rename_files(); + } + mysql_mutex_unlock(&recv_sys.mutex); + + recv_lsn_checks_on = true; + + /* The database is now ready to start almost normal processing of user + transactions: transaction rollbacks and the application of the log + records in the hash table can be run in background. 
*/ + if (err == DB_SUCCESS && deferred_spaces.reinit_all() + && !srv_force_recovery) { + err = DB_CORRUPTION; + } + + log_sys.latch.wr_unlock(); + return err; +} + +bool recv_dblwr_t::validate_page(const page_id_t page_id, + const byte *page, + const fil_space_t *space, + byte *tmp_buf) +{ + if (page_id.page_no() == 0) + { + uint32_t flags= fsp_header_get_flags(page); + if (!fil_space_t::is_valid_flags(flags, page_id.space())) + { + uint32_t cflags= fsp_flags_convert_from_101(flags); + if (cflags == UINT32_MAX) + { + ib::warn() << "Ignoring a doublewrite copy of page " << page_id + << "due to invalid flags " << ib::hex(flags); + return false; + } + + flags= cflags; + } + + /* Page 0 is never page_compressed or encrypted. */ + return !buf_page_is_corrupted(true, page, flags); + } + + ut_ad(tmp_buf); + byte *tmp_frame= tmp_buf; + byte *tmp_page= tmp_buf + srv_page_size; + const uint16_t page_type= mach_read_from_2(page + FIL_PAGE_TYPE); + const bool expect_encrypted= space->crypt_data && + space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED; + + if (space->full_crc32()) + return !buf_page_is_corrupted(true, page, space->flags); + + if (expect_encrypted && + mach_read_from_4(page + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION)) + { + if (!fil_space_verify_crypt_checksum(page, space->zip_size())) + return false; + if (page_type != FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) + return true; + if (space->zip_size()) + return false; + memcpy(tmp_page, page, space->physical_size()); + if (!fil_space_decrypt(space, tmp_frame, tmp_page)) + return false; + } + + switch (page_type) { + case FIL_PAGE_PAGE_COMPRESSED: + memcpy(tmp_page, page, space->physical_size()); + /* fall through */ + case FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED: + if (space->zip_size()) + return false; /* ROW_FORMAT=COMPRESSED cannot be page_compressed */ + ulint decomp= fil_page_decompress(tmp_frame, tmp_page, space->flags); + if (!decomp) + return false; /* decompression failed */ + if (decomp == srv_page_size) + return false; /* the page was not compressed (invalid page type) */ + return !buf_page_is_corrupted(true, tmp_page, space->flags); + } + + return !buf_page_is_corrupted(true, page, space->flags); +} + +byte *recv_dblwr_t::find_page(const page_id_t page_id, + const fil_space_t *space, byte *tmp_buf) +{ + byte *result= NULL; + lsn_t max_lsn= 0; + + for (byte *page : pages) + { + if (page_get_page_no(page) != page_id.page_no() || + page_get_space_id(page) != page_id.space()) + continue; + if (page_id.page_no() == 0) + { + uint32_t flags= mach_read_from_4( + FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + page); + if (!fil_space_t::is_valid_flags(flags, page_id.space())) + continue; + } + + const lsn_t lsn= mach_read_from_8(page + FIL_PAGE_LSN); + if (lsn <= max_lsn || + !validate_page(page_id, page, space, tmp_buf)) + { + /* Mark processed for subsequent iterations in buf_dblwr_t::recover() */ + memset(page + FIL_PAGE_LSN, 0, 8); + continue; + } + + ut_a(page_get_page_no(page) == page_id.page_no()); + max_lsn= lsn; + result= page; + } + + return result; +} + +bool recv_dblwr_t::restore_first_page(uint32_t space_id, const char *name, + os_file_t file) +{ + const page_id_t page_id(space_id, 0); + const byte* page= find_page(page_id); + if (!page) + { + /* If the first page of the given user tablespace is not there + in the doublewrite buffer, then the recovery is going to fail + now. Hence this is treated as error. 
*/
+    ib::error()
+      << "Corrupted page " << page_id << " of datafile '"
+      << name << "' could not be found in the doublewrite buffer.";
+    return true;
+  }
+
+  ulint physical_size= fil_space_t::physical_size(
+    mach_read_from_4(page + FSP_HEADER_OFFSET + FSP_SPACE_FLAGS));
+  ib::info() << "Restoring page " << page_id << " of datafile '"
+             << name << "' from the doublewrite buffer. Writing "
+             << physical_size << " bytes into file '" << name << "'";
+
+  return os_file_write(
+    IORequestWrite, name, file, page, 0, physical_size) !=
+    DB_SUCCESS;
+}
diff --git a/storage/innobase/log/log0sync.cc b/storage/innobase/log/log0sync.cc
new file mode 100644
index 00000000..6b14d1d3
--- /dev/null
+++ b/storage/innobase/log/log0sync.cc
@@ -0,0 +1,404 @@
+/*****************************************************************************
+Copyright (c) 2020 MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/*
+The group commit synchronization used in log_write_up_to()
+works as follows.
+
+For simplicity, let's consider only the write operation; synchronization
+of the flush operation works the same way.
+
+Rules of the game
+
+A thread enters log_write_up_to() with the lsn of the current transaction.
+1. If the last written lsn is greater than the wait lsn (another thread
+   already wrote the log buffer), then there is no need to do anything.
+2. If no other thread is currently writing, write the log buffer,
+   and update the last written lsn.
+3. Otherwise, wait, and go to step 1.
+
+Synchronization can be done in different ways, e.g.
+
+a) A simple mutex locking the entire check and write operation.
+Its disadvantage is that threads that could continue after the last
+written lsn is updated still wait.
+
+b) A spinlock, with periodic checks for the last written lsn.
+Fixes a), but burns CPU unnecessarily.
+
+c) A mutex / condition variable combo.
+
+The condition variable notifies (broadcasts to) all waiters whenever
+the last written lsn is changed.
+
+This has the disadvantage of many spurious wakeups, stress on the OS
+scheduler, and mutex contention.
+
+d) Something else.
+Make use of the waiter's lsn parameter, and only wake up the "right"
+waiting threads.
+
+We chose d). Even if the implementation is more complicated than the
+alternatives due to the need to maintain a list of waiters, it provides
+the best performance.
+
+See the group_commit_lock implementation for details.
+
+Note that if the write operation is very fast, a) or b) can be fine as
+alternatives.
+*/
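+
+/*
+For illustration only: a minimal sketch of alternative c) under the rules
+above. The names mtx, cv, last_written_lsn, writer_active and write_log()
+are hypothetical, not part of the actual implementation:
+
+  std::mutex mtx;
+  std::condition_variable cv;
+  lsn_t last_written_lsn= 0;
+  bool writer_active= false;
+
+  lsn_t write_log();                // writes the log buffer, returns end lsn
+
+  void write_up_to(lsn_t lsn)
+  {
+    std::unique_lock<std::mutex> lk(mtx);
+    while (last_written_lsn < lsn)
+    {
+      if (!writer_active)
+      {
+        writer_active= true;
+        lk.unlock();
+        lsn_t written= write_log();  // write on behalf of all waiters
+        lk.lock();
+        writer_active= false;
+        last_written_lsn= written;
+        cv.notify_all();             // wakes every waiter, including those
+      }                              // whose lsn is not yet written
+      else
+        cv.wait(lk);
+    }
+  }
+
+The broadcast wakes every waiter on every completed write; the waiter-list
+design below wakes only threads whose wait lsn has been reached.
+*/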
+#ifdef _WIN32
+#include <windows.h>
+#endif
+
+#ifdef __linux__
+#include <linux/futex.h>
+#include <sys/syscall.h>
+#endif
+
+#include <atomic>
+#include <thread>
+#include <mutex>
+#include <condition_variable>
+#include
+
+#include
+#include "log0sync.h"
+#include
+#include
+/**
+  Helper class, used in the group commit lock.
+
+  A binary semaphore, or (the same thing) an auto-reset event.
+  It has a state (signalled or not), and provides 2 operations:
+  wait() and wake().
+
+  The implementation uses efficient locking primitives on Linux and Windows,
+  or a mutex/condition variable combo elsewhere.
+*/
+class binary_semaphore
+{
+public:
+  /** Wait until the semaphore becomes signalled, and atomically reset the
+  state to non-signalled */
+  void wait();
+  /** Signal the semaphore */
+  void wake();
+
+private:
+#if defined(__linux__) || defined (_WIN32)
+  std::atomic<int> m_signalled;
+  static constexpr std::memory_order mem_order= std::memory_order_acq_rel;
+public:
+  binary_semaphore() :m_signalled(0) {}
+#else
+  std::mutex m_mtx{};
+  std::condition_variable m_cv{};
+  bool m_signalled = false;
+#endif
+};
+
+#if defined (__linux__) || defined (_WIN32)
+void binary_semaphore::wait()
+{
+  for (;;)
+  {
+    if (m_signalled.exchange(0, mem_order) == 1)
+    {
+      break;
+    }
+#ifdef _WIN32
+    int zero = 0;
+    WaitOnAddress(&m_signalled, &zero, sizeof(m_signalled), INFINITE);
+#else
+    syscall(SYS_futex, &m_signalled, FUTEX_WAIT_PRIVATE, 0, NULL, NULL, 0);
+#endif
+  }
+}
+
+void binary_semaphore::wake()
+{
+  if (m_signalled.exchange(1, mem_order) == 0)
+  {
+#ifdef _WIN32
+    WakeByAddressSingle(&m_signalled);
+#else
+    syscall(SYS_futex, &m_signalled, FUTEX_WAKE_PRIVATE, 1, NULL, NULL, 0);
+#endif
+  }
+}
+#else
+void binary_semaphore::wait()
+{
+  std::unique_lock<std::mutex> lk(m_mtx);
+  while (!m_signalled)
+    m_cv.wait(lk);
+  m_signalled = false;
+}
+void binary_semaphore::wake()
+{
+  std::unique_lock<std::mutex> lk(m_mtx);
+  m_signalled = true;
+  m_cv.notify_one();
+}
+#endif
+
+/* A thread helper structure, used in the group commit lock below */
+struct group_commit_waiter_t
+{
+  lsn_t m_value=0;
+  binary_semaphore m_sema{};
+  group_commit_waiter_t* m_next= nullptr;
+  bool m_group_commit_leader=false;
+};
+
+group_commit_lock::group_commit_lock() :
+  m_mtx(), m_value(0), m_pending_value(0), m_lock(false), m_waiters_list()
+{
+}
+
+group_commit_lock::value_type group_commit_lock::value() const
+{
+  return m_value.load(std::memory_order::memory_order_relaxed);
+}
+
+group_commit_lock::value_type group_commit_lock::pending() const
+{
+  return m_pending_value.load(std::memory_order::memory_order_relaxed);
+}
+
+void group_commit_lock::set_pending(group_commit_lock::value_type num)
+{
+  ut_a(num >= value());
+  m_pending_value.store(num, std::memory_order::memory_order_relaxed);
+}
+
+const unsigned int MAX_SPINS = 1; /** max spins in acquire */
+thread_local group_commit_waiter_t thread_local_waiter;
+
+static inline void do_completion_callback(const completion_callback* cb)
+{
+  if (cb)
+    cb->m_callback(cb->m_param);
+}
+
+group_commit_lock::lock_return_code group_commit_lock::acquire(value_type num, const completion_callback *callback)
+{
+  unsigned int spins = MAX_SPINS;
+
+  for(;;)
+  {
+    if (num <= value())
+    {
+      /* No need to wait.*/
+      do_completion_callback(callback);
+      return lock_return_code::EXPIRED;
+    }
+
+    if(spins-- == 0)
+      break;
+    if (num > pending())
+    {
+      /* A longer wait is expected (longer than the currently running
+      operation), don't spin.*/
+      break;
+    }
+    ut_delay(1);
+  }
+
+  thread_local_waiter.m_value = num;
+  thread_local_waiter.m_group_commit_leader= false;
+  std::unique_lock<std::mutex> lk(m_mtx, std::defer_lock);
+  while (num > value() || thread_local_waiter.m_group_commit_leader)
+  {
+    lk.lock();
+
+    /* Re-read the current value after acquiring the lock */
+    if (num <= value() &&
+        (!thread_local_waiter.m_group_commit_leader || m_lock))
+    {
+      lk.unlock();
+      do_completion_callback(callback);
+      return lock_return_code::EXPIRED;
+    }
+
+    if (!m_lock)
+    {
+      /* Take the lock, become the group commit leader.*/
+      m_lock = true;
+#ifndef DBUG_OFF
+      m_owner_id = std::this_thread::get_id();
+#endif
+      if (callback)
+        m_pending_callbacks.push_back({num,*callback});
+      return lock_return_code::ACQUIRED;
+    }
+
+    if (callback && (m_waiters_list || num <= pending()))
+    {
+      /*
+      If num > pending(), we have a good candidate for the next group
+      commit lead, that will be taking over the lock after the current
+      owner releases it. We put the current thread into the waiters list
+      so that it sleeps and can be signaled and marked as the group
+      commit lead during lock release.
+
+      For this to work well, pending() must deliver a good approximation
+      for N in the next call to group_commit_lock::release(N).
+      */
+      m_pending_callbacks.push_back({num, *callback});
+      return lock_return_code::CALLBACK_QUEUED;
+    }
+
+    /* Add yourself to the waiters list.*/
+    thread_local_waiter.m_group_commit_leader= false;
+    thread_local_waiter.m_next = m_waiters_list;
+    m_waiters_list = &thread_local_waiter;
+    lk.unlock();
+
+    /* Sleep until woken in release().*/
+    thd_wait_begin(0,THD_WAIT_GROUP_COMMIT);
+    thread_local_waiter.m_sema.wait();
+    thd_wait_end(0);
+
+  }
+  do_completion_callback(callback);
+  return lock_return_code::EXPIRED;
+}
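+
+/*
+For illustration only: the asynchronous path through acquire(). The names
+lock, trx and wake_trx are hypothetical, not the actual callers:
+
+  static void wake_trx(void *param);   // continuation to run once durable
+
+  completion_callback cb{wake_trx, trx};
+  switch (lock.acquire(lsn, &cb))
+  {
+  case group_commit_lock::ACQUIRED:
+    // this thread became the group commit leader; it performs the write
+    // and then calls lock.release(written_lsn), which also runs the
+    // queued callbacks
+    break;
+  case group_commit_lock::EXPIRED:
+    // lsn was already covered; cb has been executed already
+    break;
+  case group_commit_lock::CALLBACK_QUEUED:
+    // another thread owns the write; cb will run during its release()
+    break;
+  }
+*/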
+group_commit_lock::value_type group_commit_lock::release(value_type num)
+{
+  completion_callback callbacks[1000];
+  size_t callback_count = 0;
+  value_type ret = 0;
+  std::unique_lock<std::mutex> lk(m_mtx);
+  m_lock = false;
+
+  /* Update the current value. */
+  ut_a(num >= value());
+  m_value.store(num, std::memory_order_relaxed);
+
+  /*
+  Wake the waiters for value <= current value.
+  Wake one more waiter, who will become the group commit lead.
+  */
+  group_commit_waiter_t* cur, * prev, * next;
+  group_commit_waiter_t* wakeup_list = nullptr;
+  for (auto& c : m_pending_callbacks)
+  {
+    if (c.first <= num)
+    {
+      if (callback_count < array_elements(callbacks))
+        callbacks[callback_count++] = c.second;
+      else
+        c.second.m_callback(c.second.m_param);
+    }
+  }
+
+  for (prev= nullptr, cur= m_waiters_list; cur; cur= next)
+  {
+    next= cur->m_next;
+    if (cur->m_value <= num)
+    {
+      /* Move the current waiter to wakeup_list */
+
+      if (!prev)
+      {
+        /* Remove from the start of the list.*/
+        m_waiters_list = next;
+      }
+      else
+      {
+        /* Remove from the middle of the list.*/
+        prev->m_next= cur->m_next;
+      }
+
+      /* Append the entry to the wakeup list.*/
+      cur->m_next = wakeup_list;
+      wakeup_list = cur;
+    }
+    else
+    {
+      prev= cur;
+    }
+  }
+
+  auto it= std::remove_if(
+    m_pending_callbacks.begin(), m_pending_callbacks.end(),
+    [num](const pending_cb &c) { return c.first <= num; });
+
+  m_pending_callbacks.erase(it, m_pending_callbacks.end());
+
+  if (m_pending_callbacks.size() || m_waiters_list)
+  {
+    /*
+    Ensure that after this thread has released the lock,
+    there is a new group commit leader.
+    We take this leader from the waiters list or the wakeup list. It
+    might look like a spurious wake, but in fact we just ensure that
+    the waiters do not wait for eternity.
+    */
+    if (m_waiters_list)
+    {
+      /* Move one waiter to the wakeup list */
+      auto e= m_waiters_list;
+      m_waiters_list= m_waiters_list->m_next;
+      e->m_next= wakeup_list;
+      e->m_group_commit_leader= true;
+      wakeup_list = e;
+    }
+    else if (wakeup_list)
+    {
+      wakeup_list->m_group_commit_leader=true;
+    }
+    else
+    {
+      /* Tell the caller that some pending callbacks are left, and that
+      it should do something to prevent stalls. This should be a rare
+      situation.*/
+      ret= m_pending_callbacks[0].first;
+    }
+  }
+
+  lk.unlock();
+
+  /*
+  Release the designated next group commit lead first,
+  to minimize spurious wakeups.
+  */
+  if (wakeup_list && wakeup_list->m_group_commit_leader)
+  {
+    next = wakeup_list->m_next;
+    wakeup_list->m_sema.wake();
+    wakeup_list= next;
+  }
+
+  for (size_t i = 0; i < callback_count; i++)
+    callbacks[i].m_callback(callbacks[i].m_param);
+
+  for (cur= wakeup_list; cur; cur= next)
+  {
+    next= cur->m_next;
+    cur->m_sema.wake();
+  }
+  return ret;
+}
+
+#ifndef DBUG_OFF
+bool group_commit_lock::is_owner()
+{
+  return m_lock && std::this_thread::get_id() == m_owner_id;
+}
+#endif
+
diff --git a/storage/innobase/log/log0sync.h b/storage/innobase/log/log0sync.h
new file mode 100644
index 00000000..00686d39
--- /dev/null
+++ b/storage/innobase/log/log0sync.h
@@ -0,0 +1,99 @@
+/*****************************************************************************
+Copyright (c) 2020 MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+#include
+#include
+#include
+#include
+
+struct group_commit_waiter_t;
+struct completion_callback
+{
+  void (*m_callback)(void*);
+  void* m_param;
+};
+
+/**
+Special synchronization primitive, which is helpful for
+performing group commit.
+
+It has a state consisting of
+ - locked (bool)
+ - current value (number). This value is always increasing.
+ - pending value (number). The current value can soon become this number.
+   This is only used for optimization; it does not have to be exact.
+
+Operations supported on this semaphore:
+
+1. acquire(num, callback):
+- if running synchronously (callback is nullptr), waits until the
+  current value exceeds num, or until the lock is granted
+- returns EXPIRED if current_value >= num,
+  or ACQUIRED, if current_value < num and the lock is granted,
+  or CALLBACK_QUEUED, if callback was not nullptr, and the function
+  would otherwise have to wait
+
+2. release(num)
+- releases the lock
+- sets the new current value to max(num, current_value)
+- releases some threads waiting in acquire()
+- executes some callbacks
+- might return some lsn, meaning there are some pending
+  callbacks left, and there is no new group commit lead
+  (i.e. the caller must do something to flush those pending callbacks)
+
+3. value()
+- read the current value
+
+4. pending()
+- read the pending value
+
+5. set_pending(num)
+- set the pending value
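+
+A typical group-commit write sequence, as a minimal sketch (write_lock,
+write_log() and log_buffer_end_lsn() are illustrative names only, not
+the actual callers):
+
+  group_commit_lock write_lock;
+
+  void write_up_to(lsn_t lsn)
+  {
+    if (write_lock.acquire(lsn, nullptr)
+        == group_commit_lock::ACQUIRED)
+    {
+      write_lock.set_pending(log_buffer_end_lsn());
+      lsn_t written= write_log();   // write on behalf of all waiters
+      write_lock.release(written);  // wakes waiters with value <= written
+    }
+    // EXPIRED means another thread already wrote past lsn
+  }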
set_pending_value() +*/ +class group_commit_lock +{ + using value_type = lsn_t; +#ifndef DBUG_OFF + std::thread::id m_owner_id{}; +#endif + std::mutex m_mtx; + std::atomic m_value; + std::atomic m_pending_value; + bool m_lock; + group_commit_waiter_t* m_waiters_list; + + typedef std::pair pending_cb; + std::vector m_pending_callbacks; + +public: + group_commit_lock(); + enum lock_return_code + { + ACQUIRED, + EXPIRED, + CALLBACK_QUEUED + }; + lock_return_code acquire(value_type num, const completion_callback *cb); + value_type release(value_type num); + value_type value() const; + value_type pending() const; + void set_pending(value_type num); +#ifndef DBUG_OFF + bool is_owner(); +#endif +}; diff --git a/storage/innobase/mem/mem0mem.cc b/storage/innobase/mem/mem0mem.cc new file mode 100644 index 00000000..5e8587bf --- /dev/null +++ b/storage/innobase/mem/mem0mem.cc @@ -0,0 +1,436 @@ +/***************************************************************************** + +Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2017, 2021, MariaDB Corporation. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file mem/mem0mem.cc +The memory management + +Created 6/9/1994 Heikki Tuuri +*************************************************************************/ + +#include "mem0mem.h" +#include "buf0buf.h" +#include "srv0srv.h" +#include + +/**********************************************************************//** +Concatenate two strings and return the result, using a memory heap. +@return own: the result */ +char* +mem_heap_strcat( +/*============*/ + mem_heap_t* heap, /*!< in: memory heap where string is allocated */ + const char* s1, /*!< in: string 1 */ + const char* s2) /*!< in: string 2 */ +{ + char* s; + ulint s1_len = strlen(s1); + ulint s2_len = strlen(s2); + + s = static_cast(mem_heap_alloc(heap, s1_len + s2_len + 1)); + + memcpy(s, s1, s1_len); + memcpy(s + s1_len, s2, s2_len); + + s[s1_len + s2_len] = '\0'; + + return(s); +} + + +/****************************************************************//** +Helper function for mem_heap_printf. +@return length of formatted string, including terminating NUL */ +static +ulint +mem_heap_printf_low( +/*================*/ + char* buf, /*!< in/out: buffer to store formatted string + in, or NULL to just calculate length */ + const char* format, /*!< in: format string */ + va_list ap) /*!< in: arguments */ +{ + ulint len = 0; + + while (*format) { + + /* Does this format specifier have the 'l' length modifier. */ + ibool is_long = FALSE; + + /* Length of one parameter. */ + size_t plen; + + if (*format++ != '%') { + /* Non-format character. 
+
+			len++;
+
+			if (buf) {
+				*buf++ = *(format - 1);
+			}
+
+			continue;
+		}
+
+		if (*format == 'l') {
+			is_long = TRUE;
+			format++;
+		}
+
+		switch (*format++) {
+		case 's':
+			/* string */
+			{
+				char*	s = va_arg(ap, char*);
+
+				/* "%ls" is a non-sensical format specifier. */
+				ut_a(!is_long);
+
+				plen = strlen(s);
+				len += plen;
+
+				if (buf) {
+					memcpy(buf, s, plen);
+					buf += plen;
+				}
+			}
+
+			break;
+
+		case 'u':
+			/* unsigned int */
+			{
+				char		tmp[32];
+				unsigned long	val;
+
+				/* We only support 'long' values for now. */
+				ut_a(is_long);
+
+				val = va_arg(ap, unsigned long);
+
+				plen = size_t(sprintf(tmp, "%lu", val));
+				len += plen;
+
+				if (buf) {
+					memcpy(buf, tmp, plen);
+					buf += plen;
+				}
+			}
+
+			break;
+
+		case '%':
+
+			/* "%l%" is a non-sensical format specifier. */
+			ut_a(!is_long);
+
+			len++;
+
+			if (buf) {
+				*buf++ = '%';
+			}
+
+			break;
+
+		default:
+			ut_error;
+		}
+	}
+
+	/* For the NUL character. */
+	len++;
+
+	if (buf) {
+		*buf = '\0';
+	}
+
+	return(len);
+}
+
+/****************************************************************//**
+A simple sprintf replacement that dynamically allocates the space for the
+formatted string from the given heap. This supports a very limited set of
+the printf syntax: types 's' and 'u' and length modifier 'l' (which is
+required for the 'u' type).
+@return heap-allocated formatted string */
+char*
+mem_heap_printf(
+/*============*/
+	mem_heap_t*	heap,	/*!< in: memory heap */
+	const char*	format,	/*!< in: format string */
+	...)
+{
+	va_list		ap;
+	char*		str;
+	ulint		len;
+
+	/* Calculate the length of the string */
+	len = 0;
+	va_start(ap, format);
+	len = mem_heap_printf_low(NULL, format, ap);
+	va_end(ap);
+
+	/* Now create it for real. */
+	str = static_cast<char*>(mem_heap_alloc(heap, len));
+	va_start(ap, format);
+	mem_heap_printf_low(str, format, ap);
+	va_end(ap);
+
+	return(str);
+}
+
+#ifdef UNIV_DEBUG
+/** Validates the contents of a memory heap.
+Checks a memory heap for consistency, prints the contents if any error
+is detected. A fatal error is logged if an error is detected.
+@param[in]	heap	Memory heap to validate. */
+void
+mem_heap_validate(
+	const mem_heap_t*	heap)
+{
+	ulint	size = 0;
+
+	for (const mem_block_t* block = heap;
+	     block != NULL;
+	     block = UT_LIST_GET_NEXT(list, block)) {
+
+		switch (block->type) {
+		case MEM_HEAP_DYNAMIC:
+			break;
+		case MEM_HEAP_BUFFER:
+		case MEM_HEAP_BUFFER | MEM_HEAP_BTR_SEARCH:
+			ut_ad(block->len <= srv_page_size);
+			break;
+		default:
+			ut_error;
+		}
+
+		size += block->len;
+	}
+
+	ut_ad(size == heap->total_size);
+}
+
+/** Copy the tail of a string (at most size - 1 bytes of src, plus the
+NUL terminator).
+@param[in,out]	dst	destination buffer
+@param[in]	src	string whose tail to copy
+@param[in]	size	size of dst buffer, in bytes, including NUL terminator */
+static void ut_strlcpy_rev(char* dst, const char* src, ulint size)
+{
+	size_t	src_size = strlen(src), n = std::min(src_size, size - 1);
+	memcpy(dst, src + src_size - n, n + 1);
+}
+#endif /* UNIV_DEBUG */
+
+/***************************************************************//**
+Creates a memory heap block where data can be allocated.
+@return own: memory heap block, NULL if did not succeed (only possible
+for MEM_HEAP_BTR_SEARCH type heaps) */
+mem_block_t*
+mem_heap_create_block_func(
+/*=======================*/
+	mem_heap_t*	heap,	/*!< in: memory heap or NULL if first block
+				should be created */
+	ulint		n,	/*!< in: number of bytes needed for user data */
+#ifdef UNIV_DEBUG
+	const char*	file_name,/*!< in: file name where created */
+	unsigned	line,	/*!< in: line where created */
+#endif /* UNIV_DEBUG */
+	ulint		type)	/*!< in: type of heap: MEM_HEAP_DYNAMIC or
+				MEM_HEAP_BUFFER */
{
+	buf_block_t*	buf_block = NULL;
+	mem_block_t*	block;
+	ulint		len;
+
+	ut_ad((type == MEM_HEAP_DYNAMIC) || (type == MEM_HEAP_BUFFER)
+	      || (type == MEM_HEAP_BUFFER + MEM_HEAP_BTR_SEARCH));
+
+	if (heap != NULL) {
+		ut_d(mem_heap_validate(heap));
+	}
+
+	/* In dynamic allocation, calculate the size: block header + data. */
+	len = MEM_BLOCK_HEADER_SIZE + MEM_SPACE_NEEDED(n);
+
+	if (type == MEM_HEAP_DYNAMIC || len < srv_page_size / 2) {
+
+		ut_ad(type == MEM_HEAP_DYNAMIC || n <= MEM_MAX_ALLOC_IN_BUF);
+
+		block = static_cast<mem_block_t*>(ut_malloc_nokey(len));
+	} else {
+		len = srv_page_size;
+
+		if ((type & MEM_HEAP_BTR_SEARCH) && heap) {
+			/* We cannot allocate the block from the
+			buffer pool, but must get the free block from
+			the heap header free block field */
+
+			buf_block = static_cast<buf_block_t*>(heap->free_block);
+			heap->free_block = NULL;
+
+			if (UNIV_UNLIKELY(!buf_block)) {
+
+				return(NULL);
+			}
+		} else {
+			buf_block = buf_block_alloc();
+		}
+
+		block = (mem_block_t*) buf_block->page.frame;
+	}
+
+	if (block == NULL) {
+		ib::fatal() << "Unable to allocate memory of size "
+			<< len << ".";
+	}
+
+	block->buf_block = buf_block;
+	block->free_block = NULL;
+
+	ut_d(ut_strlcpy_rev(block->file_name, file_name,
+			    sizeof(block->file_name)));
+	ut_d(block->line = line);
+
+	mem_block_set_len(block, len);
+	mem_block_set_type(block, type);
+	mem_block_set_free(block, MEM_BLOCK_HEADER_SIZE);
+	mem_block_set_start(block, MEM_BLOCK_HEADER_SIZE);
+
+	if (UNIV_UNLIKELY(heap == NULL)) {
+		/* This is the first block of the heap. The field
+		total_size should be initialized here */
+		block->total_size = len;
+	} else {
+		/* Not the first allocation for the heap. This block's
+		total_size field should be set to undefined. */
+		ut_d(block->total_size = ULINT_UNDEFINED);
+		MEM_UNDEFINED(&block->total_size, sizeof block->total_size);
+
+		heap->total_size += len;
+	}
+
+	/* Poison all available memory. Individual chunks will be unpoisoned on
+	every mem_heap_alloc() call. */
+	compile_time_assert(MEM_BLOCK_HEADER_SIZE >= sizeof *block);
+	MEM_NOACCESS(block + 1, len - sizeof *block);
+
+	ut_ad((ulint)MEM_BLOCK_HEADER_SIZE < len);
+
+	return(block);
+}
+
+/***************************************************************//**
+Adds a new block to a memory heap.
+@return created block, NULL if did not succeed (only possible for
+MEM_HEAP_BTR_SEARCH type heaps) */
+mem_block_t*
+mem_heap_add_block(
+/*===============*/
+	mem_heap_t*	heap,	/*!< in: memory heap */
+	ulint		n)	/*!< in: number of bytes user needs */
+{
+	mem_block_t*	block;
+	mem_block_t*	new_block;
+	ulint		new_size;
+
+	block = UT_LIST_GET_LAST(heap->base);
+
+	/* We have to allocate a new block. The size is always at least
+	doubled until the standard size is reached. After that the size
+	stays the same, except in cases where the caller needs more space.
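+	For example (an illustrative sketch with made-up numbers): if the
+	last block was 512 bytes, the next blocks are 1024, 2048, ... bytes
+	until MEM_BLOCK_STANDARD_SIZE (or MEM_MAX_ALLOC_IN_BUF for
+	buffer-pool backed heaps) caps the doubling; a single larger
+	request still gets a block sized to fit it.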
+	*/
+
+	new_size = 2 * mem_block_get_len(block);
+
+	if (heap->type != MEM_HEAP_DYNAMIC) {
+		/* From the buffer pool we allocate buffer frames */
+		ut_a(n <= MEM_MAX_ALLOC_IN_BUF);
+
+		if (new_size > MEM_MAX_ALLOC_IN_BUF) {
+			new_size = MEM_MAX_ALLOC_IN_BUF;
+		}
+	} else if (new_size > MEM_BLOCK_STANDARD_SIZE) {
+
+		new_size = MEM_BLOCK_STANDARD_SIZE;
+	}
+
+	if (new_size < n) {
+		new_size = n;
+	}
+
+	new_block = mem_heap_create_block(heap, new_size, heap->type,
+					  heap->file_name, heap->line);
+	if (new_block == NULL) {
+
+		return(NULL);
+	}
+
+	/* Add the new block as the last block */
+
+	UT_LIST_INSERT_AFTER(heap->base, block, new_block);
+
+	return(new_block);
+}
+
+/******************************************************************//**
+Frees a block from a memory heap. */
+void
+mem_heap_block_free(
+/*================*/
+	mem_heap_t*	heap,	/*!< in: heap */
+	mem_block_t*	block)	/*!< in: block to free */
+{
+	ulint		type;
+	ulint		len;
+	buf_block_t*	buf_block;
+
+	buf_block = static_cast<buf_block_t*>(block->buf_block);
+
+	UT_LIST_REMOVE(heap->base, block);
+
+	ut_ad(heap->total_size >= block->len);
+	heap->total_size -= block->len;
+
+	type = heap->type;
+	len = block->len;
+
+	if (type == MEM_HEAP_DYNAMIC || len < srv_page_size / 2) {
+		ut_ad(!buf_block);
+		ut_free(block);
+	} else {
+		ut_ad(type & MEM_HEAP_BUFFER);
+		buf_block_free(buf_block);
+	}
+}
+
+/******************************************************************//**
+Frees the free_block field from a memory heap. */
+void
+mem_heap_free_block_free(
+/*=====================*/
+	mem_heap_t*	heap)	/*!< in: heap */
+{
+	if (UNIV_LIKELY_NULL(heap->free_block)) {
+
+		buf_block_free(static_cast<buf_block_t*>(heap->free_block));
+
+		heap->free_block = NULL;
+	}
+}
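+
+/*
+A minimal sketch of typical heap usage, tying the helpers above together
+(the size and the strings are illustrative only):
+
+  mem_heap_t*	heap = mem_heap_create(256);
+  char*		path = mem_heap_strcat(heap, "db1", "/t1");
+  char*		msg = mem_heap_printf(heap, "space %lu", 42UL);
+  ...
+  mem_heap_free(heap);	// frees all blocks, and every allocation, at once
+*/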
diff --git a/storage/innobase/mtr/mtr0mtr.cc b/storage/innobase/mtr/mtr0mtr.cc
new file mode 100644
index 00000000..1834a164
--- /dev/null
+++ b/storage/innobase/mtr/mtr0mtr.cc
@@ -0,0 +1,1667 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2017, 2023, MariaDB Corporation.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file mtr/mtr0mtr.cc
+Mini-transaction buffer
+
+Created 11/26/1995 Heikki Tuuri
+*******************************************************/
+
+#include "mtr0log.h"
+#include "buf0buf.h"
+#include "buf0flu.h"
+#include "page0types.h"
+#include "log0crypt.h"
+#ifdef BTR_CUR_HASH_ADAPT
+# include "btr0sea.h"
+#else
+# include "btr0cur.h"
+#endif
+#include "srv0start.h"
+#include "log.h"
+#include "mariadb_stats.h"
+
+void mtr_memo_slot_t::release() const
+{
+  ut_ad(object);
+
+  switch (type) {
+  case MTR_MEMO_S_LOCK:
+    static_cast<index_lock*>(object)->s_unlock();
+    break;
+  case MTR_MEMO_X_LOCK:
+  case MTR_MEMO_SX_LOCK:
+    static_cast<index_lock*>(object)->
+      u_or_x_unlock(type == MTR_MEMO_SX_LOCK);
+    break;
+  case MTR_MEMO_SPACE_X_LOCK:
+    static_cast<fil_space_t*>(object)->set_committed_size();
+    static_cast<fil_space_t*>(object)->x_unlock();
+    break;
+  default:
+    buf_page_t *bpage= static_cast<buf_page_t*>(object);
+    ut_d(const auto s=)
+    bpage->unfix();
+    ut_ad(s < buf_page_t::READ_FIX || s >= buf_page_t::WRITE_FIX);
+    switch (type) {
+    case MTR_MEMO_PAGE_S_FIX:
+      bpage->lock.s_unlock();
+      break;
+    case MTR_MEMO_BUF_FIX:
+      break;
+    default:
+      ut_ad(type == MTR_MEMO_PAGE_SX_FIX ||
+            type == MTR_MEMO_PAGE_X_FIX ||
+            type == MTR_MEMO_PAGE_SX_MODIFY ||
+            type == MTR_MEMO_PAGE_X_MODIFY);
+      bpage->lock.u_or_x_unlock(type & MTR_MEMO_PAGE_SX_FIX);
+    }
+  }
+}
+
+/** Prepare to insert a modified block into flush_list.
+@param lsn start LSN of the mini-transaction
+@return insert position for insert_into_flush_list() */
+inline buf_page_t *buf_pool_t::prepare_insert_into_flush_list(lsn_t lsn)
+  noexcept
+{
+#ifndef SUX_LOCK_GENERIC
+  ut_ad(recv_recovery_is_on() || log_sys.latch.is_locked());
+#endif
+  ut_ad(lsn >= log_sys.last_checkpoint_lsn);
+  mysql_mutex_assert_owner(&flush_list_mutex);
+  static_assert(log_t::FIRST_LSN >= 2, "compatibility");
+
+rescan:
+  buf_page_t *prev= UT_LIST_GET_FIRST(flush_list);
+  if (prev)
+  {
+    lsn_t om= prev->oldest_modification();
+    if (om == 1)
+    {
+      delete_from_flush_list(prev);
+      goto rescan;
+    }
+    ut_ad(om > 2);
+    if (om <= lsn)
+      return nullptr;
+    while (buf_page_t *next= UT_LIST_GET_NEXT(list, prev))
+    {
+      om= next->oldest_modification();
+      if (om == 1)
+      {
+        delete_from_flush_list(next);
+        continue;
+      }
+      ut_ad(om > 2);
+      if (om <= lsn)
+        break;
+      prev= next;
+    }
+    flush_hp.adjust(prev);
+  }
+  return prev;
+}
+
+/** Insert a modified block into the flush list.
+@param prev insert position (from prepare_insert_into_flush_list())
+@param block modified block
+@param lsn start LSN of the mini-transaction that modified the block */
+inline void buf_pool_t::insert_into_flush_list(buf_page_t *prev,
+                                               buf_block_t *block, lsn_t lsn)
+  noexcept
+{
+  ut_ad(!fsp_is_system_temporary(block->page.id().space()));
+  mysql_mutex_assert_owner(&flush_list_mutex);
+
+  MEM_CHECK_DEFINED(block->page.zip.data
+                    ?
block->page.zip.data : block->page.frame, + block->physical_size()); + + if (const lsn_t old= block->page.oldest_modification()) + { + if (old > 1) + return; + flush_hp.adjust(&block->page); + UT_LIST_REMOVE(flush_list, &block->page); + } + else + flush_list_bytes+= block->physical_size(); + + ut_ad(flush_list_bytes <= curr_pool_size); + + if (prev) + UT_LIST_INSERT_AFTER(flush_list, prev, &block->page); + else + UT_LIST_ADD_FIRST(flush_list, &block->page); + + block->page.set_oldest_modification(lsn); +} + +mtr_t::mtr_t()= default; +mtr_t::~mtr_t()= default; + +/** Start a mini-transaction. */ +void mtr_t::start() +{ + ut_ad(m_memo.empty()); + ut_ad(!m_freed_pages); + ut_ad(!m_freed_space); + MEM_UNDEFINED(this, sizeof *this); + MEM_MAKE_DEFINED(&m_memo, sizeof m_memo); + MEM_MAKE_DEFINED(&m_freed_space, sizeof m_freed_space); + MEM_MAKE_DEFINED(&m_freed_pages, sizeof m_freed_pages); + + ut_d(m_start= true); + ut_d(m_commit= false); + ut_d(m_freeing_tree= false); + + m_last= nullptr; + m_last_offset= 0; + + new(&m_log) mtr_buf_t(); + + m_made_dirty= false; + m_latch_ex= false; + m_inside_ibuf= false; + m_modifications= false; + m_log_mode= MTR_LOG_ALL; + ut_d(m_user_space_id= TRX_SYS_SPACE); + m_user_space= nullptr; + m_commit_lsn= 0; + m_trim_pages= false; +} + +/** Release the resources */ +inline void mtr_t::release_resources() +{ + ut_ad(is_active()); + ut_ad(m_memo.empty()); + m_log.erase(); + ut_d(m_commit= true); +} + +/** Handle any pages that were freed during the mini-transaction. */ +void mtr_t::process_freed_pages() +{ + if (m_freed_pages) + { + ut_ad(!m_freed_pages->empty()); + ut_ad(m_freed_space); + ut_ad(m_freed_space->is_owner()); + ut_ad(is_named_space(m_freed_space)); + + /* Update the last freed lsn */ + m_freed_space->freed_range_mutex.lock(); + m_freed_space->update_last_freed_lsn(m_commit_lsn); + if (!m_trim_pages) + for (const auto &range : *m_freed_pages) + m_freed_space->add_free_range(range); + else + m_freed_space->clear_freed_ranges(); + m_freed_space->freed_range_mutex.unlock(); + + delete m_freed_pages; + m_freed_pages= nullptr; + m_freed_space= nullptr; + /* mtr_t::start() will reset m_trim_pages */ + } + else + ut_ad(!m_freed_space); +} + +ATTRIBUTE_COLD __attribute__((noinline)) +/** Insert a modified block into buf_pool.flush_list on IMPORT TABLESPACE. */ +static void insert_imported(buf_block_t *block) +{ + if (block->page.oldest_modification() <= 1) + { + log_sys.latch.rd_lock(SRW_LOCK_CALL); + const lsn_t lsn= log_sys.last_checkpoint_lsn; + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_pool.insert_into_flush_list + (buf_pool.prepare_insert_into_flush_list(lsn), block, lsn); + log_sys.latch.rd_unlock(); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + } +} + +/** Release modified pages when no log was written. 
*/ +void mtr_t::release_unlogged() +{ + ut_ad(m_log_mode == MTR_LOG_NO_REDO); + ut_ad(m_log.size() == 0); + + process_freed_pages(); + + for (auto it= m_memo.rbegin(); it != m_memo.rend(); it++) + { + mtr_memo_slot_t &slot= *it; + ut_ad(slot.object); + switch (slot.type) { + case MTR_MEMO_S_LOCK: + static_cast(slot.object)->s_unlock(); + break; + case MTR_MEMO_SPACE_X_LOCK: + static_cast(slot.object)->set_committed_size(); + static_cast(slot.object)->x_unlock(); + break; + case MTR_MEMO_X_LOCK: + case MTR_MEMO_SX_LOCK: + static_cast(slot.object)-> + u_or_x_unlock(slot.type == MTR_MEMO_SX_LOCK); + break; + default: + buf_block_t *block= static_cast(slot.object); + ut_d(const auto s=) block->page.unfix(); + ut_ad(s >= buf_page_t::FREED); + ut_ad(s < buf_page_t::READ_FIX); + + if (slot.type & MTR_MEMO_MODIFY) + { + ut_ad(slot.type == MTR_MEMO_PAGE_X_MODIFY || + slot.type == MTR_MEMO_PAGE_SX_MODIFY); + ut_ad(block->page.id() < end_page_id); + insert_imported(block); + } + + switch (slot.type) { + case MTR_MEMO_PAGE_S_FIX: + block->page.lock.s_unlock(); + break; + case MTR_MEMO_BUF_FIX: + break; + default: + ut_ad(slot.type == MTR_MEMO_PAGE_SX_FIX || + slot.type == MTR_MEMO_PAGE_X_FIX || + slot.type == MTR_MEMO_PAGE_SX_MODIFY || + slot.type == MTR_MEMO_PAGE_X_MODIFY); + block->page.lock.u_or_x_unlock(slot.type & MTR_MEMO_PAGE_SX_FIX); + } + } + } + + m_memo.clear(); +} + +void mtr_t::release() +{ + for (auto it= m_memo.rbegin(); it != m_memo.rend(); it++) + it->release(); + m_memo.clear(); +} + +/** Commit a mini-transaction. */ +void mtr_t::commit() +{ + ut_ad(is_active()); + ut_ad(!is_inside_ibuf()); + + /* This is a dirty read, for debugging. */ + ut_ad(!m_modifications || !recv_no_log_write); + ut_ad(!m_modifications || m_log_mode != MTR_LOG_NONE); + ut_ad(!m_latch_ex); + + if (m_modifications && (m_log_mode == MTR_LOG_NO_REDO || !m_log.empty())) + { + if (UNIV_UNLIKELY(!is_logged())) + { + release_unlogged(); + goto func_exit; + } + + ut_ad(!srv_read_only_mode); + std::pair lsns{do_write()}; + process_freed_pages(); + size_t modified= 0; + + if (m_made_dirty) + { + auto it= m_memo.rbegin(); + + mysql_mutex_lock(&buf_pool.flush_list_mutex); + + buf_page_t *const prev= + buf_pool.prepare_insert_into_flush_list(lsns.first); + + while (it != m_memo.rend()) + { + const mtr_memo_slot_t &slot= *it++; + if (slot.type & MTR_MEMO_MODIFY) + { + ut_ad(slot.type == MTR_MEMO_PAGE_X_MODIFY || + slot.type == MTR_MEMO_PAGE_SX_MODIFY); + modified++; + buf_block_t *b= static_cast(slot.object); + ut_ad(b->page.id() < end_page_id); + ut_d(const auto s= b->page.state()); + ut_ad(s > buf_page_t::FREED); + ut_ad(s < buf_page_t::READ_FIX); + ut_ad(mach_read_from_8(b->page.frame + FIL_PAGE_LSN) <= + m_commit_lsn); + mach_write_to_8(b->page.frame + FIL_PAGE_LSN, m_commit_lsn); + if (UNIV_LIKELY_NULL(b->page.zip.data)) + memcpy_aligned<8>(FIL_PAGE_LSN + b->page.zip.data, + FIL_PAGE_LSN + b->page.frame, 8); + buf_pool.insert_into_flush_list(prev, b, lsns.first); + } + } + + ut_ad(modified); + buf_pool.flush_list_requests+= modified; + buf_pool.page_cleaner_wakeup(); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + + if (m_latch_ex) + { + log_sys.latch.wr_unlock(); + m_latch_ex= false; + } + else + log_sys.latch.rd_unlock(); + + release(); + } + else + { + if (m_latch_ex) + { + log_sys.latch.wr_unlock(); + m_latch_ex= false; + } + else + log_sys.latch.rd_unlock(); + + for (auto it= m_memo.rbegin(); it != m_memo.rend(); ) + { + const mtr_memo_slot_t &slot= *it++; + ut_ad(slot.object); + switch (slot.type) { + case 
MTR_MEMO_S_LOCK: + static_cast(slot.object)->s_unlock(); + break; + case MTR_MEMO_SPACE_X_LOCK: + static_cast(slot.object)->set_committed_size(); + static_cast(slot.object)->x_unlock(); + break; + case MTR_MEMO_X_LOCK: + case MTR_MEMO_SX_LOCK: + static_cast(slot.object)-> + u_or_x_unlock(slot.type == MTR_MEMO_SX_LOCK); + break; + default: + buf_page_t *bpage= static_cast(slot.object); + const auto s= bpage->unfix(); + if (slot.type & MTR_MEMO_MODIFY) + { + ut_ad(slot.type == MTR_MEMO_PAGE_X_MODIFY || + slot.type == MTR_MEMO_PAGE_SX_MODIFY); + ut_ad(bpage->oldest_modification() > 1); + ut_ad(bpage->oldest_modification() < m_commit_lsn); + ut_ad(bpage->id() < end_page_id); + ut_ad(s >= buf_page_t::FREED); + ut_ad(s < buf_page_t::READ_FIX); + ut_ad(mach_read_from_8(bpage->frame + FIL_PAGE_LSN) <= + m_commit_lsn); + if (s >= buf_page_t::UNFIXED) + { + mach_write_to_8(bpage->frame + FIL_PAGE_LSN, m_commit_lsn); + if (UNIV_LIKELY_NULL(bpage->zip.data)) + memcpy_aligned<8>(FIL_PAGE_LSN + bpage->zip.data, + FIL_PAGE_LSN + bpage->frame, 8); + } + modified++; + } + switch (auto latch= slot.type & ~MTR_MEMO_MODIFY) { + case MTR_MEMO_PAGE_S_FIX: + bpage->lock.s_unlock(); + continue; + case MTR_MEMO_PAGE_SX_FIX: + case MTR_MEMO_PAGE_X_FIX: + bpage->lock.u_or_x_unlock(latch == MTR_MEMO_PAGE_SX_FIX); + continue; + default: + ut_ad(latch == MTR_MEMO_BUF_FIX); + } + } + } + + buf_pool.add_flush_list_requests(modified); + m_memo.clear(); + } + + mariadb_increment_pages_updated(modified); + + if (UNIV_UNLIKELY(lsns.second != PAGE_FLUSH_NO)) + buf_flush_ahead(m_commit_lsn, lsns.second == PAGE_FLUSH_SYNC); + } + else + { + if (m_freed_pages) + { + ut_ad(!m_freed_pages->empty()); + ut_ad(m_freed_space == fil_system.temp_space); + ut_ad(!m_trim_pages); + for (const auto &range : *m_freed_pages) + m_freed_space->add_free_range(range); + delete m_freed_pages; + m_freed_pages= nullptr; + m_freed_space= nullptr; + } + release(); + } + +func_exit: + release_resources(); +} + +void mtr_t::rollback_to_savepoint(ulint begin, ulint end) +{ + ut_ad(end <= m_memo.size()); + ut_ad(begin <= end); + ulint s= end; + + while (s-- > begin) + { + const mtr_memo_slot_t &slot= m_memo[s]; + ut_ad(slot.object); + /* This is intended for releasing latches on indexes or unmodified + buffer pool pages. */ + ut_ad(slot.type <= MTR_MEMO_SX_LOCK); + ut_ad(!(slot.type & MTR_MEMO_MODIFY)); + slot.release(); + } + + m_memo.erase(m_memo.begin() + begin, m_memo.begin() + end); +} + +/** Commit a mini-transaction that is shrinking a tablespace. +@param space tablespace that is being shrunk */ +void mtr_t::commit_shrink(fil_space_t &space) +{ + ut_ad(is_active()); + ut_ad(!is_inside_ibuf()); + ut_ad(!high_level_read_only); + ut_ad(m_modifications); + ut_ad(m_made_dirty); + ut_ad(!m_memo.empty()); + ut_ad(!recv_recovery_is_on()); + ut_ad(m_log_mode == MTR_LOG_ALL); + ut_ad(!m_freed_pages); + ut_ad(UT_LIST_GET_LEN(space.chain) == 1); + + log_write_and_flush_prepare(); + m_latch_ex= true; + log_sys.latch.wr_lock(SRW_LOCK_CALL); + + const lsn_t start_lsn= do_write().first; + ut_d(m_log.erase()); + + /* Durably write the reduced FSP_SIZE before truncating the data file. 
+
+/** Commit a mini-transaction that is shrinking a tablespace.
+@param space   tablespace that is being shrunk */
+void mtr_t::commit_shrink(fil_space_t &space)
+{
+  ut_ad(is_active());
+  ut_ad(!is_inside_ibuf());
+  ut_ad(!high_level_read_only);
+  ut_ad(m_modifications);
+  ut_ad(m_made_dirty);
+  ut_ad(!m_memo.empty());
+  ut_ad(!recv_recovery_is_on());
+  ut_ad(m_log_mode == MTR_LOG_ALL);
+  ut_ad(!m_freed_pages);
+  ut_ad(UT_LIST_GET_LEN(space.chain) == 1);
+
+  log_write_and_flush_prepare();
+  m_latch_ex= true;
+  log_sys.latch.wr_lock(SRW_LOCK_CALL);
+
+  const lsn_t start_lsn= do_write().first;
+  ut_d(m_log.erase());
+
+  /* Durably write the reduced FSP_SIZE before truncating the data file. */
+  log_write_and_flush();
+#ifndef SUX_LOCK_GENERIC
+  ut_ad(log_sys.latch.is_write_locked());
+#endif
+
+  os_file_truncate(space.chain.start->name, space.chain.start->handle,
+                   os_offset_t{space.size} << srv_page_size_shift, true);
+
+  space.clear_freed_ranges();
+
+  const page_id_t high{space.id, space.size};
+  size_t modified= 0;
+  auto it= m_memo.rbegin();
+  mysql_mutex_lock(&buf_pool.flush_list_mutex);
+
+  buf_page_t *const prev= buf_pool.prepare_insert_into_flush_list(start_lsn);
+
+  while (it != m_memo.rend())
+  {
+    mtr_memo_slot_t &slot= *it++;
+
+    ut_ad(slot.object);
+    if (slot.type == MTR_MEMO_SPACE_X_LOCK)
+      ut_ad(high.space() == static_cast<fil_space_t*>(slot.object)->id);
+    else
+    {
+      ut_ad(slot.type == MTR_MEMO_PAGE_X_MODIFY ||
+            slot.type == MTR_MEMO_PAGE_SX_MODIFY ||
+            slot.type == MTR_MEMO_PAGE_X_FIX ||
+            slot.type == MTR_MEMO_PAGE_SX_FIX);
+      buf_block_t *b= static_cast<buf_block_t*>(slot.object);
+      const page_id_t id{b->page.id()};
+      const auto s= b->page.state();
+      ut_ad(s > buf_page_t::FREED);
+      ut_ad(s < buf_page_t::READ_FIX);
+      ut_ad(b->page.frame);
+      ut_ad(mach_read_from_8(b->page.frame + FIL_PAGE_LSN) <= m_commit_lsn);
+      ut_ad(!b->page.zip.data); // we do not shrink ROW_FORMAT=COMPRESSED
+
+      if (id < high)
+      {
+        ut_ad(id.space() == high.space() ||
+              (id == page_id_t{0, TRX_SYS_PAGE_NO} &&
+               srv_is_undo_tablespace(high.space())));
+        if (slot.type & MTR_MEMO_MODIFY)
+        {
+          modified++;
+          mach_write_to_8(b->page.frame + FIL_PAGE_LSN, m_commit_lsn);
+          buf_pool.insert_into_flush_list(prev, b, start_lsn);
+        }
+      }
+      else
+      {
+        ut_ad(id.space() == high.space());
+        if (s >= buf_page_t::UNFIXED)
+          b->page.set_freed(s);
+        if (b->page.oldest_modification() > 1)
+          b->page.reset_oldest_modification();
+        slot.type= mtr_memo_type_t(slot.type & ~MTR_MEMO_MODIFY);
+      }
+    }
+  }
+
+  ut_ad(modified);
+  buf_pool.flush_list_requests+= modified;
+  buf_pool.page_cleaner_wakeup();
+  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+
+  log_sys.latch.wr_unlock();
+  m_latch_ex= false;
+
+  mysql_mutex_lock(&fil_system.mutex);
+  ut_ad(space.is_being_truncated);
+  ut_ad(space.is_stopping_writes());
+  space.clear_stopping();
+  space.is_being_truncated= false;
+  mysql_mutex_unlock(&fil_system.mutex);
+
+  release();
+  release_resources();
+}
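+
+/* Rationale for the ordering in commit_shrink() above: the log record
+carrying the reduced FSP_SIZE is made durable by log_write_and_flush()
+before os_file_truncate() shrinks the data file, so that a crash between
+the two steps is replayed as a tablespace shrink rather than surfacing as
+an unexpectedly short file.  Pages at or beyond the new end of the
+tablespace (id >= high) can never be written back; they are marked freed
+and detached from the flush list instead of being flushed. */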
+
+/** Commit a mini-transaction that is deleting or renaming a file.
+@param space   tablespace that is being renamed or deleted
+@param name    new file name (nullptr=the file will be deleted)
+@return whether the operation succeeded */
+bool mtr_t::commit_file(fil_space_t &space, const char *name)
+{
+  ut_ad(is_active());
+  ut_ad(!is_inside_ibuf());
+  ut_ad(!high_level_read_only);
+  ut_ad(m_modifications);
+  ut_ad(!m_made_dirty);
+  ut_ad(!recv_recovery_is_on());
+  ut_ad(m_log_mode == MTR_LOG_ALL);
+  ut_ad(UT_LIST_GET_LEN(space.chain) == 1);
+  ut_ad(!m_latch_ex);
+
+  m_latch_ex= true;
+
+  log_write_and_flush_prepare();
+
+  log_sys.latch.wr_lock(SRW_LOCK_CALL);
+
+  size_t size= m_log.size() + 5;
+
+  if (log_sys.is_encrypted())
+  {
+    /* We will not encrypt any FILE_ records, but we will reserve
+    a nonce at the end. */
+    size+= 8;
+    m_commit_lsn= log_sys.get_lsn();
+  }
+  else
+    m_commit_lsn= 0;
+
+  m_crc= 0;
+  m_log.for_each_block([this](const mtr_buf_t::block_t *b)
+  { m_crc= my_crc32c(m_crc, b->begin(), b->used()); return true; });
+  finish_write(size);
+
+  if (!name && space.max_lsn)
+  {
+    ut_d(space.max_lsn= 0);
+    fil_system.named_spaces.remove(space);
+  }
+
+  /* Block log_checkpoint(). */
+  mysql_mutex_lock(&buf_pool.flush_list_mutex);
+
+  /* Durably write the log for the file system operation. */
+  log_write_and_flush();
+
+  log_sys.latch.wr_unlock();
+  m_latch_ex= false;
+
+  char *old_name= space.chain.start->name;
+  bool success= true;
+
+  if (name)
+  {
+    char *new_name= mem_strdup(name);
+    mysql_mutex_lock(&fil_system.mutex);
+    success= os_file_rename(innodb_data_file_key, old_name, name);
+    if (success)
+      space.chain.start->name= new_name;
+    else
+      old_name= new_name;
+    mysql_mutex_unlock(&fil_system.mutex);
+    ut_free(old_name);
+  }
+
+  mysql_mutex_unlock(&buf_pool.flush_list_mutex);
+  release_resources();
+
+  return success;
+}
+
+/** Commit a mini-transaction that did not modify any pages,
+but generated some redo log on a higher level, such as
+FILE_MODIFY records and an optional FILE_CHECKPOINT marker.
+The caller must hold exclusive log_sys.latch.
+This is to be used at log_checkpoint().
+@param checkpoint_lsn   the log sequence number of a checkpoint, or 0
+@return current LSN */
+lsn_t mtr_t::commit_files(lsn_t checkpoint_lsn)
+{
+#ifndef SUX_LOCK_GENERIC
+  ut_ad(log_sys.latch.is_write_locked());
+#endif
+  ut_ad(is_active());
+  ut_ad(!is_inside_ibuf());
+  ut_ad(m_log_mode == MTR_LOG_ALL);
+  ut_ad(!m_made_dirty);
+  ut_ad(m_memo.empty());
+  ut_ad(!srv_read_only_mode);
+  ut_ad(!m_freed_space);
+  ut_ad(!m_freed_pages);
+  ut_ad(!m_user_space);
+  ut_ad(!m_latch_ex);
+
+  m_latch_ex= true;
+
+  if (checkpoint_lsn)
+  {
+    byte *ptr= m_log.push<byte*>(3 + 8);
+    *ptr= FILE_CHECKPOINT | (2 + 8);
+    ::memset(ptr + 1, 0, 2);
+    mach_write_to_8(ptr + 3, checkpoint_lsn);
+  }
+
+  size_t size= m_log.size() + 5;
+
+  if (log_sys.is_encrypted())
+  {
+    /* We will not encrypt any FILE_ records, but we will reserve
+    a nonce at the end. */
+    size+= 8;
+    m_commit_lsn= log_sys.get_lsn();
+  }
+  else
+    m_commit_lsn= 0;
+
+  m_crc= 0;
+  m_log.for_each_block([this](const mtr_buf_t::block_t *b)
+  { m_crc= my_crc32c(m_crc, b->begin(), b->used()); return true; });
+  finish_write(size);
+  release_resources();
+
+  if (checkpoint_lsn)
+    DBUG_PRINT("ib_log",
+               ("FILE_CHECKPOINT(" LSN_PF ") written at " LSN_PF,
+                checkpoint_lsn, m_commit_lsn));
+
+  return m_commit_lsn;
+}
+
+#ifdef UNIV_DEBUG
+/** Check if a tablespace is associated with the mini-transaction
+(needed for generating a FILE_MODIFY record)
+@param[in]  space   tablespace
+@return whether the mini-transaction is associated with the space */
+bool
+mtr_t::is_named_space(uint32_t space) const
+{
+  ut_ad(!m_user_space || m_user_space->id != TRX_SYS_SPACE);
+  return !is_logged() || m_user_space_id == space ||
+    is_predefined_tablespace(space);
+}
+/** Check if a tablespace is associated with the mini-transaction
+(needed for generating a FILE_MODIFY record)
+@param[in]  space   tablespace
+@return whether the mini-transaction is associated with the space */
+bool mtr_t::is_named_space(const fil_space_t* space) const
+{
+  ut_ad(!m_user_space || m_user_space->id != TRX_SYS_SPACE);
+
+  return !is_logged() || m_user_space == space ||
+    is_predefined_tablespace(space->id);
+}
+#endif /* UNIV_DEBUG */
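+
+/* The latching helpers below cache the current user tablespace in
+m_user_space, so that a repeated x_lock_space(space_id) for the same
+tablespace avoids a fil_space_get() lookup, while memo_contains()
+ensures that the tablespace X-latch is acquired at most once per
+mini-transaction. */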
+
+/** Acquire a tablespace X-latch.
+@param[in]  space_id    tablespace ID
+@return the tablespace object (never NULL) */
+fil_space_t *mtr_t::x_lock_space(uint32_t space_id)
+{
+  fil_space_t*  space;
+
+  ut_ad(is_active());
+
+  if (space_id == TRX_SYS_SPACE) {
+    space = fil_system.sys_space;
+  } else if ((space = m_user_space) && space_id == space->id) {
+  } else {
+    space = fil_space_get(space_id);
+    ut_ad(m_log_mode != MTR_LOG_NO_REDO
+          || space->purpose == FIL_TYPE_TEMPORARY
+          || space->purpose == FIL_TYPE_IMPORT);
+  }
+
+  ut_ad(space);
+  ut_ad(space->id == space_id);
+  x_lock_space(space);
+  return(space);
+}
+
+/** Acquire an exclusive tablespace latch.
+@param space   tablespace */
+void mtr_t::x_lock_space(fil_space_t *space)
+{
+  ut_ad(space->purpose == FIL_TYPE_TEMPORARY ||
+        space->purpose == FIL_TYPE_IMPORT ||
+        space->purpose == FIL_TYPE_TABLESPACE);
+  if (!memo_contains(*space))
+  {
+    memo_push(space, MTR_MEMO_SPACE_X_LOCK);
+    space->x_lock();
+  }
+}
+
+void mtr_t::release(const void *object)
+{
+  ut_ad(is_active());
+
+  auto it=
+    std::find_if(m_memo.begin(), m_memo.end(),
+                 [object](const mtr_memo_slot_t& slot)
+                 { return slot.object == object; });
+  ut_ad(it != m_memo.end());
+  ut_ad(!(it->type & MTR_MEMO_MODIFY));
+  it->release();
+  m_memo.erase(it, it + 1);
+  ut_ad(std::find_if(m_memo.begin(), m_memo.end(),
+                     [object](const mtr_memo_slot_t& slot)
+                     { return slot.object == object; }) == m_memo.end());
+}
+
+static time_t log_close_warn_time;
+
+/** Display a warning that the log tail is overwriting the head,
+making the server crash-unsafe. */
+ATTRIBUTE_COLD static void log_overwrite_warning(lsn_t lsn)
+{
+  if (log_sys.overwrite_warned)
+    return;
+
+  time_t t= time(nullptr);
+  if (difftime(t, log_close_warn_time) < 15)
+    return;
+
+  if (!log_sys.overwrite_warned)
+    log_sys.overwrite_warned= lsn;
+  log_close_warn_time= t;
+
+  sql_print_error("InnoDB: Crash recovery is broken due to"
+                  " insufficient innodb_log_file_size;"
+                  " last checkpoint LSN=" LSN_PF ", current LSN=" LSN_PF
+                  "%s.",
+                  lsn_t{log_sys.last_checkpoint_lsn}, lsn,
+                  srv_shutdown_state > SRV_SHUTDOWN_INITIATED
+                  ? ". Shutdown is in progress" : "");
+}
+
+/** Wait in append_prepare() for buffer to become available
+@param ex   whether log_sys.latch is exclusively locked */
+ATTRIBUTE_COLD void log_t::append_prepare_wait(bool ex) noexcept
+{
+  log_sys.waits++;
+  log_sys.unlock_lsn();
+
+  if (ex)
+    log_sys.latch.wr_unlock();
+  else
+    log_sys.latch.rd_unlock();
+
+  DEBUG_SYNC_C("log_buf_size_exceeded");
+  log_buffer_flush_to_disk(log_sys.is_pmem());
+
+  if (ex)
+    log_sys.latch.wr_lock(SRW_LOCK_CALL);
+  else
+    log_sys.latch.rd_lock(SRW_LOCK_CALL);
+
+  log_sys.lock_lsn();
+}
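+
+/* The fast path of append_prepare() below claims log space by advancing
+lsn and buf_free while holding lock_lsn(); the slow path, taken when the
+log buffer is full (or, on PMEM, when the tail would overtake the flushed
+head of the circular buffer), backs off through append_prepare_wait(),
+which temporarily releases the latch and writes out the log buffer.
+A minimal sketch of the reservation step, with all locking and the PMEM
+wrap-around omitted (illustrative only, not the upstream code):
+
+     lsn_t start= lsn;        // current end of the log
+     lsn+= size;              // claim [start, start + size)
+     byte *position= &buf[buf_free];
+     buf_free+= size;         // append() will copy the record here
+
+The caller then copies the record bytes to the returned position. */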
+
+/** Reserve space in the log buffer for appending data.
+@tparam pmem   log_sys.is_pmem()
+@param size    total length of the data to append(), in bytes
+@param ex      whether log_sys.latch is exclusively locked
+@return the start LSN and the buffer position for append() */
+template<bool pmem>
+inline
+std::pair<lsn_t,byte*> log_t::append_prepare(size_t size, bool ex) noexcept
+{
+#ifndef SUX_LOCK_GENERIC
+  ut_ad(latch.is_locked());
+# ifndef _WIN32 // there is no accurate is_write_locked() on SRWLOCK
+  ut_ad(ex == latch.is_write_locked());
+# endif
+#endif
+  ut_ad(pmem == is_pmem());
+  const lsn_t checkpoint_margin{last_checkpoint_lsn + log_capacity - size};
+  const size_t avail{(pmem ? size_t(capacity()) : buf_size) - size};
+  lock_lsn();
+  write_to_buf++;
+
+  for (ut_d(int count= 50);
+       UNIV_UNLIKELY((pmem
+                      ? size_t(get_lsn() -
+                               get_flushed_lsn(std::memory_order_relaxed))
+                      : size_t{buf_free}) > avail); )
+  {
+    append_prepare_wait(ex);
+    ut_ad(count--);
+  }
+
+  const lsn_t l{lsn.load(std::memory_order_relaxed)};
+  lsn.store(l + size, std::memory_order_relaxed);
+  const size_t b{buf_free};
+  size_t new_buf_free{b};
+  new_buf_free+= size;
+  if (pmem && new_buf_free >= file_size)
+    new_buf_free-= size_t(capacity());
+  buf_free= new_buf_free;
+  unlock_lsn();
+
+  if (UNIV_UNLIKELY(l > checkpoint_margin) ||
+      (!pmem && b >= max_buf_free))
+    set_check_flush_or_checkpoint();
+
+  return {l, &buf[b]};
+}
+
+/** Finish appending data to the log.
+@param lsn   the end LSN of the log record
+@return whether buf_flush_ahead() will have to be invoked */
+static mtr_t::page_flush_ahead log_close(lsn_t lsn) noexcept
+{
+#ifndef SUX_LOCK_GENERIC
+  ut_ad(log_sys.latch.is_locked());
+#endif
+
+  const lsn_t checkpoint_age= lsn - log_sys.last_checkpoint_lsn;
+
+  if (UNIV_UNLIKELY(checkpoint_age >= log_sys.log_capacity) &&
+      /* silence message on create_log_file() after the log had been deleted */
+      checkpoint_age != lsn)
+    log_overwrite_warning(lsn);
+  else if (UNIV_LIKELY(checkpoint_age <= log_sys.max_modified_age_async))
+    return mtr_t::PAGE_FLUSH_NO;
+  else if (UNIV_LIKELY(checkpoint_age <= log_sys.max_checkpoint_age))
+    return mtr_t::PAGE_FLUSH_ASYNC;
+
+  log_sys.set_check_flush_or_checkpoint();
+  return mtr_t::PAGE_FLUSH_SYNC;
+}
+
+inline void mtr_t::page_checksum(const buf_page_t &bpage)
+{
+  const byte *page= bpage.frame;
+  size_t size= srv_page_size;
+
+  if (UNIV_LIKELY_NULL(bpage.zip.data))
+  {
+    size= (UNIV_ZIP_SIZE_MIN >> 1) << bpage.zip.ssize;
+    switch (fil_page_get_type(bpage.zip.data)) {
+    case FIL_PAGE_TYPE_ALLOCATED:
+    case FIL_PAGE_INODE:
+    case FIL_PAGE_IBUF_BITMAP:
+    case FIL_PAGE_TYPE_FSP_HDR:
+    case FIL_PAGE_TYPE_XDES:
+      /* These are essentially uncompressed pages. */
+      break;
+    default:
+      page= bpage.zip.data;
+    }
+  }
+
+  /* We have to exclude from the checksum the normal
+  page checksum that is written by buf_flush_init_for_writing()
+  and FIL_PAGE_LSN, which would be updated once we have actually
+  allocated the LSN.
+
+  Unfortunately, we cannot access fil_space_t easily here. In order to
+  be compatible with encrypted tablespaces in the pre-full_crc32
+  format we will unconditionally exclude the 8 bytes at
+  FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION
+  a.k.a. FIL_RTREE_SPLIT_SEQ_NUM. */
+  const uint32_t checksum=
+    my_crc32c(my_crc32c(my_crc32c(0, page + FIL_PAGE_OFFSET,
+                                  FIL_PAGE_LSN - FIL_PAGE_OFFSET),
+                        page + FIL_PAGE_TYPE, 2),
+              page + FIL_PAGE_SPACE_ID, size - (FIL_PAGE_SPACE_ID + 8));
+
+  byte *l= log_write